In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
from pathlib import Path
import ujson, os
from tqdm import tqdm
from bootleg.symbols.entity_profile import EntityProfile

# Entity Profile Tutorial

In this tutorial, we will show you how to modify and interact with our entity metadata.

### Requirements

You will need to download the following files for this notebook:
- Pretrained Bootleg uncased model and config [here](https://bootleg-data.s3-us-west-2.amazonaws.com/models/lateset/bootleg_uncased.tar.gz). Cased model and config [here](https://bootleg-data.s3-us-west-2.amazonaws.com/models/lateset/bootleg_cased.tar.gz)
- Entity data [here](https://bootleg-data.s3-us-west-2.amazonaws.com/data/lateset/entity_db.tar.gz)

For convenience, you can run the commands below (from the root directory of the repo) to download all the above files and unpack them to `models` and `data` directories. It will take several minutes to download all the files.

```
    # use cased for cased model
    bash tutorials/download_model.sh uncased
    bash tutorials/download_data.sh
```


### Load up the entity profile
Inside the cache directory is
* entity_mappings: where aliases and entity information is stored. We also have the original unfiltered alias to candidate mapping we used for training on Wikipedia. For all other uses, we use the alias to candidate map called `alias2qids.json`, with higher quality aliases.
* type_mappings: where type information is stored. There will be one subfolder per type system. In the `wiki` subfolder, we have a mapping from Wikidata title to Wikidata QID for the types. The `relations` subfolder is where we keep our relationship types and treat them as types in our model.
* kg_mappings: where kg information is stored

When we load a entity profile, we can put it in `edit_mode` to allow us to make changes. Don't forget to set that flag below to edit.

See our read the docs [here](https://bootleg.readthedocs.io/en/latest/gettingstarted/entity_profile.html) for more information on our entity profiles.

In [3]:
# MODIFY THE PATH TO THE DOWNLOADED ENTITY_DB DATA. I saved mine in ../tutorial_data
entity_profile_cache = Path("../tutorial_data/data/entity_db")
# Print out directory structure
for fold in entity_profile_cache.iterdir():
    # Skip showing our prep directory as that's used when loading a model
    if fold.name in ["prep"]:
        continue
    print(fold.name)
    for sub_file in fold.iterdir():
        print("   ", sub_file.name)
        if sub_file.is_dir():
            for subsub_file in sub_file.iterdir():
                print("       ", subsub_file.name)

type_mappings
    hyena_coarse
        config.json
        qid2typenames.json
        type_vocab.json
        qid2typeids.json
    hyena
        qid2typeids.json
        config.json
        type_vocab.json
        qid2typenames.json
    relations
        qid2typeids.json
        config.json
        qid2typenames.json
        type_vocab.json
    wiki
        type_vocab_to_wikidataqid.json
        config.json
        qid2typeids.json
        type_vocab.json
        qid2typenames.json
entity_mappings
    alias2id.json
    alias2qids_unfiltered.json
    config.json
    qid2eid.json
    qid2title.json
    alias2qids.json
kg_mappings
    config.json
    kg_adj.txt
    qid2relations.json


We call `load_from_cache` to load in a profile. If you only want to modify or edit only type information or only kg information, we provide flags to turn off loading some data. In particular, the `no_kg` flag turns off KG information, `no_type` flag turns off type information, and `type_systems_to_load` will specify which types system subfolders to load (`None` means load all).

**Note** that if you do not load up a subset of metadata, you cannot add, remove, or otherwise examine that data. If you set `no_kg = True`, for example, you can't add a new KG connection. This also means if you call `save`, that metadata will not be saved. 

In [4]:
import time
st = time.time()
# Load up ALL profile data - don't forget to set edit_mode = True
# As edit_mode triggers the profile to build some index structures for fast editing, the loading takes a few minutes for all of wiki
ep = EntityProfile.load_from_cache(entity_profile_cache, edit_mode=True, verbose=True)
print(f"Loaded full ep in {time.time() - st}")
st = time.time()

# Load up NO KG information
'''
ep = EntityProfile.load_from_cache(entity_profile_cache, edit_mode=True, verbose=True, no_kg=True)
print(f"Loaded full ep without KG in {time.time() - st}")
st = time.time()
'''
# Load up NO TYPE information
'''
ep = EntityProfile.load_from_cache(entity_profile_cache, edit_mode=True, verbose=True, no_type=True)
print(f"Loaded full ep without type in {time.time() - st}")
st = time.time()
'''

# Load up only wiki type information
'''
ep = EntityProfile.load_from_cache(entity_profile_cache, edit_mode=True, verbose=True, no_kg=True, type_systems_to_load=["wiki"])
print(f"Loaded full ep without KG and only wikidata type in {time.time() - st}")
'''

Loading Entity Symbols


Building edit mode objs: 100%|██████████| 15202497/15202497 [00:40<00:00, 378992.50it/s]


Loading Type Symbols from ../tutorial_data/data/entity_db/type_mappings/hyena_coarse


Building edit mode objs: 100%|██████████| 5832699/5832699 [00:02<00:00, 2095855.72it/s]


Loading Type Symbols from ../tutorial_data/data/entity_db/type_mappings/hyena


Building edit mode objs: 100%|██████████| 5832699/5832699 [00:15<00:00, 382345.45it/s]


Loading Type Symbols from ../tutorial_data/data/entity_db/type_mappings/relations


Building edit mode objs: 100%|██████████| 5832699/5832699 [00:15<00:00, 371134.01it/s]


Loading Type Symbols from ../tutorial_data/data/entity_db/type_mappings/wiki


Building edit mode objs: 100%|██████████| 5832699/5832699 [00:08<00:00, 713689.19it/s]


Loading KG Symbols


Checking relations and building edit mode objs: 100%|██████████| 5832699/5832699 [01:45<00:00, 55485.86it/s] 

Loaded full ep in 510.0096580982208





'\nep = EntityProfile.load_from_cache(entity_profile_cache, edit_mode=True, verbose=True, no_kg=True, type_systems_to_load=["wiki"])\nprint(f"Loaded full ep without KG and only wikidata type in {time.time() - st}")\n'

### Let's see what operations you can call

In [5]:
object_methods = [method_name for method_name in dir(ep)
                  if callable(getattr(ep, method_name))]

print(object_methods)

['__class__', '__delattr__', '__dir__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_read_profile_file', 'add_entity', 'add_mention', 'add_relation', 'add_type', 'get_all_connections', 'get_all_mentions', 'get_all_qids', 'get_all_types', 'get_all_typesystems', 'get_connections_by_relation', 'get_eid', 'get_entities_of_type', 'get_mentions', 'get_mentions_with_scores', 'get_qid_cands', 'get_qid_count_cands', 'get_title', 'get_types', 'is_connected', 'load_from_cache', 'load_from_jsonl', 'mention_exists', 'prune_to_entities', 'qid_exists', 'reidentify_entity', 'remove_mention', 'remove_relation', 'remove_type', 'save', 'save_to_jsonl', 'update_entity']


In [6]:
# Get the title of an entity
print("Title:", ep.get_title("Q62446736"))

# Get mentions for an entity
print("Mentions:", ep.get_mentions("Q62446736"))

# Get type systems
print("Type Systems:", ep.get_all_typesystems())

# Get some types
print("Sample Wikidata Types:", ep.get_all_types("wiki")[:5])

Title: Apple TV+
Mentions: {'apple tv', 'apple worldwide video', 'appletv', 'apple', 'apple tv plus'}
Type Systems: ['hyena_coarse', 'hyena', 'relations', 'wiki']
Sample Wikidata Types: ['town in China', 'tehsil of India', 'subdistrict of China', 'faculty', 'pier']


### Modify the types

Suppose you think the QID Q62446736 should really be a computer type instead of a tv type. First we need to see what types the QID is and find a possible replacement type. Then we need to actually remove and add the types.

In [7]:
# First get existing types
qid = "Q62446736"
type_system = "wiki"
print("Existing Types:", ep.get_types(qid, type_system))

# Get all possible types with the word computer in it
all_types = ep.get_all_types(type_system)

comp_types = [t for t in all_types if "computer" in t.lower()]
print(len(comp_types))
print(comp_types)

Existing Types: ['video streaming service']
73
['computer program', 'minicomputer', 'computer network', 'computer model', 'tablet computer', 'computer network protocol', 'computer model series', 'supercomputer', 'computer scientist', '3D computer graphics software', 'computer', 'vector supercomputer', 'computer system', 'computer form factor', 'personal computer', 'computer-aided engineering', 'computer memory', 'home computer', 'computer language', 'computer science term', 'computer monitor', 'microcomputer', 'first generation computer', 'decimal computer', 'computer key', 'computer programming', 'computer surveillance', 'portable computer', 'computer science', 'computer file', 'one-of-a-kind computer', 'computer architecture', 'computer file management', 'computer-aided design software', 'computer security software', 'computer hardware', 'single-board computer', 'computer-animated film', 'computer data storage', 'desktop computer', 'computer worm', 'computer magazine', 'computer alge

In [8]:
# Remove type
ep.remove_type(qid, "video streaming service", type_system)
# Add type
ep.add_type(qid, "computer", type_system)

print("Modified Types:", ep.get_types(qid, type_system))

Modified Types: ['computer']


### Modify the relations

Suppose you think Q178194 should not have the relation P910 with Q8439242 anymore. Don't worry if you misspecify the relation pair. If the pair doesn't exist, we do nothing.

In [9]:
qid = "Q62446736"
print("Existing Connections:", ep.get_all_connections(qid))

# # Remove relation
ep.remove_relation(qid, "P31", "Q59152282")

print("Modified Connections:", ep.get_all_connections(qid))

Existing Connections: {'P31': ['Q59152282'], 'P137': ['Q312'], 'P127': ['Q312'], 'P17': ['Q30'], 'P407': ['Q1860'], 'P452': ['Q723685'], 'P749': ['Q312'], 'P1454': ['Q891723'], 'P910': ['Q49225405'], 'P1889': ['Q270285']}
Modified Connections: {'P137': ['Q312'], 'P127': ['Q312'], 'P17': ['Q30'], 'P407': ['Q1860'], 'P452': ['Q723685'], 'P749': ['Q312'], 'P1454': ['Q891723'], 'P910': ['Q49225405'], 'P1889': ['Q270285']}


### Add a new entity

To add a new entity, we need to provide the following json object to our entity profile
```
{
    "entity_id": "C000",
    "mentions": [["dog", 10.0], ["dogg", 7.0], ["animal", 4.0]],
    "title": "Dog",
    "types": {"hyena": ["animal"], "wiki": ["dog"]},
    "relations": [
        {"relation": "sibling", "object": "Q345"},
        {"relation": "sibling", "object": "Q567"},
    ],
}
```

In [10]:
title = "Some New Entity"
# The numeric value is the score associated with the mention
mentions = [["computer", 10.0], ["sparkle device", 12.0]]
wiki_types = ["computer"]
d = {
    "entity_id": "NQ1",
    "mentions": mentions,
    "title": title,
    "types": {"wiki": wiki_types},
}
if not ep.qid_exists("NQ1"):
    ep.add_entity(d)

### Remove unused entities

Lastly, for space reasons, it'd be nice to remove the QIDs that are no longer needed in this dump. For that, we can call `prune_to_entities`. This operation will remove all entities not in the set of entities given. In will throw an error, however, if you ask it to remove an entity that doesn't exist.

**Important** we with *reindex* the entities after this call. You *must* call the `fit_to_profile` method described below for these changes to take affect with the model.

In [11]:
# Get entities to keep based on those that have the types in `types_to_add`
type_system = "wiki"
types_to_add = ["computer", "fruit", "meat", "country", "national association football team"]
entities_of_type = set()
for ty in types_to_add:
    entities_of_type.update(set(ep.get_entities_of_type(ty, type_system)))

# Make sure they are all in the dump
for qid in tqdm(entities_of_type):
    if not ep.qid_exists(qid):
        print(f"{qid} does not exists")
        break

100%|██████████| 1523/1523 [00:00<00:00, 265433.60it/s]


In [12]:
print(f"Starting number of entities: {len(ep.get_all_qids())}")
ep.prune_to_entities(entities_of_type)
print(f"Ending number of entities: {len(ep.get_all_qids())}")

Starting number of entities: 5832700
Pruning entity data
Pruning hyena_coarse data
Pruning hyena data
Pruning relations data
Pruning wiki data
Pruning kg data
Ending number of entities: 1523


In [13]:
# Save the new profile
ep.save(entity_profile_cache.parent / "new_profile_wiki")

# Fit Model

If you added or removed entities to your set, you'll need to "refit" your model so the entity embeddings are aligned by running the `fit_profiles` method. You can optionally pass in the model config to `fit_profiles`, and it will adjust the config automatically.

We do not support refitting a `mini` model, but don't worry. We'll show you how to refit a larger model and then create your own `mini` model. For this, you will need the `raw_train_metadata`, described below.

If you only adjusted the mappings and don't need to refit the model, you can skip the `fit_profiles` cells below.

### Download train metadata

If you want to make a `mini` model or want to pass our embedding init vector to `fit_to_profile` (recommended but not required), you will need some metadata associated with our training data to use to create the mini model and to recreate some internal model mappings for running. We have provided those raw files for download at https://bootleg-data.s3-us-west-2.amazonaws.com/data/lateset/raw_train_metadata.tar.gz. This dump also contains the same entity embedding initialization vectors we used for our uncased and cased models in case you want to initialize any new entity embeddings with those vectors. This is totally optional.

Download and dump to `{tutorial_data_dir}/data` folder (or where you'd like)

In [None]:
!echo $tutorial_data_dir/data
!wget https://bootleg-data.s3-us-west-2.amazonaws.com/data/latest/raw_train_metadata.tar.gz -P $tutorial_data_dir/data
!tar -xzvf $tutorial_data_dir/data/raw_train_metadata.tar.gz -C $tutorial_data_dir/data

### Title Embeddings

Another component of our models is our contextualized title embedding that is generated from BERT. To make runtime faster, this is saved as a static embedding file in `<data_config.entity_dir>/<data_config.entity_prep_dir>`. If you call `fit_profiles` (below), we will try to automatically add the new titles. You can turn this off by setting `no_title_emb` to `True`. If this functions works without errors or warning, you can skip the rest of the discussion in this section. If there is an error, to facilitate debugging, we describe our creation process below.

Title embeddings are created in a two-step process.

 1. We pass each entity title through BERT and save the output. This happens by calling
     ```
     python3 -m bootleg.utils.preprocessing.build_static_embeddings \
       --save_file <PATH_TO_SAVE_FILE> \
       --entity_dir ../tutorial_data/data/entity_db \
       --bert_model  bert-base-uncased \
       --output_method pt \
       --batch_size 1024
    ```
 2. The `<PATH_TO_SAVE_FILE>` is passed as the `emb_file` input argument in our config to our title embedding. If the model is being run for the first time, it will prep the title embeddings (adding UNK/PAD rows and zero embeddings for QIDs not in the saved embedding) and save in the `<data_config.entity_dir>/<data_config.entity_prep_dir>` (e.g., `../tutorial_data/data/entity_db/prep`). Upon rerunning, we will use the cached file in the prep directory instead of recreating from scratch.

We provide the saved embedding file before prepping in the downloaded metadata for reference.

When running `fit_profiles`, we add new titles to the embeddings by loading the prepped embedding file and inserting the new embeddings. If a config is provided, we will look in `<data_config.entity_dir>/<data_config.entity_prep_dir>`. If not, we will look in `<train_entity_profile>/prep`. If neither are found, we will not modify the title embeddings. If this happens, you can either do nothing and proceed as normal. This will result in all new embeddings getting a 0 embedding for the title. Otherwise, and we recommend this, you can call `bootleg.utils.preprocessing.build_static_embeddings` to rebuild from scratch on the new profile. If you do this, keep track of the `<PATH_TO_SAVE_FILE>`. You will see where to add this to the config below.

In [14]:
# Load up paths. This path should have the `data` and `models` subfolders. It should be equivalent to
# entity_profile_cache.parent.parent
tutorial_data_dir = Path("../tutorial_data/")

# Base model config to modify
old_config_path = str(tutorial_data_dir/"models/bootleg_uncased/bootleg_config.yaml")
# Provide save path for the new bootleg config yaml file. This can be anywhere.
new_config_save_path = "np_bootleg_config.yaml"
# Base model pth path to modify
model_path = str(tutorial_data_dir/"models/bootleg_uncased/bootleg_wiki.pth")
# Provice model path for new bootleg model. This can be anywhere
new_model_path = "np_bootleg_model.pth"
# Path where you saved the adjusted entity profile above
new_entity_path = str(entity_profile_cache.parent / "new_profile_wiki")

In [15]:
from types import SimpleNamespace
from bootleg.utils.entity_profile.fit_to_profile import fit_profiles

args = SimpleNamespace(
    # If you would like to use the same vector we used to intialize our model, download the raw_train_metadata (shown below) and set the path here to the init vec.
    init_vec_file = None,
    train_entity_profile = str(entity_profile_cache),
    new_entity_profile = str(entity_profile_cache.parent / "new_profile_wiki"),
    model_path = model_path,
    model_config = old_config_path,
    save_model_path = new_model_path,
    save_model_config = new_config_save_path,
    # If you renamed any QIDs, pass the renaming mapping here as a path to the saved mapping dictionary.
    oldqid2newqid = None,
    # If you do not want us to adjust title embeddings, set to True.
    no_title_emb = False,
    # Bert model to use to generate title embeddings. Set to cased if using cased model.
    bert_model="bert-base-uncased",
    # If you'd like us to user a different cache_dir when loading the Hugging Face BERT model, add that path below.
    bert_model_cache=None,
    # If you want to use the CPU to generate new titles embeddings, set to True.
    cpu=False,
)

fit_profiles(args)

{
    "init_vec_file": null,
    "train_entity_profile": "..\/tutorial_data\/data\/entity_db",
    "new_entity_profile": "..\/tutorial_data\/data\/new_profile_wiki",
    "oldqid2newqid": null,
    "no_title_emb": false,
    "bert_model": "bert-base-uncased",
    "bert_model_cache": null,
    "cpu": false,
    "model_path": "..\/tutorial_data\/models\/bootleg_uncased\/bootleg_wiki.pth",
    "model_config": "..\/tutorial_data\/models\/bootleg_uncased\/bootleg_config.yaml",
    "save_model_path": "np_bootleg_model.pth",
    "save_model_config": "np_bootleg_config.yaml"
}
Loading train entity profile from ../tutorial_data/data/entity_db
Loading new entity profile from ../tutorial_data/data/new_profile_wiki
Loading model from ../tutorial_data/models/bootleg_uncased/bootleg_wiki.pth.
Loaded model.
Setting init vector to be all zeros
Saving model at np_bootleg_model.pth
Looking for title embedding in ../tutorial_data/data/entity_db/prep
Attempting to refit title static_table_static_wiki_0122_

Adding new titles: 100%|██████████| 1/1 [00:00<00:00, 66.07it/s]


Dumped config to np_bootleg_config.yaml


If you **just** need to adjust the config, run the following

In [27]:
from bootleg.utils.entity_profile.fit_to_profile import modify_config

# As the model is unchanged, we pass the original model_path
modify_config(old_config_path=old_config_path,
              new_config_path=new_config_save_path,
              model_save_path=new_model_path,
              new_entity_path=new_entity_path)

Dumped config to np_bootleg_config.yaml


### Create mini model (optional)
Once you have a new profile and a fit model, you now have the option to first make a new mini model before running the adjusted model with our annotator (or on your own data). Like above, we will automatically modify the config if you pass it in.

In [16]:
from types import SimpleNamespace
from bootleg.utils.entity_profile.compress_topk_entity_embeddings import compress_topk_embeddings

percent_embedding_to_drop = 0.95
mini_model_path = "np_bootleg_config_mini.yaml"
mini_config_path = "np_bootleg_model_mini.pth"

args = SimpleNamespace(
    # Path to json of QID to count in training data. Downloaded in raw_train_metadata.
    qid2count = str(tutorial_data_dir / "data" / "raw_train_metadata" / "qid_cnts_train.json"),
    perc_emb_drop = percent_embedding_to_drop,
    # This is not needed to run a model
    save_qid2topk_file = str(entity_profile_cache.parent / "new_profile_wiki" / "entity_mappings" / f"qid2eid_top{int(100*(1-percent_embedding_to_drop))}.json"),
    entity_dir = str(entity_profile_cache.parent / "new_profile_wiki"),
    model_path = new_model_path,
    model_config = new_config_save_path,
    save_model_path = mini_model_path,
    save_model_config = mini_config_path,
)

compress_topk_embeddings(args)

Loading entity symbols from ../tutorial_data/data/new_profile_wiki/entity_mappings
Loading qid2count from ../tutorial_data/data/raw_train_metadata/qid_cnts_train.json
Filtering qids
Removing the least popular 0.95 embeddings
Dropping 1446 qids out of 1523
Filtering embeddings
Loading model from np_bootleg_model.pth.


Modifying weights: 100%|██████████| 80/80 [00:00<00:00, 46713.67it/s]
Verifying embeddings: 100%|██████████| 1523/1523 [00:00<00:00, 599917.82it/s]
Setting new ids: 100%|██████████| 1523/1523 [00:00<00:00, 135277.21it/s]

Loaded model.
Verified 0.050558108995403805 percent of embeddings are the same
odict_keys(['learned_entity_embedding.weight', 'eid2topkeid'])
Saving model at np_bootleg_config_mini.yaml





Saving topk to eid at ../tutorial_data/data/new_profile_wiki/entity_mappings/../tutorial_data/data/new_profile_wiki/entity_mappings/qid2eid_top5.json
Dumped config to np_bootleg_model_mini.pth


### Run model
Before running the annotator, we need to load and sanity check the config. We pass this into the annotator.

In [17]:
# Load and sanity check config
import yaml

# !!! Set this to what config you want to use (if you made a mini model, then set this to `mini_config_path`)
config_to_load = new_config_save_path # OR mini_config_path, depending on if you had a mini model or not

# !!! If we were unable to add new titles and you manually called `build_static_embeddings`
# emb_file_title = <PATH_TO_SAVE_FILE>
# !!! If you don't want to adjust titles use the following (use static_wiki_0122_title.pt for our "cased" model)
# emb_file_title = os.path.join(str(tutorial_data_dir), "data", "raw_train_metadata", "static_wiki_0122_title_uncased.pt")
# !!! If we successfully modified the titles, we will ignore
emb_file_title = None

# Load config
with open(config_to_load) as file:
    config = yaml.load(file, Loader=yaml.FullLoader)


for ent in config["data_config"]["ent_embeddings"]:
    # For sanity, we set the raw entity regularization mapping to the path in raw_train_metadata. If you did not
    # download this folder, that's okay. The model should run without it.
    if ent["load_class"] == "LearnedEntityEmb":
        if "regularize_mapping" in ent["args"]:
            ent["args"]["regularize_mapping"] = os.path.join(str(tutorial_data_dir), "data", "raw_train_metadata", "qid2reg_pow.csv")
    # If you had to manually adjust title embeddings, that file will be correctly set here. If everything worked fine
    # with `fit_profiles`, we set emb_file_title to be None as the model won't need to load from the raw file.
    if ent["load_class"] == "StaticEmb" and ent["key"] == "title_static":
        if "emb_file" in ent["args"] and emb_file_title is not None:
            ent["args"]["emb_file"] = emb_file_title
            
print(ujson.dumps(config, indent=4))

{
    "data_config": {
        "data_dir": "data",
        "data_prep_dir": "prep",
        "dev_dataset": {
            "file": "merged_sampled.jsonl",
            "use_weak_label": true
        },
        "emb_dir": "..\/tutorial_data\/data\/new_profile_wiki",
        "ent_embeddings": [
            {
                "args": {
                    "learned_embedding_size": 200,
                    "regularize_mapping": "..\/tutorial_data\/data\/raw_train_metadata\/qid2reg_pow.csv"
                },
                "cpu": false,
                "freeze": false,
                "key": "learned",
                "load_class": "LearnedEntityEmb"
            },
            {
                "args": {
                    "emb_file": "data\/raw_train_metadata\/static_wiki_0122_title_uncased.pt",
                    "proj": 256
                },
                "cpu": false,
                "freeze": false,
                "key": "title_static",
                "load_class": "StaticEmb"
   

In [19]:
# Load new annotator with our config - notice how it does have to reprep some things
from bootleg.end2end.bootleg_annotator import BootlegAnnotator

# You can also pass `return_embs=True` to get the embeddings
ann = BootlegAnnotator(config=config, device=-1, return_embs=False)

[2021-04-28 17:22:14,898][INFO] emmental.meta:122 - Logging was already initialized to use bootleg_logs/wiki_full_ft/2021_04_28/15_32_06/eb46bca7.  To configure logging manually, call emmental.init_logging before initialiting Meta.
[2021-04-28 17:22:14,943][INFO] emmental.meta:62 - Loading Emmental default config from /dfs/scratch0/lorr1/env_bootleg_38/lib/python3.8/site-packages/emmental/emmental-default-config.yaml.
[2021-04-28 17:22:14,944][INFO] emmental.meta:171 - Updating Emmental config from user provided config.
[2021-04-28 17:22:14,945][INFO] emmental.utils.seed:27 - Set random seed to 1234.
[2021-04-28 17:22:16,521][INFO] emmental.model:72 - Created emmental model Bootleg that contains task set().
[2021-04-28 17:22:19,615][INFO] bootleg.tasks.task_getters:12 - Loading embeddings...
[2021-04-28 17:22:19,864][INFO] emmental.task:78 - Created task: NED
[2021-04-28 17:22:19,866][INFO] emmental.model:108 - Moving bert module to CPU.
[2021-04-28 17:22:19,873][INFO] emmental.model:1

In [20]:
# These are some of the aliases our model will possible extract from sentences...they are all about computers!
print(list(ann.all_aliases_trie.keys())[:10])

['sa', 'saint vincent', 'saint vincent and the grenadines', 'saint vincent and the grenadines national u20 football team', 'saint vincent and the grenadines national under20 football team', 'saint vincent and the grenadines national football team', 'saint vincent and the grenadines national team', 'saint vincent and the grenadines u20', 'saint vincent and grenadines', 'saint vincent amp the grenadines', 'saint vincent national football team', 'saint vincent the grenadines', 'saint vicent and the grenadines', 'saint kitts and', 'saint kitts and nevis', 'saint kitts and nevis national football team', 'saint kitts and nevis national team', 'saint kitts and nevis national under20 football team', 'saint kitts and nevis federation of', 'saint kitts amp nevis', 'saint kitts amp nevis national football team', 'saint kitts nevis', 'saint kittsnevis', 'saint lucia', 'saint lucia national football team', 'saint lucia national team', 'saint lucia national under17 football team', 'saint lucia u17 m

In [21]:
# Extract some mentions...notice that there is less ambiguity as well because we removed a lot of QIDs from our dump
ann.label_mentions("How did San Marino score")

{'qids': [['Q185350']],
 'probs': [[0.6569564342498779]],
 'titles': [['San Marino national football team']],
 'cands': [[['Q238',
    'Q185350',
    'Q1813168',
    'Q7414851',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1',
    '-1']]],
 'cand_probs': [[array([0.29356992, 0.65695643, 0.02778879, 0.02168491, 0.        ,
          0.        , 0.        , 0.        , 0.        , 0.        ,
          0.        , 0.        , 0.        , 0.        , 0.        ,
          0.        , 0.        , 0.        , 0.        , 0.        ,
          0.        , 0.        , 0.        , 0.        , 0.        ,
          0.        , 0.        , 0.        , 0.        , 0.        ],
         dtype=float32)]],
 'spans': [[[2, 4]]],
 'aliases': [['san marino']]}