## Tutorial on how to use the ELQ entity-linking model (`ELQEntityLinker`) with text descriptions

In [1]:
## imports
import os
from pprint import pprint
from time import time
from elqel.entity_linking import ELQEntityLinker

## parameters
# Output location: every predictions file written by this notebook goes here.
annotations_dir = "./ann_preds"

# Model inputs: both values are handed to ELQEntityLinker further down.
models_path = "./models/"
biencoder_path = "./biencoder/pytorch_model.bin"

## Example

To produce predictions, ELQ requires the input texts as a Python list of dictionaries, each with an `id` and a `text` key, as in the example below:

In [2]:
# Input records to annotate: one dict per text, pairing a unique "id"
# with the lower-cased "text" description.
data_example = [
    {
        "id": "BM-A_1936-1012-44",
        "text": "Figure (woman) wearing Rainbow Dance costume. Made of red, black, blue, gold painted earthenware.".lower(),
    },
    {
        "id": "BM-A_1940-0716-13",
        "text": "Figure (Gaṇesa). Folk deity, seated feet crossed holding 2 lotuses. Made of bronze.".lower(),
    },
]

# directory to store annotations
# exist_ok=True makes this cell safe to re-run and removes the gap between
# checking for the directory and creating it. If the path exists but is not
# a directory, this raises FileExistsError here instead of failing later on save.
os.makedirs(annotations_dir, exist_ok=True)

Once input data is properly formatted, predictions can be obtained as follows:

In [3]:
# Build the linker in "unique" mode (the mode is echoed as
# "Prediction type: unique" when predictions are produced).
elq_model = ELQEntityLinker(
    models_path=models_path,
    biencoder_path=biencoder_path,
    prediction_type="unique",
)

Loading biencoder model
Loading candidate entities
Loading id2title
Loading id2text
Loading id2wikidata


In [4]:
# predict
# Run entity linking over the records in `data_example`. The result is indexed
# per record further down (predictions[0] is the output for the first record).
predictions = elq_model.entity_linking(data_example)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.33it/s]

Prediction type: unique
[32mAnnotations (raw) saved: False[0m





**Predictions Raw format example**

The example below shows the raw format output of the model predictions.

- `id` (str): unique text identifier
- `pred_triples` (list of tuples): Each tuple contains the wikipedia_id of the linked entity, followed by the start and end token indices of the tagged item
- `pred_tuples_string` (list of lists): Each sublist contains the linked AAT title and the tagged item (str), in that order
- `scores` (list of floats): List of scores for the best candidate for each tagged item
- `text` (str): text being annotated
- `tokens` (list of ints): token ids of the tokenized text

In [5]:
# Pretty-print the raw prediction for the first input record.
pprint(predictions[0])

{'id': 'BM-A_1936-1012-44',
 'pred_triples': [('31871', 8, 9),
                  ('30731', 19, 20),
                  ('30193', 20, 23),
                  ('64866', 1, 2)],
 'pred_tuples_string': [['costume', 'costume'],
                        ['painted', 'painted'],
                        ['earthen ware', 'earthenware'],
                        ['human figures', 'figure']],
 'scores': [8.123689651489258,
            5.847492218017578,
            5.380870819091797,
            4.930703163146973],
 'text': 'figure (woman) wearing rainbow dance costume. made of red, black, '
         'blue, gold painted earthenware.',
 'tokens': [101,
            3275,
            1006,
            2450,
            1007,
            4147,
            10098,
            3153,
            9427,
            1012,
            2081,
            1997,
            2417,
            1010,
            2304,
            1010,
            2630,
            1010,
            2751,
            4993,
            3

To convert predictions to a pandas DataFrame format, use the `preds2dataframe` method:

In [6]:
# Tabulate the predictions and also write them to a CSV file inside `annotations_dir`.
# NOTE(review): no predictions are passed in, so this presumably uses the ones kept
# on `elq_model` by the last `entity_linking` call — confirm.
predictions_df = elq_model.preds2dataframe(
    save_path=os.path.join(annotations_dir, "predictions_df.csv")
)
predictions_df

[32mAnnotations (DF) saved: ./ann_preds/predictions_df.csv[0m


Unnamed: 0,id,text,chunk_text,chunk_start,chunk_end,aat
0,BM-A_1936-1012-44,figure (woman) wearing rainbow dance costume. ...,figure,0,6,300404114
1,BM-A_1936-1012-44,figure (woman) wearing rainbow dance costume. ...,costume,37,44,300178802
2,BM-A_1936-1012-44,figure (woman) wearing rainbow dance costume. ...,painted,77,84,300161986
3,BM-A_1936-1012-44,figure (woman) wearing rainbow dance costume. ...,earthenware,85,96,300140803
4,BM-A_1940-0716-13,"figure (ganesa). folk deity, seated feet cross...",figure,0,6,300189808
5,BM-A_1940-0716-13,"figure (ganesa). folk deity, seated feet cross...",feet,36,40,300310200
6,BM-A_1940-0716-13,"figure (ganesa). folk deity, seated feet cross...",bronze,76,82,300010957


Using `"multiple"` as the `prediction_type` in `ELQEntityLinker` makes the output include the best candidates found by the model for each tagged item, rather than only the top one. An example is provided below.

In [7]:
# Build the linker again, this time in "multiple" mode. This rebinds `elq_model`,
# so the "unique" instance created earlier is no longer reachable by that name.
elq_model = ELQEntityLinker(
    models_path=models_path,
    biencoder_path=biencoder_path,
    prediction_type="multiple",
)

Loading biencoder model
Loading candidate entities
Loading id2title
Loading id2text
Loading id2wikidata


In [8]:
# entity linking
# Run entity linking on the same `data_example` records with the "multiple"-mode
# model. The result is indexed per record further down (predictions_mult[0]).
predictions_mult = elq_model.entity_linking(data_example)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.44it/s]

Prediction type: multiple
[32mAnnotations (raw) saved: False[0m





**Predictions Raw format example**

The example below shows the raw format output of the model predictions.

- `id` (str): unique text identifier
- `pred_triples` (list of tuples): Each tuple contains:
  - list of wikipedia_ids for the best candidates
  - start and end token indices of the tagged item
- `pred_tuples_string` (list of lists): Each sublist contains:
  - list of linked AAT titles for the best candidates
  - tagged item (str)
- `scores` (list of lists): Each sublist contains the scores of the best candidates for one tagged item
- `text` (str): text being annotated
- `tokens` (list of ints): token ids of the tokenized text

In [9]:
# Pretty-print the raw "multiple"-mode prediction for the first input record.
pprint(predictions_mult[0])

{'id': 'BM-A_1936-1012-44',
 'pred_triples': [(['64866',
                    '33638',
                    '68703',
                    '58071',
                    '48890',
                    '23377',
                    '44481',
                    '65317',
                    '34316',
                    '23401'],
                   1,
                   2),
                  (['31871',
                    '56725',
                    '40719',
                    '56531',
                    '23247',
                    '56533',
                    '38754',
                    '56532',
                    '38712',
                    '33638'],
                   8,
                   9),
                  (['30731',
                    '25473',
                    '25137',
                    '64177',
                    '58165',
                    '30730',
                    '24135',
                    '25056',
                    '43621',
                    '61153'],
         

In [10]:
# Tabulate the "multiple"-mode predictions and also write them to a CSV file
# inside `annotations_dir`.
# NOTE(review): no predictions are passed in, so this presumably uses the ones kept
# on `elq_model` by the last `entity_linking` call — confirm.
predictions_df_mult = elq_model.preds2dataframe(
    save_path=os.path.join(annotations_dir, "predictions_mult_df.csv")
)
predictions_df_mult

[32mAnnotations (DF) saved: ./ann_preds/predictions_mult_df.csv[0m


Unnamed: 0,id,text,chunk_text,chunk_start,chunk_end,aat,aat_str,best_score,candidates
0,BM-A_1936-1012-44,figure (woman) wearing rainbow dance costume. ...,figure,0,6,300404114,human figures,4.930703,"{'300404114': ('human figures', 4.930703163146..."
1,BM-A_1936-1012-44,figure (woman) wearing rainbow dance costume. ...,costume,37,44,300178802,costume,8.12369,"{'300178802': ('costume', 8.123689651489258), ..."
2,BM-A_1936-1012-44,figure (woman) wearing rainbow dance costume. ...,painted,77,84,300161986,painted,5.847492,"{'300161986': ('painting', -14.500349044799805..."
3,BM-A_1936-1012-44,figure (woman) wearing rainbow dance costume. ...,earthenware,85,96,300140803,earthen ware,5.380871,"{'300140803': ('earthenware', -5.8615231513977..."
4,BM-A_1940-0716-13,"figure (ganesa). folk deity, seated feet cross...",figure,0,6,300189808,figures,10.217789,"{'300189808': ('figures', 10.217788696289062),..."
5,BM-A_1940-0716-13,"figure (ganesa). folk deity, seated feet cross...",feet,36,40,300310200,foot,5.481364,"{'300310200': ('feet', -10.685483932495117), '..."
6,BM-A_1940-0716-13,"figure (ganesa). folk deity, seated feet cross...",bronze,76,82,300010957,copper-tin alloy,10.927487,"{'300010957': ('copper-tin alloy', 10.92748737..."
