In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import pandas as pd
import json
import re
import nltk
import string

def create_regex_pattern(term):
    """Return a regex source string matching ``term`` as a whole token.

    The term is escaped, so it is always matched literally. Instead of
    ``\\b`` the pattern uses the lookarounds ``(?<!\\w)`` / ``(?!\\w)``:
    ``\\b`` needs a word character on one side, so a term that starts or
    ends with punctuation (for example ``"U.S."``) could never match when
    followed by a space. For terms that start and end with a word character
    both forms accept exactly the same matches.

    Args:
        term: Literal text to search for.

    Returns:
        A pattern string usable with ``re`` or ``Series.str.contains``.
    """
    return r'(?<!\w)' + re.escape(term) + r'(?!\w)'

def get_sentences_containing_entity(row):
    """Return the sentences of an article that mention the row's entity.

    Args:
        row: Mapping (e.g. a DataFrame row) with the article text under
            ``'content_en'`` and the entity surface form under ``'text'``.

    Returns:
        list[str]: Sentences of ``row['content_en']`` in which the entity
        occurs as a whole token, in document order. Empty when the article
        text is missing, not a string (e.g. NaN after a left merge) or empty.
    """
    content = row['content_en']
    # Missing or empty article text has no sentences to offer; bail out
    # before handing a non-string to the sentence tokenizer.
    if not isinstance(content, str) or not content:
        return []
    # Compile once per row and filter a plain list: this avoids building a
    # pandas Series per row and the ``.str`` accessor on an empty Series.
    pattern = re.compile(create_regex_pattern(row['text']))
    return [
        sentence
        for sentence in nltk.sent_tokenize(content)
        if pattern.search(sentence)
    ]

def print_progress_bar(iteration, total, bar_length=50):
    """Draw a one-line text progress bar that overwrites itself.

    Args:
        iteration: Number of items handled so far.
        total: Total number of items (must be non-zero).
        bar_length: Width of the bar in characters.

    The line ends with a carriage return instead of a newline, so the next
    call redraws the bar in place.
    """
    fraction = float(iteration) / float(total)
    # One character fewer than the rounded share of the bar is filled.
    filled = int(round(fraction * bar_length) - 1)
    bar = ('=' * filled).ljust(bar_length)
    percent = int(fraction * 100)

    print(f'Progress: [{bar}] {percent}%', end='\r')

def build_query_for_bert_ned(
    entity_text, sentence_mention, option
):
    """Render the question string that is fed to the BERT NED model.

    Args:
        entity_text: Surface form of the entity being disambiguated.
        sentence_mention: Sentence (or context window) containing the entity.
        option: Description of one candidate entity.

    Returns:
        str: The filled-in query; the literal ``[SEP]`` separates the
        question from the candidate description.
    """
    template = " Is '{entity_mention}' in the context of: '{sentence_mention}', referring to [SEP] {option}?"
    fields = {
        'entity_mention': entity_text,
        'sentence_mention': sentence_mention,
        'option': option,
    }
    return template.format(**fields)

def join_text_tokens(tokens):
    """Glue tokens back into text, attaching punctuation to the word before.

    A token found in ``string.punctuation`` is appended without a leading
    space; every other token is preceded by one space. Surrounding
    whitespace is stripped from the result.

    Args:
        tokens: Sequence of string tokens.

    Returns:
        str: The re-assembled text.
    """
    pieces = []
    for token in tokens:
        if token in string.punctuation:
            pieces.append(token)
        else:
            pieces.append(' ' + token)
    return ''.join(pieces).strip()

def find_surrounding_text(target_word, sentence, n_tokens=5):
    """Return a token window around every mention of ``target_word``.

    Args:
        target_word: Text to locate; matched literally, case-insensitively
            and only as a whole token.
        sentence: Text to search in.
        n_tokens: Maximum number of tokens kept on each side of a mention.
            Values of zero or below keep no neighbouring tokens.

    Returns:
        list[str]: One window per (non-overlapping) mention, in order of
        appearance; each window is ``target_word`` with up to ``n_tokens``
        tokens before and after it. Empty when there is no mention.
    """
    # Clamp the window: ``tokens[-0:]`` would return the whole list instead
    # of nothing, and negative sizes would slice from the wrong end.
    window = max(n_tokens, 0)

    # Lookarounds instead of ``\b`` so that targets starting or ending with
    # punctuation (e.g. "U.S.") are still found.
    pattern = re.compile(
        r'(?<!\w)' + re.escape(target_word) + r'(?!\w)',
        flags=re.IGNORECASE,
    )

    surrounding_words = []
    for match in pattern.finditer(sentence):
        before_tkns = nltk.word_tokenize(sentence[:match.start()])
        after_tkns = nltk.word_tokenize(sentence[match.end():])
        before_n_tokens = before_tkns[-window:] if window else []
        after_n_tokens = after_tkns[:window]
        all_tokens = before_n_tokens + [target_word] + after_n_tokens
        surrounding_words.append(join_text_tokens(all_tokens))
    return surrounding_words

class BertNED():
    """Disambiguate entity mentions with the ``JordiAb/BERT_NED`` classifier.

    Every candidate description is scored by asking the model whether the
    entity, in a given context, refers to that candidate. The candidate with
    the highest class-1 probability over all contexts is selected.
    """

    # Pretrained model / tokenizer pair that is loaded.
    MODEL_NAME = 'JordiAb/BERT_NED'
    # Longest tokenized query (special tokens included) sent to the model.
    MAX_SEQUENCE_LENGTH = 512
    # Context window (tokens on each side of the mention) tried first when a
    # query is too long, and how much it shrinks after every failed attempt.
    INITIAL_WINDOW_TOKENS = 30
    WINDOW_STEP_TOKENS = 5

    def __init__(self):
        # predictions will be made on the gpu if there is a gpu available
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu"
        )
        # load the BERT NED model in evaluation mode on the chosen device
        self.model = BertForSequenceClassification.from_pretrained(
            self.MODEL_NAME
        ).eval().to(self.device)
        # load the BERT NED tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(
            self.MODEL_NAME
        )

    def _encode(self, query, truncate=False):
        """Tokenize ``query`` into PyTorch tensors.

        Receives:
         - query: str: Query string built by ``build_query_for_bert_ned``.
         - truncate: bool: When True, the tokenizer cuts the sequence down to
           ``MAX_SEQUENCE_LENGTH`` tokens.
        Returns:
         - The tokenizer output holding ``input_ids`` and ``attention_mask``.
        """
        extra = {}
        if truncate:
            extra = {
                'max_length': self.MAX_SEQUENCE_LENGTH,
                'truncation': True,
            }
        return self.tokenizer.encode_plus(
            query,                         # Sentence to encode.
            add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
            return_attention_mask=True,    # Construct attention masks.
            return_tensors='pt',           # Return pytorch tensors.
            **extra
        )

    def _predict_proba(self, input_ids, attention_mask):
        """
        Make a call to our BERT model in order to compute the probability of a tokenized string like:
        " Is '{entity}' in the context of: '{context}', referring to [SEP] {option}?"
        Receives:
         - input_ids: Tensor: Tokenized string
         - attention_mask: Tensor: Tokenized attention mask
        Returns:
         - np.array([prob0, prob1]): numpy array containing the two probabilities. Probability of belonging to
           class 0 and probability of belonging to class 1.
        """
        # move input ids to GPU (if available)
        iids = input_ids.to(self.device)
        # move attention mask to GPU (if available)
        att_msk = attention_mask.to(self.device)
        with torch.no_grad():  # avoid gradient computation to save memory
            # forward pass of the model
            outputs = self.model(
                input_ids=iids,
                token_type_ids=None,
                attention_mask=att_msk
            )

        # get logits of prediction
        logits = outputs.logits
        # Use softmax to get probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        # meant for one observation, so take probabilities[0], move the
        # resulting tensor to cpu and return it as a numpy array
        return probabilities[0].cpu().numpy()

    def _compute_scores(self, entity, context, options):
        """
        Given a single entity and context where the entity is mentioned.
        For all options, encode the input as expected by our BERT model and predict the probability
        of a single option being the correct option given the context where the entity is being mentioned.

        When a query is longer than MAX_SEQUENCE_LENGTH tokens, the context is replaced by a window of
        tokens around the first entity mention (or by the string 'None' when no mention is found) and the
        window is shrunk step by step. If the query is still too long once the window cannot shrink any
        further, the tokenizer truncates it, so the method always terminates.
        Receives:
         - entity: str: The entity to disambiguate.
         - context: str: Sentence or context window where the entity is being mentioned.
         - options: list: List of descriptions of the possible entities.
        Returns:
         - scores: np.array: (n_options x 2) matrix of probabilities for each option. Each option has 2 probabilities.
           Probability of belonging to class 0 and probability of belonging to class 1.
        Example:
         entity = "Donald Trump"
         context = "A personal aide to U.S. President Donald Trump gave positive to coronavirus",
         options =  [
            'Donald Trump, president of the United States from 2017 to 2021',
            'Donald Trump, American physician',
            'Donald Trump, Wikimedia disambiguation page',
            'Donald Trump, song by Mac Miller',
            'Donald Trump, segment of an episode of Last Week Tonight',
            "Donald Trump, character Donald Trump in Anthony Davis's opera The Central Park Five",
            '2016 United States presidential election, 58th quadrennial U.S. presidential election'
         ]
         >>> compute_scores(entity, context, options)
         >>> array(
          [[9.2529238e-04, 9.9907470e-01],
           [9.9969673e-01, 3.0326008e-04],
           [9.9955863e-01, 4.4132231e-04],
           [9.9969578e-01, 3.0429140e-04],
           [9.9968457e-01, 3.1540715e-04],
           [9.9969685e-01, 3.0312332e-04],
           [9.9966502e-01, 3.3492901e-04]],
         dtype=float32
        )
        """
        scores = []
        # For each option in the list of possible options ...
        for option in options:
            n_tokens = self.INITIAL_WINDOW_TOKENS
            while True:
                # build the query string required by our BERT model. Namely:
                # " Is '{entity}' in the context of: '{context}', referring to [SEP] {option}?"
                query = build_query_for_bert_ned(
                    entity, context, option
                )
                # encode and tokenize the query string (no truncation yet)
                encoded_dict = self._encode(query)

                if encoded_dict['input_ids'].shape[1] <= self.MAX_SEQUENCE_LENGTH:
                    # the query fits the model, stop shrinking
                    break

                if n_tokens <= 0:
                    # The context window cannot shrink any further (the
                    # option itself is too long, or the context is already
                    # 'None'). Let the tokenizer cut the sequence instead of
                    # looping forever.
                    encoded_dict = self._encode(query, truncate=True)
                    break

                # the query is too long: shorten the context to the
                # "n_tokens" that surround the entity mention
                contexts = find_surrounding_text(
                    target_word=entity,
                    sentence=context,
                    n_tokens=n_tokens
                )
                # the entity might be mentioned several times in a single
                # sentence, keep only the first mention; fall back to "None"
                # when there is no mention at all. The shortened context also
                # carries over to the remaining options (as it always did).
                context = contexts[0] if contexts else 'None'
                # decrease the number of tokens that surround the mention
                n_tokens -= self.WINDOW_STEP_TOKENS

            # probability of this option, computed on the fitting encoding
            prob = self._predict_proba(
                input_ids=encoded_dict['input_ids'],
                attention_mask=encoded_dict['attention_mask']
            )
            scores.append(prob)
        # reshape keeps the documented (n_options x 2) shape even when
        # there are no options
        return np.array(scores).reshape(-1, 2)

    def desambiguate(self, entity, contexts, options, verbose=False):
        """
        Function used to disambiguate an `entity` when there are several possible entity matches.
        Uses a BERT based model that predicts a probability of an option being the correct option
        given the context where the entity is being mentioned.
        -------------------------------
        Receives:
         - entity: str: The entity to disambiguate.
         - contexts: list: List of sentences or context windows where the entity is being mentioned.
         - options: list: List of descriptions of the possible entities.
         - verbose: bool: When True, print every option with its best score and the selected option.
        Returns:
         - best_option_ix: int: Index where the option with the highest probability is located in the options list
        Raises:
         - ValueError: if `contexts` or `options` is empty.
        Example:
         entity = "Donald Trump"
         contexts = [
           "A personal aide to U.S. President Donald Trump gave positive to coronavirus",
           "It's not the first scare for Donald Trump. In early March it was announced that Brazilian President Jair Bolsonaro, gave positive to coronavirus, after participating in a meeting in Florida where the US president was."
         ]
         options = [
            'Donald Trump, president of the United States from 2017 to 2021',
            'Donald Trump, American physician',
            'Donald Trump, Wikimedia disambiguation page',
            'Donald Trump, song by Mac Miller',
            'Donald Trump, segment of an episode of Last Week Tonight',
            "Donald Trump, character Donald Trump in Anthony Davis's opera The Central Park Five",
            '2016 United States presidential election, 58th quadrennial U.S. presidential election'
        ]
        >>> desambiguate(entity, contexts, options, verbose=True)
            Entity: Donald Trump
            Options:
            	Donald Trump, president of the United States from 2017 to 2021: 0.9990746974945068
            	Donald Trump, American physician: 0.00032277879654429853
            	Donald Trump, Wikimedia disambiguation page: 0.00044132230686955154
            	Donald Trump, song by Mac Miller: 0.0003152454155497253
            	Donald Trump, segment of an episode of Last Week Tonight: 0.00031540714553557336
            	Donald Trump, character Donald Trump in Anthony Davis's opera The Central Park Five: 0.00030414783395826817
            	2016 United States presidential election, 58th quadrennial U.S. presidential election: 0.0005287989042699337
            Best option: Donald Trump, president of the United States from 2017 to 2021
            -------
        >>> 0
        """
        # fail early with a clear message instead of an opaque numpy error
        if len(contexts) == 0:
            raise ValueError("`contexts` must contain at least one sentence")
        if len(options) == 0:
            raise ValueError("`options` must contain at least one candidate")

        sents_scores = []
        # entity might be mentioned in several sentences (contexts)
        # for each context where entity is mentioned, compute scores of all options
        for context in contexts:
            scores = self._compute_scores(entity, context, options)
            # append probabilities of belonging to class 1 (for each option)
            sents_scores.append(scores[:, 1])

        # matrix with n_contexts rows and n_options columns
        cont_scores_mat = np.array(sents_scores)
        # best probability of each option across all contexts
        best_options = np.max(cont_scores_mat, axis=0)
        # index of the best option, as a plain Python int
        best_option_ix = int(np.argmax(best_options))

        if verbose:
            print('Entity: {0}'.format(entity))
            print("Options:")
            for o, s in zip(options, best_options):
                print("\t{0}: {1}".format(o, s))
            print("Best option: {0}".format(options[best_option_ix]))
            print('-------')

        return best_option_ix

In [2]:
# Example inputs for a quick check of BertNED: one ambiguous entity, the
# sentences that mention it, and the candidate descriptions to choose from.
entity = "Donald Trump"
contexts = [
   "A personal aide to U.S. President Donald Trump gave positive to coronavirus", 
   "It's not the first scare for Donald Trump. In early March it was announced that Brazilian President Jair Bolsonaro, gave positive to coronavirus, after participating in a meeting in Florida where the US president was."
]
options = [
    'Donald Trump, president of the United States from 2017 to 2021',
    'Donald Trump, American physician',
    'Donald Trump, Wikimedia disambiguation page',
    'Donald Trump, song by Mac Miller',
    'Donald Trump, segment of an episode of Last Week Tonight',
    "Donald Trump, character Donald Trump in Anthony Davis's opera The Central Park Five",
    '2016 United States presidential election, 58th quadrennial U.S. presidential election'
]

In [3]:
# Build the disambiguator; this loads the 'JordiAb/BERT_NED' model and tokenizer.
bert_ned = BertNED()

config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [4]:
# Disambiguate the example entity; verbose=True also prints each option's
# score and the selected option. The return value is the winning option index.
bert_ned.desambiguate(entity, contexts, options, verbose=True)

Entity: Donald Trump
Options:
	Donald Trump, president of the United States from 2017 to 2021: 0.9990746974945068
	Donald Trump, American physician: 0.00032277879654429853
	Donald Trump, Wikimedia disambiguation page: 0.00044132230686955154
	Donald Trump, song by Mac Miller: 0.0003152454155497253
	Donald Trump, segment of an episode of Last Week Tonight: 0.00031540714553557336
	Donald Trump, character Donald Trump in Anthony Davis's opera The Central Park Five: 0.00030414783395826817
	2016 United States presidential election, 58th quadrennial U.S. presidential election: 0.0005287989042699337
Best option: Donald Trump, president of the United States from 2017 to 2021
-------


0

In [12]:
# Class probabilities (one [class 0, class 1] row per option) for the first
# example context only.
bert_ned._compute_scores(entity, contexts[0], options)

array([[9.2529238e-04, 9.9907470e-01],
       [9.9969673e-01, 3.0326008e-04],
       [9.9955863e-01, 4.4132231e-04],
       [9.9969578e-01, 3.0429140e-04],
       [9.9968457e-01, 3.1540715e-04],
       [9.9969685e-01, 3.0312332e-04],
       [9.9966502e-01, 3.3492901e-04]], dtype=float32)

In [13]:
# Load the records used for validation from JSON.
# NOTE(review): the 'sb' in the file name presumably refers to the reference
# model whose answers are stored in the 'sb_answer' column — confirm.
with open("../extract_ners/ask_sb_sofar_val.json", "r") as file:
    data_missing_ned = json.load(file)

In [14]:
# Tabulate the loaded validation records.
unobserved_df = pd.DataFrame.from_dict(data_missing_ned)

In [16]:
# Load the news articles (with their metadata) from JSON.
path_file = '../../news_data/news_with_metadata.json'
with open(path_file, 'r') as jfile:
    processed_news_articles = json.load(jfile)

In [17]:
# Tabulate the news articles so they can be joined to the validation records.
processed_news_articles_df = pd.DataFrame.from_dict(processed_news_articles)

In [18]:
# Bring the article text ('content_en') onto every validation record by
# matching on the headline column 'h1'. The left join keeps every record,
# including those without a matching article.
validation_df = unobserved_df.merge(
    processed_news_articles_df[['h1', 'content_en']],
    on='h1',
    how='left',
)

In [19]:
# The raw inputs are no longer referenced after the merge; drop the names so
# their memory can be reclaimed.
del processed_news_articles_df, processed_news_articles, data_missing_ned

In [20]:
# Inspect the merged frame.
validation_df

Unnamed: 0,text,ner,nerConfidences,clean_text,h1,wikidata_search_entries,ix,sb_answer,options_given,content_en
0,Zacatecas,CITY,{'LOCATION': 0.98695854325281},ZACATECAS,Cenace advierte de nuevos cortes de luz “rotat...,"[{'id': 'Q80269', 'display_label': 'Zacatecas'...",Zacatecas-Cenace advierte de nuevos cortes de ...,The news article is most likely referring to: ...,"1. Zacatecas, state of Mexico \n2. Zacatecas, ...",The National Energy Control Center (Cenace) an...
1,Culiacán,PERSON,{'PERSON': 0.43934991439724},CULIACAN,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q211760', 'display_label': 'Culiacán'...",Culiacán-Madres rastreadoras de Sinaloa exigen...,The news article is most likely referring to: ...,"1. Culiacán, capital and largest city in the M...","Culiacán, Sin.- In the esplanade of Palacio de..."
2,Palacio de Gobierno,LOCATION,{'LOCATION': 0.88592453954918},PALACIO DE GOBIERNO,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q385938', 'display_label': 'Governmen...",Palacio de Gobierno-Madres rastreadoras de Sin...,The news article is most likely referring to: ...,"1. Government Palace, seat of the executive br...","Culiacán, Sin.- In the esplanade of Palacio de..."
3,State,ORGANIZATION,{'ORGANIZATION': 0.44361875069213},STATE,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q7275', 'display_label': 'state', 'di...",State-Madres rastreadoras de Sinaloa exigen la...,The news article is most likely referring to: ...,"1. state, organised community living under a s...","Culiacán, Sin.- In the esplanade of Palacio de..."
4,Fresnillo,CITY,{'LOCATION': 0.86880109112666},FRESNILLO,“Ya no hagan de emoción”: Forcejean a llegada ...,"[{'id': 'Q1816543', 'display_label': 'Fresnill...",Fresnillo-“Ya no hagan de emoción”: Forcejean ...,The news article is most likely referring to: ...,"1. Fresnillo, city in Fresnillo Municipality, ...","Fresnillo, Zacatecas.- President Andrés Manuel..."
...,...,...,...,...,...,...,...,...,...,...
51996,Mexico,COUNTRY,{'LOCATION': 0.86121711995959},MEXICO,Mexico’s Financial Intelligence Unit freezes 1...,"[{'id': 'Q96', 'display_label': 'Mexico', 'dis...",Mexico-Mexico’s Financial Intelligence Unit fr...,The news article is most likely referring to: ...,"1. Mexico, country in North America \n2. Mexic...","Santiago Nieto, the head of the Financial Inte..."
51997,Santiago,CITY,{'LOCATION': 0.60977913237806},SANTIAGO,Mexico’s Financial Intelligence Unit freezes 1...,"[{'id': 'Q2887', 'display_label': 'Santiago', ...",Santiago-Mexico’s Financial Intelligence Unit ...,The news article is most likely referring to: ...,"1. Santiago, capital city of Chile \n2. St. Ja...","Santiago Nieto, the head of the Financial Inte..."
51998,Peña Nieto,PERSON,{'PERSON': 0.74281429426886},PENA NIETO,Mexico’s Financial Intelligence Unit freezes 1...,"[{'id': 'Q296741', 'display_label': 'Enrique P...",Peña Nieto-Mexico’s Financial Intelligence Uni...,The news article is most likely referring to: ...,"1. Enrique Peña Nieto, 64th President of Mexic...","Santiago Nieto, the head of the Financial Inte..."
51999,Pemex,ORGANIZATION,{'ORGANIZATION': 0.96210365359227},PEMEX,"Cofece investiga, otra vez, comercialización d...","[{'id': 'Q871308', 'display_label': 'Pemex', '...","Pemex-Cofece investiga, otra vez, comercializa...",The news article is most likely referring to: ...,"1. Pemex, Mexican state-owned petroleum compan...",Due to the possible existence of anti-competit...


In [21]:
# (rows, columns) of the merged frame.
validation_df.shape

(52001, 10)

In [22]:
# For every record, collect the 'label_desc' value of each Wikidata search
# entry; these lists are later passed to the model as candidate options.
# `.get` yields None for an entry that lacks the key.
validation_df['label_desc_options'] = validation_df['wikidata_search_entries'].apply(
    lambda x: [i.get('label_desc') for i in x]
)

In [23]:
# For each record, keep only the article sentences that mention the entity.
validation_df['sents_with_ent'] = validation_df.apply(
    get_sentences_containing_entity, axis=1
)

In [24]:
# Number of sentences that mention the entity, per record.
validation_df['sents_len'] = validation_df['sents_with_ent'].apply(len)

In [25]:
# Drop observations with no sentence mentions: without a context sentence
# there is nothing to score. The frame is modified in place and keeps its
# original index labels.
validation_df.drop(validation_df[validation_df['sents_len']==0].index, inplace=True)

In [26]:
# Remove the fixed lead-in from each reference answer, then drop newlines and
# surrounding whitespace.
# NOTE(review): despite its name, `regex_pattern` is used here only for its
# length; the slice assumes every answer starts with this text — confirm.
regex_pattern = "The news article is most likely referring to:"
validation_df['sb_option_given'] = validation_df['sb_answer'].str[len(regex_pattern):]
validation_df['sb_option_given'] = validation_df['sb_option_given'].str.replace('\n', '').str.strip()

In [27]:
# Split "<number>. <option text>" into the option number and the option text.
# NOTE(review): the '.' after \d+ is unescaped, so it accepts any single
# character following the digits, not only a literal dot — confirm intent.
regex_pattern = r"(\d+.)(.*)"
validation_df[['index_of_option_given', 'sb_option_given']] = validation_df['sb_option_given'].str.extract(regex_pattern)
# regex=False: remove the literal dot. Without it the meaning of "." depends
# on the pandas version's default for `regex`.
validation_df['index_of_option_given'] = validation_df['index_of_option_given'].str.replace(".", "", regex=False).str.strip()
validation_df['sb_option_given'] = validation_df['sb_option_given'].str.strip()

In [28]:
# Sanity check: records whose answer did not yield an option number.
validation_df[validation_df['index_of_option_given'].isnull()]

Unnamed: 0,text,ner,nerConfidences,clean_text,h1,wikidata_search_entries,ix,sb_answer,options_given,content_en,label_desc_options,sents_with_ent,sents_len,sb_option_given,index_of_option_given


In [29]:
# Turn the 1-based option number into a 0-based position so it can be
# compared with the index returned by the model.
validation_df['index_of_option_given'] = (validation_df['index_of_option_given'].astype(int) - 1)

In [30]:
# Renumber the rows 0..n-1 after the drop (discarding the old labels), then
# expose that new position as an explicit 'index' column.
validation_df = validation_df.reset_index(drop=True).reset_index()

In [31]:
# Preview the first rows after preprocessing.
validation_df.head(5)

Unnamed: 0,index,text,ner,nerConfidences,clean_text,h1,wikidata_search_entries,ix,sb_answer,options_given,content_en,label_desc_options,sents_with_ent,sents_len,sb_option_given,index_of_option_given
0,0,Zacatecas,CITY,{'LOCATION': 0.98695854325281},ZACATECAS,Cenace advierte de nuevos cortes de luz “rotat...,"[{'id': 'Q80269', 'display_label': 'Zacatecas'...",Zacatecas-Cenace advierte de nuevos cortes de ...,The news article is most likely referring to: ...,"1. Zacatecas, state of Mexico \n2. Zacatecas, ...",The National Energy Control Center (Cenace) an...,"[Zacatecas, state of Mexico, Zacatecas, city i...","[The blackout started at 7:58 a.m. on Monday, ...",1,"Zacatecas, city in Zacatecas Municipality, Zac...",1
1,1,Culiacán,PERSON,{'PERSON': 0.43934991439724},CULIACAN,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q211760', 'display_label': 'Culiacán'...",Culiacán-Madres rastreadoras de Sinaloa exigen...,The news article is most likely referring to: ...,"1. Culiacán, capital and largest city in the M...","Culiacán, Sin.- In the esplanade of Palacio de...","[Culiacán, capital and largest city in the Mex...","[Culiacán, Sin.- In the esplanade of Palacio d...",1,"Culiacán, capital and largest city in the Mexi...",0
2,2,Palacio de Gobierno,LOCATION,{'LOCATION': 0.88592453954918},PALACIO DE GOBIERNO,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q385938', 'display_label': 'Governmen...",Palacio de Gobierno-Madres rastreadoras de Sin...,The news article is most likely referring to: ...,"1. Government Palace, seat of the executive br...","Culiacán, Sin.- In the esplanade of Palacio de...","[Government Palace, seat of the executive bran...","[Culiacán, Sin.- In the esplanade of Palacio d...",1,"Palacio de Gobierno de Jalisco, Government bui...",1
3,3,State,ORGANIZATION,{'ORGANIZATION': 0.44361875069213},STATE,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q7275', 'display_label': 'state', 'di...",State-Madres rastreadoras de Sinaloa exigen la...,The news article is most likely referring to: ...,"1. state, organised community living under a s...","Culiacán, Sin.- In the esplanade of Palacio de...","[state, organised community living under a sys...","[On 16 February, the Attorney General of the S...",1,"state, organised community living under a syst...",0
4,4,Fresnillo,CITY,{'LOCATION': 0.86880109112666},FRESNILLO,“Ya no hagan de emoción”: Forcejean a llegada ...,"[{'id': 'Q1816543', 'display_label': 'Fresnill...",Fresnillo-“Ya no hagan de emoción”: Forcejean ...,The news article is most likely referring to: ...,"1. Fresnillo, city in Fresnillo Municipality, ...","Fresnillo, Zacatecas.- President Andrés Manuel...","[Fresnillo, city in Fresnillo Municipality, Za...","[Fresnillo, Zacatecas.- President Andrés Manue...",1,"Fresnillo, city in Fresnillo Municipality, Zac...",0


In [58]:
# Create a fresh disambiguator instance for the full validation run.
bert_ned = BertNED()

In [59]:
# Run the model over every validation record and collect the chosen option
# index per record, in row order.
all_obs = validation_df.shape[0]
best_ixs = []
# Walk only the three columns the model needs instead of materializing one
# dict per row with every column (including the full article text).
rows = zip(
    validation_df['text'],
    validation_df['sents_with_ent'],
    validation_df['label_desc_options'],
)
for i, (entity_text, sentences, candidate_options) in enumerate(rows):
    try:
        best_option_ix = bert_ned.desambiguate(
            entity=entity_text,
            contexts=sentences,
            options=candidate_options
        )
    except Exception:
        # Show the position of the failing record, then let the original
        # error propagate with its traceback intact.
        print(i)
        raise
    best_ixs.append(best_option_ix)
    # Report progress once the record is done so the bar ends at 100%.
    print_progress_bar(iteration=i + 1, total=all_obs)



Token indices sequence length is longer than the specified maximum sequence length for this model (746 > 512). Running this sequence through the model will result in indexing errors




In [60]:
# Attach the model's choice to each record; best_ixs holds one entry per
# row, in row order.
validation_df['bert_best_option_ix'] = best_ixs

In [62]:
# Preview the first rows together with the model's choice.
validation_df.head(5)

Unnamed: 0,index,text,ner,nerConfidences,clean_text,h1,wikidata_search_entries,ix,sb_answer,options_given,content_en,label_desc_options,sents_with_ent,sents_len,sb_option_given,index_of_option_given,bert_best_option_ix
0,0,Zacatecas,CITY,{'LOCATION': 0.98695854325281},ZACATECAS,Cenace advierte de nuevos cortes de luz “rotat...,"[{'id': 'Q80269', 'display_label': 'Zacatecas'...",Zacatecas-Cenace advierte de nuevos cortes de ...,The news article is most likely referring to: ...,"1. Zacatecas, state of Mexico \n2. Zacatecas, ...",The National Energy Control Center (Cenace) an...,"[Zacatecas, state of Mexico, Zacatecas, city i...","[The blackout started at 7:58 a.m. on Monday, ...",1,"Zacatecas, city in Zacatecas Municipality, Zac...",1,1
1,1,Culiacán,PERSON,{'PERSON': 0.43934991439724},CULIACAN,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q211760', 'display_label': 'Culiacán'...",Culiacán-Madres rastreadoras de Sinaloa exigen...,The news article is most likely referring to: ...,"1. Culiacán, capital and largest city in the M...","Culiacán, Sin.- In the esplanade of Palacio de...","[Culiacán, capital and largest city in the Mex...","[Culiacán, Sin.- In the esplanade of Palacio d...",1,"Culiacán, capital and largest city in the Mexi...",0,0
2,2,Palacio de Gobierno,LOCATION,{'LOCATION': 0.88592453954918},PALACIO DE GOBIERNO,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q385938', 'display_label': 'Governmen...",Palacio de Gobierno-Madres rastreadoras de Sin...,The news article is most likely referring to: ...,"1. Government Palace, seat of the executive br...","Culiacán, Sin.- In the esplanade of Palacio de...","[Government Palace, seat of the executive bran...","[Culiacán, Sin.- In the esplanade of Palacio d...",1,"Palacio de Gobierno de Jalisco, Government bui...",1,0
3,3,State,ORGANIZATION,{'ORGANIZATION': 0.44361875069213},STATE,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q7275', 'display_label': 'state', 'di...",State-Madres rastreadoras de Sinaloa exigen la...,The news article is most likely referring to: ...,"1. state, organised community living under a s...","Culiacán, Sin.- In the esplanade of Palacio de...","[state, organised community living under a sys...","[On 16 February, the Attorney General of the S...",1,"state, organised community living under a syst...",0,0
4,4,Fresnillo,CITY,{'LOCATION': 0.86880109112666},FRESNILLO,“Ya no hagan de emoción”: Forcejean a llegada ...,"[{'id': 'Q1816543', 'display_label': 'Fresnill...",Fresnillo-“Ya no hagan de emoción”: Forcejean ...,The news article is most likely referring to: ...,"1. Fresnillo, city in Fresnillo Municipality, ...","Fresnillo, Zacatecas.- President Andrés Manuel...","[Fresnillo, city in Fresnillo Municipality, Za...","[Fresnillo, Zacatecas.- President Andrés Manue...",1,"Fresnillo, city in Fresnillo Municipality, Zac...",0,0


In [63]:
# True where the model's option index differs from the reference answer's.
validation_df['bert!=sb'] = validation_df['bert_best_option_ix'] != validation_df['index_of_option_given']

In [68]:
# Agreement rate: one minus the share of records flagged as disagreeing.
# NOTE(review): 'StableBelgua' in the message looks like a typo (possibly
# 'StableBeluga'); left untouched because it is runtime output — confirm.
print(
    "Accuracy against StableBelgua Answers: {0}".format(1 - validation_df['bert!=sb'].mean())
)

Accuracy against StableBelgua Answers: 0.8655023371340282


In [69]:
# Inspect the records where the model and the reference answer disagree.
validation_df[validation_df['bert!=sb']]

Unnamed: 0,index,text,ner,nerConfidences,clean_text,h1,wikidata_search_entries,ix,sb_answer,options_given,content_en,label_desc_options,sents_with_ent,sents_len,sb_option_given,index_of_option_given,bert_best_option_ix,bert!=sb
2,2,Palacio de Gobierno,LOCATION,{'LOCATION': 0.88592453954918},PALACIO DE GOBIERNO,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q385938', 'display_label': 'Governmen...",Palacio de Gobierno-Madres rastreadoras de Sin...,The news article is most likely referring to: ...,"1. Government Palace, seat of the executive br...","Culiacán, Sin.- In the esplanade of Palacio de...","[Government Palace, seat of the executive bran...","[Culiacán, Sin.- In the esplanade of Palacio d...",1,"Palacio de Gobierno de Jalisco, Government bui...",1,0,True
8,8,Rocca di Papa,CITY,{'ORGANIZATION': 0.59307377723601},ROCCA DI PAPA,Detienen a narcofamilia en Italia; niña de 7 a...,"[{'id': 'Q243228', 'display_label': 'Rocca di ...",Rocca di Papa-Detienen a narcofamilia en Itali...,The news article is most likely referring to: ...,"1. Rocca di Papa, Italian comune \n2. Rocca di...",The Italian police arrested mother and daughte...,"[Rocca di Papa, Italian comune, Rocca di Papa,...",[According to information published this Tuesd...,1,"Rocca di Papa, chief town of the homonym munic...",1,5,True
9,9,Velletri,PERSON,{'PERSON': 0.579953265056},VELLETRI,Detienen a narcofamilia en Italia; niña de 7 a...,"[{'id': 'Q189236', 'display_label': 'Velletri'...",Velletri-Detienen a narcofamilia en Italia; ni...,The news article is most likely referring to: ...,"1. Velletri, Italian comune \n2. Velletri, chi...",The Italian police arrested mother and daughte...,"[Velletri, Italian comune, Velletri, chief tow...","[The family earned up to $2,200 a day, accordi...",1,"Velletri, Italian comune",0,3,True
12,12,Latina,CITY,{'LOCATION': 0.9307302010545},LATINA,Detienen a narcofamilia en Italia; niña de 7 a...,"[{'id': 'Q13410', 'display_label': 'Latina', '...",Latina-Detienen a narcofamilia en Italia; niña...,The news article is most likely referring to: ...,"1. Latina, Italian comune \n2. Latino, people ...",The Italian police arrested mother and daughte...,"[Latina, Italian comune, Latino, people in the...","[The women, according to La Repubblica, are ac...",2,"Latina, Italian comune",0,6,True
31,31,Tonalá,LOCATION,{'LOCATION': 0.47620629578128},TONALA,"Golfo, CJNG y Sinaloa, detrás del tráfico de m...","[{'id': 'Q2636407', 'display_label': 'Tonalá',...","Tonalá-Golfo, CJNG y Sinaloa, detrás del tráfi...",The news article is most likely referring to: ...,"1. Tonalá, municipality of Chiapas, Mexico \n2...","Cells of the Gulf cartels, Sinaloa and Jalisco...","[Tonalá, municipality of Chiapas, Mexico, Tona...","[These localities are Arriaga, Tonalá, Cintala...",1,"Tonalá, municipality of Chiapas, Mexico",0,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51106,51106,Justice,ORGANIZATION,{'ORGANIZATION': 0.60568609396161},JUSTICE,Una pizca de los desencuentros teóricos en Amé...,"[{'id': 'Q16533', 'display_label': 'judge', 'd...",Justice-Una pizca de los desencuentros teórico...,The news article is most likely referring to: ...,"1. judge, official who presides over court pro...","A. Bartra, professor at the UAM (Campesindios....","[judge, official who presides over court proce...","[In the need to find and articulate efforts, i...",1,"justice, broad idea of a situation where peopl...",1,0,True
51112,51112,IAFA,ORGANIZATION,{'ORGANIZATION': 0.98692864204234},IAFA,México ocupa presidencia del Consejo de Autori...,"[{'id': 'Q21015750', 'display_label': 'Interna...",IAFA-México ocupa presidencia del Consejo de A...,The news article is most likely referring to: ...,1. International Association for the Fantastic...,The Ministry of Foreign Affairs reported that ...,[International Association for the Fantastic i...,[The Ministry of Foreign Affairs reported that...,1,IAFA Award,5,3,True
51114,51114,AIFM,ORGANIZATION,{'ORGANIZATION': 0.75837510148933},AIFM,México ocupa presidencia del Consejo de Autori...,"[{'id': 'Q119477776', 'display_label': 'AIFM -...",AIFM-México ocupa presidencia del Consejo de A...,The news article is most likely referring to: ...,1. AIFM - Associazione italiana Finiture dei M...,The Ministry of Foreign Affairs reported that ...,[AIFM - Associazione italiana Finiture dei Met...,"[In a statement, the Chancellery stated that t...",3,"Apoptosis-inducing factor, mitochondrion-assoc...",3,2,True
51124,51124,Altos,CITY,{'LOCATION': 0.83581962406358},ALTOS,Mexico’s Financial Intelligence Unit freezes 1...,"[{'id': 'Q22060469', 'display_label': 'Altos',...",Altos-Mexico’s Financial Intelligence Unit fre...,The news article is most likely referring to: ...,"1. Altos, Brazilian municipality of the state ...","Santiago Nieto, the head of the Financial Inte...","[Altos, Brazilian municipality of the state of...","[In an interview, Santiago Nieto explained tha...",1,"Altos, city and district in Cordillera, Paraguay",1,6,True


In [150]:
# Print, in sorted order, every distinct entity text among the rows flagged
# by the boolean 'bert!=sb' column.
for ent in sorted(validation_df.loc[validation_df['bert!=sb'], 'text'].unique()):
    print(ent)

7th & Union
A.C
A.T. Kearney
AAA
ABC
AFI
AFP
AGA
AIC
AICM
AIDA
AIFM
AJ
AL
ALARMA
ALDF
ALYC
AMANC
AMG
AMIA
ANAM
ANDA
AOL
AP
APA
ASA
ASF
ASIP
ASL
Aafia Siddiqui
Aaron Robinson
Aaron Stein
Abarca
Abasolo
Abbott
Abdul
Abelardo L. Rodríguez
Abingdon
Abraham Lincoln
Abraham Mendoza
Abramo
Abu Muhammad
Abundis
Academy of Engineering
Academy of Fine Arts
Acapulco
Acatlán de
Acayucan
Acciona
Aceves
Achilles
Acklins
Acteal
Actopan
Acuña
Administrative Committee
Adolfo López Mateos
Adolfo Marsillach
Adrian
Adriana Figueroa
Adriana Pérez
Adriel
Adrien
Adrián González
Adrián Marcelo
Adrián Vázquez
Adrián Ávalos
Aeromexico
Afghanistan
Afore
Africa
Agache
Agua Dulce
Aguirre
Agusto
Ahmad
Ahmaud Arbery
Aila
Aimée
Air Force Command
Air Force Museum
Airbus Group
Akihabara
Al
Al Arabiya
Al Jazeera
Al Qaeda
Al Zubarah
Al-Quds
Alaba
Alan Garcia
Alan Ituriel
Alatorre
Alatrist
Alba
Alba Flores
Albert Einstein
Albert Zhang
Alberti
Albon
Alcala
Alcázar de
Aldama
Aldesa
Alebrijes
Alejandra
Alejandro
Alejandro Ag

In [158]:
# Display the rows flagged by 'bert!=sb' whose entity text is exactly
# 'Enrique Peña Nieto'.
validation_df.loc[
    validation_df['bert!=sb'] & validation_df['text'].eq('Enrique Peña Nieto')
]

Unnamed: 0,index,text,ner,nerConfidences,clean_text,h1,wikidata_search_entries,ix,sb_answer,options_given,content_en,label_desc_options,sents_with_ent,sents_len,sb_option_given,index_of_option_given,bert_best_option_ix,bert!=sb
22067,22067,Enrique Peña Nieto,PERSON,{'PERSON': 0.85535035710227},ENRIQUE PENA NIETO,“Le creo al ministro Zaldívar”: AMLO tras decl...,"[{'id': 'Q296741', 'display_label': 'Enrique P...",Enrique Peña Nieto-“Le creo al ministro Zaldív...,The news article is most likely referring to: ...,"1. Enrique Peña Nieto, 64th President of Mexic...",President Andrés Manuel López Obrador said tha...,"[Enrique Peña Nieto, 64th President of Mexico,...","[In Palacio Nacional, the federal governor who...",1,Enrique Peña Nieto conflict of interest scanda...,1,0,True
32914,32914,Enrique Peña Nieto,PERSON,{'PERSON': 0.99699559005229},ENRIQUE PENA NIETO,"Críticas de exsecretarios de Salud, por intere...","[{'id': 'Q296741', 'display_label': 'Enrique P...",Enrique Peña Nieto-Críticas de exsecretarios d...,The news article is most likely referring to: ...,"1. Enrique Peña Nieto, 64th President of Mexic...",President Andrés Manuel López Obrador said he ...,"[Enrique Peña Nieto, 64th President of Mexico,...",[In the virtual forum “The Moment of the Epide...,1,Enrique Peña Nieto conflict of interest scanda...,1,0,True
47489,47489,Enrique Peña Nieto,PERSON,{'PERSON': 0.88057265245543},ENRIQUE PENA NIETO,La Mansión del Bienestar: el fracaso de la doc...,"[{'id': 'Q296741', 'display_label': 'Enrique P...",Enrique Peña Nieto-La Mansión del Bienestar: e...,The news article is most likely referring to: ...,"1. Enrique Peña Nieto, 64th President of Mexic...",Even the hesitant has denoised the lifestyle o...,"[Enrique Peña Nieto, 64th President of Mexico,...","[In national life this is not a novel episode,...",1,Enrique Peña Nieto conflict of interest scanda...,1,0,True


In [161]:
# Draw at random the index label of one row flagged by 'bert!=sb' whose
# entity text is 'Enrique Peña Nieto'.
ix = np.random.choice(
    validation_df.loc[
        validation_df['bert!=sb'] & validation_df['text'].eq('Enrique Peña Nieto')
    ].index
)
# Disabled alternative kept from the original cell: a random integer below the
# number of flagged rows.
#ix = np.random.randint(0,validation_df[validation_df['bert!=sb']].shape[0])

In [162]:
# Display the currently selected index label.
ix

32914

In [163]:
# Look up the flagged row labelled `ix` once and read everything from it.
mismatch_row = validation_df[validation_df['bert!=sb']].loc[ix]

ent = mismatch_row['text']

options = mismatch_row['label_desc_options']

# Each stored answer is used as a position into `options`.
sb_answer_ix = mismatch_row['index_of_option_given']
sb_answer = options[sb_answer_ix]
bert_answer_ix = mismatch_row['bert_best_option_ix']
bert_answer = options[bert_answer_ix]

# One print call, one line per item.
print(
    "Entity: {0}".format(ent),
    "SB answer: {0}".format(sb_answer),
    "Bert answer: {0}".format(bert_answer),
    sep="\n",
)

Entity: Enrique Peña Nieto
SB answer: Enrique Peña Nieto conflict of interest scandal, Political scandal in Mexico in 2015
Bert answer: Enrique Peña Nieto, 64th President of Mexico


In [145]:
# Display the 'sents_with_ent' value of the flagged row labelled `ix`.
# NOTE(review): this cell's execution count (145) is lower than that of the
# cell assigning `ix` (161), so the saved output may belong to an earlier
# `ix` - re-run to confirm.
validation_df.loc[validation_df['bert!=sb'], 'sents_with_ent'].loc[ix]

['The "PLEITO" CALDERÓN VS RODRÍGUEZ PRATS Anaya Cortés eluded to answer the lawsuit of former President Felipe Calderón Hinojosa and Juan José Rodríguez Pratts and to have orchestrated it.']

In [146]:
# Display the 'content_en' value of the flagged row labelled `ix`.
# NOTE(review): this cell's execution count (146) is lower than that of the
# cell assigning `ix` (161), so the saved output may belong to an earlier
# `ix` - re-run to confirm.
validation_df.loc[validation_df['bert!=sb'], 'content_en'].loc[ix]

'From Ernesto Ruffo Appel’s discovery, who yesterday confirmed that he does want to be the PAN’s candidate for the presidency of the Republic for 2018 and that some of his co-religionists consider that he has greater merit in leading Acción Nacional than many of the aspirants, the national leader of the albiazul, Ricardo Anaya Cortés, pointed out: “I have deep admiration and respect for him.” Questioned that the former president has more experience and trajectory than Anaya Cortés himself, Margarita Zavala and others, the PAN leader recalled that Ernesto was the first governor in Mexico\'s history not to be emanated from the PRI. “It was the first postulated by the PAN in that historic triumph of 1989, I have nothing but admiration and respect for it,” he stressed of the Baja Californian who is already beginning to receive support on social networks and trusts that the “Ruffomania” will be unleashed. In support of the PAN candidate for Saltillo’s mayor’s office, Esther Quintana Salinas

In [None]:
# Create the BertNED instance used by the scoring cells below.
# NOTE(review): BertNED is not defined in this part of the notebook; presumably
# it is declared in an earlier cell - confirm before running.
bert_ned = BertNED()

In [56]:
# Score a single hand-picked row (position 41308) using the second sentence of
# its 'sents_with_ent' list as context.
# NOTE(review): the saved output carries a tokenizer warning (746 > 512 tokens)
# for this row - presumably why it was chosen; confirm.
probe_row = validation_df.iloc[41308]
bert_ned._compute_scores(
    entity=probe_row['text'],
    context=probe_row['sents_with_ent'][1],
    options=probe_row['label_desc_options'],
)

Token indices sequence length is longer than the specified maximum sequence length for this model (746 > 512). Running this sequence through the model will result in indexing errors


746
These are the 10 best paid careers in Mexico Engineering in Urban Mobility: UPIEM Engineering in Energy Business Sustainable: UPIEM Industrial Robotic Engineering: ESIME Unit Azcapotzalco Engineering in Environmental Sciences: ENCB Engineering in Automotive Systems
80
83
83
90
87
82
86


array([[4.6752747e-03, 9.9532473e-01],
       [9.7041839e-01, 2.9581612e-02],
       [9.9965370e-01, 3.4633253e-04],
       [9.9969864e-01, 3.0143259e-04],
       [9.9970406e-01, 2.9587332e-04],
       [9.9969995e-01, 3.0008118e-04],
       [9.9969935e-01, 3.0068430e-04]], dtype=float32)

In [29]:
import time

In [31]:
# Apply `desambiguate` to every row of validation_df, passing the entity text,
# the 'sents_with_ent' contexts and the 'label_desc_options' candidates, store
# the result in 'bert_best_option_ix', and print the elapsed seconds.
#
# time.perf_counter() is used instead of time.time(): it is monotonic and
# high-resolution, so the measured duration cannot be distorted by wall-clock
# adjustments during a long run.
#
# NOTE(review): the saved run of this cell aborted with a RuntimeError (a
# 746-token sequence against a size of 512), so no timing was printed.
# Presumably the inputs need truncating inside `desambiguate` - confirm there.
start = time.perf_counter()
validation_df['bert_best_option_ix'] = validation_df.apply(
    lambda row: desambiguate(
        entity=row['text'],
        contexts=row['sents_with_ent'],
        options=row['label_desc_options'],
    ),
    axis=1,
)
end = time.perf_counter()
print(end - start)

Token indices sequence length is longer than the specified maximum sequence length for this model (746 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The expanded size of the tensor (746) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 746].  Tensor sizes: [1, 512]

In [32]:
# Display the current contents of validation_df.
validation_df

Unnamed: 0,index,text,ner,nerConfidences,clean_text,h1,wikidata_search_entries,ix,sb_answer,options_given,content_en,label_desc_options,sents_with_ent,sents_len,sb_option_given,index_of_option_given
0,0,Zacatecas,CITY,{'LOCATION': 0.98695854325281},ZACATECAS,Cenace advierte de nuevos cortes de luz “rotat...,"[{'id': 'Q80269', 'display_label': 'Zacatecas'...",Zacatecas-Cenace advierte de nuevos cortes de ...,The news article is most likely referring to: ...,"1. Zacatecas, state of Mexico \n2. Zacatecas, ...",The National Energy Control Center (Cenace) an...,"[Zacatecas, state of Mexico, Zacatecas, city i...","[The blackout started at 7:58 a.m. on Monday, ...",1,"Zacatecas, city in Zacatecas Municipality, Zac...",1
1,1,Culiacán,PERSON,{'PERSON': 0.43934991439724},CULIACAN,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q211760', 'display_label': 'Culiacán'...",Culiacán-Madres rastreadoras de Sinaloa exigen...,The news article is most likely referring to: ...,"1. Culiacán, capital and largest city in the M...","Culiacán, Sin.- In the esplanade of Palacio de...","[Culiacán, capital and largest city in the Mex...","[Culiacán, Sin.- In the esplanade of Palacio d...",1,"Culiacán, capital and largest city in the Mexi...",0
2,2,Palacio de Gobierno,LOCATION,{'LOCATION': 0.88592453954918},PALACIO DE GOBIERNO,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q385938', 'display_label': 'Governmen...",Palacio de Gobierno-Madres rastreadoras de Sin...,The news article is most likely referring to: ...,"1. Government Palace, seat of the executive br...","Culiacán, Sin.- In the esplanade of Palacio de...","[Government Palace, seat of the executive bran...","[Culiacán, Sin.- In the esplanade of Palacio d...",1,"Palacio de Gobierno de Jalisco, Government bui...",1
3,3,State,ORGANIZATION,{'ORGANIZATION': 0.44361875069213},STATE,Madres rastreadoras de Sinaloa exigen la desti...,"[{'id': 'Q7275', 'display_label': 'state', 'di...",State-Madres rastreadoras de Sinaloa exigen la...,The news article is most likely referring to: ...,"1. state, organised community living under a s...","Culiacán, Sin.- In the esplanade of Palacio de...","[state, organised community living under a sys...","[On 16 February, the Attorney General of the S...",1,"state, organised community living under a syst...",0
4,4,Fresnillo,CITY,{'LOCATION': 0.86880109112666},FRESNILLO,“Ya no hagan de emoción”: Forcejean a llegada ...,"[{'id': 'Q1816543', 'display_label': 'Fresnill...",Fresnillo-“Ya no hagan de emoción”: Forcejean ...,The news article is most likely referring to: ...,"1. Fresnillo, city in Fresnillo Municipality, ...","Fresnillo, Zacatecas.- President Andrés Manuel...","[Fresnillo, city in Fresnillo Municipality, Za...","[Fresnillo, Zacatecas.- President Andrés Manue...",1,"Fresnillo, city in Fresnillo Municipality, Zac...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51126,51126,Mexico,COUNTRY,{'LOCATION': 0.86121711995959},MEXICO,Mexico’s Financial Intelligence Unit freezes 1...,"[{'id': 'Q96', 'display_label': 'Mexico', 'dis...",Mexico-Mexico’s Financial Intelligence Unit fr...,The news article is most likely referring to: ...,"1. Mexico, country in North America \n2. Mexic...","Santiago Nieto, the head of the Financial Inte...","[Mexico, country in North America, Mexico City...","[In an interview, Santiago Nieto explained tha...",1,"Mexico, country in North America",0
51127,51127,Santiago,CITY,{'LOCATION': 0.60977913237806},SANTIAGO,Mexico’s Financial Intelligence Unit freezes 1...,"[{'id': 'Q2887', 'display_label': 'Santiago', ...",Santiago-Mexico’s Financial Intelligence Unit ...,The news article is most likely referring to: ...,"1. Santiago, capital city of Chile \n2. St. Ja...","Santiago Nieto, the head of the Financial Inte...","[Santiago, capital city of Chile, St. James th...","[Santiago Nieto, the head of the Financial Int...",3,"Santiago, male given name",2
51128,51128,Peña Nieto,PERSON,{'PERSON': 0.74281429426886},PENA NIETO,Mexico’s Financial Intelligence Unit freezes 1...,"[{'id': 'Q296741', 'display_label': 'Enrique P...",Peña Nieto-Mexico’s Financial Intelligence Uni...,The news article is most likely referring to: ...,"1. Enrique Peña Nieto, 64th President of Mexic...","Santiago Nieto, the head of the Financial Inte...","[Enrique Peña Nieto, 64th President of Mexico,...","[Moreover, Nieto Said severe other officers fr...",1,"Enrique Peña Nieto, 64th President of Mexico",0
51129,51129,Pemex,ORGANIZATION,{'ORGANIZATION': 0.96210365359227},PEMEX,"Cofece investiga, otra vez, comercialización d...","[{'id': 'Q871308', 'display_label': 'Pemex', '...","Pemex-Cofece investiga, otra vez, comercializa...",The news article is most likely referring to: ...,"1. Pemex, Mexican state-owned petroleum compan...",Due to the possible existence of anti-competit...,"[Pemex, Mexican state-owned petroleum company,...",[This is a new investigation into the same pro...,1,"Pemex, Mexican state-owned petroleum company",0
