In [5]:
from transformers import BertModel, AutoTokenizer, AutoModelForMaskedLM
from scipy.special import softmax
import numpy as np

In [3]:
# Checkpoint identifier passed to both `from_pretrained` calls below, so the
# tokenizer and the model are always loaded from the same checkpoint.
model_name = "bert-base-cased"

In [6]:
# Load the tokenizer and the masked-language-model head for `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# The tokenizer's mask placeholder string; it is interpolated into the
# sentence to mark the word the model should predict.
mask = tokenizer.mask_token
mask

'[MASK]'

In [9]:
# Input sentence with a single mask placeholder standing in for the verb.
sentence = f"I want to {mask} pizza for tonight."
sentence

'I want to [MASK] pizza for tonight.'

In [10]:
# Token strings for the sentence. `tokenize` does not add the special tokens
# that the full encoding wraps around the text.
tokens = tokenizer.tokenize(sentence)
tokens

['I', 'want', 'to', '[MASK]', 'pizza', 'for', 'tonight', '.']

In [11]:
# Encode the sentence as PyTorch tensors ('pt'). The result holds `input_ids`,
# `token_type_ids` and `attention_mask`, each with a leading batch dimension.
encoded_inputs = tokenizer(sentence, return_tensors = 'pt')
encoded_inputs

{'input_ids': tensor([[  101,   146,  1328,  1106,   103, 13473,  1111,  3568,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [12]:
# Run the masked-LM on the encoded sentence; the predictions are read from
# `outputs.logits` in the following cell.
outputs = model(**encoded_inputs)
outputs

MaskedLMOutput(loss=None, logits=tensor([[[ -7.3723,  -7.2489,  -7.4421,  ...,  -6.3119,  -5.9369,  -6.4257],
         [ -7.9311,  -8.2282,  -8.0326,  ...,  -6.7387,  -6.4877,  -6.9525],
         [-12.0500, -11.7972, -12.5776,  ...,  -8.4518,  -6.7310,  -8.2586],
         ...,
         [-10.2204, -10.4315,  -9.9992,  ...,  -7.9570,  -6.7194,  -9.3618],
         [-12.4471, -12.5367, -12.5614,  ...,  -9.9085,  -9.4219, -11.1769],
         [-14.3657, -14.5227, -15.0017,  ..., -11.9715, -11.6569, -13.4498]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [13]:
# Select the only sequence in the batch, detach it from the autograd graph,
# and convert it to a NumPy array with one row of logits per input position.
logits = outputs.logits[0].detach().numpy()
logits

array([[ -7.3722887,  -7.248853 ,  -7.4421344, ...,  -6.311859 ,
         -5.9368916,  -6.4256763],
       [ -7.9311113,  -8.228209 ,  -8.03259  , ...,  -6.738739 ,
         -6.487725 ,  -6.952522 ],
       [-12.050004 , -11.797209 , -12.577597 , ...,  -8.45177  ,
         -6.7310143,  -8.258563 ],
       ...,
       [-10.2204   , -10.431477 ,  -9.9992485, ...,  -7.9569864,
         -6.719398 ,  -9.36179  ],
       [-12.447113 , -12.536692 , -12.561384 , ...,  -9.908545 ,
         -9.421904 , -11.176948 ],
       [-14.365712 , -14.522718 , -15.001669 , ..., -11.971543 ,
        -11.656921 , -13.449783 ]], dtype=float32)

In [15]:
# Locate the mask position directly in the encoded ids instead of offsetting
# the `tokenize()` index by a hard-coded +1 for the leading special token.
# This stays correct however many special tokens the tokenizer prepends.
# `.index` raises ValueError if the sentence contains no mask token.
mask_position = encoded_inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)
mask_logits = logits[mask_position]

# Softmax turns the mask position's logits into probabilities that sum to 1.
confidence_scores = softmax(mask_logits)
confidence_scores

array([2.9159755e-10, 4.0784717e-10, 5.2928140e-10, ..., 8.4446100e-10,
       6.2026415e-09, 1.6282877e-09], dtype=float32)

In [17]:
# Rank token ids from most to least probable and keep the five best.
top_token_ids = np.argsort(confidence_scores)[::-1][:5]

for token_id in top_token_ids:
    candidate = tokenizer.decode(token_id)
    probability = confidence_scores[token_id]
    # Show the sentence with the mask filled in, followed by its probability.
    print(sentence.replace(mask, candidate), probability)

I want to have pizza for tonight. 0.25729194
I want to get pizza for tonight. 0.17849576
I want to eat pizza for tonight. 0.15555455
I want to make pizza for tonight. 0.11422367
I want to order pizza for tonight. 0.09823079
