In [1]:
import pandas as pd
import numpy as np
import spacy

from transformers import HerbertTokenizer, RobertaModel

import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertForNextSentencePrediction

from transformers import pipeline
from torch.nn import functional as F

## BERT with many masks

In [2]:
# https://discuss.huggingface.co/t/multiple-mask-tokens/174/2

In [4]:
tokenizer_en = BertTokenizer.from_pretrained('bert-base-cased')
input_txt = "[MASK] [MASK] [MASK] what should be done next, even it was not easy."
inputs = tokenizer_en(input_txt, return_tensors='pt')

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [5]:
model_en = BertForMaskedLM.from_pretrained('bert-base-cased')
 
outputs = model_en(**inputs)
predictions = outputs[0]
sorted_preds, sorted_idx = predictions[0].sort(dim=-1, descending=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
m = sorted_preds.shape[0] - 1

In [7]:
predicted_index = [sorted_idx[i, 0].item() for i in range(0,m)]
predicted_token = [tokenizer_en.convert_ids_to_tokens([predicted_index[x]])[0] for x in range(1,m)]
print(' '.join(predicted_token))

He knew knew what should be done next , even it was not easy .


In [8]:
model_nxt_en = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

prompt = "The child came home from school."
next_sentence = "He played soccer after school."

encoding = tokenizer_en.encode_plus(prompt, next_sentence, return_tensors='pt')

outputs = model_nxt_en(**encoding)[0]
softmax = F.softmax(outputs, dim = 1)
print(softmax)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[9.9999e-01, 1.0484e-05]], grad_fn=<SoftmaxBackward0>)


In [9]:
softmax[0,1] / softmax[0,0]

tensor(1.0484e-05, grad_fn=<DivBackward0>)

### Polish

In [10]:
# https://huggingface.co/dkleczek/bert-base-polish-uncased-v1

In [2]:
model_pl = BertForMaskedLM.from_pretrained("dkleczek/bert-base-polish-uncased-v1")
tokenizer_pl = BertTokenizer.from_pretrained("dkleczek/bert-base-polish-uncased-v1")
nlp = pipeline('fill-mask', model=model_pl, tokenizer=tokenizer_pl)

NameError: name 'BertForMaskedLM' is not defined

In [1]:
input_txt = "Adam [MASK] wielkim polskim [MASK] był."
inputs = tokenizer_pl(input_txt, return_tensors='pt')

NameError: name 'tokenizer_pl' is not defined

In [56]:
outputs = model_pl(**inputs)
predictions = outputs[0]

In [57]:
sorted_preds, sorted_idx = predictions[0].sort(dim=-1, descending=True)
m = sorted_preds.shape[0] - 1
predicted_index = [sorted_idx[i, 0].item() for i in range(0,m)]
predicted_token = [tokenizer_pl.convert_ids_to_tokens([predicted_index[x]])[0] for x in range(1,m)]
print(' '.join(predicted_token))

Adam II wielkim polskim człowiekiem był .


In [58]:
model_nxt_pl = BertForNextSentencePrediction.from_pretrained('dkleczek/bert-base-polish-uncased-v1')

prompt = "Nad rzeczką opodal krzaczka."
next_sentence = "Mieszkała kaczka-dziwaczka"

encoding = tokenizer_pl.encode_plus(prompt, next_sentence, return_tensors='pt')

outputs = model_nxt_pl(**encoding)[0]
softmax = F.softmax(outputs, dim = 1)
print(softmax)

Some weights of the model checkpoint at dkleczek/bert-base-polish-uncased-v1 were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[0.9733, 0.0267]], grad_fn=<SoftmaxBackward0>)


In [59]:
# small values - good
# big first one and small second one - means those sentences should go one after another

In [60]:
softmax[0,1] / softmax[0,0]

tensor(0.0274, grad_fn=<DivBackward0>)

In [61]:
prompt = "Zygmunt III Waza był wszechstronnie wykształcony, biegle władał pięcioma językami, w tym językiem polskim."
next_sentence = "Był dobrym gospodarzem, trzykrotnie zwiększył dochody skarbu królewskiego."

encoding = tokenizer_pl.encode_plus(prompt, next_sentence, return_tensors='pt')

outputs = model_nxt_pl(**encoding)[0]
softmax = F.softmax(outputs, dim = 1)
print(softmax, '\n')

print(softmax[0,1] / softmax[0,0])

tensor([[0.9983, 0.0017]], grad_fn=<SoftmaxBackward0>) 

tensor(0.0017, grad_fn=<DivBackward0>)
