In [14]:
import pandas as pd
import numpy as np
import spacy

from transformers import HerbertTokenizer, RobertaModel

import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

from transformers import pipeline


## BERT with many masks

In [2]:
# https://discuss.huggingface.co/t/multiple-mask-tokens/174/2

In [8]:
# English example sentence: three leading [MASK] slots to be filled in one forward pass.
input_txt = "[MASK] [MASK] [MASK] what should be done next, even it was not easy."

# Tokenizer for the cased English BERT checkpoint; calling it with
# return_tensors="pt" yields PyTorch tensors that can be fed to the model.
tokenizer_en = BertTokenizer.from_pretrained("bert-base-cased")
inputs = tokenizer_en(input_txt, return_tensors="pt")

In [9]:
# Masked-LM head on top of the cased English BERT checkpoint.
model_en = BertForMaskedLM.from_pretrained('bert-base-cased')

# Inference only: disable autograd so no gradient graph is built or kept alive.
with torch.no_grad():
    outputs = model_en(**inputs)

# The first element of the model output holds the per-position vocabulary scores.
predictions = outputs[0]
# For the single input sequence, rank the vocabulary at every position, best first.
sorted_preds, sorted_idx = predictions[0].sort(dim=-1, descending=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Index of the last position in the sequence (presumably the trailing [SEP]
# token added by the tokenizer -- TODO confirm); used as an exclusive upper bound.
m = sorted_preds.size(0) - 1

In [11]:
# Top-ranked vocabulary id at each position 0 .. m-1 (the final position is left out).
predicted_index = sorted_idx[:m, 0].tolist()
# Decode with the English tokenizer created earlier in this notebook; the bare name
# `tokenizer` is never defined here and only worked through leftover kernel state.
# Position 0 (presumably [CLS]) is skipped so only the sentence tokens are printed.
predicted_token = tokenizer_en.convert_ids_to_tokens(predicted_index[1:])
print(' '.join(predicted_token))

He knew knew what should be done next , even it was not easy .


### Polish

In [12]:
# https://huggingface.co/dkleczek/bert-base-polish-cased-v1

In [20]:
# Single checkpoint name shared by the model and its tokenizer so the two stay in sync.
POLISH_BERT_CHECKPOINT = "dkleczek/bert-base-polish-cased-v1"

model_pl = BertForMaskedLM.from_pretrained(POLISH_BERT_CHECKPOINT)
tokenizer_pl = BertTokenizer.from_pretrained(POLISH_BERT_CHECKPOINT)

# Fill-mask pipeline built on the same model/tokenizer pair.
nlp = pipeline("fill-mask", model=model_pl, tokenizer=tokenizer_pl)

Some weights of the model checkpoint at dkleczek/bert-base-polish-cased-v1 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# Polish example sentence with two separate [MASK] slots.
# NOTE: this rebinds `input_txt` and `inputs` from the English section.
input_txt = "Adam [MASK] wielkim polskim [MASK] był."
inputs = tokenizer_pl(input_txt, return_tensors="pt")

In [22]:
# Inference only: disable autograd so no gradient graph is built or kept alive.
with torch.no_grad():
    outputs = model_pl(**inputs)

# The first element of the model output holds the per-position vocabulary scores.
predictions = outputs[0]

In [23]:
# Rank the vocabulary at every position of the single input sequence, best first.
sorted_preds, sorted_idx = torch.sort(predictions[0], dim=-1, descending=True)

# Exclusive upper bound: the last position of the sequence is left out.
m = sorted_preds.size(0) - 1

# Best vocabulary id for positions 0 .. m-1, as plain Python ints.
predicted_index = sorted_idx[:m, 0].tolist()

# Position 0 is skipped when decoding, so only the remaining positions are printed.
predicted_token = tokenizer_pl.convert_ids_to_tokens(predicted_index[1:])
print(' '.join(predicted_token))

Adam II wielkim polskim człowiekiem był .
