In [1]:
import torch

In [3]:
import transformers

In [4]:
from transformers import BertTokenizer,BertForMaskedLM

In [5]:
tokenizer=BertTokenizer.from_pretrained("bert-base-uncased")
model=BertForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
with open("clean.txt",'r') as fp:
    text=fp.read().split('\n')

In [7]:
text[:5]

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.',
 'From my great-grandfather, not to have frequented public schools, and to have had good teachers at home, and to know that on such things a man should spend liberally.',
 "From my governor, to be neither of the green nor of the blue party at the games in the Circus, nor a partizan either of the Parmularius or the Scutarius at the gladiators' fights; from him too I learned endurance of labour, and to want little, and to work with my own hands, and not to meddle with other people's affairs, and not to be ready to listen to slander."]

In [8]:
inputs=tokenizer(text[:8],return_tensors='pt',padding=True,truncation=True)

In [9]:
inputs.input_ids.shape

torch.Size([8, 240])

In [10]:
inputs["label"]=inputs.input_ids.detach().clone()

In [12]:
rand=torch.rand(inputs.input_ids.shape)

In [14]:
mask_arr=(rand<0.15)*(inputs.input_ids!=101)*(inputs.input_ids!=102)*(inputs.input_ids!=0)

In [17]:
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False]])

In [18]:
mask_arr.shape

torch.Size([8, 240])

In [24]:
selection=[]
for i in range(len(mask_arr)):
    selection.append(torch.flatten(mask_arr[i].nonzero()).tolist())

In [29]:
selection[:2]

[[4, 10], [16]]

In [27]:
for i in range(len(mask_arr)):
    inputs.input_ids[i,selection[i]]=103

In [35]:
from torch.utils.data import Dataset,DataLoader

In [47]:
class CustomDataSet(Dataset):
    def __init__(self,encodings):
        self.encodings=encodings
        
    def __len__(self):
        return self.encodings.input_ids.shape[0]
    
    def __getitem__(self,idx):
        sample={
            key: torch.tensor(val[idx]) for key,val in self.encodings.items()   
        }
        return sample

In [48]:
dataset=CustomDataSet(inputs)

In [49]:
batch_size=2

In [50]:
loader=DataLoader(dataset,batch_size=batch_size,shuffle=True)

In [54]:
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [57]:
optim=torch.optim.AdamW(model.parameters(),lr=1e-5)

In [58]:
from tqdm import tqdm

In [61]:
epochs=2
for epoch in range(epochs):
    loop=tqdm(loader,leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids=batch["input_ids"]
        attention_mask=batch["attention_mask"]
        labels=batch["label"]
        outputs=model(input_ids,attention_mask=attention_mask,labels=labels)
        loss=outputs.loss
        loss.backward()
        optim.step()
        
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

  key: torch.tensor(val[idx]) for key,val in self.encodings.items()
Epoch 0: 100%|████████████████████████████████████████████████████████████████| 4/4 [00:53<00:00, 13.31s/it, loss=15.6]
Epoch 1: 100%|████████████████████████████████████████████████████████████████| 4/4 [00:45<00:00, 11.38s/it, loss=9.31]


In [68]:
with torch.no_grad():
    op=model(inputs.input_ids,attention_mask=inputs.attention_mask)

In [69]:
op.keys()

odict_keys(['logits'])

In [71]:
op.logits.shape

torch.Size([8, 240, 30522])

In [72]:
score=torch.nn.functional.softmax(op.logits,dim=-1)

In [73]:
tokens=torch.argmax(score,dim=-1)

In [81]:
actual=inputs.label[3]

In [82]:
pre=tokens[3]

In [83]:
act=tokenizer.decode(actual)

In [84]:
act

'[CLS] from my great - grandfather, not to have frequented public schools, and to have had good teachers at home, and to know that on such things a man should spend liberally. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [85]:
new_pred=tokenizer.decode(pre)
new_pred

'. from the great - grandfather to not to have attended public schools, and to have had good teachers at home, and to know that on such things a man should spend liberally ;. and and money ; education not been not not,,,,,, that childhood learning learning learning age should grandfather should should to to to. private, but not have been been teachers good,,,,, learning all such occasions occasions time should spend education to to to should. good good extensively not not not have been good,,,, and to learning all all such subjects should money should liberal.ly. " in and. to to from the education man not liberal grandfather,, to to necessity that all these childhood all good subjects grandfather should liberal grandfather to. to to only good public extensively but but have have been good teachers good,,, not also that all these education learning the education man grandfather to should liberal liberal extensively. to and and ; good all education education such good been been had,,, fa