In [1]:
import numpy as np

In [2]:
import torch
import transformers
from transformers import BertTokenizer,BertForPreTraining

In [3]:
tokenizer=BertTokenizer.from_pretrained("bert-base-uncased")
model=BertForPreTraining.from_pretrained("bert-base-uncased")

In [4]:
with open("clean.txt",'r') as fp:
    text=fp.read().split("\n")

In [5]:
len(text)

507

In [6]:
text[:5]

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.',
 'From my great-grandfather, not to have frequented public schools, and to have had good teachers at home, and to know that on such things a man should spend liberally.',
 "From my governor, to be neither of the green nor of the blue party at the games in the Circus, nor a partizan either of the Parmularius or the Scutarius at the gladiators' fights; from him too I learned endurance of labour, and to want little, and to work with my own hands, and not to meddle with other people's affairs, and not to be ready to listen to slander."]

# NSP

In [7]:
bag=[sentence for paragraph in text for sentence in paragraph.split(".") if sentence!='']

In [8]:
len(bag)

1372

In [9]:
import random

sentence_a=[]
sentence_b=[]
label=[]

for paragraph in text:
    sentences=[
        sentence for sentence in paragraph.split(".") if sentence!=''
    ]
    num_sentences=len(sentences)
    if num_sentences>1:
        start=random.randint(0,num_sentences-2)
        sentence_a.append(sentences[start])
        if random.random()>0.5:
            sentence_b.append(sentences[start+1])
            label.append(0)
            
        else:
            sentence_b.append(bag[random.randint(0,len(bag)-1)])
            label.append(1)

In [10]:
len(sentence_a),len(sentence_b),len(label)

(317, 317, 317)

In [11]:
sentence__a=sentence_a[:15]
sentence__b=sentence_b[:15]
ladel_=label[:15]

In [12]:
inputs=tokenizer(sentence__a,sentence__b,padding=True,truncation=True,return_tensors='pt')

In [13]:
inputs.input_ids.shape

torch.Size([15, 147])

In [14]:
torch.LongTensor([ladel_]).shape

torch.Size([1, 15])

In [15]:
inputs["next_sentence_label"]=torch.LongTensor([ladel_]).T

In [16]:
inputs.next_sentence_label.shape

torch.Size([15, 1])

# MLM

In [17]:
inputs["labels"]=inputs.input_ids.detach().clone()

In [18]:
random=torch.rand((inputs.input_ids.shape))

In [19]:
mask_arr=(random<0.15)*(inputs.input_ids!=101)*(inputs.input_ids!=102)*(inputs.input_ids!=0)

In [20]:
mask_arr.shape

torch.Size([15, 147])

In [21]:
selected=[]

In [22]:
for i in range(len(mask_arr)):
    selection=torch.flatten(mask_arr[i].nonzero()).tolist()
    selected.append(selection)
    inputs.input_ids[i,selection]=103

In [23]:
inputs.input_ids

tensor([[  101,  2013, 21692,  ...,     0,     0,     0],
        [  101,  2002,  2165,  ...,     0,     0,     0],
        [  101,  2582,  1010,  ...,     0,     0,     0],
        ...,
        [  101,  2023,  1010,  ...,     0,     0,     0],
        [  101,  1998,  7065,  ...,     0,     0,     0],
        [  101,  2295, 15223,  ...,     0,     0,     0]])

# Training

In [26]:
model.train()

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [27]:
from torch.optim import AdamW

In [28]:
optim=AdamW(model.parameters(),lr=5e-5)

In [29]:
from tqdm import tqdm

In [30]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [31]:
inputs.input_ids.shape

torch.Size([15, 147])

In [32]:
class CustomDataSet(torch.utils.data.Dataset):
    def __init__(self,data):
        self.data=data
        
    def __len__(self):
        return self.data.input_ids.shape[0]
    
    def __getitem__(self,idx):
        sample={
            key : torch.Tensor(value[idx]) for key,value in self.data.items()
        }
        return sample

In [33]:
dataset=CustomDataSet(inputs)

In [34]:
loader=torch.utils.data.DataLoader(dataset,batch_size=64,shuffle=True)

In [35]:
loader

<torch.utils.data.dataloader.DataLoader at 0x2acd1fcfa90>

In [36]:
from tqdm import tqdm
epochs=2

for epoch in range(epochs):
    step=tqdm(loader,leave=True)
    for batch in step:
        optim.zero_grad()
        input_ids=batch["input_ids"]
        attention_mask=batch["attention_mask"]
        next_sentence_label=batch["next_sentence_label"]
        token_type_ids=batch["token_type_ids"]
        labels=batch["labels"]
        op=model(input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids,
                 next_sentence_label=next_sentence_label,labels=labels)
        loss=op.loss
        loss.backward()
        optim.step()
        
        step.set_description(f"Epoch {epoch}")
        step.set_postfix(loss=loss.item())

Epoch 0: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:55<00:00, 55.68s/it, loss=11.9]
Epoch 1: 100%|████████████████████████████████████████████████████████████████| 1/1 [01:12<00:00, 72.26s/it, loss=8.05]


In [37]:
with torch.no_grad():
    opp=model(**inputs)

In [38]:
opp.keys()

odict_keys(['loss', 'prediction_logits', 'seq_relationship_logits'])

In [39]:
opp.loss

tensor(6.5477)

In [40]:
opp.prediction_logits.shape

torch.Size([15, 147, 30522])

In [41]:
opp.seq_relationship_logits.shape

torch.Size([15, 2])

# NSP outputs

In [42]:
for i in range(len(opp.seq_relationship_logits)):
    print("actual:",inputs.next_sentence_label[i].item())
    print("predicted:",torch.argmax(opp.seq_relationship_logits[i]).item())
    print("\n")

actual: 0
predicted: 0


actual: 0
predicted: 0


actual: 1
predicted: 1


actual: 0
predicted: 0


actual: 1
predicted: 1


actual: 1
predicted: 1


actual: 1
predicted: 0


actual: 0
predicted: 0


actual: 0
predicted: 0


actual: 0
predicted: 0


actual: 1
predicted: 1


actual: 1
predicted: 1


actual: 0
predicted: 0


actual: 1
predicted: 1


actual: 0
predicted: 0




# MLM outputs 

In [60]:
selected[14]

[7, 12, 17, 18, 23, 24, 26, 28, 45, 56]

In [45]:
inputs.input_ids.shape

torch.Size([15, 147])

In [46]:
opp.prediction_logits.shape

torch.Size([15, 147, 30522])

In [61]:
tokenizer.decode(inputs.input_ids[14,17])

'[ M A S K ]'

In [62]:
tokenizer.decode(inputs.labels[14,17])

't e n'

In [50]:
score=torch.nn.functional.softmax(opp.prediction_logits,dim=-1)

In [51]:
pred=torch.argmax(score,axis=-1)

In [52]:
pred.shape

torch.Size([15, 147])

In [63]:
tokenizer.decode(pred[14,17])

'a s'

In [64]:
opp.loss

tensor(6.5477)

In [65]:
tokenizer.decode(inputs.input_ids[14])

'[CLS] though thou shouldst be going [MASK] live three thousand years [MASK] and as many times [MASK] [MASK] years, still remember [MASK] [MASK] man [MASK] any [MASK] life than this which he now lives, nor lives any other than this which he [MASK] loses [SEP] the longest and shortest are thus brought to [MASK] same [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [66]:
tokenizer.decode(inputs.labels[14])

'[CLS] though thou shouldst be going to live three thousand years, and as many times ten thousand years, still remember that no man loses any other life than this which he now lives, nor lives any other than this which he now loses [SEP] the longest and shortest are thus brought to the same [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [67]:
tokenizer.decode(pred[14])

'. though thou shouldst be going to live three thousand years, and as many times as thousand years, still remember no no man has any other life than this which he now lives, nor lives any other than this which he now loses. the longest and shortest are thus brought to the same. thout, to has he in,,, for many as three ten even he i shortest shortest no no shortest other other end the other in man to lives in has lives life life life life life life now now now thousand now now and which no no shortest has other other other life in which lives now loses, he lives life life, lives now now now now longest shortest shortest shortest shortest shortest no shortest shortest not not end life other the'