In [1]:
from transformers import DistilBertTokenizer,DistilBertForMaskedLM

In [2]:
# Load the tokenizer that belongs to the "distilbert-base-uncased" checkpoint.
tokenizer1=DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [3]:
# Load the pretrained DistilBERT weights with a masked-language-modelling head.
model1=DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

In [4]:
# Read the whole corpus into memory, then cut it into rough sentences at every
# period (the period itself is dropped by the split).
with open("/content/clean.txt","r") as corpus_file:
  raw_corpus=corpus_file.read()
text=raw_corpus.split(".")

In [5]:
# Put every sentence on a single line by turning embedded newlines into spaces.
texts=[sentence.replace("\n"," ") for sentence in text]

In [6]:
# Number of sentence fragments produced by the period split.
len(texts)

1328

In [39]:
# Training split: tokenize the first 100 sentences into padded, truncated
# PyTorch tensors.
train_sentences=texts[:100]
inputs1=tokenizer1(
    train_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [8]:
# Validation split: tokenize sentences 200..299 into padded, truncated
# PyTorch tensors.
val_sentences=texts[200:300]
inputs2=tokenizer1(
    val_sentences,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

In [40]:
# Keep an untouched copy of the training token ids as the prediction targets.
# This has to run BEFORE the masking cell overwrites positions of input_ids.
inputs1["labels"]=inputs1.input_ids.detach().clone()

In [10]:
# Keep an untouched copy of the validation token ids as the prediction targets.
# This has to run BEFORE the masking cell overwrites positions of input_ids.
inputs2["labels"]=inputs2.input_ids.detach().clone()

In [11]:
import torch

In [41]:
# One uniform [0, 1) draw per training token position; the next cell masks the
# positions whose draw falls below the masking threshold.
# A dedicated, seeded generator makes the chosen positions reproducible across
# kernel restarts without altering the global RNG state used elsewhere.
train_mask_rng=torch.Generator().manual_seed(0)
random=torch.rand(inputs1.input_ids.shape,generator=train_mask_rng)

In [13]:
# One uniform [0, 1) draw per validation token position; the next cell masks the
# positions whose draw falls below the masking threshold.
# A dedicated, seeded generator keeps the validation masks identical between
# runs, so validation numbers stay comparable.
val_mask_rng=torch.Generator().manual_seed(1)
random1=torch.rand(inputs2.input_ids.shape,generator=val_mask_rng)

In [42]:
# Boolean mask of the training positions to hide: draws below 0.15, excluding
# token ids 101, 102 and 0 (presumably [CLS], [SEP] and [PAD] in this vocabulary).
train_ids=inputs1.input_ids
train_maskable=(train_ids!=101)&(train_ids!=102)&(train_ids!=0)
mask_arr1=(random<0.15)&train_maskable

In [15]:
# Boolean mask of the validation positions to hide: draws below 0.15, excluding
# token ids 101, 102 and 0 (presumably [CLS], [SEP] and [PAD] in this vocabulary).
val_ids=inputs2.input_ids
val_maskable=(val_ids!=101)&(val_ids!=102)&(val_ids!=0)
mask_arr=(random1<0.15)&val_maskable

In [16]:
# Record, per validation row, which column indices were chosen for masking ...
select=[torch.flatten(row_mask.nonzero()).tolist() for row_mask in mask_arr]
# ... and overwrite exactly those positions with token id 103
# (presumably the [MASK] token of this vocabulary).
inputs2.input_ids[mask_arr]=103

In [43]:
# Overwrite every selected training position with token id 103
# (presumably the [MASK] token of this vocabulary) in one vectorised assignment.
inputs1.input_ids[mask_arr1]=103

In [18]:
# Put the model into training mode before fine-tuning.
model1.train()

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.

In [19]:
from torch.utils.data import Dataset,DataLoader

In [20]:
class CustomDataSet(Dataset):
  """Map-style dataset over a tokenizer encoding.

  Args:
    data: Mapping from field name to a tensor whose first dimension indexes
      the examples. It must contain the key "input_ids", which defines the
      dataset length.
  """

  def __init__(self,data):
    self.data=data

  def __len__(self):
    """Return the number of examples, i.e. the number of rows of "input_ids"."""
    # Key access works for both tokenizer encodings and plain dicts.
    return self.data["input_ids"].shape[0]

  def __getitem__(self,idx):
    """Return example `idx` as a dict with one tensor per field."""
    # torch.as_tensor keeps the dtype of the stored tensors (integer token ids),
    # whereas the legacy torch.Tensor(...) constructor is float-biased and
    # discouraged for wrapping existing data.
    return {
        key: torch.as_tensor(value[idx]) for key,value in self.data.items()
    }

In [44]:
# Wrap the masked training encoding (input_ids, attention_mask, labels).
train_dataset=CustomDataSet(inputs1)

In [22]:
# Wrap the masked validation encoding (input_ids, attention_mask, labels).
val_dataset=CustomDataSet(inputs2)

In [45]:
# Training batches of 25 examples, reshuffled on every pass over the loader.
loader=DataLoader(train_dataset,batch_size=25,shuffle=True)

In [24]:
# Validation batches of 5 examples.
# NOTE(review): shuffle=True means the collected predictions do not follow the
# order of texts[200:300]; consider shuffle=False for evaluation.
val_loader=DataLoader(val_dataset,batch_size=5,shuffle=True)

In [25]:
from tqdm import tqdm

In [26]:
from torch.optim import AdamW

In [27]:
# AdamW over all model parameters with a learning rate of 5e-5.
optima=AdamW(model1.parameters(),lr=5e-5)

In [28]:
# Fine-tuning: three passes over the training loader. Each step clears the old
# gradients, runs the model with labels so that it returns the loss,
# back-propagates and lets the optimizer update the weights. The progress bar
# shows the current epoch and the loss of the latest batch.
epochs=3
for epoch in range(epochs):
  progress=tqdm(loader,leave=True)
  for batch in progress:
    optima.zero_grad()
    outputs=model1(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],
        labels=batch["labels"],
    )
    batch_loss=outputs.loss
    batch_loss.backward()
    optima.step()
    progress.set_description(f"Epoch {epoch}")
    progress.set_postfix(loss=batch_loss.item())


Epoch 0: 100%|██████████| 1/1 [00:52<00:00, 52.91s/it, loss=11.2]
Epoch 1: 100%|██████████| 1/1 [00:52<00:00, 52.93s/it, loss=8.91]
Epoch 2: 100%|██████████| 1/1 [00:45<00:00, 45.08s/it, loss=7.54]


VALIDATION

In [29]:
# Switch to evaluation mode before running the validation loop.
model1.eval()

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.

In [30]:
# Accumulators filled by the validation loop: one list of token ids per sentence.
all_predictions = []
all_labels = []

In [31]:
import torch.nn.functional as F

In [55]:
# Validation pass: print the loss of every batch and collect the predicted and
# the reference token ids of ALL batches for the accuracy computation.
# Start from fresh lists so that re-running this cell does not append the same
# sentences a second time.
all_predictions = []
all_labels = []

with torch.no_grad():
  for batch in val_loader:
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    opp = model1(input_ids, attention_mask=attention_mask,labels=labels)
    print("loss:",opp.loss)
    # softmax is monotonic, so taking the argmax of the raw logits selects the
    # same token id without the extra normalisation pass.
    logits = opp.logits
    predicted_ids = logits.argmax(dim=-1)
    # These must stay inside the loop; when dedented only the last batch is kept.
    all_predictions.extend(predicted_ids.tolist())
    all_labels.extend(labels.tolist())

loss: tensor(9.4443)
loss: tensor(7.8312)
loss: tensor(8.6227)
loss: tensor(8.6330)
loss: tensor(9.6223)
loss: tensor(9.8002)
loss: tensor(8.0351)
loss: tensor(9.3981)
loss: tensor(7.8095)
loss: tensor(8.9809)
loss: tensor(8.3690)
loss: tensor(8.1738)
loss: tensor(9.4024)
loss: tensor(8.9632)
loss: tensor(8.1413)
loss: tensor(8.8712)
loss: tensor(9.3418)
loss: tensor(7.9674)
loss: tensor(8.9477)
loss: tensor(9.4668)


In [53]:
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix

In [None]:
# Token-level accuracy over the predictions collected by the validation loop.
# Padding positions (and labels equal to the -100 ignore index, should they be
# used) are skipped: nothing meaningful is predicted there, so counting them
# would distort the score.
# NOTE: positions that were not masked are still counted, and the model sees
# their true token in the input. Accuracy on the masked positions alone would
# additionally need the mask built in the masking cell.
ignored_label_ids = {tokenizer1.pad_token_id, -100}
correct_predictions = 0
total_predictions = 0

for sentence_preds, sentence_labels in zip(all_predictions, all_labels):
    kept_pairs = [
        (pred, label)
        for pred, label in zip(sentence_preds, sentence_labels)
        if label not in ignored_label_ids
    ]
    if not kept_pairs:
        # Nothing to score in this sentence.
        continue
    kept_preds = [pred for pred, _ in kept_pairs]
    kept_labels = [label for _, label in kept_pairs]
    print("accuracy_score:",accuracy_score(kept_labels,kept_preds))
    correct_predictions += sum(pred == label for pred, label in kept_pairs)
    total_predictions += len(kept_pairs)

if total_predictions:
    accuracy = correct_predictions / total_predictions
    print(f"Validation Accuracy: {accuracy*100:.4f}")
else:
    # Avoid a ZeroDivisionError when the validation loop has not been run yet.
    accuracy = float("nan")
    print("Validation Accuracy: n/a (no predictions collected)")

Inference (pipeline)



In [52]:
from transformers import pipeline

In [35]:
# Build a fill-mask inference pipeline around the fine-tuned model and its tokenizer.
fill_mask = pipeline("fill-mask", model=model1, tokenizer=tokenizer1)

In [36]:
# Example prompt; the [MASK] placeholder marks the word to be predicted.
input_text = "I am going to vishakapatnam [MASK] morning"

In [37]:
# Run the pipeline on the prompt to get candidate fillers for the masked position.
result =  fill_mask(input_text)

In [38]:
# Print every candidate (score, token id, token string, completed sentence).
for pred in result:
  print(pred)

{'score': 0.33215847611427307, 'token': 4826, 'token_str': 'tomorrow', 'sequence': 'i am going to vishakapatnam tomorrow morning'}
{'score': 0.17943145334720612, 'token': 6229, 'token_str': 'till', 'sequence': 'i am going to vishakapatnam till morning'}
{'score': 0.0697280615568161, 'token': 6928, 'token_str': 'monday', 'sequence': 'i am going to vishakapatnam monday morning'}
{'score': 0.03278611600399017, 'token': 2296, 'token_str': 'every', 'sequence': 'i am going to vishakapatnam every morning'}
{'score': 0.029733987525105476, 'token': 9432, 'token_str': 'thursday', 'sequence': 'i am going to vishakapatnam thursday morning'}
