In [41]:
from transformers import AutoTokenizer, BertForMaskedLM
import torch

tokenizer = AutoTokenizer.from_pretrained('allegro/herbert-base-cased')
bert_model = BertForMaskedLM.from_pretrained("allegro/herbert-base-cased")

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertForMaskedLM: ['cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [42]:
import pandas as pd
df = pd.read_excel('NLP_CLEAN.xlsx')

In [43]:
text = df[['nlp_2', 'nlp_3', 'nlp_4', 'nlp_5']].values
text = text.flatten()

In [58]:
inputs = tokenizer(list(text), return_tensors='pt', max_length=60, truncation=True, padding='max_length') 
inputs['labels'] = inputs.input_ids.detach().clone()
inputs.input_ids[0]

tensor([    0,  3379,  1028,  2017,  2134, 23389,  1899,  3379,  1028, 37696,
         1899, 24656,  6585,  1899,  2954,  2025,    87,  3953,  2243,  1899,
        47890,  2483,  1899, 13186,  1011,  1899,  3248, 10321,  3304,  1899,
        30926,  4167,  1899,    57,  2009, 14323,  1899, 25503,  1899,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1])

In [59]:
inputs

{'input_ids': tensor([[    0,  3379,  1028,  ...,     1,     1,     1],
        [    0, 47089,  3021,  ...,     1,     1,     1],
        [    0, 44327,  1899,  ...,     1,     1,     1],
        ...,
        [    0,  2351, 16881,  ...,     1,     1,     1],
        [    0, 12863,  1899,  ...,     1,     1,     1],
        [    0,  5862, 34048,  ...,     1,     1,     1]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[    0,  3379,  1028,  ...,     1,     1,     1],
        [    0, 47089,  3021,  ...,     1,     1,     1],
        [    0, 44327, 

In [60]:
rand = torch.rand(inputs.input_ids.shape)
rand

tensor([[0.6732, 0.6339, 0.1024,  ..., 0.6265, 0.1328, 0.1587],
        [0.3380, 0.1709, 0.3890,  ..., 0.8021, 0.5737, 0.1505],
        [0.6612, 0.5807, 0.0407,  ..., 0.2919, 0.0859, 0.8910],
        ...,
        [0.3061, 0.4100, 0.0439,  ..., 0.8145, 0.9591, 0.7011],
        [0.7029, 0.4777, 0.2091,  ..., 0.3848, 0.4556, 0.6926],
        [0.0817, 0.9816, 0.3373,  ..., 0.5345, 0.3194, 0.8380]])

In [61]:
mask_arr = (rand < 0.15) * (inputs.input_ids != 0) * (inputs.input_ids != 1) * (inputs.input_ids != 2)
mask_arr

tensor([[False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        ...,
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [62]:
selection = []

for i in range(mask_arr.shape[0]):
    selection.append(torch.flatten(mask_arr[i].nonzero()).tolist())

In [63]:
for i in range(mask_arr.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [64]:
inputs.input_ids[0][:30]

tensor([    0,  3379,   103,  2017,  2134, 23389,  1899,  3379,  1028, 37696,
         1899, 24656,  6585,  1899,  2954,  2025,    87,   103,   103,  1899,
        47890,  2483,  1899,   103,  1011,   103,  3248,   103,  3304,  1899])

In [65]:
class StatmentsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        
    def __len__(self):
        return len(self.encodings.input_ids)

    

In [66]:
dataset = StatmentsDataset(inputs)

In [67]:
data_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [68]:
data_loader

<torch.utils.data.dataloader.DataLoader at 0x28d12f43fa0>

In [69]:
bert_model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [70]:
from transformers import AdamW

optimizer = AdamW(bert_model.parameters(), lr=1e-3)

In [71]:
from tqdm import tqdm

epochs = 2

for epoch in range(epochs):
    loop = tqdm(data_loader, leave=True)
    for batch in  loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = bert_model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 131/131 [42:54<00:00, 19.65s/it, loss=1.87]
Epoch 1: 100%|██████████| 131/131 [44:28<00:00, 20.37s/it, loss=1.56]


In [133]:
bert_model.save_pretrained("Learning_Pains")