In [5]:
import os 
import sys 
import numpy as np
from tqdm.auto import tqdm
from pathlib import Path


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import transformers
from transformers import AutoModel, BertTokenizerFast,RobertaTokenizer
from transformers import RobertaForMaskedLM
from transformers import RobertaConfig
from transformers import AdamW
from tokenizers import ByteLevelBPETokenizer

# specify GPU
device = torch.device("cuda")
os.environ["CUDA_VISIBLE_DEVICES"]= '4'

In [6]:
with open(os.path.join('JAPA_JPER.txt')) as f: 
    data = f.read()

In [7]:
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
len(data.split('\n'))

1

In [9]:
tokenizer = ByteLevelBPETokenizer()

In [10]:
tokenizer.train(files =['JAPA_JPER.txt'] ,vocab_size=30_522, min_frequency =2, special_tokens = ['<s>',
                                                                                              '<pad>',
                                                                                              '</s>',
                                                                                              '<unk>',
                                                                                              '<mask>',
                                                                                              ])






In [11]:
tokenizer.save_model('./')

['./vocab.json', './merges.txt']

In [12]:
from transformers import RobertaTokenizer

# initialize the tokenizer using the tokenizer we initialized and saved to file
tokenizer = RobertaTokenizer.from_pretrained('./', max_len=512)

In [13]:
def mlm(tensor):
    rand = torch.rand(tensor.shape)
    mask_arr = (rand < 0.15) * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        tensor[i, selection] = 4
    return tensor

In [14]:
paths = [str(x) for x in Path('./').glob("*.txt")]
paths[:5]

['merges.txt', 'JAPA_JPER.txt', 'dropped text.txt', 'test.txt', 'links.txt']

In [15]:
input_ids = []
mask = [] 
labels = [] 

for path in tqdm(paths):
    with open(path, 'r', encoding = 'utf-8') as f: 
        lines = f.read().split('\n')
    sample = tokenizer(lines, max_length = 512, padding = 'max_length', truncation = True, return_tensors = 'pt' )
    labels.append(sample.input_ids)
    mask.append(sample.attention_mask)
    input_ids.append(mlm(sample.input_ids.detach().clone()))

  0%|          | 0/6 [00:00<?, ?it/s]

In [16]:
len(input_ids)

6

In [17]:
input_ids = torch.cat(input_ids)
mask = torch.cat(mask)
labels = torch.cat(labels)

In [18]:
encodings = {
    'input_ids': input_ids, 
    'attention_mask' : mask, 
    'labels': labels
}

In [19]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    def __getitem__(self, i):
        return {
            key: tensor[i] for key, tensor in self.encodings.items()
        }

In [20]:
dataset = Dataset(encodings)

In [21]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [22]:

config = RobertaConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [23]:
model = RobertaForMaskedLM(config)

In [24]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [25]:
# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)



In [None]:
epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/1640 [00:00<?, ?it/s]

  0%|          | 0/1640 [00:00<?, ?it/s]

In [None]:
model.save_pretrained('./')  # and don't forget to save filiBERTo!

In [None]:
torch.cuda.device_count()