In [1]:
from pathlib import Path
import torch

torch.cuda.empty_cache()

In [2]:
paths = [str(x) for x in Path('./').glob('*.txt')]

paths[:5]

['recommend0.txt',
 'recommend1.txt',
 'recommend10.txt',
 'recommend11.txt',
 'recommend12.txt']

In [3]:
!pip install tokenizers

Defaulting to user installation because normal site-packages is not writeable


In [4]:
from tokenizers import ByteLevelBPETokenizer

In [5]:
tokenizer = ByteLevelBPETokenizer()

In [6]:
tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,
                special_tokens=[
                    '<s>', '<pad>', '</s>', '<unk>', '<mask>'
                ])

In [21]:
import os
os.mkdir('recomendayo')

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'recomendayo'

In [7]:
tokenizer.save_model('recomendayo') 

['recomendayo\\vocab.json', 'recomendayo\\merges.txt']

In [8]:
from transformers import RobertaTokenizerFast

In [9]:
tokenizer = RobertaTokenizerFast.from_pretrained('recomendayo')

In [10]:
tokenizer('I want to build something')

{'input_ids': [0, 45, 511, 277, 1471, 818, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [11]:
import torch

def mlm(tensor):
    rand = torch.rand(tensor.shape) 
    mask_arr = (rand < 0.15) * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        tensor[i, selection] = 4
    return tensor

In [12]:
from pathlib import Path

paths = [str(x) for x in Path('./').glob('*.txt')]

paths[:5]

['recommend0.txt',
 'recommend1.txt',
 'recommend10.txt',
 'recommend11.txt',
 'recommend12.txt']

In [13]:
from tqdm.auto import tqdm
input_ids = []
mask = []
labels = []

for path in tqdm(paths):
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    sample = tokenizer(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
    labels.append(sample.input_ids)
    mask.append(sample.attention_mask)
    input_ids.append(mlm(sample.input_ids.detach().clone()))

  0%|          | 0/22 [00:00<?, ?it/s]

In [14]:
input_ids = torch.cat(input_ids)
mask = torch.cat(mask)
labels = torch.cat(labels)

In [15]:
input_ids[0][:10]

tensor([   0,  389,  271,  617,    4, 9968,   16,    4,  485,  268])

In [16]:
encodings = {
    'input_ids': input_ids,
    'attention_mask': mask,
    'labels': labels
}

In [17]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [18]:
dataset = Dataset(encodings)

In [19]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

In [20]:
from transformers import RobertaConfig

In [21]:
config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [24]:
from transformers import RobertaForMaskedLM

In [25]:
model = RobertaForMaskedLM(config)

In [26]:
device = torch.device('cuda')

In [27]:
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [28]:
from transformers import AdamW

In [29]:
model.train()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [30]:
optim = AdamW(model.parameters(), lr=1e-4)



In [31]:
from tqdm.auto import tqdm

In [32]:
epochs = 1
step = 0

In [None]:
loop = tqdm(dataloader, leave=True)
epochs = 1
step = 0

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/53842 [00:00<?, ?it/s]

  0%|          | 0/53842 [00:00<?, ?it/s]

In [78]:
model.save_pretrained('./recomendayo')

In [79]:
from transformers import pipeline

In [80]:
fill = pipeline('fill-mask', model = 'recomendayo', tokenizer = 'recomendayo')

In [93]:

fill(f' Do you want to grab a{fill.tokenizer.mask_token} for the movie?  ')

[{'score': 0.15044544637203217,
  'token': 3398,
  'token_str': ' movie',
  'sequence': ' Do you want to grab a movie for the movie?  '},
 {'score': 0.059360604733228683,
  'token': 1642,
  'token_str': ' game',
  'sequence': ' Do you want to grab a game for the movie?  '},
 {'score': 0.03674054890871048,
  'token': 10034,
  'token_str': ' playlist',
  'sequence': ' Do you want to grab a playlist for the movie?  '},
 {'score': 0.016073888167738914,
  'token': 2003,
  'token_str': ' window',
  'sequence': ' Do you want to grab a window for the movie?  '},
 {'score': 0.01530035212635994,
  'token': 4229,
  'token_str': ' fan',
  'sequence': ' Do you want to grab a fan for the movie?  '}]