In [None]:
# Install sacremoses into the kernel's environment (presumably needed by the
# tokenizer loaded further down -- TODO confirm).
%pip install sacremoses
# The string literal below wraps a disabled download step (installing wget and
# fetching pan-tadeusz.txt from wolnelektury.pl). Because it is a string, none
# of the statements inside it are executed.
"""
%pip install wget

import wget
url = 'https://wolnelektury.pl/media/book/txt/pan-tadeusz.txt'
wget.download(url)
"""

In [8]:
import torch
import numpy as np
import transformers
from torch.utils.data import Dataset, DataLoader

import pdb

In [9]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the tokenizer and the pretrained checkpoint (as a causal LM, with
# is_decoder=True), then move the model onto `device`.
tokenizer = transformers.AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
model = transformers.RobertaForCausalLM.from_pretrained("allegro/herbert-klej-cased-v1", is_decoder=True)
model = model.to(device)

In [11]:
class TextDataset(Dataset):
    """Fixed-length, overlapping windows of token ids cut from one text file.

    The whole file is tokenized once. A window of ``block_size`` token ids is
    taken every ``stride`` tokens and passed through
    ``tokenizer.build_inputs_with_special_tokens``, so each stored example is
    ``block_size`` ids plus whatever special tokens the tokenizer adds.
    A text with fewer than ``block_size`` tokens produces an empty dataset.

    Args:
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids`` and
            ``build_inputs_with_special_tokens``.
        file_path: Path of the text file to read; it is decoded as UTF-8.
        block_size: Number of text tokens per window (special tokens excluded).
        stride: Offset, in tokens, between the starts of consecutive windows.
            Defaults to 2, the value that used to be hard-coded.

    Raises:
        ValueError: If ``stride`` is smaller than 1.
    """

    def __init__(self, tokenizer, file_path, block_size=512, stride=2):
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.tokenizer = tokenizer
        self.block_size = block_size
        self.stride = stride

        # Explicit encoding: without it the platform default is used, which
        # can fail on or corrupt non-ASCII text.
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Tokenize the full text once, then slice windows out of the id list.
        tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

        self.examples = [
            tokenizer.build_inputs_with_special_tokens(tokenized_text[start:start + block_size])
            for start in range(0, len(tokenized_text) - block_size + 1, stride)
        ]

    def __len__(self):
        """Return the number of windows extracted from the file."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return window ``i`` as a ``torch.long`` tensor on the notebook-level ``device``."""
        return torch.tensor(self.examples[i], dtype=torch.long, device=device)


In [12]:
# Build the train / validation / test datasets with a shared window length.
# NOTE(review): no cell visible here creates these split files; they are
# expected to already exist in the working directory.
block_size = 100
dataset_train, dataset_valid, dataset_test = (
    TextDataset(tokenizer, split_path, block_size=block_size)
    for split_path in ("tadeusz_train.txt", "tadeusz_valid.txt", "tadeusz_test.txt")
)

In [13]:
# One loader per split; only the training data is reshuffled.
batch_size = 32
train_loader, valid_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=reshuffle)
    for split_dataset, reshuffle in (
        (dataset_train, True),
        (dataset_valid, False),
        (dataset_test, False),
    )
)

In [None]:
def calculate_perplexity(model, tokenizer, inputs):
    """Return the perplexity of ``model`` on a single batch as a Python float.

    ``inputs`` is passed to the model as both ``input_ids`` and ``labels``;
    the result is the exponential of the loss the model reports, computed
    without tracking gradients. ``tokenizer`` is accepted but not used.
    """
    with torch.no_grad():
        batch_loss = model(input_ids=inputs, labels=inputs).loss
    return torch.exp(batch_loss).item()

def test_perplexity(model, tokenizer, data_loader):
    model.eval()
    sum, cnt = 0, 0
    for batch in data_loader:
        batch_size = batch.size(0)
        cnt += batch_size
        sum += calculate_perplexity(model, tokenizer, batch) * batch_size
    return sum / cnt

# Validation-set perplexity of the model as it stands at this point
# (this cell sits before the fine-tuning loop).
print(test_perplexity(model=model, tokenizer=tokenizer, data_loader=valid_loader))

In [15]:
#pdb.set_trace()

In [None]:
# Every vocabulary id except the tokenizer's special tokens; the final
# expression shows how many ids remain.
special_token_ids = tokenizer.all_special_ids
all_token_ids = list(tokenizer.get_vocab().values())
valid_token_ids = list(
    filter(lambda token_id: token_id not in special_token_ids, all_token_ids)
)
len(valid_token_ids)

In [None]:
# Fine-tune the model: every step corrupts a fraction of the input positions
# and asks the model to reproduce the uncorrupted batch (labels=batch).
num_epochs = 1
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005)

num_training_steps = num_epochs * len(train_loader)
lr_scheduler = transformers.get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=300, num_training_steps=num_training_steps
)

# Converted once: np.random.choice would otherwise rebuild an array from the
# Python list on every training step.
replacement_pool = np.asarray(valid_token_ids)

model.train()
_iter = 0  # optimizer steps taken across all epochs
for epoch in range(num_epochs):
  print (f'------ Epoch {epoch} ------')
  iter_epoch = 0  # optimizer steps taken in the current epoch
  for batch in train_loader:
    # Choose 15% of the positions (shared by every sequence in the batch).
    # Of those: int(0.1 * num) receive a random non-special token,
    # int(0.2 * num) - int(0.1 * num) stay unchanged, the rest become the mask token.
    n = batch.size(1)
    num = int(0.15 * n)
    num_left_alone = int(0.2 * num)
    num_replaced = int(0.1 * num)

    # A sample drawn without replacement arrives in random order, so disjoint
    # slices of it are themselves random, non-overlapping subsets.
    chosen_positions = np.random.choice(n, num, replace=False)
    indices_replaced = torch.as_tensor(chosen_positions[:num_replaced], dtype=torch.long, device=batch.device)
    indices_masked = torch.as_tensor(chosen_positions[num_left_alone:], dtype=torch.long, device=batch.device)
    replacement_ids = torch.as_tensor(
        np.random.choice(replacement_pool, num_replaced, replace=False), dtype=torch.long, device=batch.device
    )

    inputs = torch.clone(batch)
    inputs[:, indices_masked] = tokenizer.mask_token_id
    inputs[:, indices_replaced] = replacement_ids  # one token per position, broadcast over the batch

    outputs = model(input_ids=inputs, labels=batch)

    loss = outputs.loss
    loss.backward()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    iter_epoch += 1
    _iter += 1

    if (iter_epoch % 50 == 0):
      # get_last_lr() is the public accessor for the value the private _last_lr holds.
      print(f'Epoch {epoch} - loss: {loss.item()}, lr: {lr_scheduler.get_last_lr()}, iter: {iter_epoch} / {len(train_loader)}')

  # _iter counts steps across epochs, so it is reported against the total step budget.
  print(f'End of epoch {epoch} - loss: {loss.item()}, lr: {lr_scheduler.get_last_lr()}, iter: {_iter} / {num_training_steps}')


In [None]:
#input_ids = tokenizer.encode('Pan Tadeusz, choć młodzik, ale prawem gościa\nWysoko siadł przy damach obok jegomościa;\nMiędzy nim i stryjaszkiem jedno pozostało\nPuste miejsce, jak gdyby na kogoś czekało.', return_tensors='pt')
#input_ids = tokenizer.encode('Klucznik na to słowo\nPobladnął, pochylił się, i ciała połową\nWygięty naprzód, stanął, zwisł na jednej nodze,', return_tensors='pt')
# The training cell leaves the model in train mode and generate() does not
# switch modes itself, so disable dropout explicitly before decoding.
model.eval()
input_ids = tokenizer.encode('Jam jest Jacek', return_tensors='pt')
output_ids = model.generate(input_ids.to(device), max_length=200)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
output_text

In [None]:
# Validation-set perplexity of the model in its current state
# (this cell sits after the fine-tuning loop).
print(test_perplexity(model=model, tokenizer=tokenizer, data_loader=valid_loader))

In [None]:
# Deliberate hard stop: this always raises AssertionError, so a "Run All" halts
# here (presumably to keep the cells below from running automatically -- confirm).
assert False

In [None]:
#input_ids = tokenizer.encode('Pan Tadeusz, choć młodzik, ale prawem gościa\nWysoko siadł przy damach obok jegomościa;\nMiędzy nim i stryjaszkiem jedno pozostało\nPuste miejsce, jak gdyby na kogoś czekało.', return_tensors='pt')
#input_ids = tokenizer.encode('Klucznik na to słowo\nPobladnął, pochylił się, i ciała połową\nWygięty naprzód, stanął, zwisł na jednej nodze,', return_tensors='pt')
# Sampling options belong to generate(), not to tokenizer.encode(), which has
# no temperature argument. Temperature only takes effect when do_sample=True,
# so this cell samples rather than decoding greedily.
model.eval()  # make sure dropout is off regardless of which cells ran before
input_ids = tokenizer.encode('Jam jest Jacek', return_tensors='pt')
output_ids = model.generate(input_ids.to(device), max_length=200, do_sample=True, temperature=1.0)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
output_text

In [None]:
# Validation-set perplexity of the model in its current state.
print(test_perplexity(model=model, tokenizer=tokenizer, data_loader=valid_loader))