### Load libraries and enviromental variables

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from dotenv import load_dotenv
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
import neptune
from neptune.utils import stringify_unsupported
from datasets import load_metric

load_dotenv()

True

### Load model and setup device

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForSeq2SeqLM.from_pretrained("model/model")
tokenizer = AutoTokenizer.from_pretrained("model/tokenizer/")
print(device)
model.to(device);

cuda


### Load metrics

In [3]:
bleu_metric = load_metric("bleu", trust_remote_code=True)
meteor_metric = load_metric("meteor", trust_remote_code=True)
rouge_metric = load_metric("rouge", trust_remote_code=True)

  bleu_metric = load_metric("bleu", trust_remote_code=True)
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\urbii\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\urbii\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\urbii\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Load and prepare data

In [4]:
data = pd.read_csv("data/All_data.csv")
train_indices = data.sample(frac=0.85).index
train_data = data.loc[train_indices].reset_index(drop=True)
valid_data = data.drop(train_indices).reset_index(drop=True)

print(train_data.shape)
print(valid_data.shape)

train_data.head()

(1029, 2)
(182, 2)


Unnamed: 0,pl,mig
0,Czy Pan umie czytać?,Pan czytać umieć
1,Czy Pan jest głuchy?,Pan głuchy
2,To jest numer pogotowia ratunkowego 999.,Pogotowie ratunkowe telefon numer 999
3,Nie umiem wypełnić wniosku.,Ja nie rozumieć co pisac
4,Ja złamałem rękę na ulicy.,Ja ręka ulica złamać


In [5]:
class TranslationDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer):
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.input_texts[idx], return_tensors="pt", padding=True, truncation=True)
        targets = self.tokenizer(self.target_texts[idx], return_tensors="pt", padding=True, truncation=True)
        return {**inputs, "labels": targets["input_ids"]}

In [6]:
def collate_fn(batch):
    input_ids = [item['input_ids'].squeeze() for item in batch]
    attention_mask = [item['attention_mask'].squeeze() for item in batch]
    labels = [item['labels'].squeeze() for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=tokenizer.pad_token_id)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

train_dataset = TranslationDataset(train_data.pl, train_data.mig, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

valid_dataset = TranslationDataset(train_data.pl, train_data.mig, tokenizer)
valid_dataloader = DataLoader(valid_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)
next(iter(train_dataloader))

{'input_ids': tensor([[  700,    22,  5024,  2601,  3327,    19,     2,     0, 63429, 63429,
          63429],
         [  114,  1247,    85,   432,  5496,     2,     0, 63429, 63429, 63429,
          63429],
         [  178,   421,  3449,    23,    45,    26,  4054,    95,     2,     0,
          63429],
         [   50,  4998,    25, 17012,  2422,     2,     0, 63429, 63429, 63429,
          63429],
         [  362,   606,  7549,  1450,   111,   466,   461,  6252,    61,     7,
              0],
         [   12,   243,  2320,    51,  5040,     7,     0, 63429, 63429, 63429,
          63429],
         [  700,   748,  7959,  7225,  4425,    63,     2,     0, 63429, 63429,
          63429],
         [  926,  5171, 24754,  3870,    19,     7,     0, 63429, 63429, 63429,
          63429]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       

### Create evaluation function

In [22]:
def evaluate_model_on_metrics(model, dataloader, tokenizer, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {key: value.to(device) for key, value in batch.items()}
            outputs = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            predictions = [tokenizer.decode(g, skip_special_tokens=True) for g in outputs]
            references = [tokenizer.decode(g, skip_special_tokens=True) for g in batch["labels"]]

            all_preds.extend(predictions)
            all_labels.extend(references)

    # Tokenize predictions and references
    all_preds_tokenized = [pred.split() for pred in all_preds]
    all_labels_tokenized = [[label.split()] for label in all_labels]
    

    # Compute metrics
    bleu_score = np.round(bleu_metric.compute(predictions=all_preds_tokenized, references=all_labels_tokenized)['bleu'], 3)
    meteor_score = np.round(meteor_metric.compute(predictions=all_preds, references=all_labels)['meteor'], 3)
    rouge_score = np.round(rouge_metric.compute(predictions=all_preds, references=all_labels)["rougeL"].mid.fmeasure, 3)

    return bleu_score, meteor_score, rouge_score

### Model training

In [13]:
run = neptune.init_run(tags="test run")
lr = 5e-5
num_epochs = 5
optimizer = AdamW(model.parameters(), lr=lr)

run["hyperparameters/learning_rate"] = optimizer.state_dict()['param_groups'][0]['lr']
run["hyperparameters/optimizer"] = "Adam"
run["hyperparameters/betas"] = stringify_unsupported(optimizer.state_dict()['param_groups'][0]['betas'])
run["hyperparameters/eps"] = optimizer.state_dict()['param_groups'][0]['eps']
run["hyperparameters/weight_decay"] = optimizer.state_dict()['param_groups'][0]['weight_decay']
run["datasets/train"].track_files("data/All_data.csv")
run["hyperparameters/num_epochs"] = num_epochs

model.train()
for epoch in range(num_epochs):
    loss_all = 0
    for batch in tqdm(train_dataloader):
        batch = {key: value.to(device) for key, value in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_all += loss.item()
    run["train/loss"].append(np.round(loss_all / len(train_dataloader), 4))
    print(f"Epoch: {epoch + 1}, loss: {np.round(loss_all / len(train_dataloader), 4)}")

    #bleu_score, meteor_score, rouge_score = evaluate_model_on_metrics(model, valid_dataloader, tokenizer, device)
    #run["valid/bleu"].append(bleu_score)
    #run["valid/meteor"].append(meteor_score)
    #run["valid/rouge"].append(rouge_score)
    #print(f"Epoch: {epoch}, BLEU: {bleu_score}, METEOR: {meteor_score}, ROUGE-L: {rouge_score}")
    
run["score/final_loss"] = np.round(loss_all / len(train_dataloader), 4)
run.stop()

[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/kacperurban/pl-mig-translation/e/PLMIG-33


100%|██████████| 129/129 [00:42<00:00,  3.06it/s]


Epoch: 0, loss: 1.4312


100%|██████████| 129/129 [00:40<00:00,  3.18it/s]


Epoch: 1, loss: 0.844


100%|██████████| 129/129 [00:41<00:00,  3.13it/s]


Epoch: 2, loss: 0.5677


100%|██████████| 129/129 [00:40<00:00,  3.19it/s]


Epoch: 3, loss: 0.4214


100%|██████████| 129/129 [00:40<00:00,  3.17it/s]

Epoch: 4, loss: 0.3195
[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] Waiting for the remaining 2 operations to synchronize with Neptune. Do not kill this process.





[neptune] [info   ] All 2 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/kacperurban/pl-mig-translation/e/PLMIG-33/metadata


In [23]:
bleu_score, meteor_score, rouge_score = evaluate_model_on_metrics(model, valid_dataloader, tokenizer, device)

In [26]:
print(f"BLEU: {bleu_score} METEOR: {meteor_score}, ROUGE: {rouge_score}")

BLEU: 0.754 METEOR: 0.811, ROUGE: 0.864
