### Load libraries and enviromental variables

In [80]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from dotenv import load_dotenv
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
import neptune
from neptune.utils import stringify_unsupported
from datasets import load_metric

load_dotenv()

True

### Load model and setup device

In [47]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForSeq2SeqLM.from_pretrained("model/model")
tokenizer = AutoTokenizer.from_pretrained("model/tokenizer/")
print(device)
model.to(device);

cuda


### Load metrics

In [51]:
bleu_metric = load_metric("bleu", trust_remote_code=True)
meteor_metric = load_metric("meteor", trust_remote_code=True)
rouge_metric = load_metric("rouge", trust_remote_code=True)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\urbii\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\urbii\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\urbii\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Load and prepare data

In [75]:
data = pd.read_csv("data/All_data.csv")
train_indices = data.sample(frac=0.85).index
train_data = data.loc[train_indices].reset_index(drop=True)
valid_data = data.drop(train_indices).reset_index(drop=True)

print(train_data.shape)
print(valid_data.shape)

train_data.head()

(1029, 2)
(182, 2)


Unnamed: 0,pl,mig
0,Niedziela to dzień wolny od pracy.,Niedziela praca wolny
1,Czy Pani rozumie co mówię?,Ja mówić ty rozumieć
2,Proszę się tu położyć.,Prosić ty położyć tu
3,Proszę usiąść ponieważ będziemy musieli napisa...,Ja prosić ty siadać wywiad pisać razem musieć
4,Bałam się o moją rodzinę,Ja rodzina bać co wydarzyć


In [78]:
class TranslationDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer):
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.input_texts[idx], return_tensors="pt", padding=True, truncation=True)
        targets = self.tokenizer(self.target_texts[idx], return_tensors="pt", padding=True, truncation=True)
        return {**inputs, "labels": targets["input_ids"]}

In [79]:
def collate_fn(batch):
    input_ids = [item['input_ids'].squeeze() for item in batch]
    attention_mask = [item['attention_mask'].squeeze() for item in batch]
    labels = [item['labels'].squeeze() for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=tokenizer.pad_token_id)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

train_dataset = TranslationDataset(train_data.pl, train_data.mig, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

valid_dataset = TranslationDataset(train_data.pl, train_data.mig, tokenizer)
valid_dataloader = DataLoader(valid_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)
next(iter(train_dataloader))

{'input_ids': tensor([[  700, 27744,    80, 16949, 20637,    43,  5640,    40,  1859, 13954,
             40,     2,     0],
         [  700, 12079,    22,  1122,  2000,     2,     0, 63429, 63429, 63429,
          63429, 63429, 63429],
         [  322,    26,   606,  9847,  8607,     2,     0, 63429, 63429, 63429,
          63429, 63429, 63429],
         [  322,   487,  7610,  1238,     2,     0, 63429, 63429, 63429, 63429,
          63429, 63429, 63429],
         [  700, 13136,  2266,     2,     0, 63429, 63429, 63429, 63429, 63429,
          63429, 63429, 63429],
         [  213,  7898,    90,   140,  1123, 11004, 38131,     7,     0, 63429,
          63429, 63429, 63429],
         [  362,   669,  6166, 15898,     7,     0, 63429, 63429, 63429, 63429,
          63429, 63429, 63429],
         [ 3049,  2320,   140, 10055, 24616,     7,     0, 63429, 63429, 63429,
          63429, 63429, 63429]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1,

### Create evaluation function

In [87]:
def evaluate_model_on_metrics(model, dataloader, tokenizer, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {key: value.to(device) for key, value in batch.items()}
            outputs = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            predictions = [tokenizer.decode(g, skip_special_tokens=True) for g in outputs]
            references = [tokenizer.decode(g, skip_special_tokens=True) for g in batch["labels"]]

            all_preds.extend(predictions)
            all_labels.extend(references)

    # Tokenize predictions and references
    all_preds_tokenized = [pred.split() for pred in all_preds]
    all_labels_tokenized = [[label.split()] for label in all_labels]

    # Compute metrics
    bleu_score = np.round(bleu_metric.compute(predictions=all_preds_tokenized, references=all_labels_tokenized), 3)
    meteor_score = np.round(meteor_metric.compute(predictions=all_preds, references=all_labels), 3)
    rouge_score = np.round(rouge_metric.compute(predictions=all_preds, references=all_labels), 3)

    return bleu_score, meteor_score, rouge_score

### Model training

In [88]:
run = neptune.init_run(tags="test run")
lr = 5e-5
num_epochs = 10
optimizer = AdamW(model.parameters(), lr=lr)

run["hyperparameters/learning_rate"] = optimizer.state_dict()['param_groups'][0]['lr']
run["hyperparameters/optimizer"] = "Adam"
run["hyperparameters/betas"] = stringify_unsupported(optimizer.state_dict()['param_groups'][0]['betas'])
run["hyperparameters/eps"] = optimizer.state_dict()['param_groups'][0]['eps']
run["hyperparameters/weight_decay"] = optimizer.state_dict()['param_groups'][0]['weight_decay']
run["datasets/train"].track_files("data/All_data.csv")
run["hyperparameters/num_epochs"] = num_epochs
run["sys/device"] = device

model.train()
for epoch in range(num_epochs):
    loss_all = 0
    for batch in tqdm(train_dataloader):
        batch = {key: value.to(device) for key, value in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_all += loss.item()
    run["train/loss"].append(np.round(loss_all / len(train_dataloader), 4))
    print(f"Epoch: {epoch}, loss: {np.round(loss_all / len(train_dataloader), 4)}")

    bleu_score, meteor_score, rouge_score = evaluate_model_on_metrics(model, valid_dataloader, tokenizer, device)
    run["valid/bleu"].append(bleu_score["bleu"])
    run["valid/meteor"].append(meteor_score["meteor"])
    run["valid/rouge"].append(rouge_score["rougeL"].mid.fmeasure)
    print(f"Epoch: {epoch}, BLEU: {bleu_score['bleu']}, METEOR: {meteor_score['meteor']}, ROUGE-L: {rouge_score['rougeL'].mid.fmeasure}")
    
run["score/final_loss"] = np.round(loss_all / len(train_dataloader), 4)
run.stop()

[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/kacperurban/pl-mig-translation/e/PLMIG-30


100%|██████████| 129/129 [00:11<00:00, 11.10it/s]


Epoch: 0, loss: 0.8986
Epoch: 0, BLEU: 0.5203396170541483, METEOR: 0.6965631173017779, ROUGE-L: 0.776431939879722


100%|██████████| 129/129 [00:10<00:00, 11.95it/s]


Epoch: 1, loss: 0.4709
Epoch: 1, BLEU: 0.6676348630138037, METEOR: 0.7729933271479248, ROUGE-L: 0.832503478069937


100%|██████████| 129/129 [00:10<00:00, 12.22it/s]


Epoch: 2, loss: 0.2845
Epoch: 2, BLEU: 0.7837010400328199, METEOR: 0.8259982354442593, ROUGE-L: 0.8762372350660732


100%|██████████| 129/129 [00:11<00:00, 11.24it/s]


Epoch: 3, loss: 0.2196
Epoch: 3, BLEU: 0.8165261506325275, METEOR: 0.8399752129966479, ROUGE-L: 0.8874548294879647


100%|██████████| 129/129 [00:10<00:00, 12.39it/s]


Epoch: 4, loss: 0.1753
Epoch: 4, BLEU: 0.8382393973387714, METEOR: 0.8440210643098864, ROUGE-L: 0.8906145136751651


100%|██████████| 129/129 [00:10<00:00, 12.09it/s]


Epoch: 5, loss: 0.1485
Epoch: 5, BLEU: 0.8237773666341363, METEOR: 0.8559493852755075, ROUGE-L: 0.899519585111904


100%|██████████| 129/129 [00:10<00:00, 12.22it/s]


Epoch: 6, loss: 0.136
Epoch: 6, BLEU: 0.850454779495627, METEOR: 0.853933588019353, ROUGE-L: 0.8984082055510618


100%|██████████| 129/129 [00:10<00:00, 12.23it/s]


Epoch: 7, loss: 0.1269
Epoch: 7, BLEU: 0.8311820908307257, METEOR: 0.8531668842461698, ROUGE-L: 0.8973131496385051


100%|██████████| 129/129 [00:11<00:00, 10.99it/s]


Epoch: 8, loss: 0.124
Epoch: 8, BLEU: 0.8550901223884615, METEOR: 0.8547328168102447, ROUGE-L: 0.8981729357458214


100%|██████████| 129/129 [00:11<00:00, 11.54it/s]


Epoch: 9, loss: 0.1152
Epoch: 9, BLEU: 0.8562555060906211, METEOR: 0.8541771268599927, ROUGE-L: 0.8981397651587153
[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] Waiting for the remaining 5 operations to synchronize with Neptune. Do not kill this process.
[neptune] [info   ] All 5 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/kacperurban/pl-mig-translation/e/PLMIG-30/metadata
