### Load libraries and enviromental variables

In [1]:
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

from config import DATA_DIR, MODEL_DIR, TOKENIZER_DIR, BASE_DIR
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from dotenv import load_dotenv
import pandas as pd
from torch.utils.data import DataLoader
from torch.optim import Adam
from tqdm import tqdm
import numpy as np
import neptune
from neptune.utils import stringify_unsupported
import evaluate
from custom_utils.custom_pytorch_utils import TranslationDataset, collate_fn, evaluate_model_on_bleu

print(load_dotenv(dotenv_path=BASE_DIR))

True


### Load model and setup device

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
print(device)
model.to(device);

cuda


In [19]:
model.model.decoder

MarianDecoder(
  (embed_tokens): Embedding(63430, 512, padding_idx=63429)
  (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
  (layers): ModuleList(
    (0-5): 6 x MarianDecoderLayer(
      (self_attn): MarianAttention(
        (k_proj): Linear(in_features=512, out_features=512, bias=True)
        (v_proj): Linear(in_features=512, out_features=512, bias=True)
        (q_proj): Linear(in_features=512, out_features=512, bias=True)
        (out_proj): Linear(in_features=512, out_features=512, bias=True)
      )
      (activation_fn): SiLU()
      (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (encoder_attn): MarianAttention(
        (k_proj): Linear(in_features=512, out_features=512, bias=True)
        (v_proj): Linear(in_features=512, out_features=512, bias=True)
        (q_proj): Linear(in_features=512, out_features=512, bias=True)
        (out_proj): Linear(in_features=512, out_features=512, bias=True)
      )
      (encoder_attn_lay

In [17]:
# Freeze encoder 
for params in model.model.encoder.layers.parameters():
    params.requires_grad = False

In [20]:
model.state_dict

<bound method Module.state_dict of MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(63430, 512, padding_idx=63429)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(63430, 512, padding_idx=63429)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer

### Load and prepare data

In [None]:
data = pd.read_csv(DATA_DIR + "/final_data/all_data.csv")
train_indices = data.sample(frac=0.85, random_state=42).index
train_data = data.iloc[train_indices].reset_index(drop=True)
valid_data = data.drop(train_indices).reset_index(drop=True)

print(train_data.shape)
print(valid_data.shape)

train_data.head()

(1029, 2)
(182, 2)


Unnamed: 0,pl,mig
0,Jaki ma Pan numer domu?,Jaki twój dom numer
1,Czy ma Pani krewnych?,Ty krewni masz
2,Jak nazywa się prezydent polski?,Prezydent polska nazwisko
3,Jakie Pani ma obywatelstwo?,Ty obywatelstwo jakie
4,Gdzie mieszkasz?,Ty mieszkać gdzie


In [None]:
train_dataset = TranslationDataset(train_data.pl, train_data.mig, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

valid_dataset = TranslationDataset(valid_data.pl, valid_data.mig, tokenizer)
valid_dataloader = DataLoader(valid_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)
next(iter(train_dataloader))

{'input_ids': tensor([[  322,    26,    18,    78,    25,  5898, 17983,   788,     2,     0,
          63429, 63429, 63429, 63429, 63429, 63429, 63429],
         [  230,  5986,   151,    54,  2227,    17,   581,    25,  2160,     9,
           7688,  1549, 22251,    10, 10260,     2,     0],
         [  926,  5171,  1117,  7866,   100,  3567,     7,     0, 63429, 63429,
          63429, 63429, 63429, 63429, 63429, 63429, 63429],
         [ 3049,   140,  2320,  2601, 14288, 18334,     7,     0, 63429, 63429,
          63429, 63429, 63429, 63429, 63429, 63429, 63429],
         [ 1123,   538, 11413, 20930,   969,    18,   127,     2,     0, 63429,
          63429, 63429, 63429, 63429, 63429, 63429, 63429],
         [  362,  2320,    45,    88,   389,  2948,   398,     7,     0, 63429,
          63429, 63429, 63429, 63429, 63429, 63429, 63429],
         [  926,  2320, 17579,  7959,  1810,     7,     0, 63429, 63429, 63429,
          63429, 63429, 63429, 63429, 63429, 63429, 63429],
       

### Model training

In [7]:
with neptune.init_run() as run:
    lr = 5e-5
    num_epochs = 5
    optimizer = Adam(model.parameters(), lr=lr)

    run["hyperparameters/learning_rate"] = optimizer.state_dict()['param_groups'][0]['lr']
    run["hyperparameters/optimizer"] = "Adam"
    run["hyperparameters/betas"] = stringify_unsupported(optimizer.state_dict()['param_groups'][0]['betas'])
    run["hyperparameters/eps"] = optimizer.state_dict()['param_groups'][0]['eps']
    run["datasets/train"].track_files("data/final_data/all_data.csv")
    run["hyperparameters/num_epochs"] = num_epochs

    model.train()
    for epoch in range(num_epochs):
        loss_all = 0
        for batch in tqdm(train_dataloader):
            batch = {key: value.to(device) for key, value in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_all += loss.item()
        run["train/loss"].append(np.round(loss_all / len(train_dataloader), 4))
        print(f"Epoch: {epoch + 1}, loss: {np.round(loss_all / len(train_dataloader), 4)}")
        
    run["score/final_loss"] = np.round(loss_all / len(train_dataloader), 4)
    bleu_metric = evaluate.load("bleu")
    bleu_score = evaluate_model_on_bleu(model, valid_dataloader, tokenizer, bleu_metric, device, None)
    run["metrics/BLEU"] = bleu_score



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/kacperurban/pl-mig-translation/e/PLMIG-40


100%|██████████| 129/129 [00:24<00:00,  5.25it/s]


Epoch: 1, loss: 3.1024


100%|██████████| 129/129 [00:25<00:00,  5.00it/s]


Epoch: 2, loss: 1.3719


100%|██████████| 129/129 [00:25<00:00,  5.07it/s]


Epoch: 3, loss: 0.8305


100%|██████████| 129/129 [00:25<00:00,  5.05it/s]


Epoch: 4, loss: 0.6041


100%|██████████| 129/129 [00:24<00:00,  5.17it/s]

Epoch: 5, loss: 0.4318





In [12]:
print(f"BLEU: {bleu_score}")

BLEU: 0.659
