In [1]:
!pip install sacrebleu
!pip install evaluate
!pip install bert_score
!pip install sacremoses

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.2.0 sacrebleu-2.5.1
Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1

In [2]:
from transformers import MarianMTModel, MarianTokenizer
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from tqdm import tqdm
from time import time
import evaluate
import math
import itertools

2025-06-22 11:41:35.534134: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750592495.682558      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750592495.727990      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Set seed.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

In [23]:
model_name = "Helsinki-NLP/opus-mt-ko-en"

eval_size = 0.15

BATCH_SIZE = 10
NUM_EPOCHS = 3
LEARNING_RATE = 1e-6
train_file_path = "/kaggle/input/ai-hub-dataset/ai_hub_train_corpus_small.json"
if(torch.cuda.is_available()):
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using {DEVICE} for processing")

Using cuda for processing


In [18]:
def load_json_file(file_path):
    with open(file_path, encoding = 'utf-8') as f:
        data = json.load(f)
    ko_text = [text['ko_text'] for data_point in data for text in data_point['text']]
    en_text = [text['en_text'] for data_point in data for text in data_point['text']]
    return ko_text, en_text


In [19]:

def convert_to_pd(file_path):
    ko_text, en_text = load_json_file(file_path)
    data = {'korean': ko_text, 'english': en_text}
    df = pd.DataFrame(data)
    return df

In [7]:
development_data = convert_to_pd(train_file_path)

In [20]:
# Custom Dataset class.
class TranslationDataset(Dataset):
    def __init__(self, df, tokenizer, max_length = 128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src = self.df['korean'][idx]
        tgt = self.df['english'][idx]

        src_enc = self.tokenizer(src, return_tensors = "pt", padding="max_length", truncation=True, max_length = self.max_length)
        tgt_enc = self.tokenizer(tgt, return_tensors = "pt", padding="max_length", truncation=True, max_length = self.max_length)

        input_ids = src_enc["input_ids"].squeeze()
        attention_mask = src_enc["attention_mask"].squeeze()
        labels = tgt_enc["input_ids"].squeeze()
        labels[labels == self.tokenizer.pad_token_id] = -100 #ignore padding in loss calculation
        return{
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels":labels
        }

In [24]:
def train(model, optimizer, dataloader):
    model.to(DEVICE)
    model.train()

    epoch_loss = 0
    num_iter = 0
    start_time = time()

    progress_bar = tqdm(dataloader, leave=False)
    losses = []
    for batch in progress_bar:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
    
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    
            epoch_loss += loss.item()
            num_iter += 1
            # progress_bar.set_postfix(loss=loss.item())
            if(num_iter%100 == 0):
                print(f"iter = {num_iter}, loss = {loss.item()}")

    avg_loss = epoch_loss / num_iter if num_iter > 0 else float("inf")
    elapsed = time() - start_time
    print(f"Training completed in {elapsed:.2f}s — Avg Loss: {avg_loss:.4f}")
    return avg_loss

def cross_validate():
    num_of_folds = int(1/eval_size)
    print(f"Using {num_of_folds} folds for cross validation")
    bleu = evaluate.load("bleu")
    chrf = evaluate.load("chrf")
    bert_score = evaluate.load("bertscore")
    bleu_score_list = []
    bert_score_list = []
    chrf_score_list = []
    train_loss_list = []
    for fold in range(num_of_folds):
        train_data = development_data[development_data.index % num_of_folds != fold]
        eval_data = development_data[development_data.index % num_of_folds == fold]

        train_data = train_data.reset_index(drop = True)
        eval_data = eval_data.reset_index(drop = True)

        tokenizer = MarianTokenizer.from_pretrained(model_name)
        train_dataset = TranslationDataset(train_data, tokenizer)
        valid_dataset = TranslationDataset(eval_data, tokenizer)
        
        valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

        model = MarianMTModel.from_pretrained(model_name).to(DEVICE)
        optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.98), eps=1e-9)

        print(f"Train model for fold number {fold}")
        train_loss = []
        for epoch in range(NUM_EPOCHS):
            
            train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
            loss = train(model, optimizer, train_dataloader)
            train_loss.append(loss)
            print(f"epoch num = {epoch}, loss = {loss}")
        train_loss_list.append(sum(train_loss)/len(train_loss))
        print(f"Avg Train Loss: {train_loss_list[-1]}")

        print(f"Fine Tuned Model Evaluation Metric Score: Fold num = {fold}")
        model.eval()

        predictions = []
        references = []
        for _, row in tqdm(eval_data.iterrows(), total=len(eval_data)):
            tokenizer.src_lang = "ko_KR"
            inputs = tokenizer(row["korean"], return_tensors="pt", max_length=128, truncation=True, padding="max_length")
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
            output_ids = model.generate(**inputs,decoder_start_token_id=tokenizer.convert_tokens_to_ids("en_XX"), max_new_tokens=128)
            pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            if(row["english"] == ""):
                print()
                print(f"row[english] is empty, pred = {pred}")
            if(pred == ""):
                print(f"pred is empty, expected = {row['english']}")
            predictions.append(pred)
            references.append([row["english"]])
            # break
        print(f"references = {len(references)}, predictions = {len(predictions)}")



        bleu_score = bleu.compute(predictions=predictions, references=references)
        print("Corpus BLEU:", bleu_score["bleu"])
        bleu_score_list.append(bleu_score["bleu"])

        results = bert_score.compute(predictions=predictions, references=references, lang="en")
        print("BERTScore F1:", sum(results["f1"])/len(results["f1"]))
        bert_score_list.append(sum(results["f1"])/len(results["f1"]))

        results = chrf.compute(predictions=predictions, references=references, word_order=2)
        print("chrF++ score:", results["score"])
        chrf_score_list.append(results["score"])

    avg_bleu_score = sum(bleu_score_list)/len(bleu_score_list)
    avg_bert_score = sum(bert_score_list)/len(bert_score_list)
    avg_chrf_score = sum(chrf_score_list)/len(chrf_score_list)
    avg_train_loss = sum(train_loss_list)/len(train_loss_list)
    print(f"Average Corpus BLEU: {avg_bleu_score}")
    print(f"Average BERTScore F1: {avg_bert_score}")
    print(f"Average chrF++ score: {avg_chrf_score}")
    print(f"Average Train Loss: {avg_train_loss}")
    return avg_bleu_score, avg_bert_score, avg_chrf_score, avg_train_loss



In [25]:
cross_validate()

Using 6 folds for cross validation
Train model for fold number 0


 14%|█▍        | 100/696 [00:27<02:48,  3.55it/s]

iter = 100, loss = 6.40846586227417


 29%|██▊       | 200/696 [00:54<02:07,  3.89it/s]

iter = 200, loss = 5.858311653137207


 43%|████▎     | 300/696 [01:19<01:42,  3.88it/s]

iter = 300, loss = 5.674610614776611


 57%|█████▋    | 400/696 [01:46<01:19,  3.72it/s]

iter = 400, loss = 5.331934928894043


 72%|███████▏  | 500/696 [02:12<00:51,  3.82it/s]

iter = 500, loss = 5.341391563415527


 86%|████████▌ | 600/696 [02:38<00:25,  3.83it/s]

iter = 600, loss = 5.120550155639648


                                                 

Training completed in 183.89s — Avg Loss: 5.5431
epoch num = 0, loss = 5.543063895455722


 14%|█▍        | 100/696 [00:26<02:37,  3.79it/s]

iter = 100, loss = 5.044975757598877


 29%|██▊       | 200/696 [00:52<02:10,  3.79it/s]

iter = 200, loss = 4.807192802429199


 43%|████▎     | 300/696 [01:18<01:43,  3.82it/s]

iter = 300, loss = 4.928666114807129


 57%|█████▋    | 400/696 [01:45<01:18,  3.79it/s]

iter = 400, loss = 4.634214401245117


 72%|███████▏  | 500/696 [02:11<00:51,  3.82it/s]

iter = 500, loss = 4.733636856079102


 86%|████████▌ | 600/696 [02:37<00:25,  3.84it/s]

iter = 600, loss = 4.545952796936035


                                                 

Training completed in 182.64s — Avg Loss: 4.6541
epoch num = 1, loss = 4.654110689957936


 14%|█▍        | 100/696 [00:26<02:36,  3.80it/s]

iter = 100, loss = 4.487942695617676


 29%|██▊       | 200/696 [00:52<02:10,  3.81it/s]

iter = 200, loss = 4.452435493469238


 43%|████▎     | 300/696 [01:18<01:44,  3.80it/s]

iter = 300, loss = 4.565787315368652


 57%|█████▋    | 400/696 [01:45<01:17,  3.80it/s]

iter = 400, loss = 4.3652567863464355


 72%|███████▏  | 500/696 [02:11<00:51,  3.80it/s]

iter = 500, loss = 4.379635334014893


 86%|████████▌ | 600/696 [02:37<00:25,  3.79it/s]

iter = 600, loss = 4.301608085632324


                                                 

Training completed in 182.97s — Avg Loss: 4.3148
epoch num = 2, loss = 4.3147756895799745
Avg Train Loss: 4.8373167583312116
Fine Tuned Model Evaluation Metric Score: Fold num = 0


  8%|▊         | 105/1391 [01:02<13:30,  1.59it/s]

pred is empty, expected = The five years of Noh Moo-hyun's government was also a time when the expectations of the labor world were betrayed.


 15%|█▌        | 210/1391 [02:09<13:38,  1.44it/s]

pred is empty, expected = Kim Il-sung, struggling in the anti-Japanese armed struggle, was impressed by the power of nuclear weapons as he saw Japan surrender right away to the US atomic bombing.


 16%|█▌        | 225/1391 [02:21<15:59,  1.22it/s]

pred is empty, expected = Then came the thought of God vaguely believing.


 28%|██▊       | 384/1391 [04:00<11:08,  1.51it/s]

pred is empty, expected = Until now, they have never promised to shutdown their nuclear weapons.


 29%|██▉       | 409/1391 [04:15<11:48,  1.39it/s]

pred is empty, expected = Sometimes the debate got heated and turned into a fight.


 35%|███▍      | 480/1391 [04:55<10:15,  1.48it/s]

pred is empty, expected = Most companies lack experience in the cloud, which is why they have to think about how to have an entire architecture and at what stage, which situations to adopt and apply.


 45%|████▍     | 620/1391 [06:11<07:56,  1.62it/s]

pred is empty, expected = Almost all of the larger works were produced in this bloom studio.


 53%|█████▎    | 734/1391 [07:18<08:03,  1.36it/s]

pred is empty, expected = Riterra criticized that all of these claims were unfounded.


 61%|██████    | 844/1391 [08:32<05:42,  1.60it/s]

pred is empty, expected = There used to be always a chill in his work.


 64%|██████▍   | 890/1391 [08:58<05:36,  1.49it/s]

pred is empty, expected = At that time, there were a lot of theories about the Big Deal.


 77%|███████▋  | 1069/1391 [10:48<03:43,  1.44it/s]

pred is empty, expected = Freud said that there is an ego and a super-ego inside a human being.


 86%|████████▋ | 1200/1391 [12:05<02:11,  1.46it/s]

pred is empty, expected = Because everything is related to the digital revolution and globalization.


100%|██████████| 1391/1391 [13:51<00:00,  1.67it/s]


references = 1391, predictions = 1391
Corpus BLEU: 0.07136376114923786


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.837930300996768
chrF++ score: 24.164306296722433
Train model for fold number 1


 14%|█▍        | 100/696 [00:26<02:36,  3.81it/s]

iter = 100, loss = 6.352525234222412


 29%|██▊       | 200/696 [00:52<02:10,  3.80it/s]

iter = 200, loss = 5.712690830230713


 43%|████▎     | 300/696 [01:18<01:43,  3.82it/s]

iter = 300, loss = 5.767583847045898


 57%|█████▋    | 400/696 [01:45<01:18,  3.76it/s]

iter = 400, loss = 5.236110210418701


 72%|███████▏  | 500/696 [02:11<00:51,  3.81it/s]

iter = 500, loss = 5.263247966766357


 86%|████████▌ | 600/696 [02:37<00:25,  3.84it/s]

iter = 600, loss = 5.155655384063721


                                                 

Training completed in 182.92s — Avg Loss: 5.5442
epoch num = 0, loss = 5.544177021103343


 14%|█▍        | 100/696 [00:26<02:36,  3.80it/s]

iter = 100, loss = 4.961226940155029


 29%|██▊       | 200/696 [00:52<02:10,  3.80it/s]

iter = 200, loss = 4.69627046585083


 43%|████▎     | 300/696 [01:18<01:44,  3.80it/s]

iter = 300, loss = 4.963962078094482


 57%|█████▋    | 400/696 [01:45<01:18,  3.75it/s]

iter = 400, loss = 4.502849578857422


 72%|███████▏  | 500/696 [02:11<00:51,  3.80it/s]

iter = 500, loss = 4.667644500732422


 86%|████████▌ | 600/696 [02:37<00:25,  3.82it/s]

iter = 600, loss = 4.630962371826172


                                                 

Training completed in 182.92s — Avg Loss: 4.6550
epoch num = 1, loss = 4.655025509239613


 14%|█▍        | 100/696 [00:26<02:36,  3.80it/s]

iter = 100, loss = 4.438404083251953


 29%|██▊       | 200/696 [00:52<02:10,  3.81it/s]

iter = 200, loss = 4.326869487762451


 43%|████▎     | 300/696 [01:18<01:43,  3.83it/s]

iter = 300, loss = 4.608471393585205


 57%|█████▋    | 400/696 [01:44<01:18,  3.79it/s]

iter = 400, loss = 4.229825496673584


 72%|███████▏  | 500/696 [02:11<00:51,  3.81it/s]

iter = 500, loss = 4.3687968254089355


 86%|████████▌ | 600/696 [02:37<00:25,  3.82it/s]

iter = 600, loss = 4.351978778839111


                                                 

Training completed in 182.61s — Avg Loss: 4.3185
epoch num = 2, loss = 4.318504417079619
Avg Train Loss: 4.839235649140858
Fine Tuned Model Evaluation Metric Score: Fold num = 1


  2%|▏         | 26/1391 [00:17<17:32,  1.30it/s]

pred is empty, expected = There was no way that I wouldn't feel loved.


  2%|▏         | 32/1391 [00:21<15:24,  1.47it/s]

pred is empty, expected = Lee Sang-joon, the department manager, leaves home after the shift is over, if nothing special happens.


  9%|▊         | 121/1391 [01:18<16:03,  1.32it/s]

pred is empty, expected = To solve this problem, countless scientists have been working on it.


  9%|▉         | 130/1391 [01:23<16:15,  1.29it/s]

pred is empty, expected = Group Cherry Bullet turns into a lovely girl and comes back.


 16%|█▌        | 220/1391 [02:12<14:51,  1.31it/s]

pred is empty, expected = I loved myself like this without knowing it.


 26%|██▌       | 360/1391 [03:43<11:15,  1.53it/s]

pred is empty, expected = The civil petition bulletin board has been turned into a forum for public debate.


 27%|██▋       | 370/1391 [03:48<11:21,  1.50it/s]

pred is empty, expected = When the enemy champion was narrowly missed, a sigh covered the stadium.


 28%|██▊       | 387/1391 [03:58<12:31,  1.34it/s]

pred is empty, expected = This is the first time Nexon opened its contents festival with another company.


 36%|███▌      | 495/1391 [05:03<11:59,  1.24it/s]

pred is empty, expected = This attitude of operators is hard to get, yet understandable.


 46%|████▌     | 643/1391 [06:20<07:23,  1.69it/s]

pred is empty, expected = Pastor Park said, “If the Christians find a way to reveal God in their lives, the church helps them.”


 63%|██████▎   | 875/1391 [08:31<06:35,  1.31it/s]

pred is empty, expected = It is the contents where the representative Hwang has always insisted on the public welfare Campaign.


 64%|██████▍   | 894/1391 [08:42<05:45,  1.44it/s]

pred is empty, expected = The ubervisors don't really know that they are ubervisors.


 69%|██████▉   | 958/1391 [09:21<05:25,  1.33it/s]

pred is empty, expected = In his lifetime, Roh had high expectations for Sejong City.


 69%|██████▉   | 959/1391 [09:22<05:57,  1.21it/s]

pred is empty, expected = The large rock was deeply etched with strange patterns.


 86%|████████▋ | 1200/1391 [11:55<02:12,  1.44it/s]

pred is empty, expected = Most important companies have left behind, so there was no way to hold on.


 92%|█████████▏| 1283/1391 [12:40<01:09,  1.56it/s]

pred is empty, expected = We know all about the potential and scale of the team.


100%|██████████| 1391/1391 [13:42<00:00,  1.69it/s]


references = 1391, predictions = 1391
Corpus BLEU: 0.0705198986058687




BERTScore F1: 0.8364042266935517
chrF++ score: 24.079297573412727
Train model for fold number 2


 14%|█▍        | 100/696 [00:26<02:35,  3.84it/s]

iter = 100, loss = 6.41370153427124


 29%|██▊       | 200/696 [00:52<02:10,  3.80it/s]

iter = 200, loss = 5.7654008865356445


 43%|████▎     | 300/696 [01:18<01:44,  3.79it/s]

iter = 300, loss = 5.699131965637207


 57%|█████▋    | 400/696 [01:45<01:18,  3.79it/s]

iter = 400, loss = 5.2071027755737305


 72%|███████▏  | 500/696 [02:11<00:51,  3.81it/s]

iter = 500, loss = 5.162962913513184


 86%|████████▌ | 600/696 [02:37<00:25,  3.82it/s]

iter = 600, loss = 5.123218536376953


                                                 

Training completed in 182.61s — Avg Loss: 5.5417
epoch num = 0, loss = 5.541655598700731


 14%|█▍        | 100/696 [00:26<02:37,  3.78it/s]

iter = 100, loss = 4.918844699859619


 29%|██▊       | 200/696 [00:52<02:12,  3.75it/s]

iter = 200, loss = 4.728308200836182


 43%|████▎     | 300/696 [01:19<01:44,  3.79it/s]

iter = 300, loss = 4.891857624053955


 57%|█████▋    | 400/696 [01:45<01:18,  3.78it/s]

iter = 400, loss = 4.539035320281982


 72%|███████▏  | 500/696 [02:11<00:51,  3.82it/s]

iter = 500, loss = 4.609811305999756


 86%|████████▌ | 600/696 [02:37<00:25,  3.83it/s]

iter = 600, loss = 4.559452533721924


                                                 

Training completed in 182.84s — Avg Loss: 4.6504
epoch num = 1, loss = 4.650378407075487


 14%|█▍        | 100/696 [00:26<02:36,  3.81it/s]

iter = 100, loss = 4.443906784057617


 29%|██▊       | 200/696 [00:52<02:10,  3.79it/s]

iter = 200, loss = 4.306105613708496


 43%|████▎     | 300/696 [01:18<01:43,  3.81it/s]

iter = 300, loss = 4.546046733856201


 57%|█████▋    | 400/696 [01:45<01:18,  3.79it/s]

iter = 400, loss = 4.24180793762207


 72%|███████▏  | 500/696 [02:11<00:51,  3.78it/s]

iter = 500, loss = 4.305178642272949


 86%|████████▌ | 600/696 [02:37<00:25,  3.83it/s]

iter = 600, loss = 4.280849456787109


                                                 

Training completed in 182.87s — Avg Loss: 4.3151
epoch num = 2, loss = 4.315079388947322
Avg Train Loss: 4.835704464907846
Fine Tuned Model Evaluation Metric Score: Fold num = 2


  2%|▏         | 32/1391 [00:17<16:48,  1.35it/s]

pred is empty, expected = Lee Sang-joon said, “The joy of being able to help someone in my busy life has been the driving force to do the voluntary service for 10 years.”


  8%|▊         | 108/1391 [01:03<16:01,  1.33it/s]

pred is empty, expected = The prosecution's special self-management is urgently needed to avoid unnecessary misunderstandings.


  9%|▉         | 122/1391 [01:14<16:19,  1.30it/s]

pred is empty, expected = This could mean that Cheong Wa Dae gave a false briefing to the public.


  9%|▉         | 132/1391 [01:21<17:32,  1.20it/s]

pred is empty, expected = Chairman Liu is an American alumnus of financial affairs with relationships with American Republican politicians.


 16%|█▌        | 216/1391 [02:11<12:02,  1.63it/s]

pred is empty, expected = In fact, there are many cases where large corporate traders have broken security.


 16%|█▋        | 229/1391 [02:17<11:18,  1.71it/s]

pred is empty, expected = Influenced by Minister Park's remarks, BTC prices plunged.


 17%|█▋        | 232/1391 [02:19<14:23,  1.34it/s]

pred is empty, expected = Chinatown, which attracts 300,000 people every year, is a must.


 20%|██        | 285/1391 [02:52<13:00,  1.42it/s]

pred is empty, expected = Myeonbong Mountain is also a favorite place for wild flower lovers.


 25%|██▍       | 346/1391 [03:28<11:58,  1.45it/s]

pred is empty, expected = President Moon's strong points are his sincere words and actions.


 30%|██▉       | 411/1391 [04:09<11:57,  1.37it/s]

pred is empty, expected = Both are to some extent responsible for the removal.


 39%|███▉      | 546/1391 [05:36<10:32,  1.34it/s]

pred is empty, expected = It is since manager Ole Gunnar Solskjaer came up with a counterattack-oriented strategy.


 41%|████      | 572/1391 [05:51<08:34,  1.59it/s]

pred is empty, expected = The fans are especially greedy to win this season.


 44%|████▎     | 606/1391 [06:09<11:33,  1.13it/s]

pred is empty, expected = The purpose of the two countries to cooperate in order to promote internationalist value.


 48%|████▊     | 665/1391 [06:44<09:24,  1.29it/s]

pred is empty, expected = In this context, Dean Acheson, who later becomes Secretary of State, intervened.


 48%|████▊     | 668/1391 [06:46<07:54,  1.53it/s]

pred is empty, expected = The financial authorities' responsibility, which Lawmaker Park claims, is largely attributable to allegations of the preferential listing to Samsung Biologics and management oversight of some accounting firms' bungled accounting reports.


 50%|█████     | 700/1391 [07:04<06:38,  1.73it/s]

pred is empty, expected = There are also many students who are interested in composing music.


 53%|█████▎    | 742/1391 [07:32<07:13,  1.50it/s]

pred is empty, expected = Then, they put up a normalization condition of the National Assembly that the Democratic Party would be hard to accept.


 54%|█████▍    | 749/1391 [07:37<08:32,  1.25it/s]

pred is empty, expected = Former minister Yoon was concerned about the possibility of a longer parliamentary vacuum.


 59%|█████▉    | 823/1391 [08:26<07:32,  1.25it/s]

pred is empty, expected = The semipublic system is to compensate for the deficit of bus companies instead of having the local governments take control of the route.


 70%|███████   | 974/1391 [09:56<05:31,  1.26it/s]

pred is empty, expected = An unexpected significant loophole is revealed in the operation of Economic, Social & Labor Council.


 76%|███████▋  | 1064/1391 [10:55<03:43,  1.47it/s]

pred is empty, expected = The living room where Sun-hye escaped to was so cold that she felt chill.


 79%|███████▉  | 1100/1391 [11:21<04:15,  1.14it/s]

pred is empty, expected = Among them, Ancelotti and Zidane were all three times.


100%|██████████| 1391/1391 [14:32<00:00,  1.59it/s]


references = 1391, predictions = 1391
Corpus BLEU: 0.06496198401587254




BERTScore F1: 0.831022311251083
chrF++ score: 23.708671760484307
Train model for fold number 3


 14%|█▍        | 100/696 [00:26<02:34,  3.85it/s]

iter = 100, loss = 6.419219017028809


 29%|██▊       | 200/696 [00:52<02:10,  3.79it/s]

iter = 200, loss = 5.742475509643555


 43%|████▎     | 300/696 [01:18<01:44,  3.77it/s]

iter = 300, loss = 5.722564220428467


 57%|█████▋    | 400/696 [01:45<01:18,  3.76it/s]

iter = 400, loss = 5.200925827026367


 72%|███████▏  | 500/696 [02:11<00:51,  3.78it/s]

iter = 500, loss = 5.173435688018799


 86%|████████▌ | 600/696 [02:38<00:25,  3.79it/s]

iter = 600, loss = 5.158966064453125


                                                 

Training completed in 183.33s — Avg Loss: 5.5429
epoch num = 0, loss = 5.542940585092566


 14%|█▍        | 100/696 [00:26<02:37,  3.79it/s]

iter = 100, loss = 4.950761318206787


 29%|██▊       | 200/696 [00:53<02:11,  3.76it/s]

iter = 200, loss = 4.801858901977539


 43%|████▎     | 300/696 [01:19<01:44,  3.80it/s]

iter = 300, loss = 4.895598888397217


 57%|█████▋    | 400/696 [01:45<01:18,  3.76it/s]

iter = 400, loss = 4.511324882507324


 72%|███████▏  | 500/696 [02:12<00:51,  3.80it/s]

iter = 500, loss = 4.5897626876831055


 86%|████████▌ | 600/696 [02:38<00:25,  3.80it/s]

iter = 600, loss = 4.571308135986328


                                                 

Training completed in 183.74s — Avg Loss: 4.6537
epoch num = 1, loss = 4.653674502824915


 14%|█▍        | 100/696 [00:26<02:37,  3.79it/s]

iter = 100, loss = 4.5077714920043945


 29%|██▊       | 200/696 [00:52<02:10,  3.79it/s]

iter = 200, loss = 4.395936012268066


 43%|████▎     | 300/696 [01:19<01:44,  3.78it/s]

iter = 300, loss = 4.5246758460998535


 57%|█████▋    | 400/696 [01:45<01:18,  3.77it/s]

iter = 400, loss = 4.222546100616455


 72%|███████▏  | 500/696 [02:12<00:51,  3.82it/s]

iter = 500, loss = 4.346519947052002


 86%|████████▌ | 600/696 [02:38<00:25,  3.84it/s]

iter = 600, loss = 4.278741836547852


                                                 

Training completed in 183.37s — Avg Loss: 4.3158
epoch num = 2, loss = 4.3157603703011045
Avg Train Loss: 4.837458486072862
Fine Tuned Model Evaluation Metric Score: Fold num = 3


  2%|▏         | 22/1391 [00:14<18:15,  1.25it/s]

pred is empty, expected = Daeyuwinia is struggling with kimchi refrigerator.


  4%|▍         | 59/1391 [00:34<16:16,  1.36it/s]

pred is empty, expected = His/her strong vision and dedication played a main role.


  8%|▊         | 108/1391 [01:08<19:07,  1.12it/s]

pred is empty, expected = Commissioner Won's charges will be determined by the prosecution.


 10%|▉         | 135/1391 [01:27<15:30,  1.35it/s]

pred is empty, expected = Oh Soo-jin also thanked the heart and took good care of the grandmother.


 23%|██▎       | 325/1391 [03:29<15:38,  1.14it/s]

pred is empty, expected = Experts failed President Moon and Cheong Wa Dae.


 25%|██▍       | 346/1391 [03:43<13:34,  1.28it/s]

pred is empty, expected = I wish President Moon would hold press conferences much more often.


 33%|███▎      | 459/1391 [04:53<12:39,  1.23it/s]

pred is empty, expected = It was not that difficult to analyze it.


 42%|████▏     | 579/1391 [06:04<09:48,  1.38it/s]

pred is empty, expected = It is necessary to respect the manager's inclinations and values.


 50%|█████     | 697/1391 [07:12<09:02,  1.28it/s]

pred is empty, expected = Coupang's data value can be significant in SVF.


 60%|█████▉    | 832/1391 [08:43<05:29,  1.70it/s]

pred is empty, expected = "It is meaningful in that we can set the criteria of people who play games pathologically and study their treatments," Professor Noh said.


 60%|██████    | 837/1391 [08:47<07:36,  1.21it/s]

pred is empty, expected = At that time, the French audience enthusiastically watched director Kim Ki-young's movie.


 71%|███████▏  | 993/1391 [10:31<05:34,  1.19it/s]

pred is empty, expected = Professor Park pointed out that the analysis of the finance officer Park Jong-gyu made a interpretation error.


 77%|███████▋  | 1067/1391 [11:28<04:08,  1.30it/s]

pred is empty, expected = The pure charm of Yeongyo that has never experienced hard times is reflected.


 80%|███████▉  | 1109/1391 [11:54<03:52,  1.21it/s]

pred is empty, expected = This is the cause of inconvenience to the passengers of the bus.


 96%|█████████▌| 1329/1391 [14:14<00:39,  1.59it/s]

pred is empty, expected = The ball touched the left arm of the rose rushing for a tackle.


100%|██████████| 1391/1391 [14:50<00:00,  1.56it/s]


references = 1391, predictions = 1391
Corpus BLEU: 0.06922429599839744




BERTScore F1: 0.8360571155524099
chrF++ score: 24.36734957816182
Train model for fold number 4


 14%|█▍        | 100/696 [00:26<02:35,  3.83it/s]

iter = 100, loss = 6.33998966217041


 29%|██▊       | 200/696 [00:52<02:10,  3.79it/s]

iter = 200, loss = 5.850435256958008


 43%|████▎     | 300/696 [01:19<01:44,  3.79it/s]

iter = 300, loss = 5.673503398895264


 57%|█████▋    | 400/696 [01:45<01:18,  3.79it/s]

iter = 400, loss = 5.267263889312744


 72%|███████▏  | 500/696 [02:11<00:51,  3.81it/s]

iter = 500, loss = 5.179368495941162


 86%|████████▌ | 600/696 [02:38<00:25,  3.79it/s]

iter = 600, loss = 5.112772464752197


                                                 

Training completed in 183.17s — Avg Loss: 5.5408
epoch num = 0, loss = 5.540787952384729


 14%|█▍        | 100/696 [00:26<02:36,  3.80it/s]

iter = 100, loss = 4.888105392456055


 29%|██▊       | 200/696 [00:52<02:10,  3.81it/s]

iter = 200, loss = 4.8005876541137695


 43%|████▎     | 300/696 [01:18<01:43,  3.81it/s]

iter = 300, loss = 4.811751842498779


 57%|█████▋    | 400/696 [01:45<01:17,  3.80it/s]

iter = 400, loss = 4.633802890777588


 72%|███████▏  | 500/696 [02:11<00:52,  3.76it/s]

iter = 500, loss = 4.6377692222595215


 86%|████████▌ | 600/696 [02:38<00:25,  3.79it/s]

iter = 600, loss = 4.604820251464844


                                                 

Training completed in 183.11s — Avg Loss: 4.6536
epoch num = 1, loss = 4.653609899953864


 14%|█▍        | 100/696 [00:26<02:37,  3.79it/s]

iter = 100, loss = 4.422440052032471


 29%|██▊       | 200/696 [00:52<02:11,  3.78it/s]

iter = 200, loss = 4.459619045257568


 43%|████▎     | 300/696 [01:18<01:44,  3.80it/s]

iter = 300, loss = 4.492384433746338


 57%|█████▋    | 400/696 [01:45<01:18,  3.75it/s]

iter = 400, loss = 4.316017150878906


 72%|███████▏  | 500/696 [02:11<00:51,  3.80it/s]

iter = 500, loss = 4.325144290924072


 86%|████████▌ | 600/696 [02:38<00:25,  3.79it/s]

iter = 600, loss = 4.299601078033447


                                                 

Training completed in 183.11s — Avg Loss: 4.3161
epoch num = 2, loss = 4.316053084943486
Avg Train Loss: 4.836816979094027
Fine Tuned Model Evaluation Metric Score: Fold num = 4


  8%|▊         | 107/1390 [01:07<13:45,  1.55it/s]

pred is empty, expected = The union rebels against this, and the tension between labor and labor increased.


 10%|▉         | 134/1390 [01:26<19:11,  1.09it/s]

pred is empty, expected = The only thing Park Mi-soon had was the caregiver license, which she got about 10 years ago.


 11%|█         | 155/1390 [01:37<12:25,  1.66it/s]

pred is empty, expected = His father's influence was great that Chung entered the teaching profession


 51%|█████     | 703/1390 [07:10<08:50,  1.29it/s]

pred is empty, expected = Focused on investigating charges handed by the police at that time.


 54%|█████▍    | 749/1390 [07:38<07:22,  1.45it/s]

pred is empty, expected = Just as there was a democratic versus anti-democratic structure in the past.


 57%|█████▋    | 789/1390 [08:03<07:19,  1.37it/s]

pred is empty, expected = There were mixed views on President Moon's influence over the nomination process.


 61%|██████    | 848/1390 [08:42<07:26,  1.21it/s]

pred is empty, expected = Several people, including the leader of the Brexit party, Nayal Faraji, were hit.


 84%|████████▍ | 1172/1390 [12:05<02:24,  1.51it/s]

pred is empty, expected = CEO Kim Kyu-sung reached out when the business was in a difficult situation.


 93%|█████████▎| 1297/1390 [13:22<01:04,  1.45it/s]

pred is empty, expected = Candidate striker Fernando Llorente should lead the attack.


 94%|█████████▍| 1304/1390 [13:26<00:53,  1.62it/s]

pred is empty, expected = It's amazing that he respects this plan.


 97%|█████████▋| 1350/1390 [13:54<00:26,  1.50it/s]

pred is empty, expected = Manchester City manager Pep Guardiola has been caught in criticism.


100%|██████████| 1390/1390 [14:18<00:00,  1.62it/s]

pred is empty, expected = It remains to be seen what choice head coach Bento will make regarding Lee Kang-in.
references = 1390, predictions = 1390





Corpus BLEU: 0.07210318481018838




BERTScore F1: 0.8386159729185722
chrF++ score: 24.306842449834313
Train model for fold number 5


 14%|█▍        | 100/696 [00:26<02:36,  3.81it/s]

iter = 100, loss = 6.2819695472717285


 29%|██▊       | 200/696 [00:52<02:10,  3.80it/s]

iter = 200, loss = 5.8282318115234375


 43%|████▎     | 300/696 [01:18<01:44,  3.80it/s]

iter = 300, loss = 5.669398784637451


 57%|█████▋    | 400/696 [01:45<01:19,  3.74it/s]

iter = 400, loss = 5.16564416885376


 72%|███████▏  | 500/696 [02:11<00:51,  3.79it/s]

iter = 500, loss = 5.195357799530029


 86%|████████▌ | 600/696 [02:38<00:25,  3.80it/s]

iter = 600, loss = 5.091230869293213


                                                 

Training completed in 183.37s — Avg Loss: 5.5406
epoch num = 0, loss = 5.540595851410394


 14%|█▍        | 100/696 [00:26<02:36,  3.80it/s]

iter = 100, loss = 4.846850872039795


 29%|██▊       | 200/696 [00:52<02:10,  3.80it/s]

iter = 200, loss = 4.828793048858643


 43%|████▎     | 300/696 [01:18<01:44,  3.79it/s]

iter = 300, loss = 4.826534271240234


 57%|█████▋    | 400/696 [01:45<01:18,  3.77it/s]

iter = 400, loss = 4.535833835601807


 72%|███████▏  | 500/696 [02:11<00:51,  3.77it/s]

iter = 500, loss = 4.640176296234131


 86%|████████▌ | 600/696 [02:38<00:25,  3.82it/s]

iter = 600, loss = 4.490762710571289


                                                 

Training completed in 183.37s — Avg Loss: 4.6527
epoch num = 1, loss = 4.652657541735419


 14%|█▍        | 100/696 [00:26<02:36,  3.81it/s]

iter = 100, loss = 4.339537620544434


 29%|██▊       | 200/696 [00:52<02:11,  3.78it/s]

iter = 200, loss = 4.4594197273254395


 43%|████▎     | 300/696 [01:18<01:44,  3.80it/s]

iter = 300, loss = 4.533886432647705


 57%|█████▋    | 400/696 [01:45<01:18,  3.76it/s]

iter = 400, loss = 4.267194747924805


 72%|███████▏  | 500/696 [02:11<00:51,  3.79it/s]

iter = 500, loss = 4.262528896331787


 86%|████████▌ | 600/696 [02:38<00:25,  3.80it/s]

iter = 600, loss = 4.217762470245361


                                                 

Training completed in 183.41s — Avg Loss: 4.3177
epoch num = 2, loss = 4.317694424897775
Avg Train Loss: 4.836982606014529
Fine Tuned Model Evaluation Metric Score: Fold num = 5


 13%|█▎        | 180/1390 [01:53<16:59,  1.19it/s]

pred is empty, expected = Cheong Wa Dae was the place where former president Roh made efforts to reform politics.


 18%|█▊        | 257/1390 [02:43<14:07,  1.34it/s]

pred is empty, expected = All the customers share the sweets they brought.


 19%|█▉        | 262/1390 [02:46<13:55,  1.35it/s]

pred is empty, expected = Kang Sung-hyuk was the runner-up and Lee Dong-jun was the third.


 30%|██▉       | 414/1390 [04:20<11:12,  1.45it/s]

pred is empty, expected = Everyone should think that there should be an election.


 33%|███▎      | 453/1390 [04:43<11:59,  1.30it/s]

pred is empty, expected = Earlier, Korean scientists proposed these dark matter candidates.


 33%|███▎      | 464/1390 [04:49<07:35,  2.03it/s]

pred is empty, expected = Football can't be so fun for him.


 34%|███▍      | 475/1390 [04:57<08:04,  1.89it/s]

pred is empty, expected = Vidal's efforts to take the flow of the midfield were significant.


 35%|███▌      | 489/1390 [05:04<09:01,  1.67it/s]

pred is empty, expected = He was interested in basic natural phenomena, the ultimate cause, and the structure of the universe.


 38%|███▊      | 527/1390 [05:27<10:35,  1.36it/s]

pred is empty, expected = In other words, Sterling's unconscionable behavior has become a catalyst that accelerated VAR.


 42%|████▏     | 585/1390 [05:58<08:39,  1.55it/s]

pred is empty, expected = This is why the German people's affection for Müller is special.


 45%|████▍     | 619/1390 [06:18<08:22,  1.53it/s]

pred is empty, expected = The statues of the Buddha and bicycles that Paik Nam-june bought from flea or antique markets were stacked.


 51%|█████     | 711/1390 [07:12<09:45,  1.16it/s]

pred is empty, expected = One of the most important things head coach Bento thinks is identity.


 56%|█████▌    | 781/1390 [07:56<07:01,  1.44it/s]

pred is empty, expected = The children's voices somehow created an atmosphere of excitement and entertainment.


 73%|███████▎  | 1013/1390 [10:17<04:57,  1.27it/s]

pred is empty, expected = Above all, it is feared that the public will lose confidence in the president's words.


 77%|███████▋  | 1074/1390 [10:55<04:02,  1.30it/s]

pred is empty, expected = Both of them included the purpose of drawing out underground funds.


 87%|████████▋ | 1214/1390 [12:21<02:07,  1.39it/s]

pred is empty, expected = That is now becoming the opposite.


 96%|█████████▌| 1332/1390 [13:26<00:37,  1.54it/s]

pred is empty, expected = TikTok has already been cited as a window for sharing obscene materials and has been banned from use in Bangladesh.


 99%|█████████▊| 1372/1390 [13:50<00:10,  1.65it/s]

pred is empty, expected = This is why Jung Woo-young was physically exhausted in the second half.


100%|██████████| 1390/1390 [14:00<00:00,  1.65it/s]


references = 1390, predictions = 1390
Corpus BLEU: 0.06738325548915884




BERTScore F1: 0.8338035600648509
chrF++ score: 23.838595489063902
Average Corpus BLEU: 0.06925939667812063
Average BERTScore F1: 0.8356389145795394
Average chrF++ score: 24.07751052461325
Average Train Loss: 4.837252490593555


(0.06925939667812063, 0.8356389145795394, 24.07751052461325, 4.837252490593555)

In [None]:
'''
LearningRate:        NA                  ,1e-4               ,1e-4               ,1e-4               ,1e-4               ,1e-5               ,1e-6               ,1e-4               ,1e-5               ,1e-6
BatchSize:           NA                  ,30                 ,20                 ,10                 ,5                  ,10                 ,10                 ,10                 ,10                 ,10
Epochs:              NA                  ,1                  ,1                  ,1                  ,1                  ,1                  ,1                  ,3                  ,3                  ,3
train_loss:          NA                  ,3.1573             ,3.008              ,2.815              ,2.69               ,4.14               ,5.54               ,2.06               ,3.41               ,4.83
BERTScore F1:        0.9099301791124955  ,0.8487134749304035 ,0.8803324055528926 ,0.8809359362739289 ,0.8885468969801943 ,0.8451397162140034 ,0.8686191831267568 ,0.9014879917245895 ,0.8647126556345132 ,0.8356389145795394
bleu(eval):          0.172               ,0.066              ,0.107              ,0.112              ,0.129              ,0.0622             ,0.118              ,0.166              ,0.087              ,0.069
chrf(eval):          42.175              ,25.98              ,33.57              ,33.72              ,36.82              ,23.93              ,31.27              ,42.042             ,28.39              ,24.077
'''

'\nLearningRate: 1e-4\nBatchSize: 30\ntrain_loss:3.1573\nbert(eval):\nbleu(eval):\nchrf(eval):\n'

In [None]:
model.eval()

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [None]:
predictions = []
references = []

for _, row in tqdm(eval_data.iterrows(), total=len(eval_data)):
    tokenizer.src_lang = "ko_KR"
    inputs = tokenizer(row["korean"], return_tensors="pt", max_length=128, truncation=True, padding="max_length")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    output_ids = model.generate(**inputs,decoder_start_token_id=tokenizer.convert_tokens_to_ids("en_XX"), max_new_tokens=128)
    pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    predictions.append(pred)
    references.append([row["english"]])

100%|██████████| 835/835 [16:42<00:00,  1.20s/it]


In [None]:


print("Fine Tuned Model Evaluation Metric Score:")
bleu = evaluate.load("bleu")
bleu_score = bleu.compute(predictions=predictions, references=references)
print("Corpus BLEU:", bleu_score["bleu"])
bert_score = evaluate.load("bertscore")
results = bert_score.compute(predictions=predictions, references=references, lang="en")
print("BERTScore Precision:", sum(results["precision"])/len(results["precision"]))
print("BERTScore Recall:", sum(results["recall"])/len(results["recall"]))
print("BERTScore F1:", sum(results["f1"])/len(results["f1"]))
chrf = evaluate.load("chrf")
results = chrf.compute(predictions=predictions, references=references, word_order=2)
print("chrF++ score:", results["score"])

Fine Tuned Model Evaluation Metric Score:
Corpus BLEU: 0.12995444330677577


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.8881441473247048
BERTScore Recall: 0.8890909860234061
BERTScore F1: 0.8885468969801943
chrF++ score: 36.827999014975724


In [None]:
def validate(model, dataloader):
    print('Validating...')
    model.eval()
    total_loss = 0
    num_iter = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating", leave=False):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()
            num_iter += 1
            #print(f"Eval iter = {num_iter}, Loss: {loss.item():.4f}")

    avg_val_loss = total_loss / num_iter if num_iter > 0 else float("inf")
    print(f"Validation Loss: {avg_val_loss:.4f}")
    return avg_val_loss