In [2]:
!pip install sacrebleu
!pip install evaluate
!pip install bert_score
!pip install sacremoses

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1
Collecting evaluate
  Downloading eva

In [2]:
from transformers import MarianMTModel, MarianTokenizer
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from tqdm import tqdm
from time import time
import evaluate

In [3]:
# Set seed.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

In [16]:
model_name = "Helsinki-NLP/opus-mt-ko-en"

eval_size = 0.15

BATCH_SIZE = 30
NUM_EPOCHS = 1
LEARNING_RATE = 1e-3
train_file_path = "/content/ai_hub_train_corpus_small.json"
if(torch.cuda.is_available()):
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using {DEVICE} for processing")

Using cuda for processing


In [7]:
def load_json_file(file_path):
    with open(file_path, encoding = 'utf-8') as f:
        data = json.load(f)
    ko_text = [text['ko_text'] for data_point in data for text in data_point['text']]
    en_text = [text['en_text'] for data_point in data for text in data_point['text']]
    return ko_text, en_text


In [8]:

def convert_to_pd(file_path):
    ko_text, en_text = load_json_file(file_path)
    data = {'korean': ko_text, 'english': en_text}
    df = pd.DataFrame(data)
    return df

In [9]:
development_data = convert_to_pd(train_file_path)

In [10]:
# Custom Dataset class.
class TranslationDataset(Dataset):
    def __init__(self, df, tokenizer, max_length = 128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src = self.df['korean'][idx]
        tgt = self.df['english'][idx]

        src_enc = self.tokenizer(src, return_tensors = "pt", padding="max_length", truncation=True, max_length = self.max_length)
        tgt_enc = self.tokenizer(tgt, return_tensors = "pt", padding="max_length", truncation=True, max_length = self.max_length)

        input_ids = src_enc["input_ids"].squeeze()
        attention_mask = src_enc["attention_mask"].squeeze()
        labels = tgt_enc["input_ids"].squeeze()
        labels[labels == self.tokenizer.pad_token_id] = -100 #ignore padding in loss calculation
        return{
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels":labels
        }

In [44]:
def train(model, optimizer, num_epochs, dataloader):
    print('Training started...')
    model.to(DEVICE)
    model.train()

    total_batches = len(dataloader)
    effective_batches = math.ceil(total_batches * num_epochs) if num_epochs < 1 else total_batches

    epoch_loss = 0
    num_iter = 0
    start_time = time()

    progress_bar = tqdm(dataloader, total=effective_batches, desc=f"Epoch {num_epochs}", leave=False)
    losses = []
    for batch in progress_bar:
        if num_iter >= effective_batches:
            break
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()
        num_iter += 1
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = epoch_loss / num_iter if num_iter > 0 else float("inf")
    elapsed = time() - start_time
    print(f"Training completed in {elapsed:.2f}s — Avg Loss: {avg_loss:.4f}")
    return avg_loss

def cross_validate():
    num_of_folds = int(1/eval_size)
    print(f"Using {num_of_folds} folds for cross validation")
    bleu = evaluate.load("bleu")
    chrf = evaluate.load("chrf")
    bert_score = evaluate.load("bertscore")
    bleu_score_list = []
    bert_score_list = []
    chrf_score_list = []
    train_loss_list = []
    for fold in range(3, num_of_folds):
        train_data = development_data[development_data.index % num_of_folds != fold]
        eval_data = development_data[development_data.index % num_of_folds == fold]

        train_data = train_data.reset_index(drop = True)
        eval_data = eval_data.reset_index(drop = True)

        tokenizer = MarianTokenizer.from_pretrained(model_name)
        train_dataset = TranslationDataset(train_data, tokenizer)
        valid_dataset = TranslationDataset(eval_data, tokenizer)

        train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
        valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

        model = MarianMTModel.from_pretrained(model_name).to(DEVICE)
        optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.98), eps=1e-9)

        loss = 0#train(model, optimizer, NUM_EPOCHS, train_dataloader)
        print(f"Train model for fold number {fold}")
        train_loss_list.append(loss)
        print(f"Train Loss: {loss}")

        print(f"Fine Tuned Model Evaluation Metric Score: Fold num = {fold}")
        model.eval()

        predictions = []
        references = []
        for _, row in tqdm(eval_data.iterrows(), total=len(eval_data)):
            tokenizer.src_lang = "ko_KR"
            inputs = tokenizer(row["korean"], return_tensors="pt", max_length=128, truncation=True, padding="max_length")
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
            output_ids = model.generate(**inputs,decoder_start_token_id=tokenizer.convert_tokens_to_ids("en_XX"), max_new_tokens=128)
            pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            if(row["english"] == ""):
                print()
                print(f"row[english] is empty, input = {inputs}, pred = {pred}")
            if(pred == ""):
                print(f"pred is empty, input = {inputs}, expected = {row['english']}")
            predictions.append(pred)
            references.append([row["english"]])
            # break
        print(f"references = {len(references)}, predictions = {len(predictions)}")



        bleu_score = bleu.compute(predictions=predictions, references=references)
        print("Corpus BLEU:", bleu_score["bleu"])
        bleu_score_list.append(bleu_score["bleu"])

        results = bert_score.compute(predictions=predictions, references=references, lang="en")
        print("BERTScore F1:", sum(results["f1"])/len(results["f1"]))
        bert_score_list.append(sum(results["f1"])/len(results["f1"]))

        results = chrf.compute(predictions=predictions, references=references, word_order=2)
        print("chrF++ score:", results["score"])
        chrf_score_list.append(results["score"])

    avg_bleu_score = sum(bleu_score_list)/len(bleu_score_list)
    avg_bert_score = sum(bert_score_list)/len(bert_score_list)
    avg_chrf_score = sum(chrf_score_list)/len(chrf_score_list)
    avg_train_loss = sum(train_loss_list)/len(train_loss_list)
    print(f"Average Corpus BLEU: {avg_bleu_score}")
    print(f"Average BERTScore F1: {avg_bert_score}")
    print(f"Average chrF++ score: {avg_chrf_score}")
    print(f"Average Train Loss: {avg_train_loss}")
    return avg_bleu_score, avg_bert_score, avg_chrf_score, avg_train_loss



In [45]:
cross_validate()

Using 6 folds for cross validation




Train model for fold number 3
Train Loss: 0
Fine Tuned Model Evaluation Metric Score: Fold num = 3


  7%|▋         | 99/1391 [00:32<12:27,  1.73it/s]

pred was empty


 99%|█████████▉| 1375/1391 [07:32<00:09,  1.65it/s]

pred was empty


100%|██████████| 1391/1391 [07:37<00:00,  3.04it/s]


references = 1391, predictions = 1391
Corpus BLEU: 0.1759417135962956


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.9092217586522133
chrF++ score: 42.5869639675939




Train model for fold number 4
Train Loss: 0
Fine Tuned Model Evaluation Metric Score: Fold num = 4


 13%|█▎        | 180/1390 [01:01<11:58,  1.69it/s]

pred was empty


100%|██████████| 1390/1390 [07:55<00:00,  2.92it/s]


references = 1390, predictions = 1390
Corpus BLEU: 0.17267038954399472




BERTScore F1: 0.9104032462878193
chrF++ score: 42.376989536580275




Train model for fold number 5
Train Loss: 0
Fine Tuned Model Evaluation Metric Score: Fold num = 5


 69%|██████▉   | 960/1390 [05:15<04:34,  1.57it/s]

pred was empty


100%|██████████| 1390/1390 [07:41<00:00,  3.01it/s]


references = 1390, predictions = 1390
Corpus BLEU: 0.168561135638165




BERTScore F1: 0.9095624997461442
chrF++ score: 41.98053987429997
Average Corpus BLEU: 0.17239107959281844
Average BERTScore F1: 0.9097291682287256
Average chrF++ score: 42.314831126158055
Average Train Loss: 0.0


(0.17239107959281844, 0.9097291682287256, 42.314831126158055, 0.0)

In [2]:
(0.9097291682287256*3 + 0.9100606588902532 + 0.9105990815728281 + 0.9097338295257151)/6

0.9099301791124955

In [1]:
(42.314831126158055*3 + 42.02530418012024 + 42.09531073000732 + 41.9858179249788)/6

42.17515436893009

In [46]:
(0.17239107959281844*3 + 0.1728779984172209 + 0.17170250560804198 + 0.17143359276617504)/6

0.1721978892616489

In [None]:
'''
LearningRate:        NA                  ,1e-4               ,1e-4               ,1e-4               ,1e-4               ,1e-3
BatchSize:           NA                  ,30                 ,20                 ,10                 ,5                  ,30
Epochs:              NA                  ,1                  ,1                  ,1                  ,1                  ,1
train_loss:          NA                  ,3.1573             ,3.008              ,2.815              ,2.69               ,5.48
BERTScore F1:        0.9099301791124955  ,0.8487134749304035 ,0.8803324055528926 ,0.8809359362739289 ,0.8885468969801943
bleu(eval):          0.172               ,0.066              ,0.107              ,0.112              ,0.129
chrf(eval):          42.175              ,25.98              ,33.57              ,33.72              ,36.82
'''

'\nLearningRate: 1e-4\nBatchSize: 30\ntrain_loss:3.1573\nbert(eval):\nbleu(eval):\nchrf(eval):\n'

In [None]:
model.eval()

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [None]:
predictions = []
references = []

for _, row in tqdm(eval_data.iterrows(), total=len(eval_data)):
    tokenizer.src_lang = "ko_KR"
    inputs = tokenizer(row["korean"], return_tensors="pt", max_length=128, truncation=True, padding="max_length")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    output_ids = model.generate(**inputs,decoder_start_token_id=tokenizer.convert_tokens_to_ids("en_XX"), max_new_tokens=128)
    pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    predictions.append(pred)
    references.append([row["english"]])

100%|██████████| 835/835 [16:42<00:00,  1.20s/it]


In [None]:


print("Fine Tuned Model Evaluation Metric Score:")
bleu = evaluate.load("bleu")
bleu_score = bleu.compute(predictions=predictions, references=references)
print("Corpus BLEU:", bleu_score["bleu"])
bert_score = evaluate.load("bertscore")
results = bert_score.compute(predictions=predictions, references=references, lang="en")
print("BERTScore Precision:", sum(results["precision"])/len(results["precision"]))
print("BERTScore Recall:", sum(results["recall"])/len(results["recall"]))
print("BERTScore F1:", sum(results["f1"])/len(results["f1"]))
chrf = evaluate.load("chrf")
results = chrf.compute(predictions=predictions, references=references, word_order=2)
print("chrF++ score:", results["score"])

Fine Tuned Model Evaluation Metric Score:
Corpus BLEU: 0.12995444330677577


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.8881441473247048
BERTScore Recall: 0.8890909860234061
BERTScore F1: 0.8885468969801943
chrF++ score: 36.827999014975724


In [None]:
def validate(model, dataloader):
    print('Validating...')
    model.eval()
    total_loss = 0
    num_iter = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating", leave=False):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()
            num_iter += 1
            #print(f"Eval iter = {num_iter}, Loss: {loss.item():.4f}")

    avg_val_loss = total_loss / num_iter if num_iter > 0 else float("inf")
    print(f"Validation Loss: {avg_val_loss:.4f}")
    return avg_val_loss