In [55]:
from transformers import MarianMTModel, MarianTokenizer
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from tqdm import tqdm
from time import time

In [56]:
# Set seed.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

In [57]:
model_name = "Helsinki-NLP/opus-mt-ko-en"

eval_size = 0.1

BATCH_SIZE = 5
NUM_EPOCHS = 1
LEARNING_RATE = 1e-4
train_file_path = "/content/ai_hub_train_corpus_small.json"
if(torch.cuda.is_available()):
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using {DEVICE} for processing")

Using cuda for processing


In [58]:
tokenizer = MarianTokenizer.from_pretrained(model_name)#.to(DEVICE)
model = MarianMTModel.from_pretrained(model_name).to(DEVICE)



In [59]:
def load_json_file(file_path):
    with open(file_path, encoding = 'utf-8') as f:
        data = json.load(f)
    ko_text = [text['ko_text'] for data_point in data for text in data_point['text']]
    en_text = [text['en_text'] for data_point in data for text in data_point['text']]
    return ko_text, en_text


In [60]:

def convert_to_pd(file_path):
    ko_text, en_text = load_json_file(file_path)
    data = {'korean': ko_text, 'english': en_text}
    df = pd.DataFrame(data)
    return df

In [61]:
development_data = convert_to_pd(train_file_path)

In [62]:
train_data, eval_data = train_test_split(development_data, test_size = eval_size)

train_data = train_data.reset_index(drop = True)
eval_data = eval_data.reset_index(drop = True)

In [63]:
print(train_data.head())

                                              korean  \
0           퇴직연금 적립금은 지난해 말 기준 190조원에 이르지만, 수익률이 낮다.   
1  요즘에는 학업을 방해하는 심각한 요소가 게임 중독이라고 할 정도로 게임에 빠져있는 ...   
2               정부가 직접 나서 강제 셧다운제를 완화하자는 법안도 제출돼 있다.   
3  서로가 서로에게 익숙해질 법한 상황에서 감독 간의 불꽃 튀는 지략 싸움이 예고된 가...   
4  아베 총리가 26일 도쿄 번화가 롯폰기에 있는 로바다야키(일본식 선술집)에서 트럼프...   

                                             english  
0  Retirement pension reserves amounted to KRW 19...  
1  These days, there are children who are so addi...  
2  A bill to ease the forced shutdown system is a...  
3  In a situation in which they are getting used ...  
4  The Mainichi Shimbun reported that Abe plans t...  


In [64]:
# Custom Dataset class.
class TranslationDataset(Dataset):
    def __init__(self, df, tokenizer, max_length = 128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src = self.df['korean'][idx]
        tgt = self.df['english'][idx]

        src_enc = self.tokenizer(src, return_tensors = "pt", padding="max_length", truncation=True, max_length = self.max_length)
        tgt_enc = self.tokenizer(tgt, return_tensors = "pt", padding="max_length", truncation=True, max_length = self.max_length)

        input_ids = src_enc["input_ids"].squeeze()
        attention_mask = src_enc["attention_mask"].squeeze()
        labels = tgt_enc["input_ids"].squeeze()
        labels[labels == self.tokenizer.pad_token_id] = -100 #ignore padding in loss calculation
        return{
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels":labels
        }

In [65]:
train_dataset = TranslationDataset(train_data, tokenizer)
valid_dataset = TranslationDataset(eval_data, tokenizer)

In [66]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.98), eps=1e-9)

In [67]:
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

In [68]:
iter(train_dataloader)

<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x7b318dd06510>

In [69]:
def train(model, optimizer, num_epochs, dataloader):
    print('Training started...')
    model.to(DEVICE)
    model.train()

    train_loss = []
    total_batches = len(dataloader)
    effective_batches = math.ceil(total_batches * num_epochs) if num_epochs < 1 else total_batches

    epoch_loss = 0
    num_iter = 0
    start_time = time()

    progress_bar = tqdm(dataloader, total=effective_batches, desc=f"Epoch {num_epochs}", leave=False)
    losses = []
    for batch in progress_bar:
        if num_iter >= effective_batches:
            break
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()
        num_iter += 1
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = epoch_loss / num_iter if num_iter > 0 else float("inf")
    train_loss.append(avg_loss)
    elapsed = time() - start_time
    print(f"Training completed in {elapsed:.2f}s — Avg Loss: {avg_loss:.4f}")
    return train_loss

def validate(model, dataloader):
    print('Validating...')
    model.eval()
    total_loss = 0
    num_iter = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating", leave=False):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()
            num_iter += 1
            #print(f"Eval iter = {num_iter}, Loss: {loss.item():.4f}")

    avg_val_loss = total_loss / num_iter if num_iter > 0 else float("inf")
    print(f"Validation Loss: {avg_val_loss:.4f}")
    return avg_val_loss

In [70]:
train(model, optimizer, NUM_EPOCHS, train_dataloader)

Training started...


                                                                       

Training completed in 192.72s — Avg Loss: 2.6906




[2.690559714913527]

In [71]:
model.eval()

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [72]:
predictions = []
references = []

for _, row in tqdm(eval_data.iterrows(), total=len(eval_data)):
    tokenizer.src_lang = "ko_KR"
    inputs = tokenizer(row["korean"], return_tensors="pt", max_length=128, truncation=True, padding="max_length")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    output_ids = model.generate(**inputs,decoder_start_token_id=tokenizer.convert_tokens_to_ids("en_XX"), max_new_tokens=128)
    pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    predictions.append(pred)
    references.append([row["english"]])

100%|██████████| 835/835 [09:32<00:00,  1.46it/s]


In [39]:
!pip install sacrebleu
!pip install evaluate
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [73]:

import evaluate
print("Fine Tuned Model Evaluation Metric Score:")
bleu = evaluate.load("bleu")
bleu_score = bleu.compute(predictions=predictions, references=references)
print("Corpus BLEU:", bleu_score["bleu"])
bert_score = evaluate.load("bertscore")
results = bert_score.compute(predictions=predictions, references=references, lang="en")
print("BERTScore Precision:", sum(results["precision"])/len(results["precision"]))
print("BERTScore Recall:", sum(results["recall"])/len(results["recall"]))
print("BERTScore F1:", sum(results["f1"])/len(results["f1"]))
chrf = evaluate.load("chrf")
results = chrf.compute(predictions=predictions, references=references, word_order=2)
print("chrF++ score:", results["score"])

Fine Tuned Model Evaluation Metric Score:
Corpus BLEU: 0.12995444330677577


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.8881441473247048
BERTScore Recall: 0.8890909860234061
BERTScore F1: 0.8885468969801943
chrF++ score: 36.827999014975724


In [2]:
'''
LearningRate: 1e-4                       ,1e-4               ,1e-4               ,1e-4
BatchSize: 30                            ,20                 ,10                 ,5
Epochs: 1                                ,1                  ,1                  ,1
train_loss:3.1573                        ,3.008              ,2.815              ,2.69
BERTScore Precision: 0.8429047888624454  ,0.8794069016051149 ,0.8812887632204387 ,0.8881441473247048
BERTScore Recall: 0.8551971387006565     ,0.8813994906619638 ,0.8808288726264131 ,0.8890909860234061
BERTScore F1: 0.8487134749304035         ,0.8803324055528926 ,0.8809359362739289 ,0.8885468969801943
bleu(eval):0.066                         ,0.107              ,0.112              ,0.129
chrf(eval): 25.98                        ,33.57              ,33.72              ,36.82
'''

'\nLearningRate: 1e-4\nBatchSize: 30\ntrain_loss:3.1573\nbert(eval):\nbleu(eval):\nchrf(eval):\n'