In [None]:
import os
import re
import sys
import torch
import sentencepiece
import pandas as pd
from tqdm.auto import tqdm, trange
from datasets import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

In [None]:
# Location of the processed parallel corpus inside the Kaggle input mount.
FOLDER_PATH = '/kaggle/input/aromanian/'
FILE_NAME = 'Tales_processed.csv'

# Language code -> human-readable language name. The codes double as the
# DataFrame column names and the names are interpolated into the T5 task
# prefix; insertion order is the order in which languages are trained.
DATA_LANGUAGES = dict(
    ro='Romanian',
    rup='Aromanian General Dialect',
    rup_std='Aromanian Standard Dialect',
    rup_cun='Aromanian Regional Dialect',
    en='English',
    es='Spanish',
    fr='French',
)

In [None]:
# Load the corpus using the first CSV column as the index, then discard the
# 'split' column, which is not used anywhere below.
path = os.path.join(FOLDER_PATH, FILE_NAME)
translations_df = pd.read_csv(path, index_col=[0])
translations_df = translations_df.drop(columns='split')
translations_df

In [None]:
class T5FineTuning:
    """Fine-tune a T5 checkpoint to translate between columns of a parallel corpus.

    Args:
        translations_df: DataFrame with one text column per language code.
        data_languages: Mapping of language code -> language name; the name
            is used to build the "translate X to Y: " task prefix.
        checkpoint: Name or path of the pretrained T5 checkpoint to start
            from. Defaults to 't5-small'.

    Note:
        The model is loaded once in ``__init__`` and every call to
        ``fine_tuning`` trains that same instance, so successive calls
        continue from the previously fine-tuned weights rather than from
        the pretrained checkpoint.
    """

    # Length that both inputs and labels are padded/truncated to.
    MAX_LENGTH = 512
    # Label value that the loss skips; used to mask padding in the targets.
    IGNORE_INDEX = -100

    def __init__(self, translations_df, data_languages, checkpoint='t5-small'):
        self.checkpoint = checkpoint
        self.tokenizer = T5Tokenizer.from_pretrained(self.checkpoint)
        self.model = T5ForConditionalGeneration.from_pretrained(self.checkpoint)
        self.translations_df = translations_df
        self.data_languages = data_languages

    def __tokenize(self, batch, source_lang_code, target_lang_code):
        """Tokenize one batch of sentence pairs into model inputs and labels.

        Args:
            batch: Mapping of column name -> list of strings.
            source_lang_code: Column holding the source sentences.
            target_lang_code: Column holding the target sentences.

        Returns:
            The tokenizer output for the prefixed source sentences with an
            extra ``'labels'`` entry holding the target token ids, where
            padding positions are replaced by ``IGNORE_INDEX``.

        Raises:
            KeyError: If a language code is missing from ``data_languages``
                or from ``batch``.
        """
        source_lang = self.data_languages[source_lang_code]
        target_lang = self.data_languages[target_lang_code]
        prefix = f"translate {source_lang} to {target_lang}: "

        # Build the prefixed inputs in a new list so the caller's batch is
        # left untouched.
        prefixed_sources = [prefix + text for text in batch[source_lang_code]]

        tokenized_input = self.tokenizer(
            prefixed_sources, padding='max_length', truncation=True, max_length=self.MAX_LENGTH
        )
        tokenized_label = self.tokenizer(
            batch[target_lang_code], padding='max_length', truncation=True, max_length=self.MAX_LENGTH
        )

        # Targets are padded to MAX_LENGTH; without masking, the loss would be
        # dominated by predicting pad tokens.
        pad_id = self.tokenizer.pad_token_id
        tokenized_input['labels'] = [
            [self.IGNORE_INDEX if token_id == pad_id else token_id for token_id in label_ids]
            for label_ids in tokenized_label['input_ids']
        ]

        return tokenized_input

    def fine_tuning(self, source_lang_code, target_lang_code='ro'):
        """Fine-tune the model on one language pair and save the result.

        Rows where either side of the pair is missing are skipped. The
        remaining rows are split 90/10 into train and evaluation sets.

        Args:
            source_lang_code: Column of ``translations_df`` to translate from.
            target_lang_code: Column of ``translations_df`` to translate to.

        Returns:
            Path of the directory the model and tokenizer were saved to.

        Raises:
            KeyError: If either code is not a column of ``translations_df``
                or not a key of ``data_languages``.
        """
        print(f"translate {source_lang_code} to {target_lang_code}")

        # Use the frame given to the constructor (not a notebook global) and
        # drop incomplete pairs, which cannot be tokenized.
        pair_df = self.translations_df[[source_lang_code, target_lang_code]].dropna()
        dataset = Dataset.from_dict(pair_df.to_dict(orient='list'))
        split_datasets = dataset.train_test_split(test_size=0.1)

        def tokenize_batch(batch):
            return self.__tokenize(batch, source_lang_code, target_lang_code)

        tokenized_train = split_datasets['train'].map(tokenize_batch, batched=True)
        tokenized_test = split_datasets['test'].map(tokenize_batch, batched=True)

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=30,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            learning_rate=2e-4,
            weight_decay=0.02,
            gradient_accumulation_steps=2,
            evaluation_strategy="epoch",
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_test
        )

        trainer.train()

        model_location = f'./{source_lang_code}-{target_lang_code}-checkpoint-v1'

        self.model.save_pretrained(model_location)
        self.tokenizer.save_pretrained(model_location)

        return model_location

    # Backward-compatible alias for the original (misspelled) method name.
    fine_tunig = fine_tuning

In [None]:
# Build the fine-tuning helper; this loads the pretrained tokenizer and model.
t5_fine_tuning = T5FineTuning(
    translations_df=translations_df,
    data_languages=DATA_LANGUAGES,
)

In [None]:
# Fine-tune towards Romanian from every other configured language, in the
# order the codes appear in DATA_LANGUAGES ("ro" itself is the target, so it
# is skipped).
for language in (code for code in DATA_LANGUAGES if code != "ro"):
    print(language)
    t5_fine_tuning.fine_tunig(language)