In [None]:
mBART50 - multilingual fine-tuning (SWRC)

In [None]:
# Label data
# Build the Korean -> Chinese parallel CSVs for the train / validation / test splits.

# Imported in this cell because pandas is used here, before the later cell that imports it;
# without this the cell fails with NameError on a fresh kernel.
import pandas as pd


def _read_stripped_lines(path):
    """Return all lines of the UTF-8 text file at `path` with surrounding whitespace removed."""
    with open(path, "r", encoding="utf-8") as handle:
        return [line.strip() for line in handle]


def build_parallel_csv(source_path, target_path, output_path):
    """Pair two line-aligned text files and save them as a two-column CSV.

    Args:
        source_path: text file with one source sentence per line (written to column 'source').
        target_path: text file with one target sentence per line (written to column 'target').
        output_path: destination CSV path (UTF-8, no index column).

    Returns:
        The pandas DataFrame that was written to `output_path`.

    Raises:
        ValueError: if the two files do not contain the same number of lines.
    """
    source_sentences = _read_stripped_lines(source_path)
    target_sentences = _read_stripped_lines(target_path)

    # Line i of the source must pair with line i of the target, so the counts have to match.
    if len(source_sentences) != len(target_sentences):
        raise ValueError(
            f"Not the same: {source_path} has {len(source_sentences)} lines "
            f"but {target_path} has {len(target_sentences)}."
        )

    paired = pd.DataFrame({
        'source': source_sentences,
        'target': target_sentences
    })
    paired.to_csv(output_path, index=False, encoding="utf-8")
    return paired


# (Korean file, Chinese file, output CSV) for each split, processed in the original order.
# Every split, including train, now goes through the same explicit line-count check.
for ko_path, zh_path, csv_path in (
    ("ko_train_SWRC.txt", "ch_train_SWRC.txt", "ko_zh_train_dataset_SWRC.csv"),
    ("ko_vali_SWRC.txt", "ch_vali_SWRC.txt", "ko_zh_validation_dataset_SWRC.csv"),
    ("ko_test_SWRC.txt", "ch_test_SWRC.txt", "ko_zh_test_dataset_SWRC.csv"),
):
    # As before, merged_df ends up holding the last (test) split after the cell has run.
    merged_df = build_parallel_csv(ko_path, zh_path, csv_path)

In [None]:
# Label language codes (same step as for the Japanese data, whose language code is ja_XX)

# Imported in this cell because pandas is used here, before the later cell that imports it;
# without this the cell fails with NameError on a fresh kernel.
import pandas as pd

# Read the Korean-Chinese training pairs and tag every row with the Korean source-language
# code; .assign returns a new frame instead of mutating the one that was just loaded.
df = pd.read_csv('ko_zh_train_dataset_SWRC.csv').assign(language='ko_KR')

# Save
df.to_csv('ko_zh_train_dataset_SWRC_languagecode.csv', index=False)

In [None]:
# Combine ja_ch_corpus with SWRC

import pandas as pd

# Load both language-tagged training sets: Japanese-Chinese web crawl first, then
# Korean-Chinese SWRC, so the concatenated rows keep that order.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ja_ch_trian_webcrawl_languagecode.csv',
        'ko_zh_train_dataset_SWRC_languagecode.csv',
    )
)

# Stack the two frames and renumber the index from 0.
df_merged = pd.concat([df1, df2], ignore_index=True)

# Write the combined training set.
df_merged.to_csv('SWRC_webcrawl_train.csv', index=False)


In [None]:
# Fine-tuning
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback
import os
from datasets import load_dataset
import torch
from glob import glob

# Load dataset: one CSV per split
# NOTE(review): the validation file name ends in ".csv.csv" -- confirm this matches the
# file on disk and is not a typo.
data_files = dict(
    train="SWRC_webcrawl_train.csv",
    validation="SWRC_webcrawl_dev.csv.csv",
    test="SWRC_webcrawl_test.csv",
)
dataset = load_dataset('csv', data_files=data_files)

# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the pretrained mBART-50 many-to-many tokenizer and model, then move the model
# onto the selected device
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
model.to(device)

# Preprocessing function
def preprocess_function(examples, max_length=200):
    """Tokenize a batch of translation pairs for mBART-50 fine-tuning.

    Args:
        examples: batch mapping with parallel lists under 'source' (input sentences),
            'target' (Chinese reference sentences) and 'language' (source-language code
            of each row, "ja_XX" or "ko_KR").
        max_length: length every input and label sequence is truncated/padded to.

    Returns:
        A dict of per-row lists in the same order as the input batch: the tokenizer's
        input fields plus "labels", where padding positions are replaced by -100.

    Raises:
        ValueError: if a row's language is neither "ja_XX" nor "ko_KR".
    """
    sources = examples['source']
    targets = examples['target']
    languages = examples['language']

    # A batch may mix Japanese and Korean rows, so collect row positions per language
    # instead of trusting the first row's language for the whole batch.
    positions_by_language = {}
    for position, language in enumerate(languages):
        if language not in ("ja_XX", "ko_KR"):
            raise ValueError(
                f"Unsupported source language {language!r} at batch position {position}; "
                "expected 'ja_XX' or 'ko_KR'."
            )
        positions_by_language.setdefault(language, []).append(position)

    tokenizer.tgt_lang = "zh_CN"  # Set target language to Chinese

    model_inputs = {}
    for language, positions in positions_by_language.items():
        # Set the source language for exactly the rows written in that language
        tokenizer.src_lang = language

        # text_target makes the tokenizer encode the references in target mode, so the
        # labels carry the zh_CN language code rather than the source-language code.
        encoded = tokenizer(
            [sources[i] for i in positions],
            text_target=[targets[i] for i in positions],
            max_length=max_length,
            truncation=True,
            padding='max_length',
        )

        # Scatter each group's rows back to their original batch positions.
        for field, rows in encoded.items():
            column = model_inputs.setdefault(field, [None] * len(languages))
            for position, row in zip(positions, rows):
                column[position] = row

    # -100 is the index the loss ignores; without this the loss is dominated by padding
    # because every label row is padded to max_length.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in row]
        for row in model_inputs.get("labels", [])
    ]
    return model_inputs

# Run the preprocessing function over every split, one batch of rows per call
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Turn off Weights & Biases reporting
os.environ["WANDB_DISABLED"] = "true"

# Make sure the checkpoint directory exists
output_dir = '/home/u542596/experiments/multilingual_fine_tune/SWRC_webcrawl'
os.makedirs(output_dir, exist_ok=True)

# Training configuration
training_args = TrainingArguments(
    output_dir=output_dir,
    seed=42,
    # Evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Optimisation schedule
    num_train_epochs=5,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_steps=1500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Reload the checkpoint with the best eval_loss once training finishes
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# NOTE(review): EarlyStoppingCallback is imported at the top of this cell but no callback
# is passed here, so training always runs for all configured epochs -- confirm this is intended.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

# Start fine-tuning and keep the returned training summary
train_results = trainer.train()