# 🇰🇷 Korean → English Translation using mT5 (Hugging Face Transformers)

In [None]:
# ✅ Install libraries (only run once)
!pip install transformers datasets accelerate

In [None]:
# ✅ Load and prepare data from WikiMatrix
import pandas as pd

# Load your aligned Korean-English file (already unzipped).
# The two files are expected to be line-aligned: line i of one is paired with
# line i of the other.
with open("./en-ko_unzipped/WikiMatrix.en-ko.en", encoding="utf-8") as f_en, \
     open("./en-ko_unzipped/WikiMatrix.en-ko.ko", encoding="utf-8") as f_ko:
    # Iterate the handles directly; readlines() would build an extra list first.
    en_lines = [line.strip() for line in f_en]
    ko_lines = [line.strip() for line in f_ko]

# Fail early with a readable message instead of pandas' generic length error.
if len(en_lines) != len(ko_lines):
    raise ValueError(
        f"Parallel files are not aligned: {len(en_lines)} English lines "
        f"vs {len(ko_lines)} Korean lines"
    )

pairs = pd.DataFrame({"en": en_lines, "ko": ko_lines})

# Lines were stripped on read, so a blank sentence is exactly the empty string.
# (No dropna needed: both columns are built from Python strings, never NaN.)
has_text = (pairs["en"] != "") & (pairs["ko"] != "")

df = (
    pairs[has_text]
    .sample(frac=1, random_state=42)  # deterministic shuffle
    .reset_index(drop=True)
    .head(5000)                       # keep small for demo
    .copy()                           # own the data so later column assignment
                                      # does not raise SettingWithCopyWarning
)
df.head()

In [None]:
# ✅ Load the pretrained tokenizer and seq2seq model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint identifier; the model and its tokenizer are both loaded from it.
model_name = 'google/mt5-small'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# ✅ Tokenize the dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch

class TranslationDataset(Dataset):
    """Map-style dataset pairing encoded source texts with target token ids.

    Args:
        inputs: Mapping from field name to an indexable batch (a tensor or a
            list of lists). Must contain ``'input_ids'``; every field is
            returned per example under the same key.
        targets: Mapping whose ``'input_ids'`` entry holds the target token
            ids; returned per example under the ``'labels'`` key.
    """

    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        """Number of examples, taken from the source ``'input_ids'`` field."""
        return len(self.inputs['input_ids'])

    @staticmethod
    def _to_tensor(row):
        """Return an independent tensor copy of ``row``.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning on
        every call, so tensors are copied with ``detach().clone()`` instead;
        plain Python sequences still go through ``torch.tensor``.
        """
        if isinstance(row, torch.Tensor):
            return row.detach().clone()
        return torch.tensor(row)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.inputs.items()}
        item['labels'] = self._to_tensor(self.targets['input_ids'][idx])
        return item

# Format inputs for T5 (prefix task with "translate Korean to English: ...")
# assign() builds a new frame rather than writing a column into a possibly
# sliced one, which avoids pandas' SettingWithCopyWarning.
df = df.assign(src='translate Korean to English: ' + df['ko'])
X_train, X_val, y_train, y_val = train_test_split(df['src'], df['en'], test_size=0.1, random_state=42)


def encode_texts(texts):
    """Tokenize an iterable of strings into padded, truncated PyTorch tensors."""
    return tokenizer(list(texts), padding=True, truncation=True, max_length=128, return_tensors="pt")


train_enc = encode_texts(X_train)
train_labels = encode_texts(y_train)
val_enc = encode_texts(X_val)
val_labels = encode_texts(y_val)

# Padding positions in the labels must not contribute to the loss. Hugging Face
# seq2seq models ignore label positions set to -100, so replace the pad token id
# with -100 (in place) in the target ids only; the source encodings keep their
# padding, which is handled through attention_mask.
for label_enc in (train_labels, val_labels):
    label_ids = label_enc["input_ids"]
    label_ids[label_ids == tokenizer.pad_token_id] = -100

train_dataset = TranslationDataset(train_enc, train_labels)
val_dataset = TranslationDataset(val_enc, val_labels)

In [None]:
# ✅ Train using Hugging Face Trainer
import inspect

from transformers import TrainingArguments, Trainer

# The per-epoch evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. The install cell does not pin
# a version, so pick whichever keyword the installed TrainingArguments accepts.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",           # checkpoint once per epoch
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=2,              # keep at most two checkpoints on disk
    logging_dir="./logs",
    report_to="none",                # no external experiment trackers
    **{eval_strategy_arg: "epoch"},  # evaluate once per epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

In [None]:
# ✅ Inference function
def translate_ko_to_en(text, max_length=128):
    """Translate a Korean string to English with the fine-tuned model.

    Args:
        text: Korean source sentence.
        max_length: Token limit used both to truncate the encoded input and to
            cap the generated output. Defaults to 128, the limit used when
            tokenizing the training data.

    Returns:
        The decoded English translation with special tokens removed.
    """
    input_text = "translate Korean to English: " + text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length)
    # Trainer may have moved the model to an accelerator; the inputs must live
    # on the same device or generate() fails with a device-mismatch error.
    inputs = inputs.to(model.device)
    # Disable dropout so repeated calls give the same translation.
    model.eval()
    output = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Try the translator on one sample sentence; the cell displays the result
sample_ko = "안녕하세요. 오늘 날씨가 어때요?"
translate_ko_to_en(sample_ko)