# OpenNMT Translation Notebook (Korean → English using WikiMatrix)

In [None]:
# Pin to the last 1.x release: the cells below use the legacy CLI
# (`onmt_preprocess`, `onmt_train -data ...`), which was removed in OpenNMT-py 2.0.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install OpenNMT-py==1.2.0

In [None]:
import pandas as pd
import os

base_path = "./en-ko_unzipped"  # Directory with the extracted WikiMatrix files; change if needed
output_path = "./opennmt_data"  # Destination for the plain-text train/valid splits
os.makedirs(output_path, exist_ok=True)


def _write_lines(path, lines):
    """Write `lines` to `path` as UTF-8 text, one sentence per line.

    Plain writes are used instead of `Series.to_csv` because the CSV writer
    quotes any field containing a comma or a double quote (and doubles inner
    quotes), which would silently alter the sentences in the corpus.
    """
    with open(path, "w", encoding="utf-8", newline="\n") as f:
        f.writelines(f"{line}\n" for line in lines)


# Read the WikiMatrix aligned files: line i of the .en file pairs with line i of the .ko file.
with open(os.path.join(base_path, "WikiMatrix.en-ko.en"), encoding="utf-8") as f_en, \
     open(os.path.join(base_path, "WikiMatrix.en-ko.ko"), encoding="utf-8") as f_ko:
    en_lines = [line.strip() for line in f_en]
    ko_lines = [line.strip() for line in f_ko]

# The two files are only meaningful as a pair when they have the same number of lines.
if len(en_lines) != len(ko_lines):
    raise ValueError(
        f"Parallel files are misaligned: {len(en_lines)} English lines "
        f"vs {len(ko_lines)} Korean lines"
    )

# Build the pair table and drop pairs where either side is empty
# (lines were already stripped above, so a plain comparison is enough).
df = pd.DataFrame({"en": en_lines, "ko": ko_lines})
df = df[(df["en"] != "") & (df["ko"] != "")]

# Shuffle with a fixed seed so the train/valid split is reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# 90% train / 10% validation split.
split_idx = int(0.9 * len(df))
train_df = df.iloc[:split_idx]
valid_df = df.iloc[split_idx:]

# Save to disk as train.ko / train.en / valid.ko / valid.en.
for split_name, split_df in (("train", train_df), ("valid", valid_df)):
    for lang in ("ko", "en"):
        _write_lines(os.path.join(output_path, f"{split_name}.{lang}"), split_df[lang])

In [None]:
# Build the vocabularies and serialized training/validation data from the
# plain-text splits (Korean = source, English = target); outputs are written
# with the prefix opennmt_data/processed.
# -overwrite lets this cell be re-run: without it the legacy tool refuses to
# start when processed files from a previous run already exist.
!onmt_preprocess \
  -train_src opennmt_data/train.ko -train_tgt opennmt_data/train.en \
  -valid_src opennmt_data/valid.ko -valid_tgt opennmt_data/valid.en \
  -save_data opennmt_data/processed \
  -overwrite

In [None]:
# Train a Transformer Korean→English model on the preprocessed data.
# - -rnn_size / -word_vec_size are set to 512 explicitly: the legacy defaults
#   (500) are not divisible by the number of attention heads (8), which makes
#   multi-head attention fail at model construction.
# - -share_embeddings is intentionally not used: it requires a vocabulary shared
#   between source and target (-share_vocab at preprocessing time), and the
#   preprocessing step builds separate Korean and English vocabularies.
# Checkpoints are written as opennmt_data/koen_transformer_step_<N>.pt.
!onmt_train \
  -data opennmt_data/processed \
  -save_model opennmt_data/koen_transformer \
  -train_steps 10000 \
  -batch_size 64 \
  -encoder_type transformer \
  -decoder_type transformer \
  -position_encoding \
  -rnn_size 512 -word_vec_size 512 -heads 8 \
  -optim adam -learning_rate 2e-4 \
  -gpu_ranks 0

In [None]:
# Write a single Korean test sentence to the source file for translation
!printf '%s\n' "안녕하세요. 오늘 날씨가 어때요?" > test_input.ko

# Decode it on GPU 0 with the checkpoint saved at step 10000
!onmt_translate \
  -gpu 0 \
  -src test_input.ko \
  -model opennmt_data/koen_transformer_step_10000.pt \
  -output test_output.en

# Show the English output
!cat test_output.en