Transformer: following the OpenNMT tutorial

In [None]:
# Clone the GitHub MT-Preparation repository (git creates the MT-Preparation/ directory).
# The leading "!" hands the line to the shell; without it the cell is not valid Python.
!git clone https://github.com/ymoslem/MT-Preparation.git

In [None]:
# Install the requirements of MT-Preparation.
# %pip (rather than a bare "pip") installs into the environment of the running kernel
# and does not depend on IPython's automagic setting.
%pip install -r MT-Preparation/requirements.txt

In [None]:
# Segmenting sentences in Korean and Chinese datasets
import pandas as pd

# Raw corpus to segment, and the destination of the character-segmented copy.
input_file = '/home/u542596/experiments/bilingual_fine_tune/BLEU_and_COMET/korean_original_SWRC_train.txt'
output_file = '/home/u542596/experiments/bilingual_fine_tune/BLEU_and_COMET/korean_original_SWRC_train_seg.txt'

# Load the corpus: one list entry per line, each still carrying its line ending.
with open(input_file, encoding='utf-8') as corpus:
    sentences = list(corpus)

# character segment function
def char_tokenize(text):
    """Return ``text`` with a single space inserted between every character.

    Leading and trailing whitespace (including the line ending) is removed
    first. Whitespace inside the text is treated like any other character,
    so an inner space ends up surrounded by separator spaces.
    """
    trimmed = text.strip()
    return ' '.join(trimmed)

# character-based segment every sentence
char_segmented_sentences = [char_tokenize(sentence) for sentence in sentences]

# Save as plain text, one segmented sentence per line.
# A CSV writer (DataFrame.to_csv) must not be used here: it wraps any line that
# contains a comma or a double quote in quotes, doubles embedded quotes, and
# writes empty sentences as "" -- all of which would corrupt the corpus that is
# fed to SentencePiece and break the line alignment with the parallel file.
with open(output_file, 'w', encoding='utf-8') as outfile:
    outfile.writelines(f'{segmented}\n' for segmented in char_segmented_sentences)

Source: korean_original_SWRC_train_seg.txt
Target: chinese_original_SWRC_train_seg.txt

In [None]:
# SentencePiece: run the unigram training script on the segmented source (Korean)
# and target (Chinese) files.
# presumably this produces the source.model / target.model used by the subword step -- confirm
!python MT-Preparation/subwording/1-train_unigram.py korean_original_SWRC_train_seg.txt chinese_original_SWRC_train_seg.txt

In [None]:
# Subword the segmented source and target files with the trained models.
# Arguments: <source model> <target model> <source file> <target file>
# (a stray "Source:" token used to sit between the models and the files, which
#  shifted the positional arguments so the script was given the wrong file names)
!python MT-Preparation/subwording/2-subword.py source.model target.model korean_original_SWRC_train_seg.txt chinese_original_SWRC_train_seg.txt

#Output
#Done subwording the source file! Output: korean_original_SWRC_train_seg.txt.subword
#Done subwording the target file! Output: chinese_original_SWRC_train_seg.txt.subword

We skip the tutorial's data-splitting step because we need to use exactly the same training, dev, and test sets as prior work. Instead, repeat the steps above (from segmentation to subwording) for each of the training, dev, and test files.

In [None]:
# Training setup: install OpenNMT-py, which provides the onmt_* command-line tools.
# %pip installs into the environment of the running kernel.
%pip install OpenNMT-py

# Create the YAML configuration file
# Change hyperparameters based on prior work: https://arxiv.org/pdf/1911.11008
#
# The whole OpenNMT-py configuration is kept in one string and written to
# config.yaml, which the onmt_build_vocab / onmt_train commands read via -config.
# Fix: the length filter previously listed `src_seq_length` twice, so the
# target-side limit (`tgt_seq_length`) was never set.

config = '''# config.yaml
## Where the samples will be written
save_data: run
# Training files
data:
    corpus_1:
        path_src: korean_original_SWRC.txt-filtered.ko.subword.train
        path_tgt: ch_original_SWRC_segmented.txt-filtered.ch.subword.train
        transforms: [filtertoolong]
    valid:
        path_src: korean_original_SWRC.txt-filtered.ko.subword.dev
        path_tgt: ch_original_SWRC_segmented.txt-filtered.ch.subword.dev
        transforms: [filtertoolong]
# Vocabulary files, generated by onmt_build_vocab
src_vocab: run/source.vocab
tgt_vocab: run/target.vocab
# Vocabulary size - should be the same as in sentence piece
src_vocab_size: 50000
tgt_vocab_size: 50000
# Filter out source/target longer than n if [filtertoolong] enabled
src_seq_length: 50
tgt_seq_length: 50
# Tokenization options
src_subword_model: source.model
tgt_subword_model: target.model
# Where to save the log file and the output models/checkpoints
log_file: train.log
save_model: models/model.transformer

# Stop training if it does not imporve after n validations
#early_stopping: 4
# Default: 5000 - Save a model checkpoint for each n
save_checkpoint_steps: 5000
# To save space, limit checkpoints to last n
# keep_checkpoint: 3
seed: 3435
# Default: 100000 - Train the model to max n steps 
# Increase to 200000 or more for large datasets
# For fine-tuning, add up the required steps to the original steps
train_steps: 100000

# Default: 10000 - Run validation after n steps
valid_steps: 5000

# Default: 4000 - for large datasets, try up to 8000
warmup_steps: 8000
report_every: 100
# Number of GPUs, and IDs of GPUs
world_size: 1
gpu_ranks: [0]
# Batching
bucket_size: 262144
num_workers: 0  # Default: 2, set to 0 when RAM out of memory
batch_type: "tokens"
batch_size: 4096   # Tokens per batch, change when CUDA out of memory
valid_batch_size: 4096
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]
# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2
# warmup_steps: 8000
adam_beta1: 0.9
adam_beta2: 0.98
decay_method: "noam"
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"
# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
'''
# Write-only access is enough; the explicit encoding keeps the file independent
# of the platform's default locale.
with open("config.yaml", "w", encoding="utf-8") as config_yaml:
    config_yaml.write(config)


In [None]:
# Build the vocabulary files from config.yaml
# (-n_sample -1 and -num_threads 64 are passed through unchanged).
# The leading "!" runs the command in a shell; without it the cell is not valid Python.
!onmt_build_vocab -config config.yaml -n_sample -1 -num_threads 64

In [None]:
# Train the model with the settings in config.yaml.
# The leading "!" runs the command in a shell; without it the cell is not valid Python.
!onmt_train -config config.yaml

In [None]:
# Translation
# Choose the best checkpoint: model.transformer_step_3000.pt
# NOTE(review): config.yaml sets save_checkpoint_steps: 5000, so a step_3000
# checkpoint would not be written by that configuration as shown -- confirm
# which checkpoint is intended here.
!onmt_translate -model models/model.transformer_step_3000.pt -src korean_original_SWRC_train_seg.txt.subword -output ch.SWRC_3000.translated -gpu 0 -min_length 1

In [None]:
# Check the first 5 lines of the translation file
# (the leading "!" runs the command in a shell)
!head -n 5 ch.SWRC_3000.translated

In [None]:
# Desubword the translation file with the target-side model
# (the leading "!" runs the command in a shell)
!python MT-Preparation/subwording/3-desubword.py target.model ch.SWRC_3000.translated

# Output: ch.SWRC_3000.translated.desubword

In [None]:
# Desubword the target file (reference) so it can be compared with the
# desubworded translation. Uses "python" like the other script cells.
!python MT-Preparation/subwording/3-desubword.py target.model chinese_original_SWRC_train_seg.txt.subword

# Output: chinese_original_SWRC_train_seg.txt.subword.desubword

BLEU

In [None]:
# Download the BLEU script (compute-bleu.py) from the MT-Evaluation repository;
# it is run by the BLEU cell below.
!wget https://raw.githubusercontent.com/ymoslem/MT-Evaluation/main/BLEU/compute-bleu.py

In [None]:
# Install sacrebleu into the environment of the running kernel
%pip install sacrebleu

In [None]:
# BLEU: presumably <reference file> <translation file> -- confirm against compute-bleu.py
# NOTE(review): these file names differ from the outputs of the desubword cells
# (chinese_original_SWRC_train_seg.txt.subword.desubword and
# ch.SWRC_3000.translated.desubword) -- confirm they are the intended pair.
!python compute-bleu.py ch_subword.test.desubword ch.SWRC_100000.translated.desubword

COMET

In [None]:
# COMET evaluation of the Chinese translation.
#   1. strip the spaces from the MT output (and save it back in place),
#   2. pair every MT line with its source and reference line,
#   3. score with Unbabel/wmt22-comet-da and write per-sentence + system scores.
from comet import download_model, load_from_checkpoint
import os

file_path = '/home/u542596/experiments/bilingual_fine_tune/BLEU_and_COMET/ch.SWRC_3000.translated'

# Remove space -CH
# NOTE(review): this is the raw onmt_translate output, not the ".desubword"
# file; if it still carries subword markers they will reach COMET -- confirm
# that this is the intended input.
with open(file_path, "r", encoding="utf-8") as f:
    mt_lines = [line.replace(" ", "").strip() for line in f]

# Overwrite the file in place, terminating EVERY line with a newline.
# Joining with "\n" drops the final newline, so trailing empty translations
# would vanish when the file is read back and shift the alignment.
with open(file_path, "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in mt_lines)

# Read sentence
# The MT side reuses mt_lines from above instead of re-opening the file through
# a relative path, which could resolve to a different file than file_path.
# NOTE(review): the reference name ends in ".txt.txt" -- confirm it is not a typo.
with open('korean_original_SWRC_train.txt', 'r', encoding='utf-8') as src_file, \
     open('chinese_original_SWRC_train.txt.txt', 'r', encoding='utf-8') as ref_file:
    src_lines = [line.strip() for line in src_file]
    ref_lines = [line.strip() for line in ref_file]

# zip() silently truncates to the shortest input, which would score misaligned
# or partial data without any warning -- fail loudly instead.
if not (len(src_lines) == len(mt_lines) == len(ref_lines)):
    raise ValueError(
        "Line counts differ: "
        f"src={len(src_lines)}, mt={len(mt_lines)}, ref={len(ref_lines)}"
    )

# Create data
data = [
    {"src": src, "mt": mt, "ref": ref}
    for src, mt, ref in zip(src_lines, mt_lines, ref_lines)
]

# COMET (loaded only after the inputs have been validated)
model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)

# Sentence-level COMET
model_output = model.predict(data, batch_size=8, gpus=1)

# Set output dir
output_dir = 'comet'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "comet-score_ch.SWRC_3000.translated")

# Write scores to the file: one line per sentence, then the system-level score
with open(output_file, 'w', encoding='utf-8') as f:
    for i, score in enumerate(model_output["scores"]):
        f.write(f"Sentence {i}: {score}\n")
    f.write(f"Overall COMET Score: {model_output['system_score']}\n")