In [None]:
# Define paths when locally running
base_path = "data/base"
dataset_path = f"{base_path}/dataset"
data_bin_path = f"{base_path}/data-bin"
checkpoints_path = f"{base_path}/checkpoints"
logs_path = f"{base_path}/logs"
evaluation_folder = f"{base_path}/evaluation"

!mkdir -p "{dataset_path}"
!mkdir -p "{data_bin_path}"
!mkdir -p "{checkpoints_path}"
!mkdir -p "{logs_path}"
!mkdir -p "{evaluation_folder}"

In [None]:
# Train truecaser model for Finnish
!mosesdecoder/scripts/recaser/train-truecaser.perl \
-corpus {dataset_path}/train.fi \
-model {dataset_path}/truecase-model.fi

# Train truecaser model for English
!mosesdecoder/scripts/recaser/train-truecaser.perl \
-corpus {dataset_path}/train.en \
-model {dataset_path}/truecase-model.en

In [None]:
set_name = "test2"

# Normalize punctuation and tokenize Finnish text
!cat {dataset_path}/{set_name}.fi | \
mosesdecoder/scripts/tokenizer/normalize-punctuation.perl fi | \
mosesdecoder/scripts/tokenizer/tokenizer.perl -threads 8 -no-escape -l fi \
> {dataset_path}/{set_name}.tok.fi

# Normalize punctuation and tokenize English text
!cat {dataset_path}/{set_name}.en | \
mosesdecoder/scripts/tokenizer/normalize-punctuation.perl en | \
mosesdecoder/scripts/tokenizer/tokenizer.perl -threads 8 -no-escape -l en \
> {dataset_path}/{set_name}.tok.en

# Truecase the tokenized Finnish text
!mosesdecoder/scripts/recaser/truecase.perl \
-model {dataset_path}/truecase-model.fi \
< {dataset_path}/{set_name}.tok.fi \
> {dataset_path}/{set_name}.tok.truecase.fi

# Truecase the tokenized English text
!mosesdecoder/scripts/recaser/truecase.perl \
-model {dataset_path}/truecase-model.en \
< {dataset_path}/{set_name}.tok.en \
> {dataset_path}/{set_name}.tok.truecase.en

# Clean the corpus
!perl mosesdecoder/scripts/training/clean-corpus-n.perl \
{dataset_path}/{set_name}.tok.truecase en fi \
{dataset_path}/{set_name}.tok.clean 1 50

In [None]:
set_name = "test2"
bpe_path = "data/bpe/dataset"

# Apply the learned BPE model and vocabulary to the dev & test set
!subword-nmt apply-bpe -c {bpe_path}/bpe.codes \
    --vocabulary {bpe_path}/vocab.fi < {dataset_path}/{set_name}.tok.clean.fi > {dataset_path}/{set_name}.bpe.fi
!subword-nmt apply-bpe -c {bpe_path}/bpe.codes \
    --vocabulary {bpe_path}/vocab.en < {dataset_path}/{set_name}.tok.clean.en > {dataset_path}/{set_name}.bpe.en

In [None]:
! fairseq-preprocess --source-lang fi --target-lang en\
    --srcdict data/bpe/data-bin/dict.fi.txt \
    --tgtdict data/bpe/data-bin/dict.en.txt \
    --testpref {dataset_path}/test2.tok.clean \
    --destdir {data_bin_path} \
    --workers 20


In [None]:
!CUDA_VISIBLE_DEVICES=4,5,6,7 fairseq-generate {data_bin_path} \
    --path data/bpe/checkpoints/checkpoint_best.pt \
    --batch-size 128 --beam 5 --remove-bpe \
    --scoring sacrebleu --sacrebleu\
    > {base_path}/translations_sacrebleu.txt


In [None]:
output_file = f'{base_path}/translations_sacrebleu.txt'  # File generated by fairseq-generate
reordered_output_file = f'{base_path}/reordered_output.txt'  # File to save the reordered translations

# Hypothesis lines in the fairseq-generate log have the form
#     H-<sentence id>\t<score>\t<hypothesis text>
# Collect them keyed by sentence id so they can be written back in id order.
translations = {}
# Decode explicitly as UTF-8 so the result does not depend on the machine's locale.
with open(output_file, 'r', encoding='utf-8') as f:
    for line in f:  # stream the file instead of loading every line into memory
        if not line.startswith('H-'):
            continue
        # maxsplit=2 keeps the hypothesis intact even if it contains a tab
        parts = line.split('\t', 2)
        index = int(parts[0].split('-', 1)[1])
        translations[index] = parts[2].strip()

# The scoring steps compare hypotheses and references line by line, so a missing
# sentence id would silently shift every following line. Make gaps visible.
if not translations:
    print(f"Warning: no hypothesis ('H-') lines found in {output_file}")
else:
    missing = [i for i in range(max(translations) + 1) if i not in translations]
    if missing:
        print(
            f"Warning: {len(missing)} sentence id(s) have no hypothesis "
            f"(first few: {missing[:5]}); the output will not be line-aligned "
            f"with the reference"
        )

# Write one translation per line, ordered by sentence id
with open(reordered_output_file, 'w', encoding='utf-8') as f:
    for i in sorted(translations):
        f.write(translations[i] + '\n')


In [None]:
evaluation_folder = "data/base/evaluation"
!mkdir -p "{evaluation_folder}"
set_name = "test2"

# Reverse the truecasing of the reference test2 set (English)
!mosesdecoder/scripts/recaser/detruecase.perl \
< {dataset_path}/test2.tok.clean.en \
> {evaluation_folder}/test2.tok.en

# Reverse the truecasing of the source test2 set (Finnish)
!mosesdecoder/scripts/recaser/detruecase.perl \
< {dataset_path}/test2.tok.clean.fi \
> {evaluation_folder}/test2.tok.fi

# Reverse the tokenization of the reference test2 set (English)
!mosesdecoder/scripts/tokenizer/detokenizer.perl -l en \
< {evaluation_folder}/test2.tok.en \
> {evaluation_folder}/test2.detok.en

# Reverse the tokenization of the source test2 set (Finnish)
!mosesdecoder/scripts/tokenizer/detokenizer.perl -l fi \
< {evaluation_folder}/test2.tok.fi \
> {evaluation_folder}/test2.detok.fi

# Reverse the punctuation normalization of the reference test2 set (English)
!mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -r \
< {evaluation_folder}/test2.detok.en \
> {evaluation_folder}/test2.en

# Reverse the punctuation normalization of the source test2 set (Finnish)
!mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -r \
< {evaluation_folder}/test2.detok.fi \
> {evaluation_folder}/test2.fi

# Reverse the truecasing of the hypothesis translations (English)
!mosesdecoder/scripts/recaser/detruecase.perl \
< {base_path}/reordered_output.txt \
> {evaluation_folder}/reordered_output.truecase.txt

# Reverse the tokenization of the hypothesis translations (English)
!mosesdecoder/scripts/tokenizer/detokenizer.perl -l en \
< {evaluation_folder}/reordered_output.truecase.txt \
> {evaluation_folder}/reordered_output.detok.txt

# Reverse the punctuation normalization of the hypothesis translations (English)
!mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -r \
< {evaluation_folder}/reordered_output.detok.txt \
> {evaluation_folder}/reordered_output.txt

In [None]:
!sacrebleu {evaluation_folder}/test2.en < {evaluation_folder}/reordered_output.txt > {evaluation_folder}/sacrebleu_score.txt

In [None]:
!CUDA_VISIBLE_DEVICES=4,5,6,7 comet-score -t {evaluation_folder}/reordered_output.txt -r {evaluation_folder}/test2.en -s {evaluation_folder}/test2.fi > {evaluation_folder}/comet_score.txt