In [1]:
# Directory layout for a local run. Every artefact lives under `base_path`;
# the variables stay plain strings because later cells interpolate them into
# shell commands with `{name}`.
import os

base_path = "data/base"
dataset_path = f"{base_path}/dataset"
data_bin_path = f"{base_path}/data-bin"
checkpoints_path = f"{base_path}/checkpoints"
logs_path = f"{base_path}/logs"
evaluation_folder = f"{base_path}/evaluation"

# Create the directories with the standard library instead of `!mkdir -p`, so
# this cell does not depend on a POSIX shell. `exist_ok=True` keeps the cell
# idempotent: re-running it on an existing tree is a no-op.
for directory in (
    dataset_path,
    data_bin_path,
    checkpoints_path,
    logs_path,
    evaluation_folder,
):
    os.makedirs(directory, exist_ok=True)

In [2]:
set_name = "test2"

# Moses preprocessing, run once per language side of the parallel set.
# Both steps read from and write to `dataset_path`.
for lang in ("fi", "en"):
    # Step 1: punctuation normalisation piped straight into the tokenizer;
    # the result is written to <set_name>.tok.<lang>.
    !cat {dataset_path}/{set_name}.{lang} | \
        mosesdecoder/scripts/tokenizer/normalize-punctuation.perl {lang} | \
        mosesdecoder/scripts/tokenizer/tokenizer.perl -threads 8 -no-escape -l {lang} \
        > {dataset_path}/{set_name}.tok.{lang}

    # Step 2: truecase the tokenized text. The model file
    # truecase-model.<lang> is read from `dataset_path`; it is not created
    # in this cell, so it must already exist.
    !mosesdecoder/scripts/recaser/truecase.perl \
        -model {dataset_path}/truecase-model.{lang} \
        < {dataset_path}/{set_name}.tok.{lang} \
        > {dataset_path}/{set_name}.tok.truecase.{lang}

# Step 3: filter the truecased pair jointly (both sides at once) with length
# bounds 1 and 50, writing <set_name>.tok.clean.en / .fi.
!perl mosesdecoder/scripts/training/clean-corpus-n.perl \
    {dataset_path}/{set_name}.tok.truecase en fi \
    {dataset_path}/{set_name}.tok.clean 1 50

Tokenizer Version 1.1
Language: fi
Number of threads: 8
Tokenizer Version 1.1
Language: en
Number of threads: 8
clean-corpus.perl: processing data/base/dataset/test2.tok.truecase.en & .fi to data/base/dataset/test2.tok.clean, cutoff 1-50, ratio 9

Input sentences: 3002  Output sentences:  2970


In [3]:
set_name = "test2"

# Segment the cleaned text of `set_name` into subword units, one language at
# a time. The shared merge table (bpe.codes) and the per-language vocabulary
# files are read from `dataset_path`; the output is <set_name>.bpe.<lang>.
for lang in ("fi", "en"):
    !subword-nmt apply-bpe -c {dataset_path}/bpe.codes \
        --vocabulary {dataset_path}/vocab.{lang} \
        < {dataset_path}/{set_name}.tok.clean.{lang} \
        > {dataset_path}/{set_name}.bpe.{lang}

In [8]:
# Binarize the test set for fairseq, reusing the existing source/target
# dictionaries instead of building new ones.
#
# The input prefix must be the BPE-segmented files (<set_name>.bpe.fi/.en)
# written by the apply-bpe step. Pointing --testpref at the un-segmented
# `.tok.clean` files would binarize whole words against BPE dictionaries and
# leave the BPE output unused.
#
# NOTE(review): the dictionaries are read from data/bpe/data-bin, outside
# `base_path` — confirm these are the ones the checkpoint was trained with.
!fairseq-preprocess --source-lang fi --target-lang en \
    --srcdict data/bpe/data-bin/dict.fi.txt \
    --tgtdict data/bpe/data-bin/dict.en.txt \
    --testpref {dataset_path}/{set_name}.bpe \
    --destdir {data_bin_path} \
    --workers 20


2023-06-28 09:42:06 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe=None, optimizer=None, lr_scheduler='fixed', scoring='bleu', task='translation', source_lang='fi', target_lang='en', tr

In [None]:
# Translate the binarized data in `data_bin_path` with the best checkpoint
# (beam 5, batches of 128 sentences), strip the BPE markers from the output,
# and score with sacreBLEU. Standard output is redirected to a text file
# under `base_path`.
!fairseq-generate {data_bin_path} \
    --path {checkpoints_path}/checkpoint_best.pt \
    --batch-size 128 \
    --beam 5 \
    --remove-bpe \
    --scoring sacrebleu \
    --sacrebleu \
    > {base_path}/translations_sacrebleu.txt
