[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](
https://colab.research.google.com/github/Klabauterkerl/finnish-chopper/blob/main/fairseq_bpe.ipynb)

In [None]:
# Install fairseq and other dependencies
# - fairseq:     provides the fairseq-preprocess / fairseq-train / fairseq-generate
#                commands used in the later cells
# - sacrebleu:   provides the `sacrebleu` command used for the final BLEU score
# - subword-nmt: provides the `subword-nmt` command used to learn and apply BPE
# - tensorboardX: presumably required for the --tensorboard-logdir option of the
#                training cell — TODO confirm
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install fairseq
%pip install sacrebleu sentencepiece
%pip install tensorboardX
%pip install subword-nmt

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define paths for the mounted Google Drive
base_path = "/content/drive/MyDrive/translation_model"
dataset_path = f"{base_path}/dataset"
data_bin_path = f"{base_path}/data-bin"
checkpoints_path = f"{base_path}/checkpoints"
logs_path = f"{base_path}/logs"

# Create directories in Google Drive
!mkdir -p "{dataset_path}"
!mkdir -p "{data_bin_path}"
!mkdir -p "{checkpoints_path}"
!mkdir -p "{logs_path}"


In [22]:
# Define paths when locally running
base_path = "data"
dataset_path = f"{base_path}/dataset"
data_bin_path = f"{base_path}/data-bin"
checkpoints_path = f"{base_path}/checkpoints"
logs_path = f"{base_path}/logs"

!mkdir -p "{dataset_path}"
!mkdir -p "{data_bin_path}"
!mkdir -p "{checkpoints_path}"
!mkdir -p "{logs_path}"

In [None]:
# Download and extract dataset
!wget -P "{dataset_path}" https://www.statmt.org/europarl/v9/training/europarl-v9.fi-en.tsv.gz
!gunzip "{dataset_path}/europarl-v9.fi-en.tsv.gz"


In [None]:
# Split dataset into two files, each containing one column of the original dataset

!cut -f1 {dataset_path}/europarl-v9.fi-en.tsv > {dataset_path}/train.fi
!cut -f2 {dataset_path}/europarl-v9.fi-en.tsv > {dataset_path}/train.en

In [None]:
# Install Moses for preprocessing
!git clone https://github.com/moses-smt/mosesdecoder.git

In [None]:
# Train truecaser model for Finnish
!mosesdecoder/scripts/recaser/train-truecaser.perl \
-corpus {dataset_path}/train.fi \
-model {dataset_path}/truecase-model.fi

# Train truecaser model for English
!mosesdecoder/scripts/recaser/train-truecaser.perl \
-corpus {dataset_path}/train.en \
-model {dataset_path}/truecase-model.en

In [None]:
set_name = "train"

# Normalize punctuation and tokenize Finnish text
!cat {dataset_path}/{set_name}.fi | \
mosesdecoder/scripts/tokenizer/normalize-punctuation.perl fi | \
mosesdecoder/scripts/tokenizer/tokenizer.perl -threads 8 -no-escape -l fi \
> {dataset_path}/{set_name}.tok.fi

# Normalize punctuation and tokenize English text
!cat {dataset_path}/{set_name}.en | \
mosesdecoder/scripts/tokenizer/normalize-punctuation.perl en | \
mosesdecoder/scripts/tokenizer/tokenizer.perl -threads 8 -no-escape -l en \
> {dataset_path}/{set_name}.tok.en

# Truecase the tokenized Finnish text
!mosesdecoder/scripts/recaser/truecase.perl \
-model {dataset_path}/truecase-model.fi \
< {dataset_path}/{set_name}.tok.fi \
> {dataset_path}/{set_name}.tok.truecase.fi

# Truecase the tokenized English text
!mosesdecoder/scripts/recaser/truecase.perl \
-model {dataset_path}/truecase-model.en \
< {dataset_path}/{set_name}.tok.en \
> {dataset_path}/{set_name}.tok.truecase.en

# Clean the corpus
!perl mosesdecoder/scripts/training/clean-corpus-n.perl \
{dataset_path}/{set_name}.tok.truecase en fi \
{dataset_path}/{set_name}.tok.clean 1 50

In [None]:
set_name = "train"

# Learn a joint BPE model and vocabulary
!subword-nmt learn-joint-bpe-and-vocab \
     --input {dataset_path}/{set_name}.tok.clean.fi {dataset_path}/{set_name}.tok.clean.en -s 32000 \
     -o {dataset_path}/bpe.codes --write-vocabulary {dataset_path}/vocab.fi {dataset_path}/vocab.en

# Apply the learned BPE model and vocabulary
!subword-nmt apply-bpe -c {dataset_path}/bpe.codes \
     --vocabulary {dataset_path}/vocab.fi < {dataset_path}/{set_name}.tok.clean.en > {dataset_path}/{set_name}.bpe.fi
!subword-nmt apply-bpe -c {dataset_path}/bpe.codes \
     --vocabulary {dataset_path}/vocab.en < {dataset_path}/{set_name}.tok.clean.en > {dataset_path}/{set_name}.bpe.en

In [None]:
set_name = "train"

# Apply the learned BPE model and vocabulary to the dev & test set
!subword-nmt apply-bpe -c {dataset_path}/bpe.codes \
    --vocabulary {dataset_path}/vocab.fi < {dataset_path}/{set_name}.tok.clean.en > {dataset_path}/{set_name}.bpe.fi
!subword-nmt apply-bpe -c {dataset_path}/bpe.codes \
    --vocabulary {dataset_path}/vocab.en < {dataset_path}/{set_name}.tok.clean.en > {dataset_path}/{set_name}.bpe.en

In [None]:
# Define paths for the mounted Google Drive
# Re-declares the project paths so the binarize / train / generate cells below
# can be run without re-executing the setup cells at the top. Skip this cell
# when running locally, otherwise the local paths are replaced.
base_path = "/content/drive/MyDrive/translation_model"
data_bin_path = f"{base_path}/data-bin"
checkpoints_path = f"{base_path}/checkpoints"
dataset_path = f"{base_path}/dataset"
# The training cell passes logs_path to --tensorboard-logdir, so it has to be
# defined here as well.
logs_path = f"{base_path}/logs"



In [None]:
# Create Dataset using BPE Data
!fairseq-preprocess --source-lang fi --target-lang en \
    --trainpref {bpe_path}/train.bpe --validpref {bpe_path}/valid.bpe --testpref {bpe_path}/test.bpe \
    --destdir {data_bin_path}/bpe --joined-dictionary --workers 20

In [None]:
# Train Model using BPE Dataset
!fairseq-train "{data_bin_path}/bpe" \
    --arch transformer --share-all-embeddings \
    --encoder-layers 5 --decoder-layers 5 \
    --encoder-embed-dim 512 --decoder-embed-dim 512 \
    --encoder-ffn-embed-dim 2048 --decoder-ffn-embed-dim 2048 \
    --encoder-attention-heads 8 --decoder-attention-heads 8 \
    --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.1 \
    --optimizer adam --lr 0.0005 --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr 1e-07 \
    --stop-min-lr 1e-09 --clip-norm 0.0 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --weight-decay 0.0001 --max-tokens 4096 \
    --update-freq 1 --max-epoch 30 --save-interval 1 \
    --keep-last-epochs 5 --log-format simple --log-interval 100 \
    --tensorboard-logdir "{logs_path} --seed 42" \
    --save-dir "{checkpoints_path}/bpe" \
    --amp --patience 3

In [None]:
# Generate translations using BPE trained model
! fairseq-generate "{data_bin_path}/bpe" \
    --path "{checkpoints_path}/bpe/checkpoint_best.pt" \
    --beam 5 --lenpen 1.2 \
    --quiet \
    --gen-subset test \
    --remove-bpe > "{base_path}/translations_bpe.txt"

In [None]:
# Compute BLEU score
!cat "{base_path}/translations_bpe.txt" | sacrebleu {dataset_path}/test.en