<a href="https://colab.research.google.com/github/joeynmt/joeynmt/blob/main/notebooks/joey_v2_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ELAR Transformer

In [None]:
# Show the NVIDIA GPU(s) visible to this runtime (the training config below sets use_cuda: True).
!nvidia-smi

In [None]:
import torch
# Display the installed PyTorch version as the cell output.
torch.__version__

In [None]:
# Tab-separated corpus file. The loading cell reads column 0 as the
# output-language sentence and column 1 as the input-language sentence.
tab_seperated = "elan_cleaned_retained_lit.txt"
input_language, output_language = "auslan", "en"

# Held-out split sizing: the test and dev splits each take `group_size`
# examples, and `offset` selects which `group_size`-wide window of the
# shuffled data becomes the test split (dev is the window right after it).
group_size, offset = 200, 1

In [None]:
from datasets import DatasetDict, Features, Value, Translation

import random

# Read the tab-separated corpus into a list of translation dicts.
# Column 0 holds the output-language sentence and column 1 the input-language
# sentence; lines with fewer than two columns are skipped.
unsorted_data = []

# The context manager closes the file handle once reading is done, and the
# explicit encoding keeps decoding independent of the platform locale.
with open(tab_seperated, "r", encoding="utf-8") as data:
    for line in data:
        sentences = line.split("\t")

        if len(sentences) < 2:
            continue

        unsorted_data.append({
            # strip() drops the trailing newline when this is the last column.
            input_language: sentences[1].strip().lower(),
            # Stripped as well so both sides are normalised the same way.
            output_language: sentences[0].strip().lower(),
        })

# Fixed seed so the shuffle, and therefore the split membership, is reproducible.
random.seed(10)
random.shuffle(unsorted_data)

# Keep at most the first 6000 shuffled pairs.
unsorted_data = unsorted_data[:6000]

# Two adjacent windows of `group_size` pairs are held out: test starts at
# `group_size * offset`, dev follows directly, and the rest is train.
test_start = group_size * offset
dev_start = test_start + group_size
dev_end = dev_start + group_size

test = unsorted_data[test_start:dev_start]
dev = unsorted_data[dev_start:dev_end]
train = unsorted_data[dev_end:] + unsorted_data[:test_start]

# The three slices must partition the data: nothing dropped, nothing duplicated.
assert len(train) + len(dev) + len(test) == len(unsorted_data), \
    "train/dev/test slices do not partition the data; check group_size and offset"

In [None]:
import datasets

def create_dataset(data_group, start, end, languages=("auslan", "en")):
    """Wrap a list of translation pairs in a Hugging Face ``Dataset``.

    Args:
        data_group: List of dicts, one per example, keyed by language code.
        start: First integer id (inclusive) assigned to the examples.
        end: Last integer id (exclusive). ``end - start`` must equal
            ``len(data_group)`` so every example gets exactly one id.
        languages: Language codes declared on the ``translation`` feature.
            Defaults to ``("auslan", "en")``, the pair used in this notebook.

    Returns:
        A ``datasets.Dataset`` with a ``translation`` column and a string
        ``id`` column holding ``str(start)`` .. ``str(end - 1)``.

    Raises:
        ValueError: If the id range and the number of examples differ.
    """
    ids = [str(i) for i in range(start, end)]

    # Fail early with a readable message instead of a column-length error
    # raised from inside Dataset.from_dict.
    if len(ids) != len(data_group):
        raise ValueError(
            "id range [{}, {}) yields {} ids but data_group has {} examples".format(
                start, end, len(ids), len(data_group)
            )
        )

    data = {
        "translation": data_group,
        "id": ids
    }

    features = Features({
        'translation': Translation(languages=tuple(languages), id='my_custom_id'),
        'id': Value("string")
    })

    return datasets.Dataset.from_dict(data, features=features)

In [None]:
# Ids run contiguously across the splits: test gets [0, group_size), dev the
# next group_size ids, and train everything from 2 * group_size onwards.
elar_test = create_dataset(data_group=test, start=0, end=group_size)
elar_dev = create_dataset(data_group=dev, start=group_size, end=2 * group_size)
elar_train = create_dataset(
    data_group=train,
    start=2 * group_size,
    end=2 * group_size + len(train),
)

In [None]:
# Directory the splits are saved to; later cells also write the JoeyNMT
# config files here.
data_dir = "data/auslan"

# Bundle the splits under the names that the config's data paths refer to
# ("{data_dir}/train", "{data_dir}/validation", "{data_dir}/test").
dataset_dict = DatasetDict({
    "train": elar_train,
    "validation": elar_dev,
    "test": elar_test,
})
dataset_dict.save_to_disk(data_dir)

### Vocabulary

In [None]:
from pathlib import Path

# Data section of the JoeyNMT config. `{data_dir}` placeholders are filled in
# by str.format; src and trg share the same voc_file and sentencepiece model
# paths. Later cells extend this string with testing/training/model sections.
config = """
name: "auslan"
joeynmt_version: "2.2.0"

data:
    train: "{data_dir}/train"
    dev: "{data_dir}/validation"
    test: "{data_dir}/test"
    dataset_type: "huggingface"
    sample_dev_subset: 80
    src:
        lang: "auslan"
        max_length: 100
        lowercase: True
        normalize: False
        level: "word"
        voc_limit: 2950
        voc_min_freq: 1
        voc_file: "{data_dir}/vocab.txt"
        tokenizer_type: "sentencepiece"
        tokenizer_cfg:
            model_file: "{data_dir}/sp.model"

    trg:
        lang: "en"
        max_length: 100
        lowercase: True
        normalize: False
        level: "word"
        voc_limit: 2950
        voc_min_freq: 1
        voc_file: "{data_dir}/vocab.txt"
        tokenizer_type: "sentencepiece"
        tokenizer_cfg:
            model_file: "{data_dir}/sp.model"

""".format(data_dir=data_dir)

# Write the partial config next to the saved splits so the vocabulary can be
# built from it.
with open(Path(data_dir, "config.yaml"), "w") as config_file:
    config_file.write(config)

In [None]:
# The next cell runs build_vocab.py from the working directory. Download it
# once if it is not already there, so the notebook runs on a fresh runtime.
from pathlib import Path
from urllib.request import urlretrieve

build_vocab_url = "https://raw.githubusercontent.com/joeynmt/joeynmt/v2.2/scripts/build_vocab.py"
if not Path("build_vocab.py").exists():
    urlretrieve(build_vocab_url, "build_vocab.py")

In [None]:
# Build the vocabulary from the config written above. src and trg point at the
# same voc_file there; --joint presumably produces that single shared
# vocabulary (confirm against build_vocab.py).
!python build_vocab.py {data_dir}/config.yaml --joint

## Configuration

In [None]:
model_dir = "data/models/auslan"

# Rebuild the full config from the data section only. If this cell has already
# run, `config` ends with a previously appended block that starts at
# "\ntesting:"; cutting it off keeps the cell idempotent instead of writing
# duplicate testing/training/model sections on every re-run.
data_section = config.partition("\ntesting:")[0]

# NOTE(review): with batch_type "token", JoeyNMT's docs describe batch_size as
# a token count, so the training batch_size of 32 may be smaller than
# intended. Confirm before long runs.
config = data_section + """
testing:
    n_best: 1
    beam_size: 5
    beam_alpha: 1.0
    batch_size: 256
    batch_type: "token"
    max_output_length: 100
    eval_metrics: ["bleu"]
    #return_prob: "hyp"
    #return_attention: False
    sacrebleu_cfg:
        tokenize: "13a"

training:
    #load_model: "{model_dir}/latest.ckpt"
    #reset_best_ckpt: False
    #reset_scheduler: False
    #reset_optimizer: False
    #reset_iter_state: False
    random_seed: 42
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.999]
    scheduling: "warmupinversesquareroot"
    learning_rate_warmup: 2000
    learning_rate: 0.0002
    learning_rate_min: 0.00000001
    weight_decay: 0.0
    label_smoothing: 0.1
    loss: "crossentropy"
    batch_size: 32
    batch_type: "token"
    batch_multiplier: 4
    early_stopping_metric: "bleu"
    epochs: 500000
    # updates: 20000
    validation_freq: 1000
    logging_freq: 100
    model_dir: "{model_dir}"
    overwrite: True
    shuffle: True
    use_cuda: True
    fp16: False
    print_valid_sents: [0, 1, 2, 3]
    keep_best_ckpts: 3

model:
    initializer: "xavier_uniform"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier_uniform"
    embed_init_gain: 1.0
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 4
        num_heads: 8
        embeddings:
            embedding_dim: 256
            scale: True
            dropout: 0.0
        # typically ff_size = 4 x hidden_size
        hidden_size: 256
        ff_size: 1024
        dropout: 0.4
        layer_norm: "pre"
    decoder:
        type: "transformer"
        num_layers: 4
        num_heads: 8
        embeddings:
            embedding_dim: 256
            scale: True
            dropout: 0.4
        # typically ff_size = 4 x hidden_size
        hidden_size: 256
        ff_size: 1024
        dropout: 0.1
        layer_norm: "pre"

""".format(model_dir=model_dir)

# Overwrite the data-only config written earlier with the complete one.
with (Path(data_dir) / "config.yaml").open('w') as f:
    f.write(config)

### Run training

In [None]:
# Train with the full config written above. The config sets overwrite: True,
# so an existing model_dir is presumably replaced (confirm before re-running).
!python -m joeynmt train {data_dir}/config.yaml

### Evaluation

In [None]:
# Run JoeyNMT's test mode on best.ckpt from model_dir, using the decoding
# settings in the config's `testing` section (beam_size 5, BLEU).
!python -m joeynmt test {data_dir}/config.yaml --ckpt {model_dir}/best.ckpt

In [None]:
# Run JoeyNMT's translate mode with the same checkpoint.
# NOTE(review): this mode appears to expect input sentences on stdin or
# interactively; confirm how input is supplied when run from a notebook.
!python -m joeynmt translate {data_dir}/config.yaml --ckpt {model_dir}/best.ckpt

In [None]:
# Derive an n-best variant of the config: request 5 hypotheses instead of 1
# and un-comment `return_prob` so it is set to "hyp".
nbest_overrides = (
    ('n_best: 1', 'n_best: 5'),
    ('#return_prob: "hyp"', 'return_prob: "hyp"'),
)

nbest_config = config
for old_text, new_text in nbest_overrides:
    nbest_config = nbest_config.replace(old_text, new_text)

# Stored alongside the main config so both can be passed to the joeynmt CLI.
with open(Path(data_dir, "nbest_config.yaml"), "w") as nbest_file:
    nbest_file.write(nbest_config)

In [None]:
# Run translate mode with nbest_config.yaml (n_best: 5, return_prob: "hyp")
# against the same best.ckpt checkpoint.
!python -m joeynmt translate {data_dir}/nbest_config.yaml --ckpt {model_dir}/best.ckpt