In [None]:
%pip install joeynmt subword_nmt sentencepiece # Main OpenNMT Package (No Need for subword_nmt sentencepiece)
%pip install https://github.com/RobotsMali-AI/rmai/releases/download/0.0.4/rmaipkg-0.0.4.tar.gz # RobotsMaliAI's Datasets and Models
%pip install https://github.com/s7d11/daba/releases/download/v0.0.1-alpha/daba-0.9.2.tar.gz # Non-UI Version of Daba
%pip install sacrebleu # Redundant comes with OpenNMT-py

In [None]:
# Fetch the MT_base repository; the following cell changes into this checkout.
!git clone https://github.com/gjuuuy/MT_base.git

In [None]:
# Work from inside the checkout so the relative paths used later (data/,
# models/, config.yaml) resolve there.
%cd MT_base

In [None]:
# Check that a GPU is visible before training (config.yaml sets use_cuda: True).
!nvidia-smi 

In [None]:
from rmai.datasets.text import parallel

# Number of held-out sentence pairs reserved for the final test set.
TEST_SIZE = 360


def extract(x, dataset):
    """Return element ``x`` of every item in ``dataset`` as a list.

    Args:
        x: Index into each item; in this notebook 0 selects the Bambara side
            and 1 the French side of a sentence pair.
        dataset: Iterable of indexable items (sentence pairs).

    Returns:
        A list with one entry per item of ``dataset``.
    """
    return [pair[x] for pair in dataset]


# Load the Bambara-French parallel text.
# NOTE(review): randomize=True with no visible seed -- the data order (and
# therefore the split below) may differ between runs; confirm against rmai.
text = parallel.get_text(max_len=500000, randomize=True)

# Split into training data and a held-out remainder.
# NOTE(review): 90 is presumably the training percentage -- confirm against rmai.
train, heldout = parallel.random_split(text, 90)

# Fail early instead of silently producing an empty validation set.
assert len(heldout) > TEST_SIZE, (
    f"held-out split has only {len(heldout)} pairs; need more than {TEST_SIZE} "
    "to fill both the test and validation sets"
)

# The first TEST_SIZE held-out pairs form the test set, the rest validation.
test = heldout[:TEST_SIZE]
valid = heldout[TEST_SIZE:]

# Separate every split into its Bambara (index 0) and French (index 1) sides.
train_bam = extract(0, train)
train_fra = extract(1, train)

valid_bam = extract(0, valid)
valid_fra = extract(1, valid)

test_bam = extract(0, test)
test_fra = extract(1, test)


In [None]:
# Write every split to the "data" directory through rmai's writer. The trailing
# "s" / "t" in each name marks the source (Bambara) / target (French) side; the
# rename cell that follows expects the results as data/<name>.txt.
outputs = [
    ('trains', train_bam),
    ('traint', train_fra),
    ('devs', valid_bam),
    ('devt', valid_fra),
    ('tests', test_bam),
    ('testt', test_fra),
]

for name, lines in outputs:
    parallel.write_to(lines=lines, name=name, path="data")

In [None]:
!mv data/trains.txt data/train.bam && !mv data/traint.txt data/train.fr
!mv data/devs.txt data/dev.bam && !mv data/devt.txt data/dev.fr
!mv data/tests.txt data/test.bam && !mv data/testt.txt data/test.fr

In [None]:
!mkdir models

In [None]:
# JoeyNMT configuration for the Bambara -> French Transformer, kept as a plain
# string and written verbatim to config.yaml below.
# Deliberately NOT an f-string: nothing is interpolated, and a plain literal
# keeps any future "{" / "}" in the YAML from being treated as a placeholder.
config = """
name: "bam2fr"
joeynmt_version: "2.2.0"

data:
    train: "data/train"
    dev: "data/dev"
    test: "data/test"
    dataset_type: "plain"
    #dataset_cfg:           # not necessary for manually saved pyarray daraset
    #    name: "de-en"
    sample_dev_subset: 200
    src:
        lang: "bam"
        max_length: 100
        lowercase: False
        normalize: False
        level: "bpe"
        voc_limit: 49039
        voc_min_freq: 0
        voc_file: "data/vocab.txt"
        tokenizer_type: "sentencepiece"
        tokenizer_cfg:
            model_file: "data/sp.model"

    trg:
        lang: "fr"
        max_length: 100
        lowercase: False
        normalize: False
        level: "bpe"
        voc_limit: 49039
        voc_min_freq: 0
        voc_file: "data/vocab.txt"
        tokenizer_type: "sentencepiece"
        tokenizer_cfg:
            model_file: "data/sp.model"

testing:
    n_best: 1
    beam_size: 5
    beam_alpha: 1.0
    batch_size: 256
    batch_type: "token"
    max_output_length: 100
    eval_metrics: ["bleu"]
    #return_prob: "hyp"
    #return_attention: False
    sacrebleu_cfg:
        tokenize: "13a"

training:
    #load_model: "models/latest.ckpt"
    #reset_best_ckpt: False
    #reset_scheduler: False
    #reset_optimizer: False
    #reset_iter_state: False
    random_seed: 42
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.999]
    scheduling: "plateau"
    learning_rate_warmup: 1000
    learning_rate: 0.0004
    learning_rate_min: 0.00000001
    weight_decay: 0.0
    label_smoothing: 0.2
    loss: "crossentropy"
    batch_size: 4096
    batch_type: "token"
    batch_multiplier: 1
    early_stopping_metric: "bleu"
    epochs: 120
    updates: 100000
    validation_freq: 4000
    logging_freq: 100
    model_dir: "models/bam2fr"
    overwrite: True
    shuffle: True
    use_cuda: True
    print_valid_sents: [0, 1, 2, 3]
    keep_best_ckpts: 3

model:
    initializer: "xavier"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier"
    embed_init_gain: 1.0
    tied_embeddings: False
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4
        embeddings:
            embedding_dim: 256
            scale: True
            dropout: 0.2
        # typically ff_size = 4 x hidden_size
        hidden_size: 256
        ff_size: 1024
        dropout: 0.3
        layer_norm: "pre"
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4
        embeddings:
            embedding_dim: 256
            scale: True
            dropout: 0.2
        # typically ff_size = 4 x hidden_size
        hidden_size: 256
        ff_size: 1024
        dropout: 0.3
        layer_norm: "pre"
        """

# Persist the configuration next to the data; the vocabulary and training
# commands later in the notebook read it from this path. The encoding is set
# explicitly so the result does not depend on the platform default.
with open("config.yaml", "w", encoding="utf-8") as config_file:
    config_file.write(config)

In [None]:
#Vocab building.
!wget https://raw.githubusercontent.com/joeynmt/joeynmt/v2.2/scripts/build_vocab.py

In [None]:
# Run the downloaded vocabulary script on config.yaml. The config points src and
# trg at the same data/vocab.txt and data/sp.model; --joint presumably makes the
# script build that single shared vocabulary/model -- confirm with its --help.
!python build_vocab.py config.yaml --joint

In [None]:
# Train the model described by config.yaml; its model_dir is models/bam2fr.
# NOTE(review): with overwrite: True a re-run presumably replaces an existing
# run in that directory -- confirm before re-running after a long training.
!python -m joeynmt train config.yaml