## Data preparation WORK

**Adjust the path below to match your configuration and environment.**

**Make sure the 'scripts' and 'data' folders are present.**

In [None]:
# Switch to the project directory; the explicit %cd magic does not depend on
# IPython's automagic setting (a bare `cd` stops working if automagic is off
# or a Python variable named `cd` exists).
%cd /content/drive/MyDrive/onmt

In [None]:
# Install the notebook's dependencies with %pip.
# NOTE(review): datasets, OpenNMT-py and sentencepiece are not version-pinned, so a
# fresh run may pull different releases than the ones this notebook was written
# against — consider pinning them.
%pip install datasets OpenNMT-py sentencepiece
# rmaipkg 0.0.4 is installed straight from its GitHub release tarball.
%pip install --no-cache-dir https://github.com/RobotsMali-AI/rmai/releases/download/0.0.4/rmaipkg-0.0.4.tar.gz

In [None]:
from datasets import load_dataset

# Load the "bam-fr" configuration of the RobotsMaliAI/bayelemabaga dataset.
# The cells below expect "train", "validation" and "test" splits, each exposing
# a "translation" column.
dataset = load_dataset("RobotsMaliAI/bayelemabaga", "bam-fr")

In [None]:
def decompose_dataset(dset):
  """Turn the dataset splits into plain lists of (bam, fr) sentence pairs.

  Args:
    dset: mapping with "train", "validation" and "test" entries; each entry's
      "translation" field is iterated and every item is indexed by "bam" and "fr".

  Returns:
    dict with keys "train", "test" and "dev" (the "validation" split is exposed
    under the name "dev"); each value is a list of (bam, fr) tuples.
  """
  def sentence_pairs(split):
    # One (source, target) tuple per translation record of the split.
    return [(record["bam"], record["fr"]) for record in dset[split]["translation"]]

  train_pairs = sentence_pairs("train")
  dev_pairs = sentence_pairs("validation")
  test_pairs = sentence_pairs("test")
  return {"train": train_pairs, "test": test_pairs, "dev": dev_pairs}

def write_x(path, data):
  """Write every item of ``data`` to ``path``, one item per line.

  Args:
    path: destination file; it is created or truncated.
    data: iterable of items; each one is formatted with ``str()`` semantics
      and followed by a newline.

  The file is always written as UTF-8 so that non-ASCII characters (common in
  both Bambara and French text) do not depend on the platform's default
  encoding.
  """
  with open(path, "w", encoding="utf-8") as fp:
    fp.writelines(f"{item}\n" for item in data)

def write_to_fs(dt_tuple, name):
  """Write a list of (bam, fr) pairs as two line-aligned files under ``data/``.

  Args:
    dt_tuple: iterable of 2-tuples (bam sentence, fr sentence). It must not be
      empty: unpacking the result of ``zip(*dt_tuple)`` raises ``ValueError``
      when there are no pairs.
    name: file stem; output goes to ``data/<name>.bam`` and ``data/<name>.fr``.
  """
  bam_side, fr_side = zip(*dt_tuple)
  outputs = (
    (f"data/{name}.bam", bam_side),
    (f"data/{name}.fr", fr_side),
  )
  for out_path, sentences in outputs:
    write_x(out_path, sentences)

# Flatten the dataset into (bam, fr) pair lists and keep one name per split.
out = decompose_dataset(dataset)
train, dev, test = out["train"], out["dev"], out["test"]

# Write data/<split>.bam and data/<split>.fr for each split, in the same order
# as before: train, dev, test.
for split_name, split_pairs in (("train", train), ("dev", dev), ("test", test)):
  write_to_fs(split_pairs, split_name)


In [None]:
# Run the project's preparation script on the training files written by the previous cell.
# NOTE(review): scripts/prepare.sh is not shown in this notebook; 80000 and "unigram"
# look like the SentencePiece vocabulary size and model type — confirm. If so, 80000
# disagrees with vocab_size = 50000 in the configuration cell, whose own comment says
# the two should be the same.
!bash scripts/prepare.sh data/train.bam data/train.fr 80000 unigram

## Model / Training Configuration

In [None]:
import os

# Name of the run; also the directory that receives the config, vocab, logs and checkpoints.
model_name = "bam2fr"
# NOTE(review): the prepare.sh cell is invoked with 80000 — confirm which value is
# intended; the YAML comment below says the sizes should match SentencePiece.
vocab_size = 50000

training_steps = 100
# Schedules derived from training_steps. Integer division keeps them exact, and
# max(1, ...) prevents a 0 interval when training_steps is smaller than 5 or 10.
valid_steps = max(1, training_steps // 5)
save_ckpt_freq = max(1, training_steps // 5)
warmup_steps = max(1, training_steps // 10)
reporting = max(1, training_steps // 10)
GPU = 1 # TOGGLE for GPU

# Safe to re-run: no error if the directory already exists.
os.makedirs(model_name, exist_ok=True)

# OpenNMT-py YAML configuration, rendered from the settings above.
config = f"""

## Where the samples will be written
save_data: {model_name}/run

overwrite: True # Toggle this for rewritting

# Training files
data:
    corpus_1:
        path_src: data/train.sub-src.txt
        path_tgt: data/train.sub-trg.txt
        transforms: [filtertoolong]
        weight : 1
    valid:
        path_src: data/dev.sub-src.txt
        path_tgt: data/dev.sub-trg.txt
        transforms: [filtertoolong]

# Vocabulary files, generated by onmt_build_vocab
src_vocab: {model_name}/source.vocab
tgt_vocab: {model_name}/target.vocab

# Vocabulary size - should be the same as in sentence piece
src_vocab_size: {vocab_size}
tgt_vocab_size: {vocab_size}

# Filter out source/target longer than n if [filtertoolong] enabled
src_seq_length: 150
tgt_seq_length: 150

# Tokenization options
src_subword_model: data/source.model
tgt_subword_model: data/target.model

# Where to save the log file and the output models/checkpoints
log_file: {model_name}/train.log
save_model: {model_name}/models/{model_name}

# Stop training if it does not improve after n validations
early_stopping: 3

# Default: 5000 - Save a model checkpoint for each n
save_checkpoint_steps: {save_ckpt_freq}

# To save space, limit checkpoints to last n
# keep_checkpoint: 3

seed: 3435

# Default: 100000 - Train the model to max n steps 
# Increase to 200000 or more for large datasets
# For fine-tuning, add up the required steps to the original steps
train_steps: {training_steps}

# Default: 10000 - Run validation after n steps
valid_steps: {valid_steps}

# Default: 4000 - for large datasets, try up to 8000
warmup_steps: {warmup_steps}
report_every: {reporting}

# Number of GPUs, and IDs of GPUs


# Batching
bucket_size: 262144
num_workers: 0  # Default: 2, set to 0 when RAM out of memory
batch_type: "tokens"
batch_size: 4096   # Tokens per batch, change when CUDA out of memory
valid_batch_size: 2048
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2
# warmup_steps: 8000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]

"""

# The GPU settings are appended only when the GPU toggle is on.
if GPU:
  config += """
world_size: 1
gpu_ranks: [0]
  """

with open(f"{model_name}/config.yaml", "w", encoding="utf-8") as fp:
  fp.write(config)



In [None]:
# {model_name} is expanded by IPython from the configuration cell, so the path
# follows the run name instead of repeating the literal directory.
!onmt_build_vocab -c {model_name}/config.yaml -n_sample -1 --dump_samples # -1 full corpus, bpe, sentencepiece

In [None]:
# Train with the config written by the configuration cell; {model_name} is
# expanded by IPython so the path follows the run name.
!onmt_train -config {model_name}/config.yaml

## Model Evaluation

In [None]:
# Translate the test set with the checkpoint saved at the final training step.
# {model_name} and {training_steps} are expanded by IPython from the configuration
# cell, so this cell keeps working when either setting changes.
!onmt_translate -model {model_name}/models/{model_name}_step_{training_steps}.pt -src data/test.sub-src.txt -output {model_name}/models/pred_{training_steps}.txt

In [None]:
# TODO: add the sacreBLEU evaluation of the predictions written by the previous cell.