In [None]:
# Mount Google Drive inside the Colab runtime (this cell only works on Colab).
from google.colab import drive
drive.mount('/content/drive')

# Add the Drive root to the import search path; `my_drive_path` is also
# reused further down to build the output directory of the training run.
import sys
my_drive_path = '/content/drive/MyDrive'
sys.path.append(my_drive_path)

Mounted at /content/drive


In [None]:
pip install evaluate bleu sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting bleu
  Downloading bleu-0.3.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting efficiency (from bleu)
  Downloading efficiency-2.0-py3-none-any.whl.metadata (2.5 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10

In [None]:
from sklearn.model_selection import train_test_split

# BLEU evaluation:
import nltk
from nltk.translate.bleu_score import sentence_bleu

from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import evaluate

import pandas as pd
import numpy as np
import warnings
import pickle

warnings.filterwarnings('ignore')

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
from transformers import NllbTokenizer
from transformers import AutoConfig

In [None]:
# Load local functions:
from database_object import * # read the data and create the dataset
from functions import *
from functions_model import *

# DATA COLLECTION

In [None]:
# Build the database object, then run each source-specific reader on it.
# Every reader is called with display=False, exactly once, in this order.
db = database()

for load_source in (
    db.get_data_alsaimmer,
    db.get_data_alsatext,
    db.get_data_motsAlsacienMulhouse,
    db.get_data_alignments,
):
    load_source(display=False)


In [None]:
# Split the dataset into training/validation/testing subsets (60% / 20% / 20%).
# `test_size` is the fraction held OUT of the first returned subset, so keeping
# 60% for training means holding out 0.4 (the former 0.6 left only 40% for
# training and gave 30% each to validation and test).
train, validtest = train_test_split(db.db    , test_size=0.4, random_state=0) # 60% training
valid, test      = train_test_split(validtest, test_size=0.5, random_state=0) # 20%-20% valid, test

# Create the dataset in a 'Dataset' format for tokenization :
# each split becomes a Dataset with a single 'translation' column.
d = {'train'     : Dataset.from_dict({'translation': train}),
     'validation': Dataset.from_dict({'translation': valid}),
     'test'      : Dataset.from_dict({'translation': test})
     }
d = DatasetDict(d)

# PRE-PROCESSING

In [None]:
# Choice of tokenizer (and the matching model configuration):
checkpoint = "google-t5/t5-base" # "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
config = AutoConfig.from_pretrained(checkpoint)

# Alternatives that were tried but are too demanding for Colab
# (RAM exceeded, even with batch_size=1):
#checkpoint = "facebook/nllb-200-distilled-600M"
#checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
#tokenizer = NllbTokenizer.from_pretrained(checkpoint)

src_lang = "fr"
# NOTE(review): the previous comment said the target was set to German because
# Alsatian is not supported, yet the value assigned is "als" -- confirm intent.
tgt_lang = "als"

# translate French to Alsatian:
# NOTE(review): src_lang/tgt_lang are the language hooks of multilingual
# tokenizers (NLLB, mBART); whether the T5 tokenizer or the local `encode`
# helper reads them is not visible from this notebook.
tokenizer.src_lang = src_lang
tokenizer.tgt_lang = tgt_lang

# Print an example sentence pair and its tokens.
# The French sentence is the model INPUT, so it is tokenized in the default
# (source) mode; only the Alsatian sentence is a target. `text_target=`
# replaces the deprecated `as_target_tokenizer()` context manager.
tokens_fr = tokenizer(db.db[0]['fr'])
print("---- FRENCH ------------------------")
print(db.db[0]['fr'])
print(tokens_fr)

tokens_als = tokenizer(text_target=db.db[0]['als'])
print("---- ALSACIAN ----------------------")
print(db.db[0]['als'])
print(tokens_als)


config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

----------------------------
A neuf heures vont (au lit) les raffinés
{'input_ids': [71, 20812, 8446, 193, 17, 41, 402, 4996, 61, 110, 3, 52, 18581, 899, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
----------------------------
Àm niina gehn d Fiina
{'input_ids': [3, 2, 51, 3, 29, 23, 77, 9, 873, 107, 29, 3, 26, 3188, 77, 9, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Tokenize every split in batches. `encode` is not defined in this notebook
# (presumably it comes from the local star-imports); it receives the tokenizer
# through `fn_kwargs`.
encode_kwargs = {"tokenizer": tokenizer}
tokenized_datasets = d.map(encode, batched=True, fn_kwargs=encode_kwargs)

Map:   0%|          | 0/5895 [00:00<?, ? examples/s]

Map:   0%|          | 0/4422 [00:00<?, ? examples/s]

Map:   0%|          | 0/4422 [00:00<?, ? examples/s]

# MODEL TUNING

In [None]:
# Load the pre-trained sequence-to-sequence model identified by `checkpoint`;
# this is the model that is fine-tuned on the additional dataset below.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Build the run folder from the checkpoint id: keep only what follows the
# last "/" and place the run under "<drive>/runs/".
model_name = checkpoint.rsplit("/", 1)[-1]

output_dir = f"{my_drive_path}/runs/{model_name}_essai6_evaluation"

In [None]:
# Set the arguments for training :
# Checkpoints are written once per epoch (save_strategy="epoch"). The former
# `save_steps=512` only applies to step-based saving, so it was dead
# configuration with a misleading comment and has been removed.
args = Seq2SeqTrainingArguments(
    output_dir,
    eval_strategy = "epoch", # evaluate on the validation split after each epoch
    save_strategy = "epoch", # same cadence as evaluation (needed to reload the best model)
    learning_rate = 2e-4,
    per_device_train_batch_size = 8, # 64 is too much for colab -> RAM overflow
    per_device_eval_batch_size  = 8,
    weight_decay     = 0.01,
    save_total_limit = 1,
    num_train_epochs = 20,
    predict_with_generate = True, # generate sequences at evaluation time so BLEU can be computed
    fp16                  = True, # True to speed up training on GPU
    # The best checkpoint is the one with the LOWEST validation loss.
    # NOTE(review): in the recorded run the loss bottoms out at epoch 4 while
    # BLEU keeps rising afterwards -- confirm eval_loss is the intended criterion.
    metric_for_best_model = 'eval_loss',
    greater_is_better     = False,
    load_best_model_at_end= True,
    seed=1,
  )

In [None]:
# Batch collator handed to the trainer below; it is built from both the
# tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Evaluation function
# (using references from https://github.com/huggingface/transformers/issues/24433 and https://github.com/huggingface/transformers/issues/22634)

# SacreBLEU scorer loaded through `evaluate`; `compute_metrics` below calls it.
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    """
    Score generated sequences against the reference labels.

    `eval_pred` unpacks into (predictions, labels). Predictions may arrive
    wrapped in a tuple/list (first element is used) or as 3-D logits (reduced
    to token ids with argmax over the last axis). Returns {"bleu": score}
    where score is the "score" entry produced by the module-level `metric`.
    """
    predictions, references = eval_pred

    # Unwrap a (predictions, ...) container if one was returned.
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]

    # Logits of rank 3 are collapsed to the most likely token id.
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)

    pad_id = tokenizer.pad_token_id

    def _to_text(token_ids):
        # -100 is the ignore index; swap it for the pad id so it can be decoded.
        decodable = np.where(token_ids != -100, token_ids, pad_id)
        return tokenizer.batch_decode(decodable, skip_special_tokens=True)

    pred_text = _to_text(predictions)
    label_text = _to_text(references)

    clean_preds, clean_labels = postprocess_text(pred_text, label_text)
    scores = metric.compute(predictions=clean_preds, references=clean_labels)
    return {"bleu": scores["score"]}

Downloading builder script: 0.00B [00:00, ?B/s]

# MODEL TRAINING

In [None]:
# Assemble the trainer from the model, the training arguments, the train and
# validation splits, the collator, the tokenizer and the BLEU callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Do the training, resuming from the newest checkpoint in `output_dir` if any.
# The checkpoint is looked up explicitly instead of wrapping `train()` in a
# bare `except:`, which also caught KeyboardInterrupt and genuine resume
# errors and then silently restarted the whole training from scratch.
from transformers.trainer_utils import get_last_checkpoint

try:
  last_checkpoint = get_last_checkpoint(output_dir) # None when no checkpoint folder is found
except FileNotFoundError:
  last_checkpoint = None # output_dir does not exist yet: nothing to resume

trainer.train(resume_from_checkpoint=last_checkpoint)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjoanne-adam[0m ([33mjoanne-adam-cea[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Bleu
1,3.1898,2.185302,14.067474
2,2.2671,1.83721,27.233399
3,1.5301,1.692937,34.282313
4,1.3146,1.67257,38.605954
5,1.0162,1.683218,40.567329
6,0.9112,1.714994,41.943575
7,0.699,1.752563,44.162432
8,0.648,1.842656,44.566143
9,0.5423,1.917478,44.443846
10,0.4638,1.961397,45.393821


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


# MODEL EVALUATION

In [None]:
# After training, evaluate the model on the test set.
# Without arguments `trainer.evaluate()` scores the trainer's `eval_dataset`
# (the validation split), so the held-out test split is passed explicitly and
# its metrics are reported under the "test_" prefix.
trainer.evaluate(eval_dataset=tokenized_datasets["test"], metric_key_prefix="test")

{'eval_loss': 1.6725703477859497,
 'eval_bleu': 38.60595363456538,
 'eval_runtime': 279.657,
 'eval_samples_per_second': 15.812,
 'eval_steps_per_second': 1.977,
 'epoch': 20.0}

In [None]:
# Persist the fine-tuned model and the configuration in the run folder:
trainer.save_model(output_dir)
config.save_pretrained(output_dir)

# Keep the tokenized splits next to the model, serialized with pickle:
dataset_dump_path = '%s/tokenized_datasets.pckl'%output_dir
with open(dataset_dump_path, 'wb') as dump_file:
  pickle.dump(tokenized_datasets, dump_file)

In [None]:
# --------------------------------------------------------------------------------------------------
def do_translation(text, model_name, return_text=True, return_token=False):
  """
  Translate `text` with the model stored at `model_name`.

  Parameters
  ----------
  text : str
      Sentence to translate.
  model_name : str
      Path or identifier given to `from_pretrained` for both the tokenizer
      and the model.
  return_text : bool
      When True (default), return the decoded translation as a string.
  return_token : bool
      When `return_text` is False and this is True, return the generated
      token ids of the first sequence instead.

  Returns
  -------
  The decoded string, the generated token ids, or None when both flags are
  False.

  Notes
  -----
  Generation samples (do_sample=True with top_k/top_p), so repeated calls on
  the same text may return different translations.
  """
  # Load the tokenizer and the model once. The former throw-away `pipeline`
  # call loaded the model a second time and discarded its result.
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  # Text -> tokens (input ids together with their attention mask):
  inputs = tokenizer(text, return_tensors="pt")

  # Do the translation using the tokenized text:
  outputs = model.generate(**inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

  if return_text:
    # Tokens -> text:
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
  if return_token:
    return outputs[0]
  return None

# TRY SOME TRANSLATION

In [None]:
# Translate one sample sentence with the saved model and show the pair.
text = "Le château est sur la montagne."
translation = do_translation(text, output_dir)

print(f"{text} -> {translation}")

Device set to use cuda:0


D Schtàui sch m Barg.
