## Notes 
- For the moment the model is only tokenized with the `text_body` and not with the `text_title`
- The evalutation training metric is the `accuracy`. Is it the best metric for an ordered variable ? (classify 4 instead of 5 is less important than classify 1 instead of 5)
- for the moment we do not monitor the evaluation metric during fine-tuning. What does it mean ? Is it useful ? 


In [1]:
import torch
print(torch.cuda.is_available())

print(torch.cuda.device_count())
print(torch.cuda.current_device())
torch.cuda.device(0)
torch.cuda.get_device_name(0)

True
2
0


'NVIDIA TITAN Xp'

In [2]:
model_fr_name = 'camembert-base'
model_en_name = 'roberta-base'
model_translation = {}
model_translation['fr_en'] = 'Helsinki-NLP/opus-mt-fr-en'
model_translation['en_fr'] = 'Helsinki-NLP/opus-mt-en-fr'
dataset_name = 'amazon_reviews_multi'

In [4]:
from datasets.dataset_dict import DatasetDict
from datasets import load_dataset

dataset = load_dataset(dataset_name)

def stars_into_labels(example):
    # Change the range of stars [1-5] to labels [0-4]
    example['stars'] = example['stars']-1
    return example

def get_dataset_language(full_dataset, language) :
    dataset_out = {}
    for split in full_dataset : 
        #Take only the language of interest
        dataset_out[split] = dataset[split].filter(lambda exemple : exemple['language'] == language)
        #Remove useless columns
        dataset_out[split] = dataset_out[split].remove_columns(["review_id","product_id", "reviewer_id",
        "review_title","language","product_category"])
        #Change the range of the labels and rename the column for the training
        dataset_out[split] = dataset_out[split].rename_column("review_body", "text")
        dataset_out[split] = dataset_out[split].map(stars_into_labels)
        dataset_out[split] = dataset_out[split].rename_column("stars", "label")
    return DatasetDict(dataset_out)

#Prprocess the datasets
dataset_fr = get_dataset_language(dataset, 'fr')
dataset_en = get_dataset_language(dataset, 'en')

Loading cached processed dataset at /home/desponds/.cache/huggingface/datasets/amazon_reviews_multi/all_languages/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-abef70f0f0f656cb.arrow
Loading cached processed dataset at /home/desponds/.cache/huggingface/datasets/amazon_reviews_multi/all_languages/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-79c67e48d5206038.arrow
Loading cached processed dataset at /home/desponds/.cache/huggingface/datasets/amazon_reviews_multi/all_languages/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-faeec778108fe3de.arrow
Loading cached processed dataset at /home/desponds/.cache/huggingface/datasets/amazon_reviews_multi/all_languages/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-41232663b589bd61.arrow
Loading cached processed dataset at /home/desponds/.cache/huggingface/datasets/amazon_reviews_multi/all_languages/1.0.0/724e94f4b0c6c405ce7e476a

## 2.1 Fine-tune French Model

In [10]:
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# Take the tokenizers of the respective models
tokenizer = {}
tokenizer['fr'] = AutoTokenizer.from_pretrained(model_fr_name)
tokenizer['en'] = AutoTokenizer.from_pretrained(model_en_name)

def tokenize_function(examples, language):
    return tokenizer[language](examples["text"], 
                     padding="max_length", truncation=True)

# Tokenize all the data
for split in dataset_fr :
    dataset_fr[split] = dataset_fr[split].map(lambda examples : tokenize_function(examples,'fr'), batched=True)
    dataset_en[split] = dataset_en[split].map(lambda examples : tokenize_function(examples,'en'), batched=True)


#Take only a small part of the dataset for testing
small_fr_dataset = {}
for split in dataset_fr :
    small_fr_dataset[split] =  dataset_fr[split].shuffle(seed=42).select(range(50))
small_fr_dataset = DatasetDict(small_fr_dataset)

# Load the two pretrained models
model = {}
model['fr'] = AutoModelForSequenceClassification.from_pretrained(model_fr_name, num_labels=5)
model['en'] = AutoModelForSequenceClassification.from_pretrained(model_en_name, num_labels=5)

# Set the training arguments for both trainer
training_args = {}
training_args['fr'] = TrainingArguments(output_dir="test_trainer_fr", evaluation_strategy="epoch")
training_args['en'] = TrainingArguments(output_dir="test_trainer_en", evaluation_strategy="epoch")


# Set the metric to accuracy for the training
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = {}
trainer['fr'] = Trainer(
    model=model['fr'],
    args=training_args['fr'],
    train_dataset= dataset_fr['train'],
    eval_dataset= dataset_fr['validation'],
    compute_metrics=compute_metrics,
)
trainer['en'] = Trainer(
    model=model['en'],
    args=training_args['en'],
    train_dataset= dataset_en['train'],
    eval_dataset= dataset_en['validation'],
    compute_metrics=compute_metrics,
)
trainer['fr'].train('/data/desponds/data/Classification/trainer_fr/checkpoint-37500')
trainer['en'].train('/data/desponds/data/Classification/trainer_en/checkpoint-37500')

In [13]:
training_args['fr'].device

device(type='cuda', index=0)

## Translation

SOhelp : https://stackoverflow.com/questions/70043467/how-to-run-huggingface-helsinki-nlp-models

In [14]:
# Testing how to translate

# First way of translation 
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer_trad = AutoTokenizer.from_pretrained(model_translation['fr_en'] )
model_trad = AutoModelForSeq2SeqLM.from_pretrained(model_translation['fr_en'] )

inp = "Je m'appelle Mathieu et je vis à Lausanne"
input_ids = tokenizer_trad(inp, return_tensors="pt").input_ids
outputs = model_trad.generate(input_ids=input_ids, num_return_sequences=1)
print(tokenizer_trad.batch_decode(outputs, skip_special_tokens=True)[0])

# Second way of translation with more abstraction
from transformers import pipeline
translator = {}
translator['fr_en'] = pipeline("translation", model='Helsinki-NLP/opus-mt-fr-en')
print(translator['fr_en']("Ce cours est produit par Hugging Face.")[0]['translation_text'])

loading configuration file config.json from cache at /home/desponds/.cache/huggingface/hub/models--Helsinki-NLP--opus-mt-fr-en/snapshots/49463f1706007cb314a942296b77a6483e6f6953/config.json
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-fr-en",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "decoder_vocab_size": 59514,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkp

'My name is Mathieu and I live in Lausanne.'

In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer_trad_fr_en = AutoTokenizer.from_pretrained(model_translation['fr_en'] )
model_trad_fr_en = AutoModelForSeq2SeqLM.from_pretrained(model_translation['fr_en'] )

def translate_fr_en(example):
    inp = example['text']
    input_ids = tokenizer_trad_fr_en(inp, 
                                     return_tensors="pt",
                                     padding="max_length", 
                                     truncation=True).input_ids
    outputs = model_trad_fr_en.generate(input_ids=input_ids, num_return_sequences=1)
    example["text"] = tokenizer_trad_fr_en.batch_decode(outputs, skip_special_tokens=True)[0]
    return example

# Translate the test split of the french dataset
translated_fr_en = dataset_fr['test'].map(translate_fr_en)

#Recompute the tokens of the translated version
translated_fr_en.remove_columns(['input_ids', 'attention_mask'])
translated_fr_en = translated_fr_en.map(lambda examples : tokenize_function(examples,'en'), batched=True)

loading configuration file config.json from cache at /home/desponds/.cache/huggingface/hub/models--Helsinki-NLP--opus-mt-fr-en/snapshots/49463f1706007cb314a942296b77a6483e6f6953/config.json
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-fr-en",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "decoder_vocab_size": 59514,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkp

## Evaluation

In [12]:
trainer['fr'].predict(dataset_fr['test'])

The following columns in the test set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 5000
  Batch size = 16


PredictionOutput(predictions=array([[ 4.8188376 ,  1.3251189 , -1.0401316 , -2.6615825 , -2.1623106 ],
       [ 4.653632  ,  1.5906588 , -0.73805064, -2.70451   , -2.4790804 ],
       [ 4.6040254 ,  0.9419153 , -0.99038637, -2.3694022 , -1.9822422 ],
       ...,
       [-3.5564382 , -2.8816404 ,  0.2015841 ,  2.8081903 ,  2.9350033 ],
       [-2.1337707 , -3.0059257 , -1.5882653 ,  1.4551748 ,  4.6391654 ],
       [-3.3857815 , -2.827093  ,  0.06521205,  2.5770404 ,  3.1005316 ]],
      dtype=float32), label_ids=array([0, 0, 0, ..., 4, 4, 4]), metrics={'test_loss': 0.9777808785438538, 'test_accuracy': 0.5972, 'test_runtime': 77.7849, 'test_samples_per_second': 64.28, 'test_steps_per_second': 4.024})

In [13]:
trainer['en'].predict(dataset_en['test'])

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 5000
  Batch size = 16


PredictionOutput(predictions=array([[ 4.7439485 ,  1.2203276 , -0.5132247 , -2.6602557 , -2.8391514 ],
       [ 4.2171583 ,  2.2571788 ,  0.00764171, -2.6744833 , -3.3891168 ],
       [ 4.482563  ,  1.9727943 , -0.28513804, -2.7017975 , -3.1886637 ],
       ...,
       [-3.6019254 , -2.7844594 , -0.5561998 ,  2.6640472 ,  3.7220466 ],
       [-3.580228  , -2.8779206 , -1.1100246 ,  2.6448374 ,  4.37079   ],
       [-3.6487594 , -2.9328356 , -0.95110667,  2.7271345 ,  4.247052  ]],
      dtype=float32), label_ids=array([0, 0, 0, ..., 4, 4, 4]), metrics={'test_loss': 0.9427361488342285, 'test_accuracy': 0.6044, 'test_runtime': 77.3702, 'test_samples_per_second': 64.624, 'test_steps_per_second': 4.045})

In [22]:
trainer['en'].predict(translated_fr_en)

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 5000
  Batch size = 16


PredictionOutput(predictions=array([[ 4.314968  ,  1.1656147 , -0.40019938, -2.3420541 , -2.7511508 ],
       [ 4.586199  ,  0.8615197 , -0.4600597 , -2.480549  , -2.6347854 ],
       [ 4.063932  ,  0.5198351 , -0.41078267, -2.0535574 , -2.2988946 ],
       ...,
       [-3.9075608 , -2.305743  ,  0.7384524 ,  3.102282  ,  1.8296533 ],
       [-2.533145  , -2.1799974 , -1.9504237 ,  1.2701927 ,  4.6203904 ],
       [-3.4184883 , -2.7364452 , -0.8394333 ,  2.434169  ,  3.9835973 ]],
      dtype=float32), label_ids=array([0, 0, 0, ..., 4, 4, 4]), metrics={'test_loss': 1.096827745437622, 'test_accuracy': 0.5552, 'test_runtime': 77.6661, 'test_samples_per_second': 64.378, 'test_steps_per_second': 4.03})

## Results

In [24]:
import pandas as pd
data = {
    'task' : ['Classification', 'Classification', 'Classification'],
    'dataset' : ['Amazon_reviews_fr', 'Amazon_reviews_en', 'Amazon_reviews_fr'],
    'translated' : ['no', 'no', 'yes'],
    'model'   : ['CamemBERT', 'Roberta', 'Roberta'],
    'test_loss' : [0.9777808785438538, 0.9427361488342285, 1.096827745437622],
    'test_accuracy' : [0.5972, 0.6044, 0.552]
}
results = pd.DataFrame(data)
results

Unnamed: 0,task,dataset,translated,model,test_loss,test_accuracy
0,Classification,Amazon_reviews_fr,no,CamemBERT,0.977781,0.5972
1,Classification,Amazon_reviews_en,no,Roberta,0.942736,0.6044
2,Classification,Amazon_reviews_fr,yes,Roberta,1.096828,0.552
