# Setup

In [None]:
import torch

# Index of the GPU this notebook is meant to run on.
GPU_INDEX = 1

cuda_available = torch.cuda.is_available()
gpu_count = torch.cuda.device_count()

# `torch.cuda.device(1)` only builds a context manager and has no effect outside a
# `with` block; `torch.cuda.set_device` is the call that switches the current device.
# Guard on the index so a machine with fewer GPUs (or none) does not raise.
if cuda_available and GPU_INDEX < gpu_count:
    torch.cuda.set_device(GPU_INDEX)

# `torch.cuda.current_device()` raises when CUDA cannot be initialised, so only
# query it when a GPU is actually available.
current_gpu = torch.cuda.current_device() if cuda_available else None
print(cuda_available, gpu_count, current_gpu)

# Model identifiers for the French and English classifiers
# (presumably Hugging Face Hub checkpoint names — confirm).
model_fr_name, model_en_name = 'camembert-base', 'roberta-base'

# Translation model identifiers, keyed by '<source>_<target>' language pair.
model_translation = {
    'fr_en': 'Helsinki-NLP/opus-mt-fr-en',
    'en_fr': 'Helsinki-NLP/opus-mt-en-fr',
}

# Dataset identifier and the directory handed to the project helpers in later cells.
dataset_name = 'amazon_reviews_multi'
data_path = '/data/desponds/data/Classification'


# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Get data, trainers and models if already trained

In [None]:
from preprocessing import preprocessing_review_classification
# Helper imported from the `preprocessing` module (not shown here). It is called with the
# dataset name and data directory from the setup cell and its result is unpacked into two
# objects: `datasets` and `tokenized`.
# NOTE(review): presumably `tokenized` is the tokenized version of `datasets` — confirm.
datasets, tokenized = preprocessing_review_classification(dataset_name, data_path)

In [None]:
# Display one entry from the English 'train' split and one from the French 'train' split.
# NOTE(review): index 9873 looks arbitrary — confirm it has no special meaning.
datasets['en']['train'][0], datasets['fr']['train'][9873]

In [None]:
from training import get_trainers_review_classification
# Build the trainers from the data directory and the tokenized data.
# NOTE(review): the structure of `trainers` is defined in the `training` module
# (not shown here) — check there before indexing into it.
trainers = get_trainers_review_classification(data_path, tokenized)

In [None]:
from training import get_models_review_classification
# Obtain the models from the data directory and the trainers built above.
# NOTE(review): per the section title, models are presumably loaded from `data_path`
# when already trained — confirm in the `training` module.
models = get_models_review_classification(data_path, trainers)

In [None]:
def remove_bad_punctuation(example):
    """Collapse runs of full stops in ``example['review_body']`` into a single '.'.

    A run is two or more dots, each optionally preceded by a single space
    (e.g. ``'...'``, ``'. .'``, ``'.. .'``). Chaining ``str.replace`` calls makes
    only one pass per pattern, which leaves longer runs such as ``'.....'`` or
    ``'. . .'`` partially collapsed; a single regex substitution handles runs of
    any length.

    Args:
        example: Mapping with a string stored under the ``'review_body'`` key.

    Returns:
        The same ``example`` object, with ``'review_body'`` updated in place.
    """
    import re  # local import keeps this cell self-contained

    # One dot followed by one or more "(optional single space) dot" groups.
    example['review_body'] = re.sub(r'\.(?: ?\.)+', '.', example['review_body'])
    return example
# Replace the French test split with the result of mapping the punctuation clean-up over it.
datasets['fr']['test'] = datasets['fr']['test'].map(remove_bad_punctuation)

# Translation

In [None]:
from translation import translate_fr_en_review_classification
# Pickle path handed to the translation helper; the loading cell below opens this same path.
save_path = f"{data_path}/translated_dataset.pickle"
# NOTE(review): presumably translates the French test split to English and writes the
# result to `save_path` (the return value is discarded here) — confirm in the helper.
translate_fr_en_review_classification(datasets['fr']['test'], save_path)

In [None]:
import pickle

# Read the pickled translated dataset back from the data directory.
# Security note: unpickling can execute arbitrary code, so only load files that
# were produced by a trusted run of this project.
translated_pickle_path = f"{data_path}/translated_dataset.pickle"
with open(translated_pickle_path, 'rb') as handle:
    translated_fr_en = pickle.load(handle)

# Evaluation

In [None]:
from evaluation import evaluate_review_classification

# Run the project's evaluation helper on the models, tokenized data and raw datasets,
# then show the returned accuracies as the cell output.
accuracies = evaluate_review_classification(models, tokenized, datasets)
accuracies

## Results

In [None]:
import pandas as pd

# Summary table of the classification experiments, written one experiment per row
# and then transposed into the column-oriented dict that feeds the DataFrame.
result_columns = [
    'task', 'model', 'nb_sample_train', 'train_dataset',
    'test_dataset', 'translated', 'test_accuracy',
]
result_rows = [
    ('Classification', 'RoBERTa', 200000, 'Amazon_reviews_en', 'Amazon_reviews_en', 'no', 0.6044),
    ('Classification', 'RoBERTa', 200000, 'Amazon_reviews_en', 'Amazon_reviews_fr', 'no', 0.2028),
    ('Classification', 'RoBERTa', 200000, 'Amazon_reviews_en', 'Amazon_reviews_fr_translated', 'yes', 0.552),
    ('Classification', 'CamemBERT', 200000, 'Amazon_reviews_fr', 'Amazon_reviews_fr', 'no', 0.5972),
]

# Column name -> list of values, in the column order listed above.
data = {
    column: list(values)
    for column, values in zip(result_columns, zip(*result_rows))
}
results = pd.DataFrame(data)
results