## Notes 
- Pour l'instant, on teste sur le validation set. Il faut vérifier si le validation set est utilisé durant le training ou s'il est juste là pour l'évaluation.
- Pour la traduction : le fait de traduire le contexte peut décaler la position indiquée par la variable `answer_start`, mais je pense que ça n'a pas d'impact car on ne l'utilise pas à la validation.
- Le modèle anglais est entraîné sur 4x plus de samples : est-ce qu'on veut entraîner le modèle français sur plus d'epochs ?
- La traduction de la réponse n'est pas faite en même temps que celle de la question.

In [None]:
import torch
import pickle
import pandas as pd
import numpy as np
import os
os.environ["WANDB_DISABLED"] = "true"

%load_ext autoreload
%autoreload 2

In [None]:
# Language code -> name of the question-answering corpus used for that language.
dataset_name = {'fr': 'fquad', 'en': 'squad'}
# Directory handed to the preprocessing and trainer helpers in the cells below.
data_path = '/data/desponds/data/Question_answering/'

In [None]:
# Build the question-answering data for the corpora listed in `dataset_name`.
# Both results are indexed as [language][split] by the cells further down
# (e.g. tokenized['en']['validation'], datasets['fr']["validation"]).
from preprocessing import preprocessing_question_answering
datasets, tokenized = preprocessing_question_answering(dataset_name, data_path)

In [None]:
# Create the question-answering trainers from the tokenized data.
# NOTE(review): only 'en' is listed in `langs`, yet the evaluation section also
# reads trainers['fr'] — confirm whether 'fr' has to be requested here as well.
from training import get_trainers_question_answering
trainers = get_trainers_question_answering(data_path, tokenized, langs = ['en'])

In [None]:
# Derive the per-language models from the trainers. Later cells call
# models[lang].predict(...) on the result.
from training import get_models_question_answering
models = get_models_question_answering(trainers)

## Translation

In [None]:
def translate_fr_en_qa(example):
    """Translate one French question-answering example to English, in place.

    The context, the question and the first answer text are each passed
    through `translate_fr_en`. Only the first answer is kept in
    `example['answers']['text']`.

    When the translated answer occurs verbatim in the translated context,
    `answer_start` is replaced by that position; otherwise the original
    `answer_start` list is left untouched.

    Args:
        example: a single example with 'context', 'question' and 'answers'
            (a mapping with 'text' and 'answer_start' lists).

    Returns:
        The same `example` object, mutated.
    """
    for field in ('context', 'question'):
        example[field] = translate_fr_en(example[field])

    answers = example['answers']
    translated_answer = translate_fr_en(answers['text'][0])
    answers['text'] = [translated_answer]

    # Re-anchor the answer only if the translation can be located in the text.
    position = example['context'].find(translated_answer)
    if position != -1:
        answers['answer_start'] = [position]
    return example

# Translate the validation split of the French dataset, one example at a time.
# `translate_fr_en_qa` indexes example['answers']['text'], which is only valid
# for a single example: with batched=True the function receives a batch in
# which 'answers' is a list of dicts, and that lookup raises a TypeError.
# The split is read from `datasets['fr']["validation"]`, as in the evaluation
# cells; the name `dataset_fr` is never defined in this notebook.
translated_fr_en = datasets['fr']['validation'].map(translate_fr_en_qa)

# Cache the translated split on disk so it can be reloaded by the next cell.
with open('/data/desponds/data/Question_answering/translated_dataset.pickle', 'wb') as handle:
    pickle.dump(translated_fr_en, handle)

In [None]:
import pickle
# Reload the translated French validation split cached by the translation cell.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('/data/desponds/data/Question_answering/translated_dataset.pickle', 'rb') as handle:
    translated_fr_en = pickle.load(handle)

In [None]:
# Recompute the tokens of the translated version, using the 'en' setting of
# the validation preprocessing.
# NOTE(review): the columns to drop are taken from the English training split;
# this assumes the translated French split has the same column names — confirm.
from preprocessing import preprocess_validation_examples_QA
tokenized_translated_fr_en = translated_fr_en.map(lambda examples : preprocess_validation_examples_QA(examples, 'en'), 
                                                batched=True, remove_columns=datasets['en']["train"].column_names)

## Evaluation 

### Dataset FR on Camembert

In [None]:
# On Hugging Face they manage to get {"f1": 88.3, "exact_match": 78.0}.
# The French validation features are read from `tokenized`, as the English
# evaluation cell does; `tokenized_fr` is not defined at this point of the
# notebook (the name is only bound later, to paraphrase data).
# NOTE(review): `trainers` is created with langs = ['en'] above — confirm that
# a 'fr' trainer is actually available before running this cell.
predictions_fr, _, _ = trainers['fr'].predict(tokenized['fr']['validation'])


In [None]:
# `compute_metrics_QA` is otherwise only imported by a later cell, so this cell
# would fail with a NameError when the notebook is run from top to bottom.
from training import compute_metrics_QA

# The prediction is a (start_logits, end_logits) pair.
start_logits_fr, end_logits_fr = predictions_fr
# Features and raw examples both come from the French validation split, using
# the same `tokenized` / `datasets` containers as the English evaluation
# (`tokenized_fr` and `dataset_fr` are not defined at this point).
metric_fr, predicted_answers_fr, theoretical_answers_fr = compute_metrics_QA(
    start_logits_fr,
    end_logits_fr,
    tokenized['fr']['validation'],
    datasets['fr']['validation'],
)
metric_fr

### Dataset EN on RoBERTa

In [None]:
# Evaluate the English model on the English validation split.
from training import compute_metrics_QA
predictions_en, _, _ = models['en'].predict(tokenized['en']['validation'])
# The prediction is a (start_logits, end_logits) pair.
start_logits_en, end_logits_en = predictions_en
metric_en, predicted_answers_en, theoretical_answers_en = compute_metrics_QA(start_logits_en, end_logits_en, tokenized['en']['validation'], datasets['en']["validation"])
# Display the metric as the cell output.
metric_en

### Dataset FR on RoBERTa

In [None]:
# Run the English model on the French validation split translated to English.
predictions_fr_en, _, _ = models['en'].predict(tokenized_translated_fr_en)
start_logits_fr_en, end_logits_fr_en = predictions_fr_en
# NOTE(review): need_translation / base_answers presumably make the metric
# translate the predictions back and score them against the original French
# answers — confirm in training.compute_metrics_QA.
metric_fr_en, predicted_answers_fr_en, theoretical_answers_fr_en = compute_metrics_QA(start_logits_fr_en, end_logits_fr_en, 
                tokenized_translated_fr_en, translated_fr_en,
               need_translation = True, base_answers = datasets['fr']["validation"]
               )
# Display the metric as the cell output.
metric_fr_en

## Analysing results

In [None]:
import pandas as pd
# `strip_accents_and_lower` is otherwise only imported by a later cell.
from helper import strip_accents_and_lower

# Comparison table: one row per entry of the French evaluation outputs, with
# the reference answer and the answers predicted by both pipelines.
results_qa = pd.DataFrame()
# The reference column must be called `theoretical_answers_fr`: the exact-match
# columns below and the later Levenshtein / paraphrase / BERTScore cells all
# read it under that name (it was created as `theoretical_answers`, which made
# the lookups below raise a KeyError).
results_qa['theoretical_answers_fr'] = [th['answers']['text'][0] for th in theoretical_answers_fr]
results_qa['predicted_answers_fr'] = [th['prediction_text'] for th in predicted_answers_fr]
results_qa['predicted_answers_fr_en'] = [th['prediction_text'] for th in predicted_answers_fr_en]
results_qa['predicted_answers_fr_logit'] = [th['prediction_logit'] for th in predicted_answers_fr]
results_qa['predicted_answers_fr_en_logit'] = [th['prediction_logit'] for th in predicted_answers_fr_en]
# Context length in whitespace-separated words, and the question text, both
# taken from the French validation split (`dataset_fr` is never defined in
# this notebook; the other cells use datasets['fr']["validation"]).
results_qa['len_context'] = [len(th['context'].split()) for th in datasets['fr']['validation']]
results_qa['question'] = [th['question'] for th in datasets['fr']['validation']]
# 1 when the prediction equals the reference string exactly, 0 otherwise.
results_qa['exact_match_fr'] = (
    results_qa['theoretical_answers_fr'] == results_qa['predicted_answers_fr']
).astype(int)
results_qa['exact_match_fr_en'] = (
    results_qa['theoretical_answers_fr'] == results_qa['predicted_answers_fr_en']
).astype(int)
# Same comparison after passing both strings through strip_accents_and_lower.
results_qa['exact_match_fr_en_no_accent_lower'] = (
    results_qa['theoretical_answers_fr'].map(strip_accents_and_lower)
    == results_qa['predicted_answers_fr_en'].map(strip_accents_and_lower)
).astype(int)
results_qa

## Use Levenshtein Distance

In [None]:
import pickle
# Reload the comparison table from disk, replacing any in-memory `results_qa`.
# NOTE(review): this file is written by a cell further down in this section, so
# on a first top-to-bottom run it may not exist yet — confirm the intended order.
with open('/data/desponds/data/Question_answering/comparing_fr_answers.pickle', 'rb') as handle:
    results_qa = pickle.load(handle)

In [None]:
import Levenshtein
from helper import strip_accents_and_lower
def _answer_metric(metric, predicted_col, normalise=None):
    """Apply `metric(reference, prediction)` to every row of `results_qa`.

    The reference is always the `theoretical_answers_fr` column and the
    prediction is read from `predicted_col`. When `normalise` is given, it is
    applied to both strings before they are compared.
    """
    def _score(row):
        reference, predicted = row['theoretical_answers_fr'], row[predicted_col]
        if normalise is not None:
            reference, predicted = normalise(reference), normalise(predicted)
        return metric(reference, predicted)

    return results_qa.apply(_score, axis=1)


# One row per row of `results_qa`; columns hold Levenshtein ratios and
# distances between the reference answer and each predicted answer.
lev_dist = pd.DataFrame()
lev_dist['lev_dist_fr_ratio'] = _answer_metric(Levenshtein.ratio, 'predicted_answers_fr')
lev_dist['lev_dist_fr_en_ratio'] = _answer_metric(Levenshtein.ratio, 'predicted_answers_fr_en')
lev_dist['lev_dist_fr'] = _answer_metric(Levenshtein.distance, 'predicted_answers_fr')
lev_dist['lev_dist_fr_en'] = _answer_metric(Levenshtein.distance, 'predicted_answers_fr_en')
# Same measures for the translated pipeline after strip_accents_and_lower.
lev_dist['lev_dist_fr_en_no_accent_lower'] = _answer_metric(
    Levenshtein.distance, 'predicted_answers_fr_en', normalise=strip_accents_and_lower)
lev_dist['lev_dist_fr_en_ratio_no_accent_lower'] = _answer_metric(
    Levenshtein.ratio, 'predicted_answers_fr_en', normalise=strip_accents_and_lower)

In [None]:
# Show the first rows of the Levenshtein table as the cell output.
lev_dist.head()

In [None]:
import matplotlib.pyplot as plt
# Overlay two histograms of the Levenshtein distance between the reference
# and the answer predicted through translation: raw strings vs. strings
# normalised with strip_accents_and_lower.
fig, axs = plt.subplots(1,1, figsize = (9,5), sharex=True, sharey=True)
# Rows with distance 0 (exact matches) are left out of each series, and only
# rows whose raw distance is below 100 are kept.
lev_dist[(lev_dist['lev_dist_fr_en'] !=0)&(lev_dist['lev_dist_fr_en'] <100) ]['lev_dist_fr_en'].hist(alpha = 0.6, bins = 30, ax = axs, label = "No postprocessing")
# NOTE(review): the < 100 cap is applied to the raw distance column for this
# series too, not to the normalised one — confirm this is intended.
# Label spelling fixed ("Postprrocessing") to match the ratio plot's legend.
lev_dist[(lev_dist['lev_dist_fr_en_no_accent_lower'] !=0)&(lev_dist['lev_dist_fr_en'] <100) ]['lev_dist_fr_en_no_accent_lower'].hist(alpha = 0.6, bins = 30, ax = axs, label = "Postprocessing : no accent, lower")
# Title spelling fixed ("caped" -> "capped").
fig.suptitle('Levenshtein distance (capped to 100) between theoretical and predicted')
fig.supxlabel('Levenshtein distance')
# fig.supylabel('Count')
axs.legend(loc = 'upper right')
axs.set_title('Impact of postprocessing')

In [None]:
def diff_exact_match(value_fr, value_fr_en):
    """Select the rows of `results_qa` with the given exact-match flags.

    Args:
        value_fr: required value of the `exact_match_fr` column.
        value_fr_en: required value of the `exact_match_fr_en` column.

    Returns:
        The matching subset of `results_qa`. The number of selected rows is
        printed as a side effect.
    """
    wanted = (results_qa['exact_match_fr'] == value_fr) & (results_qa['exact_match_fr_en'] == value_fr_en)
    selection = results_qa[wanted]
    print(len(selection))
    return selection
# Rows where exact_match_fr is 0 but exact_match_fr_en is 1, i.e. the direct
# French prediction misses the reference and the translated one hits it.
diff = diff_exact_match(value_fr = 0, value_fr_en = 1)
diff.head(5)

In [None]:
import matplotlib.pyplot as plt
# Overlay two histograms of the Levenshtein ratio between the reference and
# the answer predicted through translation: raw vs. normalised strings.
fig, axs = plt.subplots(1,1, figsize = (9,5), sharex=True, sharey=True)
# Rows with ratio 1 are left out of each series.
lev_dist[lev_dist['lev_dist_fr_en_ratio'] !=1]['lev_dist_fr_en_ratio'].hist(alpha = 0.4, bins = 30, ax = axs, label = "No postprocessing")
lev_dist[lev_dist['lev_dist_fr_en_ratio_no_accent_lower'] !=1]['lev_dist_fr_en_ratio_no_accent_lower'].hist(alpha = 0.4, bins = 30, ax = axs, label = "Postprocessing : no accent, lower")
fig.suptitle('Levenshtein ratio between theoretical and predicted')
fig.supxlabel('Levenshtein ratio')
axs.legend(loc = 'upper left')
axs.set_title('Impact of postprocessing')

In [None]:
import pickle 
# Persist the comparison table; this is the file reloaded at the start of the
# Levenshtein section.
with open('/data/desponds/data/Question_answering/comparing_fr_answers.pickle', 'wb') as handle:
    pickle.dump(results_qa, handle)

In [None]:
# Display the full comparison table as the cell output.
results_qa

### Metrics accepting a certain Levenshtein distance with the English dataset

In [None]:
# Recompute the English metrics with accept_levenstein = 10.
# NOTE(review): presumably a prediction then counts as correct when it is
# within that Levenshtein distance of the reference — confirm in
# training.compute_metrics_QA.
metric_en_lev_10, predicted_answers_en_lev_10, theoretical_answers_en_lev_10 = compute_metrics_QA(
    start_logits_en, 
    end_logits_en, 
    tokenized['en']['validation'], 
    datasets['en']["validation"], 
    accept_levenstein = 10)
metric_en_lev_10

### Metrics accepting a certain Levenshtein distance with the French translated dataset

In [None]:
# Metrics of the translated French pipeline with accept_levenstein = 1,
# scored against the original French validation answers (base_answers).
metric_fr_en_lev_1, predicted_answers_fr_en_lev_1, theoretical_answers_fr_en_lev_1 = compute_metrics_QA(
    start_logits_fr_en, end_logits_fr_en, 
    tokenized_translated_fr_en, translated_fr_en,
    need_translation = True, base_answers = datasets['fr']["validation"], 
    accept_levenstein = 1)
metric_fr_en_lev_1

In [None]:
# Same evaluation of the translated French pipeline, with accept_levenstein = 3.
metric_fr_en_lev_3, predicted_answers_fr_en_lev_3, theoretical_answers_fr_en_lev_3 = compute_metrics_QA(
    start_logits_fr_en, 
    end_logits_fr_en, 
    tokenized_translated_fr_en, 
    translated_fr_en,
    need_translation = True, 
    base_answers = datasets['fr']["validation"], 
    accept_levenstein = 3
               )
metric_fr_en_lev_3

# Use Paraphrasing 

In [None]:
# Get the paraphrase models.
# NOTE: this cell rebinds `datasets`, `trainers` and `models`, so the
# question-answering objects of the same names are no longer reachable after it.
from training import get_trainers_paraphrasing, get_models_paraphrasing
from preprocessing import preprocessing_paraphrasing
datasets = {}
datasets['fr'] = preprocessing_paraphrasing('fr')
trainers = get_trainers_paraphrasing('/data/desponds/data/Paraphrase/', datasets, langs = ['fr'])
models = get_models_paraphrasing(trainers)

In [None]:
from preprocessing import preprocessing_paraphrasing, tokenize_paraphrasing
from datasets import Dataset

# Get the data that we want to us on the paraphrase models 
def get_df_paraphrasing_qa(lang, with_questions):
    """Build the sentence-pair dataset fed to the paraphrase model.

    `sentence1` is the reference answer (`theoretical_answers_fr`) and
    `sentence2` the answer stored in `predicted_answers_{lang}`, both read
    from `results_qa`.

    Args:
        lang: suffix of the prediction column to use (e.g. 'fr' or 'fr_en').
        with_questions: when true, the question followed by a single space is
            prepended to both sentences.

    Returns:
        The result of `Dataset.from_pandas` on the two-column frame.
    """
    reference = results_qa['theoretical_answers_fr']
    predicted = results_qa[f'predicted_answers_{lang}']
    if with_questions:
        prefix = results_qa['question'] + " "
        reference = prefix + reference
        predicted = prefix + predicted

    para = pd.DataFrame()
    para['sentence1'] = reference
    para['sentence2'] = predicted
    return Dataset.from_pandas(para)

# Sentence pairs for the answers predicted directly in French (CamemBERT),
# without and with the question prepended.
para_fr = get_df_paraphrasing_qa('fr', with_questions = False)
para_fr_q = get_df_paraphrasing_qa('fr', with_questions = True)
# Sentence pairs for the answers predicted through translation (RoBERTa).
para_fr_en = get_df_paraphrasing_qa('fr_en', with_questions = False)
para_fr_en_q = get_df_paraphrasing_qa('fr_en', with_questions = True)

# Tokenize all four datasets with the CamemBERT tokenizer; the variants that
# include the question use MAX_LENGTH = 160 instead of 80.
# NOTE(review): `AutoTokenizer` is not imported anywhere in the visible
# notebook — presumably `from transformers import AutoTokenizer`; confirm.
tokenizer = AutoTokenizer.from_pretrained('camembert-base')
tokenized_fr = para_fr.map(lambda example : tokenize_paraphrasing(example, 'fr', tokenizer, with_label = False, MAX_LENGTH = 80 , truncation = 'longest_first'))
tokenized_fr_en = para_fr_en.map(lambda example : tokenize_paraphrasing(example, 'fr', tokenizer, with_label = False, MAX_LENGTH = 80, truncation = 'longest_first' ))
tokenized_fr_q = para_fr_q.map(lambda example : tokenize_paraphrasing(example, 'fr', tokenizer, with_label = False, MAX_LENGTH = 160 , truncation = 'longest_first'))
tokenized_fr_en_q = para_fr_en_q.map(lambda example : tokenize_paraphrasing(example, 'fr', tokenizer, with_label = False, MAX_LENGTH = 160, truncation = 'longest_first' ))

In [None]:
# Use the French paraphrase model to get the logits for the four datasets.
# NOTE: `predictions_fr` and `predictions_fr_en` rebind the names used earlier
# for the question-answering predictions.
predictions_fr = models['fr'].predict(tokenized_fr)
predictions_fr_en = models['fr'].predict(tokenized_fr_en)
predictions_fr_q = models['fr'].predict(tokenized_fr_q)
predictions_fr_en_q = models['fr'].predict(tokenized_fr_en_q)

# Use the logits to get the labels: index of the largest logit in each row.
labels_fr = predictions_fr.predictions.argmax(axis =1)
labels_fr_en = predictions_fr_en.predictions.argmax(axis =1)
labels_fr_q = predictions_fr_q.predictions.argmax(axis =1)
labels_fr_en_q = predictions_fr_en_q.predictions.argmax(axis =1)

# Add the labels to the results table, one column per dataset variant.
results_qa['paraphrase_fr'] = labels_fr
results_qa['paraphrase_fr_en'] = labels_fr_en
results_qa['paraphrase_fr_q'] = labels_fr_q
results_qa['paraphrase_fr_en_q'] = labels_fr_en_q

In [None]:
# Display the questions, reference and predicted answers next to the
# paraphrase labels obtained without the question.
results_qa[['question', 'theoretical_answers_fr', 'predicted_answers_fr', 'predicted_answers_fr_en', 'paraphrase_fr', 'paraphrase_fr_en']]

In [None]:
# Mean of each paraphrase label column.
results_qa[['paraphrase_fr', 'paraphrase_fr_en', 'paraphrase_fr_q', 'paraphrase_fr_en_q']].mean()

## Using BERTscore

In [None]:
from evaluate import load
# Load the BERTScore metric and try it on identical prediction/reference pairs.
bertscore = load("bertscore")
predictions = ["hello there", "general kenobi"]
references = ["hello there", "general kenobi"]
results = bertscore.compute(predictions=predictions, references=references, lang="fr")
results

In [None]:
# BERTScore of both French pipelines against the reference answers, computed
# once as-is and once with rescale_with_baseline = True (the `_wb` variables).
results_fr = bertscore.compute(predictions=results_qa['predicted_answers_fr'], 
                            references=results_qa['theoretical_answers_fr'], lang="fr")
results_fr_en = bertscore.compute(predictions=results_qa['predicted_answers_fr_en'], 
                            references=results_qa['theoretical_answers_fr'], lang="fr")
results_fr_wb = bertscore.compute(predictions=results_qa['predicted_answers_fr'], 
                            references=results_qa['theoretical_answers_fr'], lang="fr", rescale_with_baseline = True)
results_fr_en_wb = bertscore.compute(predictions=results_qa['predicted_answers_fr_en'], 
                            references=results_qa['theoretical_answers_fr'], lang="fr", rescale_with_baseline = True)
# Only the non-rescaled F1 values are stored in the results table.
results_qa['BERTscore_f1_fr'] = results_fr['f1']
results_qa['BERTscore_f1_fr_en'] = results_fr_en['f1']

In [None]:
import numpy as np
# Mean F1, in order: fr, fr_en, fr rescaled with baseline, fr_en rescaled with baseline.
np.mean(results_fr['f1']), np.mean(results_fr_en['f1']),np.mean(results_fr_wb['f1']), np.mean(results_fr_en_wb['f1'])

In [None]:
# Predicted answer texts and first reference answer texts for the English evaluation.
pa_en = [p['prediction_text'] for p in predicted_answers_en]
ta_en = [p['answers']['text'][0] for p in theoretical_answers_en]

In [None]:
# BERTScore of the English predictions, as-is and with rescale_with_baseline = True.
results_en = bertscore.compute(predictions=pa_en, 
                            references=ta_en, lang="en")
results_en_wb = bertscore.compute(predictions=pa_en, 
                            references=ta_en, lang="en", rescale_with_baseline = True)
# Mean F1 of both variants as the cell output.
np.mean(results_en['f1']), np.mean(results_en_wb['f1'])

In [None]:
# Display the questions, reference and predicted answers next to their BERTScore F1.
results_qa[['question', 'theoretical_answers_fr', 'predicted_answers_fr', 'predicted_answers_fr_en', 'BERTscore_f1_fr', 'BERTscore_f1_fr_en']]

## Results

In [None]:
import pandas as pd
# Hand-entered summary of the question-answering experiments: one entry per
# (model, test dataset) combination, columns in display order.
data = dict(
    task=['Question_answering'] * 3,
    model=['CamemBERT', 'Roberta', 'Roberta'],
    train_dataset=['fquad', 'squad', 'squad'],
    nb_sample_train=[20731, 87599, 87599],
    test_dataset=['fquad', 'squad', 'fquad_translated'],
    translated=['no', 'no', 'yes'],
    f1_score=[73.368852, 92.134176, 56.55880],
    exact_match=[45.388958, 85.761589, 29.76787],
    BERTscore=[0.8984004, 0.964069, 0.8525224],
    # Without translating back to French:
    # em : 41.060225846925974, f1 : 65.32442339320636
)
results = pd.DataFrame(data)
results