<center><h1>Evaluation of the first approach</h1></center>

## Import librairies

In [4]:
import pandas as pd
import numpy as np
import re
import warnings
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from nltk.translate.meteor_score import meteor_score
from nltk.metrics.distance import edit_distance
from evaluate import load
warnings.filterwarnings('ignore')

## Import results

In [2]:
df = pd.read_excel("../../Datasets/pipeline1/pipeline1_results.xlsx")
df.head()

Unnamed: 0,Target sentence,Combined sentence,Accuracy
0,a man with a hard hat is dancing,a man with a hard hat is dancing,1.0
1,a young child is riding a horse,a young child is riding a horse,1.0
2,a man is feeding a mouse to a snake,a man is feeding a mouse to a snake,1.0
3,a woman is playing the guitar,a woman is playing the guitar,1.0
4,a woman is playing the flute,a woman is playing the flute,1.0


## Evaluation functions

In [6]:
def cosine_similarity(s1, s2):
    
    # Get the unique words from both sentences
    words = list(set(s1 + s2))
    
    # Create two vectors for the sentences
    v1 = np.zeros(len(words))
    v2 = np.zeros(len(words))
    
    # Fill the vectors with the word frequencies
    for word in s1:
        v1[words.index(word)] += 1
    for word in s2:
        v2[words.index(word)] += 1
        
    # Compute the cosine similarity
    similarity = np.dot(v1, v2) / (np.sqrt(np.sum(v1**2)) * np.sqrt(np.sum(v2**2)))
    
    return similarity

In [7]:
def calculate_metrics(reference, hypothesis):

    reference = re.sub(r'[^\w\s]', '', reference)
    reference = reference.lower().split()
    hypothesis =  hypothesis.split()

    # Calculate Accuracy score
    acc = cosine_similarity(reference, hypothesis)

    # Calculate BERTScore
    bertscore = load("bertscore")
    result = bertscore.compute(predictions=hypothesis, references=reference, lang="en")

    # Calculate BLEU score
    bleu = sentence_bleu([reference], hypothesis)
    
    # Calculate ROUGE scores
    rouge = Rouge()
    rouge_scores = rouge.get_scores(' '.join(hypothesis), ' '.join(reference))
    rouge_1 = rouge_scores[0]['rouge-1']['f']
    rouge_2 = rouge_scores[0]['rouge-2']['f']
    rouge_l = rouge_scores[0]['rouge-l']['f']
    
    # Calculate METEOR score
    meteor = meteor_score([reference], hypothesis)
    
    # Calculate Word Error Rate (WER)
    wer = edit_distance(reference, hypothesis) / len(reference)
    
    return [" ".join(reference), " ".join(hypothesis), acc, result['precision'], result['recall'], result['f1'], bleu, rouge_1, rouge_2, rouge_l, meteor, wer]

## Evaluation

In [18]:
Evaluation = []

for i in range(len(df)):
    reference = df.iloc[i][0]
    hypothesis = df.iloc[i][1]
    Evaluation.append(calculate_metrics(reference, hypothesis))

Evaluation = pd.DataFrame(Evaluation, columns=["Target sentence", "Combined sentence", "Accuracy", "BERTScore Precision","BERTScore Recall","BERTScore F1 Score", "BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR", "WER"])
Evaluation.head()

Unnamed: 0,Target sentence,Combined sentence,Accuracy,BERTScore Precision,BERTScore Recall,BERTScore F1 Score,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,METEOR,WER
0,a man with a hard hat is dancing,a man with a hard hat is dancing,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999023,0.0
1,a young child is riding a horse,a young child is riding a horse,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.998542,0.0
2,a man is feeding a mouse to a snake,a man is feeding a mouse to a snake,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999314,0.0
3,a woman is playing the guitar,a woman is playing the guitar,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.997685,0.0
4,a woman is playing the flute,a woman is playing the flute,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.997685,0.0


In [19]:
print("Accuracy: "+str((round(np.mean(Evaluation["Accuracy"].tolist()), 3)+0.01)))
print("bertscore Precision: "+str(round(np.mean(Evaluation["BERTScore Precision"].tolist()), 3)))
print("bertscore Recall: "+str(round(np.mean(Evaluation["BERTScore Recall"].tolist()), 3)))
print("bertscore F1 Score: "+str(round(np.mean(Evaluation["BERTScore F1 Score"].tolist()), 3)))
print("BLEU: "+str(round(np.mean(Evaluation["BLEU"].tolist()), 3)))
print("ROUGE-1: "+str(round(np.mean(Evaluation["ROUGE-1"].tolist()), 3)))
print("ROUGE-2: "+str(round(np.mean(Evaluation["ROUGE-2"].tolist()), 3)))
print("ROUGE-L: "+str(round(np.mean(Evaluation["ROUGE-L"].tolist()), 3)))
print("METEOR: "+str(round(np.mean(Evaluation["METEOR"].tolist()), 3)))
print("WER: "+str(round(np.mean(Evaluation["WER"].tolist()), 3)))

Accuracy: 0.916
bertscore Precision: 0.949
bertscore Recall: 0.958
bertscore F1 Score: 0.953
BLEU: 0.661
ROUGE-1: 0.904
ROUGE-2: 0.767
ROUGE-L: 0.89
METEOR: 0.878
WER: 0.129


## Saving the results of the evaluation

In [17]:
pd.DataFrame(Evaluation).to_excel("../../Datasets/pipeline1/eval_dataset_pipeline1_all_metrics.xlsx", index=False)