<center><h1>Test metrics</h1></center>

In [4]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from nltk.translate.meteor_score import meteor_score
import numpy as np
from nltk.metrics.distance import edit_distance

def calculate_metrics(reference, hypothesis,
                      bleu_weights=(0.25, 0.25, 0.25, 0.25),
                      smoothing_function=None):
    """Score one tokenized hypothesis against one tokenized reference.

    Parameters
    ----------
    reference : list of str
        Tokens of the ground-truth sentence. Must contain at least one token.
    hypothesis : list of str
        Tokens of the candidate sentence. May be empty.
    bleu_weights : tuple of float, optional
        N-gram weights forwarded to ``sentence_bleu``. The default is the
        uniform 4-gram weighting; pass e.g. ``(0.5, 0.5)`` to stop at bigrams
        when scoring very short sentences.
    smoothing_function : callable, optional
        Forwarded unchanged to ``sentence_bleu``. ``None`` (the default) means
        unsmoothed BLEU; pass e.g. ``SmoothingFunction().method1`` to keep
        the score from collapsing when a higher-order n-gram has no overlap.

    Returns
    -------
    dict
        Keys ``'bleu'``, ``'rouge-1'``, ``'rouge-2'``, ``'rouge-l'``,
        ``'meteor'`` and ``'wer'``. The ROUGE entries are the F-measures.
        ``'wer'`` is the token-level edit distance divided by the reference
        length, so it can exceed 1.0 when the hypothesis is longer than the
        reference.

    Raises
    ------
    ValueError
        If ``reference`` is empty: WER is normalised by the reference length,
        so no score is defined in that case.
    """
    if len(reference) == 0:
        raise ValueError("reference must contain at least one token")

    if len(hypothesis) == 0:
        # Nothing overlaps and every reference token counts as a deletion, so
        # return the worst-case scores directly rather than forwarding empty
        # input to the scoring libraries (which may not all accept it).
        return {'bleu': 0.0, 'rouge-1': 0.0, 'rouge-2': 0.0, 'rouge-l': 0.0,
                'meteor': 0.0, 'wer': 1.0}

    # BLEU against a single reference.
    bleu = sentence_bleu([reference], hypothesis, weights=bleu_weights,
                         smoothing_function=smoothing_function)

    # The rouge package works on plain strings, so re-join the tokens.
    # get_scores returns one result per (hypothesis, reference) pair.
    rouge_scores = Rouge().get_scores(' '.join(hypothesis), ' '.join(reference))[0]

    # METEOR against a single reference.
    meteor = meteor_score([reference], hypothesis)

    # Word Error Rate: edit distance over token sequences, normalised by the
    # reference length (guaranteed non-zero by the check above).
    wer = edit_distance(reference, hypothesis) / len(reference)

    return {
        'bleu': bleu,
        'rouge-1': rouge_scores['rouge-1']['f'],
        'rouge-2': rouge_scores['rouge-2']['f'],
        'rouge-l': rouge_scores['rouge-l']['f'],
        'meteor': meteor,
        'wer': wer,
    }



In [5]:
# Example usage: the hypothesis is the reference with its last two words dropped.
# Sentences are whitespace-tokenized because calculate_metrics joins its inputs
# with ' '.join(...), i.e. it works on token lists rather than raw strings.
# Note: a 3-token hypothesis contains no 4-grams, so the BLEU value printed
# below is effectively zero even though the other metrics look reasonable.
reference = 'I love you so much'.split()
hypothesis = 'I love you'.split()
scores = calculate_metrics(reference, hypothesis)
print(scores)

{'bleu': 6.2705618118895185e-78, 'rouge-1': 0.7499999953125, 'rouge-2': 0.6666666622222223, 'rouge-l': 0.7499999953125, 'meteor': 0.6134259259259259, 'wer': 0.4}


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [6]:
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# Sentence pair used to compare two ways of avoiding a collapsed BLEU score.
reference = 'the cat is on the mat'.split()
hypothesis = 'there is a cat on the mat'.split()

# Option 1: only weight unigrams and bigrams (equally), so missing
# higher-order n-gram overlaps no longer enter the score.
bleu = sentence_bleu(
    references=[reference],
    hypothesis=hypothesis,
    weights=(0.5, 0.5),
)
print(f'BLEU score with lower maximum n-gram order: {bleu}')

# Option 2: keep the default n-gram weights but apply a smoothing function.
smooth = SmoothingFunction().method1
bleu = sentence_bleu(
    references=[reference],
    hypothesis=hypothesis,
    smoothing_function=smooth,
)
print(f'BLEU score with smoothing function: {bleu}')

BLEU score with lower maximum n-gram order: 0.48795003647426655
BLEU score with smoothing function: 0.18575057999133598


## Useful links to learn more about the metrics and why I chose them:
- [BLEU score](https://towardsdatascience.com/-foundations-of-nlp-explained-bleu-score-and-wer-metrics-1a5ba06d812b#:~:text=Bleu%20Scores%20are%20between%200,rarely%20achieve%20a%20perfect%20match.)
- [ROUGE score](https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499)
- [METEOR score (see section 3 of the article)](https://medium.com/explorations-in-language-and-learning/metrics-for-nlg-evaluation-c89b6a781054)
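- WER (word error rate): the word-level edit distance between the hypothesis and the reference (substitutions + deletions + insertions) divided by the number of words in the reference. Lower is better, and 0 means an exact match.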