## SQuAD evaluation metric

In [22]:
from collections import Counter
import re
import string

In [10]:
def normalize_answer(s):
    """Return a canonical form of an answer string for comparison.

    The steps are applied in this order: lowercase the text, drop every
    character found in ``string.punctuation``, replace the standalone words
    "a", "an" and "the" with a space, then collapse all whitespace runs into
    single spaces (which also trims both ends).
    """
    lowered = s.lower()

    # Strip ASCII punctuation character by character.
    punctuation = set(string.punctuation)
    without_punct = ''.join(filter(lambda ch: ch not in punctuation, lowered))

    # Articles are matched on word boundaries only, so "theatre" is untouched.
    article_pattern = re.compile(r'\b(a|an|the)\b', re.UNICODE)
    without_articles = article_pattern.sub(' ', without_punct)

    # split() with no argument discards leading/trailing/repeated whitespace.
    return ' '.join(without_articles.split())

In [16]:
def get_tokens(s):
    """Split an answer into normalized whitespace-delimited tokens.

    Any falsy input (empty string, None, ...) yields an empty list without
    being normalized; everything else goes through ``normalize_answer`` first.
    """
    return normalize_answer(s).split() if s else []

In [17]:
def compute_em(a_gold, a_pred):
    """Exact-match score: 1 if both answers are equal after normalization, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return 1 if gold_norm == pred_norm else 0

In [18]:
def compute_f1(a_gold, a_pred):
    """Token-overlap F1 between a gold answer and a predicted answer.

    Both strings are tokenized with ``get_tokens``. Returns the harmonic mean
    of token precision and recall as a float, or an int (0 or 1) in the
    degenerate cases described in the comments below.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)

    # When either side has no tokens, the score is 1 only if both are empty.
    if not gold_toks or not pred_toks:
        return int(gold_toks == pred_toks)

    # Multiset intersection: each shared token counts min(gold, pred) times.
    overlap = Counter(gold_toks) & Counter(pred_toks)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)

In [102]:
# Example strings for exercising the metrics in this notebook: `hum1` is used
# as the reference (gold/target) text, and `sys1`..`sys3` are used as candidate
# predictions to be scored against it.
sys1 = 'the cat was found under the bed'
hum1 = 'the cat was under the bed, ey mon tabarnque il se passe quoi'
sys2 = 'the tiny little cat was found under the big funny bed'
sys3 = 'oh hello Mr the cat, I was under the chair'

In [25]:
# Token-level F1 with `hum1` as the gold answer and `sys2` as the prediction.
# NOTE(review): with the example strings as currently defined this should come
# out at roughly 0.4 (4 shared tokens; 11 gold and 9 predicted tokens after
# normalization). A different saved value means the output is stale — re-run.
compute_f1(hum1, sys2)

0.6153846153846153

## ROUGE evaluation

In [109]:
from rouge_score import rouge_scorer
# One shared scorer that reports three ROUGE variants (rouge1, rouge2, rougeL)
# per call, constructed with the library's stemming option switched on.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [110]:
# Score a single pair: `hum1` is passed first (the target position) and `sys2`
# second (the prediction position). The result maps each ROUGE variant name to
# a Score(precision, recall, fmeasure) tuple.
# NOTE(review): `scores` is rebound to a list of such dicts further down.
scores = scorer.score(hum1, sys2)

In [111]:
# Display the ROUGE result for the single (hum1, sys2) pair.
scores

{'rouge1': Score(precision=0.5454545454545454, recall=0.46153846153846156, fmeasure=0.4999999999999999),
 'rouge2': Score(precision=0.2, recall=0.16666666666666666, fmeasure=0.1818181818181818),
 'rougeL': Score(precision=0.5454545454545454, recall=0.46153846153846156, fmeasure=0.4999999999999999)}

In [78]:
# Pull only the F-measure out of the rouge1 entry of the single-pair result.
scores['rouge1'].fmeasure

0.4999999999999999

In [112]:
# Every system output is compared against the same human reference, so the
# target list simply repeats `hum1` once per prediction.
preds = [sys1, sys2, sys3]
targets = [hum1] * len(preds)

In [113]:
import numpy as np

In [114]:
# Score each (target, prediction) pair in order; pairing stops at the shorter
# of the two lists. Note that this rebinds `scores` from a single result dict
# to a list of result dicts.
scores = list(map(scorer.score, targets, preds))

In [115]:
# Display the per-pair ROUGE results (one dict per target/prediction pair).
scores

[{'rouge1': Score(precision=0.8571428571428571, recall=0.46153846153846156, fmeasure=0.6),
  'rouge2': Score(precision=0.6666666666666666, recall=0.3333333333333333, fmeasure=0.4444444444444444),
  'rougeL': Score(precision=0.8571428571428571, recall=0.46153846153846156, fmeasure=0.6)},
 {'rouge1': Score(precision=0.5454545454545454, recall=0.46153846153846156, fmeasure=0.4999999999999999),
  'rouge2': Score(precision=0.2, recall=0.16666666666666666, fmeasure=0.1818181818181818),
  'rougeL': Score(precision=0.5454545454545454, recall=0.46153846153846156, fmeasure=0.4999999999999999)},
 {'rouge1': Score(precision=0.5, recall=0.38461538461538464, fmeasure=0.4347826086956522),
  'rouge2': Score(precision=0.3333333333333333, recall=0.25, fmeasure=0.28571428571428575),
  'rougeL': Score(precision=0.5, recall=0.38461538461538464, fmeasure=0.4347826086956522)}]

In [116]:
# Running F-measure totals, one slot per ROUGE variant being averaged.
f1_totals = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

for score in scores:
    for metric, result in score.items():
        # Only the three tracked variants contribute; other keys are ignored.
        if metric in f1_totals:
            f1_totals[metric] += result.fmeasure

rouge1_f1 = f1_totals['rouge1']
rouge2_f1 = f1_totals['rouge2']
rougeL_f1 = f1_totals['rougeL']

# Mean F-measure per variant across all scored pairs. Like the totals above,
# this relies on `scores` being a non-empty list of per-pair result dicts.
num_scored = len(scores)
eval_dict = {
    'rouge1': rouge1_f1 / num_scored,
    'rouge2': rouge2_f1 / num_scored,
    'rougeL': rougeL_f1 / num_scored,
}

In [117]:
# Display the mean F-measure for each ROUGE variant.
eval_dict

{'rouge1': 0.5115942028985506,
 'rouge2': 0.303992303992304,
 'rougeL': 0.5115942028985506}