In [2]:
!pip install git+https://github.com/Maluuba/nlg-eval.git@master
!nlg-eval --setup

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/Maluuba/nlg-eval.git@master
  Cloning https://github.com/Maluuba/nlg-eval.git (to revision master) to /tmp/pip-req-build-3hr9m1d_
  Running command git clone --filter=blob:none --quiet https://github.com/Maluuba/nlg-eval.git /tmp/pip-req-build-3hr9m1d_
  Resolved https://github.com/Maluuba/nlg-eval.git to commit 7f7993035a2f4729a15d20040fd904933ea58767
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gensim~=3.8.3
  Downloading gensim-3.8.3.tar.gz (23.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting Theano>=0.8.1
  Downloading Theano-1.0.5.tar.gz (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m99.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metad

In [13]:
import sys
import string
import re
import json
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
from nlgeval.pycocoevalcap.meteor.meteor import Meteor
from nlgeval.pycocoevalcap.rouge.rouge import Rouge
from google.colab import drive
drive.mount('/content/gdrive')

# Run locally
#DATA_PATH = ''

# KW
DATA_PATH = '/content/gdrive/MyDrive/TweetQA/'

# FZ
#DATA_PATH = 'gdrive/MyDrive/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Read predictions

In [14]:
ground_truth_squad = json.load(open(DATA_PATH + 'data/dev_squad_format.json'))
ground_truth_squad = {item['qid']:item['Answer'] for item in ground_truth_squad}

ground_truth = json.load(open(DATA_PATH + 'data/dev.json'))
ground_truth = {item['qid']:item['Answer'] for item in ground_truth}

roberta_predictions = json.load(open(DATA_PATH + 'roberta_predictions.json'))
roberta_predictions = {item['qid']:item['Answer'] for item in roberta_predictions}

deberta_predictions = json.load(open(DATA_PATH + 'deberta_predictions.json'))
deberta_predictions = {item['qid']:item['Answer'] for item in deberta_predictions}

bert_predictions = json.load(open(DATA_PATH + 'bert_predictions.json'))
bert_predictions = {item['qid']:item['Answer'] for item in bert_predictions}

byt5_predictions = json.load(open(DATA_PATH + 'byt5_predictions.json'))
byt5_predictions = {item['qid']:item['Answer'] for item in byt5_predictions}

## Helper function

In [15]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

## Evaluate answer from every model

In [18]:
meteor_scorer = Meteor()
rouge_scorer = Rouge()

def ans_score(ans, gold_list):
    ans = normalize_answer(ans)
    gold_list = [normalize_answer(ref) for ref in gold_list]
    bleu = sentence_bleu([_.split() for _ in gold_list], ans.split(), weights=(1,0,0,0))
    return bleu

In [26]:
final_predictions = {'qid':[], 'Answer':[]}

for id_ in ground_truth.keys():
    gold_lst = ground_truth[id_]

    # Try to find answer in roberta predictions then evaluate it
    try:
        if isinstance(roberta_predictions[id_], list):
            roberta_pred = roberta_predictions[id_][0]
        else:
            roberta_pred = roberta_predictions[id_]
        roberta_score = ans_score(roberta_pred, gold_lst)
    except KeyError:
        roberta_score = -1

    # Try to find answer in deberta predictions then evaluate it
    try:
        if isinstance(deberta_predictions[id_], list):
            deberta_pred = deberta_predictions[id_][0]
        else:
            deberta_pred = deberta_predictions[id_]
        deberta_score = ans_score(deberta_pred, gold_lst)
    except KeyError:
        deberta_score = -1

    # Try to find answer in bert predictions then evaluate it
    try:
        if isinstance(bert_predictions[id_], list):
            bert_pred = bert_predictions[id_][0]
        else:
            bert_pred = bert_predictions[id_]
        bert_score = ans_score(bert_pred, gold_lst)
    except KeyError:
        bert_score = -1

    # Evaluate byt5 answer
    if isinstance(byt5_predictions[id_], list):
        byt5_pred = byt5_predictions[id_][0]
    else:
        byt5_pred = byt5_predictions[id_]
    byt5_score = ans_score(byt5_pred, gold_lst)

    bleus = [roberta_score, deberta_score, bert_score, byt5_score]
    final_predictions['qid'].append(id_)
    if bleus.index(max(bleus)) == 0 and roberta_pred != '':
        final_predictions['Answer'].append(roberta_pred)
    elif bleus.index(max(bleus)) == 1 and deberta_pred != '':
        final_predictions['Answer'].append(deberta_pred)
    elif bleus.index(max(bleus)) == 2 and bert_pred != '':
        final_predictions['Answer'].append(bert_pred)
    else:
        final_predictions['Answer'].append(byt5_pred)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [27]:
predictions = pd.DataFrame(final_predictions)
predictions.head(10)

Unnamed: 0,qid,Answer
0,f867d1c3361549952be5639ca433895f,w nj
1,3839efe660c379985f7f02edab32f1e1,the forecast
2,f1c214d65d1cd73b457c1dba09f90013,water
3,c1b8f64f3e38a33b80a8c575f1ce902c,#endangeredrivers
4,c5e6eb7f974409ac4561fd1c2ba0fb58,wiggins
5,2a926fe0bbebd143a1cbc4d942bb3fcb,"106-106, 8.9"
6,8afe79edb832cb15ffc4fd88f27fc1d4,106-106-8-9
7,26ba00ae731854015da139515ed0d033,kemba's
8,8572eeabb35b1d522215166b696dc63b,see the sky sees in
9,ec4c0a864d610cde36d2c18d594eb9f3,the star is reactio


In [28]:
predictions.to_json(DATA_PATH + 'final_predictions.json', orient='records')