## Setting up a simple QA pipeline using Hugging Face Transformers and a pretrained BERT model on SQuAD 2.0

In [19]:
import json

# Load the evaluation records from disk. JSON text is UTF-8, so the encoding
# is named explicitly instead of relying on the platform's locale default
# (which is not UTF-8 on every OS and would garble non-ASCII characters).
with open('data/squad/dev.json', 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [20]:
from transformers import BertTokenizer, BertForQuestionAnswering

# Hub identifier of the checkpoint used for both the QA model and its tokenizer.
modelname = 'deepset/bert-base-cased-squad2'

# Both objects come from the same checkpoint so the vocabulary matches the weights.
model = BertForQuestionAnswering.from_pretrained(modelname)
tokenizer = BertTokenizer.from_pretrained(modelname)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
from transformers import pipeline

## Pipeline
The `pipeline` API makes model inference easy: pass the task name, the model, and the tokenizer to set up the pipeline, then pass a question and a context to run inference. The pipeline returns the answer together with its score and its `start`/`end` positions in the context.

In [22]:
# Wrap the model and tokenizer loaded above in a question-answering pipeline.
qa = pipeline('question-answering', model=model, tokenizer=tokenizer)

In [23]:
# Peek at the first two records to see which fields each one carries.
squad[:2]

[{'question': 'In what country is Normandy located?',
  'answer': 'France',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'},
 {'question': 'When were the Normans in Normandy?',
  'answer': '10th and 11th centuries',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: No

In [24]:
# Smoke-test the pipeline on one hand-copied question/context pair
# (the same text as the first record previewed above).
sample_input = dict(
    question='In what country is Normandy located?',
    context='The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
)
qa(sample_input)

{'score': 0.9995271563529968, 'start': 159, 'end': 166, 'answer': 'France.'}

In [25]:
# Run the first five records through the pipeline and pair each predicted
# answer string with the reference answer stored in the record.
answers = [
    {
        'predicted': qa({'question': pair['question'],
                         'context': pair['context']})['answer'],
        'true': pair['answer'],
    }
    for pair in squad[:5]
]

In [26]:
# Show the predicted/reference pairs collected above.
answers

[{'predicted': 'France.', 'true': 'France'},
 {'predicted': '10th and 11th centuries', 'true': '10th and 11th centuries'},
 {'predicted': '10th and 11th centuries',
  'true': 'in the 10th and 11th centuries'},
 {'predicted': 'Denmark, Iceland and Norway',
  'true': 'Denmark, Iceland and Norway'},
 {'predicted': 'Rollo,', 'true': 'Rollo'}]

# Performance Metrics 
## Exact Match

Exact Match (EM) scores a prediction 1 when it is identical to the true answer and 0 otherwise; the overall score is the mean over all examples.

In [27]:
# Strict exact match on the raw strings: one 0/1 flag per prediction.
# No normalization is applied, so a trailing '.' or ',' counts as a miss.
em = [1 if answer['predicted'] == answer['true'] else 0 for answer in answers]


In [28]:
# Mean of the 0/1 flags, i.e. the fraction of predictions that matched exactly.
sum(em)/len(em)

0.4

In [29]:
import re


def _normalize_answer(text):
    """Canonicalize an answer string for lenient exact-match comparison.

    Lowercases ``text``, removes every character that is not an ASCII
    lowercase letter, a digit or a space, and then collapses runs of spaces
    and trims the ends. The final step matters because deleting punctuation
    can leave stray spaces behind: without it 'Rollo ,' would become
    'rollo ' and fail to match 'rollo'.

    Note that the character class also removes non-ASCII letters.
    """
    stripped = re.sub('[^0-9a-z ]', '', text.lower())
    return ' '.join(stripped.split())


# Exact match after normalization: one 0/1 flag per prediction.
em = [
    int(_normalize_answer(answer['predicted']) == _normalize_answer(answer['true']))
    for answer in answers
]

In [30]:
# Fraction of matches after normalization (em was recomputed in the cell above).
sum(em)/len(em)

0.8

# ROUGE

In [31]:
from rouge import Rouge

In [32]:
# Toy candidate/reference pair for a first look at the ROUGE output format.
model_out, reference = 'hello to the world', 'hello world'

In [33]:
# Scorer instance; reused for every ROUGE computation below.
rouge = Rouge()

In [34]:
# Candidate first, reference second. In the result below, both reference words
# appear in the candidate (recall 1.0) but only 2 of its 4 words match
# (precision 0.5); no bigram is shared, so rouge-2 is 0.
rouge.get_scores(model_out, reference)

[{'rouge-1': {'r': 1.0, 'p': 0.5, 'f': 0.6666666622222223},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 1.0, 'p': 0.5, 'f': 0.6666666622222223}}]

In [35]:
# Redisplay the predicted/reference pairs before scoring them with ROUGE.
answers

[{'predicted': 'France.', 'true': 'France'},
 {'predicted': '10th and 11th centuries', 'true': '10th and 11th centuries'},
 {'predicted': '10th and 11th centuries',
  'true': 'in the 10th and 11th centuries'},
 {'predicted': 'Denmark, Iceland and Norway',
  'true': 'Denmark, Iceland and Norway'},
 {'predicted': 'Rollo,', 'true': 'Rollo'}]

In [36]:
# Split the pairs into two parallel lists (same order as `answers`),
# which is the input shape used for the ROUGE calls below.
model_out, reference = [], []
for entry in answers:
    model_out.append(entry['predicted'])
    reference.append(entry['true'])

model_out

['France.',
 '10th and 11th centuries',
 '10th and 11th centuries',
 'Denmark, Iceland and Norway',
 'Rollo,']

In [38]:
# One score dict per pair, in input order. The raw (un-normalized) strings are
# scored, so the last pair ('Rollo,' vs 'Rollo') comes out as 0 in the output
# below even though it differs only by a trailing comma.
rouge.get_scores(model_out, reference)

[{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 0.6666666666666666, 'p': 1.0, 'f': 0.7999999952000001},
  'rouge-2': {'r': 0.6, 'p': 1.0, 'f': 0.7499999953125},
  'rouge-l': {'r': 0.6666666666666666, 'p': 1.0, 'f': 0.7999999952000001}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}}]

In [39]:
# avg=True returns a single dict with r/p/f averaged over all pairs per metric,
# instead of one dict per pair.
rouge.get_scores(model_out, reference, avg=True)

{'rouge-1': {'r': 0.7333333333333333, 'p': 0.8, 'f': 0.7599999960400001},
 'rouge-2': {'r': 0.52, 'p': 0.6, 'f': 0.5499999970625},
 'rouge-l': {'r': 0.7333333333333333, 'p': 0.8, 'f': 0.7599999960400001}}