The SQuAD (Stanford Question and Answering Dataset) is a hugely popular dataset containing question and answer pairs scraped from Wikipedia, covering topics ranging from Beyonce, to Physics. As one of the most comprehensive Q&A datasets available, it's only natural that we will be making use of it. So let's explore it.

First, we'll need to download the data. There are two JSON files that we are interested in - train and dev, which we can downloaded from http. Here we will be storing the SQuAD data in the ../../data/squad directory, so we must check if this already exists and if not create the directory.

In [1]:
import os

In [15]:
squad_dir = 'squad'

In [16]:
if not os.path.exists(squad_dir):
    os.mkdir(squad_dir)

In [17]:
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
files = ['train-v2.0.json', 'dev-v2.0.json']

In [18]:
import requests

In [21]:
for file in files:
    res = requests.get(url+file)
    with open(os.path.join(squad_dir, file), 'wb') as fp:
        for chunk in res.iter_content(chunk_size=40):
            fp.write(chunk)

In [22]:
import json

with open(os.path.join(squad_dir, files[0]), 'rb') as f:
    squad = json.load(f)

In [25]:
squad['data'][0]['paragraphs'][0]

{'qas': [{'question': 'When did Beyonce start becoming popular?',
   'id': '56be85543aeaaa14008c9063',
   'answers': [{'text': 'in the late 1990s', 'answer_start': 269}],
   'is_impossible': False},
  {'question': 'What areas did Beyonce compete in when she was growing up?',
   'id': '56be85543aeaaa14008c9065',
   'answers': [{'text': 'singing and dancing', 'answer_start': 207}],
   'is_impossible': False},
  {'question': "When did Beyonce leave Destiny's Child and become a solo singer?",
   'id': '56be85543aeaaa14008c9066',
   'answers': [{'text': '2003', 'answer_start': 526}],
   'is_impossible': False},
  {'question': 'In what city and state did Beyonce  grow up? ',
   'id': '56bf6b0f3aeaaa14008c9601',
   'answers': [{'text': 'Houston, Texas', 'answer_start': 166}],
   'is_impossible': False},
  {'question': 'In which decade did Beyonce become famous?',
   'id': '56bf6b0f3aeaaa14008c9602',
   'answers': [{'text': 'late 1990s', 'answer_start': 276}],
   'is_impossible': False},
  {'q

---------------------------------------------------------

In [27]:
new_squad = []
for group in squad['data']:
    for paragraph in group['paragraphs']:
        context = paragraph['context']
        for qa_pair in paragraph['qas']:
            question = qa_pair['question']
            if 'answers' in qa_pair.keys() and len(qa_pair['answers']) > 0:
                answer = qa_pair['answers'][0]['text']
            elif 'plauisble_answers' in qa_pair.keys() and len(qa_pair['plausible_answers']) > 0:
                answer = qa_pair['plausible_answers'][0]['text']
            else:
                answer = None
            new_squad.append({'question': question, 'answer': answer, 'context': context})

In [28]:
new_squad[:2]

[{'question': 'When did Beyonce start becoming popular?',
  'answer': 'in the late 1990s',
  'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'},
 {'question': 'What areas did Beyonce compete in when she was growing up?',
  'answer': 'singing and dancing',
  'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born Septe

In [30]:
with open(os.path.join(squad_dir, 'train.json'), 'w') as f:
    json.dump(new_squad, f)

In [33]:

with open(os.path.join(squad_dir, files[1]), 'rb') as f:
    squad_dev = json.load(f)

In [37]:
squad_dev['data'][0]['paragraphs'][0]

{'qas': [{'question': 'In what country is Normandy located?',
   'id': '56ddde6b9a695914005b9628',
   'answers': [{'text': 'France', 'answer_start': 159},
    {'text': 'France', 'answer_start': 159},
    {'text': 'France', 'answer_start': 159},
    {'text': 'France', 'answer_start': 159}],
   'is_impossible': False},
  {'question': 'When were the Normans in Normandy?',
   'id': '56ddde6b9a695914005b9629',
   'answers': [{'text': '10th and 11th centuries', 'answer_start': 94},
    {'text': 'in the 10th and 11th centuries', 'answer_start': 87},
    {'text': '10th and 11th centuries', 'answer_start': 94},
    {'text': '10th and 11th centuries', 'answer_start': 94}],
   'is_impossible': False},
  {'question': 'From which countries did the Norse originate?',
   'id': '56ddde6b9a695914005b962a',
   'answers': [{'text': 'Denmark, Iceland and Norway', 'answer_start': 256},
    {'text': 'Denmark, Iceland and Norway', 'answer_start': 256},
    {'text': 'Denmark, Iceland and Norway', 'answer_star

In [41]:
#### Change the dev set

new_squad_dev = []
for group in squad_dev['data']:
    for paragraph in group['paragraphs']:
        context = paragraph['context']
        for qa_pair in paragraph['qas']:
            question = qa_pair['question']
            if 'answers' in qa_pair.keys() and len(qa_pair['answers']) > 0:
                answer = qa_pair['answers'][0]['text']
            elif 'plauisble_answers' in qa_pair.keys() and len(qa_pair['plausible_answers']) > 0:
                answer = qa_pair['plausible_answers'][0]['text']
            else:
                answer = None
            new_squad_dev.append({'question': question, 'answer': answer, 'context': context})

In [42]:
new_squad_dev[:2]

[{'question': 'In what country is Normandy located?',
  'answer': 'France',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'},
 {'question': 'When were the Normans in Normandy?',
  'answer': '10th and 11th centuries',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: No

In [43]:
with open(os.path.join(squad_dir, 'dev.json'), 'w') as f:
    json.dump(new_squad_dev, f)

-----------------------------------

### First QA model

In [121]:
with open('squad/dev.json', 'r') as f:
    squad = json.load(f)

In [122]:
squad = squad[:100]

In [None]:
# Initialize tokeniser and the model

In [123]:
from transformers import BertTokenizer, BertForQuestionAnswering

In [124]:
modelname ='deepset/bert-base-cased-squad2'

In [125]:
tokenizer = BertTokenizer.from_pretrained(modelname)

In [126]:
model = BertForQuestionAnswering.from_pretrained(modelname)

In [127]:
from transformers import pipeline

- https://huggingface.co/docs/transformers/main_classes/pipelines

In [128]:
qa = pipeline('question-answering', model=model, tokenizer=tokenizer)

In [54]:
squad[:2]

[{'question': 'In what country is Normandy located?',
  'answer': 'France',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'},
 {'question': 'When were the Normans in Normandy?',
  'answer': '10th and 11th centuries',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: No

In [55]:
qa({'question': 'In what country is Normandy located?',
   'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'})

{'score': 0.9995271563529968, 'start': 159, 'end': 166, 'answer': 'France.'}

In [129]:
answers = []

for pair in squad[:5]:
    ans = qa({
        'question': pair['question'],
        'context': pair['context']
    })
    answers.append({
        'predicted': ans['answer'],
        'true': pair['answer']
    })
    

In [130]:
answers

[{'predicted': 'France.', 'true': 'France'},
 {'predicted': '10th and 11th centuries', 'true': '10th and 11th centuries'},
 {'predicted': 'Denmark, Iceland and Norway',
  'true': 'Denmark, Iceland and Norway'},
 {'predicted': 'Rollo,', 'true': 'Rollo'},
 {'predicted': '10th', 'true': '10th century'}]

------------------------------------

## Metrics for language

- Exact Match (EM)

In [131]:
em = []

for answer in answers:
    if answer['predicted'] == answer['true']:
        em.append(1)
    else:
        em.append(0)
        

In [132]:
sum(em)/len(em)   #bad accuracy

0.4

In [133]:
import re


em = []

for answer in answers:
    pred= re.sub('[^0-9a-z ]', '', answer['predicted'].lower())
    true= re.sub('[^0-9a-z ]', '', answer['true'].lower())
    if pred == true:
        em.append(1)
    else:
        em.append(0)

In [134]:
sum(em)/len(em)  # Better exact match

0.8

-----------------------------------------

# ROUGE

In [135]:
from rouge import Rouge

In [136]:
model_out = 'hello to the world'
reference = 'hello world'

In [137]:
rouge = Rouge()

In [138]:
rouge.get_scores(model_out, reference)

[{'rouge-1': {'r': 1.0, 'p': 0.5, 'f': 0.6666666622222223},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 1.0, 'p': 0.5, 'f': 0.6666666622222223}}]

In [139]:
# recall, precision, f score

# rouge2: bigrams


In [140]:
answers

[{'predicted': 'France.', 'true': 'France'},
 {'predicted': '10th and 11th centuries', 'true': '10th and 11th centuries'},
 {'predicted': 'Denmark, Iceland and Norway',
  'true': 'Denmark, Iceland and Norway'},
 {'predicted': 'Rollo,', 'true': 'Rollo'},
 {'predicted': '10th', 'true': '10th century'}]

In [141]:
model_out=[ans['predicted'] for ans in answers]
reference=[ans['true'] for ans in answers]

model_out

['France.',
 '10th and 11th centuries',
 'Denmark, Iceland and Norway',
 'Rollo,',
 '10th']

In [142]:
reference

['France',
 '10th and 11th centuries',
 'Denmark, Iceland and Norway',
 'Rollo',
 '10th century']

In [143]:
rouge.get_scores(model_out, reference)

[{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}},
 {'rouge-1': {'r': 0.5, 'p': 1.0, 'f': 0.6666666622222223},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.5, 'p': 1.0, 'f': 0.6666666622222223}}]

In [144]:
rouge.get_scores(model_out, reference,avg=True)

{'rouge-1': {'r': 0.7, 'p': 0.8, 'f': 0.7333333294444444},
 'rouge-2': {'r': 0.4, 'p': 0.4, 'f': 0.399999998},
 'rouge-l': {'r': 0.7, 'p': 0.8, 'f': 0.7333333294444444}}

In [145]:
from tqdm import tqdm #check

### Applying rouge to QA

In [146]:
model_out = []
reference = []

for pair in tqdm(squad[:10], leave=True):
    ans = qa({
        'question': pair['question'],
        'context': pair['context']
    })
    # append the prediction and reference to the respective lists
    model_out.append(ans['answer'])
    reference.append(pair['answer'])
    

100%|██████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:22<00:00,  2.26s/it]


In [151]:
reference

['France',
 '10th and 11th centuries',
 'Denmark, Iceland and Norway',
 'Rollo',
 '10th century',
 None,
 None,
 None,
 None,
 'William the Conqueror']

In [162]:
for i in range(len(reference)):
    if reference[i]== None:
        reference[i] = 'None'

In [163]:
model_out

['France.',
 '10th and 11th centuries',
 'Denmark, Iceland and Norway',
 'Rollo,',
 '10th',
 '10th and 11th centuries',
 'France.',
 'King Charles III of West Francia.',
 'in the first half of the 10th century,',
 'William the Conqueror,']

In [164]:
rouge.get_scores(model_out, reference)

[{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}},
 {'rouge-1': {'r': 0.5, 'p': 1.0, 'f': 0.6666666622222223},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.5, 'p': 1.0, 'f': 0.6666666622222223}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},


In [165]:
scores = rouge.get_scores(model_out, reference)
scores

[{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}},
 {'rouge-1': {'r': 0.5, 'p': 1.0, 'f': 0.6666666622222223},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.5, 'p': 1.0, 'f': 0.6666666622222223}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},


In [1]:
# rouge l: measures Longest Common Subsequence(LCS)