In [1]:
try:
    import google.colab
    !pip install benepar
    !nvidia-smi
    !mkdir data
    !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=19jcMX4KFwVAp4yvgvw1GXSnSgpoQytqg' -O data/training_set.json
except:
    pass

# Baselines

In this notebook we explore some baselines for the problem of question answering. 

In [None]:
import os
import numpy as np
import pandas as pd
import math
import spacy
import benepar
import json
from tqdm import tqdm

import utils

# Download spaCy's small English pipeline package (`en_core_web_sm`),
# which is loaded later in this notebook with `spacy.load`
!python -m spacy download en_core_web_sm
# Download the `benepar_en3` model used by the benepar constituency-parsing component
benepar.download('benepar_en3')

Load the dataset (the code below reads `dev_set.json`, i.e. the development set) and create a DataFrame containing:
- The paragraph's text
- The question's text
- The question's ID

In [3]:
# Read the raw question set from disk
dataset = utils.read_question_set(os.path.join('..', 'data','dev_set.json'))


def _question_records(question_set):
    """Yield one flat record per question of the nested article/paragraph/question structure."""
    for article in question_set['data']:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                yield {
                    'context': paragraph['context'],
                    'question': qa['question'],
                    'questionID': qa['id'],
                }


# One row per question, keeping the paragraph text alongside it
questions = pd.DataFrame(list(_question_records(dataset)))

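Check the number of questions and preview the first rows. (Contexts are not trimmed at this point; a context that the parser rejects is truncated to its first 512 characters later, when it is run through the parsing pipeline.)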

In [4]:
# Report how many questions were loaded, then preview the first five rows
print(questions.shape[0])
display(questions.head(n=5))

10570


Unnamed: 0,context,question,questionID
0,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,56be4db0acb8001400a502ec
1,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,56be4db0acb8001400a502ed
2,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,56be4db0acb8001400a502ee
3,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?,56be4db0acb8001400a502ef
4,Super Bowl 50 was an American football game to...,What color was used to emphasize the 50th anni...,56be4db0acb8001400a502f0


Create a function that, given a prediction generator, produces for each question the span of the context text predicted to contain the answer.

In [5]:
def get_predictions(prediction_generator, limit=None):
    """
    Build the `{questionID: predicted answer text}` dictionary for a baseline.

    Parameters
    ----------
    prediction_generator : callable
        Zero-argument callable returning an iterator that yields one
        `(pstartv, pendv)` pair per question, in the row order of `questions`.
        Both vectors are indexed as `(1, n)` arrays (their `shape[1]` is read).
    limit : int, optional
        Maximum number of questions to predict. `None` (default) means all of
        them; values larger than the number of questions are clamped so the
        generator is never asked for more items than there are rows.

    Returns
    -------
    dict
        Maps each question ID to the slice `context[start:end]` of its context.
    """
    n_questions = len(questions)
    n_to_predict = n_questions if limit is None else min(limit, n_questions)

    # Instantiate prediction generator
    predictor_iterator = prediction_generator()

    predictions = {}
    for row in tqdm(range(n_to_predict)):
        # Obtain start and end probabilities from the baseline function
        pstartv, pendv = next(predictor_iterator)
        # Obtain the indices of the best answer (first element of the helper's result)
        start, end = utils.start_end_token_from_probabilities(
            pstartv, pendv, dim=pstartv.shape[1]
        )[0]
        # Add the ID-answer pair in the predictions dictionary
        question_id = questions['questionID'].iloc[row]
        context_text = questions['context'].iloc[row]
        predictions[question_id] = context_text[start:end]

    return predictions

## 1. Random prediction baseline

We implement a predictor that returns random start and end probabilities. Then, we use the function `start_end_token_from_probabilities` to obtain the max-scoring randomly generated span of text. 

In [6]:
def random_baseline_predict(seed=None):
    '''
    Creates random prediction vectors.

    For every context in `questions`, yields a `(pstartv, pendv)` pair of
    arrays of shape `(1, len(context))` filled with uniform random values
    in [0, 1), i.e. one random score per character of the context.

    Parameters
    ----------
    seed : int, optional
        When given, a dedicated `np.random.default_rng(seed)` generator is
        used so the baseline is reproducible. When `None` (default) the
        global `np.random` state is used, exactly as before.
    '''
    rng = np.random if seed is None else np.random.default_rng(seed)
    for context in questions['context']:
        pstartv = rng.random((1, len(context)))
        pendv = rng.random((1, len(context)))
        yield pstartv, pendv

In [7]:
# Compute the predictions BEFORE opening the output file: opening with 'w'
# truncates it immediately, so a failure during the (long) prediction loop
# would otherwise wipe a previously saved result.
random_predictions = get_predictions(random_baseline_predict)

# Make sure the output directory exists on a fresh checkout
os.makedirs('eval', exist_ok=True)
with open(os.path.join('eval', 'random_predictions_eval.txt'), 'w') as f:
    json.dump(random_predictions, f)

100%|██████████| 10570/10570 [00:52<00:00, 202.45it/s]


## 2. Sliding window baseline

The sliding window baseline is implemented in the same way it was presented in the
SQuAD v1 paper and in the [MCTest paper](https://aclanthology.org/D13-1020.pdf) by Richardson et al.
which originally proposed it.

Apart from the paragraph and the question, the implementation also needs a set of candidate answers.
SQuAD's paper proposes to "*only use spans which are constituents in the constituency parse generated by
Stanford CoreNLP*". In our case, we use a neural parser: **Berkeley Neural Parser**, which is the option
proposed by the `spacy` library, which we are already using as a named entity extractor in the original model.

In [8]:
# Work on a copy: the 'context' and 'question' columns of `slw_questions` are
# overwritten with parsed pipeline output below, while `get_predictions` still
# slices the raw context strings stored in `questions`.
slw_questions = questions.copy()

In [9]:
# Initialize spacy's pipeline which we'll use for analysis
spacy_pipeline = spacy.load("en_core_web_sm")
# Disable all elements but the tokenizer
spacy_pipeline.disable_pipes("tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner")
# Add the "sentencizer" component and the neural parser
spacy_pipeline.add_pipe('sentencizer')
spacy_pipeline.add_pipe("benepar", config={"model": "benepar_en3",
                                           "disable_tagger": "true"})

def run_pipeline(text, max_chars=512):
    """
    Run `spacy_pipeline` on `text`, retrying on a truncated text if it fails.

    Parameters
    ----------
    text : str
        The raw text to process.
    max_chars : int, optional
        Number of leading CHARACTERS (not tokens) kept for the retry when the
        pipeline raises `ValueError` on the full text. Defaults to 512, the
        value previously hard-coded here.

    Returns
    -------
    The object returned by `spacy_pipeline` for the full text, or for
    `text[:max_chars]` when the full text was rejected.

    Raises
    ------
    ValueError
        If the pipeline also rejects the truncated text.
    """
    try:
        return spacy_pipeline(text)
    except ValueError:
        # Fallback: the parser refused the full text, so retry on a prefix.
        # Character offsets into the result then refer to the truncated text.
        return spacy_pipeline(text[:max_chars])

# Register `progress_apply` on pandas objects so the slow parsing step shows a progress bar
tqdm.pandas()
# Run the pipeline over the contexts first and then over the questions,
# replacing each text column with the pipeline's output
for column in ('context', 'question'):
    slw_questions[column] = slw_questions[column].progress_apply(run_pipeline)

100%|██████████| 10570/10570 [05:31<00:00, 31.85it/s]


In [10]:
def C(w, P):
    """Count how many elements of the passage `P` are equal to the word `w`."""
    occurrences = 0
    for p in P:
        if w == p:
            occurrences += 1
    return occurrences

def IC(w, P):
    """
    Inverse-frequency score of word `w` in passage `P`: log(1 + 1 / C(w, P)).

    Rarer words get a higher score. Raises `ZeroDivisionError` when `w` does
    not occur in `P`, since its count is then zero.
    """
    occurrences = C(w, P)
    return math.log(1 + 1 / occurrences)

def _candidate_answers(context):
    """
    List the candidate answer spans of a parsed context.

    Candidates are the child constituents (`sentence._.children`) of every
    sentence of `context`; spans made only of punctuation are skipped.
    Each candidate records both its character offsets in the context (used to
    fill the prediction vectors) and its TOKEN offsets inside its sentence
    (used to slice the sentence, which is indexed by token).
    """
    candidates = []
    for sentence in context.sents:
        for answer in sentence._.children:
            token_set = {str(tok) for tok in answer if not tok.is_punct}
            if not token_set:
                # Ignore span if it only contains punctuation
                continue
            candidates.append({
                'answer': answer,
                'start': answer.start_char,
                'end': answer.end_char,
                'sentence': sentence,
                # Token (not character) offsets relative to the sentence start
                'start_token_in_sentence': answer.start - sentence.start,
                'end_token_in_sentence': answer.end - sentence.start,
                'token_set': token_set,
            })
    return candidates


def _keep_max_overlap(candidates, question_words):
    """
    Keep the candidates whose surrounding sentence overlaps most with the question.

    For each candidate, counts the question words found among the sentence
    tokens before the candidate plus those found among the tokens after it,
    and returns only the candidates reaching the maximal count (order kept).
    `candidates` must be non-empty.
    """
    overlaps = []
    for candidate in candidates:
        sentence = candidate['sentence']
        words_before = {str(tok) for tok in
                        sentence[:candidate['start_token_in_sentence']]}
        words_after = {str(tok) for tok in
                       sentence[candidate['end_token_in_sentence']:]}
        overlaps.append(len(question_words & words_before) +
                        len(question_words & words_after))

    best_overlap = max(overlaps)
    return [candidate for candidate, overlap in zip(candidates, overlaps)
            if overlap == best_overlap]


def _sliding_window_score(candidate, question_words):
    """
    Sliding-window score of one candidate against its own sentence.

    A window as wide as the union of the question words and the candidate's
    words is slid over the non-punctuation words of the sentence; each window
    is scored by the sum of `IC` of its words that belong to that union, and
    the best window score is returned.
    """
    # Unite the question and the answer words
    target_words = candidate['token_set'] | question_words
    # SQuAD uses only the sentence containing the answer for context
    passage = [str(tok) for tok in candidate['sentence'] if not tok.is_punct]
    # Create a LUT of word scores for efficiency
    word_score = {word: IC(word, passage) for word in set(passage)}

    window = len(target_words)
    # Slicing clips the window at the end of the passage
    return max(
        sum(word_score[word]
            for word in passage[first:first + window]
            if word in target_words)
        for first in range(len(passage))
    )


def sliding_window_baseline_predict():
    """
    Sliding-window baseline: yield one `(pstartv, pendv)` pair per question.

    For every row of `slw_questions` (parsed question and context):
      1. build the set of non-punctuation question words;
      2. collect the candidate answers (constituents of the context);
      3. keep the candidates with maximal unigram overlap between the question
         and the rest of their sentence;
      4. score the survivors with the sliding-window algorithm and pick the best.

    The yielded arrays have shape `(1, len(context.text) + 1)` and are all zero
    except for a 1 at the best answer's start character (in `pstartv`) and end
    character (in `pendv`); the extra slot allows an end offset equal to the
    text length. If a context has no candidate at all, both vectors are left
    all-zero so that exactly one pair is still produced per question.
    """
    for row in range(len(slw_questions)):
        question = slw_questions['question'].iloc[row]
        context = slw_questions['context'].iloc[row]

        # Step 1: Create a set of words present in the question (always ignore punctuation)
        question_words = {str(token) for token in question if not token.is_punct}

        # Create the pstartv and pendv vectors
        pstartv = np.zeros((1, len(context.text) + 1))
        pendv = np.zeros((1, len(context.text) + 1))

        # Step 2: the constituents of the parsed text are the proposed answers
        candidates = _candidate_answers(context)
        if candidates:
            # Step 3: keep only the candidates with maximal unigram overlap
            candidates = _keep_max_overlap(candidates, question_words)

            # Step 4: score each remaining candidate with the sliding window
            scores = [_sliding_window_score(candidate, question_words)
                      for candidate in candidates]
            best_scoring_answer = candidates[int(np.argmax(scores))]
            pstartv[0, best_scoring_answer['start']] = 1
            pendv[0, best_scoring_answer['end']] = 1

        yield pstartv, pendv

In [11]:
# Compute the predictions BEFORE opening the output file: opening with 'w'
# truncates it immediately, so a failure during the prediction loop would
# otherwise wipe a previously saved result.
sliding_predictions = get_predictions(sliding_window_baseline_predict)

# Make sure the output directory exists on a fresh checkout
os.makedirs('eval', exist_ok=True)
with open(os.path.join('eval', 'sliding_predictions_eval.txt'), 'w') as f:
    json.dump(sliding_predictions, f)

100%|██████████| 10570/10570 [01:08<00:00, 154.55it/s]
