## Computation of heuristics on dataset

### Load predictions and validation data

In [1]:
import json
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
from nltk import tokenize
import spacy
import string
import re

[nltk_data] Downloading package stopwords to /home/luki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/luki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/luki/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder and file, for which you want to compute the heuristics
file_folder = './datasets/'
validation_dataset = 'valid_squad_with_predictions.json'

### Number of Similar Words Between the Question and Context

In [3]:
# Loading of file to Pandas Dataframe
data = pd.read_json(file_folder + validation_dataset)

In [4]:
data = data.reset_index()
data

Unnamed: 0,index,id,title,context,question,answers,prediction_text,str_answers,ok
0,0,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,"{'text': ['Denver Broncos', 'Denver Broncos', ...",Denver Broncos,"{'text': ['Denver Broncos', 'Denver Broncos', ...",ok
1,1,56be4db0acb8001400a502ed,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,"{'text': ['Carolina Panthers', 'Carolina Panth...",Carolina Panthers,"{'text': ['Carolina Panthers', 'Carolina Panth...",ok
2,2,56be4db0acb8001400a502ee,Super_Bowl_50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,"{'text': ['Santa Clara, California', 'Levi's S...","Santa Clara, California","{'text': ['Santa Clara, California', ""Levi's S...",ok
3,3,56be4db0acb8001400a502ef,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?,"{'text': ['Denver Broncos', 'Denver Broncos', ...",Denver Broncos,"{'text': ['Denver Broncos', 'Denver Broncos', ...",ok
4,5,56be8e613aeaaa14008c90d1,Super_Bowl_50,Super Bowl 50 was an American football game to...,What was the theme of Super Bowl 50?,"{'text': ['""golden anniversary""', 'gold-themed...",golden anniversary,"{'text': ['""golden anniversary""', 'gold-themed...",ok
...,...,...,...,...,...,...,...,...,...
10565,10548,5737a5931c456719005744e9,Force,"where is the mass of the object, is the velo...",What force changes an objects direction of tra...,"{'text': ['centripetal', 'unbalanced centripet...",radial (centripetal) force,"{'text': ['centripetal', 'unbalanced centripet...",nok
10566,10555,5737a7351c456719005744f5,Force,A conservative force that acts on a closed sys...,What is the force called rgarding a potential ...,"{'text': ['artifact', 'artifact of the potenti...",conservative force,"{'text': ['artifact', 'artifact of the potenti...",nok
10567,10562,5737a9afc3c5551400e51f63,Force,The connection between macroscopic nonconserva...,What is the exchange of heat associated with?,"{'text': ['nonconservative forces', 'nonconser...",macroscopic closed systems,"{'text': ['nonconservative forces', 'nonconser...",nok
10568,10568,5737aafd1c456719005744fe,Force,"The pound-force has a metric counterpart, less...",What seldom used term of a unit of force equal...,"{'text': ['kip', 'kip', 'kip', 'kip', 'kip'], ...",the metric slug,"{'text': ['kip', 'kip', 'kip', 'kip', 'kip'], ...",nok


In [5]:
def count_similar_words_in_question_and_context(data):
    """Function for similar words heuristic computation
    This function tokenize the question and the context into words
    Sets of words are created from both of them
    Intersection between sets is computed

    Args:
        data (Pandas Dataframe): dataset for which you want to compute

    Returns:
        int: single number for similar words
    """
    tokenizer = nltk.RegexpTokenizer(r"\w+")

    similar_words = []

    for i in range(len(data)):
        context1 = nltk.word_tokenize(data['context'][i])
        question1 = nltk.word_tokenize(data['question'][i])
        context_new = [word for word in context1 if word.isalnum()]
        question_new = [word for word in question1 if word.isalnum()]
        similar_words.append(len(set(context_new).intersection(set(question_new))))
        
    return similar_words

In [6]:
data['similar_words'] = count_similar_words_in_question_and_context(data)
data

Unnamed: 0,index,id,title,context,question,answers,prediction_text,str_answers,ok,similar_words
0,0,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,"{'text': ['Denver Broncos', 'Denver Broncos', ...",Denver Broncos,"{'text': ['Denver Broncos', 'Denver Broncos', ...",ok,7
1,1,56be4db0acb8001400a502ed,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,"{'text': ['Carolina Panthers', 'Carolina Panth...",Carolina Panthers,"{'text': ['Carolina Panthers', 'Carolina Panth...",ok,7
2,2,56be4db0acb8001400a502ee,Super_Bowl_50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,"{'text': ['Santa Clara, California', 'Levi's S...","Santa Clara, California","{'text': ['Santa Clara, California', ""Levi's S...",ok,3
3,3,56be4db0acb8001400a502ef,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?,"{'text': ['Denver Broncos', 'Denver Broncos', ...",Denver Broncos,"{'text': ['Denver Broncos', 'Denver Broncos', ...",ok,4
4,5,56be8e613aeaaa14008c90d1,Super_Bowl_50,Super Bowl 50 was an American football game to...,What was the theme of Super Bowl 50?,"{'text': ['""golden anniversary""', 'gold-themed...",golden anniversary,"{'text': ['""golden anniversary""', 'gold-themed...",ok,6
...,...,...,...,...,...,...,...,...,...,...
10565,10548,5737a5931c456719005744e9,Force,"where is the mass of the object, is the velo...",What force changes an objects direction of tra...,"{'text': ['centripetal', 'unbalanced centripet...",radial (centripetal) force,"{'text': ['centripetal', 'unbalanced centripet...",nok,5
10566,10555,5737a7351c456719005744f5,Force,A conservative force that acts on a closed sys...,What is the force called rgarding a potential ...,"{'text': ['artifact', 'artifact of the potenti...",conservative force,"{'text': ['artifact', 'artifact of the potenti...",nok,9
10567,10562,5737a9afc3c5551400e51f63,Force,The connection between macroscopic nonconserva...,What is the exchange of heat associated with?,"{'text': ['nonconservative forces', 'nonconser...",macroscopic closed systems,"{'text': ['nonconservative forces', 'nonconser...",nok,6
10568,10568,5737aafd1c456719005744fe,Force,"The pound-force has a metric counterpart, less...",What seldom used term of a unit of force equal...,"{'text': ['kip', 'kip', 'kip', 'kip', 'kip'], ...",the metric slug,"{'text': ['kip', 'kip', 'kip', 'kip', 'kip'], ...",nok,7


In [7]:
data['similar_words'].describe()

count    10570.000000
mean         6.047020
std          2.813025
min          0.000000
25%          4.000000
50%          6.000000
75%          8.000000
max         28.000000
Name: similar_words, dtype: float64

In [8]:
data['similar_words'].value_counts()

5     1635
6     1529
4     1387
7     1295
3     1084
8      998
9      699
2      554
10     431
11     280
1      222
12     163
13     110
14      61
0       34
15      29
16      26
17      14
19       6
18       5
20       3
21       2
28       1
23       1
22       1
Name: similar_words, dtype: int64

### Distance Between a Word From the Question and the Answer in Context

In [9]:
# code from this web site https://www.codegrepper.com/code-examples/python/find+index+of+sublist+in+list+python
def find_sub_list(sl,l):
    """Function for finding the index of sublist in list

    Args:
        sl (list): list created from answer text
        l (list): list created from context

    Returns:
        int: index of the sublist
    """
    results=[]
    sll=len(sl)
    if sll <= 0:
        return results
#     print(f"Length is {len(sl)} and sl is {sl}")
    for ind in (i for i,e in enumerate(l) if e==sl[0]):
        if l[ind:ind+sll]==sl:
            results.append((ind,ind+sll-1))

    return results

In [10]:
def count_lowest_position_of_word_from_question_in_context(data):
    """Function for the word distance heuristic
    Computes the distance of word from the question from the answer text in the context

    Args:
        data (Pandas Dataframe): dataset

    Returns:
        int, str: distance of the closest word and the word
    """
    tokenizer = RegexpTokenizer(r'\w+')
    distances = []
    words = []

    for i in range(len(data)):        
        indexes_of_words = []
        context_list = tokenizer.tokenize(data['context'][i])
        question_list = tokenizer.tokenize(data['question'][i])
        answer_text = tokenizer.tokenize(data['answers'][i]['text'][0])
        answer_start = data['answers'][i]['answer_start']

        indexes_of_words = find_sub_list(answer_text, context_list)

        if len(indexes_of_words) > 0:
            answer_index = indexes_of_words[0][0]
        else:
            distances.append(-1)
            words.append('None')
            continue

        filtered_words = [word for word in question_list if word not in stopwords.words('english')]

        list_indexes = {}

        for word in filtered_words:
            if word in context_list:
                for j in range(len(context_list)):
                    if word == context_list[j]:
                        list_indexes[abs(j - answer_index)] = context_list[j]

        sort_orders = sorted(list_indexes.items(), key=lambda x: x[0], reverse=False)

        if len(sort_orders) == 0:
            distances.append(-1)
            words.append('None')
        else:
            distances.append(sort_orders[0][0])
            words.append(sort_orders[0][1])

    return distances, words

In [12]:
data['distances'], data['closest_words'] = count_lowest_position_of_word_from_question_in_context(data)

In [13]:
data['distances'].value_counts()

2      2478
1      2243
3      1404
4       886
5       617
       ... 
155       1
94        1
85        1
276       1
73        1
Name: distances, Length: 98, dtype: int64

### Position of the Answer Regarding an Order of Context Sentences

In [14]:
def identify_in_which_sentence_answer_is(data):
    """Function for the k-th sentence heuristic
    Computes in which sentence the answer is

    Args:
        data (Pandas Dataframe): dataset

    Returns:
        int: number representing the index of the sentence
    """
    sentence_indexes = []

    for i in range(len(data)):
        context1 = tokenize.sent_tokenize(data['context'][i])
        answer = data['answers'][i]['text'][0]
        nth = 0
        for sentence in context1:
            if answer in sentence:
                break
            nth += 1

        sentence_indexes.append(nth)

    return sentence_indexes

In [15]:
data['kth_sentence'] = identify_in_which_sentence_answer_is(data)

In [16]:
data['kth_sentence'].describe()

count    10570.000000
mean         1.643614
std          1.883455
min          0.000000
25%          0.000000
50%          1.000000
75%          3.000000
max         29.000000
Name: kth_sentence, dtype: float64

In [17]:
data['kth_sentence'].value_counts()

0     3571
1     2511
2     1790
3     1263
4      679
5      335
6      191
7      109
8       65
9       21
10      10
11       5
13       5
12       4
15       3
16       2
14       2
29       2
26       1
27       1
Name: kth_sentence, dtype: int64

### Cosine Similarity From TF-IDF Representation Between Context and Question

In [20]:
train_dataset = pd.read_json('./datasets/squad_train.json')

In [21]:
train_dataset

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...
...,...,...,...,...,...
87594,5735d259012e2f140011a09d,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,"{'text': ['Oregon'], 'answer_start': [229]}"
87595,5735d259012e2f140011a09e,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,"{'text': ['Rangoon'], 'answer_start': [414]}"
87596,5735d259012e2f140011a09f,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,"{'text': ['Minsk'], 'answer_start': [476]}"
87597,5735d259012e2f140011a0a0,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,"{'text': ['1975'], 'answer_start': [199]}"


In [22]:
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)
model = vectorizer.fit(train_dataset['context']) # fit on the train dataset

def compute_similarity_between_context_and_question(data):
    """Function for cosine similarity heuristic
    Computes the cosine similarity from TF-IDF representation (trained on train SQuAD) for context and question

    Args:
        data (Pandas Dataframe): dataset

    Returns:
        decimal: number representing the cosine similarity
    """
    similarities = []

    for i in tqdm(range(len(data))):
        context1 = vectorizer.transform([data['context'][i]])
        question1 = vectorizer.transform([data['question'][i]])
        similarities.append(cosine_similarity(context1, question1)[0][0])

    return similarities


In [23]:
data['cosine_similarity'] = compute_similarity_between_context_and_question(data)

100%|██████████| 10570/10570 [00:26<00:00, 392.45it/s]


In [24]:
data['cosine_similarity'].describe()

count    10570.000000
mean         0.306264
std          0.158483
min          0.000000
25%          0.187675
50%          0.296845
75%          0.418278
max          0.910359
Name: cosine_similarity, dtype: float64

### Answer Length

In [25]:
def average_answer_length(data):
    """Function for the answer length heuristic
    Computes the average answer length in number of words

    Args:
        data (Pandas Dataframe): dataset

    Returns:
        decimal: number representing the awerage length
    """
    answers_text = []
    for i in range(len(data)):
        answers_text.append(data['answers'][i]['text'])
    answers_text

    answer_lenght = []
    tokenizer = RegexpTokenizer(r'\w+')
    for i in range(len(data)):
        avg_lenght = 0
        for j in range(len(answers_text[i])):
            avg_lenght += len(tokenizer.tokenize(answers_text[i][j]))
        av = avg_lenght/(len(answers_text[i]))
        answer_lenght.append(av)

    return answer_lenght

In [26]:
data['answer_lenght'] = average_answer_length(data)

In [27]:
data['answer_lenght'].describe()

count    10570.000000
mean         3.008505
std          2.457632
min          0.800000
25%          1.333333
50%          2.000000
75%          3.666667
max         24.333333
Name: answer_lenght, dtype: float64

In [28]:
data['answer_lenght'].value_counts()

1.000000     2294
2.000000     1842
3.000000     1016
1.666667      516
2.333333      470
             ... 
18.333333       1
16.666667       1
19.000000       1
18.666667       1
10.750000       1
Name: answer_lenght, Length: 164, dtype: int64

### Extract number of similar NER to answer from context

In [29]:
# download the en_core_web_sm
# if the following code after this cell will not work, try to restart the kernel
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 2.2 MB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [30]:
nlp = spacy.load('en_core_web_sm') #load spacy

In [31]:
def show_ents(doc): 
    """Function for detecting the named entities

    Args:
        doc (Doc): string processed with nlp()

    Returns:
        list: list of entities
    """
    entities = []
    if doc.ents: 
        for ent in doc.ents: 
            entities.append(ent.label_)
    return entities

In [32]:
def count_similar_NER_from_context_to_answer(data):
    """Function for similar entities heuristic
    Computed the number of similar entities between the answer and context 

    Args:
        data (Pandas Dataframe): dataset

    Returns:
        int: number of similar entities
    """
    context_ents = []
    answer_ents = []

    for row in range(len(data)):
        context = nlp(data['context'][row])
        context_ents.append(show_ents(context))
        for ans in data['answers'][row]['text']:
            act_ans_ents_ = []
            act_ans = nlp(ans)
            act_ans_ents_.append(show_ents(act_ans))
        answer_ents.append(act_ans_ents_)

    max_sim_ents = []

    for row, cont in zip(answer_ents, context_ents):
        max = 0
        for items in row:
            for item in items:
                if cont.count(item) > max:
                    max = cont.count(item)
        max_sim_ents.append(max)
    
    return max_sim_ents

In [33]:
data['max_sim_ents'] = count_similar_NER_from_context_to_answer(data)

In [34]:
data['max_sim_ents'].value_counts()

0     5203
2     1023
1      999
3      756
4      621
5      495
6      382
7      222
8      194
9      169
10      84
11      78
14      66
12      55
15      36
16      33
13      25
19      20
22      17
21      16
18      15
17      14
30      10
25      10
20       5
23       5
32       4
48       4
28       3
29       2
34       2
24       2
Name: max_sim_ents, dtype: int64

### Position of a Subject From a Question Regarding Correct Answer in Context

In [35]:
nlp = spacy.load('en_core_web_sm') #load spacy

In [36]:
def doc_pieces(doc):
    """Function for sentence subject detection

    Args:
        doc (Doc): string processed with nlp()

    Returns:
        list: list of subjects
    """
    subjects = []
    for ent in doc:
        if ent.dep_ == 'nsubj':
            subjects.append(ent.text)
    return subjects

In [37]:
def extract_answer_position_with_respect_to_subject(data):
    """Function for the subject position heuristic
    Computes the position of question's subject in the context regarding the index of correct answer

    Args:
        data (Pandas Dataframe): dataset

    Returns:
        int: number representing the answer is before the extracted subject or after the occurence
    """
    q_subjects = []

    for item in data['question']:
        question = nlp(item)
        q_subjects.append(doc_pieces(question))

    positions = []

    for context, q_sub, answer in zip(data['context'], q_subjects, data['answers']):
        pos = 0
        max = 0
        for item in q_sub:
            if item in context:
                indexes = [m.start() for m in re.finditer(item, context)]
                counter = 0
                for index in indexes:
                    if answer['answer_start'][0] < index:
                        break
                    else:
                        counter += 1
                if max < counter:
                    max = counter
                pos = max
            else:
                pos = -1
        positions.append(pos)
    
    return positions

In [38]:
data['answer_subject_positions'] = extract_answer_position_with_respect_to_subject(data)

In [39]:
data['answer_subject_positions'].value_counts()

-1     3003
 0     3000
 1     2876
 2      910
 3      386
 4      175
 5       80
 6       34
 7       27
 8       25
 9       17
 10      10
 11       7
 13       6
 12       5
 15       4
 14       2
 26       1
 37       1
 20       1
Name: answer_subject_positions, dtype: int64

In [41]:
data.to_json(file_folder+'enhanced_'+validation_dataset, orient='records') # save the dataset with computed heuristics to .json format

#### The code below can be used for creation of flags for the train SQuAD dataset
it is not in the computation process, because we provide the file

In [None]:
# threshold = 7
# data['dist_flag'] = [1 if x > threshold else 0 for x in data['distances']]

# threshold = 4
# data['sim_flag'] = [1 if x <= threshold else 0 for x in data['similar_words']]

# threshold = 3
# data['ans_flag'] = [1 if x > threshold else 0 for x in data['answer_lenght']]

# threshold = 0.1
# data['cos_flag'] = [1 if x <= threshold else 0 for x in data['cosine_similarity']]

# threshold = 1
# data['pos_flag'] = [1 if x > threshold else 0 for x in data['answer_subject_positions']]

# threshold = 0
# data['ents_flag'] = [1 if x <= threshold else 0 for x in data['max_sim_ents']]