In [1]:
# Imports 

import numpy as np 
import pandas as pd 
import os


In [2]:
# Dataset Construction

folder = 'D:/Semester 2/ENFUSE/data/'

df = pd.DataFrame()
for file in ['S10_question_answer_pairs.txt','S09_question_answer_pairs.txt','S08_question_answer_pairs.txt']:
    filename = os.path.join(folder, file)
    df_tmp = pd.read_csv(filename, encoding='latin1', sep='\t').drop_duplicates(subset="Question")
    print(filename, len(df_tmp))
    df = pd.concat([df,df_tmp])

D:/Semester 2/ENFUSE/data/S10_question_answer_pairs.txt 832
D:/Semester 2/ENFUSE/data/S09_question_answer_pairs.txt 598
D:/Semester 2/ENFUSE/data/S08_question_answer_pairs.txt 1033


In [4]:
#df.info()

# Data Cleaning

In [5]:
# normalize the name of columns
df.columns = ['articleTitle','question','answer','difficultyFromQuestioner',
              'difficultyFromAnswerer','articleFile',"articleTitle_to_drop"]

# drop the last column
df.drop('articleTitle_to_drop', axis=1, inplace=True)

In [6]:
# nb of na values in each column
df.isna().sum()

articleTitle                1631
question                       2
answer                       273
difficultyFromQuestioner     893
difficultyFromAnswerer       278
articleFile                    2
dtype: int64

In [7]:
# drop na values
print('original df length: ',len(df))
df.dropna(subset=['question'], inplace=True)
df.dropna(subset=['answer'], inplace=True)
df.dropna(subset=['articleTitle'], inplace=True)
df = df[~df.question.str.contains('#')] # remove badly formatted questions
df = df[~df.answer.isin(['no','yes','Yes','No','No,','Yes,','No.','Yes.','yes.','no.'])] # remove yes/no questions
print('new df length: ',len(df))


original df length:  2463
new df length:  511


In [8]:
pd.options.display.max_colwidth=None
df.head()

Unnamed: 0,articleTitle,question,answer,difficultyFromQuestioner,difficultyFromAnswerer,articleFile
0,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,Alessandro Volta was not a professor of chemistry.,easy,easy,S10_set4_a10
2,Alessandro_Volta,Did Alessandro Volta invent the remotely operated pistol?,Alessandro Volta did invent the remotely operated pistol.,easy,easy,S10_set4_a10
4,Alessandro_Volta,Was Alessandro Volta taught in public schools?,Volta was taught in public schools.,easy,easy,S10_set4_a10
6,Alessandro_Volta,Who did Alessandro Volta marry?,Alessandro Volta married Teresa Peregrini.,medium,medium,S10_set4_a10
8,Alessandro_Volta,What did Alessandro Volta invent in 1800?,"In 1800, Alessandro Volta invented the voltaic pile.",medium,easy,S10_set4_a10


# Load Q/A model from hugging face and start predicting

In [9]:
#So, from now on we want to be able to answer the questions in the dataset and compare those answers provided by the model with those provided by the human answerer

# load question-answering model

from transformers import pipeline
qa_model = pipeline("question-answering")

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


HBox(children=(FloatProgress(value=0.0, description='Downloading (…)lve/main/config.json', max=473.0, style=Pr…




To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


HBox(children=(FloatProgress(value=0.0, description='Downloading pytorch_model.bin', max=260793700.0, style=Pr…




HBox(children=(FloatProgress(value=0.0, description='Downloading (…)okenizer_config.json', max=29.0, style=Pro…




HBox(children=(FloatProgress(value=0.0, description='Downloading (…)solve/main/vocab.txt', max=213450.0, style…




HBox(children=(FloatProgress(value=0.0, description='Downloading (…)/main/tokenizer.json', max=435797.0, style…




In [11]:
def answer_question_given_article(question, article_name):
    '''
    given the question, read the article and use it as a context and return a dictionary with an 
    answer and a score and the position of the answer in the article
    '''
    folder_name = 'D:/Semester 2/ENFUSE/data/text_data'
    article_file = str(df[df.articleTitle==article_name].articleFile.unique()[0]) + '.txt.clean'
    article_path = os.path.join(folder_name, article_file)
    
    with open(article_path, 'r', encoding='utf-8') as file:
        context = file.read().replace('\n', '')
        
    return qa_model(question, context)



In [12]:
%%time

# question example and the time it takes to answer it
question_example = """In 1602, the British East India Company's first voyage, commanded by Sir who, arrived in Aceh and sailed on to Banten where they were allowed to build a trading post?"""
answer_question_given_article(question_example, "Jakarta")

Wall time: 25.5 s


{'score': 0.8335651159286499,
 'start': 3262,
 'end': 3281,
 'answer': 'Sir James Lancaster'}

In [13]:
df[df.question==question_example]

Unnamed: 0,articleTitle,question,answer,difficultyFromQuestioner,difficultyFromAnswerer,articleFile
663,Jakarta,"In 1602, the British East India Company's first voyage, commanded by Sir who, arrived in Aceh and sailed on to Banten where they were allowed to build a trading post?",James Lancaster,,medium,S10_set3_a5


In [14]:
# Apply the answering function on a small sample of 20 questions
df_sample = df.sample(30)

# complete answer
df_sample['answer_from_model'] = df_sample.apply(lambda x: answer_question_given_article(x.question, x.articleTitle), axis=1)

# extract the exact answer 
df_sample['models_answer'] = df_sample['answer_from_model'].map(lambda x:x['answer'])

# extract the score (confidence of the model)
df_sample['models_score'] = df_sample['answer_from_model'].map(lambda x:x['score'])

In [15]:
cols_of_interst = ['question','answer','models_answer','models_score']
df_sample.head()[cols_of_interst]

Unnamed: 0,question,answer,models_answer,models_score
743,What percentage of the Korean language does Jeong Jae-do estimate to be Sino-Korean?,Sino-Korean makes up 30% of the Korean language.,70%,0.855748
104,What are the ant colonies that lack queens called?,Colonies that lack queens are called gamergate colonies.,ergatoids,0.973056
188,What are the names of the two zoos in Berlin?,The two zoos in Berlin are the Zoologischer Garten Berlin and the Tierpark Friedrichsfelde.,Zoologischer Garten Berlin,0.856098
1227,Which is the sub-Saharan indigenous language with the greatest number of speakers?,Hausa of West Africa is the sub-Saharan indigenous language with the greatest number of speakers.,Hausa of West Africa,0.83262
883,What is a resident of Melbourne known as?,Melburnian,Scotch College,0.875224


# Let's compare the model's answer with the answer provided in the dataset. To do so, we compute sentences similarity

In [16]:
!pip install -Uq sentence-transformers

Keyring is skipped due to an exception: 'keyring.backends'


In [17]:
def how_similar(sent1, sent2, model):
    '''
    compute sentence similarity. 
    '''
    #Compute embedding for both lists
    embedding_1= model.encode(str(sent1), convert_to_tensor=True)
    embedding_2 = model.encode(str(sent2), convert_to_tensor=True)
    return util.pytorch_cos_sim(embedding_1, embedding_2).item()
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
df_sample['answers_similarity'] = df_sample.apply(lambda x: how_similar
            (x.answer, x.models_answer, model=model), axis=1)

In [18]:
# let's inspect 
df_sample[['question','answer','models_answer','answers_similarity']]
.sort_values(by='answers_similarity', ascending=False)

Unnamed: 0,question,answer,models_answer,answers_similarity
557,How many strings does a guitar typically have?,Six,six,1.0
454,What are characteristic features of Finnish?,Vowel harmony and an agglutinative morphology,vowel harmony and an agglutinative morphology,1.0
574,Whom did he share the Nobel Prize with?,Pierre and Marie Curie,Pierre and Marie Curie,1.0
636,What is the capital of Indonesia?,Jakarta,Jakarta,1.0
1319,"Why does Lewis use ""absolute case"" instead of ""nominative""?",Because it is also used for the indefinite accusative.,Because it is also used for the indefinite accusative,0.99312
958,Is the most popular sport in Montreal ice hockey?,"yes, The most popular sport in Montreal is ice hockey.",The most popular sport in Montreal is ice hockey,0.959829
1289,What can be augmented with a fourth valve?,the flugelhorn,flugelhorn,0.951683
1206,When did it sign on the air?,In 1941.,1941,0.862639
401,Whare is the name for drums that have a S10_set of wires held across some of all of the drum heads?,snares,snare drum,0.859701
565,How old is the oldest known representation of a guitar-like intrument being played?,"3,300 years old","3,300 year",0.834975


In [19]:
#df['question']
#df['answer']
df_sample['question']

#print(f"Answer: {models_answer}")

743                                                                                  What percentage of the Korean language does Jeong Jae-do estimate to be Sino-Korean?
104                                                                                                                    What are the ant colonies that lack queens called?
188                                                                                                                         What are the names of the two zoos in Berlin?
1227                                                                                   Which is the sub-Saharan indigenous language with the greatest number of speakers?
883                                                                                                                             What is a resident of Melbourne known as?
1334                                                                                                                              What are the pattern