In [1]:
# Suppress all warnings up front so that warnings emitted while importing the
# libraries below do not clutter the notebook output.
# NOTE(review): this blanket filter also hides deprecation and runtime warnings
# for the rest of the notebook.
import warnings
warnings.filterwarnings('ignore')
import pickle
import numpy as np
import pandas as pd
import json
from textblob import TextBlob
import nltk
from scipy import spatial
import torch
import spacy
from sentence_transformers import SentenceTransformer
# Fetch the NLTK "punkt" tokenizer data (a no-op when it is already present).
# Presumably required by the TextBlob sentence splitting used further down.
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Jinze
[nltk_data]     Wang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the training JSON (SQuAD-v2.0-style file) into a DataFrame.
# Each row holds one article: its nested dict lives in the second column.
train_df = pd.read_json(path_or_buf="data/train-v2.0.json")
# valid_df = pd.read_json("data/dev-v2.0.json")

In [3]:
# (rows, columns) of the raw frame: one row per article.
train_df.shape

(442, 2)

In [4]:
# Preview the first five articles (head() defaults to n=5).
train_df.head()

Unnamed: 0,version,data
0,v2.0,"{'title': 'Beyoncé', 'paragraphs': [{'qas': [{..."
1,v2.0,"{'title': 'Frédéric_Chopin', 'paragraphs': [{'..."
2,v2.0,{'title': 'Sino-Tibetan_relations_during_the_M...
3,v2.0,"{'title': 'IPod', 'paragraphs': [{'qas': [{'qu..."
4,v2.0,{'title': 'The_Legend_of_Zelda:_Twilight_Princ...


In [10]:
# Inspect the nested structure: first paragraph of the article in row 1
# (column 1 holds the article dict with its 'paragraphs' list).
train_df.iloc[1,1]['paragraphs'][0]

{'qas': [{'question': "What was Frédéric's nationalities?",
   'id': '56cbd2356d243a140015ed66',
   'answers': [{'text': 'Polish and French', 'answer_start': 182}],
   'is_impossible': False},
  {'question': 'In what era was Frédéric active in?',
   'id': '56cbd2356d243a140015ed67',
   'answers': [{'text': 'Romantic era', 'answer_start': 276}],
   'is_impossible': False},
  {'question': 'For what instrument did Frédéric write primarily for?',
   'id': '56cbd2356d243a140015ed68',
   'answers': [{'text': 'solo piano', 'answer_start': 318}],
   'is_impossible': False},
  {'question': 'In what area was Frédéric born in?',
   'id': '56cbd2356d243a140015ed69',
   'answers': [{'text': 'Duchy of Warsaw', 'answer_start': 559}],
   'is_impossible': False},
  {'question': 'At what age did Frédéric depart from Poland?',
   'id': '56cbd2356d243a140015ed6a',
   'answers': [{'text': '20', 'answer_start': 777}],
   'is_impossible': False},
  {'question': 'What year was Chopin born?',
   'id': '56ce0a3

In [11]:
# Flatten the nested article -> paragraph -> question structure into one row
# per *answerable* question: (context paragraph, question, first answer text).
# `total_questions` counts every question, including the unanswerable ones,
# so the answerable share can be computed afterwards.
contexts, questions, answer_texts = [], [], []
total_questions = 0
# Column 1 of train_df holds the per-article dicts.
for article in train_df.iloc[:, 1]:
    for paragraph in article['paragraphs']:
        qa_items = paragraph['qas']
        total_questions += len(qa_items)
        for qa in qa_items:
            # Skip questions flagged as having no answer in the paragraph.
            if qa['is_impossible']:
                continue
            questions.append(qa['question'])
            # Only the first listed answer is kept.
            answer_texts.append(qa['answers'][0]['text'])
            contexts.append(paragraph['context'])
df = pd.DataFrame({"context": contexts, "question": questions, "text": answer_texts})

In [13]:
# Share of all questions that are answerable (kept in df): roughly two thirds.
df.shape[0] / total_questions

0.6662190471074824

In [14]:
# Persist the flattened question/answer table.
# The row index is written as an unnamed first column (pandas default), so
# readers of this file should load it with index_col=0.
df.to_csv("data/train_data.csv", index=True)

In [15]:
# Preview the flattened table: one row per answerable question.
df.head()

Unnamed: 0,context,question,text
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s


In [5]:
# We want an embedding for every sentence of every context; later we will look
# up the sentence in which the answer text occurs. So here we collect all
# sentences (and all questions) that need to be embedded.
contexts = list(df["context"].drop_duplicates().reset_index(drop=True))
# Tokenize each context on its own. Joining all contexts into one big string
# first lets the sentence tokenizer fuse the end of one paragraph with the
# start of the next whenever a paragraph lacks terminal punctuation, which
# produces "sentences" that do not exist in any single context.
sentences = [
    sentence.raw
    for context in contexts
    for sentence in TextBlob(context).sentences
]
questions = list(df['question'])

In [6]:
# Embed sentences and questions with Sentence-BERT (SBERT), using one of its
# pretrained multi-qa models.
# Details: https://www.sbert.net/docs/pretrained_models.html#multi-qa-models
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Encode the two corpora in turn with the same model: sentences, then questions.
sentence_embeddings, questions_embeddings = (
    model.encode(texts) for texts in (sentences, questions)
)

In [7]:
# Map every text (context sentence or question) to its embedding vector.
# Sentences go in first, then questions; if the same string occurs more than
# once, the embedding inserted last is the one that is kept.
emb_dict = dict(zip(sentences, sentence_embeddings))
emb_dict.update(zip(questions, questions_embeddings))

In [28]:
# Spot-check the sentence splitting: show the second collected sentence.
sentences[1]

"Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child."

In [9]:
# Spot-check a lookup by question text and show the embedding's shape.
emb_dict['At what age did Frédéric depart from Poland?'].shape

(384,)

In [10]:
# Serialize the text -> embedding mapping so it can be reloaded without
# re-encoding. Only unpickle this file from a trusted location.
with open('data/dict_embeddings.pickle', 'wb') as out_file:
    pickle.dump(emb_dict, file=out_file)