#### Get embedding of each sentence

In [1]:
import numpy as np, pandas as pd
import json
import ast 
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')
import spacy
from nltk import Tree
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
nlp = spacy.load('en_core_web_sm')

In [2]:
# Load the question/answer rows; any row with a missing field is dropped.
train = pd.read_csv("data/train_data.csv").dropna()
# NOTE(review): pickle.load can execute arbitrary code from the file, so this
# must only ever be pointed at a trusted, locally generated pickle.
with open("data/dict_embeddings.pickle", "rb") as f:
    d = pickle.load(f)
# Plain-dict copy of the loaded object; later cells look embeddings up by the
# exact sentence/question text.
emb_dict=dict(d)

In [9]:
train.head()

Unnamed: 0.1,Unnamed: 0,context,question,text
0,0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s


In [3]:
def get_target(x):
    """Locate the sentence that contains the answer text.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``x["sentences"]`` (a sequence of sentence strings) and
        ``x["text"]`` (the answer string).

    Returns
    -------
    int
        Index of the *last* sentence containing ``x["text"]`` as a substring,
        or ``-1`` when no sentence contains it.
    """
    answer = x["text"]
    hits = [pos for pos, sentence in enumerate(x["sentences"]) if answer in sentence]
    # Later matches win, so report the final hit.
    return hits[-1] if hits else -1

In [4]:
def _split_sentences(context):
    """Return the raw text of every sentence TextBlob finds in ``context``."""
    return [sentence.raw for sentence in TextBlob(context).sentences]


def _lookup_embedding(text):
    """Return the stored embedding for ``text``, or a 384-long zero vector if it is absent."""
    return emb_dict[text] if text in emb_dict else np.zeros(384)


# Sentence split -> answer-sentence label -> per-sentence and per-question embeddings.
train['sentences'] = train['context'].apply(_split_sentences)
train["target"] = train.apply(get_target, axis = 1)
train['s_emb'] = train['sentences'].apply(lambda sents: [_lookup_embedding(sent) for sent in sents])
train['q_emb'] = train['question'].apply(_lookup_embedding)

In [5]:
train.head(3)

Unnamed: 0.1,Unnamed: 0,context,question,text,sentences,target,s_emb,q_emb
0,0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0030430004, -0.067105405, -0.041505888, 0....","[0.046378605, -0.04540641, -0.05582417, 0.0166..."
1,1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0030430004, -0.067105405, -0.041505888, 0....","[0.09421391, -0.01728372, 0.024937417, 0.06089..."
2,2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,3,"[[0.0030430004, -0.067105405, -0.041505888, 0....","[0.034041032, -0.005861517, 0.005615792, 0.002..."


#### Find the best-matching sentence by cosine distance

In [6]:
def cosine_sim(x):
    """Score every sentence embedding against the question embedding.

    Despite the name, the values come from ``scipy.spatial.distance.cosine``,
    i.e. they are cosine *distances* (smaller means more similar).

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``x["s_emb"]`` (iterable of sentence vectors) and
        ``x["q_emb"]`` (the question vector).

    Returns
    -------
    list
        One distance per entry of ``x["s_emb"]``, in the same order.
    """
    return [spatial.distance.cosine(sentence_vec, x["q_emb"]) for sentence_vec in x["s_emb"]]

def pred_idx(distances):
    """Return the index of the smallest distance, ignoring NaN entries.

    A distance can be NaN when one of the vectors is the all-zero fallback
    used for text without a stored embedding. ``np.argmin`` reports the
    position of the first NaN, which would make such a sentence look like
    the best match, so NaNs are skipped here.

    Parameters
    ----------
    distances : sequence of float
        Distances between the question and each candidate sentence.

    Returns
    -------
    numpy integer
        Position of the minimum non-NaN distance. If every entry is NaN the
        result is ``0``, as ``np.argmin`` would give.

    Raises
    ------
    ValueError
        If ``distances`` is empty.
    """
    values = np.asarray(distances, dtype=float)
    if np.isnan(values).all():
        # All-NaN (or empty) input: defer to np.argmin so the result / error
        # is the same as it was before NaNs were handled.
        return np.argmin(values)
    return np.nanargmin(values)

In [7]:
# Distance of every sentence to its question, then the position of the closest one.
train["cosine_sim"] = train.apply(cosine_sim, axis=1)
train["pred_idx_cos"] = train["cosine_sim"].apply(pred_idx)

In [8]:
train['sentences'][0]

['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress.',
 "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.",
 "Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time.",
 'Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".']

In [9]:
def accuracy(target, predicted):
    """Fraction of positions where ``predicted`` equals ``target``.

    Both arguments are compared element-wise with ``==``, so they are
    expected to be equal-length pandas Series or NumPy arrays.
    """
    n_correct = (target == predicted).sum()
    n_total = len(target)
    return n_correct / n_total
print(accuracy(train["target"], train["pred_idx_cos"]))

0.6644782308223911


#### Extract the answer from the target sentence

In [10]:
def to_nltk_tree(node):
    """Recursively turn a parsed token into an ``nltk.Tree``.

    Parameters
    ----------
    node : token-like object
        Must expose ``n_lefts``, ``n_rights``, ``orth_`` and ``children``.

    Returns
    -------
    nltk.Tree or str
        A ``Tree`` labelled with the token text whose children are the
        converted child tokens; a token with no left or right children is
        returned as its plain text instead.
    """
    # Leaf tokens are represented by their text alone.
    if node.n_lefts + node.n_rights <= 0:
        return node.orth_
    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(node.orth_, subtrees)

In [11]:
# Column position 2 holds the question text here (the printed tree is the first
# question), so this draws the parse tree of that question.
# pretty_print() only prints and returns None, hence the [None] echoed by the cell.
[to_nltk_tree(sent.root).pretty_print()  for sent in nlp(train.iloc[0,2]).sents]

          start              
  __________|___________      
 |    |     |     |  becoming
 |    |     |     |     |     
When did Beyonce  ?  popular 



[None]

In [23]:
train.iloc[0,4]

['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress.',
 "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.",
 "Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time.",
 'Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".']

In [27]:
for sent in nlp(train.iloc[0,2].lower()).sents:
    print(sent)

when did beyonce start becoming popular?


In [30]:
# Scratch check: the stemmer receives the str() of the whole sentence list as a
# single string rather than individual tokens.
st.stem(str([sent for sent in nlp(train.iloc[0,2]).sents]))

'[when did beyonce start becoming popular?]'

In [26]:
for sent in nlp("Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.".lower()).sents:
    print(sent)

born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r&b girl-group destiny's child.


In [18]:
def match_roots(x):
    """Find sentences whose noun-chunk heads share the question's root verb.

    The predicted sentence (``x["sentences"][x["pred_idx_cos"]]``) is parsed
    with spaCy. For every spaCy sentence in it, the stemmed heads of its noun
    chunks are compared with the stemmed syntactic root of the question. When
    they match, the indices of all entries in ``x["sentences"]`` that contain
    that spaCy sentence (case-insensitively) are collected.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``x["question"]`` (str), ``x["pred_idx_cos"]`` (index of
        the predicted sentence) and ``x["sentences"]`` (a list of sentence
        strings, or the string repr of such a list as read back from a CSV).

    Returns
    -------
    list of int
        Indices into ``x["sentences"]``; empty when nothing matches or the
        question cannot be parsed into at least one sentence.
    """
    sentence_list = x["sentences"]
    # A column read back from CSV holds the list as text; an in-memory column
    # already holds a list, which ast.literal_eval would reject.
    if isinstance(sentence_list, str):
        sentence_list = ast.literal_eval(sentence_list)

    question_sents = list(nlp(x["question"].lower()).sents)
    if not question_sents:
        return []
    question_root = st.stem(str(question_sents[0].root))

    # Index the sentence list, not the raw context string: indexing the string
    # would yield a single character and never produce a match.
    predicted_sentence = sentence_list[x["pred_idx_cos"]]
    lowered_sentences = [item.lower() for item in sentence_list]

    matches = []
    for sent in nlp(predicted_sentence.lower()).sents:
        roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]
        if question_root in roots:
            sent_text = str(sent)
            matches.extend(
                k for k, item in enumerate(lowered_sentences) if sent_text in item
            )
    return matches

In [19]:
train["root_match_idx"] = train.apply(match_roots, axis = 1)

In [20]:
train.head(4)

Unnamed: 0.1,Unnamed: 0,context,question,text,sentences,target,s_emb,q_emb,cosine_sim,pred_idx_cos,root_match_idx
0,0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0030430004, -0.067105405, -0.041505888, 0....","[0.046378605, -0.04540641, -0.05582417, 0.0166...","[0.4551882743835449, 0.6188368201255798, 0.541...",0,[]
1,1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0030430004, -0.067105405, -0.041505888, 0....","[0.09421391, -0.01728372, 0.024937417, 0.06089...","[0.43326061964035034, 0.532273679971695, 0.565...",0,[]
2,2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,3,"[[0.0030430004, -0.067105405, -0.041505888, 0....","[0.034041032, -0.005861517, 0.005615792, 0.002...","[0.4596518874168396, 0.2857998013496399, 0.562...",1,[]
3,3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0030430004, -0.067105405, -0.041505888, 0....","[0.07113294, -0.00012752412, -0.0027405985, 0....","[0.40706610679626465, 0.522320032119751, 0.562...",0,[]
