In [5]:
import numpy as np, pandas as pd
import json
import ast 
import nltk
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')
from nltk import Tree
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [6]:
train = pd.read_csv("InferSent/encoder/data/train.tsv",sep='\t')

In [7]:
!pip 


Usage:   
  pip <command> [options]

Commands:
  install                     Install packages.
  download                    Download packages.
  uninstall                   Uninstall packages.
  freeze                      Output installed packages in requirements format.
  list                        List installed packages.
  show                        Show information about installed packages.
  check                       Verify installed packages have compatible dependencies.
  config                      Manage local and global configuration.
  search                      Search PyPI for packages.
  wheel                       Build wheels from your requirements.
  hash                        Compute hashes of package archives.
  completion                  A helper command used for command completion.
  help                        Show help for commands.

General Options:
  -h, --help                  Show help.
  --isolated                  Run pip in an isolated mode, ignor

In [8]:
train.shape

(4957, 5)

### Loading Embedding dictionary

In [9]:
with open("InferSent/encoder/data/dict_embeddings.pickle", "rb") as f:
    d_context_embeddings = pickle.load(f)

In [10]:
with open("InferSent/encoder/data/dict_embeddings_questions.pickle", "rb") as f:
    d_question_embeddings = pickle.load(f)

In [11]:
dict_context_emb = dict(d_context_embeddings)
dict_question_emb = dict(d_question_embeddings)

In [12]:
len(dict_context_emb), len(dict_question_emb)

(1326, 4835)

In [13]:
#for key,value in dict_emb.items():
#    print(value)

In [14]:
del d_context_embeddings, d_question_embeddings

## Data Processing

In [15]:
sentences = []
with open('InferSent/encoder/data/openbook.txt') as f:
    for line in f:
        sentences.append(line.strip())
print(len(sentences))        

1326


In [16]:
train['sentences'] = [sentences for i in train.index]

In [17]:
train['quest_emb'] = train['Question Stem'].apply(lambda x: dict_question_emb[x] if x \
                                                  in dict_question_emb else np.zeros(4096))

In [18]:
train['sent_emb'] = train['sentences'].apply(lambda x: [dict_context_emb[item][0] if item in\
                                                           dict_context_emb else np.zeros(4096) for item in x])

In [19]:
train.head(3)

Unnamed: 0,ID,Question Stem,Choices,Complete Question,Answer Key,sentences,quest_emb,sent_emb
0,7-980,The sun is responsible for,(A) puppies learning new tricks (B) children g...,The sun is responsible for (A) puppies learnin...,D,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.051545985, 0.03336649, -0.005079767, -0.00...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."
1,7-584,When standing miles away from Mount Rushmore,(A) the mountains seem very close (B) the moun...,When standing miles away from Mount Rushmore (...,D,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.045675803, 0.026302924, -0.030445106, -0.0...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."
2,7-870,When food is reduced in the stomach,(A) the mind needs time to digest (B) take a s...,When food is reduced in the stomach (A) the mi...,C,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.12085862, 0.0785711, 0.05753665, -0.025412...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."


In [20]:
train.shape

(4957, 8)

In [21]:
train.dropna(inplace=True)

In [22]:
train.head(10)

Unnamed: 0,ID,Question Stem,Choices,Complete Question,Answer Key,sentences,quest_emb,sent_emb
0,7-980,The sun is responsible for,(A) puppies learning new tricks (B) children g...,The sun is responsible for (A) puppies learnin...,D,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.051545985, 0.03336649, -0.005079767, -0.00...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."
1,7-584,When standing miles away from Mount Rushmore,(A) the mountains seem very close (B) the moun...,When standing miles away from Mount Rushmore (...,D,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.045675803, 0.026302924, -0.030445106, -0.0...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."
2,7-870,When food is reduced in the stomach,(A) the mind needs time to digest (B) take a s...,When food is reduced in the stomach (A) the mi...,C,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.12085862, 0.0785711, 0.05753665, -0.025412...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."
3,7-321,Stars are,(A) warm lights that float (B) made out of nit...,Stars are (A) warm lights that float (B) made ...,C,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.05572503, -0.056643672, -0.020228773, -0.0...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."
4,9-732,You can make a telescope with a,(A) straw (B) Glass (C) Candle (D) mailing tube,You can make a telescope with a (A) straw (B) ...,D,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.08625981, 0.028972307, 0.061029114, -0.020...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."
5,9-782,Poison causes harm to which of the following?,(A) a Tree (B) a robot (C) a house (D) a car,Poison causes harm to which of the following? ...,A,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.06612222, 0.12482706, 0.1083512, -0.042983...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."
6,9-87,an inherited characteristic found on all mamma...,(A) nails (B) teeth (C) shoes (D) fur,an inherited characteristic found on all mamma...,D,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.10247472, 0.11741416, 0.12547502, -0.03464...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."
7,155,What doesn't eliminate waste?,(A) plants (B) mushrooms (C) bacteria (D) robots,What doesn't eliminate waste? (A) plants (B) m...,D,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.027517647, -0.0104107335, -0.010333556, -0...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."
8,1046,As a car approaches you in the night,(A) the headlights become more intense (B) the...,As a car approaches you in the night (A) the h...,A,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.07863207, 0.09673026, 0.018998649, 0.06247...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."
9,7-637,When the weather changes as it does from Chris...,(A) the air may chill (B) the ground may freez...,When the weather changes as it does from Chris...,D,"[""A bee is a pollinating animal"", ""A bird is a...","[[0.06392588, 0.019071648, 0.13140805, 0.07327...","[[0.051334266, 0.032628123, 0.008248221, 0.062..."


## Predicted Cosine & Euclidean Index

In [23]:
def cosine_sim(x):
    li = []
    for item in x["sent_emb"]:
        li.append(spatial.distance.cosine(item,x["quest_emb"][0]))
    return li   

In [24]:
def pred_idx(distances):
    return np.argmin(distances)   

In [25]:
def predictions(train):
    
    train["cosine_sim"] = train.apply(cosine_sim, axis = 1)
    train["diff"] = (train["quest_emb"] - train["sent_emb"])**2
    train["euclidean_dis"] = train["diff"].apply(lambda x: list(np.sum(x, axis = 1)))
    del train["diff"]
    
    print("cosine start")
    
    train["pred_idx_cos"] = train["cosine_sim"].apply(lambda x: pred_idx(x))
    train["pred_idx_euc"] = train["euclidean_dis"].apply(lambda x: pred_idx(x))
    
    return train
    

In [None]:
predicted = predictions(train)

In [71]:
predicted.head(3)

NameError: name 'predicted' is not defined

In [23]:
predicted["cosine_sim"][0]

[0.424736299052452,
 0.36405004106069117,
 0.3477550016687636,
 0.3942415731988862,
 0.37102476524939887,
 0.1856902254140269,
 0.35192069116776403]

In [24]:
predicted["euclidean_dis"][0]

[14.563858, 15.262212, 17.398178, 14.272491, 13.339654, 9.336262, 15.720997]

## Accuracy

In [25]:
def accuracy(target, predicted):
    
    acc = (target==predicted).sum()/len(target)
    
    return acc

### Accuracy for  euclidean Distance

In [26]:
print(accuracy(predicted["target"], predicted["pred_idx_euc"]))

0.44856046941711


### Accuracy for Cosine Similarity

In [27]:
print(accuracy(predicted["target"], predicted["pred_idx_cos"]))

0.6338843352587958


In [44]:
predicted.to_csv("train_detect_sent.csv", index=None)

In [34]:
predicted.iloc[75207,:]

answer_start                                                    69
context          Both the vertical and dipole antennas are simp...
question                             Are basic antennas expensive?
text                                        relatively inexpensive
sentences        [Both the vertical and dipole antennas are sim...
target                                                           0
sent_emb         [[0.06494937, 0.03690031, 0.12519251, -0.02735...
quest_emb        [[0.031715073, 0.07947657, 0.030824697, 0.0126...
cosine_sim       [0.359188584685867, 0.4689117244223153, 0.4489...
euclidean_dis    [11.512397, 21.817242, 12.696278, 17.895185, 1...
pred_idx_cos                                                     0
pred_idx_euc                                                     0
Name: 75208, dtype: object

In [38]:
ct,k = 0,0
for i in range(predicted.shape[0]):
    if predicted.iloc[i,10] != predicted.iloc[i,5]:
        k += 1
        if predicted.iloc[i,11] == predicted.iloc[i,5]:
            ct += 1

In [39]:
ct, k

(5545, 32071)

### Combining Accuracy

In [47]:
label = []
for i in range(predicted.shape[0]):
    if predicted.iloc[i,10] == predicted.iloc[i,11]:
        label.append(predicted.iloc[i,10])
    else:
        label.append((predicted.iloc[i,10],predicted.iloc[i,10]))

In [66]:
ct = 0
for i in range(75206):
    item = predicted["target"][i]
    try:
        if label[i] == predicted["target"][i]: ct +=1
    except:
        if item in label[i]: ct +=1
            

In [68]:
ct/75206

0.6370901257878361

### Root Match

In [2]:
predicted = pd.read_csv("train_detect_sent.csv").reset_index(drop=True)

In [39]:
doc = en_nlp(predicted.iloc[0,1])

In [40]:
predicted.iloc[0,1]

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [49]:
predicted.iloc[0,2]

'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'

In [41]:
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_

In [42]:
[to_nltk_tree(sent.root).pretty_print()  for sent in en_nlp(predicted.iloc[0,2]).sents]

                  appear                             
  __________________|____________________________     
 |      |      |    |         |           |      in  
 |      |      |    |         |           |      |    
 |      |      |    To       Mary         in   France
 |      |      |    |      ___|_____      |      |    
did allegedly  ?   whom  the      Virgin 1858 Lourdes



[None]

In [50]:
[to_nltk_tree(sent.root) .pretty_print() for sent in doc.sents][5]

                    has                              
        _____________|_________________               
       |         |   |  school     character         
       |         |   |    |      ______|________      
Architecturally  ,   .   the    a            Catholic

                            is                           
  __________________________|________                     
 |           Atop                  statue                
 |            |                  ____|_________           
 |           dome               |    |         of        
 |    ________|______           |    |         |          
 |   |            Building      |    |        Mary       
 |   |     __________|______    |    |      ___|_____     
 .  gold the        Main    's  a  golden the      Virgin

                                                  is                                                   
  ________________________________________________|______________________                       

In [44]:
for sent in doc.sents:
    roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]
    print(roots)

['has', 'has']
['atop', 'is', 'of']
['in', 'of', 'fac', 'is', 'of', 'with', 'with', 'legend']
['to', 'is', 'of']
['behind', 'is', 'grotto', 'of', 'pray']
['is', 'is', 'of', 'at', 'lourd', 'appear', 'to']
['at', 'of', 'in', 'through', 'statu', 'is', 'of']


In [45]:
def match_roots(x):
    question = x["question"].lower()
    sentences = en_nlp(x["context"].lower()).sents
    
    question_root = st.stem(str([sent.root for sent in en_nlp(question).sents][0]))
    
    li = []
    for i,sent in enumerate(sentences):
        roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]

        if question_root in roots: 
            for k,item in enumerate(ast.literal_eval(x["sentences"])):
                if str(sent) in item.lower(): 
                    li.append(k)
    return li

In [46]:
predicted["question"][21493]

'The end of what road was once home to Newgate Prison?'

In [47]:
predicted["context"][21493]

"10th Street (40°44′03″N 74°00′11″W\ufeff / \ufeff40.7342580°N 74.0029670°W\ufeff / 40.7342580; -74.0029670) begins at the FDR Drive and Avenue C. West of Sixth Avenue, it turns southward about 40 degrees to join the Greenwich Village street grid and continue to West Street on the Hudson River. Because West 4th Street turns northward at Sixth Avenue, it intersects 10th, 11th and 12th and 13th Streets in the West Village. The M8 bus operates on 10th Street in both directions between Avenue D and Avenue A, and eastbound between West Street and Sixth Avenue. 10th Street has an eastbound bike lane from West Street to the East River. In 2009, the two-way section of 10th Street between Avenue A and the East River had bicycle markings and sharrows installed, but it still has no dedicated bike lane. West 10th Street was previously named Amos Street for Richard Amos. The end of West 10th Street toward the Hudson River was once the home of Newgate Prison, New York City's first prison and the Uni

In [48]:
predicted["root_match_idx"] = predicted.apply(match_roots, axis = 1)

In [51]:
predicted["root_match_idx_first"]= predicted["root_match_idx"].apply(lambda x: x[0] if len(x)>0 else 0)

In [52]:
(predicted["root_match_idx_first"]==predicted["target"]).sum()/predicted.shape[0]

0.3977488070503893

In [55]:
predicted.to_csv("train_detect_sent.csv", index=None)

In [53]:
predicted[(predicted["sentences"].apply(lambda x: len(ast.literal_eval(x)))<11) &  (predicted["root_match_idx_first"]>10)]       



Unnamed: 0,answer_start,context,question,text,sentences,quest_emb,target,sent_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc,root_match_idx,root_match_idx_first


In [23]:
len(ast.literal_eval(predicted.iloc[21493,4]))

7

In [59]:
question = predicted["question"][21493].lower()
sentences = en_nlp(predicted["context"][21493].lower()).sents
    
question_root = st.stem(str([sent.root for sent in en_nlp(question).sents][0]))
    
li = []
for i,sent in enumerate(sentences):
    roots = [st.stem(chunk.root.head.text.lower()) for chunk in sent.noun_chunks]
    print(roots)

    if question_root in roots: li.append(i)

['street']
['°']
[]
['°']
['°', '°']
[]
['at', 'driv', 'of', 'turn', 'turn', 'join', 'west', 'on']
['turn', 'at', 'intersect', 'intersect', 'in']
['op', 'on', 'in', 'between', 'street']
['has', 'has', 'from', 'to']
['had', 'of', 'a', 'had', 'had', 'has', 'has']
['nam']
['was', 'of', 'toward', 'was', 'of', 'hom']


In [4]:
ast.literal_eval(predicted["sentences"][21493])

['10th Street (40°44′03″N 74°00′11″W\ufeff / \ufeff40.7342580°N 74.0029670°W\ufeff / 40.7342580; -74.0029670) begins at the FDR Drive and Avenue C. West of Sixth Avenue, it turns southward about 40 degrees to join the Greenwich Village street grid and continue to West Street on the Hudson River.',
 'Because West 4th Street turns northward at Sixth Avenue, it intersects 10th, 11th and 12th and 13th Streets in the West Village.',
 'The M8 bus operates on 10th Street in both directions between Avenue D and Avenue A, and eastbound between West Street and Sixth Avenue.',
 '10th Street has an eastbound bike lane from West Street to the East River.',
 'In 2009, the two-way section of 10th Street between Avenue A and the East River had bicycle markings and sharrows installed, but it still has no dedicated bike lane.',
 'West 10th Street was previously named Amos Street for Richard Amos.',
 "The end of West 10th Street toward the Hudson River was once the home of Newgate Prison, New York City's

In [5]:
predicted["context"][21493]

"10th Street (40°44′03″N 74°00′11″W\ufeff / \ufeff40.7342580°N 74.0029670°W\ufeff / 40.7342580; -74.0029670) begins at the FDR Drive and Avenue C. West of Sixth Avenue, it turns southward about 40 degrees to join the Greenwich Village street grid and continue to West Street on the Hudson River. Because West 4th Street turns northward at Sixth Avenue, it intersects 10th, 11th and 12th and 13th Streets in the West Village. The M8 bus operates on 10th Street in both directions between Avenue D and Avenue A, and eastbound between West Street and Sixth Avenue. 10th Street has an eastbound bike lane from West Street to the East River. In 2009, the two-way section of 10th Street between Avenue A and the East River had bicycle markings and sharrows installed, but it still has no dedicated bike lane. West 10th Street was previously named Amos Street for Richard Amos. The end of West 10th Street toward the Hudson River was once the home of Newgate Prison, New York City's first prison and the Uni

In [14]:
en_nlp = spacy.load('en')
sentences = en_nlp(predicted["context"][21493].lower()).sents

In [15]:
for item in sentences:
    print(item)

10th street
(40°44′03″n 74°00′11″w﻿
/
﻿40.7342580°n
74.0029670°w﻿
/ 40.7342580;
-74.0029670) begins at the fdr drive and avenue c. west of sixth avenue, it turns southward about 40 degrees to join the greenwich village street grid and continue to west street on the hudson river.
because west 4th street turns northward at sixth avenue, it intersects 10th, 11th and 12th and 13th streets in the west village.
the m8 bus operates on 10th street in both directions between avenue d and avenue a, and eastbound between west street and sixth avenue.
10th street has an eastbound bike lane from west street to the east river.
in 2009, the two-way section of 10th street between avenue a and the east river had bicycle markings and sharrows installed, but it still has no dedicated bike lane.
west 10th street was previously named amos street for richard amos.
the end of west 10th street toward the hudson river was once the home of newgate prison, new york city's first prison and the united states' seco

In [6]:
TfidfVectorizer(predicted["sentences"][0], ngram_range=(1,2))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8',
        input='[\'Architecturally, the school has a Catholic character.\', "Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary.", \'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".\', \'Next ...ne that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.\']',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)