# Common stuff

In [26]:
#Common imports 
import pandas as pd
from IPython.display import Markdown, display, clear_output
from scipy import stats
from IPython.core.debugger import set_trace
from pathlib import Path

## Pickling

In [27]:
import _pickle as cPickle
from pathlib import Path

def dumpPickle(fileName, content):
    """Serialize `content` to the file at `fileName`.

    Args:
        fileName: Path of the file to create or overwrite.
        content: Any picklable object.

    Raises:
        OSError: If the file cannot be opened for writing.
        pickle.PicklingError: If `content` cannot be pickled.
    """
    # The context manager closes the handle even when dump() raises, so a
    # failed dump no longer leaks an open file descriptor.
    with open(fileName, 'wb') as pickleFile:
        # Protocol -1 selects the highest protocol available.
        cPickle.dump(content, pickleFile, -1)

def loadPickle(fileName):
    """Load and return the object pickled in the file at `fileName`.

    Args:
        fileName: Path of an existing pickle file.

    Returns:
        The deserialized object.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # this notebook (or another trusted source) produced.
    # The context manager closes the handle even when load() raises.
    with open(fileName, 'rb') as file:
        return cPickle.load(file)
    
def pickleExists(fileName):
    """Return True when `fileName` points to an existing regular file, else False."""
    return Path(fileName).is_file()

#  Reading the dataset

In [28]:
# Load the SQuAD v1.1 train and dev files. Each row of the resulting frame
# holds one article dict (title + paragraphs) in `data` plus the `version`.
# `orient='columns'` is the documented spelling; the previous 'column' is not
# one of the values read_json lists for `orient`.
train = pd.read_json('../data/squad-v1/train-v1.1.json', orient='columns')
dev = pd.read_json('../data/squad-v1/dev-v1.1.json', orient='columns')

# Stack both splits into one frame with a fresh 0..n-1 index.
df = pd.concat([train, dev], ignore_index=True)

In [29]:
df.head()

Unnamed: 0,data,version
0,"{'title': 'University_of_Notre_Dame', 'paragra...",1.1
1,"{'title': 'Beyoncé', 'paragraphs': [{'context'...",1.1
2,"{'title': 'Montana', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Genocide', 'paragraphs': [{'context...",1.1
4,"{'title': 'Antibiotics', 'paragraphs': [{'cont...",1.1


In [30]:
# Context string of the first paragraph of the first article; displayed below.
text = df['data'][0]['paragraphs'][0]['context']
text

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [31]:
import spacy
from spacy import displacy
from collections import Counter
nlp = spacy.load('en_core_web_md')

In [32]:
doc = nlp(text)

In [33]:
# Character length of every sentence spaCy detected, one per line with a
# blank line after each.
for sentence in doc.sents:
    print(len(sentence.text), end='\n\n')

53

73

142

62

87

128

144



In [34]:
# Print every token of the parsed paragraph, one per line.
for token in doc:
    print(token)

Architecturally
,
the
school
has
a
Catholic
character
.
Atop
the
Main
Building
's
gold
dome
is
a
golden
statue
of
the
Virgin
Mary
.
Immediately
in
front
of
the
Main
Building
and
facing
it
,
is
a
copper
statue
of
Christ
with
arms
upraised
with
the
legend
"
Venite
Ad
Me
Omnes
"
.
Next
to
the
Main
Building
is
the
Basilica
of
the
Sacred
Heart
.
Immediately
behind
the
basilica
is
the
Grotto
,
a
Marian
place
of
prayer
and
reflection
.
It
is
a
replica
of
the
grotto
at
Lourdes
,
France
where
the
Virgin
Mary
reputedly
appeared
to
Saint
Bernadette
Soubirous
in
1858
.
At
the
end
of
the
main
drive
(
and
in
a
direct
line
that
connects
through
3
statues
and
the
Gold
Dome
)
,
is
a
simple
,
modern
stone
statue
of
Mary
.


In [35]:
# Print every noun chunk spaCy found in the paragraph, one per line.
for noun_chunk in doc.noun_chunks:
    print(noun_chunk)

the school
a Catholic character
the Main Building's gold dome
a golden statue
the Virgin Mary
front
the Main Building
it
a copper statue
Christ
arms
the legend
Venite Ad Me Omnes
the Main Building
the Basilica
the Sacred Heart
the basilica
the Grotto
a Marian place
prayer
reflection
It
a replica
the grotto
Lourdes
France
the Virgin Mary
Saint Bernadette Soubirous
the end
the main drive
a direct line
3 statues
the Gold Dome
a simple, modern stone statue
Mary


In [36]:
# Print every named entity spaCy found in the paragraph, one per line.
for ne in doc.ents:
    print(ne)

Catholic
the Main Building's
the Virgin Mary
the Main Building
Christ
Venite Ad Me Omnes
the Main Building
the Basilica of the Sacred Heart
Grotto
Marian
Lourdes
France
Mary
Saint Bernadette Soubirous
1858
3
the Gold Dome
Mary


In [37]:
def NerForWord(text):
    """Return the entity label of the first named entity found in `text`.

    Runs `text` through the module-level `nlp` pipeline. Returns '' when no
    entity is detected.
    """
    entities = nlp(text).ents
    #TODO - Could potentially find multiple entities in the text. We're returning only the first one.
    return entities[0].label_ if len(entities) > 0 else ''

In [38]:
NerForWord('Venite Ad Me Omnes')

'PERSON'

In [39]:
# For each token print: text, lemma, coarse POS, fine-grained tag, dependency
# label, shape, and the is_alpha / is_stop flags.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

Architecturally architecturally ADV RB advmod Xxxxx True False
, , PUNCT , punct , False False
the the DET DT det xxx True False
school school NOUN NN nsubj xxxx True False
has have VERB VBZ ROOT xxx True False
a a DET DT det x True False
Catholic catholic ADJ JJ amod Xxxxx True False
character character NOUN NN dobj xxxx True False
. . PUNCT . punct . False False
Atop atop ADP IN prep Xxxx True False
the the DET DT det xxx True False
Main main PROPN NNP compound Xxxx True False
Building building PROPN NNP poss Xxxxx True False
's 's PART POS case 'x False False
gold gold NOUN NN compound xxxx True False
dome dome NOUN NN nsubj xxxx True False
is be VERB VBZ ROOT xx True False
a a DET DT det x True False
golden golden ADJ JJ amod xxxx True False
statue statue NOUN NN attr xxxx True False
of of ADP IN prep xx True False
the the DET DT det xxx True False
Virgin virgin PROPN NNP compound Xxxxx True False
Mary mary PROPN NNP pobj Xxxx True False
. . PUNCT . punct . False False
Immediately 

In [40]:
spacy.explain('NNP')

'noun, proper singular'

# Extracting words and their features

In [61]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_md')

#There seems to be a bug with spacy's stop words, so set the is_stop flag
#on the vocabulary entries by hand.
from spacy.lang.en.stop_words import STOP_WORDS
for word in STOP_WORDS:
    # Flag the lowercase, Capitalized and UPPERCASE spelling of each stop word.
    # The capitalized form must keep the rest of the word: `word[0].capitalize()`
    # produced just the first letter (e.g. "T"), so "The" was never flagged.
    for w in (word, word[0].upper() + word[1:], word.upper()):
        lex = nlp.vocab[w]
        lex.is_stop = True

## Extracting words from a paragraph

In [42]:
# Context string and question/answer list of the first paragraph of the first article.
text = df['data'][0]['paragraphs'][0]['context']
qas = df['data'][0]['paragraphs'][0]['qas']

In [43]:
doc = nlp(text)

In [44]:
#Extract answers and the sentence they are in
# Each entry is {'sentenceId': index of the containing sentence, 'text': answer text}.
# Only the first answer of every question is considered.
answers = []

for senId, sentence in enumerate(doc.sents):
    # Use spaCy's own character offsets. Summing len(sentence.text) skips the
    # whitespace between sentences, so a running total drifts away from the
    # character-based `answer_start` a little more with every sentence.
    senStart = sentence.start_char
    senEnd = sentence.end_char

    for answer in qas:
        firstAnswer = answer['answers'][0]

        if senStart <= firstAnswer['answer_start'] < senEnd:
            answers.append({'sentenceId': senId, 'text': firstAnswer['text']})

In [45]:
answers

[{'sentenceId': 1, 'text': 'a golden statue of the Virgin Mary'},
 {'sentenceId': 2, 'text': 'a copper statue of Christ'},
 {'sentenceId': 3, 'text': 'the Main Building'},
 {'sentenceId': 4, 'text': 'a Marian place of prayer and reflection'},
 {'sentenceId': 5, 'text': 'Saint Bernadette Soubirous'}]

In [137]:
#TODO - Clean answers from stopwords?

In [134]:
def tokenIsAnswer(token, sentenceId):
    """Return True when `token` equals the text of an answer in sentence `sentenceId`.

    Looks through the module-level `answers` list. Both the sentence id and
    the full answer text must match exactly.
    """
    return any(
        candidate['sentenceId'] == sentenceId and candidate['text'] == token
        for candidate in answers
    )

In [136]:
tokenIsAnswer('the Main Building', 4)

False

In [144]:
# Map the index of each named entity's first token to the entity span, so
# "does an entity start at token i?" is a single dict lookup.
neStarts = {ne.start: ne for ne in doc.ents}

In [140]:
if 6 in neStarts:
    print(neStarts[6].label_)

NORP


In [125]:
# Document-level index of the first token of every sentence, in document order.
senStarts = [sentence[0].i for sentence in doc.sents]
    
def getSentenceForWordPosition(wordPos):
    """Return the index of the sentence that contains token position `wordPos`.

    Relies on the module-level `senStarts` list, which holds the first-token
    index of every sentence in ascending order.

    Args:
        wordPos: Token index within the document.

    Returns:
        The index of the last sentence whose start is <= `wordPos`. Positions
        before the first start map to sentence 0.
    """
    from bisect import bisect_right

    # bisect_right counts how many sentence starts are <= wordPos; the
    # containing sentence is the last of those. The previous loop returned its
    # own loop variable after the loop, which was unbound (UnboundLocalError)
    # whenever there was only one sentence.
    return max(bisect_right(senStarts, wordPos) - 1, 0)

In [126]:
senStarts

[0, 9, 25, 55, 68, 84, 108]

In [131]:
getSentenceForWordPosition(108)

6

In [178]:
# Column layout shared by every word-feature frame built in this notebook.
wordColums = [
    'text',
    'isAnswer',
    'titleId',
    'paragrapghId',
    'sentenceId',
    'wordCount',
    'NER',
    'POS',
    'TAG',
    'DEP',
    'shape',
]

# Empty frame with that layout.
wordDf = pd.DataFrame(columns=wordColums)

# Planned steps, not implemented yet: save to pickle, load df.

# Example row in the column order above; it is not appended (see the
# commented-out line), so `newWords` starts out empty.
newWord = ['koala', True, 0, 0, 4, 1, None, None, None, None, 'xxxxx']
newWords = list()
#newWords.append(newWord)

# Turn the collected rows into a frame and display it.
# Planned step, not implemented yet: merge dataframes.
newWordsDf = pd.DataFrame(data=newWords, columns=wordColums)
newWordsDf

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape


In [179]:
# Walk the document token by token and append one feature row per candidate
# word to the existing `newWords` list. Row layout follows `wordColums`:
# text, isAnswer, titleId, paragrapghId, sentenceId, wordCount, NER, POS, TAG,
# DEP, shape. titleId and paragrapghId are hard-coded to 0 here.
i = 0
while i < len(doc):
    #If the token is a start of a Named Entity, add it and push the index to the end of the NE
    if i in neStarts:
        word = neStarts[i]
        currentSentence = getSentenceForWordPosition(word.start)
        wordLen = word.end - word.start
        # Space-separated shapes of the entity's tokens. Joining (instead of
        # prepending ' ' to every piece) avoids a stray leading space, so the
        # value is formatted like the single-token shapes below.
        shape = ' '.join(token.shape_ for token in word)

        newWords.append([word.text,
                         tokenIsAnswer(word.text, currentSentence),
                         0,
                         0,
                         currentSentence,
                         wordLen,
                         word.label_,
                         None,
                         None,
                         None,
                         shape])
        # Resume at the first token after the entity.
        i = word.end
        continue

    #If not a NE, add the word if it's not a stopword or a non-alpha (not regular letters)
    word = doc[i]
    if not word.is_stop and word.is_alpha:
        currentSentence = getSentenceForWordPosition(i)
        wordLen = 1

        newWords.append([word.text,
                         tokenIsAnswer(word.text, currentSentence),
                         0,
                         0,
                         currentSentence,
                         wordLen,
                         None,
                         word.pos_,
                         word.tag_,
                         word.dep_,
                         word.shape_])
    i += 1


In [180]:
newWordsDf = pd.DataFrame(newWords, columns=wordColums)
newWordsDf

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape
0,Architecturally,False,0,0,0,1,,ADV,RB,advmod,Xxxxx
1,school,False,0,0,0,1,,NOUN,NN,nsubj,xxxx
2,Catholic,False,0,0,0,1,NORP,,,,Xxxxx
3,character,False,0,0,0,1,,NOUN,NN,dobj,xxxx
4,Atop,False,0,0,1,1,,ADP,IN,prep,Xxxx
5,the Main Building's,False,0,0,1,4,FAC,,,,xxx Xxxx Xxxxx 'x
6,gold,False,0,0,1,1,,NOUN,NN,compound,xxxx
7,dome,False,0,0,1,1,,NOUN,NN,nsubj,xxxx
8,golden,False,0,0,1,1,,ADJ,JJ,amod,xxxx
9,statue,False,0,0,1,1,,NOUN,NN,attr,xxxx
