In [34]:
import pandas as pd
from IPython.display import Markdown, display, clear_output
from scipy import stats
from pathlib import Path

## Pickling

In [3]:
import _pickle as cPickle
from pathlib import Path

def dumpPickle(fileName, content):
    pickleFile = open(fileName, 'wb')
    cPickle.dump(content, pickleFile, -1)
    pickleFile.close()

def loadPickle(fileName):    
    file = open(fileName, 'rb')
    content = cPickle.load(file)
    file.close()
    
    return content
    
def pickleExists(fileName):
    file = Path(fileName)
    
    if file.is_file():
        return True
    
    return False

In [4]:
#Displaying the percentage completed
def printProgress(currentStep, maxStep):
    stepSize = maxStep / 100
    
    if (int(currentStep / stepSize) > ((currentStep - 1) / stepSize)):
        clear_output()
        print('{}%'.format(int(currentStep / stepSize)))

## Reading the data

In [10]:
train = pd.read_json('data/squad/train-v1.1.json', orient='column')
dev = pd.read_json('data/squad/dev-v1.1.json', orient='column')

In [11]:
train.head()

Unnamed: 0,data,version
0,"{'title': 'University_of_Notre_Dame', 'paragra...",1.1
1,"{'title': 'Beyoncé', 'paragraphs': [{'context'...",1.1
2,"{'title': 'Montana', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Genocide', 'paragraphs': [{'context...",1.1
4,"{'title': 'Antibiotics', 'paragraphs': [{'cont...",1.1


# Extracting answers and features

In [5]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_md')

#There seems to be a bug with spacy's stop words.
from spacy.lang.en.stop_words import STOP_WORDS
for word in STOP_WORDS:
    for w in (word, word[0].capitalize(), word.upper()):
        lex = nlp.vocab[w]
        lex.is_stop = True

In [13]:
currText = train['data'][0]['paragraphs'][0]['context']
currQas = train['data'][0]['paragraphs'][0]['qas']

currDoc = nlp(currText)

In [14]:
#Extract answers and the sentence they are in
def extractAnswers(qas, doc):
    answers = []

    senStart = 0
    senId = 0

    for sentence in doc.sents:
        senLen = len(sentence.text)

        for answer in qas:
            answerStart = answer['answers'][0]['answer_start']

            if (answerStart >= senStart and answerStart < (senStart + senLen)):
                answers.append({'sentenceId': senId, 'text': answer['answers'][0]['text']})

        senStart += senLen
        senId += 1
    
    return answers

In [15]:
currAnswers = extractAnswers(currQas, currDoc)
currAnswers

[{'sentenceId': 1, 'text': 'a golden statue of the Virgin Mary'},
 {'sentenceId': 2, 'text': 'a copper statue of Christ'},
 {'sentenceId': 3, 'text': 'the Main Building'},
 {'sentenceId': 4, 'text': 'a Marian place of prayer and reflection'},
 {'sentenceId': 5, 'text': 'Saint Bernadette Soubirous'}]

In [16]:
#TODO - Clean answers from stopwords?
def tokenIsAnswer(token, sentenceId, answers):
    for i in range(len(answers)):
        if (answers[i]['sentenceId'] == sentenceId):
            if (answers[i]['text'] == token):
                return True
    return False

In [17]:
tokenIsAnswer('the Main Building', 4, currAnswers)

False

In [18]:
#Save named entities start points

def getNEStartIndexs(doc):
    neStarts = {}
    for ne in doc.ents:
        neStarts[ne.start] = ne
        
    return neStarts 

In [19]:
currNeStarts = getNEStartIndexs(currDoc)

if 6 in currNeStarts:
    print(currNeStarts[6].label_)

NORP


In [20]:
def getSentenceStartIndexes(doc):
    senStarts = []
    
    for sentence in doc.sents:
        senStarts.append(sentence[0].i)
    
    return senStarts
    
def getSentenceForWordPosition(wordPos, senStarts):
    for i in range(1, len(senStarts)):
        if (wordPos < senStarts[i]):
            return i - 1

In [21]:
senStarts = getSentenceStartIndexes(currDoc)
senStarts

[0, 9, 25, 55, 68, 84, 108]

In [22]:
getSentenceForWordPosition(108, senStarts)

In [23]:
#Creating the dataframe
wordColums = ['text', 'isAnswer', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape']
wordDf = pd.DataFrame(columns=wordColums)

#Save to pickle

#load df

#Add new words to array
newWord = ['koala', True, 0, 0, 4, 1, None, None, None, None, 'xxxxx']
newWords = []
#newWords.append(newWord)

#Make array to dataframe
newWordsDf = pd.DataFrame(newWords, columns=wordColums)
newWordsDf

#Merge dataframes

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape


In [24]:
def addWordsForParagrapgh(newWords, titleId, paragraphId, df):
    text = df['data'][titleId]['paragraphs'][paragraphId]['context']
    qas = df['data'][titleId]['paragraphs'][paragraphId]['qas']

    doc = nlp(text)

    answers = extractAnswers(qas, doc)
    neStarts = getNEStartIndexs(doc)
    senStarts = getSentenceStartIndexes(doc)
    
    #index of word in spacy doc text
    i = 0
    
    while (i < len(doc)):
        #If the token is a start of a Named Entity, add it and push to index to end of the NE
        if (i in neStarts):
            word = neStarts[i]
            #add word
            currentSentence = getSentenceForWordPosition(word.start, senStarts)
            wordLen = word.end - word.start
            shape = ''
            for wordIndex in range(word.start, word.end):
                shape += (' ' + doc[wordIndex].shape_)

            newWords.append([word.text,
                            tokenIsAnswer(word.text, currentSentence, answers),
                            titleId,
                            paragraphId,
                            currentSentence,
                            wordLen,
                            word.label_,
                            None,
                            None,
                            None,
                            shape])
            i = neStarts[i].end - 1
        #If not a NE, add the word if it's not a stopword or a non-alpha (not regular letters)
        else:
            if (doc[i].is_stop == False and doc[i].is_alpha == True):
                word = doc[i]

                currentSentence = getSentenceForWordPosition(i, senStarts)
                wordLen = 1

                newWords.append([word.text,
                                tokenIsAnswer(word.text, currentSentence, answers),
                                titleId,
                                paragraphId,
                                currentSentence,
                                wordLen,
                                None,
                                word.pos_,
                                word.tag_,
                                word.dep_,
                                word.shape_])
        i += 1


In [25]:
newWords

[]

In [27]:
addWordsForParagrapgh(newWords, 0, 0, train)

In [28]:
newWords[0]

['Architecturally', False, 0, 0, 0, 1, None, 'ADV', 'RB', 'advmod', 'Xxxxx']

In [29]:
newWordsDf = pd.DataFrame(newWords, columns=wordColums)
newWordsDf.head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape
0,Architecturally,False,0,0,0.0,1,,ADV,RB,advmod,Xxxxx
1,school,False,0,0,0.0,1,,NOUN,NN,nsubj,xxxx
2,Catholic,False,0,0,0.0,1,NORP,,,,Xxxxx
3,character,False,0,0,0.0,1,,NOUN,NN,dobj,xxxx
4,Atop,False,0,0,1.0,1,,ADP,IN,prep,Xxxx


In [30]:
newWordsDf[newWordsDf['isAnswer'] == True].head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape
22,the Main Building,True,0,0,3.0,3,ORG,,,,xxx Xxxx Xxxxx
40,Saint Bernadette Soubirous,True,0,0,5.0,3,PERSON,,,,Xxxxx Xxxxx Xxxxx


In [35]:
words = []

#titlesCount = len(df['data'])
titlesCount = 2

for titleId in range(titlesCount):
    paragraphsCount = len(train['data'][titleId]['paragraphs'])
        
    printProgress(titleId, titlesCount - 1)

    for paragraphId in range(paragraphsCount):
        addWordsForParagrapgh(words, titleId, paragraphId, train)
        

100%


In [36]:
wordsDf = pd.DataFrame(words, columns=wordColums)
wordsDf.head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape
0,Architecturally,False,0,0,0.0,1,,ADV,RB,advmod,Xxxxx
1,school,False,0,0,0.0,1,,NOUN,NN,nsubj,xxxx
2,Catholic,False,0,0,0.0,1,NORP,,,,Xxxxx
3,character,False,0,0,0.0,1,,NOUN,NN,dobj,xxxx
4,Atop,False,0,0,1.0,1,,ADP,IN,prep,Xxxx


In [37]:
print("Total words for 2 articles:", len(wordsDf))

Total words for 2 articles: 9147


In [38]:
def generateWordsDF(pickleName, df):
    #If the dataframe is already generated, load it.
    if (pickleExists(pickleName)):
        print("Pickle found. Saved some time.")
        wordsDf = loadPickle(pickleName)
    else:
        #Extracting words
        words = []

        titlesCount = len(df['data'])

        for titleId in range(titlesCount):
            paragraphsCount = len(df['data'][titleId]['paragraphs'])

            printProgress(titleId, titlesCount - 1)

            for paragraphId in range(paragraphsCount):
                addWordsForParagrapgh(words, titleId, paragraphId, df)

        #Create the dataframe
        wordColums = ['text', 'isAnswer', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape']
        wordsDf = pd.DataFrame(words, columns=wordColums)

        #Pickle the result
        dumpPickle(pickleName, wordsDf)
        print("Result was not pickled. You had to wait.")

In [40]:
wordPickleName = 'pickles/trainWordsDf.pkl'
generateWordsDF(wordPickleName, train)

100%
Result was not pickled. You had to wait.


In [41]:
wordPickleName = 'pickles/devWordsDf.pkl'
generateWordsDF(wordPickleName, dev)

100%
Result was not pickled. You had to wait.
