# Imports

In [1]:
import pandas as pd
import os

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
from IPython.display import Markdown, display, clear_output

def printBold(string):
    display(Markdown('**' + string + '**'))

# Printing data

In [4]:
def boldAnswers(titleId, paragraphId):

    title = df['data'][titleId]['title']
    paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
    
    # Collect the start index and end index of each answer, and then sort it
    answerRangeTuple = []
        
    for questionId in range(len( df['data'][titleId]['paragraphs'][paragraphId]['qas'])):
        answerStart = df['data'][titleId]['paragraphs'][paragraphId]['qas'][questionId]['answers'][0]['answer_start']
        answer = df['data'][titleId]['paragraphs'][paragraphId]['qas'][questionId]['answers'][0]['text']
        
        answerRangeTuple.append((answerStart, answerStart + len(answer)))
        
    answerRangeTuple.sort(key=lambda x: x[0], reverse=False)

    # Add the indices to be split by
    indices = [0] 
    
    for a in answerRangeTuple:
        indices.append(a[0])
        indices.append(a[1])
            
    # Split text by answers
    parts = [paragraph[i:j] for i,j in zip(indices, indices[1:]+[None])]
    
    # Append bold markdown symbols ('**') around each answer
    text = ""
    
    for i in range(len(parts)):
        if i % 2 == 0:
            text += parts[i]
        else:
            text += ' **' + parts[i] + '** '

    # Print
    printBold(title)
    print()
    display(Markdown(text))

In [5]:
def showQuestions(titleId, paragraphId):

    title = df['data'][titleId]['title']
    paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']

    printBold('Questions')

    for questionId in range(len(df['data'][titleId]['paragraphs'][paragraphId]['qas'])):
        question = df['data'][titleId]['paragraphs'][paragraphId]['qas'][questionId]['question']
        answer = df['data'][titleId]['paragraphs'][paragraphId]['qas'][questionId]['answers'][0]['text']
        
        print(question)
        printBold(answer)

# Loading data

In [6]:
df = pd.read_json('../data/squad-v1/train-v1.1.json', orient='column')

In [7]:
titleId = 0
paragraphId = 0

title = df['data'][titleId]['title']
paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
qas = df['data'][titleId]['paragraphs'][paragraphId]['qas']

In [8]:
doc = nlp(paragraph)

In [9]:
for sent in doc.sents:
    print(sent)

Architecturally, the school has a Catholic character.
Atop the Main Building's gold dome is a golden statue of the Virgin Mary.
Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".
Next to the Main Building is the Basilica of the Sacred Heart.
Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.
It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.
At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


In [10]:
for qa in qas:
    print(qa['answers'][0]['answer_start'])

515
188
279
381
92


In [11]:
for sent in doc.sents:
    print(sent[0].idx)

0
54
128
271
334
422
551


# Functions


In [12]:

def get_answer_starts(titleId, paragraphId):
    qas = df['data'][titleId]['paragraphs'][paragraphId]['qas']
    
    answer_starts = []
    
    for qa in qas:
        answer_starts.append(qa['answers'][0]['answer_start'])
        
    return answer_starts
    

In [13]:
answer_starts = get_answer_starts(0,0)

In [14]:
def label_sentences(titleId, paragraphId):
    title = df['data'][titleId]['title']
    paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
    
    answer_starts = get_answer_starts(titleId, paragraphId)
    
    important = []
    unimportant = []
    
    doc = nlp(paragraph)
    
    for sent in doc.sents:
        sentStart = sent[0].idx
        sentEnd = doc[sent.end - 1].idx
        found = False
        
        for answer_start in answer_starts:
            if answer_start >= sentStart and answer_start < sentEnd:
                important.append(sent)
                found = True
                break
        
        if not found:
            unimportant.append(sent)
                
    return important, unimportant

# Testing

In [15]:
important, unimportant = label_sentences(42, 1)

In [16]:
important

[Schwarzenegger began weight training at the age of 15.,
 He won the Mr. Universe title at age 20 and went on to win the Mr. Olympia contest seven times.,
 His breakthrough film was the sword-and-sorcery epic Conan the Barbarian in 1982, which was a box-office hit and resulted in a sequel.,
 He was nicknamed the "Austrian Oak" in his bodybuilding days, "Arnie" during his acting career, and "The Governator" (a portmanteau of "Governor" and "The Terminator", one of his best-known movie roles).]

In [17]:
unimportant

[Schwarzenegger has remained a prominent presence in bodybuilding and has written many books and articles on the sport.,
 He is widely considered to be among the greatest bodybuilders of all times as well as its biggest icon.,
 Schwarzenegger gained worldwide fame as a Hollywood action film icon.,
 In 1984, he appeared in James Cameron's science-fiction thriller film The Terminator, which was a massive critical and box-office success.,
 Schwarzenegger subsequently reprised the Terminator character in the franchise's later installments in 1991, 2003, and 2015.,
 He appeared in a number of successful films, such as Commando (1985), The Running Man (1987), Predator (1987), Twins (1988), Total Recall (1990), Kindergarten Cop (1990) and True Lies (1994).]

In [18]:
titleId = 0
textId = 0

important, unimportant = label_sentences(titleId, paragraphId)

boldAnswers(titleId, textId)
showQuestions(titleId, textId)

printBold('Important:')
for sent in important:
    print(sent)

printBold('Unimportant:')
for sent in unimportant:
    print(sent)


**University_of_Notre_Dame**




Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is  **a golden statue of the Virgin Mary** . Immediately in front of the Main Building and facing it, is  **a copper statue of Christ**  with arms upraised with the legend "Venite Ad Me Omnes". Next to  **the Main Building**  is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto,  **a Marian place of prayer and reflection** . It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to  **Saint Bernadette Soubirous**  in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

**Questions**

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?


**Saint Bernadette Soubirous**

What is in front of the Notre Dame Main Building?


**a copper statue of Christ**

The Basilica of the Sacred heart at Notre Dame is beside to which structure?


**the Main Building**

What is the Grotto at Notre Dame?


**a Marian place of prayer and reflection**

What sits on top of the Main Building at Notre Dame?


**a golden statue of the Virgin Mary**

**Important:**

Atop the Main Building's gold dome is a golden statue of the Virgin Mary.
Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".
Next to the Main Building is the Basilica of the Sacred Heart.
Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.
It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.


**Unimportant:**

Architecturally, the school has a Catholic character.
At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


# Create dataset

## Generate sentences

In [84]:
sentDf = pd.DataFrame(columns=['sent', 'isImportant', 'titleId', 'paragraphId'])

In [85]:
new_row = {'sent': 'Koala', 'isImportant':True, 'titleId': 0, 'paragraphId': 0}

In [86]:
sentDf = sentDf.append(new_row, ignore_index=True)

In [87]:
sentDf

Unnamed: 0,sent,isImportant,titleId,paragraphId
0,Koala,True,0,0


In [88]:
doc = nlp('The koala is an arboreal herbivorous marsupial native to Australia. It is the only extant representative of the family Phascolarctidae and its closest living relatives are the wombats, which comprise the family Vombatidae')

In [89]:
for sent in doc.sents:
    print(sent.start)

0
11


In [90]:
sentDf = pd.DataFrame(columns=['sent', 'isImportant', 'titleId', 'paragraphId', 'start', 'end'])

In [91]:
titles = 10

important = []
unimportant = []

for titleId in range(titles):
    paragraphCount = len(df['data'][titleId]['paragraphs'])
    
    for paragraphId in range(paragraphCount):
        currImportant, currUnimportant = label_sentences(titleId, paragraphId)
        
        for sent in currImportant:
            new_row = {'sent': sent.text, 'isImportant':True, 'titleId': titleId, 'paragraphId': paragraphId, 'start': sent.start, 'end': sent.end}
            sentDf = sentDf.append(new_row, ignore_index=True)
            
        for sent in currUnimportant:
            new_row = {'sent': sent.text, 'isImportant':False, 'titleId': titleId, 'paragraphId': paragraphId,'start': sent.start, 'end': sent.end}
            sentDf = sentDf.append(new_row, ignore_index=True)
                       

In [92]:
sentDf.head()

Unnamed: 0,sent,isImportant,titleId,paragraphId,start,end
0,Atop the Main Building's gold dome is a golden...,True,0,0,9,25
1,Immediately in front of the Main Building and ...,True,0,0,25,55
2,Next to the Main Building is the Basilica of t...,True,0,0,55,68
3,"Immediately behind the basilica is the Grotto,...",True,0,0,68,84
4,"It is a replica of the grotto at Lourdes, Fran...",True,0,0,84,108


In [93]:
sentDf['isImportant'].value_counts()

True     1657
False    1084
Name: isImportant, dtype: int64

In [94]:
# print(len(important), len(unimportant))

## Pickle

In [95]:
import _pickle as cPickle
from pathlib import Path

def dumpPickle(fileName, content):
    pickleFile = open(fileName, 'wb')
    cPickle.dump(content, pickleFile, -1)
    pickleFile.close()

def loadPickle(fileName):    
    file = open(fileName, 'rb')
    content = cPickle.load(file)
    file.close()
    
    return content
    
def pickleExists(fileName):
    file = Path(fileName)
    
    if file.is_file():
        return True
    
    return False

## Create df

[(lambda x: x.text)(im) for im in important][:10]

importantDf = pd.DataFrame(data=[(lambda x: x.text)(im) for im in important], columns=['Sentence'])

importantDf.head()

In [25]:
# dumpPickle('../data/important-100.pkl', importantDf)

In [26]:
# unimportantDf = pd.DataFrame(data=[(lambda x: x.text)(un) for un in unimportant], columns=['Sentence'])

In [27]:
# dumpPickle('../data/unimportant-100.pkl', unimportantDf)

In [96]:
dumpPickle('../data/sentDf-start-end-10.pkl', sentDf)

# Features

## Named entities

In [97]:
totals = []

for i in range(len(sentDf)):
    totals.append(len(nlp(sentDf.iloc[i]['sent']).ents))

In [98]:
newDf = sentDf

In [99]:
len(totals)

2741

In [100]:
newDf['ne_count'] = totals

In [173]:
newDf

Unnamed: 0,sent,isImportant,titleId,paragraphId,start,end,ne_count
0,Atop the Main Building's gold dome is a golden...,True,0,0,9,25,1
1,Immediately in front of the Main Building and ...,True,0,0,25,55,3
2,Next to the Main Building is the Basilica of t...,True,0,0,55,68,3
3,"Immediately behind the basilica is the Grotto,...",True,0,0,68,84,2
4,"It is a replica of the grotto at Lourdes, Fran...",True,0,0,84,108,5
...,...,...,...,...,...,...,...
2736,"This prompted criticism of the board online, e...",True,9,41,37,49,0
2737,They also muted all profanity.,False,9,41,31,37,0
2738,A sequel to Spectre will begin development in ...,True,9,42,0,11,2
2739,Christoph Waltz has signed on for two more fil...,True,9,42,26,53,4


In [102]:
# def getNeCount(sentences):
#     count = 0
    
#     for sent in sentences:
#         count += len(sent.ents)
    
#     return count

In [103]:
# ne_avg_important = getNeCount(important)/len(important)
# ne_avg_unimportant = getNeCount(unimportant)/len(unimportant)

# print('Average count of named entities in important', ne_avg_important)
# print('Average count of named entities in unimportant', ne_avg_unimportant)

In [104]:
ne_avg_important = sum(newDf[newDf['isImportant']==True]['ne_count']) / len(newDf[newDf['isImportant']==True])
ne_avg_unimportant = sum(newDf[newDf['isImportant']==False]['ne_count']) / len(newDf[newDf['isImportant']==False])

print('Average count of named entities in important', ne_avg_important)
print('Average count of named entities in unimportant', ne_avg_unimportant)

Average count of named entities in important 3.5576342788171393
Average count of named entities in unimportant 2.407749077490775


## TF/IDF

In [105]:
## load freq

In [106]:
import math

In [107]:
def get_frequencies(doc):
    freqs = {}
    
    for token in doc:
        if token.is_punct:
            continue

        word = token.text.lower()

        if word in freqs:
            freqs[word] += 1
        else:
            freqs[word] = 0

    return freqs

In [157]:
def compute_tf(doc, startToken, endToken, currFreq):
    keywordWordCount = endToken - startToken + 1
   
    totalFreq = 0

    for i in range(startToken, endToken):
        word = doc[i].text.lower()

        totalFreq += 0 if word not in currFreq else currFreq[word]

    tf = (totalFreq / keywordWordCount) / len(currFreq)
    
    return tf

In [158]:
def compute_idf(doc, startToken, endToken, wordFreq):
    keywordWordCount = endToken - startToken + 1

    totalFreq = 0

    for i in range(startToken, endToken):  
        word = doc[i].text.lower()
        
        totalFreq += 0 if word not in wordFreq else wordFreq[word]

    # In case the word is not found, count is a being found only once. Avoiding division by zero.
    currFreq = 1 if totalFreq / keywordWordCount == 0 else totalFreq / keywordWordCount
    idf = math.log(len(wordFreq) / currFreq)

    return idf

In [159]:
def compute_tfidf(doc, startToken, endToken, currFreq, paragraphFreq):
    
    tf = compute_tf(doc, startToken, endToken, currFreq)

    idfParagraph = compute_idf(doc, startToken, endToken, paragraphFreq)

    # Results
    return tf * idfParagraph        

## Compute TF/IDF

In [160]:
paragraphFreq = loadPickle('../data/idf/word-occurances-paragraph.pkl')

In [161]:
titleId = 0
paragraphId = 0

paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
doc = nlp(paragraph)

currFreq = get_frequencies(doc)

In [167]:
compute_tfidf(doc, 1, 5, currFreq, paragraphFreq)

0.01580971297087969

## Compute for sentences

In [176]:
tfidfs = []

for i in range(len(sentDf)):
    titleId = sentDf.iloc[i]['titleId']
    paragraphId = sentDf.iloc[i]['paragraphId']
    start = sentDf.iloc[i]['start']
    end = sentDf.iloc[i]['end']
    
    paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
    doc = nlp(paragraph)

    currFreq = get_frequencies(doc)
    
    tfidf = compute_tfidf(doc, start, end, currFreq, paragraphFreq)
    
    tfidfs.append(tfidf) 

In [177]:
newDf['tfidf'] = tfidfs

In [179]:
newDf.head()

Unnamed: 0,sent,isImportant,titleId,paragraphId,start,end,ne_count,tfidf
0,Atop the Main Building's gold dome is a golden...,True,0,0,9,25,1,0.127637
1,Immediately in front of the Main Building and ...,True,0,0,25,55,3,0.081224
2,Next to the Main Building is the Basilica of t...,True,0,0,55,68,3,0.144167
3,"Immediately behind the basilica is the Grotto,...",True,0,0,68,84,2,0.10412
4,"It is a replica of the grotto at Lourdes, Fran...",True,0,0,84,108,5,0.081111


In [180]:
ne_avg_important = sum(newDf[newDf['isImportant']==True]['tfidf']) / len(newDf[newDf['isImportant']==True])
ne_avg_unimportant = sum(newDf[newDf['isImportant']==False]['tfidf']) / len(newDf[newDf['isImportant']==False])

print('Average count of named entities in important', ne_avg_important)
print('Average count of named entities in unimportant', ne_avg_unimportant)

Average count of named entities in important 0.04620313991023891
Average count of named entities in unimportant 0.046226850622545855
