# Imports

In [2]:
import pandas as pd
import os

In [15]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [96]:
from IPython.display import Markdown, display, clear_output

def printBold(string):
    """Render `string` in the notebook output as bold Markdown."""
    boldMarkup = '**' + string + '**'
    display(Markdown(boldMarkup))

# Printing data

In [98]:
def boldAnswers(titleId, paragraphId):
    """Display an article title and one of its paragraphs with the answers in bold.

    Reads the notebook-level `df`. For every question attached to the selected
    paragraph, the first listed answer is used; its character range in the
    paragraph text is `answer_start` to `answer_start + len(text)`.

    Args:
        titleId: Index into `df['data']` selecting the article.
        paragraphId: Index into that article's `paragraphs` list.

    Returns:
        None. The title is rendered with `printBold` and the marked-up
        paragraph with `display(Markdown(...))`.
    """
    article = df['data'][titleId]
    title = article['title']
    paragraphEntry = article['paragraphs'][paragraphId]
    paragraph = paragraphEntry['context']

    # (start, end) character range of the first answer to each question, in text order
    answerRanges = sorted(
        (qa['answers'][0]['answer_start'],
         qa['answers'][0]['answer_start'] + len(qa['answers'][0]['text']))
        for qa in paragraphEntry['qas']
    )

    # Several questions may share an answer or have overlapping answers. Merge such
    # ranges so the text is cut at strictly increasing positions; otherwise pieces
    # of the paragraph would be duplicated and the plain/bold alternation inverted.
    mergedRanges = []
    for start, end in answerRanges:
        if mergedRanges and start < mergedRanges[-1][1]:
            mergedRanges[-1][1] = max(mergedRanges[-1][1], end)
        else:
            mergedRanges.append([start, end])

    # Rebuild the paragraph, wrapping each answer range in bold markdown symbols ('**')
    pieces = []
    cursor = 0
    for start, end in mergedRanges:
        pieces.append(paragraph[cursor:start])
        pieces.append(' **' + paragraph[start:end] + '** ')
        cursor = end
    pieces.append(paragraph[cursor:])
    text = ''.join(pieces)

    # Print
    printBold(title)
    print()
    display(Markdown(text))

In [99]:
def showQuestions(titleId, paragraphId):
    """Print each question of a paragraph followed by its first answer in bold.

    Reads the notebook-level `df`.

    Args:
        titleId: Index into `df['data']` selecting the article.
        paragraphId: Index into that article's `paragraphs` list.

    Returns:
        None. Questions are written with `print`, answers with `printBold`.
    """
    # Look the question list up once instead of re-indexing `df` for every question
    qas = df['data'][titleId]['paragraphs'][paragraphId]['qas']

    printBold('Questions')

    for qa in qas:
        question = qa['question']
        answer = qa['answers'][0]['text']

        print(question)
        printBold(answer)

# Loading data

In [5]:
# Load the SQuAD v1.1 training file. 'columns' is the documented `orient` value for
# DataFrames (and the default); 'column' is not one of the listed options.
df = pd.read_json('../data/squad-v1/train-v1.1.json', orient='columns')

In [49]:
# Select the first article and its first paragraph for exploration
titleId = 0
paragraphId = 0

# Title, context text and question/answer entries of the selected paragraph
title = df['data'][titleId]['title']
paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
qas = df['data'][titleId]['paragraphs'][paragraphId]['qas']

In [16]:
# Run the loaded spaCy pipeline over the selected paragraph text
doc = nlp(paragraph)

In [37]:
# Print every sentence spaCy found in the paragraph, one per line
for sent in doc.sents:
    print(sent)

Architecturally, the school has a Catholic character.
Atop the Main Building's gold dome is a golden statue of the Virgin Mary.
Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".
Next to the Main Building is the Basilica of the Sacred Heart.
Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.
It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.
At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


In [38]:
# Print the `answer_start` value of the first answer to each question
for qa in qas:
    print(qa['answers'][0]['answer_start'])

515
188
279
381
92


In [48]:
# Print the character offset at which each sentence starts in the paragraph
for sent in doc.sents:
    print(sent.start_char)

0
54
128
271
334
422
551


# Functions


In [94]:

def get_answer_starts(titleId, paragraphId):
    """Return the `answer_start` of the first answer to each question of a paragraph.

    Reads the notebook-level `df`.

    Args:
        titleId: Index into `df['data']` selecting the article.
        paragraphId: Index into that article's `paragraphs` list.

    Returns:
        A list with one `answer_start` value per question, in question order.
    """
    paragraphEntry = df['data'][titleId]['paragraphs'][paragraphId]
    return [qa['answers'][0]['answer_start'] for qa in paragraphEntry['qas']]
    

In [64]:
# Answer start offsets for article 0, paragraph 0
answer_starts = get_answer_starts(0,0)

In [83]:
def label_sentences(titleId, paragraphId):
    """Split a paragraph into sentences and separate those in which an answer starts.

    Reads the notebook-level `df` and `nlp`, and uses `get_answer_starts` to
    obtain the start offset of the first answer to each question.

    Args:
        titleId: Index into `df['data']` selecting the article.
        paragraphId: Index into that article's `paragraphs` list.

    Returns:
        A tuple `(important, unimportant)` of lists of spaCy sentence spans:
        `important` holds the sentences in which at least one answer starts,
        `unimportant` holds all the others. Both keep the paragraph order.
    """
    paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
    answer_starts = get_answer_starts(titleId, paragraphId)

    important = []
    unimportant = []

    for sent in nlp(paragraph).sents:
        # Compare against the sentence's full character extent [start_char, end_char).
        # Using the offset of the last token as the upper bound would miss an answer
        # that begins at the final token of the sentence.
        containsAnswer = any(
            sent.start_char <= answer_start < sent.end_char
            for answer_start in answer_starts
        )

        if containsAnswer:
            important.append(sent)
        else:
            unimportant.append(sent)

    return important, unimportant

# Testing

In [90]:
# Label the sentences of article 42, paragraph 1
important, unimportant = label_sentences(42, 1)

In [95]:
# Sentences in which an answer starts
important

[Schwarzenegger began weight training at the age of 15.,
 He won the Mr. Universe title at age 20 and went on to win the Mr. Olympia contest seven times.,
 His breakthrough film was the sword-and-sorcery epic Conan the Barbarian in 1982, which was a box-office hit and resulted in a sequel.,
 He was nicknamed the "Austrian Oak" in his bodybuilding days, "Arnie" during his acting career, and "The Governator" (a portmanteau of "Governor" and "The Terminator", one of his best-known movie roles).]

In [92]:
# Sentences in which no answer starts
unimportant

[Schwarzenegger has remained a prominent presence in bodybuilding and has written many books and articles on the sport.,
 He is widely considered to be among the greatest bodybuilders of all times as well as its biggest icon.,
 Schwarzenegger gained worldwide fame as a Hollywood action film icon.,
 In 1984, he appeared in James Cameron's science-fiction thriller film The Terminator, which was a massive critical and box-office success.,
 Schwarzenegger subsequently reprised the Terminator character in the franchise's later installments in 1991, 2003, and 2015.,
 He appeared in a number of successful films, such as Commando (1985), The Running Man (1987), Predator (1987), Twins (1988), Total Recall (1990), Kindergarten Cop (1990) and True Lies (1994).]

In [108]:
# Article and paragraph to inspect
titleId = 0
textId = 0

# Label the same paragraph that is displayed below. Passing `textId` here (rather
# than a `paragraphId` left over from an earlier cell) keeps this cell self-contained.
important, unimportant = label_sentences(titleId, textId)

boldAnswers(titleId, textId)
showQuestions(titleId, textId)

printBold('Important:')
for sent in important:
    print(sent)

printBold('Unimportant:')
for sent in unimportant:
    print(sent)


**University_of_Notre_Dame**




Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is  **a golden statue of the Virgin Mary** . Immediately in front of the Main Building and facing it, is  **a copper statue of Christ**  with arms upraised with the legend "Venite Ad Me Omnes". Next to  **the Main Building**  is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto,  **a Marian place of prayer and reflection** . It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to  **Saint Bernadette Soubirous**  in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

**Questions**

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?


**Saint Bernadette Soubirous**

What is in front of the Notre Dame Main Building?


**a copper statue of Christ**

The Basilica of the Sacred heart at Notre Dame is beside to which structure?


**the Main Building**

What is the Grotto at Notre Dame?


**a Marian place of prayer and reflection**

What sits on top of the Main Building at Notre Dame?


**a golden statue of the Virgin Mary**

**Important:**

Atop the Main Building's gold dome is a golden statue of the Virgin Mary.
Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".
Next to the Main Building is the Basilica of the Sacred Heart.
Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.
It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.


**Unimportant:**

Architecturally, the school has a Catholic character.
At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
