In [1]:
# Mount Google Drive at /content/drive; the dataset, embeddings and pickle
# paths used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Fetch the NLTK 'punkt' tokenizer data into the local nltk_data directory.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk import tokenize

import warnings
warnings.filterwarnings('ignore')

from IPython.display import Markdown, display, clear_output

import pickle
import json
from textblob import TextBlob
import nltk
from scipy import spatial
import torch
import spacy

import _pickle as cPickle
from pathlib import Path

In [4]:
def dumpPickle(fileName, content):
    """Serialize `content` into the file `fileName` using pickle.

    Args:
        fileName: Path of the file to create or overwrite.
        content: Any picklable object.
    """
    # The context manager closes the handle even when dump() raises (for
    # example on an unpicklable object). Protocol -1 selects the highest
    # protocol available.
    with open(fileName, 'wb') as pickleFile:
        cPickle.dump(content, pickleFile, -1)

def loadPickle(fileName):
    """Load and return the object pickled in the file `fileName`.

    Args:
        fileName: Path of an existing pickle file.

    Returns:
        The unpickled object.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    # The context manager closes the handle even when load() raises
    # (truncated or corrupt file).
    with open(fileName, 'rb') as file:
        return cPickle.load(file)
    
def pickleExists(fileName):
    """Tell whether `fileName` points to an existing regular file."""
    return Path(fileName).is_file()

In [5]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

# Workaround for an apparent spaCy stop-word issue (per the original author):
# explicitly flag the lowercase, capitalized and upper-case form of every
# stop word in the vocabulary.
from spacy.lang.en.stop_words import STOP_WORDS
for word in STOP_WORDS:
    # word.capitalize() gives the full capitalized word (e.g. "The");
    # word[0].capitalize() would give only its first letter ("T").
    for w in (word, word.capitalize(), word.upper()):
        lex = nlp.vocab[w]
        lex.is_stop = True
        
def extractAnswers(qas, doc):
    """Pair the first answer of every question with the sentence that contains it.

    Args:
        qas: Iterable of question dicts; each has an 'answers' list whose
            items carry 'answer_start' (character offset into the text the
            doc was built from) and 'text'.
        doc: Parsed document exposing `sents`, where every sentence provides
            `start_char` and `end_char` character offsets.

    Returns:
        A list of {'sentenceId', 'text'} dicts, ordered by sentence and then
        by the order of `qas`. Questions with an empty 'answers' list are
        skipped.
    """
    answers = []

    for senId, sentence in enumerate(doc.sents):
        # Use the sentence's own character span. Summing len(sentence.text)
        # ignores the whitespace between sentences, so the offsets drift by
        # at least one character per sentence.
        senStart = sentence.start_char
        senEnd = sentence.end_char

        for answer in qas:
            candidates = answer['answers']
            if not candidates:
                continue

            firstAnswer = candidates[0]
            if senStart <= firstAnswer['answer_start'] < senEnd:
                answers.append({'sentenceId': senId, 'text': firstAnswer['text']})

    return answers

#TODO - Clean answers from stopwords?
def tokenIsAnswer(token, sentenceId, answers):
    """Return True when `token` equals the text of an answer recorded for `sentenceId`.

    `answers` is a list of dicts with 'sentenceId' and 'text' keys.
    """
    return any(
        entry['sentenceId'] == sentenceId and entry['text'] == token
        for entry in answers
    )

#Save named entities start points

def getNEStartIndexs(doc):
    """Map the `start` index of every entity in `doc.ents` to the entity itself."""
    return {entity.start: entity for entity in doc.ents}

def getSentenceStartIndexes(doc):
    """Collect the `.i` index of the first token of every sentence in `doc.sents`."""
    return [sentence[0].i for sentence in doc.sents]
    
def getSentenceForWordPosition(wordPos, senStarts):
    """Return the index of the sentence that contains token position `wordPos`.

    Args:
        wordPos: Token index inside the document.
        senStarts: Ascending list holding the first token index of every
            sentence.

    Returns:
        The zero-based sentence index, or None when `senStarts` is empty.
    """
    if not senStarts:
        return None

    for i in range(1, len(senStarts)):
        if wordPos < senStarts[i]:
            return i - 1

    # No later sentence starts after wordPos, so the word is in the last
    # sentence. Without this return, every word of the final sentence (or of
    # a single-sentence text) would map to None.
    return len(senStarts) - 1
        
def addWordsForParagrapgh(newWords, text):
    """Append one feature row per candidate word of `text` to `newWords`.

    The text is parsed with the module-level `nlp` pipeline. Named entities
    are added as a single row covering all their tokens; any other token is
    added only when it is alphabetic and not a stop word.

    Each row is a list:
    [text, 0, 0, sentence index, token count, entity label, POS, tag,
     dependency, shape]. Entity rows carry None for POS/tag/dependency and
    plain-token rows carry None for the entity label.

    Args:
        newWords: List that receives the rows (mutated in place).
        text: Raw paragraph text.
    """
    doc = nlp(text)

    entityByStart = getNEStartIndexs(doc)
    sentenceStarts = getSentenceStartIndexes(doc)

    tokenCount = len(doc)
    position = 0  # index of the current token in the spaCy doc

    while position < tokenCount:
        entity = entityByStart.get(position)

        if entity is not None:
            # A named entity starts here: emit it as one row and jump past it.
            sentenceIndex = getSentenceForWordPosition(entity.start, sentenceStarts)
            entityLength = entity.end - entity.start
            # Every token shape is prefixed with a space, so the joined
            # string starts with a space as well.
            entityShape = ''.join(
                ' ' + doc[tokenIndex].shape_
                for tokenIndex in range(entity.start, entity.end)
            )

            newWords.append([
                entity.text,
                0,
                0,
                sentenceIndex,
                entityLength,
                entity.label_,
                None,
                None,
                None,
                entityShape,
            ])
            position = entity.end
            continue

        token = doc[position]
        # Plain token: keep it only when it is alphabetic and not a stop word.
        if not token.is_stop and token.is_alpha:
            sentenceIndex = getSentenceForWordPosition(position, sentenceStarts)

            newWords.append([
                token.text,
                0,
                0,
                sentenceIndex,
                1,
                None,
                token.pos_,
                token.tag_,
                token.dep_,
                token.shape_,
            ])

        position += 1

def oneHotEncodeColumns(df):
    """Replace the NER, POS, TAG and DEP columns with one-hot indicator columns.

    Each indicator column is named '<column>_<value>' and is appended to the
    right of the frame; the input frame itself is not modified.

    Args:
        df: DataFrame containing the 'NER', 'POS', 'TAG' and 'DEP' columns.

    Returns:
        A new DataFrame with the four columns expanded.
    """
    for column in ('NER', 'POS', 'TAG', 'DEP'):
        indicators = pd.get_dummies(df[column]).add_prefix(column + '_')
        df = df.drop(columns=[column]).join(indicators)

    return df

def generateDf(text):
    """Build the raw word-level DataFrame for `text`.

    The rows come from addWordsForParagrapgh; no prediction happens here.

    Args:
        text: Raw paragraph text.

    Returns:
        DataFrame with one row per candidate word and the columns
        text, titleId, paragrapghId, sentenceId, wordCount, NER, POS, TAG,
        DEP and shape.
    """
    columnNames = ['text', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape']

    rows = []
    addWordsForParagrapgh(rows, text)

    return pd.DataFrame(rows, columns=columnNames)


def prepareDf(df, featureNames=None):
    """Turn the raw word DataFrame into the feature matrix fed to the predictor.

    The categorical columns are one-hot encoded, then the frame is aligned to
    the expected feature list: features missing from this text are added as
    all-zero columns, columns that are not features are dropped, and the
    column order follows the feature list. The 'isAnswer' entry is excluded
    whether or not it is present in the list.

    Args:
        df: Word DataFrame as produced by generateDf.
        featureNames: Optional iterable of feature column names. When omitted
            the list is loaded from the pickle stored on Drive (presumably
            saved alongside the trained predictor — verify against the
            training notebook).

    Returns:
        A new DataFrame holding exactly the predictor's feature columns.
    """
    #One-hot encoding
    wordsDf = oneHotEncodeColumns(df)

    if featureNames is None:
        predictorFeaturesName = '/content/drive/My Drive/Capstone Project/Data/pickles/nb-predictor-features.pkl'
        featureNames = loadPickle(predictorFeaturesName)

    # 'isAnswer' is not an input feature; filtering (rather than
    # list.remove) also works when the entry is absent from the list.
    predictorColumns = [feature for feature in featureNames if feature != 'isAnswer']

    # A single reindex adds the missing features as zeros, drops unused
    # columns and fixes the column order.
    return wordsDf.reindex(columns=predictorColumns, fill_value=0)


def predictWords(wordsDf, df):
    """Score every candidate word with the pickled predictor.

    Args:
        wordsDf: Feature matrix (one row per word) as returned by prepareDf.
        df: Raw word DataFrame with a 'text' column, row-aligned with
            `wordsDf`.

    Returns:
        A list of {'word', 'prob'} dicts, one per row. 'prob' is column 0 of
        predict_proba, i.e. the probability of the predictor's first class
        (presumably "not an answer", since lower values are ranked first
        downstream — TODO confirm against the training notebook).
    """
    # Nothing to score: return before loading the model, because
    # predict_proba is not expected to accept an empty matrix.
    if len(wordsDf) == 0:
        return []

    predictorPickleName = '/content/drive/My Drive/Capstone Project/Data/pickles/nb-predictor.pkl'
    predictor = loadPickle(predictorPickleName)
    y_pred = predictor.predict_proba(wordsDf)

    return [
        {'word': word, 'prob': probabilities[0]}
        for word, probabilities in zip(df['text'], y_pred)
    ]

def blankAnswer(firstTokenIndex, lastTokenIndex, sentStart, sentEnd, doc):
    """Return the sentence text with the answer tokens replaced by '_____'.

    Args:
        firstTokenIndex: Index in `doc` of the first answer token.
        lastTokenIndex: Index in `doc` of the last answer token (inclusive).
        sentStart: Index of the sentence's first token.
        sentEnd: Index one past the sentence's last token.
        doc: Parsed document; tokens expose `idx` (character offset) and
            `len()`, and the document exposes `text`.

    Returns:
        The sentence as a string with a blank in place of the answer.
    """
    firstToken = doc[firstTokenIndex]
    lastToken = doc[lastTokenIndex]
    closingToken = doc[sentEnd - 1]

    # Text from the sentence start up to (not including) the answer.
    prefix = doc.text[doc[sentStart].idx:firstToken.idx]
    # Text from right after the answer up to the end of the sentence.
    suffix = doc.text[lastToken.idx + len(lastToken):closingToken.idx + len(closingToken)]

    return ''.join((prefix, '_____', suffix))

def addQuestions(answers, text):
    """Create a fill-in-the-blank question for every answer found in `text`.

    The document is scanned once, token by token, looking for the current
    answer; when it is found the search moves on to the next answer. The
    answers are therefore expected in document order, and an answer that
    never matches stops the ones after it from being matched.

    Args:
        answers: List of {'word', 'prob'} dicts, in document order.
        text: Raw paragraph text.

    Returns:
        A list of {'question', 'answer', 'prob'} dicts in document order.
    """
    doc = nlp(text)
    docLength = len(doc)
    currAnswerIndex = 0
    qaPair = []

    # Token texts of the answer currently searched for. Parsing the answer
    # is loop-invariant until the answer changes, so it is done once per
    # answer instead of once per document token.
    answerTokens = None

    #Check whether each token is the start of the next answer
    for sent in doc.sents:
        for token in sent:

            #If all the answers have been found, stop looking
            if currAnswerIndex >= len(answers):
                return qaPair

            if answerTokens is None:
                answerTokens = [t.text for t in nlp(answers[currAnswerIndex]['word'])]

            #An answer may span several tokens: all of them must match and fit in the doc
            answerEnd = token.i + len(answerTokens)
            answerIsFound = answerEnd <= docLength and all(
                doc[token.i + j].text == answerTokens[j]
                for j in range(len(answerTokens))
            )

            #If the current token is corresponding with the answer, add it
            if answerIsFound:
                question = blankAnswer(token.i, answerEnd - 1, sent.start, sent.end, doc)

                qaPair.append({'question' : question, 'answer': answers[currAnswerIndex]['word'], 'prob': answers[currAnswerIndex]['prob']})

                currAnswerIndex += 1
                answerTokens = None

    return qaPair

def sortAnswers(qaPairs):
    """Return a new list of the question/answer pairs ordered by ascending 'prob'.

    The input list is left untouched; pairs with equal 'prob' keep their
    relative order.
    """
    def probabilityOf(pair):
        return pair['prob']

    ranked = list(qaPairs)
    ranked.sort(key=probabilityOf)

    return ranked

In [6]:
# Load the dataset JSON from the mounted Drive; the displayed head shows one
# row per article, with the article dict in the 'data' column.
test = pd.read_json('/content/drive/My Drive/Capstone Project/Data/dev-v1.1.json')
# train = pd.read_json('Data/train-v1.1.json')
test.head()

Unnamed: 0,data,version
0,"{'title': 'Super_Bowl_50', 'paragraphs': [{'co...",1.1
1,"{'title': 'Warsaw', 'paragraphs': [{'context':...",1.1
2,"{'title': 'Normans', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Nikola_Tesla', 'paragraphs': [{'con...",1.1
4,"{'title': 'Computational_complexity_theory', '...",1.1


In [7]:
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
# Path of the word-embedding file on the mounted Drive (the file name suggests
# GloVe 6B 300-d vectors in word2vec text format — TODO confirm).
tmp_file = '/content/drive/My Drive/Capstone Project/Data/word2vec-glove.6B.300d.txt'
# `model` is read as a global by generate_distractors and addDistractors.
model = KeyedVectors.load_word2vec_format(tmp_file)

In [10]:
def generate_distractors(answer, count):
    """Return up to `count` words closest to `answer` in the embedding space.

    Args:
        answer: Answer text; it is lower-cased before the lookup (presumably
            because the embedding vocabulary is lower-case — TODO confirm).
        count: Number of distractors requested.

    Returns:
        A list of distractor words, or an empty list when the lookup fails,
        e.g. because the answer (typically a multi-word phrase) is not in the
        vocabulary.
    """
    answer = answer.lower()

    ##Extracting closest words for the answer.
    try:
        closestWords = model.most_similar(positive=[answer], topn=count)
    except Exception:
        #Best effort: the word is not in the vocabulary, or the embeddings are
        #unavailable. `Exception` (rather than a bare except) lets
        #KeyboardInterrupt and SystemExit propagate.
        return []

    #Keep the words only, dropping the similarity scores
    return [word for word, _similarity in closestWords][:count]



def addDistractors(qaPairs, count):
    """Attach a 'distractors' list to every question/answer pair.

    Args:
        qaPairs: List of dicts with an 'answer' key (mutated in place).
        count: Number of distractors requested per answer.

    Returns:
        The same `qaPairs` list, each item extended with 'distractors'.
    """
    if not model:
        # `tmp_file` is the embeddings path defined in the cell that loads
        # the vectors. The name used here before, `glove_file`, is not
        # defined in this notebook, so the warning raised NameError instead
        # of printing.
        print("Glove embeddings not found. Please download and place them in the following path: " + tmp_file)

    for qaPair in qaPairs:
        qaPair['distractors'] = generate_distractors(qaPair['answer'], count)

    return qaPairs

def generateQuestions(text, count):
    """Generate and display up to `count` fill-in-the-blank questions for `text`.

    Pipeline: extract candidate words, score them with the predictor, turn
    the scored words into blanked sentences, keep the `count` best-ranked
    ones, attach distractors and render everything in the notebook.

    Args:
        text: Raw paragraph text.
        count: Maximum number of questions to display.

    Returns:
        None; the questions are rendered with display()/print().
    """
    # Extract words
    df = generateDf(text)
    wordsDf = prepareDf(df)

    # Predict
    labeledAnswers = predictWords(wordsDf, df)

    # Transform questions
    qaPairs = addQuestions(labeledAnswers, text)

    # Pick the best questions
    orderedQaPairs = sortAnswers(qaPairs)

    # Generate distractors
    questions = addDistractors(orderedQaPairs[:count], 4)

    # Print. Iterate over what was actually produced: a short text can yield
    # fewer than `count` questions, and indexing by range(count) would then
    # raise IndexError.
    for number, question in enumerate(questions[:max(count, 0)], start=1):
        display(Markdown('### Question ' + str(number) + ':'))
        print(question['question'])

        display(Markdown('#### Answer:'))
        print(question['answer'])

        display(Markdown('#### Incorrect answers:'))
        for distractor in question['distractors']:
            print(distractor)

In [18]:
# Pick one paragraph of the dataset by article index and paragraph index.
titleId = 1
paragraphId = 2

# 'context' holds the paragraph's raw text; show it as the cell output.
text = test['data'][titleId]['paragraphs'][paragraphId]['context']
text

"There are 13 natural reserves in Warsaw – among others, Bielany Forest, Kabaty Woods, Czerniaków Lake. About 15 kilometres (9 miles) from Warsaw, the Vistula river's environment changes strikingly and features a perfectly preserved ecosystem, with a habitat of animals that includes the otter, beaver and hundreds of bird species. There are also several lakes in Warsaw – mainly the oxbow lakes, like Czerniaków Lake, the lakes in the Łazienki or Wilanów Parks, Kamionek Lake. There are lot of small lakes in the parks, but only a few are permanent – the majority are emptied before winter to clean them of plants and sediments."

In [19]:
# Alternative hand-written sample text, kept commented out; the call below uses
# the `text` selected from the dataset in the previous cell.
# text = "Oxygen is a chemical element with symbol O and atomic number 8. It is a member of the chalcogen group on the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium. At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2. Diatomic oxygen gas constitutes 20.8% of the Earth's atmosphere. As compounds including oxides, the element makes up almost half of the Earth's crust."
generateQuestions(text, 10)

### Question 1:

_____ (9 miles) from Warsaw, the Vistula river's environment changes strikingly and features a perfectly preserved ecosystem, with a habitat of animals that includes the otter, beaver and hundreds of bird species.


#### Answer:

About 15 kilometres


#### Incorrect answers:

### Question 2:

About 15 kilometres (_____) from Warsaw, the Vistula river's environment changes strikingly and features a perfectly preserved ecosystem, with a habitat of animals that includes the otter, beaver and hundreds of bird species.


#### Answer:

9 miles


#### Incorrect answers:

### Question 3:

There are 13 natural reserves in Warsaw – among others, Bielany Forest, Kabaty Woods, _____.


#### Answer:

Czerniaków Lake


#### Incorrect answers:

### Question 4:

There are also several lakes in Warsaw – mainly the oxbow lakes, like _____, the lakes in the Łazienki or Wilanów Parks, Kamionek Lake.


#### Answer:

Czerniaków Lake


#### Incorrect answers:

### Question 5:

About 15 kilometres (9 miles) from Warsaw, the _____ river's environment changes strikingly and features a perfectly preserved ecosystem, with a habitat of animals that includes the otter, beaver and hundreds of bird species.


#### Answer:

Vistula


#### Incorrect answers:

dnieper
danube
meuse
scheldt


### Question 6:

There are _____ natural reserves in Warsaw – among others, Bielany Forest, Kabaty Woods, Czerniaków Lake.


#### Answer:

13


#### Incorrect answers:

14
17
16
12


### Question 7:

About 15 kilometres (9 miles) from Warsaw, the Vistula river's environment changes strikingly and features a perfectly preserved ecosystem, with a habitat of animals that includes the otter, beaver and _____ of bird species.


#### Answer:

hundreds


#### Incorrect answers:

thousands
dozens
tens
millions


### Question 8:

There are lot of small lakes in the parks, but only a few are permanent – the majority are emptied before _____ to clean them of plants and sediments.


#### Answer:

winter


#### Incorrect answers:

summer
autumn
spring
weather


### Question 9:

There are 13 natural reserves in Warsaw – among others, Bielany Forest, _____, Czerniaków Lake.


#### Answer:

Kabaty Woods


#### Incorrect answers:

### Question 10:

There are also several lakes in Warsaw – mainly the oxbow lakes, like Czerniaków Lake, the lakes in the Łazienki or _____, Kamionek Lake.


#### Answer:

Wilanów Parks


#### Incorrect answers:

In [None]:
# Inspect the full paragraph record (context plus its 'qas' list).
# NOTE(review): the output recorded for this cell shows a different paragraph
# than the one selected by titleId = 1 / paragraphId = 2 earlier in the
# notebook, so it looks stale — re-run to confirm.
test['data'][titleId]['paragraphs'][paragraphId]

{'context': "The league eventually narrowed the bids to three sites: New Orleans' Mercedes-Benz Superdome, Miami's Sun Life Stadium, and the San Francisco Bay Area's Levi's Stadium.",
 'qas': [{'answers': [{'answer_start': 56,
     'text': "New Orleans' Mercedes-Benz Superdome"},
    {'answer_start': 56, 'text': "New Orleans' Mercedes-Benz Superdome"},
    {'answer_start': 69, 'text': 'Mercedes-Benz Superdome'}],
   'id': '56be5438acb8001400a5031a',
   'question': 'Which Louisiana venue was one of three considered for Super Bowl 50?'},
  {'answers': [{'answer_start': 94, 'text': "Miami's Sun Life Stadium"},
    {'answer_start': 94, 'text': "Miami's Sun Life Stadium"},
    {'answer_start': 102, 'text': 'Sun Life Stadium'}],
   'id': '56be5438acb8001400a5031b',
   'question': 'Which Florida venue was one of three considered for Super Bowl 50?'},
  {'answers': [{'answer_start': 128,
     'text': "San Francisco Bay Area's Levi's Stadium"},
    {'answer_start': 128, 'text': "San Francisco B

In [None]:
# Second example: a hand-written paragraph instead of a dataset paragraph.
# This rebinds the global `text` used by the cells above.
text = "Oxygen is a chemical element with symbol O and atomic number 8. It is a member of the chalcogen group on the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium. At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2. Diatomic oxygen gas constitutes 20.8% of the Earth's atmosphere. As compounds including oxides, the element makes up almost half of the Earth's crust."
generateQuestions(text, 10)

### Question 1:

Diatomic oxygen gas constitutes _____ of the Earth's atmosphere.


#### Answer:

20.8%


#### Incorrect answers:

### Question 2:

At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula _____.


#### Answer:

O2


#### Incorrect answers:

nadh
nadph
vodafone
h2o


### Question 3:

Diatomic oxygen gas constitutes 20.8% of the _____'s atmosphere.


#### Answer:

Earth


#### Incorrect answers:

planet
mars
planets
orbit


### Question 4:

As compounds including oxides, the element makes up almost half of the _____'s crust.


#### Answer:

Earth


#### Incorrect answers:

planet
mars
planets
orbit


### Question 5:

As compounds including oxides, the element makes up _____ of the Earth's crust.


#### Answer:

almost half


#### Incorrect answers:

### Question 6:

Oxygen is a chemical element with symbol O and atomic number _____.


#### Answer:

8


#### Incorrect answers:

9
7
6
5


### Question 7:

At standard temperature and pressure, _____ atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2.


#### Answer:

two


#### Incorrect answers:

three
four
five
six


### Question 8:

It is a member of _____ on the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds.


#### Answer:

the chalcogen group


#### Incorrect answers:

### Question 9:

_____ is a chemical element with symbol O and atomic number 8.


#### Answer:

Oxygen


#### Incorrect answers:

hydrogen
nitrogen
helium
nutrients


### Question 10:

It is a member of the chalcogen group on the periodic _____, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds.


#### Answer:

table


#### Incorrect answers:

tables
sit
sitting
room
