In [1]:
import pandas as pd

In [2]:
import _pickle as cPickle
from pathlib import Path

def dumpPickle(fileName, content):
    pickleFile = open(fileName, 'wb')
    cPickle.dump(content, pickleFile, -1)
    pickleFile.close()

def loadPickle(fileName):    
    file = open(fileName, 'rb')
    content = cPickle.load(file)
    file.close()
    
    return content
    
def pickleExists(fileName):
    file = Path(fileName)
    
    if file.is_file():
        return True
    
    return False

In [3]:
from IPython.display import Markdown, display, clear_output

def printBold(string):
    display(Markdown('**' + string + '**'))


## Parsing text 

In [4]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_md')

#There seems to be a bug with spacy's stop words.
from spacy.lang.en.stop_words import STOP_WORDS
for word in STOP_WORDS:
    for w in (word, word[0].capitalize(), word.upper()):
        lex = nlp.vocab[w]
        lex.is_stop = True
        
#Extract answers and the sentence they are in
def extractAnswers(qas, doc):
    answers = []

    senStart = 0
    senId = 0

    for sentence in doc.sents:
        senLen = len(sentence.text)

        for answer in qas:
            answerStart = answer['answers'][0]['answer_start']

            if (answerStart >= senStart and answerStart < (senStart + senLen)):
                answers.append({'sentenceId': senId, 'text': answer['answers'][0]['text']})

        senStart += senLen
        senId += 1
    
    return answers

#TODO - Clean answers from stopwords?
def tokenIsAnswer(token, sentenceId, answers):
    for i in range(len(answers)):
        if (answers[i]['sentenceId'] == sentenceId):
            if (answers[i]['text'] == token):
                return True
    return False

#Save named entities start points

def getNEStartIndexs(doc):
    neStarts = {}
    for ne in doc.ents:
        neStarts[ne.start] = ne
        
    return neStarts 

def getSentenceStartIndexes(doc):
    senStarts = []
    
    for sentence in doc.sents:
        senStarts.append(sentence[0].i)
    
    return senStarts
    
def getSentenceForWordPosition(wordPos, senStarts):
    for i in range(1, len(senStarts)):
        if (wordPos < senStarts[i]):
            return i - 1
        
def addWordsForParagrapgh(newWords, text):
    doc = nlp(text)

    neStarts = getNEStartIndexs(doc)
    senStarts = getSentenceStartIndexes(doc)
    
    #index of word in spacy doc text
    i = 0
    
    while (i < len(doc)):
        #If the token is a start of a Named Entity, add it and push to index to end of the NE
        if (i in neStarts):
            word = neStarts[i]
            #add word
            currentSentence = getSentenceForWordPosition(word.start, senStarts)
            wordLen = word.end - word.start
            shape = ''
            for wordIndex in range(word.start, word.end):
                shape += (' ' + doc[wordIndex].shape_)

            newWords.append([word.text,
                            0,
                            0,
                            currentSentence,
                            wordLen,
                            word.label_,
                            None,
                            None,
                            None,
                            shape])
            i = neStarts[i].end - 1
        #If not a NE, add the word if it's not a stopword or a non-alpha (not regular letters)
        else:
            if (doc[i].is_stop == False and doc[i].is_alpha == True):
                word = doc[i]

                currentSentence = getSentenceForWordPosition(i, senStarts)
                wordLen = 1

                newWords.append([word.text,
                                0,
                                0,
                                currentSentence,
                                wordLen,
                                None,
                                word.pos_,
                                word.tag_,
                                word.dep_,
                                word.shape_])
        i += 1

def oneHotEncodeColumns(df):
    columnsToEncode = ['NER', 'POS', "TAG", 'DEP']

    for column in columnsToEncode:
        one_hot = pd.get_dummies(df[column])
        one_hot = one_hot.add_prefix(column + '_')

        df = df.drop(column, axis = 1)
        df = df.join(one_hot)
    
    return df

In [5]:
def generateDf(text):
    words = []
    addWordsForParagrapgh(words, text)

    wordColums = ['text', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape']
    df = pd.DataFrame(words, columns=wordColums)
    
    return df

In [6]:
def prepareDf(df):
    #One-hot encoding
    wordsDf = oneHotEncodeColumns(df)

    #Drop unused columns
    columnsToDrop = ['text', 'titleId', 'paragrapghId', 'sentenceId', 'shape']
    wordsDf = wordsDf.drop(columnsToDrop, axis = 1)

    #Add missing colums 
    predictorColumns = ['wordCount','NER_CARDINAL','NER_DATE','NER_EVENT','NER_FAC','NER_GPE','NER_LANGUAGE','NER_LAW','NER_LOC','NER_MONEY','NER_NORP','NER_ORDINAL','NER_ORG','NER_PERCENT','NER_PERSON','NER_PRODUCT','NER_QUANTITY','NER_TIME','NER_WORK_OF_ART','POS_ADJ','POS_ADP','POS_ADV','POS_CCONJ','POS_DET','POS_INTJ','POS_NOUN','POS_NUM','POS_PART','POS_PRON','POS_PROPN','POS_PUNCT','POS_SYM','POS_VERB','POS_X','TAG_''','TAG_-LRB-','TAG_.','TAG_ADD','TAG_AFX','TAG_CC','TAG_CD','TAG_DT','TAG_EX','TAG_FW','TAG_IN','TAG_JJ','TAG_JJR','TAG_JJS','TAG_LS','TAG_MD','TAG_NFP','TAG_NN','TAG_NNP','TAG_NNPS','TAG_NNS','TAG_PDT','TAG_POS','TAG_PRP','TAG_PRP$','TAG_RB','TAG_RBR','TAG_RBS','TAG_RP','TAG_SYM','TAG_TO','TAG_UH','TAG_VB','TAG_VBD','TAG_VBG','TAG_VBN','TAG_VBP','TAG_VBZ','TAG_WDT','TAG_WP','TAG_WRB','TAG_XX','DEP_ROOT','DEP_acl','DEP_acomp','DEP_advcl','DEP_advmod','DEP_agent','DEP_amod','DEP_appos','DEP_attr','DEP_aux','DEP_auxpass','DEP_case','DEP_cc','DEP_ccomp','DEP_compound','DEP_conj','DEP_csubj','DEP_csubjpass','DEP_dative','DEP_dep','DEP_det','DEP_dobj','DEP_expl','DEP_intj','DEP_mark','DEP_meta','DEP_neg','DEP_nmod','DEP_npadvmod','DEP_nsubj','DEP_nsubjpass','DEP_nummod','DEP_oprd','DEP_parataxis','DEP_pcomp','DEP_pobj','DEP_poss','DEP_preconj','DEP_predet','DEP_prep','DEP_prt','DEP_punct','DEP_quantmod','DEP_relcl','DEP_xcomp']

    for feature in predictorColumns:
        if feature not in wordsDf.columns:
            wordsDf[feature] = 0
    
    return wordsDf

In [7]:
def predictWords(wordsDf, df):
    
    predictorPickleName = 'pickles/predictor-spacy-features.pkl'
    predictor = loadPickle(predictorPickleName)
    
    y_pred = predictor.predict_proba(wordsDf)

    labeledAnswers = []
    for i in range(len(y_pred)):
        labeledAnswers.append({'word': df.iloc[i]['text'], 'prob': y_pred[i][0]})
    
    return labeledAnswers

In [8]:
def blankAnswer(firstTokenIndex, lastTokenIndex, sentStart, sentEnd, doc):
    leftPartStart = doc[sentStart].idx
    leftPartEnd = doc[firstTokenIndex].idx
    rightPartStart = doc[lastTokenIndex].idx + len(doc[lastTokenIndex])
    rightPartEnd = doc[sentEnd - 1].idx + len(doc[sentEnd - 1])
    
    question = doc.text[leftPartStart:leftPartEnd] + '_____' + doc.text[rightPartStart:rightPartEnd]
    
    return question
    
    
   #for i in range(sentEnd - sentStart):
   #    if sentStart + i != tokenIndex
   #    print(doc[13 + i])
   #

In [9]:
def addQuestions(answers, text):
    doc = nlp(text)
    currAnswerIndex = 0
    qaPair = []

    #Check wheter each token is the next answer
    for sent in doc.sents:
        for token in sent:
            
            #If all the answers have been found, stop looking
            if currAnswerIndex >= len(answers):
                break
            
            #In the case where the answer is consisted of more than one token, check the following tokens as well.
            answerDoc = nlp(answers[currAnswerIndex]['word'])
            answerIsFound = True
            
            for j in range(len(answerDoc)):
                if token.i + j >= len(doc) or doc[token.i + j].text != answerDoc[j].text:
                    answerIsFound = False
           
            #If the current token is corresponding with the answer, add it 
            if answerIsFound:
                question = blankAnswer(token.i, token.i + len(answerDoc) - 1, sent.start, sent.end, doc)
                
                qaPair.append({'question' : question, 'answer': answers[currAnswerIndex]['word'], 'prob': answers[currAnswerIndex]['prob']})
                
                currAnswerIndex += 1
                
    return qaPair

In [10]:
def sortAnswers(qaPairs):
    orderedQaPairs = sorted(qaPairs, key=lambda qaPair: qaPair['prob'])
    
    return orderedQaPairs    

In [11]:
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

glove_file = 'data\embeddings\glove.6B.300d.txt'
tmp_file = "data\embeddings\word2vec-glove.6B.300d.txt"

from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

In [12]:
def generateDistractors(sentence, answer, count):
    answer = str.lower(answer)
    
    ##Extracting closest words for the answer. 
    try:
        closestWords = model.most_similar(positive=[answer], topn=count)
    except:
        #In case the word is not in the vocabulary, or other problem not loading embeddings
        return []

    #Return count many distractors
    distractors = []
    for i in range(count):
        if i >= len(closestWords):
            break
            
        distractors.append(closestWords[i][0])
        
    return distractors

In [13]:
generateDistractors("Koala is cool", "Koala", 5)

['probo', 'koalas', 'orangutan', 'grizzly', 'marsupial']

In [14]:
def addDistractors(qaPairs, count):
    for qaPair in qaPairs:
        distractors = generateDistractors(qaPair['question'], qaPair['answer'], count)
        qaPair['distractors'] = distractors
    
    return qaPairs

In [15]:
def generateQuestions(text, count, isTest):
    
    df = generateDf(text)
    wordsDf = prepareDf(df)
    labeledAnswers = predictWords(wordsDf, df)
    qaPairs = addQuestions(labeledAnswers, text)
    orderedQaPairs = sortAnswers(qaPairs)
    questions = addDistractors(orderedQaPairs[:count], 4)

    for i in range(count):
        display(Markdown('### Question ' + str(i + 1) + ':'))
        print(questions[i]['question'])
        if isTest:
            answers = questions[i]['distractors']
            answers.append(questions[i]['answer'])
            
            display(Markdown('#### Answers:'))
            for answer in sorted(answers):
                print(str.lower(answer))
                
            print()
            print()
            print()
            print()
            print()
            print()
            print()
            display(Markdown('~~<sub>' + questions[i]['answer'][::-1] + '<sub>~~'))
        else:
            display(Markdown('#### Answer:'))
            print(questions[i]['answer'])
            display(Markdown('#### Incorrect answers:'))
            for distractor in questions[i]['distractors']:
                print(distractor)

In [16]:
text = "Oxygen is a chemical element with symbol O and atomic number 8. It is a member of the chalcogen group on the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium. At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2. Diatomic oxygen gas constitutes 20.8% of the Earth's atmosphere. As compounds including oxides, the element makes up almost half of the Earth's crust."

generateQuestions(text, 5, False)

### Question 1:

At standard temperature and pressure, two _____ of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2.


#### Answer:

atoms


#### Incorrect answers:

molecules
electrons
ions
atom


### Question 2:

Oxygen is a chemical _____ with symbol O and atomic number 8.


#### Answer:

element


#### Incorrect answers:

elements
component
aspect
dimension


### Question 3:

It is a _____ of the chalcogen group on the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds.


#### Answer:

member


#### Incorrect answers:

members
elected
committee
council


### Question 4:

By mass, oxygen is the third-most abundant _____ in the universe, after hydrogen and helium.


#### Answer:

element


#### Incorrect answers:

elements
component
aspect
dimension


### Question 5:

By mass, _____ is the third-most abundant element in the universe, after hydrogen and helium.


#### Answer:

oxygen


#### Incorrect answers:

hydrogen
nitrogen
helium
nutrients


In [27]:
def addQuestionsJson(answers, text):
    doc = nlp(text)
    currAnswerIndex = 0
    qaPair = []

    #Check wheter each token is the next answer
    for sent in doc.sents:
        for token in sent:
            
            #If all the answers have been found, stop looking
            if currAnswerIndex >= len(answers):
                break
            
            #In the case where the answer is consisted of more than one token, check the following tokens as well.
            answerDoc = nlp(answers[currAnswerIndex]['word'])
            answerIsFound = True
            
            for j in range(len(answerDoc)):
                if token.i + j >= len(doc) or doc[token.i + j].text != answerDoc[j].text:
                    answerIsFound = False
           
            #If the current token is corresponding with the answer, add it 
            if answerIsFound:
                #question = blankAnswerJson(token.i, token.i + len(answerDoc) - 1, sent.start, sent.end, doc)
                
                leftPartStart = doc[sent.start].idx
                rightPartEnd = doc[sent.end - 1].idx + len(doc[sent.end - 1])
                sentence = doc.text[leftPartStart:rightPartEnd]
                
                
                qaPair.append({'question' : sentence,
                               'answer': answers[currAnswerIndex]['word'], 
                               'prob': answers[currAnswerIndex]['prob'], 
                               'startIndex': doc[token.i].idx - leftPartStart})
                
                currAnswerIndex += 1
                
    return qaPair

In [18]:
def groupSentences(qaPairs):
    sentences = {}
    json = []
    
    for qaPair in qaPairs:
        sentences[qaPair['question']] = {'text': qaPair['question'], 'answers': []}
        
    for qaPair in qaPairs:
        sentences[qaPair['question']]['answers'].append({'correct': qaPair['answer'], 'startIndex': qaPair['startIndex'], 'confidence': qaPair['prob'], 'distractors': qaPair['distractors']}) 
        
    for sentence in sentences:
        json.append(sentences[sentence])
        
    return json

In [25]:
def generateJson(text, count):
    
    df = generateDf(text)
    wordsDf = prepareDf(df)
    labeledAnswers = predictWords(wordsDf, df)
    qaPairs = addQuestionsJson(labeledAnswers, text)
    orderedQaPairs = sortAnswers(qaPairs)
    questions = addDistractors(orderedQaPairs[:count], 4)
    sentences = groupSentences(questions)

    return sentences

In [28]:
text = "Oxygen is a chemical element with symbol O and atomic number 8. It is a member of the chalcogen group on the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium. At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2. Diatomic oxygen gas constitutes 20.8% of the Earth's atmosphere. As compounds including oxides, the element makes up almost half of the Earth's crust."

generateJson(text, 20)

[{'text': 'At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2.',
  'answers': [{'correct': 'atoms',
    'startIndex': 42,
    'confidence': 0.0,
    'distractors': ['molecules', 'electrons', 'ions', 'atom']},
   {'correct': 'bind',
    'startIndex': 63,
    'confidence': 1.3478902193727544e-219,
    'distractors': ['binds', 'binding', 'proteins', 'molecules']},
   {'correct': 'two',
    'startIndex': 38,
    'confidence': 3.235488940537157e-114,
    'distractors': ['three', 'four', 'five', 'six']},
   {'correct': 'form',
    'startIndex': 71,
    'confidence': 1.0310724282881944e-94,
    'distractors': ['forms', 'forming', 'formed', 'which']}]},
 {'text': 'Oxygen is a chemical element with symbol O and atomic number 8.',
  'answers': [{'correct': 'element',
    'startIndex': 21,
    'confidence': 2.6397652324234365e-272,
    'distractors': ['elements', 'component', 'aspect', 'dimension']},
   {

### Web server

In [None]:
from werkzeug.wrappers import Request, Response
from flask import Flask
from flask import request

app = Flask(__name__)

@app.route("/")
def hello():
    return ''.join(map(str, generateJson(text, 20)))

@app.route("/generate", methods=["POST"])
def test():
    return ''.join(map(str, generateJson(request.args["text"], 20)))

if __name__ == '__main__':
    from werkzeug.serving import run_simple
    run_simple('localhost', 9002, app)

 * Running on http://localhost:9002/ (Press CTRL+C to quit)
127.0.0.1 - - [24/Mar/2019 11:40:07] "POST /generate?text=%22Oxygen%20is%20cool%22 HTTP/1.1" 200 -
127.0.0.1 - - [24/Mar/2019 11:41:54] "POST /generate?text=Wikipedia%20%28/ˌwɪkɪˈpiːdiə/%20%28About%20this%20soundlisten%29,%20/ˌwɪkiˈpiːdiə/%20%28About%20this%20soundlisten%29%20WIK-ih-PEE-dee-ə%29%20is%20a%20multilingual,%20web-based,%20free%20encyclopedia%20based%20on%20a%20model%20of%20openly%20editable%20and%20viewable%20content,%20a%20wiki.%20It%20is%20the%20largest%20and%20most%20popular%20general%20reference%20work%20on%20the%20World%20Wide%20Web,[3][4][5]%20and%20is%20one%20of%20the%20most%20popular%20websites%20by%20Alexa%20rank.[6]%20It%20is%20owned%20and%20supported%20by%20the%20Wikimedia%20Foundation,%20a%20non-profit%20organization%20that%20operates%20on%20money%20it%20receives%20from%20donors.[7][8][9] HTTP/1.1" 200 -
