# Demo
We'll gather all the functions from the other notebooks and generate questions for an entirely new text.

You can also run this notebook independently with any text and see the results. 

## Imports

In [1]:
# Common imports
import numpy as np
import pandas as pd
from IPython.display import Markdown, display, clear_output
import random
from tensorflow.keras.models import model_from_json

### Pickling

In [2]:
import _pickle as cPickle
from pathlib import Path

def dumpPickle(fileName, content):
    """Serialize ``content`` to ``fileName`` with pickle.

    Args:
        fileName: Path of the file to create or overwrite.
        content: Any picklable object.
    """
    # The context manager closes the file even if pickling raises.
    with open(fileName, 'wb') as pickleFile:
        # Protocol -1 selects the highest protocol available.
        cPickle.dump(content, pickleFile, -1)

def loadPickle(fileName):
    """Load and return the object pickled in ``fileName``.

    Args:
        fileName: Path of an existing pickle file.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the file even if unpickling raises.
    with open(fileName, 'rb') as file:
        return cPickle.load(file)
    
def pickleExists(fileName):
    """Return True when ``fileName`` points at an existing regular file, else False."""
    return Path(fileName).is_file()

## *Extract all words from plain text and generate their features*

In [3]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

#Extract answers and the sentence they are in
def extractAnswers(qas, doc):
    """Map every answer to the sentence that contains its start offset.

    Args:
        qas: Iterable of dicts; the first entry of each item's ``'answers'``
            list must provide ``'answer_start'`` (character offset into the
            parsed text) and ``'text'``.
        doc: Parsed document exposing ``sents``; each sentence must provide
            ``start_char`` and ``end_char``.

    Returns:
        A list of ``{'sentenceId': int, 'text': str}`` dicts, grouped by
        sentence in document order.
    """
    answers = []

    for senId, sentence in enumerate(doc.sents):
        # Use the sentence's own character span. Summing len(sentence.text)
        # drops the whitespace between sentences, so the offsets drift further
        # from the real positions with every sentence.
        senStart = sentence.start_char
        senEnd = sentence.end_char

        for answer in qas:
            firstAnswer = answer['answers'][0]

            if senStart <= firstAnswer['answer_start'] < senEnd:
                answers.append({'sentenceId': senId, 'text': firstAnswer['text']})

    return answers

#TODO - Clean answers from stopwords?
def tokenIsAnswer(token, sentenceId, answers):
    """Return True if ``token`` equals the text of an answer recorded for ``sentenceId``.

    ``answers`` is a list of dicts with ``'sentenceId'`` and ``'text'`` keys.
    """
    return any(
        candidate['sentenceId'] == sentenceId and candidate['text'] == token
        for candidate in answers
    )

#Save named entities start points

def getNEStartIndexs(doc):
    """Index the entities in ``doc.ents`` by their ``start`` attribute.

    Returns a dict mapping each entity's ``start`` to the entity object itself.
    """
    return {entity.start: entity for entity in doc.ents}

def getSentenceStartIndexes(doc):
    """Return the document index (``.i``) of the first token of every sentence in ``doc.sents``."""
    return [sentence[0].i for sentence in doc.sents]
    
def getSentenceForWordPosition(wordPos, senStarts):
    """Return the index of the sentence that contains token position ``wordPos``.

    Args:
        wordPos: Token index inside the document.
        senStarts: Ascending list holding the first token index of each sentence.

    Returns:
        The zero-based sentence index, or ``None`` when ``senStarts`` is empty.
    """
    for i in range(1, len(senStarts)):
        if (wordPos < senStarts[i]):
            return i - 1

    # No later sentence starts after wordPos, so the word is in the last
    # sentence. Without this fallthrough, words in the final (or only) sentence
    # would get None.
    return len(senStarts) - 1 if senStarts else None
        
def addWordsForParagrapgh(newWords, text):
    """Parse ``text`` and append one feature row per candidate word to ``newWords``.

    A named entity becomes a single row covering all of its tokens. Any other
    token gets a row only if it is alphabetic and not a stop word. Each row is
    a list laid out as:
    [text, 0, 0, sentence index, token count, entity label, pos, tag, dep, shape].
    Entity rows carry ``None`` for pos/tag/dep; plain-token rows carry ``None``
    for the entity label.

    Nothing is returned; ``newWords`` is extended in place.
    """
    doc = nlp(text)

    entityByStart = getNEStartIndexs(doc)
    sentenceStarts = getSentenceStartIndexes(doc)

    tokenCount = len(doc)
    position = 0

    while position < tokenCount:
        entity = entityByStart.get(position)

        if entity is not None:
            # A named entity begins here: emit it as one row, then jump past it.
            sentenceIndex = getSentenceForWordPosition(entity.start, sentenceStarts)
            # Every token shape is prefixed with a space (including the first).
            entityShape = ''.join(
                ' ' + doc[tokenIndex].shape_
                for tokenIndex in range(entity.start, entity.end)
            )

            newWords.append([
                entity.text,
                0,
                0,
                sentenceIndex,
                entity.end - entity.start,
                entity.label_,
                None,
                None,
                None,
                entityShape,
            ])
            position = entity.end
            continue

        token = doc[position]

        # Plain token: keep it only when it is alphabetic and not a stop word.
        if token.is_alpha and not token.is_stop:
            newWords.append([
                token.text,
                0,
                0,
                getSentenceForWordPosition(position, sentenceStarts),
                1,
                None,
                token.pos_,
                token.tag_,
                token.dep_,
                token.shape_,
            ])

        position += 1

def oneHotEncodeColumns(df):
    """Replace the NER, POS, TAG and DEP columns with one-hot indicator columns.

    Each indicator column is named ``<original column>_<value>``. The input
    frame is not modified; a new frame is returned.
    """
    for name in ('NER', 'POS', "TAG", 'DEP'):
        indicators = pd.get_dummies(df[name]).add_prefix(name + '_')
        df = df.drop(name, axis=1).join(indicators)

    return df

## *Predict whether a word is a keyword* 

In [4]:
def generateDf(text):
    """Build the raw word-feature DataFrame for ``text``.

    The rows come from ``addWordsForParagrapgh``; the columns are text,
    titleId, paragrapghId, sentenceId, wordCount, NER, POS, TAG, DEP and shape.
    """
    rows = []
    addWordsForParagrapgh(rows, text)

    return pd.DataFrame(
        rows,
        columns=['text', 'titleId', 'paragrapghId', 'sentenceId', 'wordCount',
                 'NER', 'POS', 'TAG', 'DEP', 'shape'],
    )

In [5]:
def prepareDf(df, predictorFeaturesName='../input/paraquestion/nb-predictor-features_1.pkl'):
    """Turn the raw word frame into the feature frame passed to the predictor.

    Steps: one-hot encode the categorical columns, add (as zeros) every stored
    feature that this text did not produce, then drop the non-feature columns.

    Args:
        df: Frame produced by ``generateDf``.
        predictorFeaturesName: Path to the pickled list of feature names.
            Defaults to the location used by this notebook.

    Returns:
        A new DataFrame holding only feature columns.
    """
    #One-hot encoding
    wordsDf = oneHotEncodeColumns(df)

    #Add missing columns
    featureNames = loadPickle(predictorFeaturesName)

    for feature in featureNames:
        if feature not in wordsDf.columns:
            wordsDf[feature] = 0

    # NOTE(review): columns keep the order produced above (encoded columns
    # first, then the missing features). Confirm this matches the column order
    # the model was trained with.

    #Drop unused columns
    columnsToDrop = ['text', 'titleId', 'paragrapghId', 'sentenceId', 'shape', 'isAnswer']
    # 'isAnswer' only exists when the stored feature list contains it, so
    # tolerate missing labels instead of raising KeyError.
    wordsDf = wordsDf.drop(columnsToDrop, axis=1, errors='ignore')

    return wordsDf

In [6]:
def predictWords(wordsDf, df,
                 modelJsonPath='../input/paraquestion/nb-predictor_nn_model_1.json',
                 weightsPath="../input/paraquestion/model.h5"):
    """Score every candidate word with the stored Keras model.

    Args:
        wordsDf: Feature frame produced by ``prepareDf`` (one row per word).
        df: Raw frame produced by ``generateDf``; its ``'text'`` column supplies
            the word for each row, in the same row order as ``wordsDf``.
        modelJsonPath: Path to the model architecture saved as JSON.
        weightsPath: Path to the saved model weights.

    Returns:
        A list of ``{'word': str, 'prob': number}`` dicts, one per row, where
        ``prob`` is the value at ``[0][0]`` of the model output for that row.
    """
    rowCount, featureCount = wordsDf.shape
    if rowCount == 0 or featureCount == 0:
        # Nothing to score, so skip loading the model.
        return []

    # load json and create model; the context manager closes the file on error too
    with open(modelJsonPath, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(weightsPath)
    loaded_model.compile(optimizer='adam', loss='mean_squared_error')

    # Cast to float32 so mixed bool/int feature columns do not become an object
    # array, then add a trailing axis: (rows, features, 1).
    features = wordsDf.to_numpy(dtype=np.float32).reshape(rowCount, featureCount, 1)
    y_pred = loaded_model.predict(features)

    # One column extraction replaces a per-row iloc lookup.
    words = df['text'].tolist()

    return [{'word': word, 'prob': prediction[0][0]}
            for word, prediction in zip(words, y_pred)]

## *Extract questions*

In [7]:
def blankAnswer(firstTokenIndex, lastTokenIndex, sentStart, sentEnd, doc):
    """Return the sentence text with the answer span replaced by ``_____``.

    ``firstTokenIndex`` and ``lastTokenIndex`` are the inclusive token indexes
    of the answer. ``sentStart`` and ``sentEnd`` are the sentence's token
    bounds, with ``sentEnd`` exclusive. Character positions come from each
    token's ``idx`` and length.
    """
    sentenceOpening = doc[sentStart]
    answerHead = doc[firstTokenIndex]
    answerTail = doc[lastTokenIndex]
    sentenceClosing = doc[sentEnd - 1]

    # Text from the start of the sentence up to the answer.
    before = doc.text[sentenceOpening.idx:answerHead.idx]
    # Text from just after the answer to the end of the sentence's last token.
    after = doc.text[answerTail.idx + len(answerTail):sentenceClosing.idx + len(sentenceClosing)]

    return before + '_____' + after


In [8]:
def addQuestions(answers, text):
    """Create a fill-in-the-blank question for each answer found in ``text``.

    The answers are matched in order: the text is scanned token by token for
    the current answer, and the scan only moves on to the next answer once the
    current one has been found.

    Args:
        answers: List of dicts with ``'word'`` and ``'prob'`` keys, in the order
            the words appear in ``text``.
        text: The passage the answers were extracted from.

    Returns:
        A list of ``{'question', 'answer', 'prob'}`` dicts, one per matched answer.
    """
    doc = nlp(text)
    currAnswerIndex = 0
    qaPair = []

    # Token texts of the answer currently being searched for. They are parsed
    # once per answer rather than once per token of the passage.
    answerTokens = None

    #Check whether each token is the next answer
    for sent in doc.sents:
        for token in sent:

            #If all the answers have been found, stop looking
            if currAnswerIndex >= len(answers):
                return qaPair

            if answerTokens is None:
                answerTokens = [t.text for t in nlp(answers[currAnswerIndex]['word'])]

            #An answer may span several tokens, so compare the following tokens as well.
            answerLen = len(answerTokens)
            answerIsFound = (
                token.i + answerLen <= len(doc)
                and all(doc[token.i + j].text == answerTokens[j] for j in range(answerLen))
            )

            #If the tokens starting here spell out the answer, blank them out
            if answerIsFound:
                question = blankAnswer(token.i, token.i + answerLen - 1, sent.start, sent.end, doc)

                qaPair.append({'question' : question, 'answer': answers[currAnswerIndex]['word'], 'prob': answers[currAnswerIndex]['prob']})

                currAnswerIndex += 1
                answerTokens = None

    return qaPair

In [9]:
def sortAnswers(qaPairs):
    """Return the question/answer pairs ordered by ascending ``'prob'``.

    ``qaPairs`` is shuffled in place first, so pairs with equal ``'prob'`` end
    up in random relative order (the sort is stable). The result is a new list.

    NOTE(review): callers take the first items as the "best" questions, so this
    assumes a lower ``'prob'`` means a better question. Confirm against the model.
    """
    def probOf(pair):
        return pair['prob']

    random.shuffle(qaPairs)

    return sorted(qaPairs, key=probOf)

## *Distractors*
Taken from the *04. Generating incorrect answers/Incorrect-answers* notebook.

In [10]:
# import os
# import gensim
# from gensim.test.utils import datapath, get_tmpfile
# from gensim.models import KeyedVectors

# glove_file = '../input/paraquestion/glove.6B.300d.txt'
# tmp_file = './word2vec-glove.6B.300d.txt'
# model = None

# if os.path.isfile(glove_file):
#     from gensim.scripts.glove2word2vec import glove2word2vec
#     glove2word2vec(glove_file, tmp_file)
#     model = KeyedVectors.load_word2vec_format(tmp_file)
# else:
#     print("Glove embeddings not found. Please download and place them in the following path: " + glove_file)

In [11]:
import os
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

# Locations of the embedding artefacts:
#   glove_file        - raw GloVe vectors
#   tmp_file          - word2vec-format copy written by this cell when converting
#   ready_tmp_file    - word2vec-format copy already available in the input folder
#   output_model_file - where the loaded model is pickled for later reuse
#   ready_model_file  - previously pickled model in the input folder
glove_file = '../input/paraquestion/glove.6B.300d.txt'
tmp_file = './word2vec-glove.6B.300d.txt'
ready_tmp_file = '../input/paraquestion/word2vec-glove.6B.300d.txt'
output_model_file = './model_ready.pkl'
ready_model_file = '../input/paraquestion/model_ready.pkl'
model = None

# Try the cheapest source first. A ready pickled model or an already converted
# word2vec file is usable even when the raw GloVe file is not present.
if os.path.isfile(ready_model_file):
    # NOTE: unpickling can execute arbitrary code; only load a model file you created yourself.
    model = loadPickle(ready_model_file)
elif os.path.isfile(ready_tmp_file) or os.path.isfile(glove_file):
    if os.path.isfile(ready_tmp_file):
        tmp_file = ready_tmp_file
        print("Converted word2vec file found")
    else:
        from gensim.scripts.glove2word2vec import glove2word2vec
        print("Converted word2vec file not found, converting the GloVe file")
        glove2word2vec(glove_file, tmp_file)
    model = KeyedVectors.load_word2vec_format(tmp_file)
    # Cache the loaded model so later runs can skip parsing the text file.
    dumpPickle(output_model_file, model)
else:
    print("Glove embeddings not found. Please download and place them in the following path: " + glove_file)

In [12]:
from difflib import get_close_matches
def generate_distractors(answer, count):
    """Return up to ``count`` words whose embeddings are closest to ``answer``.

    The answer is lower-cased and split on whitespace. Any word missing from
    the embedding vocabulary is replaced by its closest spelling match in the
    vocabulary.

    Args:
        answer: The correct answer (one or more words).
        count: Maximum number of distractors to return.

    Returns:
        A list of distractor words. It is empty when the embeddings are not
        loaded, no usable word remains, or the similarity lookup fails.
    """
    if model is None:
        # Embeddings were never loaded, so there is nothing to query.
        return []

    # Dict lookup; testing membership in the index_to_key list is O(vocabulary).
    vocabulary = model.key_to_index

    answerWords = []
    for word in answer.lower().split():
        if word not in vocabulary:
            # cutoff=0 accepts any match, however distant, so that a
            # replacement is found whenever the vocabulary is non-empty.
            closest_words = get_close_matches(word, model.index_to_key, n=1, cutoff=0)
            if not closest_words:
                # Skip the word rather than query with an empty string.
                continue
            word = closest_words[0]
        answerWords.append(word)

    if not answerWords:
        return []

    ##Extracting closest words for the answer.
    try:
        closestWords = model.most_similar(positive=answerWords, topn=count)
    except Exception:
        # Best effort: a failed lookup yields no distractors. Unlike a bare
        # except, this lets KeyboardInterrupt and SystemExit propagate.
        return []

    #Return count many distractors
    return [word for word, _similarity in closestWords][:count]

In [13]:
def addDistractors(qaPairs, count):
    """Attach a ``'distractors'`` list to every question/answer pair.

    Args:
        qaPairs: List of dicts that each contain an ``'answer'`` key; the dicts
            are updated in place.
        count: Number of distractors requested per answer.

    Returns:
        The same ``qaPairs`` list.
    """
    if model is None:
        print("Glove embeddings not found. Please download and place them in the following path: " + glove_file)
        # Without embeddings no distractors can be generated. Give every pair
        # an empty list so callers can still read the key, and do not call
        # into the missing model.
        for qaPair in qaPairs:
            qaPair['distractors'] = []
        return qaPairs

    for qaPair in qaPairs:
        qaPair['distractors'] = generate_distractors(qaPair['answer'], count)

    return qaPairs

# Main function

In [14]:
def generateQuestions(text, count):
    """Generate and display up to ``count`` fill-in-the-blank questions for ``text``.

    Pipeline: extract word features, score the words, turn them into blanked
    questions, order them, attach three distractors to the first ``count``
    questions, and render each one.

    Args:
        text: The passage to generate questions from.
        count: Maximum number of questions to display.
    """
    # Extract words
    df = generateDf(text)
    wordsDf = prepareDf(df)

    # Predict
    labeledAnswers = predictWords(wordsDf, df)

    # Transform questions
    qaPairs = addQuestions(labeledAnswers, text)

    # Pick the best questions
    orderedQaPairs = sortAnswers(qaPairs)

    # Generate distractors
    questions = addDistractors(orderedQaPairs[:count], 3)

    # Print. Iterate over what was actually produced: the text may yield fewer
    # than `count` questions, and indexing range(count) would raise IndexError.
    for number, question in enumerate(questions, start=1):
        display(Markdown('### Question ' + str(number) + ':'))
        print(question['question'])

        display(Markdown('#### Answer:'))
        print(question['answer'])

        display(Markdown('#### Incorrect answers:'))
        for distractor in question['distractors']:
            print(distractor)

        print()

In [15]:
text = "Oxygen is a chemical element with symbol O and atomic number 8. It is a member of the chalcogen group on the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium. At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2. Diatomic oxygen gas constitutes 20.8% of the Earth's atmosphere. As compounds including oxides, the element makes up almost half of the Earth's crust."

generateQuestions(text, 5)

### Question 1:

As compounds including oxides, the element makes up _____ of the Earth's crust.


#### Answer:

almost half


#### Incorrect answers:

nearly
just
than



### Question 2:

Diatomic oxygen gas constitutes _____ of the Earth's atmosphere.


#### Answer:

20.8%


#### Incorrect answers:

20.4
21.8
19.7



### Question 3:

Oxygen is a chemical _____ with symbol O and atomic number 8.


#### Answer:

element


#### Incorrect answers:

elements
component
aspect



### Question 4:

As compounds including _____, the element makes up almost half of the Earth's crust.


#### Answer:

oxides


#### Incorrect answers:

oxide
sulfur
nitrogen



### Question 5:

As compounds including oxides, the element _____ up almost half of the Earth's crust.


#### Answer:

makes


#### Incorrect answers:

make
making
does



## Testing

In [16]:
text = "Newton’s first law states that every object will remain at rest or in uniform motion in a straight line unless compelled to change its state by the action of an external force. This tendency to resist changes in a state of motion is inertia. If all the external forces cancel each other out, then there is no net force acting on the object.  If there is no net force acting on the object, then the object will maintain a constant velocity."
generateQuestions(text, 25)

### Question 1:

Newton’s first law states that every object will remain at rest or in uniform motion in a straight line unless _____ to change its state by the action of an external force.


#### Answer:

compelled


#### Incorrect answers:

obliged
obligated
compel



### Question 2:

If all the external forces cancel each other out, then there is no net force _____ on the object.


#### Answer:

acting


#### Incorrect answers:

acted
role
directing



### Question 3:

Newton’s _____ law states that every object will remain at rest or in uniform motion in a straight line unless compelled to change its state by the action of an external force.


#### Answer:

first


#### Incorrect answers:

second
third
fourth



### Question 4:

Newton’s first law _____ that every object will remain at rest or in uniform motion in a straight line unless compelled to change its state by the action of an external force.


#### Answer:

states


#### Incorrect answers:

united
u.s.
countries



### Question 5:

Newton’s first law states that every object will remain at rest or in uniform motion in a _____ line unless compelled to change its state by the action of an external force.


#### Answer:

straight


#### Incorrect answers:

consecutive
fourth
fifth



### Question 6:

This tendency to resist changes in a state of motion is _____.


#### Answer:

inertia


#### Incorrect answers:

rotational
angular
rigidity



### Question 7:

Newton’s first law states that every object will remain at rest or in uniform motion in a straight line unless compelled to _____ its state by the action of an external force.


#### Answer:

change


#### Incorrect answers:

changes
changing
changed



### Question 8:

Newton’s first law states that every _____ will remain at rest or in uniform motion in a straight line unless compelled to change its state by the action of an external force.


#### Answer:

object


#### Incorrect answers:

objects
particular
inanimate



### Question 9:

If all the external forces cancel each other out, then there is no net _____ acting on the object.


#### Answer:

force


#### Incorrect answers:

forces
military
air



### Question 10:

 If there is no net _____ acting on the object, then the object will maintain a constant velocity.


#### Answer:

force


#### Incorrect answers:

forces
military
air



### Question 11:

This tendency to resist _____ in a state of motion is inertia.


#### Answer:

changes


#### Incorrect answers:

change
changing
adjustments



### Question 12:

This _____ to resist changes in a state of motion is inertia.


#### Answer:

tendency


#### Incorrect answers:

propensity
tendencies
tends



### Question 13:

Newton’s first law states that every object will remain at rest or in uniform motion in a straight line unless compelled to change its state by the _____ of an external force.


#### Answer:

action


#### Incorrect answers:

actions
any
take



### Question 14:

This tendency to resist changes in a state of _____ is inertia.


#### Answer:

motion


#### Incorrect answers:

motions
action
reconsideration



### Question 15:

Newton’s first law states that every object will remain at rest or in uniform motion in a straight _____ unless compelled to change its state by the action of an external force.


#### Answer:

line


#### Incorrect answers:

lines
running
railway



### Question 16:

 If there is no net force acting on the object, then the object will maintain a _____ velocity.


#### Answer:

constant


#### Incorrect answers:

continual
continuous
relentless



### Question 17:

This tendency to _____ changes in a state of motion is inertia.


#### Answer:

resist


#### Incorrect answers:

resisting
resisted
temptation



### Question 18:

Newton’s first law states that every object will _____ at rest or in uniform motion in a straight line unless compelled to change its state by the action of an external force.


#### Answer:

remain


#### Incorrect answers:

remained
remains
still



### Question 19:

Newton’s first law states that every object will remain at rest or in uniform motion in a straight line unless compelled to change its state by the action of an _____ force.


#### Answer:

external


#### Incorrect answers:

internal
foreign
affairs



### Question 20:

Newton’s first law states that every object will remain at _____ or in uniform motion in a straight line unless compelled to change its state by the action of an external force.


#### Answer:

rest


#### Incorrect answers:

remainder
all
.



### Question 21:

_____’s first law states that every object will remain at rest or in uniform motion in a straight line unless compelled to change its state by the action of an external force.


#### Answer:

Newton


#### Incorrect answers:

andover
thandie
wold



### Question 22:

 If there is no _____ force acting on the object, then the object will maintain a constant velocity.


#### Answer:

net


#### Incorrect answers:

profit
quarter
profits



### Question 23:

This tendency to resist changes in a _____ of motion is inertia.


#### Answer:

state


#### Incorrect answers:

federal
states
government



### Question 24:

Newton’s first _____ states that every object will remain at rest or in uniform motion in a straight line unless compelled to change its state by the action of an external force.


#### Answer:

law


#### Incorrect answers:

laws
legal
legislation



### Question 25:

Newton’s first law states that every object will remain at rest or in uniform _____ in a straight line unless compelled to change its state by the action of an external force.


#### Answer:

motion


#### Incorrect answers:

motions
action
reconsideration



In [17]:
text='Machine learning is a subfield of artificial intelligence, which is broadly defined as the capability of a machine to imitate intelligent human behavior. Artificial intelligence systems are used to perform complex tasks in a way that is similar to how humans solve problems.The goal of AI is to create computer models that exhibit “intelligent behaviors” like humans, according to Boris Katz, a principal research scientist and head of the InfoLab Group at CSAIL. This means machines that can recognize a visual scene, understand a text written in natural language, or perform an action in the physical world.Machine learning is one way to use AI. It was defined in the 1950s by AI pioneer Arthur Samuel as “the field of study that gives computers the ability to learn without explicitly being programmed.”'
generateQuestions(text, 5)

### Question 1:

The goal of AI is to create computer models that exhibit “intelligent behaviors” like humans, according to Boris Katz, a principal research scientist and head of _____ at CSAIL.


#### Answer:

the InfoLab Group


#### Incorrect answers:

groups
which
called



### Question 2:

It was defined in _____ by AI pioneer Arthur Samuel as “the field of study that gives computers the ability to learn without explicitly being programmed.”


#### Answer:

the 1950s


#### Incorrect answers:

1960s
1970s
1940s



### Question 3:

The goal of AI is to create computer models that exhibit “intelligent behaviors” like humans, according to _____, a principal research scientist and head of the InfoLab Group at CSAIL.


#### Answer:

Boris Katz


#### Incorrect answers:

yeltsin
viktor
rosen



### Question 4:

It was defined in the 1950s by AI pioneer _____ as “the field of study that gives computers the ability to learn without explicitly being programmed.”


#### Answer:

Arthur Samuel


#### Incorrect answers:

william
henry
h.



### Question 5:

The goal of AI is to create computer models that exhibit “intelligent behaviors” _____ humans, according to Boris Katz, a principal research scientist and head of the InfoLab Group at CSAIL.


#### Answer:

like


#### Incorrect answers:

such
even
you

