# Step 1: Import Libraries

In [1]:
################################
# Step 1 - Import the Required Libraries
################################
import pandas as pd
from IPython.display import Markdown, display, clear_output
import spacy
from spacy import displacy
import _pickle as cPickle
from pathlib import Path
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
import random

In [2]:
import configparser

import os

vAR_Config = configparser.ConfigParser(allow_no_value=True)

vAR_INI_FILE_PATH = os.getenv('AQG')
#print(vAR_INI_FILE_PATH)

vAR_Config.read(vAR_INI_FILE_PATH)

vAR_Data = vAR_Config.sections()

vAR_Pickle_Data1 =vAR_Config['FILE PATH']['PICKLE_DATA_1']
vAR_Pickle_Data2 =vAR_Config['FILE PATH']['PICKLE_DATA_2']
vAR_Training_Data1 = vAR_Config['FILE PATH']['TRAINING_DATA_MCQ1']
vAR_Training_Data2 =vAR_Config['FILE PATH']['TRAINING_DATA_MCQ2']
vAR_Test_Data = vAR_Config['FILE PATH']['TEST_DATA_MCQ1']
print(vAR_Pickle_Data1)
print(vAR_Pickle_Data2)
print(vAR_Training_Data1)
print(vAR_Training_Data2)
print(vAR_Test_Data)

C:\AI\AUTOMATIC QUESTION GENERATION\ML\TRAINING DATA\pickles\nb-predictor.pkl
C:\AI\AUTOMATIC QUESTION GENERATION\ML\TRAINING DATA\pickles\wordsDf.pkl
C:\AI\AUTOMATIC QUESTION GENERATION\ML\TRAINING DATA\embeddings\glove.6B.300d.txt
C:\AI\AUTOMATIC QUESTION GENERATION\ML\TRAINING DATA\embeddings\word2vec-glove.6B.300d.txt
C:\AI\AUTOMATIC QUESTION GENERATION\ML\TEST DATA\MCQ_AI_DATA.txt


# Step 2: Pickling

In [3]:

def dumpPickle(fileName, content):
    pickleFile = open(fileName, 'wb')
    cPickle.dump(content, pickleFile, -1)
    pickleFile.close()

def loadPickle(fileName):    
    file = open(fileName, 'rb')
    content = cPickle.load(file)
    file.close()
    
    return content
    
def pickleExists(fileName):
    file = Path(fileName)
    
    if file.is_file():
        return True
    
    return False

# Step 3: Extract words and generate features

In [4]:
import en_core_web_sm
nlp = spacy.load('en_core_web_sm')

################################
#Extract answers and the sentence they are in
################################
def extractAnswers(qas, doc):
    answers = []

    senStart = 0
    senId = 0

    for sentence in doc.sents:
        senLen = len(sentence.text)

        for answer in qas:
            answerStart = answer['answers'][0]['answer_start']

            if (answerStart >= senStart and answerStart < (senStart + senLen)):
                answers.append({'sentenceId': senId, 'text': answer['answers'][0]['text']})

        senStart += senLen
        senId += 1
    
    return answers
################################
# Cleaning answers from stopwords
################################
def tokenIsAnswer(token, sentenceId, answers):
    for i in range(len(answers)):
        if (answers[i]['sentenceId'] == sentenceId):
            if (answers[i]['text'] == token):
                return True
    return False


# Step 4: Fixing named Entities start points

In [5]:
################################
#Save named entities start points
################################

def getNEStartIndexs(doc):
    neStarts = {}
    for ne in doc.ents:
        neStarts[ne.start] = ne
        
    return neStarts 

def getSentenceStartIndexes(doc):
    senStarts = []
    
    for sentence in doc.sents:
        senStarts.append(sentence[0].i)
    
    return senStarts
    
def getSentenceForWordPosition(wordPos, senStarts):
    for i in range(1, len(senStarts)):
        if (wordPos < senStarts[i]):
            return i - 1
        
def addWordsForParagrapgh(newWords, text):
    doc = nlp(text)

    neStarts = getNEStartIndexs(doc)
    senStarts = getSentenceStartIndexes(doc)
    
    #index of word in spacy doc text
    i = 0
    
    while (i < len(doc)):
        #If the token is a start of a Named Entity, add it and push to index to end of the NE
        if (i in neStarts):
            word = neStarts[i]
            #add word
            currentSentence = getSentenceForWordPosition(word.start, senStarts)
            wordLen = word.end - word.start
            shape = ''
            for wordIndex in range(word.start, word.end):
                shape += (' ' + doc[wordIndex].shape_)

            newWords.append([word.text,
                            0,
                            0,
                            currentSentence,
                            wordLen,
                            word.label_,
                            None,
                            None,
                            None,
                            shape])
            i = neStarts[i].end - 1
        #If not a NE, add the word if it's not a stopword or a non-alpha (not regular letters)
        else:
            if (doc[i].is_stop == False and doc[i].is_alpha == True):
                word = doc[i]

                currentSentence = getSentenceForWordPosition(i, senStarts)
                wordLen = 1

                newWords.append([word.text,
                                0,
                                0,
                                currentSentence,
                                wordLen,
                                None,
                                word.pos_,
                                word.tag_,
                                word.dep_,
                                word.shape_])
        i += 1

def oneHotEncodeColumns(df):
    columnsToEncode = ['NER', 'POS', "TAG", 'DEP']

    for column in columnsToEncode:
        one_hot = pd.get_dummies(df[column])
        one_hot = one_hot.add_prefix(column + '_')

        df = df.drop(column, axis = 1)
        df = df.join(one_hot)
    
    return df

# Step 5: Predict whether word is keyword

In [6]:
def generateDf(text):
    words = []
    addWordsForParagrapgh(words, text)

    wordColums = ['text', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape']
    df = pd.DataFrame(words, columns=wordColums)
    
    return df

In [7]:
def prepareDf(df):
    #One-hot encoding
    wordsDf = oneHotEncodeColumns(df)

    #Drop unused columns
    columnsToDrop = ['text', 'titleId', 'paragrapghId', 'sentenceId', 'shape']
    wordsDf = wordsDf.drop(columnsToDrop, axis = 1)

    #Add missing colums 
    predictorColumns = ['wordCount','NER_CARDINAL','NER_DATE','NER_EVENT','NER_FAC','NER_GPE','NER_LANGUAGE','NER_LAW','NER_LOC','NER_MONEY','NER_NORP','NER_ORDINAL','NER_ORG','NER_PERCENT','NER_PERSON','NER_PRODUCT','NER_QUANTITY','NER_TIME','NER_WORK_OF_ART','POS_ADJ','POS_ADP','POS_ADV','POS_CCONJ','POS_DET','POS_INTJ','POS_NOUN','POS_NUM','POS_PART','POS_PRON','POS_PROPN','POS_PUNCT','POS_SYM','POS_VERB','POS_X','TAG_''','TAG_-LRB-','TAG_.','TAG_ADD','TAG_AFX','TAG_CC','TAG_CD','TAG_DT','TAG_EX','TAG_FW','TAG_IN','TAG_JJ','TAG_JJR','TAG_JJS','TAG_LS','TAG_MD','TAG_NFP','TAG_NN','TAG_NNP','TAG_NNPS','TAG_NNS','TAG_PDT','TAG_POS','TAG_PRP','TAG_PRP$','TAG_RB','TAG_RBR','TAG_RBS','TAG_RP','TAG_SYM','TAG_TO','TAG_UH','TAG_VB','TAG_VBD','TAG_VBG','TAG_VBN','TAG_VBP','TAG_VBZ','TAG_WDT','TAG_WP','TAG_WRB','TAG_XX','DEP_ROOT','DEP_acl','DEP_acomp','DEP_advcl','DEP_advmod','DEP_agent','DEP_amod','DEP_appos','DEP_attr','DEP_aux','DEP_auxpass','DEP_case','DEP_cc','DEP_ccomp','DEP_compound','DEP_conj','DEP_csubj','DEP_csubjpass','DEP_dative','DEP_dep','DEP_det','DEP_dobj','DEP_expl','DEP_intj','DEP_mark','DEP_meta','DEP_neg','DEP_nmod','DEP_npadvmod','DEP_nsubj','DEP_nsubjpass','DEP_nummod','DEP_oprd','DEP_parataxis','DEP_pcomp','DEP_pobj','DEP_poss','DEP_preconj','DEP_predet','DEP_prep','DEP_prt','DEP_punct','DEP_quantmod','DEP_relcl','DEP_xcomp']

    for feature in predictorColumns:
        if feature not in wordsDf.columns:
            wordsDf[feature] = 0
    
    return wordsDf

In [8]:
def predictWords(wordsDf, df):
    
    predictorPickleName = vAR_Pickle_Data1
    predictor = loadPickle(predictorPickleName)
    
    y_pred = predictor.predict_proba(wordsDf)

    labeledAnswers = []
    for i in range(len(y_pred)):
        labeledAnswers.append({'word': df.iloc[i]['text'], 'prob': y_pred[i][0]})
    
    return labeledAnswers

# Step 6: Extract Questions

In [9]:
def blankAnswer(firstTokenIndex, lastTokenIndex, sentStart, sentEnd, doc):
    leftPartStart = doc[sentStart].idx
    leftPartEnd = doc[firstTokenIndex].idx
    rightPartStart = doc[lastTokenIndex].idx + len(doc[lastTokenIndex])
    rightPartEnd = doc[sentEnd - 1].idx + len(doc[sentEnd - 1])
    
    question = doc.text[leftPartStart:leftPartEnd] + '_____' + doc.text[rightPartStart:rightPartEnd]
    
    return question


# Step 7: Grouping questions and answers

In [10]:
def addQuestions(answers, text):
    doc = nlp(text)
    currAnswerIndex = 0
    qaPair = []

    #Check wheter each token is the next answer
    for sent in doc.sents:
        for token in sent:
            
            #If all the answers have been found, stop looking
            if currAnswerIndex >= len(answers):
                break
            
            #In the case where the answer is consisted of more than one token, check the following tokens as well.
            answerDoc = nlp(answers[currAnswerIndex]['word'])
            answerIsFound = True
            
            for j in range(len(answerDoc)):
                if token.i + j >= len(doc) or doc[token.i + j].text != answerDoc[j].text:
                    answerIsFound = False
           
            #If the current token is corresponding with the answer, add it 
            if answerIsFound:
                question = blankAnswer(token.i, token.i + len(answerDoc) - 1, sent.start, sent.end, doc)
                
                qaPair.append({'question' : question, 'answer': answers[currAnswerIndex]['word'], 'prob': answers[currAnswerIndex]['prob']})
                
                currAnswerIndex += 1
                
    return qaPair 

In [11]:
def sortAnswers(qaPairs):
    orderedQaPairs = sorted(qaPairs, key=lambda qaPair: qaPair['prob'])
    
    return orderedQaPairs     

# Step 8: Generating Distractors

In [12]:

glove_file = vAR_Training_Data1
tmp_file = vAR_Training_Data2

from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

In [13]:
def generate_distractors(answer, count):
    answer = str.lower(answer)
    
    ##Extracting closest words for the answer. 
    try:
        closestWords = model.most_similar(positive=[answer], topn=count)
    except:
        #In case the word is not in the vocabulary, or other problem not loading embeddings
        return []

    #Return count many distractors
    distractors = list(map(lambda x: x[0], closestWords))[0:count]
    
    return distractors

In [14]:
def addDistractors(qaPairs, count):
    for qaPair in qaPairs:
        distractors = generate_distractors(qaPair['answer'], count)
        qaPair['distractors'] = distractors
    
    return qaPairs

# Step 9: Main Function

In [15]:

def generateQuestions(text, count):
    
    ################################
    # Extract words 
    ################################
    df = generateDf(text)
    wordsDf = prepareDf(df)
    
    ################################
    # Predict 
    ################################
    labeledAnswers = predictWords(wordsDf, df)
    
    ################################
    # Transform questions
    ################################
    qaPairs = addQuestions(labeledAnswers, text)
    
    ################################
    # Pick the best questions
    ################################
    orderedQaPairs = sortAnswers(qaPairs)
    
    ################################
    # Generate distractors
    ################################
    questions = addDistractors(orderedQaPairs[:count], 4)
    
    ################################
    # Print
    ################################
    for i in range(count):
        options = []
        options.append(questions[i]['answer'])
        
        display(Markdown('### Question ' + str(i + 1) + ':'))
        print(questions[i]['question'])

        
        
        display(Markdown('#### Options:'))
        for distractor in questions[i]['distractors']:
            options.append(distractor)
#             print(distractor)

       

        #################################
        # Shuffling options
        #################################
        
        random.shuffle(options)
        for num,letter in enumerate(options):
            print(num+1," ",letter)
        
#         print(ans)
        display(Markdown('#### Answer:'))
        for x,correct in enumerate(options):
            if correct==questions[i]['answer']:
                print(x+1,correct)
        print()

In [16]:
# text = "Oxygen is a chemical element with symbol O and atomic number 8. It is a member of the chalcogen group on the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium. At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2. Diatomic oxygen gas constitutes 20.8% of the Earth's atmosphere. As compounds including oxides, the element makes up almost half of the Earth's crust."
f = open(vAR_Test_Data,mode='r')
vAR_Content = f.read()
display(Markdown('#### Content'))
print(vAR_Content)

print('')

generateQuestions(vAR_Content, 15)

#### Content

Artificial Intelligence refers to the intelligence of machines. This is in contrast to the natural intelligence of humans and animals. With Artificial Intelligence, machines perform functions such as learning, planning, reasoning and problem-solving. Most noteworthy, Artificial Intelligence is the simulation of human intelligence by machines. It is probably the fastest-growing development in the World of technology and innovation. Furthermore, many experts believe AI could solve major challenges and crisis situations. Artificial Intelligence in business would significantly save time and effort. There is an application of robotic automation to human business tasks. Furthermore, Machine learning algorithms help in better serving customers. Chatbots provide immediate response and service to customers.





### Question 1:

Artificial Intelligence _____ to the intelligence of machines.


#### Options:

1   commonly
2   describes
3   refer
4   referred
5   refers


#### Answer:

5 refers



### Question 2:

With Artificial Intelligence, machines _____ functions such as learning, planning, reasoning and problem-solving.


#### Options:

1   performed
2   required
3   performing
4   perform
5   performs


#### Answer:

4 perform



### Question 3:

Furthermore, many experts _____ AI could solve major challenges and crisis situations.


#### Options:

1   believed
2   know
3   believe
4   say
5   think


#### Answer:

3 believe



### Question 4:

Furthermore, Machine learning algorithms _____ in better serving customers.


#### Options:

1   helps
2   to
3   helped
4   helping
5   help


#### Answer:

5 help



### Question 5:

Chatbots _____ immediate response and service to customers.


#### Options:

1   provide
2   providing
3   provides
4   help
5   provided


#### Answer:

1 provide



### Question 6:

With Artificial Intelligence, machines perform functions such as learning, _____, reasoning and problem-solving.


#### Options:

1   planned
2   preparing
3   plans
4   plan
5   planning


#### Answer:

5 planning



### Question 7:

With Artificial Intelligence, machines perform functions such as learning, planning, _____ and problem-solving.


#### Options:

1   reasoning
2   logical
3   intuition
4   logic
5   deductive


#### Answer:

1 reasoning



### Question 8:

It is probably the fastest-growing development in the World of technology and _____.


#### Options:

1   innovations
2   creativity
3   innovation
4   entrepreneurship
5   technological


#### Answer:

3 innovation



### Question 9:

Artificial Intelligence in business would significantly save time and _____.


#### Options:

1   efforts
2   trying
3   effort
4   attempt
5   push


#### Answer:

3 effort



### Question 10:

Chatbots provide immediate response and _____ to customers.


#### Options:

1   service
2   news
3   network
4   services
5   .


#### Answer:

1 service



### Question 11:

Artificial Intelligence in business would significantly _____ time and effort.


#### Options:

1   help
2   saving
3   save
4   saves
5   saved


#### Answer:

3 save



### Question 12:

With Artificial Intelligence, machines perform functions such as learning, planning, reasoning and problem-_____.


#### Options:

1   problem
2   solve
3   solving
4   resolving
5   solved


#### Answer:

3 solving



### Question 13:

It is probably the fastest-_____ development in the World of technology and innovation.


#### Options:

1   increasing
2   grown
3   growing
4   grow
5   grew


#### Answer:

3 growing



### Question 14:

Furthermore, Machine learning algorithms help in better _____ customers.


#### Options:

1   serves
2   served
3   serve
4   jail
5   serving


#### Answer:

5 serving



### Question 15:

Most _____, Artificial Intelligence is the simulation of human intelligence by machines.


#### Options:

1   accomplishment
2   notable
3   noteworthy
4   accomplishments
5   interesting


#### Answer:

3 noteworthy

