Run these commands in a terminal, from the directory that contains the main.ipynb file:
    wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz
    tar -xvf  s2v_reddit_2015_md.tar.gz

In [37]:
import warnings
import os
import random
import re
import json
import requests
import operator
from nltk.corpus import wordnet as wn
from pywsd.lesk import cosine_lesk
from pywsd.lesk import simple_lesk
from pywsd.lesk import adapted_lesk
from pywsd.similarity import max_similarity
from flashtext import KeywordProcessor
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import nltk
import string
import spacy
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('popular')
# File system management
# Suppress warnings
warnings.filterwarnings('ignore')

# load sense2vec vectors
from sense2vec import Sense2Vec
s2v = Sense2Vec().from_disk('s2v_old')
from sentence_transformers import SentenceTransformer
from typing import List, Tuple
import itertools
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michaelmbajwa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaelmbajwa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/michaelmbajwa/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/michaelmbajwa/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/michaelmbajwa/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/michaelmbajwa/nltk_data...
[nltk_data]    |   Package gutenberg is already up-

## Data Loading

In [2]:
# Step 1 - Read the text file/article that the MCQs are generated from.
# The context manager closes the handle as soon as the text has been read.
# `file` stays bound on purpose: the MCQ-generation cell reads `file.name`,
# which remains available on a closed file object.
with open("article2.txt", "r") as file:  # "r" opens the file read-only
    text = file.read()

## Keyword Extraction

In [5]:
from nltk.corpus import stopwords #Stopwords are the words that we need to avoid while considering keyword extraction
import string

def find_important_word(model, article, n=15):
    """Return the best-ranked keyphrases that a pke extractor finds in `article`.

    Args:
        model: Extractor *class* (not an instance). It is instantiated with no
            arguments and must provide ``load_document``, ``grammar_selection``,
            ``candidate_selection``, ``candidate_weighting`` and ``get_n_best``.
        article: Text handed to ``load_document`` as ``input``.
        n: Number of best candidates requested from ``get_n_best``
            (defaults to 15).

    Returns:
        list: The first element of every entry returned by ``get_n_best``,
        in the order the extractor returned them.
    """
    extractor = model()
    extractor.load_document(input=article, language='en')
    # NOTE(review): pke's selection helpers appear to reset the candidate set,
    # so the candidate_selection call below may supersede this grammar-based
    # selection - confirm against the installed pke version.
    extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")
    # Only the proper-noun tag is passed on as a valid candidate POS.
    # No stop list is handed to the extractor: candidate_selection is called
    # with `pos` only, so building one here would have no effect.
    extractor.candidate_selection(pos={'PROPN'})
    extractor.candidate_weighting()
    return [candidate[0] for candidate in extractor.get_n_best(n=n)]


In [6]:
# Explicit imports instead of `import *`, so it is clear where each extractor
# class comes from and the notebook namespace is not polluted.
from pke.unsupervised import (
    FirstPhrases,
    MultipartiteRank,
    SingleRank,
    TextRank,
    TopicRank,
)

# Run several unsupervised keyphrase-extraction algorithms over the article
# and store each algorithm's keywords under the algorithm's class name.
outputs = {}
for model in [FirstPhrases, TextRank, SingleRank, TopicRank, MultipartiteRank]:
    outputs[model.__name__] = find_important_word(model, text)

In [None]:
# Inspect what every algorithm produced: its name, then its keyword list,
# followed by blank lines that separate it from the next algorithm.
for algorithm_name, keywords in outputs.items():
    print(algorithm_name, keywords, "\n\n", sep="\n")

In [28]:
# Combine the keywords generated by all of the unsupervised models used above
all_keywords = [keyword for keywords in outputs.values() for keyword in keywords]

# Drop duplicates. dict.fromkeys keeps the first-seen order, so the keyword
# order (and therefore the question numbering later on) is the same on every
# run; the iteration order of a set of strings is not.
important_words = list(dict.fromkeys(all_keywords))

Split the whole article into a list of individual sentences. This makes it easy to fetch the sentences related to each keyword.

In [29]:
# Step 3

from nltk.tokenize import sent_tokenize
def splitTextToSents(article, n):
    """Split `article` into sentences and keep those longer than `n` characters.

    Short sentences are discarded so that every question has enough context.
    The length test is applied to each sentence as returned by
    ``sent_tokenize``; the sentences that pass are returned stripped of
    surrounding whitespace, in their original order.
    """
    kept = []
    for sentence in sent_tokenize(article):
        if len(sentence) > n:
            kept.append(sentence.strip())
    return kept
# Sentences of the article that are longer than 15 characters
sents = splitTextToSents(article=text, n=15)
#print(sents)

Map each keyword to the sentences that contain it, so that the sentences related to a keyword can be looked up easily.


In [30]:
from collections import defaultdict

def map_sentence(important_words, sents):
    """Map every keyword to the sentences that were assigned to it.

    Each distinct sentence is assigned to at most one keyword: the first
    entry returned by ``KeywordProcessor.extract_keywords`` for that sentence.
    Assigning a sentence to a single keyword avoids generating several
    near-identical questions from it.

    Args:
        important_words: Iterable of keyword strings.
        sents: Iterable of sentence strings (repeated sentences are ignored).

    Returns:
        dict: ``{keyword: [sentence, ...]}`` with an entry for every keyword
        (possibly an empty list). Each list is sorted by decreasing sentence
        length; sentences of equal length keep the order they had in `sents`,
        so the result is identical on every run.
    """
    processor = KeywordProcessor()
    keySents = {}
    for word in important_words:
        keySents[word] = []
        processor.add_keyword(word)

    # dict.fromkeys removes repeated sentences while keeping their original
    # order (a set would make the order differ from run to run). Because every
    # sentence is visited once and goes to one keyword, the per-keyword lists
    # cannot contain duplicates.
    for sent in dict.fromkeys(sents):
        found = processor.extract_keywords(sent)
        if found:
            keySents[found[0]].append(sent)

    # Longer sentences first: they give the question more context.
    for word in keySents:
        keySents[word].sort(key=len, reverse=True)
    return keySents

# For every keyword, the sentences in which it was the first keyword detected
mapped_sentences = map_sentence(important_words=important_words, sents=sents)

#print(mapped_sentences)

## Distractor Generation

In [31]:
# Algorithm 1 for generating distractors
# Loaded SentenceTransformer models keyed by model name. Loading a model is
# expensive and this function is called once per question, so it is done once.
_SENTENCE_TRANSFORMERS = {}


def get_answer_and_distractor_embeddings(answer, candidate_distractors, model=None):
    """Embed the answer and its candidate distractors with the same encoder.

    Args:
        answer: The answer string.
        candidate_distractors: List of candidate distractor strings.
        model: Optional object with an ``encode(list_of_str)`` method. When
            omitted, the 'all-MiniLM-L12-v2' SentenceTransformer is loaded on
            first use and reused on later calls.

    Returns:
        tuple: ``(answer_embedding, distractor_embeddings)`` where the first
        item is ``model.encode([answer])`` and the second is
        ``model.encode(candidate_distractors)``.
    """
    if model is None:
        model_name = 'all-MiniLM-L12-v2'
        if model_name not in _SENTENCE_TRANSFORMERS:
            _SENTENCE_TRANSFORMERS[model_name] = SentenceTransformer(model_name)
        model = _SENTENCE_TRANSFORMERS[model_name]
    answer_embedding = model.encode([answer])
    distractor_embeddings = model.encode(candidate_distractors)
    return answer_embedding, distractor_embeddings


def mmr(doc_embedding: np.ndarray,
        word_embeddings: np.ndarray,
        words: List[str],
        top_n: int = 5,
        diversity: float = 0.9) -> List[Tuple[str, float]]:
    """ Select keywords by Maximal Marginal Relevance (MMR).

    The keyword most similar to the document is picked first. Every further
    pick maximises
    ``(1 - diversity) * similarity_to_document
      - diversity * max_similarity_to_already_selected``,
    which trades relevance against redundancy with the picks made so far.

    Args:
        doc_embedding: Embedding of the document (here: the answer).
        word_embeddings: Embeddings of the candidates, one row per entry of
            `words`.
        words: The candidate keywords/keyphrases.
        top_n: Number of keywords to select. Values larger than
            ``len(words)`` are clamped. For non-empty `words` at least one
            keyword is always returned, even when ``top_n`` is below 1.
        diversity: Weight of the redundancy penalty.

    Returns:
         List[Tuple[str, float]]: The selected keywords/keyphrases in
         selection order, each with its cosine similarity to the document
         rounded to 4 decimals. Empty when `words` is empty.

    """
    if len(words) == 0:
        return []
    # Asking for more picks than there are candidates would leave the
    # candidate pool empty inside the loop and make np.argmax raise.
    top_n = min(top_n, len(words))

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose best keyword/keyphrase
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR (named *_scores so the function name is not shadowed)
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    doc_similarity = word_doc_similarity.reshape(1, -1)[0]
    return [(words[idx], round(float(doc_similarity[idx]), 4)) for idx in keywords_idx]


def get_distractors_sense2vec(originalword):
    """Collect distractor candidates for `originalword` from sense2vec.

    The word is lower-cased and its spaces are replaced with underscores
    before ``s2v.get_best_sense`` is queried; the 20 most similar entries of
    that sense become the candidates.

    Args:
        originalword: The answer word or phrase.

    Returns:
        list | None: ``[originalword, candidate, ...]`` with `originalword`
        always first, or ``None`` when sense2vec has no sense for the word
        (callers then fall back to the second algorithm). Candidates are
        compared case-insensitively, so neither a case variant of the answer
        nor two case variants of the same candidate are returned.
    """
    word = originalword.lower().replace(" ", "_")
    sense = s2v.get_best_sense(word)
    if not sense:
        return None

    # Get the most similar words, these are the distractors
    most_similar = s2v.most_similar(sense, n=20)

    distractors = []
    seen = {originalword.lower()}  # lower-cased forms already taken
    for entry in most_similar:
        # entry[0] looks like "some_phrase|TAG": keep the phrase, restore spaces
        candidate = entry[0].split("|")[0].replace("_", " ")
        normalized = candidate.lower()
        if normalized not in seen:
            seen.add(normalized)
            distractors.append(candidate)
    distractors.insert(0, originalword)
    return distractors


def best_distractors_sense2vec(originalword):
    """Pick a diverse set of sense2vec distractors for `originalword` via MMR.

    Returns ``None`` when ``get_distractors_sense2vec`` yields nothing.
    Otherwise the candidates (which start with the answer itself) are embedded
    and ranked with ``mmr``, asking for one pick fewer than there are
    candidates. The first and the last pick are then dropped.
    NOTE(review): the first pick is presumably the answer itself, since it is
    part of the candidate list - confirm; why the last pick is dropped is not
    evident from the code.
    """
    candidates = get_distractors_sense2vec(originalword)
    if not candidates:
        return None

    answer_vec, candidate_vecs = get_answer_and_distractor_embeddings(originalword, candidates)
    ranked = mmr(answer_vec, candidate_vecs, candidates, len(candidates) - 1)

    ranked_words = [pair[0] for pair in ranked]
    return ranked_words[1:-1]

In [32]:
# Algorithm 2 for generating distractors
# Get the sense of the word. In order to attain a quality set of distractors we need to get the right sense of the keyword. 
def get_sense_word(sent, word):
    """Choose the WordNet noun synset of `word` that fits the sentence `sent`.

    Two disambiguators are consulted (``max_similarity`` with the 'wup'
    option and ``adapted_lesk``); of the synsets they propose, the one that
    appears earliest in WordNet's noun synset list for the word is returned.

    Args:
        sent: Sentence that contains the word.
        word: The keyword; multi-word keywords are joined with underscores.

    Returns:
        The chosen synset, or ``None`` when WordNet has no noun synset for
        the word. If neither disambiguator proposes a synset from the list,
        the first listed synset is returned instead of raising.
    """
    # WordNet joins multi-word entries with underscores instead of spaces
    word = word.lower().replace(" ", "_")
    synsets = wn.synsets(word, 'n')  # noun synsets from WordNet
    if not synsets:
        return None

    wup = max_similarity(sent, word, 'wup', pos='n')
    adapted_lesk_output = adapted_lesk(sent, word, pos='n')

    # A disambiguator may return None or a synset that is not in `synsets`;
    # calling synsets.index() on such a value would raise, so only proposals
    # that are actually in the list are considered. The explicit None check
    # comes first because comparing a synset with None may itself fail.
    indices = [
        synsets.index(proposal)
        for proposal in (wup, adapted_lesk_output)
        if proposal is not None and proposal in synsets
    ]
    if not indices:
        return synsets[0]
    return synsets[min(indices)]
    

# Get distractors from WordNet. The distractors are the other hyponyms of the word's first hypernym (explained in detail in the documentation).
def find_distractors(syn, word):
    """Return WordNet distractors for `word`: the siblings of its synset.

    The siblings are the hyponyms of the first hypernym of `syn`. Each is
    represented by the name of its first lemma, with underscores turned into
    spaces and every word capitalised.

    Args:
        syn: Synset of the answer; must provide ``hypernyms()``.
        word: The answer word or phrase (spaces or underscores, any case).

    Returns:
        list: Distinct distractor names in WordNet order, excluding the
        answer itself. Empty when `syn` has no hypernym.
    """
    # Lemma names use underscores and their own capitalisation, so both sides
    # are normalised (spaces, lower case) before the answer is filtered out.
    answer = word.lower().replace("_", " ")

    hypernyms = syn.hypernyms()
    if not hypernyms:
        return []

    dists = []
    for sibling in hypernyms[0].hyponyms():
        name = sibling.lemmas()[0].name().replace("_", " ")
        if name.lower() == answer:
            continue
        name = " ".join(w.capitalize() for w in name.split())
        if name not in dists:
            dists.append(name)
    return dists

# The primary goal of this step is to take our MCQ quality one step further. WordNet may sometimes fail to produce a hypernym
# for a word. In that case ConceptNet comes into play: it supplies the distractors when WordNet has no hypernym
# for the word.
def _conceptnet_edges(url, timeout=10):
    """Return the 'edges' list of one ConceptNet query, or [] if it fails."""
    try:
        # Without a timeout a stalled connection would hang the notebook.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json().get('edges', [])
    except (requests.RequestException, ValueError) as err:
        # Distractor lookup is a best-effort fallback: report the failure and
        # carry on with whatever has been collected instead of aborting.
        warnings.warn("ConceptNet query failed (%s): %s" % (url, err))
        return []


def find_distractors2(word):
    """Return ConceptNet distractors for `word` based on PartOf relations.

    First the things that `word` is a part of are queried (at most 5 edges);
    then, for each of them, the other things that are part of it (at most 10
    edges each). The labels of those other parts are the distractors.

    Args:
        word: The answer word or phrase; it is lower-cased and its spaces
            are replaced with underscores to build the ConceptNet node.

    Returns:
        list: Distinct labels that do not contain the answer (compared case-
        insensitively). Empty when nothing is found or the API cannot be
        reached.
    """
    actword = word.lower()
    # ConceptNet nodes join multi-word terms with underscores
    word = actword.replace(" ", "_")

    dists = []
    url = "http://api.conceptnet.io/query?node=/c/en/%s/n&rel=/r/PartOf&start=/c/en/%s&limit=5" % (word, word)
    for whole_edge in _conceptnet_edges(url):
        link = whole_edge['end']['term']
        url2 = "http://api.conceptnet.io/query?node=%s&rel=/r/PartOf&end=%s&limit=10" % (link, link)
        for part_edge in _conceptnet_edges(url2):
            word2 = part_edge['start']['label']
            # Skip labels already collected and labels that contain the answer
            if word2 not in dists and actword not in word2.lower():
                dists.append(word2)
    return dists

# Ranks the distractors by their similarity to the keyword
# Loaded spaCy pipelines keyed by name. Loading a pipeline is expensive and
# the ranking runs once per question, so each pipeline is loaded only once.
_SPACY_PIPELINES = {}


def rank_distractors_word_similarity(main_word, distractors, nlp=None):
    """Order `distractors` from most to least similar to `main_word`.

    Args:
        main_word: The answer word or phrase.
        distractors: Iterable of distractor strings.
        nlp: Optional callable that turns a string into an object with a
            ``similarity(other)`` method. When omitted, the spaCy pipeline
            'en_core_web_sm' is loaded on first use and reused afterwards.
            NOTE(review): small spaCy pipelines reportedly ship without word
            vectors, which limits the quality of ``similarity`` - consider a
            pipeline with vectors; confirm against the spaCy documentation.

    Returns:
        list: The distinct distractors sorted by decreasing similarity;
        distractors with equal similarity keep their input order.
    """
    if nlp is None:
        if 'en_core_web_sm' not in _SPACY_PIPELINES:
            _SPACY_PIPELINES['en_core_web_sm'] = spacy.load('en_core_web_sm')
        nlp = _SPACY_PIPELINES['en_core_web_sm']

    main_doc = nlp(main_word)
    similarity = {}
    for distractor in distractors:
        similarity[distractor] = main_doc.similarity(nlp(distractor))
    # sorted() is stable, so ties stay in insertion order even with reverse=True
    return sorted(similarity, key=similarity.get, reverse=True)


def hypernym_hyponym_distractors(sentence, word):
    """Generate distractors for `word` from WordNet, falling back to ConceptNet.

    WordNet is tried first, provided a word sense can be determined for the
    word in `sentence`. ConceptNet is used when there is no word sense or
    WordNet yields no distractors.

    Args:
        sentence: Sentence that contains the word (used to pick its sense).
        word: The answer word or phrase.

    Returns:
        list: Distractors ranked by ``rank_distractors_word_similarity``.
        Always a list: empty when neither source yields anything, so callers
        can concatenate the result without checking for ``None``.
    """
    wordsense = get_sense_word(sentence, word)

    # WordNet distractors need a word sense; without one go straight to ConceptNet
    dists = find_distractors(wordsense, word) if wordsense else []
    if not dists:
        dists = find_distractors2(word)
    if not dists:
        return []
    return rank_distractors_word_similarity(word, dists)

In [33]:
def map_distractors(sentence, word):
    """Return ``{word: distractors}`` for one question.

    The sense2vec-based algorithm is tried first; when it yields nothing the
    WordNet/ConceptNet algorithm is used for `word` in `sentence`.

    Returns:
        dict: A single-entry mapping from `word` to its list of distractors.
        The value is always a list (possibly empty), never ``None``, so it
        can be concatenated with the answer option directly.
    """
    dists = best_distractors_sense2vec(word)
    if not dists:
        dists = hypernym_hyponym_distractors(sentence, word)
    return {word: dists or []}

## MCQ Generation

In [34]:
original_file_name = file.name
# splitext removes only the final extension, so article paths that contain
# other dots (e.g. "./data/article2.txt") still give a sensible output name.
output_file_name = os.path.splitext(original_file_name)[0] + "_MCQs.txt"

opts = ['a', 'b', 'c', 'd', 'e']  # labels for the (at most 5) options

# Opening with 'w' truncates a file left over from a previous run, so no
# separate delete step is needed.
with open(output_file_name, 'w') as f:
    f.write("Multiple Choice Questions generated for the provided article: {0}.".format(original_file_name))
    f.write("\n")
    f.write("\n")
    iterator = 1  # To keep the count of the questions
    for each in mapped_sentences:
        # The keyword is matched literally: re.escape keeps characters such
        # as "." or "+" in a keyword from being read as regex syntax.
        p = re.compile(r'\b' + re.escape(each) + r'\b', re.IGNORECASE)
        for sent in mapped_sentences[each]:
            distractor_mapper = map_distractors(sent, each)
            op = p.sub("________", sent)  # Replaces the keyword with underscores (blanks)
            f.write("Question %s ->" % (iterator) + " " + op)  # the question with its number
            # The answer comes first, followed by the distractors (none if
            # no distractors were found for this keyword).
            options = [each.capitalize()] + (distractor_mapper[each] or [])
            options = options[:5]  # Selects only 5 options
            random.shuffle(options)  # Shuffle so the answer is not always option a
            for i, ch in enumerate(options):
                f.write("\n" + "\t" + opts[i] + ") " + ch)
            f.write("\n")
            f.write("\n")
            iterator += 1  # Increase the counter