!python -m spacy download en_core_web_sm
!pip install flashtext pywsd
!pip install git+https://github.com/boudinfl/pke.git

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('popular')


In [1]:
import nltk
import pke
import string
import random
import requests
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor
from pywsd.similarity import max_similarity
from pywsd.lesk import adapted_lesk
from pywsd.lesk import cosine_lesk

Warming up PyWSD (takes ~10 secs)... took 4.874185800552368 secs.


In [2]:
# Step 1: Importing the text file
def read_text_file(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        print("File not found!")
        return None


In [3]:
# Step 2: Keyword Extraction
def get_important_words(text):
    extractor = pke.unsupervised.MultipartiteRank()
    extractor.load_document(input=text, language='en')
    pos = {'PROPN'}
    stoplist = list(string.punctuation) + ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-'] + stopwords.words('english')
    extractor.candidate_selection(pos=pos, stoplist=stoplist)
    extractor.candidate_weighting()
    return [keyphrase[0] for keyphrase in extractor.get_n_best(n=25)]


In [4]:
# Step 3: Sentence Splitting
def split_text_to_sentences(text):
    return [sent.strip() for sent in sent_tokenize(text) if len(sent) > 15]

In [5]:
# Step 4: Mapping Sentences to Keywords
def map_sentences_to_keywords(important_words, sentences):
    keyword_processor = KeywordProcessor()
    keyword_sentences = {word: [] for word in important_words}
    for word in important_words:
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        for word in keyword_processor.extract_keywords(sentence):
            keyword_sentences[word].append(sentence)
    return keyword_sentences

In [6]:
# Step 5: Getting Word Sense
def get_word_sense(sentence, word):
    word = word.lower()
    if len(word.split()) > 0:
        word = word.replace(" ", "_")
    synsets = wn.synsets(word, 'n')
    if synsets:
        wup = max_similarity(sentence, word, 'wup', pos='n')
        adapted_lesk_output = adapted_lesk(sentence, word, pos='n')
        lowest_index = min(synsets.index(wup), synsets.index(adapted_lesk_output))
        return synsets[lowest_index]
    else:
        return None

In [7]:
# Step 6: Getting Distractors from WordNet
def get_wordnet_distractors(synset, word):
    dists = []
    hypernyms = synset.hypernyms()
    if hypernyms:
        for hyponym in hypernyms[0].hyponyms():
            name = hyponym.lemmas()[0].name().replace("_", " ").capitalize()
            if name != word.capitalize() and name not in dists:
                dists.append(name)
    return dists


In [8]:
# Step 7: Getting Distractors from ConceptNet
def get_conceptnet_distractors(word):
    word = word.lower().replace(" ", "_")
    dists = []
    url = f"http://api.conceptnet.io/query?node=/c/en/{word}/n&rel=/r/PartOf&start=/c/en/{word}&limit=5"
    obj = requests.get(url).json()
    for edge in obj['edges']:
        link = edge['end']['term']
        url2 = f"http://api.conceptnet.io/query?node={link}&rel=/r/PartOf&end={link}&limit=10"
        obj2 = requests.get(url2).json()
        for edge in obj2['edges']:
            word2 = edge['start']['label']
            if word2.lower() != word and word2 not in dists:
                dists.append(word2)
    return dists

In [9]:
# Step 8: Mapping Distractors to Keywords
def map_distractors_to_keywords(important_words, keyword_sentences):
    mapped_distractors = {}
    for word in important_words:
        word_sense = get_word_sense(keyword_sentences[word][0], word)
        if word_sense:
            distractors = get_wordnet_distractors(word_sense, word)
            if not distractors:
                distractors = get_conceptnet_distractors(word)
            if distractors:
                mapped_distractors[word] = distractors
    return mapped_distractors


In [10]:
# Step 9: Formatting MCQs
def generate_mcqs(mapped_sentences, mapped_distractors):
    print("************************************** Multiple Choice Questions **************************************\n")
    iterator = 1
    for word, sentences in mapped_sentences.items():
        sentence = sentences[0]
        question_pattern = re.compile(word, re.IGNORECASE)
        question = question_pattern.sub("________", sentence)
        print(f"Question {iterator} -> {question}\n")
        options = [word.capitalize()] + mapped_distractors[word]
        options = random.sample(options, min(4, len(options)))
        random.shuffle(options)
        for idx, option in enumerate(options):
            print(f"\t({chr(97 + idx)}) {option}")
        print()
        iterator += 1

if __name__ == "__main__":
    file_path = "article.txt"
    text = read_text_file(file_path)
    if text:
        important_words = get_important_words(text)
        sentences = split_text_to_sentences(text)
        mapped_sentences = map_sentences_to_keywords(important_words, sentences)
        mapped_distractors = map_distractors_to_keywords(important_words, mapped_sentences)
        generate_mcqs(mapped_sentences, mapped_distractors)

TypeError: TopicRank.candidate_selection() got an unexpected keyword argument 'stoplist'