## Generate fill-in-the-blank questions from any content

In [2]:
# # Install pke (Python Keyword Extraction) from https://github.com/boudinfl/pke
# # We pin a specific commit because later changes might break this code. If pke were on PyPI we would pin an exact version number instead.

# !pip install --quiet git+https://github.com/boudinfl/pke.git@dc4d5f21e0ffe64c4df93c46146d29d1c522476b
# !pip install --quiet flashtext==2.7

In [4]:
# Wrap the text so it can be viewed comfortably in a notebook cell
import textwrap
text = """There is a lot of volcanic activity at divergent plate boundaries in the oceans. For example, many undersea volcanoes are found along the Mid-Atlantic Ridge. This is a divergent plate boundary that runs north-south through the middle of the Atlantic Ocean. As tectonic plates pull away from each other at a divergent plate boundary, they create deep fissures, or cracks, in the crust. Molten rock, called magma, erupts through these cracks onto Earth’s surface. At the surface, the molten rock is called lava. It cools and hardens, forming rock. Divergent plate boundaries also occur in the continental crust. Volcanoes form at these boundaries, but less often than in ocean crust. That’s because continental crust is thicker than oceanic crust. This makes it more difficult for molten rock to push up through the crust. Many volcanoes form along convergent plate boundaries where one tectonic plate is pulled down beneath another at a subduction zone. The leading edge of the plate melts as it is pulled into the mantle, forming magma that erupts as volcanoes. When a line of volcanoes forms along a subduction zone, they make up a volcanic arc. The edges of the Pacific plate are long subduction zones lined with volcanoes. This is why the Pacific rim is called the “Pacific Ring of Fire.”"""

# Break the passage into lines of at most 100 characters and print each line.
wrapper = textwrap.TextWrapper(width=100)
word_list = wrapper.wrap(text=text)
for wrapped_line in word_list:
    print(wrapped_line)

There is a lot of volcanic activity at divergent plate boundaries in the oceans. For example, many
undersea volcanoes are found along the Mid-Atlantic Ridge. This is a divergent plate boundary that
runs north-south through the middle of the Atlantic Ocean. As tectonic plates pull away from each
other at a divergent plate boundary, they create deep fissures, or cracks, in the crust. Molten
rock, called magma, erupts through these cracks onto Earth’s surface. At the surface, the molten
rock is called lava. It cools and hardens, forming rock. Divergent plate boundaries also occur in
the continental crust. Volcanoes form at these boundaries, but less often than in ocean crust.
That’s because continental crust is thicker than oceanic crust. This makes it more difficult for
molten rock to push up through the crust. Many volcanoes form along convergent plate boundaries
where one tectonic plate is pulled down beneath another at a subduction zone. The leading edge of
the plate melts as it is pu

In [5]:
# Importing required libraries
import json
import requests
import string
import re
import nltk
import string
import itertools
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
import pke
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import traceback
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Tokenizing sentence using nltk sent_tokenize
def tokenize_sentences(text):
    """Split ``text`` into sentences and discard the very short ones.

    Sentences come from ``sent_tokenize``. A sentence is kept only when its
    length, measured before stripping, is greater than 20 characters; kept
    sentences are returned in order with surrounding whitespace removed.
    """
    kept = []
    for candidate in sent_tokenize(text):
        # The length check deliberately runs on the unstripped sentence.
        if len(candidate) > 20:
            kept.append(candidate.strip())
    return kept

# Split the sample passage into sentences and show the resulting list.
sentences = tokenize_sentences(text)
print(sentences)

['There is a lot of volcanic activity at divergent plate boundaries in the oceans.', 'For example, many undersea volcanoes are found along the Mid-Atlantic Ridge.', 'This is a divergent plate boundary that runs north-south through the middle of the Atlantic Ocean.', 'As tectonic plates pull away from each other at a divergent plate boundary, they create deep fissures, or cracks, in the crust.', 'Molten rock, called magma, erupts through these cracks onto Earth’s surface.', 'At the surface, the molten rock is called lava.', 'It cools and hardens, forming rock.', 'Divergent plate boundaries also occur in the continental crust.', 'Volcanoes form at these boundaries, but less often than in ocean crust.', 'That’s because continental crust is thicker than oceanic crust.', 'This makes it more difficult for molten rock to push up through the crust.', 'Many volcanoes form along convergent plate boundaries where one tectonic plate is pulled down beneath another at a subduction zone.', 'The leadi

In [6]:
# keyword extraction using pke's MultipartiteRank Algorithm
# Working principle of MultipartiteRank Algorithm - https://www.aclweb.org/anthology/N18-2105.pdf
def get_noun_adj_verb(text, n=30):
    """Extract keyphrases from ``text`` with pke's MultipartiteRank.

    Candidates are limited to verbs, adjectives and nouns, and anything in the
    stoplist (punctuation, bracket tokens, NLTK English stopwords) is excluded.

    Args:
        text: The passage to extract keyphrases from.
        n: Maximum number of keyphrases requested from the extractor
            (defaults to 30, the value previously hard-coded).

    Returns:
        A list holding the first element of each entry returned by
        ``extractor.get_n_best`` (the keyphrase string), in ranked order.
        If extraction raises an ordinary exception, the traceback is printed
        and an empty list is returned.
    """
    try:
        # 1. Create the extractor and load the document.
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=text)

        # 2. Select candidates: keep verbs, adjectives and nouns that do
        #    not contain punctuation marks or stopwords.
        pos = {'VERB', 'ADJ', 'NOUN'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        extractor.candidate_selection(pos=pos, stoplist=stoplist)

        # 3. Build the Multipartite graph and rank candidates using random walk;
        #    alpha controls the weight adjustment mechanism, see TopicRank for
        #    threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')

        # 4. Keep only the phrase (first element) of each ranked entry.
        return [entry[0] for entry in extractor.get_n_best(n=n)]
    except Exception:
        # Best effort: report the failure and fall back to no keywords.
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        traceback.print_exc()
        return []

# Extract keyphrases from the sample passage and display them.
noun_verbs_adj = get_noun_adj_verb(text)
print("keywords: ", noun_verbs_adj)

keywords:  ['divergent plate boundaries', 'volcanoes form', 'molten rock', 'crust', 'oceans', 'called magma', 'erupts', 'volcanic activity', 'cracks', 'tectonic plates pull', 'surface', 'makes', 'subduction zone', 'runs north', 'ocean crust', 'south', 'leading edge', 'forming rock', 'occur', 'lot', 'example', 'hardens', 'continental crust', 'create deep fissures', 'middle', 'push', 'difficult', 'volcanoes', 'cools', 'many volcanoes form']


In [7]:
# Identify the matching sentences for each keyword
from pprint import pprint
def get_sentences_for_keyword(keywords, sentences):
    """Map every keyword to the sentences in which it is found.

    Matching is delegated to a flashtext ``KeywordProcessor`` created with
    its default settings.

    Args:
        keywords: Iterable of keyword strings to look for.
        sentences: Iterable of sentence strings to search.

    Returns:
        A dict with one entry per keyword. Each value is the list of sentences
        containing that keyword, longest sentence first; keywords that were
        never found map to an empty list. A sentence is listed at most once
        per keyword, even when the keyword occurs in it several times.
    """
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)

    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        # extract_keywords yields one hit per occurrence, so a keyword that
        # repeats inside a sentence would otherwise add the sentence twice.
        # dict.fromkeys removes the repeats while preserving hit order.
        for key in dict.fromkeys(keywords_found):
            keyword_sentences[key].append(sentence)

    # sorted() is stable, so equally long sentences keep their text order.
    return {
        key: sorted(found, key=len, reverse=True)
        for key, found in keyword_sentences.items()
    }

# Build the keyword -> sentences mapping for the extracted keyphrases and pretty-print it.
keyword_sentence_mapping_noun_verbs_adj = get_sentences_for_keyword(noun_verbs_adj, sentences)
pprint(keyword_sentence_mapping_noun_verbs_adj)

{'called': ['Molten rock, called magma, erupts through these cracks onto '
            'Earth’s surface.',
            'This is why the Pacific rim is called the “Pacific Ring of Fire.”',
            'At the surface, the molten rock is called lava.'],
 'continental crust': ['Divergent plate boundaries also occur in the '
                       'continental crust.',
                       'That’s because continental crust is thicker than '
                       'oceanic crust.'],
 'cools': ['It cools and hardens, forming rock.'],
 'cracks': ['As tectonic plates pull away from each other at a divergent plate '
            'boundary, they create deep fissures, or cracks, in the crust.',
            'Molten rock, called magma, erupts through these cracks onto '
            'Earth’s surface.'],
 'create deep fissures': ['As tectonic plates pull away from each other at a '
                          'divergent plate boundary, they create deep '
                          'fissures, or cracks,

In [8]:
# Blank out each keyword case-insensitively, using every sentence at most once
def get_fill_in_the_blanks(sentence_mapping, max_items=10):
    """Turn a keyword -> sentences mapping into fill-in-the-blank questions.

    For each keyword, only its first sentence is considered. The sentence is
    used when it has not already been used for another keyword and the
    keyword occurs in it exactly once as a whole word (case-insensitive);
    that occurrence is replaced by a blank.

    Args:
        sentence_mapping: Dict mapping each keyword to a list of sentences.
        max_items: Maximum number of questions to produce (defaults to 10).

    Returns:
        A dict with three entries: ``"title"`` (instruction text),
        ``"sentences"`` (the blanked sentences) and ``"keys"`` (the keyword
        removed from each sentence, in the same order).
    """
    out = {"title": "Fill in the blanks for these sentences with matching words at the top"}
    blank_sentences = []
    keys = []
    processed = set()  # sentences already turned into a question

    for key, candidates in sentence_mapping.items():
        if len(blank_sentences) >= max_items:
            break  # later entries would be discarded anyway
        if not key or not candidates:
            continue

        sent = candidates[0]
        if sent in processed:
            continue

        # Lookarounds restrict the match to whole words, so a key such as
        # "form" is not counted (or blanked) inside "platforms". They are used
        # instead of \b so keys that start or end with punctuation still work.
        pattern = re.compile(r'(?<!\w)' + re.escape(key) + r'(?!\w)', re.IGNORECASE)

        # Exactly one occurrence: zero would leave the sentence without a
        # blank, and two or more would give the answer away or be ambiguous.
        if len(pattern.findall(sent)) != 1:
            continue

        blank_sentences.append(pattern.sub(' _________ ', sent))
        processed.add(sent)
        keys.append(key)

    out["sentences"] = blank_sentences
    out["keys"] = keys
    return out


# Generate the fill-in-the-blank questions from the keyword mapping and pretty-print them.
fill_in_the_blanks = get_fill_in_the_blanks(keyword_sentence_mapping_noun_verbs_adj)
pprint(fill_in_the_blanks)

{'keys': ['divergent plate boundaries',
          'many undersea volcanoes',
          'crust',
          'molten rock',
          'forming rock',
          'subduction zone',
          'erupts',
          'continental crust',
          'ocean crust',
          'runs'],
 'sentences': ['There is a lot of volcanic activity at  _________  in the '
               'oceans.',
               'For example,  _________  are found along the Mid-Atlantic '
               'Ridge.',
               'As tectonic plates pull away from each other at a divergent '
               'plate boundary, they create deep fissures, or cracks, in the  '
               '_________ .',
               ' _________ , called magma, erupts through these cracks onto '
               'Earth’s surface.',
               'It cools and hardens,  _________ .',
               'Many volcanoes form along convergent plate boundaries where '
               'one tectonic plate is pulled down beneath another at a  '
               '____