In [1]:
#!pip install spacy

Base function for short summarization

In [2]:
def simple_question_generator(text):
    # Splitting the text into sentences
    sentences = text.split('. ')
    questions = []
    for sentence in sentences:
        if "will" in sentence:
            question = sentence.replace("will", "Who will", 1) + "?"
        elif "is" in sentence:
            question = sentence.replace("is", "What is", 1) + "?"
        elif "are" in sentence:
            question = sentence.replace("are", "What are", 1) + "?"
        else:
            words = sentence.split()
            if len(words) > 0:
                question = "What " + ' '.join(words) + "?"
            questions.append(question)

    return questions


In [6]:
# Shortened summary
summary = """
The sun is a star located at the center of our solar system. It provides the necessary heat and light for life on Earth. Planets orbit around the sun in predictable paths. Scientists are constantly studying the sun to understand its impact on Earth's climate. Future missions will explore the possibility of harnessing solar energy more effectively. The sun's surface is extremely hot, and its core generates energy through nuclear fusion. Understanding solar phenomena like solar flares and sunspots are crucial for space weather forecasting.
"""

# Generate questions
generated_questions = simple_question_generator(summary)
for question in generated_questions:
    print(question)


What It provides the necessary heat and light for life on Earth?
What Planets orbit around the sun in predictable paths?


Basic fill-in-the-blank quiz generation using nltk

In [25]:
#!pip install nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def generate_fill_in_the_blank_quiz(text):
    word_tokens = word_tokenize(text)
    pos_tags = pos_tag(word_tokens)

    questions = []
    for i, (word, tag) in enumerate(pos_tags):
        if tag in ['NN', 'NNS', 'NNP', 'NNPS']:  # Nouns
            question = ' '.join(word_tokens[:i] + ['____'] + word_tokens[i + 1:])
            questions.append((question, word))

    return questions

# Example usage
text = "Paris is the capital of France. The Eiffel Tower is located there."
quiz = generate_fill_in_the_blank_quiz(text)
for question, answer in quiz:
    print(f"Question: {question}\nAnswer: {answer}\n")


Question: ____ is the capital of France . The Eiffel Tower is located there .
Answer: Paris

Question: Paris is the ____ of France . The Eiffel Tower is located there .
Answer: capital

Question: Paris is the capital of ____ . The Eiffel Tower is located there .
Answer: France

Question: Paris is the capital of France . The ____ Tower is located there .
Answer: Eiffel

Question: Paris is the capital of France . The Eiffel ____ is located there .
Answer: Tower



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/songzhixiao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/songzhixiao/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True / False question using scikit-learn

In [27]:
#!pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

def generate_true_false_quiz(sentences):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)

    questions = []
    for i, sentence in enumerate(sentences):
        # Generate a false sentence
        false_sentence = random.choice(sentences)
        
        # Check similarity to avoid very similar sentences
        similarity = cosine_similarity(tfidf_matrix[i], vectorizer.transform([false_sentence]))
        if similarity < 0.5:
            questions.append((f"True or False: {sentence}", "True"))
            questions.append((f"True or False: {false_sentence}", "False"))

    return questions

# Example usage
sentences = [
    "The Eiffel Tower is in Paris.",
    "The capital of Italy is Rome.",
    "The Nile is the longest river in the world."
]
quiz = generate_true_false_quiz(sentences)
for question, answer in quiz:
    print(f"Question: {question}\nAnswer: {answer}\n")


Question: True or False: The capital of Italy is Rome.
Answer: True

Question: True or False: The Eiffel Tower is in Paris.
Answer: False

Question: True or False: The Nile is the longest river in the world.
Answer: True

Question: True or False: The capital of Italy is Rome.
Answer: False



Using ECR

In [23]:
#ECR
import spacy
import random

# Load SpaCy model for NER
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    doc = nlp(text)
    entities = [(entity.text, entity.label_) for entity in doc.ents]
    return entities

def generate_questions_from_entities(text, entities):
    questions = []
    for entity, label in entities:
        if label in ['PERSON', 'ORG', 'GPE', 'LOC']:  # Focusing on specific entity types
            question = f"Who or what is {entity}?"
            questions.append(question)
        elif label in ['DATE', 'TIME']:
            question = f"When did {entity} occur?"
            questions.append(question)
        elif label in ['NORP', 'EVENT']:
            question = f"What can you tell about {entity}?"
            questions.append(question)
    return questions

# Example text
text = "Muhammadu Buhari plans to fight corruption in Nigeria and address the nation's unrest. He'll focus on violence in the northeast, where Boko Haram operates, and cooperate with Chad, Cameroon, and Niger."

# Extract entities and generate questions
entities = extract_entities(text)
questions = generate_questions_from_entities(text, entities)

for question in questions:
    print(question)


Who or what is Muhammadu Buhari?
Who or what is Nigeria?
Who or what is Boko Haram?
Who or what is Chad?
Who or what is Cameroon?
Who or what is Niger?


Seq2Seq Model for Question Generation

In [46]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
import numpy as np

# Sample data (replace with actual data)
paragraphs = [
    'The Eiffel Tower, located on the Champ de Mars in Paris, is a wrought-iron lattice tower named after the engineer Gustave Eiffel. It was constructed from 1887 to 1889 as the entrance to the 1889 World’s Fair.',
    'The Sahara is the largest hot desert in the world, covering large parts of North Africa. It is known for its harsh environment and extreme temperatures during the day and night.'
]
questions = ['Where is the Eiffel Tower?', 'What is the Sahara?']

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(paragraphs + questions)
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences
seq_paragraphs = tokenizer.texts_to_sequences(paragraphs)
seq_questions = tokenizer.texts_to_sequences(questions)

# Padding
max_paragraph_length = max(len(seq) for seq in seq_paragraphs)
max_question_length = max(len(seq) for seq in seq_questions)
seq_paragraphs = pad_sequences(seq_paragraphs, maxlen=max_paragraph_length, padding='post')
seq_questions = pad_sequences(seq_questions, maxlen=max_question_length, padding='post')


In [47]:
latent_dim = 256  # Latent dimensionality

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(vocab_size, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(vocab_size, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')


In [48]:
# Encoder Inference Model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder Inference Model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2 = dec_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)


In [49]:
def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))
    # Populate the first word of target sequence with the start token
    target_seq[0, 0] = tokenizer.word_index['start']  # Assuming 'start' is the start token

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word[sampled_token_index]
        decoded_sentence += ' ' + sampled_word

        # Exit condition: either hit max length or find stop token
