In [1]:
# import spacy

# # Load spaCy model
# nlp = spacy.load("en_core_web_sm")

# def preprocess_text(text):
#     doc = nlp(text)
#     return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

# def extract_keywords(text):
#     doc = nlp(text)
#     return [chunk.text for chunk in doc.noun_chunks]

# def generate_question(sentence, keyword):
#     question = sentence.replace(keyword, "___")
#     return question

# # Example text
# text = "Photosynthesis occurs in the chloroplasts of plant cells."

# # Preprocess text
# tokens = preprocess_text(text)
# print("Tokens:", tokens)

# # Extract keywords
# keywords = extract_keywords(text)
# print("Keywords:", keywords)

# # Generate question
# if keywords:
#     question = generate_question(text, keywords[0])
#     print("Generated Question:", question)

# # Example output
# # Tokens: ['photosynthesis', 'occur', 'chloroplast', 'plant', 'cell']
# # Keywords: ['Photosynthesis', 'the chloroplasts', 'plant cells']
# # Generated Question: ___ occurs in the chloroplasts of plant cells.


Tokens: ['photosynthesis', 'occur', 'chloroplast', 'plant', 'cell']
Keywords: ['Photosynthesis', 'the chloroplasts', 'plant cells']
Generated Question: ___ occurs in the chloroplasts of plant cells.


In [1]:
# today 29/5/2024
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, GPT2LMHeadModel, GPT2Tokenizer
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\deepp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, GPT2LMHeadModel, GPT2Tokenizer
import nltk
import random
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Load T5 model and tokenizer for question generation
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small', legacy=False)
t5_tokenizer.pad_token = t5_tokenizer.eos_token

# Load GPT-2 model and tokenizer for distractor generation
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def generate_question(context, answer):
    # Prepare the input text for T5
    input_text = f"generate question: {context} answer: {answer}"
    input_ids = t5_tokenizer.encode(input_text, return_tensors='pt', padding=True)

    # Generate question with attention mask
    attention_mask = input_ids.ne(t5_tokenizer.pad_token_id).long()
    outputs = t5_model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_beams=5, early_stopping=True)
    question = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return question

def generate_distractors(answer, num_distractors=3):
    distractors = set()  # Use a set to avoid duplicates
    input_text = f"Generate distractors for: {answer}. Distractor:"

    while len(distractors) < num_distractors:
        input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt', padding=True)
        attention_mask = input_ids.ne(gpt2_tokenizer.pad_token_id).long()
        output = gpt2_model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2, pad_token_id=gpt2_tokenizer.pad_token_id)
        distractor = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
        distractor = distractor.replace(input_text, '').strip()

        if distractor and distractor.lower() != answer.lower() and distractor not in distractors:
            distractors.add(distractor)
    
    return list(distractors)

def generate_mcq(context):
    sentences = sent_tokenize(context)
    if not sentences:
        return None

    # Assume the answer is the last sentence for simplicity
    answer = sentences[-1]
    question = generate_question(context, answer)
    distractors = generate_distractors(answer)

    # Ensure we have 3 distractors
    if len(distractors) < 3:
        return None

    # Combine correct answer with distractors and shuffle
    options = distractors + [answer]
    random.shuffle(options)

    mcq = {
        "question": question,
        "options": options,
        "correct_answer": answer
    }

    return mcq

# Example usage
context = "The capital of France is Paris. It is known for its cafes and the Eiffel Tower."
mcq = generate_mcq(context)

if mcq:
    print("Question:", mcq["question"])
    print("Options:", mcq["options"])
    print("Correct Answer:", mcq["correct_answer"])
else:
    print("MCQ generation failed.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\deepp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


**mcq_generator**

In [2]:
import nltk
import random
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.chunk import tree2conlltags

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
# nltk.download('wordnet')

In [4]:


def extract_keywords(text):
    sentences = sent_tokenize(text)
    words = [word_tokenize(sent) for sent in sentences]
    pos_tags = [pos_tag(word) for word in words]
    named_entities = [ne_chunk(tag) for tag in pos_tags]
    named_entities_tags = [tree2conlltags(ne) for ne in named_entities]

    keywords = []
    for sent in named_entities_tags:
        for word, tag, chunk in sent:
            if chunk != 'O':
                keywords.append(word)
    
    return list(set(keywords))

def generate_distractors(answer, num_distractors=3):
    distractors = set()
    synsets = wn.synsets(answer)
    if not synsets:
        return list(distractors)
    for syn in synsets:
        for lemma in syn.lemmas():
            if lemma.name().lower() != answer.lower():
                distractors.add(lemma.name().replace('_', ' '))
            if len(distractors) >= num_distractors:
                break
        if len(distractors) >= num_distractors:
            break
    return list(distractors)

def create_mcq(text):
    keywords = extract_keywords(text)
    questions = []
    for keyword in keywords:
        sentences = sent_tokenize(text)
        sentence = next((sent for sent in sentences if keyword in sent), None)
        if sentence:
            question = sentence.replace(keyword, "____")
            distractors = generate_distractors(keyword)
            if len(distractors) < 3:
                continue
            options = distractors + [keyword]
            random.shuffle(options)
            questions.append({
                'question': question,
                'options': options,
                'answer': keyword
            })
    return questions

# Example text
text = """
Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.[1] Recently, artificial neural networks have been able to surpass many previous approaches in performance.[2][3]

ML finds application in many fields, including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine.[4][5] When applied to business problems, it is known under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods.

The mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods. Data mining is a related (parallel) field of study, focusing on exploratory data analysis (EDA) through unsupervised learning.[7][8]

From a theoretical viewpoint, probably approximately correct (PAC) learning provides a framework for describing machine learning.
"""

# Generate MCQs
mcqs = create_mcq(text)
for i, mcq in enumerate(mcqs):
    print(f"Q{i+1}: {mcq['question']}")
    for j, option in enumerate(mcq['options']):
        print(f"  {chr(65+j)}. {option}")
    print(f"Answer: {mcq['answer']}")
    print()


Q1: ____ mining is a related (parallel) field of study, focusing on exploratory data analysis (EDA) through unsupervised learning.
  A. data point
  B. datum
  C. Data
  D. information
Answer: Data

Q2: 
____ learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.
  A. simple machine
  B. Machine
  C. political machine
  D. car
Answer: Machine



## 4-5 questions

In [10]:
import nltk
import random
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.chunk import tree2conlltags


def extract_keywords(text):
    sentences = sent_tokenize(text)
    words = [word_tokenize(sent) for sent in sentences]
    pos_tags = [pos_tag(word) for word in words]
    named_entities = [ne_chunk(tag) for tag in pos_tags]
    named_entities_tags = [tree2conlltags(ne) for ne in named_entities]

    keywords = []
    for sent in named_entities_tags:
        for word, tag, chunk in sent:
            if chunk != 'O':
                keywords.append(word)
    
    return list(set(keywords))

def generate_distractors(answer, num_distractors=3):
    distractors = set()
    synsets = wn.synsets(answer)
    if not synsets:
        return list(distractors)
    for syn in synsets:
        for lemma in syn.lemmas():
            if lemma.name().lower() != answer.lower():
                distractors.add(lemma.name().replace('_', ' '))
            if len(distractors) >= num_distractors:
                break
        if len(distractors) >= num_distractors:
            break
    return list(distractors)

def create_mcq(text, num_questions=4):
    keywords = extract_keywords(text)
    random.shuffle(keywords)
    questions = []
    for keyword in keywords:
        sentences = sent_tokenize(text)
        sentence = next((sent for sent in sentences if keyword in sent), None)
        if sentence:
            question = sentence.replace(keyword, "____")
            distractors = generate_distractors(keyword)
            if len(distractors) < 3:
                continue
            options = distractors + [keyword]
            random.shuffle(options)
            questions.append({
                'question': question,
                'options': options,
                'answer': keyword
            })
            if len(questions) >= num_questions:
                break
    return questions

# Example text
text = """
Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.[1] Recently, artificial neural networks have been able to surpass many previous approaches in performance.[2][3]

ML finds application in many fields, including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine.[4][5] When applied to business problems, it is known under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods.

The mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods. Data mining is a related (parallel) field of study, focusing on exploratory data analysis (EDA) through unsupervised learning.[7][8]

From a theoretical viewpoint, probably approximately correct (PAC) learning provides a framework for describing machine learning.
As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI). In the early days of AI as an academic discipline, some researchers were interested in having machines learn from data. They attempted to approach the problem with various symbolic methods, as well as what were then termed "neural networks"; these were mostly perceptrons and other models that were later found to be reinventions of the generalized linear models of statistics.[23] Probabilistic reasoning was also employed, especially in automated medical diagnosis.[24]: 488 

However, an increasing emphasis on the logical, knowledge-based approach caused a rift between AI and machine learning. Probabilistic systems were plagued by theoretical and practical problems of data acquisition and representation. By 1980, expert systems had come to dominate AI, and statistics was out of favor.[25] Work on symbolic/knowledge-based learning did continue within AI, leading to inductive logic programming(ILP), but the more statistical line of research was now outside the field of AI proper, in pattern recognition and information retrieval.[24]: 708–710, 755  Neural networks research had been abandoned by AI and computer science around the same time. This line, too, was continued outside the AI/CS field, as "connectionism", by researchers from other disciplines including Hopfield, Rumelhart, and Hinton. Their main success came in the mid-1980s with the reinvention of backpropagation.[24]: 25 

Machine learning (ML), reorganized and recognized as its own field, started to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling solvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from AI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory.[25]
"""

# Generate 4-5 MCQs
mcqs = create_mcq(text, num_questions=5)
for i, mcq in enumerate(mcqs):
    print(f"Q{i+1}: {mcq['question']}")
    for j, option in enumerate(mcq['options']):
        print(f"  {chr(65+j)}. {option}")
    print(f"Answer: {mcq['answer']}")
    print()


Q1: ____ mining is a related (parallel) field of study, focusing on exploratory data analysis (EDA) through unsupervised learning.
  A. information
  B. data point
  C. datum
  D. Data
Answer: Data

Q2: 
____ learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.
  A. simple machine
  B. car
  C. Machine
  D. political machine
Answer: Machine

Q3: As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (____).
  A. three-toed sloth
  B. artificial intelligence
  C. AI
  D. Army Intelligence
Answer: AI



In [1]:
# import spacy
# from transformers import pipeline

# # Load spaCy model
# nlp = spacy.load("en_core_web_sm")

# # Load pre-trained fill-mask model
# mask_filler = pipeline("fill-mask", model="bert-base-uncased")

# def preprocess_text(text):
#     doc = nlp(text)
#     return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

# def extract_keywords(text):
#     doc = nlp(text)
#     return [chunk.text for chunk in doc.noun_chunks]

# def generate_question(text, keyword):
#     masked_text = text.replace(keyword, mask_filler.tokenizer.mask_token)
#     results = mask_filler(masked_text)
#     questions = [result['sequence'].replace(mask_filler.tokenizer.mask_token, keyword) for result in results]
#     return questions[0] if questions else None

# def generate_distractors(answer, n=3):
#     # Placeholder for distractor generation using semantic similarity or rule-based methods
#     return [answer + " distractor " + str(i+1) for i in range(n)]

# # Example text
# text = input("Please enter your paragraph")
# # Preprocess text
# tokens = preprocess_text(text)
# print("Tokens:", tokens)

# # Extract keywords
# keywords = extract_keywords(text)
# print("Keywords:", keywords)

# # Generate question
# if keywords:
#     question = generate_question(text, keywords[0])
#     print("Generated Question:", question)

#     # Generate distractors
#     correct_answer = keywords[0]
#     distractors = generate_distractors(correct_answer)
#     print("Distractors:", distractors)

# # Example output
# # Tokens: ['photosynthesis', 'occur', 'chloroplast', 'plant', 'cell']
# # Keywords: ['Photosynthesis', 'the chloroplasts', 'plant cells']
# # Generated Question: [Generated question with mask token replaced by the keyword]
# # Distractors: ['Photosynthesis distractor 1', 'Photosynthesis distractor 2', 'Photosynthesis distractor 3']


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Tokens: ['scientific', 'endeavor', 'machine', 'learning', 'grow', 'quest', 'artificial', 'intelligence', 'AI', 'early', 'day', 'AI', 'academic', 'discipline', 'researcher', 'interested', 'have', 'machine', 'learn', 'datum', 'attempt', 'approach', 'problem', 'symbolic', 'method', 'term', 'neural', 'network', 'perceptron', 'model', 'later', 'find', 'reinvention', 'generalized', 'linear', 'model', 'statistics.[23', 'probabilistic', 'reasoning', 'employ', 'especially', 'automate', 'medical', 'diagnosis.[24', '\u200a', '488', '\u200a  ', 'increase', 'emphasis', 'logical', 'knowledge', 'base', 'approach', 'cause', 'rift', 'AI', 'machine', 'learning', 'probabilistic', 'system', 'plague', 'theoretical', 'practical', 'problem', 'datum', 'acquisition', 'representation.[24', '\u200a', '488', '\u200a ', '1980', 'expert', 'system', 'come', 'dominate', 'AI', 'statistic', 'favor.[25', 'work', 'symbolic', 'knowledge', 'base', 'learning', 'continue', 'AI', 'lead', 'inductive', 'logic', 'programming(ilp

In [14]:
import spacy
from transformers import pipeline

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load pre-trained question generation model
question_generator = pipeline("text2text-generation", model="valhalla/t5-small-qg-prepend")

def preprocess_text(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

def extract_keywords(text):
    doc = nlp(text)
    return [chunk.text for chunk in doc.noun_chunks]

def generate_question(text, keyword):
    sentence = text.replace(keyword, "[MASK]")
    input_text = f"generate question: {sentence}"
    questions = question_generator(input_text)
    return questions[0]['generated_text'] if questions else None

def generate_distractors(answer, n=2):
    # Placeholder for distractor generation using semantic similarity or rule-based methods
    return [answer + " distractor " + str(i+1) for i in range(n)]

# Example text
text = """"Architecturally, the school has a Catholic character. 
Atop the Main Building's gold dome is a golden statue of the Virgin Mary.
Immediately in front of the Main Building and facing it,
is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". 
Next to the Main Building is the Basilica of the Sacred Heart.
Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.
It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.
At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome),is a simple, modern stone statue of Mary"""
# Preprocess text
tokens = preprocess_text(text)
print("Tokens:", tokens)

# Extract keywords
keywords = extract_keywords(text)
print("Keywords:", keywords)

# Generate question
if keywords:
    question = generate_question(text, keywords[0])
    print("Generated Question:", question)

    # Generate distractors
    correct_answer = keywords[0]
    distractors = generate_distractors(correct_answer)
    print("Distractors:", distractors)

# Example output
# Tokens: ['photosynthesis', 'occur', 'chloroplast', 'plant', 'cell']
# Keywords: ['Photosynthesis', 'the chloroplasts', 'plant cells']
# Generated Question: What occurs in the chloroplasts of plant cells?
# Distractors: ['Photosynthesis distractor 1', 'Photosynthesis distractor 2', 'Photosynthesis distractor 3']


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokens: ['architecturally', 'school', 'catholic', 'character', '\n', 'atop', 'Main', 'Building', 'gold', 'dome', 'golden', 'statue', 'Virgin', 'Mary', '\n', 'immediately', 'Main', 'Building', 'face', '\n', 'copper', 'statue', 'Christ', 'arm', 'upraise', 'legend', 'Venite', 'Ad', 'Omnes', '\n', 'Main', 'Building', 'Basilica', 'Sacred', 'Heart', '\n', 'immediately', 'basilica', 'Grotto', 'marian', 'place', 'prayer', 'reflection', '\n', 'replica', 'grotto', 'Lourdes', 'France', 'Virgin', 'Mary', 'reputedly', 'appear', 'Saint', 'Bernadette', 'soubirous', '1858', '\n', 'end', 'main', 'drive', 'direct', 'line', 'connect', '3', 'statue', 'Gold', 'Dome),is', 'simple', 'modern', 'stone', 'statue', 'Mary']
Keywords: ['the school', 'a Catholic character', "the Main Building's gold dome", 'a golden statue', 'the Virgin Mary', 'front', 'the Main Building', 'it', 'a copper statue', 'Christ', 'arms', 'the legend', '"Venite Ad Me Omnes', 'the Main Building', 'the Basilica', 'the Sacred Heart', 'the ba

In [9]:
from datasets import load_dataset

# Load the SQuAD dataset
squad = load_dataset('squad')

# Access the training data
train_data = squad['train']
print(train_data[0])


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}


In [10]:
train_data.to_csv("generator.csv")

Creating CSV from Arrow format:   0%|          | 0/88 [00:00<?, ?ba/s]

83272389

In [3]:
import spacy
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load T5 model for question generation
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

def preprocess_text(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

def extract_keywords(text):
    doc = nlp(text)
    return [chunk.text for chunk in doc.noun_chunks]

def generate_question(text, keyword):
    # Create input text for T5
    input_text = f"generate question: {text}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate question
    outputs = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

def generate_distractors(answer, n=3):
    # Placeholder for distractor generation using semantic similarity or rule-based methods
    return [answer + " distractor " + str(i+1) for i in range(n)]

# Example text
text = input ("enter your paragraph")

# Preprocess text
tokens = preprocess_text(text)
print("Tokens:", tokens)

# Extract keywords
keywords = extract_keywords(text)
print("Keywords:", keywords)

# Generate question
if keywords:
    question = generate_question(text, keywords[0])
    print("Generated Question:", question)

    # Generate distractors
    correct_answer = keywords[0]
    distractors = generate_distractors(correct_answer)
    print("Distractors:", distractors)


ModuleNotFoundError: No module named 'spacy'

In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import random

# Load pre-trained model and tokenizer from Hugging Face
model_name = "gpt2-medium"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

def generate_text(prompt, max_new_tokens=100, num_return_sequences=1, temperature=0.7):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(
        inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=num_return_sequences,
        temperature=temperature,
        top_k=50,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

def generate_question(context, num_choices=4):
    """
    Generate a multiple choice question based on the given context.
    
    Args:
    - context (str): The context or passage from which to generate the question.
    - num_choices (int): The number of answer choices to generate.
    
    Returns:
    - question (str): The generated question.
    - choices (list): A list of answer choices including the correct answer.
    - correct_answer (str): The correct answer.
    """
    prompt = f"Generate a multiple choice question based on the following context:\n\n{context}\n\nQ: "
    generated = generate_text(prompt, max_new_tokens=150)[0]
    
    # Parse the response
    lines = generated.split('\n')
    question = lines[0].strip()
    correct_answer = lines[1].strip()
    distractors = generate_distractors(context, correct_answer, num_choices - 1)
    
    choices = [correct_answer] + distractors
    random.shuffle(choices)
    
    return question, choices, correct_answer

def generate_distractors(context, correct_answer, num_distractors):
    """
    Generate distractor answers based on the context and correct answer.
    
    Args:
    - context (str): The context or passage from which to generate the distractors.
    - correct_answer (str): The correct answer to base the distractors on.
    - num_distractors (int): The number of distractors to generate.
    
    Returns:
    - distractors (list): A list of distractor answers.
    """
    prompt = f"Generate {num_distractors} incorrect but plausible multiple choice answers for the following question based on the context:\n\nContext: {context}\n\nCorrect Answer: {correct_answer}\n\nIncorrect Answers: "
    generated = generate_text(prompt, max_new_tokens=100)[0]
    
    distractors = [line.strip() for line in generated.split('\n') if line.strip()]
    return distractors[:num_distractors]

# Example usage
context = input("enter your paragraph")
question, choices, correct_answer = generate_question(context)

print("Question:", question)
print("Choices:", choices)
print("Correct Answer:", correct_answer)


Question: Generate a multiple choice question based on the following context:
Choices: ['Correct Answer:', 'Context: As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI). In the early days of AI as an academic discipline, some researchers were interested in having machines learn from data. They attempted to approach the problem with various symbolic methods, as well as what were then termed "neural networks"; these were mostly perceptrons and other models that were later found to be reinventions of the generalized linear models of statistics.[23] Probabilistic reasoning was also employed, especially in automated medical diagnosis.[24]:\u200a488\u200a  However, an increasing emphasis on the logical, knowledge-based approach caused a rift between AI and machine learning. Probabilistic systems were plagued by theoretical and practical problems of data acquisition and representation.[24]:\u200a488\u200a By 1980, expert systems had come to domina

## option 2

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained model and tokenizer
model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Function to generate a paragraph
def generate_paragraph(prompt, max_new_tokens=100):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(inputs, max_new_tokens=max_new_tokens, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

# Function to create MCQs from the paragraph
def create_mcqs(paragraph, num_questions=2):
    sentences = paragraph.split('. ')
    questions = []

    for sentence in sentences[:num_questions]:
        # Simple way to generate a question (more sophisticated methods can be used)
        question = sentence.replace(" is", " is?")
        # Dummy options
        options = [
            sentence,
            "Not " + sentence,
            "Opposite of " + sentence,
            sentence[::1]
        ]
        question_data = {
            "question": question,
            "options": options,
            "answer": sentence
        }
        questions.append(question_data)
    return questions

# Example usage
prompt = input("enter your paragraph")
paragraph = generate_paragraph(prompt)
mcqs = create_mcqs(paragraph)

# Display MCQs
for idx, mcq in enumerate(mcqs, 1):
    print(f"Q{idx}: {mcq['question']}")
    for opt_idx, option in enumerate(mcq['options'], 1):
        print(f"   {opt_idx}. {option}")
    print(f"Answer: {mcq['answer']}\n")


Q1: As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI)
   1. As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI)
   2. Not As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI)
   3. Opposite of As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI)
   4. )IA( ecnegilletni laicifitra rof tseuq eht fo tuo werg gninrael enihcam ,rovaedne cifitneics a sA
Answer: As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI)

Q2: In the early days of AI as an academic discipline, some researchers were interested in having machines learn from data
   1. In the early days of AI as an academic discipline, some researchers were interested in having machines learn from data
   2. Not In the early days of AI as an academic discipline, some researchers were interested in having machine

In [5]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import random  # Import the random module

# Load pre-trained model and tokenizer
model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Function to generate a paragraph
def generate_paragraph(prompt, max_new_tokens=100):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(inputs, max_new_tokens=max_new_tokens, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

# Function to create MCQs from the paragraph
def create_mcqs(paragraph, num_questions=2):
    sentences = paragraph.split('. ')
    questions = []

    for sentence in sentences[:num_questions]:
        question = sentence.replace(" is", " is?")

        # Generate distractor options
        correct_answer = sentence
        options = [correct_answer]

        # Generate plausible distractors
        for _ in range(3):
            inputs = tokenizer.encode(f"Generate a statement related to: {sentence}", return_tensors="pt")
            outputs = model.generate(inputs, max_new_tokens=20, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
            distractor = tokenizer.decode(outputs[0], skip_special_tokens=True).split(".")[0]
            if distractor and distractor != sentence:
                options.append(distractor)
        
        # Ensure we have exactly 4 options
        while len(options) < 4:
            options.append(f"Incorrect statement {_+1}")

        # Shuffle options to ensure the correct answer isn't always the first
        random.shuffle(options)

        question_data = {
            "question": question,
            "options": options,
            "answer": correct_answer
        }
        questions.append(question_data)
    return questions

# Example usage
prompt = input("Enter your paragraph: ")
paragraph = generate_paragraph(prompt)
mcqs = create_mcqs(paragraph)

# Display MCQs
for idx, mcq in enumerate(mcqs, 1):
    print(f"Q{idx}: {mcq['question']}")
    for opt_idx, option in enumerate(mcq['options'], 1):
        print(f"   {opt_idx}. {option}")
    print(f"Answer: {mcq['answer']}\n")


Q1: Machine learning (ML) is? a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.[1] Recently, artificial neural networks have been able to surpass many previous approaches in performance.[2][3]  ML finds application in many fields, including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine.[4][5] When applied to business problems, it is? known under the name predictive analytics
   1. Generate a statement related to: Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions
   2. Machine learning (ML) is a field of study in artificial intelligence concerned with the developmen