In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [19]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import PyPDF2

# Load the model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

# Function to extract and clean text from a PDF file
def extract_clean_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    # Clean and format the text
    text = text.replace("\n", " ").strip()
    paragraphs = text.split('. ')
    formatted_text = "\n\n".join(paragraphs)
    return formatted_text

# Function to generate a single MCQ using T5 model
def generate_mcq(text):
    prompt = (
        f"Generate a difficult multiple-choice question with distractors from the following text:\n\n{text}\n"
        "Format the output as follows:\n"
        "Q: <question>\n"
        "1) <distractor 1>\n"
        "2) <distractor 2>\n"
        "3) <distractor 3>\n"
        "4) <distractor 4>\n"
        "Correct Answer: <correct option>\n\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(
        inputs["input_ids"],
        max_length=512,
        num_return_sequences=1,
        do_sample=True,
        top_p=0.9
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Function to parse the generated MCQs
def parse_mcqs(mcq_text):
    mcqs = []
    questions = mcq_text.split('\n\n')

    for question in questions:
        print(f"Debug: Raw question text: {question}")  # Debugging output
        lines = question.split('\n')
        if len(lines) >= 6:
            question_stem = lines[0].replace('Q:', '').strip()
            distractors = [line.split(')')[1].strip() for line in lines[1:5] if ")" in line]
            correct_answer = lines[-1].split(':')[1].strip() if ":" in lines[-1] else None
            if len(distractors) == 4 and correct_answer:
                # Identify the index of the correct answer
                try:
                    correct_option_index = distractors.index(correct_answer) + 1
                except ValueError:
                    correct_option_index = None

                mcqs.append((question_stem, distractors, correct_option_index))
            else:
                print(f"Skipping question due to missing distractors or correct answer: {question}")
        else:
            print(f"Skipping malformed question: {question}")

    return mcqs

# Function to create a DataFrame with the required structure
def create_mcq_dataset(mcqs):
    mcq_data = []

    for mcq in mcqs:
        question_stem, distractors, correct_option_index = mcq
        mcq_dict = {
            "question_text": question_stem,
            "option_1": distractors[0] if len(distractors) > 0 else "",
            "option_2": distractors[1] if len(distractors) > 1 else "",
            "option_3": distractors[2] if len(distractors) > 2 else "",
            "option_4": distractors[3] if len(distractors) > 3 else "",
            "correct_option": correct_option_index if correct_option_index else ""
        }
        mcq_data.append(mcq_dict)

    mcq_df = pd.DataFrame(mcq_data)
    return mcq_df

# Path to your PDF file
pdf_path = '/content/drive/MyDrive/Colab Notebooks/modelselector/python2.pdf'  # Replace with your actual PDF file path

# Extract and clean text from the PDF
pdf_text = extract_clean_text_from_pdf(pdf_path)

# Ask user how many questions they want to generate
num_questions = int(input("Enter the number of difficult MCQs to generate: "))

# Initialize list to store all MCQs
all_mcqs = []

# Generate MCQs one by one and append to the list
for _ in range(num_questions):
    mcq_text = generate_mcq(pdf_text)
    mcqs = parse_mcqs(mcq_text)
    all_mcqs.extend(mcqs)

# Create a DataFrame with the generated MCQs
mcq_df = create_mcq_dataset(all_mcqs)

# Display the DataFrame
print(mcq_df)

# Optionally, save the DataFrame to a CSV file
mcq_df.to_csv("mcq_dataset.csv", index=False)


Enter the number of difficult MCQs to generate: 20
Debug: Raw question text: If you want to learn Python, which day is a good time for you to take it?
Skipping malformed question: If you want to learn Python, which day is a good time for you to take it?
Debug: Raw question text: A computer can be connected to _.Options:A a computerB a computerC a computerD an internetAnswer:A
Skipping malformed question: A computer can be connected to _.Options:A a computerB a computerC a computerD an internetAnswer:A
Debug: Raw question text: What is the purpose of the first day of the text?
Skipping malformed question: What is the purpose of the first day of the text?
Debug: Raw question text: Question: What is the date when Python is becoming popular?Options:A 21-02-1991.B 10-04-2020.C 08-08-2020.D 05-04-2020.Answer:A
Skipping malformed question: Question: What is the date when Python is becoming popular?Options:A 21-02-1991.B 10-04-2020.C 08-08-2020.D 05-04-2020.Answer:A
Debug: Raw question text: Q