In [9]:
!pip install PyMuPDF nltk gradio Pillow numpy
import os
import re
import json
import fitz  # PyMuPDF
import nltk
import gradio as gr
from PIL import Image
import io
import base64
import numpy as np
from collections import defaultdict

# Make sure the NLTK 'punkt' tokenizer data is available: look it up in the
# local NLTK data path and download it only when the lookup fails.
# NOTE(review): nothing in the visible code uses nltk after this point —
# confirm whether this download is still needed.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

class JEEQuestionExtractor:
    """Extract questions, answer options and images from a question-paper PDF.

    The pipeline is: read text and embedded images from the PDF, collapse
    whitespace, split the text on ``Q.<number>`` markers, parse ``(A)``-``(D)``
    options and an optional ``Answer: (X)`` marker per question, and finally
    attach images to questions with a simple positional heuristic.
    """

    # Splits the paper on "Q.<digits>"; the capturing group keeps the number.
    _QUESTION_SPLIT_RE = re.compile(r'Q\.(\d+)')
    # Locates an option label such as "(A)".
    _OPTION_LABEL_RE = re.compile(r'\([A-D]\)')
    # One option (label plus its text); used to strip options from the stem.
    _OPTION_SPAN_RE = re.compile(r'\([A-D]\).+?(?=\([A-D]\)|$)', re.DOTALL)

    def __init__(self):
        self.question_patterns = {
            'MCQ': re.compile(r'Q\.(\d+)\.?\s+(.+?)(?=\([A-D]\)|$)', re.DOTALL),
            'option': re.compile(r'\(([A-D])\)\s*(.+?)(?=\([A-D]\)|$)', re.DOTALL),
            'answer': re.compile(r'Answer.*?:\s*\(([A-D])\)', re.IGNORECASE)
        }

    def extract_text_from_pdf(self, pdf_path):
        """Read all page text and embedded images from the PDF at *pdf_path*.

        Returns:
            A ``(text, images)`` tuple. ``text`` is the concatenated text of
            every page in order. ``images`` is a list of dicts with keys
            ``"id"`` (``page_<p>_img_<i>``, both 1-based), ``"bytes"`` (raw
            image data) and ``"position"`` (1-based page number).
        """
        page_texts = []
        images = []

        # The context manager closes the document even if extraction fails;
        # the previous implementation left the file handle open.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                page_texts.append(page.get_text())

                image_list = page.get_images(full=True)
                for img_index, img in enumerate(image_list, start=1):
                    # The first item of each image entry is its xref.
                    base_image = doc.extract_image(img[0])
                    images.append({
                        "id": f"page_{page_num}_img_{img_index}",
                        "bytes": base_image["image"],
                        "position": page_num,  # Page where the image appears
                    })

        return "".join(page_texts), images

    def preprocess_text(self, text):
        """Collapse every run of whitespace (newlines included) to one space."""
        # A single pass is enough: "\s+" already covers runs of newlines.
        return re.sub(r'\s+', ' ', text)

    def extract_questions(self, text):
        """Parse preprocessed paper text into a list of question dicts.

        Each dict has the keys ``question_id`` (int), ``question_type``
        (``"MCQ"`` or ``"Numerical/Subjective"``), ``question_text``,
        ``answer_choices`` (list of ``{"label", "text"}``), ``correct_answer``
        (option letter or ``None``) and ``image`` (always ``None`` here).

        A trailing ``Answer: (X)`` marker that follows the options is cut off
        before the options are parsed, so it no longer ends up inside the last
        option's text or as a stray label in the question stem.
        """
        # re.split with a capturing group yields
        # [preamble, id_1, body_1, id_2, body_2, ...]; drop the preamble.
        parts = self._QUESTION_SPLIT_RE.split(text)[1:]
        option_pattern = self.question_patterns['option']
        answer_pattern = self.question_patterns['answer']

        questions = []
        for raw_id, raw_body in zip(parts[0::2], parts[1::2]):
            question_text = raw_body.strip()

            answer_match = answer_pattern.search(question_text)
            first_label = self._OPTION_LABEL_RE.search(question_text)

            # Only treat the marker as a trailer when it comes after the first
            # option label; otherwise keep the text untouched so a match that
            # precedes (or swallows) the options cannot delete them.
            option_source = question_text
            if (answer_match and first_label
                    and first_label.start() < answer_match.start()):
                option_source = question_text[:answer_match.start()]

            options = option_pattern.findall(option_source)

            if options:
                question_type = "MCQ"
                stem = self._OPTION_SPAN_RE.sub('', option_source).strip()
                answer_choices = [
                    {"label": label, "text": option_text.strip()}
                    for label, option_text in options
                ]
                correct_answer = answer_match.group(1) if answer_match else None
            else:
                # No options found: keep the whole block as the question text.
                question_type = "Numerical/Subjective"
                stem = question_text
                answer_choices = []
                correct_answer = None

            questions.append({
                "question_id": int(raw_id),
                "question_type": question_type,
                "question_text": stem,
                "answer_choices": answer_choices,
                "correct_answer": correct_answer,
                "image": None,  # Filled in by match_images_to_questions
            })

        return questions

    def match_images_to_questions(self, questions, images, text):
        """Attach at most one image to each question, by position only.

        *questions* is sorted in place by ``question_id`` and returned. Images
        are split into equal-sized consecutive groups (at least one image per
        group) and the first image of group *i* is attached to question *i* as
        ``{"id": ..., "data": <base64 string>}``. Questions beyond the last
        available image keep ``"image": None``. *text* is currently unused.

        This is a simplified heuristic; it does not analyse where an image
        sits relative to the question text.
        """
        questions.sort(key=lambda q: q["question_id"])

        if not images or not questions:
            return questions

        images_per_question = max(1, len(images) // len(questions))

        for index, question in enumerate(questions):
            start_idx = index * images_per_question
            if start_idx >= len(images):
                # start_idx only grows, so no later question can get an image.
                break

            img_data = images[start_idx]
            question["image"] = {
                "id": img_data["id"],
                # Base64 text so the image survives JSON serialization.
                "data": base64.b64encode(img_data["bytes"]).decode('utf-8'),
            }

        return questions

    def extract_and_format(self, pdf_path, paper_name="JEE Advanced 2024"):
        """Run the full pipeline on *pdf_path* and return the result dict.

        Args:
            pdf_path: Path of the PDF to process.
            paper_name: Value stored under the ``"paper"`` key; defaults to
                the title that was previously hard-coded.

        Returns:
            ``{"paper": paper_name, "questions": [...]}``.
        """
        text, images = self.extract_text_from_pdf(pdf_path)
        processed_text = self.preprocess_text(text)
        questions = self.extract_questions(processed_text)
        questions_with_images = self.match_images_to_questions(
            questions, images, processed_text
        )

        return {
            "paper": paper_name,
            "questions": questions_with_images,
        }

# Function to save JSON output
def save_json_output(json_data, output_path):
    """Write *json_data* to *output_path* as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped. Returns
    *output_path* unchanged so the call can be chained.
    """
    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(json.dumps(json_data, ensure_ascii=False, indent=2))
    return output_path

# Function for Gradio interface
def process_pdf(pdf_file):
    """Gradio callback: extract questions from the uploaded PDF.

    Args:
        pdf_file: The value of the upload component. May be ``None`` when
            nothing was uploaded, an object exposing a ``.name`` path, or a
            plain path string.

    Returns:
        A 3-tuple ``(full_json, sample_json, output_file)``: the complete
        result as a JSON string, a JSON summary of the first question (or the
        text ``"No questions extracted"``), and the path of the JSON file
        written next to the uploaded PDF. When no file was provided the tuple
        is ``("", "No PDF file provided", None)`` and nothing is written.
    """
    # Clicking the button without uploading used to crash on ``None.name``.
    if pdf_file is None:
        return "", "No PDF file provided", None

    # Accept either an object with a ``.name`` path or a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    extractor = JEEQuestionExtractor()
    result = extractor.extract_and_format(pdf_path)

    # Save the JSON output alongside the uploaded file
    output_file = os.path.join(os.path.dirname(pdf_path), "extracted_questions.json")
    save_json_output(result, output_file)

    # Keep non-ASCII characters readable, matching what the saved file holds.
    formatted_output = json.dumps(result, ensure_ascii=False, indent=2)

    # Summarize the first question for the "Sample Question" tab
    if result["questions"]:
        sample_question = result["questions"][0]
        sample_output = {
            "question_id": sample_question["question_id"],
            "question_type": sample_question["question_type"],
            "question_text": sample_question["question_text"],
            "answer_choices": [choice["text"] for choice in sample_question["answer_choices"]],
            "correct_answer": sample_question["correct_answer"],
            "has_image": sample_question["image"] is not None
        }
        sample_formatted = json.dumps(sample_output, ensure_ascii=False, indent=2)
    else:
        sample_formatted = "No questions extracted"

    return formatted_output, sample_formatted, output_file

# Create Gradio interface
def create_gradio_ui():
    """Build the Gradio Blocks app and return it without launching.

    Layout: two Markdown headers, a PDF upload row, an "Extract Questions"
    button row, and three tabs (full JSON, sample question, download). The
    button click is wired to ``process_pdf``, whose three return values feed
    the three tab components in order.
    """
    with gr.Blocks(title="JEE Advanced Question Extractor") as demo:
        gr.Markdown("# JEE Advanced Question Extractor")
        gr.Markdown("Upload a JEE Advanced question paper PDF to extract questions in JSON format")

        # Input row: only .pdf files are accepted by the upload widget.
        with gr.Row():
            uploaded_pdf = gr.File(label="Upload PDF Question Paper", file_types=[".pdf"])

        with gr.Row():
            extract_button = gr.Button("Extract Questions")

        # One tab per value returned by process_pdf.
        with gr.Tabs():
            with gr.TabItem("Full JSON Output"):
                full_json_view = gr.Code(language="json", label="Extracted JSON")

            with gr.TabItem("Sample Question"):
                sample_view = gr.Code(language="json", label="Sample Question")

            with gr.TabItem("Download"):
                download_view = gr.File(label="Download JSON File")

        result_components = [full_json_view, sample_view, download_view]
        extract_button.click(
            fn=process_pdf,
            inputs=[uploaded_pdf],
            outputs=result_components,
        )

    return demo

# For running in Colab: build the UI and start the Gradio server when this
# code is executed as the main module.
if __name__ == "__main__":
    app = create_gradio_ui()
    # share=True asks Gradio for a public share link in addition to the
    # local server.
    app.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4dba11a8a00abfc3c4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
# Mount Google Drive at /content/drive using the google.colab helper.
from google.colab import drive
drive.mount('/content/drive')