In [1]:
import fitz  # PyMuPDF
import spacy
from dateutil.parser import parse
import os
import re

# Entity Types Dictionary (modify and expand as needed)
entity_types = {
    "PERSON": "People",
    "GPE": "Geographic Locations",
    "ORG": "Organizations",
    "LOC": "Locations",
    "EVENT": "Events",
    "WORK_OF_ART": "Works of Art",
    "QUANTITY": "Quantities",
    "ORDINAL": "Ordinal Numbers",
    "CARDINAL": "Cardinal Numbers",
    "PERCENT": "Percentages",
    "MONEY": "Monetary Values",
    "FAC": "Facilities",
    "LAW": "Legal References",
    "PRODUCT": "Products",
    "NORP": "Nationalities or Religious or Political Groups",
    "FACILITY": "Facilities",
    "LANGUAGE": "Languages",
    "TIME": "Times",
    "TITLE": "Titles (e.g., of books, songs, movies)",
}

# Extract Text from PDF with Error Handling
def extract_text_from_pdf(pdf_path):
    try:
        document = fitz.open(pdf_path)
        text = ""
        for page_num in range(len(document)):
            page = document.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

# Clean extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = re.sub(r'[^\w\s,.?!]', ' ', text)  # Remove non-printable characters
    return text.strip()

# Perform Named Entity Recognition (NER) with Error Handling
def perform_ner(text):
    try:
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text)
        entities = [(ent.text.strip(), ent.label_) for ent in doc.ents]
        return entities
    except Exception as e:
        print(f"Error performing NER: {e}")
        return []

# Check if the date entity is a valid date for generating questions
def is_valid_date_entity(entity_text):
    # Skip single years and specific dates that don't make sense for question generation
    if entity_text.isdigit() or entity_text.count(' ') < 2:
        return False
    try:
        parse(entity_text, fuzzy=True)  # Use dateutil to validate date format
        return True
    except:
        return False

# Generate Logical Questions for Kids (with user input)
def generate_questions(entities):
    questions = set()
    # Track question count per entity type
    count = {key: 0 for key in entity_types.keys()}
    found_entity_types = set(entity[1] for entity in entities)  # Track found entity types

    # Get user input for desired question types
    print("Enter question types separated by commas (e.g., PERSON, LOC, EVENT):")
    desired_types = input().upper().split(",")

    for entity in entities:
        entity_text = entity[0]
        entity_label = entity[1]

        if entity_label == "DATE" and not is_valid_date_entity(entity_text):
            continue  # Skip invalid dates

        if (entity_label in desired_types) and (count[entity_label] < 10):
            # Add question templates for each entity type
            if entity_label == "PERSON":
                questions.add(f"Who is {entity_text}?")
                questions.add(f"Can you tell me more about {entity_text}'s role?")
                count["PERSON"] += 2
            elif entity_label == "GPE":
                questions.add(f"What is {entity_text}?")
                questions.add(f"Where is {entity_text} located?")
                count["GPE"] += 2
            elif entity_label == "ORG":
                questions.add(f"What is {entity_text}?")
                questions.add(f"What does {entity_text} do?")
                count["ORG"] += 2
            elif entity_label == "LOC":
                questions.add(f"Where is {entity_text}?")
                questions.add(f"What are some interesting things to see in {entity_text}?")
                count["LOC"] += 2
            elif entity_label == "EVENT":
                questions.add(f"What is {entity_text}?")
                questions.add(f"Why is {entity_text} important?")
                count["EVENT"] += 2
            elif entity_label == "WORK_OF_ART":
                questions.add(f"Who created {entity_text}?")
                questions.add(f"What is the significance of {entity_text}?")
                count["WORK_OF_ART"] += 2
            elif entity_label == "PRODUCT":
                questions.add(f"What is {entity_text}?")
                questions.add(f"What is the use of {entity_text}?")
                count["PRODUCT"] += 2
            elif entity_label == "LANGUAGE":
                questions.add(f"What language is {entity_text}?")
                questions.add(f"Where is {entity_text} spoken?")
                count["LANGUAGE"] += 2
            elif entity_label == "LAW":
                questions.add(f"What is {entity_text}?")
                questions.add(f"What does {entity_text} regulate?")
                count["LAW"] += 2
            elif entity_label == "FAC":
                questions.add(f"What is {entity_text}?")
                questions.add(f"Where is {entity_text} located?")
                count["FAC"] += 2

    # Contingency for missing entity types
    for entity_type in desired_types:
        if entity_type not in found_entity_types:
            print(f"No {entity_types.get(entity_type, entity_type)} found in the document. Moving on...")

    return list(questions)

# Main Function
if __name__ == "__main__":
    # Allow user to input the PDF path once
    pdf_path = input("Enter the path to the PDF file: ")

    # Check if the PDF file exists
    if not os.path.exists(pdf_path):
        print(f"PDF file {pdf_path} does not exist.")
    else:
        # Extract and clean text from the PDF
        text = extract_text_from_pdf(pdf_path)
        cleaned_text = clean_text(text)
        
        while True:
            entities = perform_ner(cleaned_text)
            if entities:
                questions = generate_questions(entities)
                if questions:
                    for question in questions:
                        print(question)
                else:
                    print("No questions generated. Try different question types.")
            else:
                print("No entities found in the document.")

            # Option for the user to continue or exit
            user_input = input("\nDo you want to generate more questions (type 'yes' to continue or 'exit' to quit): ").strip().lower()
            if user_input == 'exit':
                print("Exiting...")
                break


Enter the path to the PDF file:  TraditionalFoodsofindia.pdf


Enter question types separated by commas (e.g., PERSON, LOC, EVENT):


 EVENT


No Events found in the document. Moving on...
No questions generated. Try different question types.



Do you want to generate more questions (type 'yes' to continue or 'exit' to quit):  PERSON


Enter question types separated by commas (e.g., PERSON, LOC, EVENT):


 PERSON


Can you tell me more about Jain's role?
Can you tell me more about T. Krishnakumar's role?
Can you tell me more about https's role?
Who is T. Krishnakumar?
Who is https?
Who is Jain?



Do you want to generate more questions (type 'yes' to continue or 'exit' to quit):  yes


Enter question types separated by commas (e.g., PERSON, LOC, EVENT):


 ORG


What is The Future of Traditional Foods?
What is Traditional Foods?
What is Central Tuber Crops Research Institute?
What does Central Tuber Crops Research Institute do?
What does The Future of Traditional Foods do?
What is Kerala 104 PUBLICATIONS?
What does Different Regions of India do?
What is Different Regions of India?
What does Traditional Foods do?
What does Kerala 104 PUBLICATIONS do?


KeyboardInterrupt: Interrupted by user