### Import Libraries

In [74]:
import os
import warnings
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
load_dotenv()
from pprint import pprint
import re

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain




from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS

In [28]:
## Access the api keys
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

### Work with the data

In [3]:
# check the current dire
%pwd

'd:\\RAGProjects\\RAG-Questions-Creator\\NoteBooks'

In [10]:
# function to extract the data from the data directory
def data_extraction(data_dir):
     loader = DirectoryLoader(path=data_dir,
                              glob="*.pdf",
                              loader_cls=PyPDFLoader)
     documents = loader.load()
     return documents

extracted_data = data_extraction("../DataSets/")

In [12]:
# extracted_data

In [32]:
# function to chunk the extracted data
def text_chunking(extracted_data):
     text_chunker = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=200)
     chunks = text_chunker.split_documents(extracted_data)
     # return [chunk.page_content for chunk in chunks]
     return chunks

text_chunks = text_chunking(extracted_data=extracted_data)

In [33]:
print(f"Num of chunks: {len(text_chunks)}")

Num of chunks: 41


In [34]:
# check the type of the chunked data
type(text_chunks[0])

langchain_core.documents.base.Document

### Pipeline for Questions Generation

In [42]:
llm_question_generation_pipe = ChatGoogleGenerativeAI(model="gemini-2.0-flash",
                                                     api_key=GEMINI_API_KEY, 
                                                     temperature=0.3)


In [None]:
prompt_template = """You are a knowledgeable and professional AI assistant that specializes in generating high-quality exam questions.

                         Your role is to:

                         1. Generate clear, accurate, and well-structured exam questions based on the provided context and requirements.
                         2. Ensure that questions are relevant to the given subject, topic, and difficulty level.
                         3. Vary the question types if requested (e.g., multiple choice, true/false, short answer, essay).
                         4. Provide correct answers or model answers if specified.
                         5. Follow academic standards and avoid overly simplistic or overly complex wording unless specified.
                         6. If context is not enough to generate meaningful questions, acknowledge the limitation and ask for more detail.

                         Remember:
                         - Do not make up unrelated information.
                         - Stick closely to the topic and subject area.
                         - Maintain clarity and educational value in all questions.
                         - Avoid repetition unless explicitly instructed.
                         - Ensure consistency with the question format and style.

                         Context:
                         {context}

                         Instructions:
                         - Subject: {subject}
                         - Topic: {topic}
                         - Number of Questions: {num_questions}
                         - Difficulty Level: {difficulty_level}
                         - Question Type(s): {question_types}
                         - Include Answers: {include_answers}

                         Now, generate the questions based on the above.
                         """

# Create the Langchain PromptTemplate
exam_prompt = PromptTemplate(
    input_variables=[
        "context",
        "subject",
        "topic",
        "num_questions",
        "difficulty_level",
        "question_types",
        "include_answers"
    ],
    template=prompt_template
)

# Batch size for processing
batch_size = 5
num_chunks = len(text_chunks) 

dic_Exam_ques = {}

for i in range(0, num_chunks, batch_size):
    batch = text_chunks[i:i+batch_size]
    
   
    batch_context = "\n\n".join([chunk.page_content for chunk in batch])
    

    filled_prompt = exam_prompt.format(
        context=batch_context,
        subject="Natural Language Processing",
        topic="Text Classification",
        num_questions="5",  
        difficulty_level="Medium",
        question_types="short answer",
        include_answers="Yes"
    )
    print(f"\n📄=== Questions from Batch {i//batch_size + 1} ===\n")
    response = llm_question_generation_pipe.invoke(filled_prompt)
    pprint(response)
    response = response.content
    response = re.sub(r'\s+', ' ', response).strip()
    question = response[response.index("Question"): response.index("Answer")]
    answer = response[response.index("Answer"):]
    dic_Exam_ques[question] = answer



📄=== Questions from Batch 1 ===

AIMessage(content='Okay, here are 5 short answer questions about Text Classification, based on the provided text, with a medium difficulty level, including answers:\n\n1.  **Question:** Define text classification and provide a real-world example of its application, different from the email spam filtering example given in the text.\n    **Answer:** Text classification is the task of assigning one or more categories to a given piece of text from a larger set of possible categories. An example is classifying customer reviews of a product as positive, negative, or neutral to understand customer sentiment.\n\n2.  **Question:** Explain the difference between binary, multiclass, and multilabel classification in the context of text classification. Provide an example for each.\n    **Answer:**\n    *   **Binary classification:** Categorizing text into one of two classes (e.g., spam or not spam).\n    *   **Multiclass classification:** Categorizing text into one

In [77]:
dic_Exam_ques

{'Question:** Define text classification and provide a real-world example of its application, different from the email spam filtering example given in the text. **': 'Answer:** Text classification is the task of assigning one or more categories to a given piece of text from a larger set of possible categories. An example is classifying customer reviews of a product as positive, negative, or neutral to understand customer sentiment. 2. **Question:** Explain the difference between binary, multiclass, and multilabel classification in the context of text classification. Provide an example for each. **Answer:** * **Binary classification:** Categorizing text into one of two classes (e.g., spam or not spam). * **Multiclass classification:** Categorizing text into one of several classes, where each text belongs to only one class (e.g., classifying customer reviews as negative, neutral, or positive). * **Multilabel classification:** Categorizing text into one or more classes simultaneously (e.g