### Import Libraries

In [1]:
import os
import warnings
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
load_dotenv()
from pprint import pprint
import re

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain




from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS

In [2]:
# Read the Gemini API key from the process environment (populated from .env by
# load_dotenv() in the import cell); the value is None when the variable is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

### Work with the data

In [3]:
# Show the current working directory; the relative paths used in later cells
# ("../DataSets/", "faiss-index") are resolved against it.
%pwd

'd:\\RAGProjects\\RAG-Questions-Creator\\NoteBooks'

In [13]:
# Load every PDF found in the data directory
def data_extraction(data_dir):
    """Load the PDF files in ``data_dir`` and return the loaded documents.

    Files are selected with the ``*.pdf`` glob and each one is read through
    ``PyPDFLoader``.

    Args:
        data_dir: Path of the directory that holds the PDF files.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    pdf_loader = DirectoryLoader(
        path=data_dir,
        loader_cls=PyPDFLoader,
        glob="*.pdf",
    )
    return pdf_loader.load()

extracted_data = data_extraction(data_dir="../DataSets/")

In [14]:
# extracted_data

In [15]:
# function to chunk the extracted data
def text_chunking(extracted_data, chunk_size=20000, chunk_overlap=200):
    """Split the loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split (the output of ``data_extraction``).
        chunk_size: Maximum size of a chunk, forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to the value this
            notebook has always used (20000).
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter. Defaults to 200.

    Returns:
        The chunks produced by ``split_documents`` (Document objects, not
        plain strings).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

text_chunks = text_chunking(extracted_data=extracted_data)

In [16]:
# Report how many chunks the splitter produced
print("Num of chunks: {}".format(len(text_chunks)))

Num of chunks: 131


In [17]:
# Check the type of the first chunk: the chunks are objects, not plain strings,
# so their text has to be read from `page_content` before embedding.
type(text_chunks[0])

langchain_core.documents.base.Document

### Define Embedding model

In [18]:
# Instantiate the Google Generative AI embedding model used to vectorise the chunks
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GEMINI_API_KEY,
)

In [19]:
# function to build the vector store database
def vector_store(text_chunks: "list[Document]"):
    """Embed the chunks, build a FAISS index from them and save it to disk.

    Args:
        text_chunks: Document chunks (objects exposing ``page_content``), as
            returned by ``text_chunking``. Only the text of each chunk is
            indexed; nothing else from the chunk is passed to FAISS.

    Returns:
        The FAISS vector store that was built and saved in the ``faiss-index``
        folder (a path relative to the current working directory).

    Raises:
        ValueError: If ``text_chunks`` is empty, since there is nothing to index.
    """
    # Fail early with a clear message instead of an obscure error inside FAISS.
    if not text_chunks:
        raise ValueError("text_chunks is empty: there is nothing to index")

    # Keep the parameter untouched and hold the extracted texts under their own name.
    chunk_texts = [chunk.page_content for chunk in text_chunks]
    vector_db = FAISS.from_texts(texts=chunk_texts, embedding=embedding_model)
    folder_name = "faiss-index"
    vector_db.save_local(folder_path=folder_name)
    print(f"The vector databese created successfully in the '{os.getcwd()} path' with name '{folder_name}'")
    return vector_db
# Build and save the index for the chunks created earlier
vector_db = vector_store(text_chunks=text_chunks)


The vector databese created successfully in the 'd:\RAGProjects\RAG-Questions-Creator\NoteBooks path' with name 'faiss-index'


### Pipeline for Question Generation

In [30]:
# Define the function to take user query and return results
def user_query(question, num_questions, difficulty_level, question_types, include_answers):
    """Generate exam questions from the indexed documents.

    The 10 chunks most similar to ``question`` are retrieved from the saved
    FAISS index and passed, together with the generation settings, to a
    Gemini chat model through a question-answering chain.

    NOTE: a later cell in this notebook defines another ``user_query`` that
    takes a list of questions; once that cell has run, the name ``user_query``
    refers to that function instead of this one.

    Args:
        question: The user's request. It is used as the similarity-search
            query and is also inserted into the prompt so the model can see
            what was asked for.
        num_questions: Number of questions to generate (inserted into the prompt).
        difficulty_level: Requested difficulty (inserted into the prompt).
        question_types: Requested question type(s) (inserted into the prompt).
        include_answers: Whether answers should be included (inserted into the prompt).

    Returns:
        The ``output_text`` entry of the chain's response.
    """
    embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001",
                                                   google_api_key=GEMINI_API_KEY)

    # Load FAISS index from local storage.
    # SECURITY: allow_dangerous_deserialization=True opts in to pickle-based
    # loading; only load an index that this notebook created itself.
    new_db = FAISS.load_local(folder_path="faiss-index",
                              embeddings=embedding_model,
                              allow_dangerous_deserialization=True)

    # Perform similarity search to get relevant documents
    docs = new_db.similarity_search(query=question, k=10)

    # Prepare the input for the chain
    chain_input = {
        "input_documents": docs,
        "input": question,
        "num_questions": num_questions,
        "difficulty_level": difficulty_level,
        "question_types": question_types,
        "include_answers": include_answers
    }

    # The user's request is exposed as {input}; without that placeholder the
    # "input" entry of chain_input would never reach the model.
    exam_prompt = """You are a knowledgeable and professional AI assistant that specializes in generating high-quality exam questions.

                    Your role is to:

                    1. Generate clear, accurate, and well-structured exam questions based on the provided context and requirements.
                    2. Ensure that questions are relevant to the given subject, topic, and difficulty level.
                    3. Vary the question types if requested (e.g., multiple choice, true/false, short answer, essay).
                    4. Provide correct answers or model answers if specified.
                    5. Follow academic standards and avoid overly simplistic or overly complex wording unless specified.
                    6. If context is not enough to generate meaningful questions, acknowledge the limitation and ask for more detail.

                    Remember:
                    - Do not make up unrelated information.
                    - Stick closely to the topic and subject area.
                    - Maintain clarity and educational value in all questions.
                    - Avoid repetition unless explicitly instructed.
                    - Ensure consistency with the question format and style.
                    - Do not put words in " " or ' '

                    Context:
                    {context}

                    Instructions:
                    - User Request: {input}
                    - Number of Questions: {num_questions} and print them under each other
                    - Difficulty Level: {difficulty_level}
                    - Question Type(s): {question_types}
                    - Include Answers: {include_answers}

                    Now, generate the questions based on the above."""

    exam_prompt_template = PromptTemplate(
        input_variables=["context", "input", "num_questions", "difficulty_level", "question_types", "include_answers"],
        template=exam_prompt
    )

    model = ChatGoogleGenerativeAI(api_key=GEMINI_API_KEY, model="gemini-2.0-flash")

    chain = load_qa_chain(llm=model, prompt=exam_prompt_template)

    # invoke() replaces the deprecated direct call on the chain object.
    response = chain.invoke(chain_input, return_only_outputs=True)

    return response["output_text"]

# Generation settings; the keys match the keyword parameters of user_query
user_inputs = dict(
    num_questions="50",
    difficulty_level="Medium",
    question_types="essay",
    include_answers="Yes",
)

# Unpack the settings directly into the call instead of repeating each key
response = user_query(question="Give me Questions on Chapter 3 only", **user_inputs)

# Output the generated questions
print(response)

I can generate 50 essay questions based on the provided text, but I must point out that the text is quite fragmented and covers a range of topics from Python installation and exercises to specific code snippets. This makes it difficult to create a cohesive set of essay questions without making some assumptions and connections between the different parts. Also the answers will be based on the context provided.

Here are 50 essay questions based on the provided content:

1.  Explain why Python 3 is preferred over Python 2 for new projects, according to the text.
    *   **Answer:** Python 3 is the present and future of the language and eliminates quirks that can trip up beginning programmers.

2.  Describe the process of installing the Python interpreter, as outlined in the text.
    *   **Answer:** Go to python.org/downloads/, click on the desired Python 3 version, and download the appropriate installer for your operating system.

3.  If you learn Python 3, will you understand codes wri

### Build a RAG Model to Answer Questions

In [16]:
# Prompt for the answering chain. It is built from adjacent string literals so
# that no stray quote characters or source indentation end up in the text sent
# to the model. {context} and {input} are the two template variables.
prompt_template = (
    "You are a knowledgeable and professional AI assistant specializing in providing accurate information from the given context.\n"
    "Your role is to:\n\n"
    "1. Provide clear, concise, and accurate answers based on the provided context and conversation history\n"
    "2. If the context doesn't contain enough information to fully answer a question, acknowledge this limitation\n"
    "3. Maintain a professional and helpful tone while ensuring factual accuracy\n"
    "4. Use direct quotes from the context when relevant to support your answers\n"
    "5. Organize complex responses in a structured, easy-to-read format\n"
    "6. Consider the previous conversation history to maintain context and provide coherent responses\n"
    "7. If you need to make assumptions, explicitly state them\n"
    "8. If the Questions is multiple choices give me the correct answer and the reason in just 1 line\n\n"
    "Remember:\n"
    "- Stay within the scope of the provided context\n"
    "- Use conversation history to better understand the context of questions\n"
    "- Avoid making up information or speculating beyond the given content\n"
    "- If multiple interpretations are possible, present them clearly\n"
    "- Maintain consistency in your responses\n\n"
    "Context from documents:\n{context}\n\n"
    "Question: {input}\n\n"
    "Answer: "
)
                         
# Gemini chat model that writes the answers
model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    api_key=GEMINI_API_KEY,
)

# Wrap the template text; it exposes the "context" and "input" variables
prompt = PromptTemplate(
    input_variables=["context", "input"],
    template=prompt_template,
)

# Question-answering chain that fills the prompt and calls the model
chain = load_qa_chain(prompt=prompt, llm=model)

In [17]:
# function to take the user questions

def user_query(questions: list[str]):
    """Answer each question using the saved FAISS index and the answering chain.

    NOTE: this definition replaces the earlier ``user_query`` (the question
    generator) defined in this notebook; after this cell runs, the earlier
    five-argument call no longer works.

    Relies on the notebook-level ``embedding_model`` and ``chain`` objects.

    Args:
        questions: Questions to answer. A single string is treated as one
            question rather than being iterated character by character.

    Returns:
        A list with one entry per question, in input order; each entry is a
        single-element list holding the chain's ``output_text``. An empty
        input yields an empty list without loading the index.
    """
    if isinstance(questions, str):
        questions = [questions]
    if not questions:
        return []

    # SECURITY: allow_dangerous_deserialization=True opts in to pickle-based
    # loading; only load an index that this notebook created itself.
    new_db = FAISS.load_local(folder_path="faiss-index",
                              embeddings=embedding_model,
                              allow_dangerous_deserialization=True)

    answers = []
    for question in questions:
        # Retrieve the 10 chunks most similar to this question
        relevant_docs = new_db.similarity_search(query=question, k=10)

        chain_input = {
            "input_documents": relevant_docs,
            "input": question
        }

        # invoke() replaces the deprecated direct call on the chain object.
        response = chain.invoke(chain_input, return_only_outputs=True)

        # Each answer stays wrapped in its own list to preserve the return shape.
        answers.append([response["output_text"]])
    return answers

In [23]:
# Sample multiple-choice questions; each string holds the question text followed
# by its four options (A-D), separated by "\n".
questions = [
     "Which of the following is the MOST accurate definition of text classification?\n    *   (A) The process of uncovering hidden topics within a text document.\n    *   (B) The task of assigning one or more categories to a given text from a predefined set of categories.\n    *   (C) The identification of the author of an unknown text.\n    *   (D) The conversion of text into numerical data for analysis.\n ",
     "Which of the following is an example of multiclass text classification?\n    *   (A) Identifying an email as spam or not spam.\n    *   (B) Determining if a news article is fake or real.\n    *   (C) Classifying customer reviews as positive, negative, or neutral.\n    *   (D) Tagging a news article as either 'sports' or 'politics'.\n",
]


# Run every question through the list-based user_query defined in the previous cell
answers = user_query(questions)

In [24]:
# Display the answers: one single-element list per question, in input order
answers

[['(B) The task of assigning one or more categories to a given text from a predefined set of categories, because text classification is defined as "the task of assigning one or more categories to a given piece of text from a larger set of possible categories."'],
 ['(C) Classifying customer reviews as positive, negative, or neutral, because multiclass classification involves more than two classes.']]