<a href="https://colab.research.google.com/github/Indranil-R/rag-maester/blob/master/rag_maester.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<!-- ![](assets/img/image.png) -->
## RAG Maester
**Your AI Scholar**

Welcome to **RAG Maester**, an academic AI assistant designed to support academic excellence.
It leverages **Retrieval-Augmented Generation (RAG)** to meticulously search its knowledge base and craft well-informed responses that help with university assignments and tasks.


In [1]:
import os
import requests

In [4]:
# Make sure requirements.txt is available locally, fetching it from GitHub
# when it is missing from the working directory.
if os.path.isfile("requirements.txt"):
    print("Requirements.txt already exists. Downloading modules...")
else:
    print("Requirements.txt doesn't exist downloading from github...")
    url = 'https://raw.githubusercontent.com/Indranil-R/rag-maester/refs/heads/master/requirements.txt'
    # Bound the wait so a stalled connection cannot hang the notebook.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as requirements.txt.
    response.raise_for_status()

    with open('requirements.txt', 'w', encoding='utf-8') as file:
        file.write(response.text)
    print("File downloaded successfully.")

# !pip install -q -r requirements.txt  # Enable it only if dependencies are not installed, I have installed already

Requirements.txt already exists. Downloading modules...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.4/303.4 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.1/54.1 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.1/323.1 kB[0m [31m18.3 MB/s[0m eta [36m0:00:

## Importing all required third-party libraries

---



In [7]:
if os.getenv("COLAB_RELEASE_TAG"):
    from google.colab import userdata
else:
    # do nothing
    pass


from loguru import logger
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI


In [8]:
# Setting up Google API key
# Only look the key up when it is not already present in the environment.
if os.getenv("GOOGLE_API_KEY") is None:
    if not os.getenv("COLAB_RELEASE_TAG"):
        # `userdata` is only imported when running on Colab, so outside Colab
        # report the real problem instead of failing with a NameError.
        raise RuntimeError(
            "GOOGLE_API_KEY is not set. Export it in the environment before "
            "running this notebook outside Google Colab."
        )
    # On Colab the key is read from the notebook's stored secrets.
    os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

## 1. Upload and Ingest Documents 📄

### Scan the docs directory for all available documents

In [9]:
# Fetch all file paths from a directory

def fetch_all_docs(docs_path: str) -> list[str]:
    """Return the paths of the regular files directly inside ``docs_path``.

    The scan is not recursive: sub-directories are skipped. Entries are
    sorted by name so the ingestion order is the same on every run and on
    every filesystem (``os.listdir`` returns entries in arbitrary order).

    Args:
        docs_path: Directory to scan.

    Returns:
        Sorted list of file paths (each is ``docs_path`` joined with the file
        name). An empty list is returned when ``docs_path`` is not a
        directory or cannot be read.
    """
    if not os.path.isdir(docs_path):
        print(f"Warning: The path '{docs_path}' is not a valid directory or does not exist.")
        return []
    try:
        entry_names = sorted(os.listdir(docs_path))
    except OSError as e:
        logger.error(f"Error accessing or reading directory '{docs_path}': {e}")
        return []
    candidate_paths = (os.path.join(docs_path, name) for name in entry_names)
    # Keep regular files only; directories and other entries are ignored.
    return [path for path in candidate_paths if os.path.isfile(path)]

In [24]:
# Fetching all documents from the docs directory
# os.path.join builds the path with the platform's separator instead of a
# hard-coded "/" appended to the working directory.
documents_list = fetch_all_docs(os.path.join(os.getcwd(), "docs"))

logger.info(f"Total number of documents found: {len(documents_list)}")

[32m2025-05-17 01:08:05.846[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m4[0m - [1mTotal number of documents found: 3[0m


#### Split the documents into smaller chunks

In [25]:
# Clean text by removing predefined phrases

# Boilerplate strings stripped from page text by default. They are removed in
# this order, one after another.
_DEFAULT_REMOVAL_PHRASES = (
    "(c) Amity University Online",
    "Notes",
    "Amity Directorate of Distance & Online Education",
    "Introduction to E-Governance",
)


def clean_text(text, removal_phrases=None):
    """Remove boilerplate phrases from ``text`` and trim surrounding whitespace.

    Every occurrence of each phrase is deleted (plain, case-sensitive
    substring replacement), then leading/trailing whitespace is stripped.

    Args:
        text: The string to clean.
        removal_phrases: Optional iterable of phrases to delete. When omitted
            (``None``), the built-in default phrase list is used. Pass an
            empty iterable to only strip whitespace.

    Returns:
        The cleaned string.
    """
    phrases = _DEFAULT_REMOVAL_PHRASES if removal_phrases is None else removal_phrases
    for phrase in phrases:
        text = text.replace(phrase, "")
    return text.strip()


In [26]:
# Load a PDF from the 6th page onward, clean, and split into chunks

def load_and_split_pdf(doc_path, *, skip_pages=5, chunk_size=1500, chunk_overlap=250):
    """Load a PDF, drop its leading pages, clean the text and split it into chunks.

    Args:
        doc_path: Path of the PDF file to load.
        skip_pages: Number of leading pages to discard. The default of 5
            starts from the 6th page.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``split_documents`` on the cleaned pages.

    Raises:
        ValueError: If ``skip_pages`` is negative (a negative slice start
            would silently keep only the last pages instead of skipping).
    """
    if skip_pages < 0:
        raise ValueError(f"skip_pages must be >= 0, got {skip_pages}")
    loader = PyPDFLoader(file_path=doc_path, mode="page")
    all_pages = loader.load()
    relevant_pages = all_pages[skip_pages:]
    # Clean each page in place before splitting so the boilerplate phrases
    # never end up inside a chunk.
    for page in relevant_pages:
        page.page_content = clean_text(page.page_content)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "],
    )
    return text_splitter.split_documents(relevant_pages)


In [27]:
# Process multiple PDF documents into cleaned, chunked outputs

def process_documents(documents_path_list: list[str]) -> list:
    """Load, clean and chunk every PDF in ``documents_path_list``.

    Each path is handed to ``load_and_split_pdf``. A failure on one document
    is logged and does not stop the remaining documents from being processed.

    Args:
        documents_path_list: Paths of the PDF files to process.

    Returns:
        A single flat list with the chunks of all documents that were
        processed successfully, in input order.
    """
    collected_chunks = []
    for pdf_path in documents_path_list:
        logger.info(f"Processing document: {pdf_path}")
        try:
            chunks = load_and_split_pdf(pdf_path)
            if not chunks:
                logger.warning(f"No relevant chunks found in {pdf_path}.")
            else:
                collected_chunks += chunks
                logger.info(f"Successfully processed and extracted {len(chunks)} chunks from {pdf_path}")
        except FileNotFoundError:
            logger.error(f"File not found: {pdf_path}. Please check the file path.")
        except Exception as exc:
            # Best effort: report the problem and carry on with the next file.
            logger.error(f"Error processing document {pdf_path}: {exc}")
    return collected_chunks


In [28]:
# Load, clean and chunk every discovered document into one flat list of chunks
documents = process_documents(documents_list)

[32m2025-05-17 01:08:27.399[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_documents[0m:[36m6[0m - [1mProcessing document: /content/docs/Introduction to Data Science F-CSIT359-S.pdf[0m
[32m2025-05-17 01:08:32.310[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_documents[0m:[36m11[0m - [1mSuccessfully processed and extracted 961 chunks from /content/docs/Introduction to Data Science F-CSIT359-S.pdf[0m
[32m2025-05-17 01:08:32.311[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_documents[0m:[36m6[0m - [1mProcessing document: /content/docs/Introduction to E-Governance F-CSIT326 S.pdf[0m
[32m2025-05-17 01:08:37.847[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_documents[0m:[36m11[0m - [1mSuccessfully processed and extracted 978 chunks from /content/docs/Introduction to E-Governance F-CSIT326 S.pdf[0m
[32m2025-05-17 01:08:37.848[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_documents[0m:[36m6[0m - [1mProcessing documen

## 2. Create Embeddings 🧠

In [29]:
# Creating the embedding function here

# The newer experimental embedding model was tried first:
# embedding_fn = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-exp-03-07")
# It failed with "Resource has been exhausted" (it is not free), so a free model is used instead.

embedding_fn = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

In [30]:
persist_directory = 'db'
# exist_ok=True already makes this a no-op when the directory exists, so a
# separate os.path.exists check beforehand is unnecessary.
os.makedirs(persist_directory, exist_ok=True)

# Embed every chunk and build the Chroma vector store, passing
# persist_directory so the collection is stored under that directory.
vectordb = Chroma.from_documents(documents, embedding=embedding_fn, persist_directory=persist_directory)

# Alternative: open a store that already exists under persist_directory
# instead of embedding the documents again.
# vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_fn)


InternalError: Error getting collection: Missing field: [Missing metadata segment]

### Creating the vector retriever

In [None]:
# google.colab only exists inside a Colab runtime, so guard the import and
# the mount the same way the `userdata` import is guarded.
if os.getenv("COLAB_RELEASE_TAG"):
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
# Similarity-search retriever over the vector store, configured with k=7.
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 7})

# Smoke test: run one query and log the first hit.
retrieved_docs = retriever.invoke("What is benefit of Bitcoin?")
if retrieved_docs:
    logger.debug(retrieved_docs[0])
else:
    # Avoid an IndexError when the store returns nothing for the query.
    logger.warning("Retriever returned no documents for the test query.")

### Invoking the LLM to structure and return the response

In [None]:
logger.info("Initializing the Gemini LLM instance")
# Chat model used to generate the final answer; configured with the
# "gemini-2.0-flash" model, temperature=0.3 and max_tokens=500.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash",temperature=0.3, max_tokens=500)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System prompt for the question-answering chain.
# Adjacent string literals are concatenated with no separator, so every
# sentence carries its own trailing space and the "{context}" placeholder is
# fenced by blank lines; otherwise the sentences and the retrieved context
# run straight into the neighbouring text.
# "{context}" is the only template placeholder in this string.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
    "\n\n"
    "Below are some examples showing a question and answer format:"
    """
    Question: The use of e-governance helps make all functions of the ____________ transparent.
              Question 1
              Answer a. retail.
              b. business.
              c. Both A & B.
              d. None of the above.

    Answer:  b. business.
                Because e-governance is a system that uses technology to improve the efficiency and transparency of government operations, making it easier for citizens to access information and services.


    Question: __________does not directly links to accountability.

              Question 2Answer
              a.
              Opaque.
              b.
              Transparency.
              c.
              Both A & B.
              d.
              None of the above.

    Answer:  a. Opaque.
                Because Opaque means not able to be seen through; not transparent. In the context of accountability, it suggests a lack of clarity or openness in processes or decisions, which does not directly link to accountability.



    Now, Answer the user question correctly given the example formats above:


    """
)

# Chat prompt with two messages: the system instructions (which contain the
# "{context}" placeholder) and the human turn carrying the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
# Build the answering chain from the LLM and the prompt, then wrap it together
# with the retriever into the retrieval chain that is invoked with the user input.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Ask a sample multiple-choice question; the text is passed under the "input"
# key and the generated reply is read from the "answer" key of the result.
response = rag_chain.invoke({"input": """
What is the advantage of Data Science?

Question 1Answer
a.
It is blurry

b.
Gives good salary

c.
A person can work on different approach

d.
It is very good defined
"""})
print(response["answer"])

## 3. Creating the UI

### 3.1. Using Streamlit