![](assets/img/image.png)
# RAG Maester
**Your AI Scholar**


Welcome to **RAG Maester**, an academic AI assistant designed to support academic excellence.
It leverages **Retrieval-Augmented Generation (RAG)** to meticulously search its knowledge base and craft well-informed responses that help with university assignments and tasks.

#### Import all required libraries

In [None]:
import os
import langchain

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader


### 1. Upload and Ingest Documents 📄 

#### Scan the docs directory for all available documents 

In [None]:
def fetch_all_docs(docs_path: str) -> list[str]:
    """
    List the regular files located directly inside a directory.

    Every returned path is `docs_path` joined with the file name, so a path is
    absolute only when `docs_path` itself is absolute. Subdirectories are
    skipped and never descended into. The result is sorted by file name so that
    repeated runs see the documents in the same order (`os.listdir` itself
    makes no ordering guarantee).

    Args:
        docs_path (str): The directory to scan.
                         Example: "/path/to/your/documents" or "data/reports".

    Returns:
        list[str]: Paths of all files found directly within `docs_path`,
                   sorted by file name.
                   Returns an empty list if:
                   - `docs_path` does not exist or is not a directory.
                   - The directory contains no files.
                   - An OS-level error occurs while listing it
                     (e.g., permission denied).

    Note:
        The failure cases above are reported with `print` and an empty list
        rather than by raising, so callers cannot distinguish "no files" from
        "could not read the directory" by the return value alone.
    """
    # Reject missing paths and non-directories up front with a readable warning.
    if not os.path.isdir(docs_path):
        print(f"Warning: The path '{docs_path}' is not a valid directory or does not exist.")
        return []

    # Only the directory listing can raise here; os.path.isfile reports
    # unreadable entries as False instead of raising.
    try:
        entry_names = sorted(os.listdir(docs_path))
    except OSError as e:
        print(f"Error accessing or reading directory '{docs_path}': {e}")
        return []

    # Keep plain files only; sorting the names first keeps the joined paths sorted.
    candidate_paths = (os.path.join(docs_path, name) for name in entry_names)
    return [path for path in candidate_paths if os.path.isfile(path)]

In [None]:
# Collect every file in the "docs" folder of the current working directory.
# os.path.join builds the path with the platform's separator instead of a
# hard-coded "/".
documents_list = fetch_all_docs(os.path.join(os.getcwd(), "docs"))

print("Total number of documents found: ", len(documents_list))

#### Split the documents into smaller chunks

In [None]:
# Method to clean the text
# This function removes common watermark or repeated footer content

def clean_text(text: str, removal_phrases=None) -> str:
    """
    Remove watermark / repeated footer phrases from a page of text.

    Every occurrence of each phrase is deleted with plain substring matching,
    then leading and trailing whitespace is stripped. Because matching is not
    word-aware, a short phrase such as "Notes" is also removed from inside
    longer words (e.g. "Footnotes" becomes "Foot"), so pass a narrower
    `removal_phrases` when that matters.

    Args:
        text (str): The raw text to clean.
        removal_phrases (iterable of str, optional): Phrases to delete, applied
            in order. When omitted, the default watermark/footer phrases for
            the course PDFs are used.

    Returns:
        str: The text with all phrases removed and outer whitespace stripped.
    """
    if removal_phrases is None:
        # Default boilerplate phrases; used when the caller supplies none.
        removal_phrases = (
            "(c) Amity University Online",
            "Notes",
            "Amity Directorate of Distance & Online Education",
            "Introduction to E-Governance",
        )
    for phrase in removal_phrases:
        text = text.replace(phrase, "")
    return text.strip()

In [None]:


def load_and_split_pdf(doc_path, *, skip_pages=5, chunk_size=1500, chunk_overlap=250):
    """
    Load a PDF, drop its leading pages, clean each remaining page and split it into chunks.

    With the default settings the first 5 pages are ignored, i.e. content is
    taken from the 6th page onward.

    Args:
        doc_path (str): Full path to the PDF file.
        skip_pages (int): Number of leading pages to ignore. Must be >= 0.
            Defaults to 5.
        chunk_size (int): Maximum chunk size passed to the text splitter.
            Defaults to 1500.
        chunk_overlap (int): Overlap between consecutive chunks passed to the
            text splitter. Defaults to 250.

    Returns:
        list: A list of cleaned and chunked documents. Empty when the PDF has
              no pages beyond the skipped ones.

    Raises:
        ValueError: If `skip_pages` is negative.
    """
    # A negative value would turn the slice below into "keep only the last N
    # pages", which is never what a caller skipping front matter wants.
    if skip_pages < 0:
        raise ValueError(f"skip_pages must be non-negative, got {skip_pages}")

    # Load all pages; mode="page" is expected to yield one document per page,
    # which is what the page-based slice below relies on.
    loader = PyPDFLoader(file_path=doc_path, mode="page")
    all_pages = loader.load()

    # Ignore the leading pages
    relevant_pages = all_pages[skip_pages:]

    # Clean watermark/footer text from each page in place
    for page in relevant_pages:
        page.page_content = clean_text(page.page_content)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "],  # fallback separators, paragraph breaks down to spaces
    )

    # Split and return
    return text_splitter.split_documents(relevant_pages)

In [None]:


def process_documents(documents_path_list: list[str]) -> list:
    """
    Turn a list of PDF paths into one flat list of chunks.

    Each path is handed to `load_and_split_pdf`; whatever chunks it returns
    are appended to a shared result list. A document that fails to load or
    yields no chunks is reported on stdout and skipped, so one bad file never
    aborts the whole run.

    Args:
        documents_path_list (list[str]): Full file paths of the PDF documents.

    Returns:
        list: All chunks from every document that was processed successfully,
              in the order the paths were given.
    """
    collected_chunks = []

    for doc_path in documents_path_list:
        print(f"Processing document: {doc_path}")
        try:
            single_doc_chunks = load_and_split_pdf(doc_path)

            # Nothing usable came back: report it and move on to the next file.
            if not single_doc_chunks:
                print(f"No relevant chunks extracted from {doc_path} (e.g., too few pages or empty content after cleaning).")
                continue

            collected_chunks.extend(single_doc_chunks)
            print(f"Successfully processed and extracted {len(single_doc_chunks)} chunks from {doc_path}")
        except FileNotFoundError:
            print(f"Error: Document not found at {doc_path}. Skipping this document.")
        except Exception as e:
            # Best-effort batch: any other per-document failure is logged and skipped.
            print(f"Error processing document {doc_path}: {e}. Skipping this document.")

    return collected_chunks



In [None]:
# Load, clean and chunk every discovered document; `texts` is the flat list of
# chunks from all documents that were processed without error.
texts = process_documents(documents_list)

### 2. Create Embeddings 🧠

## 3. Creating the UI

### 3.1. Using Streamlit

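> **Note:** `ModuleNotFoundError: No module named 'transformers'` — if this error appears, install the `transformers` package (e.g. `pip install transformers`) before launching the app.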

In [None]:
# IPython shell escape: runs `streamlit run streamlit_app.py` to start the UI.
# NOTE(review): presumably this cell keeps running until the Streamlit server
# is stopped — confirm before relying on later cells.
!streamlit run streamlit_app.py