### RAG Pipeline - Data Ingestion to Vector DB Pipeline

In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


## Read all the PDFs inside the directory
def process_all_pdfs(pdf_direcotry):
    """Load every PDF found under a directory into one flat list of documents.

    The directory is searched recursively. Each PDF is loaded with
    ``PyPDFLoader`` and every document it returns is tagged with two extra
    metadata keys: ``source_file`` (the PDF's file name, without its
    directory) and ``file_type`` (always ``"pdf"``).

    Loading is best-effort: a PDF that raises while loading is reported on
    stdout and skipped, so one bad file does not abort the whole run.

    Args:
        pdf_direcotry: Path (``str`` or ``Path``) of the directory to search.

    Returns:
        A list with the documents of all PDFs that loaded successfully, in
        sorted path order. The list is empty when the directory does not
        exist or contains no PDF files.
    """
    all_documents = []
    pdf_dir = Path(pdf_direcotry)

    # Without this check a wrong relative path looks exactly like an empty folder.
    if not pdf_dir.is_dir():
        print(f"Directory not found: {pdf_direcotry}")
        return all_documents

    # glob() yields entries in arbitrary, filesystem-dependent order and also
    # matches directories whose name ends in ".pdf". Keep regular files only
    # and sort them so the resulting document order is reproducible.
    pdf_files = sorted(path for path in pdf_dir.glob("**/*.pdf") if path.is_file())

    print(f"Found {len(pdf_files)} PDF files in the directory: {pdf_direcotry} to process.")

    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file.name}")

        try:
            # Load the PDF document
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = "pdf"

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages.")

        except Exception as e:
            # Deliberately broad: report the failing file and keep going.
            print(f"Error processing file {pdf_file}: {e}")

    print(f"Total documents loaded from all PDFs: {len(all_documents)}")
    return all_documents

# Load every PDF under ../data (searched recursively) into one flat list of documents.
# The path is relative, so it resolves against the kernel's current working directory.
all_pdf_documents = process_all_pdfs("../data")

Found 1 PDF files in the directory: ../data to process.
Processing file: Project Proposal A-M-S.pdf
Loaded 16 pages.
Total documents loaded from all PDFs: 16


In [4]:
# Preview only the first two documents: displaying the whole list floods the
# notebook with the full text of every page (including any personal data in it).
all_pdf_documents[:2]

[Document(metadata={'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ashutosh Adhikari', 'moddate': '2026-01-31T21:58:09+05:45', 'source': '..\\data\\pdf\\Project Proposal A-M-S.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1', 'source_file': 'Project Proposal A-M-S.pdf', 'file_type': 'pdf'}, page_content='TRIBHUV AN UNIVERSITY \nInstitute of Science and Technology \n \n \nA Project Proposal \nOn \n"E-Voting System" \n \nSubmitted to \nDepartment of Statistics and Computer Science \nPatan Multiple Campus \n \nIn partial fulfillement of the requriments for Bachelor Degree in Computer \nscience and Information Technology \n \n \nSubmitted By: \nAshutosh Adhikari (79010020) \nManish Basnet (79010054) \nSnehal Sigdel (79010119) \n \nDate: \n1st Feb 2026'),
 Document(metadata={'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ash

In [5]:
### Text splitting

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Cut the given documents into smaller chunks and return them.

    A ``RecursiveCharacterTextSplitter`` is configured with the requested
    ``chunk_size`` and ``chunk_overlap``, measures length with ``len`` and
    tries the separators ``"\\n\\n"``, ``"\\n"``, ``" "`` and ``""``.

    A short summary is printed to stdout; when at least one chunk was
    produced, the first 500 characters and the metadata of the first chunk
    are printed as an example.

    Args:
        documents: Documents to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)

    pieces = chunker.split_documents(documents)
    piece_count = len(pieces)
    print(f"Split {len(documents)} documents into {piece_count} chunks.")

    # Show a sample so the reader can sanity-check what a chunk looks like.
    if pieces:
        sample = pieces[0]
        preview = sample.page_content[:500]  # first 500 characters only
        print("\nExample chunk:")
        print(f"Content: {preview}.....")
        print(f"Metadata: {sample.metadata}")

    print(f"Total chunks created: {piece_count}")
    return pieces

In [6]:
chunks = split_documents(all_pdf_documents)
# Preview a few chunks instead of dumping every chunk's full text into the cell output.
chunks[:3]

Split 16 documents into 31 chunks.

Example chunk:
Content: TRIBHUV AN UNIVERSITY 
Institute of Science and Technology 
 
 
A Project Proposal 
On 
"E-Voting System" 
 
Submitted to 
Department of Statistics and Computer Science 
Patan Multiple Campus 
 
In partial fulfillement of the requriments for Bachelor Degree in Computer 
science and Information Technology 
 
 
Submitted By: 
Ashutosh Adhikari (79010020) 
Manish Basnet (79010054) 
Snehal Sigdel (79010119) 
 
Date: 
1st Feb 2026.....
Metadata: {'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ashutosh Adhikari', 'moddate': '2026-01-31T21:58:09+05:45', 'source': '..\\data\\pdf\\Project Proposal A-M-S.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1', 'source_file': 'Project Proposal A-M-S.pdf', 'file_type': 'pdf'}
Total chunks created: 31


[Document(metadata={'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ashutosh Adhikari', 'moddate': '2026-01-31T21:58:09+05:45', 'source': '..\\data\\pdf\\Project Proposal A-M-S.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1', 'source_file': 'Project Proposal A-M-S.pdf', 'file_type': 'pdf'}, page_content='TRIBHUV AN UNIVERSITY \nInstitute of Science and Technology \n \n \nA Project Proposal \nOn \n"E-Voting System" \n \nSubmitted to \nDepartment of Statistics and Computer Science \nPatan Multiple Campus \n \nIn partial fulfillement of the requriments for Bachelor Degree in Computer \nscience and Information Technology \n \n \nSubmitted By: \nAshutosh Adhikari (79010020) \nManish Basnet (79010054) \nSnehal Sigdel (79010119) \n \nDate: \n1st Feb 2026'),
 Document(metadata={'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ash