RAG PIPELINES - DATA INGESTION TO VECTOR DB

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [5]:
def process_all_pdfs(pdf_directory):
    """Load every PDF found under ``pdf_directory`` into one flat list.

    The directory is searched recursively. Files are matched on a
    case-insensitive ``.pdf`` extension and processed in sorted path order,
    so the order of the returned documents is the same on every run and
    every platform.

    Args:
        pdf_directory: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        A list with the documents produced by ``PyMuPDFLoader(...).load()``
        for each file, concatenated in file order. Every document gets two
        extra metadata entries: ``"source_file"`` (the path of the PDF as a
        string) and ``"file_type"`` (always ``'pdf'``). The list is empty
        when the directory is missing or contains no PDFs.

    A file that fails to load is reported on stdout and skipped; it does not
    abort the remaining files.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    if not pdf_dir.is_dir():
        # Scanning a missing path silently yields nothing, which is
        # indistinguishable from an empty folder; report it explicitly.
        print(f"Warning: {pdf_dir} is not an existing directory.")
        pdf_files = []
    else:
        # sorted() gives a deterministic processing order (directory listing
        # order is filesystem-dependent); the lower-cased name check also
        # picks up files such as "REPORT.PDF" on case-sensitive filesystems.
        pdf_files = sorted(
            path
            for path in pdf_dir.rglob("*")
            if path.is_file() and path.name.lower().endswith(".pdf")
        )

    print(f"Found {len(pdf_files)} PDF files.")

    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            # Tag each loaded document with where it came from.
            for doc in documents:
                doc.metadata["source_file"] = str(pdf_file)
                doc.metadata["file_type"] = 'pdf'

            all_documents.extend(documents)
        except Exception as e:
            # Best-effort ingestion: one unreadable PDF must not stop the rest.
            print(f"Error processing {pdf_file}: {e}")

    print(f"Total documents loaded: {len(all_documents)}")
    return all_documents

# Load all PDFs below ../data (searched recursively) into one flat list.
# NOTE(review): the path is relative, so it presumably resolves against the
# notebook's working directory — confirm before running from elsewhere.
all_pdf_documents = process_all_pdfs("../data")


Found 3 PDF files.
Processing file: java.pdf
Processing file: machinelearning.pdf
Processing file: sql.pdf
Total documents loaded: 8


In [6]:
# Inspect the loaded documents (metadata + page_content).
# NOTE(review): this displays the entire list; a slice such as
# all_pdf_documents[:2] would keep the cell output short.
all_pdf_documents

[Document(metadata={'producer': 'Online2PDF.com', 'creator': 'Online2PDF.com', 'creationdate': '2025-12-18T09:23:34+01:00', 'source': '..\\data\\pdf\\java.pdf', 'file_path': '..\\data\\pdf\\java.pdf', 'total_pages': 5, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': "D:20251218092334+01'00'", 'page': 0, 'source_file': '..\\data\\pdf\\java.pdf', 'file_type': 'pdf'}, page_content='Java isa popular, high-level, object-oriented programming language and \ncomputing platformknown for its "Write Once, Run Anywhere" (WORA) \ncapability, allowing code to run on any device with a Java Virtual \nMachine (JVM). It\'s widely used for building mobile apps, enterprise \nsoftware, web applications, games, and big data solutions, offering a \nrobust, secure, and platform-independent environment for developers \n \nJava is a high-level, object-oriented programming language used to build \nweb apps, mobile applica

In [2]:
### TEXT SPLITTING ###

def split_documents(documents, chunk_size=1000, chunk_overlap=100):
    """Split documents into chunks and print a short summary.

    Args:
        documents: Documents to hand to the splitter's ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The list of chunks returned by the splitter.

    Prints the number of chunks and, when there is at least one, the first
    200 characters and the metadata of the first chunk.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        # Separators are tried in this order: paragraph, line, word, character.
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunked = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Total documents after splitting: {len(chunked)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not chunked:
        return chunked

    first_chunk = chunked[0]
    print("Sample split document:")
    print(f"Sample chunk content:\n{first_chunk.page_content[:200]}...")
    print(f"Sample chunk metadata:\n{first_chunk.metadata}")
    return chunked

In [7]:
# Chunk the loaded PDF documents with the defaults (chunk_size=1000, chunk_overlap=100).
chunks = split_documents(all_pdf_documents)
# NOTE(review): this prints every chunk; a slice such as chunks[:2] would
# keep the cell output readable.
print(chunks)

Total documents after splitting: 16 chunks
Sample split document:
Sample chunk content:
Java isa popular, high-level, object-oriented programming language and 
computing platformknown for its "Write Once, Run Anywhere" (WORA) 
capability, allowing code to run on any device with a Java Vi...
Sample chunk metadata:
{'producer': 'Online2PDF.com', 'creator': 'Online2PDF.com', 'creationdate': '2025-12-18T09:23:34+01:00', 'source': '..\\data\\pdf\\java.pdf', 'file_path': '..\\data\\pdf\\java.pdf', 'total_pages': 5, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': "D:20251218092334+01'00'", 'page': 0, 'source_file': '..\\data\\pdf\\java.pdf', 'file_type': 'pdf'}
[Document(metadata={'producer': 'Online2PDF.com', 'creator': 'Online2PDF.com', 'creationdate': '2025-12-18T09:23:34+01:00', 'source': '..\\data\\pdf\\java.pdf', 'file_path': '..\\data\\pdf\\java.pdf', 'total_pages': 5, 'format': 'PDF 1.4', 'titl