
RAG Pipeline- Date Injestion to Vector DB Pipeline

In [38]:
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


In [39]:
## read all the pdfs inside the directory

def process_all_pdfs(pdf_directory):
    """Process all the pdf files from the directory"""

    all_documents = []
    pdf_dir = Path(pdf_directory)

    # find all pdf files recursivley
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nprocessing: {pdf_file.name}")

        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            # add source info to meta data
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)

            print(f"Loaded {len(documents)} pages")
        except Exception as e:
            print(f"Error {e}")
    
    print(f"Total Documents Loaded: {len(all_documents)}")
    return all_documents

# process all the pdf documents from the directory
all_pdf_documents = process_all_pdfs("../data/pdf")


Found 3 PDF files to process

processing: Manual for Skill registry - Industry Registration and Login.pdf
Loaded 17 pages

processing: ibm vmware broker documnet (2).pdf
Loaded 18 pages

processing: React Check List.pdf
Loaded 4 pages
Total Documents Loaded: 39


In [40]:
type(all_pdf_documents[0])

langchain_core.documents.base.Document

In [41]:
# Text Splitting into Chunks

def split_documents(documents, chunk_size=1000,chunk_overlap=200):
    """Split the documents into smaller chunks for better rag performance"""
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators=["\n\n","\n"," ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # sample chunk
    if split_docs:
        print("\nExample Chunk: ")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"metadata: {split_docs[0].metadata}")

    return split_docs


In [42]:
chunks = split_documents(all_pdf_documents)

Split 39 documents into 41 chunks

Example Chunk: 
Content: Key Features of TNSKILL Registry 
1. Simple Registration & Login 
 
Register using Company / Industry details with admin approval 
process. 
 
Multiple login methods: Email/Password, Mobile/Password...
metadata: {'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-09-23T09:29:52+00:00', 'source': '../data/pdf/Manual for Skill registry - Industry Registration and Login.pdf', 'file_path': '../data/pdf/Manual for Skill registry - Industry Registration and Login.pdf', 'total_pages': 17, 'format': 'PDF 1.5', 'title': '', 'author': 'AVP-Portal Naanmudhalvan', 'subject': '', 'keywords': '', 'moddate': '2025-09-23T09:29:52+00:00', 'trapped': '', 'modDate': 'D:20250923092952Z', 'creationDate': "D:20250923092952+00'00'", 'page': 0, 'source_file': 'Manual for Skill registry - Industry Registration and Login.pdf', 'file_type': 'pdf'}


### embeddings and vectorStore DB

In [43]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
class EmbeddingManager:
    """Handles document embedding generation using sentence transformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """

        Initilze the embedding manager

        Args:
            model_name : Huggingface model name for sentence embeddings
        
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the sentence transformer model"""

        try:
            print(f"LOading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)

            print(f"Model Loaded Successfully. Embedded Dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"error loading model {self.model_name}:  {e}")
            raise
    
    def generate_embeddings(self,texts:List[str]) -> np.ndarray:
        """
        Generate Embeddings for a list of texts

        Args:
            texts: List of text strings to embbed
        
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        
        """


        if not self.model:
            raise ValueError("Model not LOaded")
        
        print(f"Generating embeddings for {len(texts)} texts")
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")

        return embeddings
    
    ## initilize the embedding maanger

embedding_manager = EmbeddingManager()
embedding_manager



LOading embedding model: all-MiniLM-L6-v2
Model Loaded Successfully. Embedded Dimension: 384


<__main__.EmbeddingManager at 0x7104956af020>

In [35]:
## Vector Store

class VectorStore:
    """Manages document embeddings in chromadb vector store"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """

        Initilize the vector store

        Args:
            collection_name : Name of the chromadb collection
            persist_directory: Directory to persist the vector store
        """

        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initilize_store()

    def _initilize_store(self):
        """Initilize chromadb client and collection"""
        try:
            # create persistant chromadb client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # get or create collection
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata = {"description": "PDF Document embeddings for RAG"}
            )

            print(f"vector store initilized. Collection : {self.collection_name}")
            print(f"Exisiting documents in collection {self.collection.count()}")

        except Exception as e:
            print(f"ERror initilizing vector store {e}")
            raise
    
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add Documents and their embeddings to the vector store 

        Args:
            document:List of Langchain documents
            embeddings: corrosponding embeddings of the document
        """

        if len(documents) != len(embeddings):
            raise ValueError("number of documents should match the embeddings")
        
        print(f"Adding {len(documents)} documents to vector store")

        # prepare data for chromaDB

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list  = []

        for i , (doc, embedding) in enumerate(zip(documents, embeddings)):

            # create unique id
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # prepare meta data

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document Content
            documents_text.append(doc.page_content)

            # embeddings

            embeddings_list.append(embedding.tolist())

        # add to collections

        try:
            self.collection.add(
                ids=ids,
                embeddings= embeddings_list,
                metadatas = metadatas,
                documents= documents_text
            )

            print(f"successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")    
        except Exception as e:
            print(f"ERror adding documents to vector store") 
            raise   
        
vectorstore = VectorStore()
vectorstore


vector store initilized. Collection : pdf_documents
Exisiting documents in collection 0


<__main__.VectorStore at 0x7104954b7230>

In [None]:
# convert text to embeddings
texts = [doc.page_content for doc in chunks]

# generate the embeddings
embeddings = embedding_manager.generate_embeddings(texts)

# store in the vector store
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 41 texts


Batches: 100%|██████████| 2/2 [00:01<00:00,  1.33it/s]


Generated embeddings with shape: (41, 384)
Adding 41 documents to vector store
successfully added 41 documents to vector store
Total documents in collection: 41


In [47]:
#full content of the doc page content
# texts 

embeddings

array([[-0.11711098, -0.008301  , -0.0131607 , ...,  0.05842284,
        -0.00030941, -0.01528272],
       [-0.03013904,  0.04563437,  0.03822807, ...,  0.03544493,
        -0.02014904,  0.08997379],
       [-0.06649102,  0.00303016, -0.07202322, ...,  0.06313304,
         0.02408824, -0.03392732],
       ...,
       [-0.07805411,  0.02167594,  0.0020808 , ..., -0.00869135,
         0.02055571, -0.015007  ],
       [-0.06792241,  0.03647555,  0.0432132 , ..., -0.01364601,
         0.0210285 , -0.01765706],
       [-0.07062619,  0.02819395, -0.01981876, ..., -0.00553241,
        -0.03520308, -0.05317404]], shape=(41, 384), dtype=float32)

In [52]:
type(texts)

list

In [54]:
print(chunks[10].page_content)
print(texts[10])
print(embeddings[10])

Manual Search 
Click the Manual Search button, select filters such as College Type, Certification, 
Gender, Placement Status, District/Location, Year of Passing, Branch, College 
Name, etc., according to your requirements, and then click the Search Now button 
(highlighted in the image below). 
 
Once you click on Search Now, a list of eligible skilled professionals matching your 
requirements will be displayed.
Manual Search 
Click the Manual Search button, select filters such as College Type, Certification, 
Gender, Placement Status, District/Location, Year of Passing, Branch, College 
Name, etc., according to your requirements, and then click the Search Now button 
(highlighted in the image below). 
 
Once you click on Search Now, a list of eligible skilled professionals matching your 
requirements will be displayed.
[ 5.65390810e-02 -5.79172634e-02  1.48853613e-02  2.37953737e-02
 -4.28575985e-02  2.52832193e-02 -8.22808519e-02 -3.93530577e-02
 -1.29107952e-01 -2.98143532e-02 -4.54