
RAG Pipeline- Date Injestion to Vector DB Pipeline

In [90]:
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


In [93]:
## read all the pdfs inside the directory

def process_all_pdfs(pdf_directory):
    """Process all the pdf files from the directory"""

    all_documents = []
    pdf_dir = Path(pdf_directory)

    # find all pdf files recursivley
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nprocessing: {pdf_file.name}")

        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            # add source info to meta data
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)

            print(f"Loaded {len(documents)} pages")
        except Exception as e:
            print(f"Error {e}")
    
    print(f"Total Documents Loaded: {len(all_documents)}")
    return all_documents

# process all the pdf documents from the directory
all_pdf_documents = process_all_pdfs("../data")


Found 2 PDF files to process

processing: Manual for Skill registry - Industry Registration and Login.pdf
Loaded 17 pages

processing: MANYAM-SANJAY-KUMAR-REDDY.pdf
Loaded 3 pages
Total Documents Loaded: 20


In [94]:
type(all_pdf_documents[0])

langchain_core.documents.base.Document

In [95]:
# Text Splitting into Chunks

def split_documents(documents, chunk_size=1000,chunk_overlap=200):
    """Split the documents into smaller chunks for better rag performance"""
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators=["\n\n","\n"," ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # sample chunk
    if split_docs:
        print("\nExample Chunk: ")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"metadata: {split_docs[0].metadata}")

    return split_docs


In [96]:
chunks = split_documents(all_pdf_documents)

Split 20 documents into 26 chunks

Example Chunk: 
Content: Key Features of TNSKILL Registry 
1. Simple Registration & Login 
 
Register using Company / Industry details with admin approval 
process. 
 
Multiple login methods: Email/Password, Mobile/Password...
metadata: {'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-09-23T09:29:52+00:00', 'source': '../data/Manual for Skill registry - Industry Registration and Login.pdf', 'file_path': '../data/Manual for Skill registry - Industry Registration and Login.pdf', 'total_pages': 17, 'format': 'PDF 1.5', 'title': '', 'author': 'AVP-Portal Naanmudhalvan', 'subject': '', 'keywords': '', 'moddate': '2025-09-23T09:29:52+00:00', 'trapped': '', 'modDate': 'D:20250923092952Z', 'creationDate': "D:20250923092952+00'00'", 'page': 0, 'source_file': 'Manual for Skill registry - Industry Registration and Login.pdf', 'file_type': 'pdf'}


### embeddings and vectorStore DB

In [97]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [98]:
class EmbeddingManager:
    """Handles document embedding generation using sentence transformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """

        Initilze the embedding manager

        Args:
            model_name : Huggingface model name for sentence embeddings
        
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the sentence transformer model"""

        try:
            print(f"LOading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)

            print(f"Model Loaded Successfully. Embedded Dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"error loading model {self.model_name}:  {e}")
            raise
    
    def generate_embeddings(self,texts:List[str]) -> np.ndarray:
        """
        Generate Embeddings for a list of texts

        Args:
            texts: List of text strings to embbed
        
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        
        """


        if not self.model:
            raise ValueError("Model not LOaded")
        
        print(f"Generating embeddings for {len(texts)} texts")
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")

        return embeddings
    
    ## initilize the embedding maanger

embedding_manager = EmbeddingManager()
embedding_manager



LOading embedding model: all-MiniLM-L6-v2
Model Loaded Successfully. Embedded Dimension: 384


<__main__.EmbeddingManager at 0x7104956a75f0>

In [111]:
## Vector Store

class VectorStore:
    """Manages document embeddings in chromadb vector store"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../chromadb/vector_store"):
        """

        Initilize the vector store

        Args:
            collection_name : Name of the chromadb collection
            persist_directory: Directory to persist the vector store
        """

        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initilize_store()

    def _initilize_store(self):
        """Initilize chromadb client and collection"""
        try:
            # create persistant chromadb client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # get or create collection
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata = {"description": "PDF Document embeddings for RAG"}
            )

            print(f"vector store initilized. Collection : {self.collection_name}")
            print(f"Exisiting documents in collection {self.collection.count()}")

        except Exception as e:
            print(f"ERror initilizing vector store {e}")
            raise
    
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add Documents and their embeddings to the vector store 

        Args:
            document:List of Langchain documents
            embeddings: corrosponding embeddings of the document
        """

        if len(documents) != len(embeddings):
            raise ValueError("number of documents should match the embeddings")
        
        print(f"Adding {len(documents)} documents to vector store")

        # prepare data for chromaDB

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list  = []

        for i , (doc, embedding) in enumerate(zip(documents, embeddings)):

            # create unique id
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # prepare meta data

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document Content
            documents_text.append(doc.page_content)

            # embeddings

            embeddings_list.append(embedding.tolist())

        # add to collections

        try:
            self.collection.add(
                ids=ids,
                embeddings= embeddings_list,
                metadatas = metadatas,
                documents= documents_text
            )

            print(f"successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")    
        except Exception as e:
            print(f"ERror adding documents to vector store") 
            raise   
        
vectorstore = VectorStore()
vectorstore


vector store initilized. Collection : pdf_documents
Exisiting documents in collection 0


<__main__.VectorStore at 0x7104956ae420>

In [112]:
# convert text to embeddings
texts = [doc.page_content for doc in chunks]

In [113]:
# generate the embeddings
embeddings = embedding_manager.generate_embeddings(texts)

Generating embeddings for 26 texts


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]

Generated embeddings with shape: (26, 384)





In [114]:
# store in the vector store
vectorstore.add_documents(chunks, embeddings)

Adding 26 documents to vector store
successfully added 26 documents to vector store
Total documents in collection: 26


In [115]:
#full content of the doc page content
# texts 

embeddings

array([[-1.1711098e-01, -8.3010029e-03, -1.3160699e-02, ...,
         5.8422845e-02, -3.0940678e-04, -1.5282723e-02],
       [-3.0139044e-02,  4.5634367e-02,  3.8228072e-02, ...,
         3.5444930e-02, -2.0149043e-02,  8.9973792e-02],
       [-6.6491023e-02,  3.0301595e-03, -7.2023220e-02, ...,
         6.3133039e-02,  2.4088241e-02, -3.3927321e-02],
       ...,
       [-1.4644398e-01,  6.7608147e-05, -2.7036730e-02, ...,
        -3.2940016e-03, -1.2926569e-02,  2.4265300e-02],
       [-1.4477530e-01, -6.7603953e-02,  7.5297724e-03, ...,
        -5.3227521e-02, -3.9992481e-02,  6.2061094e-02],
       [-8.5309140e-02,  6.2356364e-02,  1.2115482e-02, ...,
        -6.6514254e-02, -2.1456851e-02,  6.7950841e-03]],
      shape=(26, 384), dtype=float32)

In [116]:
type(texts)

list

In [119]:
print(chunks[11].page_content)
print(texts[11])
print(embeddings[11])

Step 4: 
You can view Skilled professional details such as: 
 
Candidate Name 
 
Skills & Qualifications 
 
Location 
 
Experience 
 
Naan Mudhalvan Certification Details 
 
Resume 
 
Current Status 
 
Profile 
 
Year of Passing 
 
District, etc. 
To further shortlist candidates, you can use the filters in the left pane, such as: 
 
Gender 
 
Year of Passing 
 
College Type 
 
District, etc.
Step 4: 
You can view Skilled professional details such as: 
 
Candidate Name 
 
Skills & Qualifications 
 
Location 
 
Experience 
 
Naan Mudhalvan Certification Details 
 
Resume 
 
Current Status 
 
Profile 
 
Year of Passing 
 
District, etc. 
To further shortlist candidates, you can use the filters in the left pane, such as: 
 
Gender 
 
Year of Passing 
 
College Type 
 
District, etc.
[-1.57551828e-03  4.77492511e-02  3.82212293e-03  1.07069314e-02
 -2.38509523e-03  4.70534675e-02 -3.81530933e-02 -6.97295442e-02
 -1.42953724e-01 -7.32311904e-02 -4.76059802e-02 -1.

In [123]:
class RAGRetriever:
    """Handles Query based retrieval from vector store"""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """

        Initilize the retrivever

        Args:
            vector_store: Vector Store containing document Embeddings
            embeddings_manager : Manager for generating query embeddings

        """

        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """

        Retrieve the relevant elements for a query

        Args:
            query : the search query
            top_k : NUmber of top results to return
            score_threshold: Minimum similarity score threshold
         
        Returns:
            List of Dictionaries containing retrived documents and metadatas
        """

        print(f"Retrieving documents for the '{query}'")
        print(f"Top K : {top_k}, score threshold : {score_threshold}")

        # generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings = [query_embedding.tolist()],
                n_results = top_k
            )

            # processed results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids=results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # convery distance to similary score (chromadb uses cosine distance)

                    similrity_score = 1 - distance

                    if similrity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similrity_score,
                            'distance': distance,
                            'rank': i + 1 
                        })
                print(f"Retrieved {len(retrieved_docs)} documents (after filetring)")

            else:
                print("NO Documents Found")

            return retrieved_docs
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
            
rag_retriever = RAGRetriever(vectorstore, embedding_manager)
rag_retriever

    

<__main__.RAGRetriever at 0x71043a2ab5c0>

In [129]:
rag_retriever.retrieve("what is tnskill registry ")

Retrieving documents for the 'what is tnskill registry '
Top K : 5, score threshold : 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.28it/s]

Generated embeddings with shape: (1, 384)
Retrieved 2 documents (after filetring)





[{'id': 'doc_5761f1bc_2',
  'content': '\uf0b7 \nPublish after Admin Approval. \n\uf0b7 \nView responses and engagement directly from the dashboard. \nTNSkill Registry Manual \nIndustry / Company /Employer Registration and Login \nStep 1: \nOpen your browser, enter naanmudhalvan.tn.gov.in in the address bar, and click on \nTNSkill Registry Button (as highlighted in the image below). \n \nStep 2: \nClick on “Register” button (as highlighted in the image below).',
  'metadata': {'source': '../data/Manual for Skill registry - Industry Registration and Login.pdf',
   'total_pages': 17,
   'file_type': 'pdf',
   'modDate': 'D:20250923092952Z',
   'doc_index': 2,
   'trapped': '',
   'format': 'PDF 1.5',
   'content_length': 403,
   'author': 'AVP-Portal Naanmudhalvan',
   'page': 1,
   'creationDate': "D:20250923092952+00'00'",
   'moddate': '2025-09-23T09:29:52+00:00',
   'creationdate': '2025-09-23T09:29:52+00:00',
   'file_path': '../data/Manual for Skill registry - Industry Registration