In [1]:
import os 
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.document_loaders import DirectoryLoader  
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Read all pdfs from the directory:
def proces_all_pdfs(pdf_directory):
    all_docs = []
    pdf_dir = Path(pdf_directory)
    pdf_files = list(pdf_dir.glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDF files in directory {pdf_directory}")
    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file}")
        loader = PyMuPDFLoader(str(pdf_file)) ## divides per page
        documents = loader.load()
        ## Add information to metadata
        for doc in documents:
            doc.metadata['source_file'] = pdf_file.name
            doc.metadata['file_type'] = 'pdf'
        all_docs.extend(documents)
        print(f"Loaded {len(documents)} documents from {pdf_file}")

    print(f"Total documents loaded from all PDFs: {len(all_docs)}")
    return all_docs

all_documents = proces_all_pdfs("../Data/")

Found 1 PDF files in directory ../Data/
Processing file: ../Data/SystemDesignInterview-v1-alex-xu.pdf
Loaded 269 documents from ../Data/SystemDesignInterview-v1-alex-xu.pdf
Total documents loaded from all PDFs: 269


In [3]:
### text splitting into chunks
def split_documents(documents, chunk_size=1000, chunk_overlap=300):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, separators=["\n\n", "\n", " ", ""])
    split_documents = text_splitter.split_documents(documents) # Splitting documents into chunks, not same function as load, its text splitting . object
    print(f"Total documents after splitting: {len(split_documents)}")
    print(f"Split {len(documents)} documents into {len(split_documents)} chunks.")


    if split_documents:
        print("Sample split document metadata and content:")
        sample_doc = split_documents[0]
        print(f"Metadata: {sample_doc.metadata}")
        print(f"Content (first 500 chars): {sample_doc.page_content[:500]}")

    return split_documents

In [4]:
chunks = split_documents(all_documents)
chunks


Total documents after splitting: 450
Split 269 documents into 450 chunks.
Sample split document metadata and content:
Metadata: {'producer': 'macOS Version 15.3.1 (Build 24D70) Quartz PDFContext, AppendMode 1.1', 'creator': 'calibre 3.9.0 [https://calibre-ebook.com]', 'creationdate': '2020-10-16T23:12:01+00:00', 'source': '../Data/SystemDesignInterview-v1-alex-xu.pdf', 'file_path': '../Data/SystemDesignInterview-v1-alex-xu.pdf', 'total_pages': 269, 'format': 'PDF 1.4', 'title': "System Design Interview – An insider's guide, Second Edition: Step by Step Guide, Tips and 15 System Design Interview Questions with Detailed Solutions", 'author': 'Alex Xu', 'subject': '', 'keywords': '', 'moddate': "D:20250310164445Z00'00'", 'trapped': '', 'modDate': "D:20250310164445Z00'00'", 'creationDate': "D:20201016231201+00'00'", 'page': 1, 'source_file': 'SystemDesignInterview-v1-alex-xu.pdf', 'file_type': 'pdf'}
Content (first 500 chars): System Design Interview: An Insider’s Guide
All rights reserved

[Document(metadata={'producer': 'macOS Version 15.3.1 (Build 24D70) Quartz PDFContext, AppendMode 1.1', 'creator': 'calibre 3.9.0 [https://calibre-ebook.com]', 'creationdate': '2020-10-16T23:12:01+00:00', 'source': '../Data/SystemDesignInterview-v1-alex-xu.pdf', 'file_path': '../Data/SystemDesignInterview-v1-alex-xu.pdf', 'total_pages': 269, 'format': 'PDF 1.4', 'title': "System Design Interview – An insider's guide, Second Edition: Step by Step Guide, Tips and 15 System Design Interview Questions with Detailed Solutions", 'author': 'Alex Xu', 'subject': '', 'keywords': '', 'moddate': "D:20250310164445Z00'00'", 'trapped': '', 'modDate': "D:20250310164445Z00'00'", 'creationDate': "D:20201016231201+00'00'", 'page': 1, 'source_file': 'SystemDesignInterview-v1-alex-xu.pdf', 'file_type': 'pdf'}, page_content='System Design Interview: An Insider’s Guide\nAll rights reserved. This book or any portion thereof may not be reproduced or used in any\nmanner whatsoever without the express written p

## Embedding and Vector db

In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from  chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple

import sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
class EmbeddingManager:
    """Handles doc embedding generation using sentence transformers"""

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = None
        self.__load_model()

    def __load_model(self):
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Emdedding dimension: {self.model.get_sentence_embedding_dimension()} ")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generates embeddings for a list of texts"""
        if not self.model:
            raise ValueError("Model not loaded.")
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        return embeddings
    
### initialize embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Emdedding dimension: 384 


<__main__.EmbeddingManager at 0x13b138810>

“384 is the dimensionality of the embedding vector produced by the MiniLM sentence transformer. Each embedded sentence is represented as a 384-dimensional numerical vector in semantic space, allowing similarity comparisons via cosine distance or dot product.”

## Vector DB

In [13]:
class VectorStore:
    """Manages vector store using ChromaDB"""
    def __init__(self, collection_name: str ="pdf_documents", persist_directory: str ="../Data/vector_store/"):
        """
        Initialize the VectorStore with a collection name and persist directory.
        Args:
        collection_name (str): Name of the collection in ChromaDB.
        persist_directory (str): Directory to persist the ChromaDB database.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self.__initialize_client()

    def __initialize_client(self):
        """Initializes the ChromaDB client and collection."""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # get or create collection
            self.collection = self.client.get_or_create_collection(name=self.collection_name, metadata={"description": "PDF Document Chunks Collection"})
            print(f"Vector store initialized with collection: {self.collection_name}")
            print(f"Exusting number of vectors in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing ChromaDB client: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Adds documents and their embeddings to the vector store."""
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        ids = []
        metadatas = []
        document_texts = []
        embeddings_list = []


        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Unpack document and embedding, generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)  # Copy existing metadata
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)

            metadatas.append(metadata)

            # document text
            document_texts.append(doc.page_content)
            embeddings_list.append(embedding.tolist())
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=document_texts,
                embeddings=embeddings_list
            )
            print(f"Successfully added {len(documents)} documents to vector store.")
            print(f"Total documents in collections : {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise   

vectorStore=VectorStore()
vectorStore


Vector store initialized with collection: pdf_documents
Exusting number of vectors in collection: 0


<__main__.VectorStore at 0x142efa910>

In [14]:
chunks

[Document(metadata={'producer': 'macOS Version 15.3.1 (Build 24D70) Quartz PDFContext, AppendMode 1.1', 'creator': 'calibre 3.9.0 [https://calibre-ebook.com]', 'creationdate': '2020-10-16T23:12:01+00:00', 'source': '../Data/SystemDesignInterview-v1-alex-xu.pdf', 'file_path': '../Data/SystemDesignInterview-v1-alex-xu.pdf', 'total_pages': 269, 'format': 'PDF 1.4', 'title': "System Design Interview – An insider's guide, Second Edition: Step by Step Guide, Tips and 15 System Design Interview Questions with Detailed Solutions", 'author': 'Alex Xu', 'subject': '', 'keywords': '', 'moddate': "D:20250310164445Z00'00'", 'trapped': '', 'modDate': "D:20250310164445Z00'00'", 'creationDate': "D:20201016231201+00'00'", 'page': 1, 'source_file': 'SystemDesignInterview-v1-alex-xu.pdf', 'file_type': 'pdf'}, page_content='System Design Interview: An Insider’s Guide\nAll rights reserved. This book or any portion thereof may not be reproduced or used in any\nmanner whatsoever without the express written p

In [16]:
### Convert chunks to texts for embedding
texts = [doc.page_content for doc in chunks]

### Generate embeddings

embeddings = embedding_manager.generate_embeddings(texts)

### store in vector store
vectorStore.add_documents(chunks, embeddings)

Adding 450 documents to vector store...
Successfully added 450 documents to vector store.
Total documents in collections : 900


## Retrival Pipeline from vectorStore

In [18]:
class RAGRetriver:
    """ Hanles query based retrieval from the vector stor """

    def __init__(self, vectorStore : VectorStore, embedding_manager : EmbeddingManager):
        """ Initializes the RAGRetriever with a vector store and embedding manager. """

        self.vectorStore = vectorStore
        self.embedding_manager = embedding_manager  

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieves revleant documetns for a query
        Args:
            query (str): The input query string.
            top_k (int): Number of top documents to retrieve.
            score_threshold (float): Minimum similarity score threshold.

        Returns: 
            List of retrieved documents with metadata and similarity scores. 
        """

        print(f"Retrieving documents for query: {query}")
        print(f"Top k: {top_k}, Score threshold: {score_threshold} ")

        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0] 

        # Perform similarity search in vector store

        try:
            results = self.vectorStore.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
                include=['metadatas', 'documents', 'distances']
            )
            # process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity_score = 1 - distance  # Convert distance to similarity score
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents after applying score threshold.")
            else:
                print("No documents retrieved.")
                return retrieved_docs
        
        except Exception as e:
            print(f"Error during retrieval: {e}")
            raise

rag_retriever = RAGRetriver(vectorStore, embedding_manager)

In [19]:
rag_retriever

<__main__.RAGRetriver at 0x142f16150>

In [21]:
rag_retriever.retrieve("url shortner")

Retrieving documents for query: url shortner
Top k: 5, Score threshold: 0.0 
Retrieved 5 documents after applying score threshold.
