## Data Ingestion

In [1]:
### document Structure

from langchain_core.documents import Document

In [2]:
doc = Document(
    page_content="This is the main text content I am using to create RAG ",
    metadata={
        "source": "example.txt",
        "author": "Jitesh",
        "page": 1,
        "date_created": "2025-01-01",
    }
)
doc

Document(metadata={'source': 'example.txt', 'author': 'Jitesh', 'page': 1, 'date_created': '2025-01-01'}, page_content='This is the main text content I am using to create RAG ')

In [3]:
## create a simple text document
import os
os.makedirs("../data/text_files", exist_ok=True)

In [4]:
sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


In [5]:
## Textloader
from langchain.document_loaders import TextLoader

from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/python_intro.txt", encoding="utf-8")
document=loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [6]:
## directory loader
from langchain.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    "../data/text_files", 
    glob="**/*.txt",
    loader_cls=TextLoader, 
    loader_kwargs={"encoding": "utf-8"},
    show_progress=False
)
documents=dir_loader.load()
documents

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.'),
 Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervise

In [23]:


from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "../data/pdf_files",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False

)

pdf_documents=dir_loader.load()
pdf_documents



[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-22T13:39:41+00:00', 'source': '../data/pdf_files/Jitesh_resume.pdf', 'file_path': '../data/pdf_files/Jitesh_resume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-22T13:39:41+00:00', 'trapped': '', 'modDate': 'D:20250822133941Z', 'creationDate': 'D:20250822133941Z', 'page': 0}, page_content='Jitesh Kumar\njitesh.me | jitesh.kumar05official@gmail.com | LinkedIn | GitHub | +91 6397983037\nEducation\nAnurag University\nCGPA: 8.74/10\nBachelor of Technology in Information Technology\n2023 – 2027\nKendriya Vidyalaya Picket, Hyderabad\nPercentage: 81%\nCBSE 12th\n2021 – 2023\nProjects\nGo To Buddy – AI-Powered Desktop Assistant | Link\n• Engineered a multi-threaded desktop assistant using Python & PyQt6 to ensure low-latency, real-time\nresponsiveness for voice commands.\n• Arch: PyQt6 UI →Multi-threaded Backen

In [24]:
type(pdf_documents[0])

langchain_core.documents.base.Document

In [25]:
### Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [26]:


chunks=split_documents(pdf_documents)
chunks



Split 1 documents into 3 chunks

Example chunk:
Content: Jitesh Kumar
jitesh.me | jitesh.kumar05official@gmail.com | LinkedIn | GitHub | +91 6397983037
Education
Anurag University
CGPA: 8.74/10
Bachelor of Technology in Information Technology
2023 – 2027
Ke...
Metadata: {'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-22T13:39:41+00:00', 'source': '../data/pdf_files/Jitesh_resume.pdf', 'file_path': '../data/pdf_files/Jitesh_resume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-22T13:39:41+00:00', 'trapped': '', 'modDate': 'D:20250822133941Z', 'creationDate': 'D:20250822133941Z', 'page': 0}


[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-22T13:39:41+00:00', 'source': '../data/pdf_files/Jitesh_resume.pdf', 'file_path': '../data/pdf_files/Jitesh_resume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-22T13:39:41+00:00', 'trapped': '', 'modDate': 'D:20250822133941Z', 'creationDate': 'D:20250822133941Z', 'page': 0}, page_content='Jitesh Kumar\njitesh.me | jitesh.kumar05official@gmail.com | LinkedIn | GitHub | +91 6397983037\nEducation\nAnurag University\nCGPA: 8.74/10\nBachelor of Technology in Information Technology\n2023 – 2027\nKendriya Vidyalaya Picket, Hyderabad\nPercentage: 81%\nCBSE 12th\n2021 – 2023\nProjects\nGo To Buddy – AI-Powered Desktop Assistant | Link\n• Engineered a multi-threaded desktop assistant using Python & PyQt6 to ensure low-latency, real-time\nresponsiveness for voice commands.\n• Arch: PyQt6 UI →Multi-threaded Backen

## Embedding and VectorStoreDB

In [27]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict, Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
class EmbeddingManager:
    """Handles Document Embedding using SentenceTransformers and stores in ChromaDB"""
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initializes the EmbeddingManager.
        Args:
            model_name: Hugging Face model name for sentence embeddings.
        """
        self.model_name = model_name
        self.model=None
        self._load_model()

    def _load_model(self):
        """Loads the SentenceTransformer model."""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generates embeddings for a list of texts.
        Args:
            texts: List of text strings to embed.
        Returns:
            Numpy array of embeddings.
        """
        if not self.model:
            raise ValueError("Model not loaded.")
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
    

## initialize Embedding Manager
embedding_manager=EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. embedding dimension: 384


<__main__.EmbeddingManager at 0x7f8e2015c110>

## VectorStoreDB

In [29]:
class VectorStoreDB:
    """Manages a vector store using ChromaDB."""
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store/"):
        """
        Initializes the VectorStoreDB.
        Args:
            collection_name: Name of the ChromaDB collection.
            persist_directory: Directory to persist the vector store.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client=None
        self.collection=None
        self._initialize_db()
    def _initialize_db(self):
        """Initializes the ChromaDB client and collection."""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "pdf document embeddings for RAG"}  # assuming embedding dim is 384
                )
            print(f"VectorStoreDB initialized with collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing VectorStoreDB: {e}")
            raise

    def add_documents(self, documents: List[Document], embeddings: np.ndarray):
        """
        Adds documents and their embeddings to the ChromaDB collection.
        Args:
            documents: List of Document objects to add.
            embeddings: Numpy array of embeddings corresponding to the documents.
        """
        if(len(documents) != embeddings.shape[0]):
            raise ValueError("Number of documents and embeddings must match.")
        
        print(f"Adding {len(documents)} documents to the vector store...")

        #prepare data for insertion
        ids = []
        metadatas = []
        document_texts = []
        embedding_list = []

        for i, (doc, emb) in enumerate(zip(documents, embeddings)):
            doc_id = str(uuid.uuid4())
            ids.append(doc_id)
            metadatas.append(doc.metadata)
            document_texts.append(doc.page_content)
            embedding_list.append(emb.tolist())

        self.collection.add(
            ids=ids,
            metadatas=metadatas,
            documents=document_texts,
            embeddings=embedding_list
        )
        print(f"Documents added successfully. Total documents in collection: {self.collection.count()}")
    def similarity_search(self, query: str, top_k: int = 5) -> List[Tuple[Document, float]]:
        """
        Performs a similarity search in the vector store.
        Args:
            query: The query string to search for.
            top_k: The number of top similar documents to return.
        Returns:
            A list of tuples containing the matching documents and their similarity scores.
        """
        query_embedding = self._embed_query(query)
        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k
        )
        return [(Document(page_content=res.document, metadata=res.metadata), res.score) for res in results]
    def _embed_query(self, query: str) -> List[float]:
        """Embeds the query string using the embedding model."""
        if not embedding_manager.model:
            raise ValueError("Embedding model not loaded.")
        return embedding_manager.model.encode([query], convert_to_numpy=True)[0].tolist()

In [30]:
vectorstore=VectorStoreDB()
vectorstore

VectorStoreDB initialized with collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStoreDB at 0x7f8dd69cb050>

In [31]:
chunks

[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-22T13:39:41+00:00', 'source': '../data/pdf_files/Jitesh_resume.pdf', 'file_path': '../data/pdf_files/Jitesh_resume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-22T13:39:41+00:00', 'trapped': '', 'modDate': 'D:20250822133941Z', 'creationDate': 'D:20250822133941Z', 'page': 0}, page_content='Jitesh Kumar\njitesh.me | jitesh.kumar05official@gmail.com | LinkedIn | GitHub | +91 6397983037\nEducation\nAnurag University\nCGPA: 8.74/10\nBachelor of Technology in Information Technology\n2023 – 2027\nKendriya Vidyalaya Picket, Hyderabad\nPercentage: 81%\nCBSE 12th\n2021 – 2023\nProjects\nGo To Buddy – AI-Powered Desktop Assistant | Link\n• Engineered a multi-threaded desktop assistant using Python & PyQt6 to ensure low-latency, real-time\nresponsiveness for voice commands.\n• Arch: PyQt6 UI →Multi-threaded Backen

In [33]:
## convert the text to embeddings

texts = [doc.page_content for doc in chunks]

In [34]:
texts

['Jitesh Kumar\njitesh.me | jitesh.kumar05official@gmail.com | LinkedIn | GitHub | +91 6397983037\nEducation\nAnurag University\nCGPA: 8.74/10\nBachelor of Technology in Information Technology\n2023 – 2027\nKendriya Vidyalaya Picket, Hyderabad\nPercentage: 81%\nCBSE 12th\n2021 – 2023\nProjects\nGo To Buddy – AI-Powered Desktop Assistant | Link\n• Engineered a multi-threaded desktop assistant using Python & PyQt6 to ensure low-latency, real-time\nresponsiveness for voice commands.\n• Arch: PyQt6 UI →Multi-threaded Backend →GenAI (GitHub Models) & TTS (Murf AI) APIs.\nStudent Performance Prediction Web App | Link\n• Developed a full-stack ML service using Python & Flask with a scikit-learn model achieving 88%+ prediction\naccuracy.\n• Ops: Containerized the application with Docker and deployed a scalable MLOps pipeline on AWS and Azure.\nGeeksforGeeks AUSC Chapter Website | Link\n• Launched a full-stack portal serving 4000+ students using JavaScript and Firebase (Authentication & Firesto

In [35]:
## generate embeddings
embeddings=embedding_manager.generate_embeddings(texts)

## store in the vector store
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 3 texts...
Generated embeddings with shape: (3, 384)
Adding 3 documents to the vector store...
Documents added successfully. Total documents in collection: 3


## RAG retrieval pipeline and vector store

In [None]:
class RAGRetriever:
    """Handles Query based retrieval from the vector store"""
    def __init__(self, vectorStore: VectorStoreDB, embedding_manager: EmbeddingManager):
        """
        Initializes the RAGRetriever.
        Args:
            vector_store: An instance of VectorStoreDB to perform retrieval from.
            embedding_manager: An instance of EmbeddingManager to handle query embeddings.
        """
        self.vector_store = vectorStore
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieves the most relevant documents for a given query.
        Args:
            query: The query string to search for.
            top_k: The number of top similar documents to return.
            score_threshold: Minimum similarity score to consider a document relevant.
        Returns:
            A list of dictionaries containing the matching documents and their similarity scores.
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Using top_k={top_k} and score_threshold={score_threshold}")
        query_embedding = self.embedding_manager.embed_query(query)
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            retrieved_docs = []
            if results['documents'] and results['documents'][0]:
                for doc, score, metadata in zip(results['documents'][0], results['distances'][0], results['metadatas'][0]):
                    if score >= score_threshold:
                        retrieved_docs.append({
                            "document": doc,
                            "score": score,
                            "metadata": metadata
                        })
            print(f"Retrieved {len(retrieved_docs)} documents above the score threshold.")
            return retrieved_docs
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
retriever=RAGRetriever(vectorstore,embedding_manager)
retriever.retrieve("What is Python programming?", top_k=3, score_threshold=0.1)


AttributeError: 'str' object has no attribute 'document'