Data Ingestion

In [2]:
## Document Structure

from langchain_core.documents import Document

In [3]:
doc = Document(
    page_content="This is the content of the document.",
    metadata = {
        "source": "example.txt",
        "pages": 1,
        "author": "Haritha Nagamalla",
        "date_created": "2026-01-20"
    }
)

In [4]:
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Haritha Nagamalla', 'date_created': '2026-01-20'}, page_content='This is the content of the document.')

In [5]:
## create a simple txt file
import os
os.makedirs('data/text_files', exist_ok=True)

In [7]:
sample_texts = {
    "../data/text_files/machine_learning.txt": """Machine learning is a subset of artificial intelligence that focuses on building systems that learn from data.

    Types of Machine Learning:
    1. Supervised learning with labaled data.
    2. Unsupervised learning with unlabeled data.
    3. Reinforcement learning through rewards and penalties.
    """

}

for file_path, content in sample_texts.items():
    with open(file_path, 'w', encoding="utf-8") as f:
        f.write(content)

In [9]:
### TextLoader
# from langchain.document_loaders import TextLoader

from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/machine_learning.txt", encoding="utf-8")
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content='Machine learning is a subset of artificial intelligence that focuses on building systems that learn from data.\n\n    Types of Machine Learning:\n    1. Supervised learning with labaled data.\n    2. Unsupervised learning with unlabeled data.\n    3. Reinforcement learning through rewards and penalties.\n    ')]


In [11]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    "../data/text_files",
    glob="*.txt", 
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=False)

documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine learning is a subset of artificial intelligence that focuses on building systems that learn from data.\n\n    Types of Machine Learning:\n    1. Supervised learning with labaled data.\n    2. Unsupervised learning with unlabeled data.\n    3. Reinforcement learning through rewards and penalties.\n    '),
 Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='This is the content of document one. It contains information about Python programming\n    Python programming is a versatile language used for various applications including web development, data analysis, and artificial intelligence.\n    Key features:\n    - Easy to learn and read\n    - Extensive libraries and frameworks\n    - Strong community support\n    ')]

In [15]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

dir_loader = DirectoryLoader(
    "../data/pdf",
    glob = "**/*.pdf",
    loader_cls=PyMuPDFLoader,
    # loader_kwargs={'encoding': 'utf-8'},
    show_progress=False
)

pdf_documents = dir_loader.load()
pdf_documents



[Document(metadata={'producer': 'LibreOffice 7.6', 'creator': 'Writer', 'creationdate': '2025-12-29T17:09:33+05:30', 'source': '..\\data\\pdf\\HarithaSreeNagamalla_1020_EmploymentRelatedDocument_29-12-25.pdf', 'file_path': '..\\data\\pdf\\HarithaSreeNagamalla_1020_EmploymentRelatedDocument_29-12-25.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Atyeti', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': "D:20251229170933+05'30'", 'page': 0}, page_content='Date: 29-12-2025\nRE-DESIGNATION LETTER - ATYETI\nDear Haritha\xa0Nagamalla,\nWe are pleased to inform you that you have been re-designated as Associate with effect from 29-12-2025.\nAll the other terms and conditions remain unchanged as stipulated in employee handbook uploaded in HRMS \ntool.\nAtyeti congratulates you on your Re-designation and the organization wishes you a long-term career with us.\nThank you. \nRegards,\nMahathi Busireddy\nHead of India & APAC\nAtyeti I

In [16]:
type(pdf_documents[0])

langchain_core.documents.base.Document

Embedding and VectorStoreDB

In [19]:
## chunking the documents
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Initialize the embedding manager.

        Args:
            model_name: Huggingface model name for sentence embeddings
        """

        self.model_name = model_name
        self.model = None
        self._load_model()
    
    def _load_model(self):
        """Load the SentenceTransformer model."""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: List of strings to embed
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """

        if not self.model:
            raise ValueError("Model not Loaded")
        
        print(f"Generting embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar = True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
 
## Initialize the embedding manager

embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x244f1445fd0>

In [22]:
### Vector Store
class VectorStore:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """Initialize the vector store.

        Args:
            collection_name: Name of the ChromaDB collection.
            persist_directory: Directory to persist the vector store
        """

        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self.initialize_store()

    def initialize_store(self):
        """Initialize the ChromaDB client and collection."""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(name=self.collection_name,
                                        metadata={"description": "PDF document embeddings for RAG"})
            print(f"Vector store initalized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing ChromaDB client or collection: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the vector store.

        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        print(f"Adding {len(documents)} documnets to vector store")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document Content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=documents_text,
                embeddings=embeddings_list
            )
            print(f"Successfully added {len(documents)} documents to vector store.")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise
vector_store = VectorStore()
vector_store
    
    

Vector store initalized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x244f1962180>

In [23]:
chunks

NameError: name 'chunks' is not defined