In [2]:
from langchain.schema import Document

In [3]:
type(Document)

pydantic._internal._model_construction.ModelMetaclass

In [5]:
Document

langchain_core.documents.base.Document

## Types of Loader

###### PDFLoader
###### CSVLoader
###### WebBaseLoader
###### DirectoryLoader

In [8]:
from langchain_core.documents import Document # same as above import

In [18]:
doc = Document(
    page_content = "This is a txt file data", 
    metadata={
        "source":"example.txt",
        "pages": "5",
        "author":"Gaurav"
    }
)

In [19]:
doc

Document(metadata={'source': 'example.txt', 'pages': '5', 'author': 'Gaurav'}, page_content='This is a txt file data')

In [44]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("/Users/gdange/Documents/Personal/Gaurav_Dange_Resume.pdf")
docs = loader.load()

In [55]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,        # characters per chunk
    chunk_overlap=200,      # overlap to preserve context
    separators=["\n\n", "\n", " ", ""]
)

pdf_chunks = text_splitter.split_documents(pdf_documents)

print(f"Original documents: {len(pdf_documents)}")
print(f"Chunks created: {len(pdf_chunks)}")




Original documents: 1
Chunks created: 4


'r, Kubernetes, CI/CD, Jenkins, AWS, Git (BitBucket)\nArchitecture & Tools: Microservices, ETL, Swagger, Postman, Jira, Kibana\nAI Tools: LLM APIs, RAG, MCP\nWORK EXPERIENCE\nProduct Solutions Expert - II May 2025 - Present\nEightfold - AI Bangalore, India\n• Serve as the single point of contact and primary Solution Expert for enterprise clients with a combined ACV of\n$5M, leading end-to-end integrations and owning post–go-live technical support and escalations.'

AttributeError: 'list' object has no attribute 'page_content'

In [27]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import numpy as np
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
import os



In [17]:
class EmbeddingManager:

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            print(f"Loading Embedding model {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded Successfully. Embedding Dimension is {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error Loading Embedding model {self.model_name} with {e}")
            raise

    def generate_embeddings(self, texts=List[str]) -> np.ndarray:
        """
        Generate embedding for list of texts
        """

        if not self.model:
            raise ValueError("Model not loaded")

        print(f"Generating embedding for the texts")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings for the texts with shape {embeddings.shape}")

        return embeddings

In [18]:
embeddings_manager = EmbeddingManager()

Loading Embedding model all-MiniLM-L6-v2
Model Loaded Successfully. Embedding Dimension is  384


In [19]:
embeddings_manager.generate_embeddings(docs

<__main__.EmbeddingManager at 0x35cd01d60>

# VectorStore

In [78]:
class VectorStore:

    def __init__(self, collection_name:str = "pdf_documents", persist_directory : str = "/Users/gdange/Documents/Personal/VectorStore"):

        self.collection_name=collection_name
        self.persist_directory=persist_directory
        self.client= None
        self.collection=None
        self._initialize_store()

    def _initialize_store(self):

        """
        Initialize chromadb client and collection
        """

        os.makedirs(self.persist_directory, exist_ok=True)
        self.client = chromadb.PersistentClient(path = self.persist_directory)
        
        self.collection = self.client.get_or_create_collection(
            name = self.collection_name,
            metadata = {
                "description": "PDF documents embedding for the RAG"
            }
        )

        print(f"Vector Store intialized Collection {self.collection_name}")
        print(f"Existing documents in collection {self.collection.count()}")

    def add_documents(self, documents : List[Any], embeddings : np.ndarray):

        if len(documents)!=len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        print(f"Adding {len(documents)} documents to vector store...")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embeddings) in enumerate(zip(documents, embeddings)):

            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            metadata = dict(doc.metadata)
            metadata['doc_index']=i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

            embeddings_list.append(embeddings.tolist())
        
        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=documents_text,
                embeddings=embeddings_list
            )

            print(f"Sucessfully added {len(documents)} to collection")
        except Exception as e:
            raise
            
            
            

In [79]:
vector_store = VectorStore()

Vector Store intialized Collection pdf_documents
Existing documents in collection 0


In [71]:
texts = [doc.page_content for doc in pdf_chunks]

In [62]:
embeddings  = embeddings_manager.generate_embeddings(texts)

Generating embedding for the texts


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings for the texts with shape (4, 384)


In [63]:
embeddings

array([[-0.11325298, -0.02346559, -0.03093507, ..., -0.00713788,
        -0.03728819,  0.02549302],
       [-0.06875946,  0.00552643,  0.02087624, ..., -0.05638217,
        -0.0034661 ,  0.04931682],
       [-0.05549109,  0.01490006,  0.04847825, ...,  0.00833948,
        -0.00590183,  0.02723115],
       [-0.07883993,  0.01037853, -0.07615634, ..., -0.00950061,
        -0.01788303,  0.06012804]], dtype=float32)

In [80]:
vector_store.add_documents(pdf_chunks, embeddings)

Adding 4 documents to vector store...
Sucessfully added 4 to collection


In [81]:
vector_store

<__main__.VectorStore at 0x35ec025b0>