### Document structure

In [2]:
from langchain_core.documents import Document

In [3]:
doc = Document(
    page_content = "This is a sample text content",
    metadata = {
        "source":"examples",
        "pages" : 1,
        "author" : "ABC"
    } # metadata is useful for applying them as filters
)

In [4]:
doc

Document(metadata={'source': 'examples', 'pages': 1, 'author': 'ABC'}, page_content='This is a sample text content')

In [5]:
## Create a simple txt file 
import os 
os.makedirs("data/text_files", exist_ok=True)

In [6]:
sample_texts={
    "data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

for filepath,content in sample_texts.items():
    print(filepath)
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("Sample text files created!")

data/text_files/python_intro.txt
data/text_files/machine_learning.txt
Sample text files created!


In [7]:
## TextLoader
from langchain.document_loaders.text import TextLoader

loader = TextLoader("data/text_files/python_intro.txt")
documents = loader.load()
print(f"Number of documents: {len(documents)}")
print(documents[0].page_content)

Number of documents: 1
Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.


In [9]:
## DirectoryLoader
from langchain_core.document_loaders import DirectoryLoader
dir_loader = DirectoryLoader("data/text_files", 
                             glob="*.txt", 
                             loader_cls=TextLoader,
                             loader_kwargs={"encoding":"utf-8"},
                             show_progress=False # set to True to see progress bar (requires tqdm package)
                             )

documents = dir_loader.load()
documents 

ImportError: cannot import name 'DirectoryLoader' from 'langchain_core.document_loaders' (/Users/kiranm/Desktop/MySpace/rag/rag_project/rag_env/lib/python3.9/site-packages/langchain_core/document_loaders/__init__.py)

In [12]:
# ## PDF loader
# from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
# from pathlib import Path

# dir_loader = DirectoryLoader("data/pdf_files",
#                                 glob="*.pdf", 
#                                 loader_cls=PyMuPDFLoader,
#                                 show_progress=False # set to True to see progress bar (requires tqdm package)
#                                 )

# pdf_documents = dir_loader.load()
# print(f"Number of PDF documents: {len(pdf_documents)}")
# # pdf_documents


In [10]:
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
pdf_documents = process_all_pdfs("data/pdf_files")

Found 2 PDF files to process

Processing: vLLM_paper.pdf
  ✓ Loaded 16 pages

Processing: HumanEval_on_LLMs.pdf
  ✓ Loaded 6 pages

Total documents loaded: 22


In [13]:
len(pdf_documents[3].page_content)

5428

In [14]:
len(pdf_documents[0].page_content)

4679

## Chunk the pdf documents

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Example: pdf_documents is already loaded like this:
# pdf_documents = [Document(page_content="..."), Document(page_content="...")]

# Initialize a text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # number of characters per chunk
    chunk_overlap=200,    # overlap between chunks
    length_function=len,  # function to measure length
    separators=["\n\n", "\n", " ", ""]  # order of splitting preference
)

# Split into chunks
docs_chunks = text_splitter.split_documents(pdf_documents)

In [16]:
print(f"Original docs: {len(pdf_documents)}")
print(f"Chunked docs: {len(docs_chunks)}")
print(docs_chunks[0]) 

Original docs: 22
Chunked docs: 144
page_content='Efficient Memory Management for Large Language
Model Serving with PagedAttention
Woosuk Kwon1,∗ Zhuohan Li1,∗ Siyuan Zhuang1 Ying Sheng1,2 Lianmin Zheng1 Cody Hao Yu3
Joseph E. Gonzalez1 Hao Zhang4 Ion Stoica1
1UC Berkeley 2Stanford University 3Independent Researcher 4UC San Diego
Abstract
High throughput serving of large language models (LLMs)
requires batching sufficiently many requests at a time. How-
ever, existing systems struggle because the key-value cache
(KV cache) memory for each request is huge and grows
and shrinks dynamically. When managed inefficiently, this
memory can be significantly wasted by fragmentation and
redundant duplication, limiting the batch size. To address
this problem, we propose PagedAttention, an attention al-
gorithm inspired by the classical virtual memory and pag-
ing techniques in operating systems. On top of it, we build
vLLM, an LLM serving system that achieves (1) near-zero
waste in KV cache memory

### Embedding and VectorDB

In [17]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
class EmbeddingManager : 
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully. Embdedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise e
        
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded. Call _load_model() first.")
        embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
        return embeddings
    
    
    ## Initialise the embedding manager
embedding_manager = EmbeddingManager(model_name="all-MiniLM-L6-v2")
embedding_manager
        

Model all-MiniLM-L6-v2 loaded successfully. Embdedding dimension: 384


<__main__.EmbeddingManager at 0x31e977310>

### VectorStore

In [19]:
class VectorStore:
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
        
    def _initialize_store(self):
        
        try:   
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF Document Embdeddings for RAG"})
            
            print(f"Vector store initialized at {self.persist_directory} with collection {self.collection_name}")
            print(f"Existing documents in collectio : {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise e
        
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        
        if len(documents) != embeddings.shape[0]:
            raise ValueError("Number of documents and embeddings must match.")
        
        print(f"Adding {len(documents)} documents to the vector store...")
        
        # prepare data for chromadb
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i,(doc,embedding) in enumerate(zip(documents,embeddings)):
            # generate unique id
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # prepare metadata
            metadata = dict(doc.metadata) if doc.metadata else {}
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)
            
            # document content
            documents_text.append(doc.page_content)
            
            # embedding
            embeddings_list.append(embedding.tolist())
            
        # add to collection
        try :
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=documents_text,
                embeddings=embeddings_list
            )
            print(f"Documents added successfully. Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise e    
        
        
vectorstore = VectorStore()
vectorstore

Vector store initialized at data/vector_store with collection pdf_documents
Existing documents in collectio : 144


<__main__.VectorStore at 0x31f6db880>

In [20]:
[len(pdf_documents[i].page_content) for i in range(len(pdf_documents))] 

[4679,
 5337,
 5572,
 5428,
 4944,
 5615,
 5690,
 5873,
 5176,
 4197,
 4109,
 4537,
 5463,
 6904,
 7621,
 788,
 5234,
 5099,
 5163,
 4841,
 6437,
 1981]

In [21]:
## convert the text to embeddings
texts = [doc.page_content for doc in docs_chunks]
texts[:10]

['Efficient Memory Management for Large Language\nModel Serving with PagedAttention\nWoosuk Kwon1,∗ Zhuohan Li1,∗ Siyuan Zhuang1 Ying Sheng1,2 Lianmin Zheng1 Cody Hao Yu3\nJoseph E. Gonzalez1 Hao Zhang4 Ion Stoica1\n1UC Berkeley 2Stanford University 3Independent Researcher 4UC San Diego\nAbstract\nHigh throughput serving of large language models (LLMs)\nrequires batching sufficiently many requests at a time. How-\never, existing systems struggle because the key-value cache\n(KV cache) memory for each request is huge and grows\nand shrinks dynamically. When managed inefficiently, this\nmemory can be significantly wasted by fragmentation and\nredundant duplication, limiting the batch size. To address\nthis problem, we propose PagedAttention, an attention al-\ngorithm inspired by the classical virtual memory and pag-\ning techniques in operating systems. On top of it, we build\nvLLM, an LLM serving system that achieves (1) near-zero\nwaste in KV cache memory and (2) flexible sharing of KV

In [22]:
## generate embeddings
embeddings = embedding_manager.generate_embeddings(texts)
embeddings.shape

## strore in vectorstore
vectorstore.add_documents(docs_chunks, embeddings)

Batches: 100%|██████████| 5/5 [00:01<00:00,  4.02it/s]

Adding 144 documents to the vector store...
Documents added successfully. Total documents in collection: 288





### Rag Retrieval from Vector Store

In [23]:
class RAGRetriever:
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
        
        
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str,Any]]:
        # generate embedding for the query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # perform similarity search in vector store 
        try : 
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
        
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity_score = 1 - distance  # convert distance to similarity score
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "document": document,
                            "metadata": metadata,
                            "similarity_score": similarity_score,
                            'distance': distance,
                            'rank': i+1
                        })
                        
                print(f"Retrieved {len(retrieved_docs)} documents for the query: '{query}'")
            else:
                print(f"No documents retrieved for the query: '{query}'")
                
            return retrieved_docs
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
            
rag_retriever = RAGRetriever(vectorstore, embedding_manager)

In [24]:
rag_retriever

<__main__.RAGRetriever at 0x31e9e8a00>

In [32]:
query = "What is memory management in Large Language Models?"
retrieved_li = rag_retriever.retrieve(query)
len(retrieved_li)

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]

Retrieved 5 documents for the query: 'What is memory management in Large Language Models?'





5

In [36]:
retrieved_li[4]['document']

'the decoding process advances.\nScheduling for unknown input & output lengths. The\nrequests to an LLM service exhibit variability in their input\nand output lengths. This requires the memory management\nsystem to accommodate a wide range of prompt lengths. In\naddition, as the output length of a request grows at decoding,\nthe memory required for its KV cache also expands and may\nexhaust available memory for incoming requests or ongoing\ngeneration for existing prompts. The system needs to make\nscheduling decisions, such as deleting or swapping out the\nKV cache of some requests from GPU memory.\n3.1 Memory Management in Existing Systems\nSince most operators in current deep learning frameworks\n[33, 39] require tensors to be stored in contiguous memory,\nprevious LLM serving systems [ 31, 60] also store the KV\ncache of one request as a contiguous tensor across the differ-\nent positions. Due to the unpredictable output lengths from\nthe LLM, they statically allocate a chunk of me

### Augmented Generation

In [None]:
### Simple RAG Pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize Groq LLM
groq_api_key = os.getenv("GROQ_API_KEY")
llm = ChatGroq(api_key=groq_api_key, model="gemma2-9b-it",temperature=0.1, max_tokens=1024)

### Simple RAG function : retrieve + generate
def rag_simple(query, retriever, llm, top_k=3):
    # retrieve relevant documents
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['document'] for doc in results]) if results else "No relevant documents found."
    if not context: 
        return "No relevant documents found."
    
    prompt = f"""Use the following context to answer the question conscisely.
    Context: 
    {context}

    Question: {query}

    Answer:""".strip()
    
    response = llm.invoke([prompt.format(context=context, query=query)])
    return response.content

In [28]:
answer = rag_simple("Explain memory management in LLMs", rag_retriever, llm, top_k=3)

NameError: name 'rag_simple' is not defined

### Enhanced RAG pipeline

In [None]:
def rag_advanced(query, retriever, llm, top_k=5, min_score = 0.2, return_context = False):
    """
    RAG pipeline with extra features.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return "No relevant documents found."   
    
    context = "\n\n".join([doc['document'] for doc in results])
    
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page' : doc['metadata'].get('page', doc['metadata'].get('pages', 'unknown')),
        'score' : doc['similarity_score'],
        'preview' : doc['document'][:300] + "..." if len(doc['document']) > 300 else doc['document']
    } for doc in results]
    
    confidence = max(doc['similarity_score'] for doc in results)
    
    prompt = f"""Use the following context to answer the question conscisely.
    Context: 
    {context}
    Question: {query}
    Answer:""".strip()
    
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        "answer": response.content,
        "confidence": confidence,
        "sources": sources
    }
    if return_context:
        output['context'] = context
        
    return output

    

In [27]:
result = rag_advanced("Explain memory management in Large Language Models?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Confidence:", result['confidence'])
print("Sources:", result['sources'])
print("Context Preview:", result['context'][:300])  

NameError: name 'rag_advanced' is not defined