# Python for Dummies RAG

## Chunking

In [4]:
def chunk_by_paragraphs(text, min_chunk_size):
    """
    Split text into chunks along paragraph boundaries (double newlines).

    Paragraphs are stripped and blank ones are dropped. Consecutive
    paragraphs are joined with a blank line until the chunk holds at least
    ``min_chunk_size`` characters. A new chunk is only started by a
    paragraph that is itself at least ``min_chunk_size`` long, so short
    paragraphs that follow a full chunk are appended to it rather than
    left on their own.

    Args:
        text: The text to chunk. ``None`` or an empty string yields ``[]``.
        min_chunk_size: Minimum characters per chunk (combine small paragraphs)

    Returns:
        List of text chunks. Every chunk is at least ``min_chunk_size``
        characters long unless the whole text is shorter than that, in
        which case a single undersized chunk is returned.
    """
    if not text:
        return []

    chunks = []
    current_parts = []   # paragraphs collected for the chunk being built
    current_len = 0      # length of "\n\n".join(current_parts)

    for para in text.split('\n\n'):
        para = para.strip()
        if not para:
            continue

        # Only close the current chunk when it already meets the minimum AND
        # the incoming paragraph can stand on its own. Otherwise the
        # paragraph is merged in, so a run of short paragraphs ahead of a
        # long one is combined with it instead of being emitted undersized.
        if (current_parts
                and current_len >= min_chunk_size
                and len(para) >= min_chunk_size):
            chunks.append("\n\n".join(current_parts))
            current_parts = []
            current_len = 0

        if current_parts:
            current_len += 2  # the "\n\n" separator added by the join
        current_parts.append(para)
        current_len += len(para)

    # Don't forget the last chunk
    if current_parts:
        chunks.append("\n\n".join(current_parts))

    return chunks

# # Test it
# chunks = chunk_by_paragraphs(sample_document, min_chunk_size=100)

# print(f"Number of chunks: {len(chunks)}\n")
# for i, chunk in enumerate(chunks, 1):
#     print(f"Chunk {i} ({len(chunk)} chars):")
#     print(chunk)
#     print("-" * 80)

## Loading and chunking

In [None]:
def load_and_chunk_pdf(file_path, chunk_size=400):
    """
    Load a PDF file and chunk its text page by page.

    Each page's extracted text is chunked independently with
    ``chunk_by_paragraphs``, so a chunk never spans two pages.

    Args:
        file_path: Path to the PDF file
        chunk_size: Minimum characters per chunk; passed to
            ``chunk_by_paragraphs`` as ``min_chunk_size``

    Returns:
        List of chunk strings in page order. No metadata (file name,
        page number) is attached to the chunks.
    """
    import PyPDF2

    all_chunks = []

    # Keep the file open while reading: the reader pulls page data from the
    # file handle when each page is accessed.
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)

        for page in pdf_reader.pages:
            # Fall back to "" so a page with no extractable text (e.g. a
            # scanned image) simply contributes no chunks.
            text = page.extract_text() or ""

            # chunk_by_paragraphs already returns stripped, non-empty chunks.
            all_chunks.extend(
                chunk_by_paragraphs(text, min_chunk_size=chunk_size)
            )

    return all_chunks

# Example (you would use this with a real PDF file).
# One print call; sep="\n" puts each argument on its own line, and the
# leading "\n" in the second argument produces the blank line before "Usage:".
print(
    "PDF loading function ready!",
    "\nUsage:",
    "chunks = load_and_chunk_pdf('your_document.pdf', chunk_size=500)",
    sep="\n",
)



PDF loading function ready!

Usage:
chunks = load_and_chunk_pdf('your_document.pdf', chunk_size=500)


## Embedding using ChromaDB

In [6]:
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer

# Embedding function built on the "all-MiniLM-L6-v2" sentence-transformers
# model; it is handed to the collection below.
sentence_transform_embed = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

# Persistent client rooted at ./chroma_db
client = chromadb.PersistentClient(path="./chroma_db")

# Reuse the "custom_embeddings" collection if it already exists, otherwise
# create it with the custom embedding function.
collection_custom = client.get_or_create_collection(
    embedding_function=sentence_transform_embed,
    name="custom_embeddings",
)




In [7]:
# Adding documents
file_path = r"C:\Users\ncc333\Desktop\Deep_Learning\RAG\Python For Dummies.pdf"

documents = load_and_chunk_pdf(file_path)

# IDs are positional ("chunk_0", "chunk_1", ...), so re-running this cell
# produces the same IDs for the same PDF.
ids = [f"chunk_{i}" for i in range(len(documents))]

# upsert instead of add: the collection lives in a persistent store, so on a
# re-run these IDs are already present. upsert overwrites them with the
# current chunk text instead of resubmitting duplicate IDs.
collection_custom.upsert(documents=documents, ids=ids)

print("✅ Collection with custom embeddings created")
print(f"Documents: {collection_custom.count()}")

✅ Collection with custom embeddings created
Documents: 398


## Querying

In [None]:
# Ask for the 5 stored chunks closest to the query text.
results = collection_custom.query(
    query_texts=["what is python programming?"],
    n_results=5,
)

# results['documents'] holds one list per query text; a single query was
# sent, so index 0 is its list of matches. Only the document text is
# printed, so the metadatas (none were stored) and distances are not read.
for doc in results['documents'][0]:
    print(f"Document: \n\n{doc}")

Document: 

89  Chapter 5: Storing and Modifying Information
Understanding Boolean values
It may seem amazing, but computers always give you a straight answer! A 
computer will never provide “maybe” as output. Every answer you get is either 
True or False. In fact, there is an entire branch of mathematics called Boolean 
algebra that was originally defined by George Boole (a super-geek of his time) 
that computers rely upon to make decisions. Contrary to common belief, 
Boolean algebra has existed since 1854 — long before the time of computers.Understanding the need for multiple number types
A lot of new developers (and even some older 
ones) have a hard time understanding why 
there is a need for more than one numeric 
type. After all, humans can use just one kind 
of number. To understand the need for multiple 
number types, you have to understand a little 
about how a computer works with numbers.
An integer is stored in the computer as simply a 
series of bits that the computer read