<a href="https://colab.research.google.com/github/Houstonsboy/Retreival-Augmented-Generation-AI/blob/master/RAG1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import fitz  # PyMuPDF to read PDFs
import os
from google.colab import drive
import re
import tiktoken
import time
import numpy as np
from datetime import datetime
from typing import List, Dict
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from langdetect import detect, LangDetectException

In [38]:
# !pip install PyMuPDF


In [39]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [40]:

try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except ValueError as e:
    print(f"Error mounting Google Drive: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.


In [41]:
import os

drive_path = '/content/drive/My Drive'
if os.path.exists(drive_path):
  files = os.listdir(drive_path)
  print(files)
else:
  print(f"The directory {drive_path} does not exist. Please ensure Google Drive is mounted correctly.")

['IMG-20230723-WA0010(1).jpg', 'Classroom', 'Node.java', 'Bubblesort (1).java', 'Bubblesort.java', 'EGYPT CIVILIZATION (1).rtf.gdoc', 'EGYPT CIVILIZATION.rtf.gdoc', 'netmesh', '167998-NandAstableMultivibrator', '3 - Adv . DB - Transaction Management-1.gdoc', 'Untitled0.ipynb', 'Colab Notebooks', 'melb_data.csv', 'melbv1.csv', 'Managing User Password.gdoc', 'Untitled document (2).gdoc', 'KRISTEIN GICHUHI MWAURA.gdoc', 'Managing file system permission.gdoc', '167998-SALabPART 2.gdoc', 'Controlling access to files.gdoc', 'LAB2 -SECRET ENCRYPTION LAB.gdoc', 'Untitled document (1).gdoc', 'Untitled document.gdoc', 'Bobs_superheroes.gdoc', 'Bobs_superheroes.pdf']


In [42]:
def chunk_document(text: str, min_tokens=500, max_tokens=800, overlap_percent=20, title="Document"):
    """
    Chunk document text into semantic, token-based chunks.

    Args:
        text: Extracted text from PDF
        min_tokens: Minimum tokens per chunk (default: 500)
        max_tokens: Maximum tokens per chunk (default: 800)
        overlap_percent: Overlap percentage between chunks (default: 20)
        title: Document title for metadata (default: "Document")

    Returns:
        List of chunk dictionaries with metadata
    """
    # Initialize tokenizer
    try:
        encoding = tiktoken.get_encoding("cl100k_base")
    except:
        import subprocess
        subprocess.check_call(['pip', 'install', '-q', 'tiktoken'])
        import tiktoken
        encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(txt):
        return len(encoding.encode(txt))

    def split_sentences(txt):
        """Split text into sentences preserving meaning."""
        txt = re.sub(r'(\w)\.(\s+[A-Z])', r'\1.<SPLIT>\2', txt)
        txt = re.sub(r'(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|Inc|Ltd|Co)\.', r'\1<DOT>', txt)
        sentences = re.split(r'(?<=[.!?])\s+', txt)
        sentences = [s.replace('<DOT>', '.').replace('<SPLIT>', '') for s in sentences]
        return [s.strip() for s in sentences if s.strip()]

    # Calculate overlap
    overlap_tokens = int(max_tokens * overlap_percent / 100)

    # Split into sentences
    sentences = split_sentences(text)
    chunks = []
    current_chunk = []
    current_tokens = 0
    chunk_num = 1
    overlap_buffer = []
    overlap_buffer_tokens = 0

    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)

        # Check if adding sentence exceeds max_tokens
        if current_tokens + sentence_tokens > max_tokens and current_chunk:
            # Save current chunk if meets minimum
            if current_tokens >= min_tokens:
                chunk_text = ' '.join(current_chunk)
                chunks.append({
                    'chunk_number': chunk_num,
                    'title': title,
                    'text': chunk_text,
                    'token_count': current_tokens,
                    'sentence_count': len(current_chunk),
                    'char_count': len(chunk_text)
                })
                chunk_num += 1

                # Build overlap buffer
                overlap_buffer = []
                overlap_buffer_tokens = 0
                for sent in reversed(current_chunk):
                    sent_tokens = count_tokens(sent)
                    if overlap_buffer_tokens + sent_tokens <= overlap_tokens:
                        overlap_buffer.insert(0, sent)
                        overlap_buffer_tokens += sent_tokens
                    else:
                        break

                # Start new chunk with overlap
                current_chunk = overlap_buffer.copy()
                current_tokens = overlap_buffer_tokens

        # Add sentence to current chunk
        current_chunk.append(sentence)
        current_tokens += sentence_tokens

    # Add final chunk
    if current_chunk and current_tokens >= min_tokens:
        chunk_text = ' '.join(current_chunk)
        chunks.append({
            'chunk_number': chunk_num,
            'title': title,
            'text': chunk_text,
            'token_count': current_tokens,
            'sentence_count': len(current_chunk),
            'char_count': len(chunk_text)
        })

    return chunks


def view_chunk(chunks: List[Dict], n: int):
    """
    Display a specific chunk with its metadata.

    Args:
        chunks: List of chunks returned by chunk_document()
        n: Chunk number to display (1-indexed)
    """
    if n < 1 or n > len(chunks):
        print(f"❌ Error: Chunk {n} does not exist. Valid range: 1-{len(chunks)}")
        return

    chunk = chunks[n - 1]
    print(f"\n{'='*80}")
    print(f"📄 CHUNK {chunk['chunk_number']} | {chunk['title']}")
    print(f"{'='*80}")
    print(f"📊 Tokens: {chunk['token_count']} | Sentences: {chunk['sentence_count']} | Characters: {chunk['char_count']}")
    print(f"{'-'*80}")
    print(chunk['text'])
    print(f"{'='*80}\n")


def print_summary(chunks: List[Dict]):
    """Print summary of all chunks."""
    print(f"\n{'='*80}")
    print(f"📚 CHUNKING SUMMARY")
    print(f"{'='*80}")
    print(f"Total chunks: {len(chunks)}")
    if chunks:
        print(f"Token range: {chunks[0]['token_count']}-{max(c['token_count'] for c in chunks)}")
    print(f"{'='*80}\n")

    for chunk in chunks:
        preview = chunk['text'][:100] + "..." if len(chunk['text']) > 100 else chunk['text']
        print(f"Chunk {chunk['chunk_number']}: {chunk['token_count']} tokens | {preview}")


In [43]:
# def install_dependencies():
#     """Install required packages for embeddings and vector database."""
#     print("📦 Installing dependencies...")
#     import subprocess

#     packages = [
#         'chromadb',
#         'sentence-transformers',
#         'langdetect'
#     ]

#     for package in packages:
#         print(f"   Installing {package}...")
#         subprocess.check_call(['pip', 'install', '-q', package])

#     print("✅ All dependencies installed!\n")

# install_dependencies()


In [61]:
def initialize_embedding_model(model_name='all-MiniLM-L6-v2'):
    """
    Initialize the embedding model.

    Available models (sorted by quality/size):
    - 'all-MiniLM-L6-v2': Fast, 384 dims (Recommended for speed)
    - 'all-mpnet-base-v2': Best quality, 768 dims (Recommended for accuracy)
    - 'multi-qa-mpnet-base-dot-v1': Great for Q&A tasks, 768 dims
    - 'paraphrase-multilingual-MiniLM-L12-v2': Multilingual, 384 dims

    Args:
        model_name: Name of the sentence-transformer model

    Returns:
        Loaded embedding model
    """
    print(f"🧠 Initializing embedding model: {model_name}")
    print("   (First run will download the model...)")

    start_time = time.time()
    model = SentenceTransformer(model_name)

    print(f"✅ Model loaded in {time.time() - start_time:.2f}s")
    print(f"   Embedding dimension: {model.get_sentence_embedding_dimension()}")
    print()

    return model


# =============================================================================
# STEP 3: INITIALIZE CHROMADB (FREE VECTOR DATABASE)
# =============================================================================

def initialize_vector_db(collection_name='document_chunks', persist_directory='./chroma_db'):
    """
    Initialize ChromaDB - a free, robust vector database.

    Args:
        collection_name: Name for your collection
        persist_directory: Directory to persist the database

    Returns:
        ChromaDB collection object
    """
    print(f"💾 Initializing ChromaDB...")
    print(f"   Collection: {collection_name}")
    print(f"   Persist directory: {persist_directory}")

    # Initialize ChromaDB client with persistence
    client = chromadb.PersistentClient(path=persist_directory)

    # Create or get collection
    collection = client.get_or_create_collection(
        name=collection_name,
        metadata={"description": "Document chunks with embeddings"}
    )

    print(f"✅ ChromaDB initialized!")
    print(f"   Existing documents in collection: {collection.count()}")
    print()

    return collection


# =============================================================================
# STEP 4: DETECT LANGUAGE
# =============================================================================

def detect_language(text: str) -> str:
    """Detect language of text."""
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


# =============================================================================
# STEP 5: EMBED CHUNKS WITH BATCH PROCESSING
# =============================================================================

def embed_chunks(chunks: List[Dict],
                 embedding_model,
                 file_id: str = "default_file",
                 batch_size: int = 128,
                 normalize: bool = True):
    """
    Generate embeddings for chunks with batch processing and L2 normalization.

    Args:
        chunks: List of chunk dictionaries from chunk_document()
        embedding_model: Loaded SentenceTransformer model
        file_id: Unique identifier for the source file
        batch_size: Number of chunks to process per batch (default: 128)
        normalize: Whether to L2-normalize embeddings for cosine similarity

    Returns:
        List of chunks with embeddings and metadata
    """
    print(f"🔮 Generating embeddings for {len(chunks)} chunks...")
    print(f"   Batch size: {batch_size}")
    print(f"   L2 normalization: {normalize}")
    print()

    embedded_chunks = []
    total_batches = (len(chunks) + batch_size - 1) // batch_size

    start_time = time.time()

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        batch_num = i // batch_size + 1

        print(f"   Processing batch {batch_num}/{total_batches} ({len(batch)} chunks)...")

        # Extract texts for embedding
        texts = [chunk['text'] for chunk in batch]

        # Generate embeddings
        batch_embeddings = embedding_model.encode(
            texts,
            normalize_embeddings=normalize,
            show_progress_bar=False
        )

        # Add embeddings and metadata to chunks
        for j, chunk in enumerate(batch):
            embedding = batch_embeddings[j]

            # Detect language
            language = detect_language(chunk['text'])

            # Create enriched chunk with metadata
            enriched_chunk = {
                **chunk,  # Original chunk data
                'embedding': embedding.tolist(),  # Convert numpy to list
                'file_id': file_id,
                'page_no': None,  # Will be updated if page info available
                'heading': chunk.get('title', 'Unknown'),
                'chunk_text': chunk['text'],
                'embedding_model': getattr(embedding_model, 'model_name_or_path', 'all-MiniLM-L6-v2'),
                'ingest_time': datetime.now().isoformat(),
                'language': language,
                'embedding_dim': len(embedding)
            }

            embedded_chunks.append(enriched_chunk)

    elapsed_time = time.time() - start_time
    print(f"\n✅ Embeddings generated in {elapsed_time:.2f}s")
    print(f"   Average: {elapsed_time/len(chunks):.3f}s per chunk")
    print(f"   Throughput: {len(chunks)/elapsed_time:.1f} chunks/second")
    print()

    return embedded_chunks


# =============================================================================
# STEP 6: STORE IN VECTOR DATABASE
# =============================================================================

def store_in_vector_db(collection, embedded_chunks: List[Dict]):
    """
    Store embedded chunks in ChromaDB with all metadata.

    Args:
        collection: ChromaDB collection
        embedded_chunks: Chunks with embeddings from embed_chunks()
    """
    print(f"💾 Storing {len(embedded_chunks)} chunks in vector database...")

    # Prepare data for ChromaDB
    ids = []
    embeddings = []
    documents = []
    metadatas = []

    for chunk in embedded_chunks:
       chunk_id = f"{chunk['file_id']}_chunk_{chunk['chunk_number']}"
       ids.append(chunk_id)
       embeddings.append(chunk['embedding'])
       documents.append(chunk['chunk_text'])

       metadata = {
        'chunk_number': chunk.get('chunk_number', -1),
        'title': chunk.get('title', 'unknown'),
        'file_id': chunk.get('file_id', 'unknown'),
        'page_no': chunk.get('page_no', 'unknown'),
        'heading': chunk.get('heading', 'unknown'),
        'token_count': chunk.get('token_count', 0),
        'sentence_count': chunk.get('sentence_count', 0),
        'char_count': chunk.get('char_count', 0),
        'embedding_model': chunk.get('embedding_model', 'unknown'),
        'ingest_time': chunk.get('ingest_time', 'unknown'),
        'language': chunk.get('language', 'unknown'),
        'embedding_dim': chunk.get('embedding_dim', 0)
    }

    # Clean out None values
       clean_metadata = {k: (v if v is not None else "unknown") for k, v in metadata.items()}
       metadatas.append(clean_metadata)


    # Add to collection
    start_time = time.time()
    collection.add(
        ids=ids,
        embeddings=embeddings,
        documents=documents,
        metadatas=metadatas
    )

    elapsed_time = time.time() - start_time
    print(f"✅ Stored in {elapsed_time:.2f}s")
    print(f"   Total documents in collection: {collection.count()}")
    print()


# =============================================================================
# STEP 7: QUERY VECTOR DATABASE
# =============================================================================

def query_vector_db(collection,
                    embedding_model,
                    query_text: str,
                    n_results: int = 5,
                    filter_metadata: Dict = None):
    """
    Query the vector database for similar chunks.

    Args:
        collection: ChromaDB collection
        embedding_model: Loaded SentenceTransformer model
        query_text: Text to search for
        n_results: Number of results to return
        filter_metadata: Optional metadata filters (e.g., {'file_id': 'doc1'})

    Returns:
        Query results with documents and metadata
    """
    print(f"🔍 Querying: '{query_text}'")
    print(f"   Returning top {n_results} results")

    # Generate query embedding
    query_embedding = embedding_model.encode(
        [query_text],
        normalize_embeddings=True
    )[0].tolist()

    # Query the collection
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        where=filter_metadata
    )

    print(f"✅ Found {len(results['documents'][0])} results\n")

    return results


def print_query_results(results):
    """Print query results in a readable format."""
    documents = results['documents'][0]
    metadatas = results['metadatas'][0]
    distances = results['distances'][0]

    print(f"{'='*80}")
    print(f"SEARCH RESULTS")
    print(f"{'='*80}\n")

    for i, (doc, meta, dist) in enumerate(zip(documents, metadatas, distances), 1):
        similarity = 1 - dist  # Convert distance to similarity
        print(f"Result {i} | Similarity: {similarity:.4f}")
        print(f"   File: {meta['file_id']} | Chunk: {meta['chunk_number']}")
        print(f"   Title: {meta['title']}")
        print(f"   Tokens: {meta['token_count']} | Language: {meta['language']}")
        print(f"   Ingested: {meta['ingest_time']}")
        print(f"\n   Text preview: {doc[:200]}...")
        print(f"{'-'*80}\n")


# =============================================================================
# STEP 8: VERIFICATION FUNCTIONS
# =============================================================================

def verify_embeddings(embedded_chunks: List[Dict], sample_size: int = 3):
    """Verify embeddings were created correctly."""
    print(f"🔍 EMBEDDING VERIFICATION")
    print(f"{'='*80}")
    print(f"Total chunks embedded: {len(embedded_chunks)}")

    if embedded_chunks:
        sample = embedded_chunks[:sample_size]

        for i, chunk in enumerate(sample, 1):
            print(f"\nChunk {i}:")
            print(f"   Chunk Number: {chunk['chunk_number']}")
            print(f"   Title: {chunk['title']}")
            print(f"   File ID: {chunk['file_id']}")
            print(f"   Language: {chunk['language']}")
            print(f"   Embedding Model: {chunk['embedding_model']}")
            print(f"   Embedding Dimension: {chunk['embedding_dim']}")
            print(f"   Token Count: {chunk['token_count']}")
            print(f"   Ingest Time: {chunk['ingest_time']}")
            print(f"   Text Preview: {chunk['chunk_text'][:100]}...")

            # Check embedding
            embedding = np.array(chunk['embedding'])
            print(f"   Embedding Stats:")
            print(f"      - Shape: {embedding.shape}")
            print(f"      - Mean: {embedding.mean():.4f}")
            print(f"      - Std: {embedding.std():.4f}")
            print(f"      - L2 Norm: {np.linalg.norm(embedding):.4f}")

    print(f"\n{'='*80}\n")


def verify_vector_db(collection):
    """Verify vector database contents."""
    print(f"🔍 VECTOR DATABASE VERIFICATION")
    print(f"{'='*80}")
    print(f"Collection Name: {collection.name}")
    print(f"Total Documents: {collection.count()}")

    if collection.count() > 0:
        # Peek at first few documents
        sample = collection.peek(limit=3)

        print(f"\nSample Documents:")
        for i, (id, meta) in enumerate(zip(sample['ids'], sample['metadatas']), 1):
            print(f"\n   Document {i}:")
            print(f"      ID: {id}")
            print(f"      Chunk: {meta['chunk_number']}")
            print(f"      Title: {meta['title']}")
            print(f"      File: {meta['file_id']}")
            print(f"      Language: {meta['language']}")
            print(f"      Tokens: {meta['token_count']}")

    print(f"\n{'='*80}\n")


# =============================================================================
# COMPLETE WORKFLOW EXAMPLE
# =============================================================================

def complete_embedding_workflow(chunks, file_id="bobs_superheroes"):
    """
    Complete workflow: Initialize → Embed → Store → Query

    Args:
        chunks: Chunks from chunk_document() function
        file_id: Unique identifier for the source file
    """
    print(f"\n{'#'*80}")
    print(f"# COMPLETE EMBEDDING WORKFLOW")
    print(f"{'#'*80}\n")

    # Step 1: Initialize embedding model
    print("STEP 1: Initialize Embedding Model")
    print("-" * 80)
    embedding_model = initialize_embedding_model('all-MiniLM-L6-v2')

    # Step 2: Initialize vector database
    print("STEP 2: Initialize Vector Database")
    print("-" * 80)
    collection = initialize_vector_db(collection_name='superhero_docs')

    # Step 3: Generate embeddings
    print("STEP 3: Generate Embeddings")
    print("-" * 80)
    embedded_chunks = embed_chunks(
        chunks=chunks,
        embedding_model=embedding_model,
        file_id=file_id,
        batch_size=128,
        normalize=True
    )

    # Step 4: Verify embeddings
    print("STEP 4: Verify Embeddings")
    print("-" * 80)
    verify_embeddings(embedded_chunks)

    # Step 5: Store in vector database
    print("STEP 5: Store in Vector Database")
    print("-" * 80)
    store_in_vector_db(collection, embedded_chunks)

    # Step 6: Verify database
    print("STEP 6: Verify Database Storage")
    print("-" * 80)
    verify_vector_db(collection)

    # Step 7: Test query
    print("STEP 7: Test Query")
    print("-" * 80)
    results = query_vector_db(
        collection=collection,
        embedding_model=embedding_model,
        query_text="Who is James",
        n_results=5
    )
    print_query_results(results)

    print(f"\n{'#'*80}")
    print(f"# WORKFLOW COMPLETE!")
    print(f"{'#'*80}\n")

    return embedding_model, collection


# =============================================================================
# USAGE WITH YOUR CHUNKS
# =============================================================================

"""
# Assuming you have chunks from the previous chunking code:
# chunks = chunk_document(text, title="Bobs Superheroes")

# Run the complete workflow:
embedding_model, collection = complete_embedding_workflow(
    chunks=chunks,
    file_id="bobs_superheroes_2024"
)

# Now you can query anytime:
results = query_vector_db(
    collection=collection,
    embedding_model=embedding_model,
    query_text="your search query here",
    n_results=5
)
print_query_results(results)
"""

'\n# Assuming you have chunks from the previous chunking code:\n# chunks = chunk_document(text, title="Bobs Superheroes")\n\n# Run the complete workflow:\nembedding_model, collection = complete_embedding_workflow(\n    chunks=chunks,\n    file_id="bobs_superheroes_2024"\n)\n\n# Now you can query anytime:\nresults = query_vector_db(\n    collection=collection,\n    embedding_model=embedding_model,\n    query_text="your search query here",\n    n_results=5\n)\nprint_query_results(results)\n'

In [63]:
pdf_path = '/content/drive/My Drive/Bobs_superheroes.pdf'
doc = fitz.open(pdf_path)
text = ''
for page in doc:
    text += page.get_text()
doc.close()

# Create chunks (from previous code)
chunks = chunk_document(text, title="Bobs Superheroes")

# Run complete embedding workflow with verification at each step

# print_summary(chunks)   # Optional — to see overview
view_chunk(chunks, 8)   # 👈 View chunk 3


📄 CHUNK 8 | Bobs Superheroes
📊 Tokens: 793 | Sentences: 27 | Characters: 2456
--------------------------------------------------------------------------------
Gaydos, 
Sarah (ed). "Players, Chapter Three: Landing on Boardwalk" Bobs Burger 22 (November 20, 
2012), New York, NY: DC Comics 
 Hopps, Kevin (writer) & Oliva, Jay, Divar, Tim (directors) (April 14, 2012). "Usual Suspects". Bobs Burger. Season 1. Episode 25. Cartoon Network. 
 Weisman, Greg (writer) & Chang, Michael, Montgomery, Lauren (directors) (April 21, 2012). "Auld Acquaintance". Bobs Burger. Season 1. Episode 26. Cartoon Network. 
 Weisman, Greg (2022-04-08). Question #25790. Ask Greg. Retrieved 2022-04-08. 
 Scott, Sharon, Brandon Vietti, Greg Weisman (writers), Bobs Burger: Legacy (November 19, 
2013): Sam biography. Santa Ana, CA: Little Orbit 
 Scott, Sharon, Brandon Vietti, Greg Weisman (writers), Bobs Burger: Legacy (November 19, 
2013): Greece. Santa Ana, CA: Little Orbit 
 Scott, Sharon, Brandon Vietti, Greg Wei

In [62]:
embedding_model, collection = complete_embedding_workflow(
    chunks=chunks,
    file_id="bobs_superheroes_2024"
)


################################################################################
# COMPLETE EMBEDDING WORKFLOW
################################################################################

STEP 1: Initialize Embedding Model
--------------------------------------------------------------------------------
🧠 Initializing embedding model: all-MiniLM-L6-v2
   (First run will download the model...)
✅ Model loaded in 0.87s
   Embedding dimension: 384

STEP 2: Initialize Vector Database
--------------------------------------------------------------------------------
💾 Initializing ChromaDB...
   Collection: superhero_docs
   Persist directory: ./chroma_db
✅ ChromaDB initialized!
   Existing documents in collection: 12

STEP 3: Generate Embeddings
--------------------------------------------------------------------------------
🔮 Generating embeddings for 12 chunks...
   Batch size: 128
   L2 normalization: True

   Processing batch 1/1 (12 chunks)...

✅ Embeddings generated in 1.70s
   Ave