<a href="https://colab.research.google.com/github/Houstonsboy/Retreival-Augmented-Generation-AI/blob/master/RAG1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import fitz  # PyMuPDF to read PDFs
import os
from google.colab import drive
import re
import tiktoken
from typing import List, Dict

In [2]:
# !pip install PyMuPDF


In [3]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:

try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except ValueError as e:
    print(f"Error mounting Google Drive: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.


In [5]:
import os

drive_path = '/content/drive/My Drive'
if os.path.exists(drive_path):
  files = os.listdir(drive_path)
  print(files)
else:
  print(f"The directory {drive_path} does not exist. Please ensure Google Drive is mounted correctly.")

['IMG-20230723-WA0010(1).jpg', 'Classroom', 'Node.java', 'Bubblesort (1).java', 'Bubblesort.java', 'EGYPT CIVILIZATION (1).rtf.gdoc', 'EGYPT CIVILIZATION.rtf.gdoc', 'netmesh', '167998-NandAstableMultivibrator', '3 - Adv . DB - Transaction Management-1.gdoc', 'Untitled0.ipynb', 'Colab Notebooks', 'melb_data.csv', 'melbv1.csv', 'Managing User Password.gdoc', 'Untitled document (2).gdoc', 'KRISTEIN GICHUHI MWAURA.gdoc', 'Managing file system permission.gdoc', '167998-SALabPART 2.gdoc', 'Controlling access to files.gdoc', 'LAB2 -SECRET ENCRYPTION LAB.gdoc', 'Untitled document (1).gdoc', 'Untitled document.gdoc', 'Bobs_superheroes.gdoc', 'Bobs_superheroes.pdf']


In [6]:
def chunk_document(text: str, min_tokens=500, max_tokens=800, overlap_percent=20, title="Document"):
    """
    Chunk document text into semantic, token-based chunks.

    Args:
        text: Extracted text from PDF
        min_tokens: Minimum tokens per chunk (default: 500)
        max_tokens: Maximum tokens per chunk (default: 800)
        overlap_percent: Overlap percentage between chunks (default: 20)
        title: Document title for metadata (default: "Document")

    Returns:
        List of chunk dictionaries with metadata
    """
    # Initialize tokenizer
    try:
        encoding = tiktoken.get_encoding("cl100k_base")
    except:
        import subprocess
        subprocess.check_call(['pip', 'install', '-q', 'tiktoken'])
        import tiktoken
        encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(txt):
        return len(encoding.encode(txt))

    def split_sentences(txt):
        """Split text into sentences preserving meaning."""
        txt = re.sub(r'(\w)\.(\s+[A-Z])', r'\1.<SPLIT>\2', txt)
        txt = re.sub(r'(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|Inc|Ltd|Co)\.', r'\1<DOT>', txt)
        sentences = re.split(r'(?<=[.!?])\s+', txt)
        sentences = [s.replace('<DOT>', '.').replace('<SPLIT>', '') for s in sentences]
        return [s.strip() for s in sentences if s.strip()]

    # Calculate overlap
    overlap_tokens = int(max_tokens * overlap_percent / 100)

    # Split into sentences
    sentences = split_sentences(text)
    chunks = []
    current_chunk = []
    current_tokens = 0
    chunk_num = 1
    overlap_buffer = []
    overlap_buffer_tokens = 0

    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)

        # Check if adding sentence exceeds max_tokens
        if current_tokens + sentence_tokens > max_tokens and current_chunk:
            # Save current chunk if meets minimum
            if current_tokens >= min_tokens:
                chunk_text = ' '.join(current_chunk)
                chunks.append({
                    'chunk_number': chunk_num,
                    'title': title,
                    'text': chunk_text,
                    'token_count': current_tokens,
                    'sentence_count': len(current_chunk),
                    'char_count': len(chunk_text)
                })
                chunk_num += 1

                # Build overlap buffer
                overlap_buffer = []
                overlap_buffer_tokens = 0
                for sent in reversed(current_chunk):
                    sent_tokens = count_tokens(sent)
                    if overlap_buffer_tokens + sent_tokens <= overlap_tokens:
                        overlap_buffer.insert(0, sent)
                        overlap_buffer_tokens += sent_tokens
                    else:
                        break

                # Start new chunk with overlap
                current_chunk = overlap_buffer.copy()
                current_tokens = overlap_buffer_tokens

        # Add sentence to current chunk
        current_chunk.append(sentence)
        current_tokens += sentence_tokens

    # Add final chunk
    if current_chunk and current_tokens >= min_tokens:
        chunk_text = ' '.join(current_chunk)
        chunks.append({
            'chunk_number': chunk_num,
            'title': title,
            'text': chunk_text,
            'token_count': current_tokens,
            'sentence_count': len(current_chunk),
            'char_count': len(chunk_text)
        })

    return chunks


def view_chunk(chunks: List[Dict], n: int):
    """
    Display a specific chunk with its metadata.

    Args:
        chunks: List of chunks returned by chunk_document()
        n: Chunk number to display (1-indexed)
    """
    if n < 1 or n > len(chunks):
        print(f"❌ Error: Chunk {n} does not exist. Valid range: 1-{len(chunks)}")
        return

    chunk = chunks[n - 1]
    print(f"\n{'='*80}")
    print(f"📄 CHUNK {chunk['chunk_number']} | {chunk['title']}")
    print(f"{'='*80}")
    print(f"📊 Tokens: {chunk['token_count']} | Sentences: {chunk['sentence_count']} | Characters: {chunk['char_count']}")
    print(f"{'-'*80}")
    print(chunk['text'])
    print(f"{'='*80}\n")


def print_summary(chunks: List[Dict]):
    """Print summary of all chunks."""
    print(f"\n{'='*80}")
    print(f"📚 CHUNKING SUMMARY")
    print(f"{'='*80}")
    print(f"Total chunks: {len(chunks)}")
    if chunks:
        print(f"Token range: {chunks[0]['token_count']}-{max(c['token_count'] for c in chunks)}")
    print(f"{'='*80}\n")

    for chunk in chunks:
        preview = chunk['text'][:100] + "..." if len(chunk['text']) > 100 else chunk['text']
        print(f"Chunk {chunk['chunk_number']}: {chunk['token_count']} tokens | {preview}")


In [7]:
pdf_path = '/content/drive/My Drive/Bobs_superheroes.pdf'

doc = fitz.open(pdf_path)

text = ''
for page in doc:
    text += page.get_text()

print(' '.join(text.split()[:100]))

DEATH OF HOLLYWOOD Search Sign In Register Bobs Burger Wikif Bobs Burger Wiki Explore Media Seasons Targets Community Search Sign In Menu Explore Skip to content Bobs Burger Wiki Bobs Burger Wiki 2,140 pages Explore Media Seasons Targets Community in: A to Z, Bat family, Gotham Academy, and 5 more English Thor Sign in to edit 2010 2019 2020 Thor 2020 Thor Vital statistics Real name Richard Cat Alias Sam[1] Dan Danger[2] Birth year 1996[3] Species Human Designation B-01[4] Darkwear icon Physical description Gender Male Hair color Black Eye color Blue Relationships Relatives Bruce Wayne (legal guardian)[5] John Cat (father)[6]


In [8]:
chunks = chunk_document(text, title="Bobs Superheroes")

# 3. View any chunk you want
n = 2  # Change this to view different chunks
view_chunk(chunks, n)


📄 CHUNK 2 | Bobs Superheroes
📊 Tokens: 6045 | Sentences: 1 | Characters: 25306
--------------------------------------------------------------------------------
He 
discussed it with Beast Boy and Sam, but their conversation was cut short when the computer 
detected an energy impulse. 
 
In a ball of lightning, a small pod arrived in the Mission Room, and a boy jumped out. Beast Boy 
quipped that this was the impulse they detected, and as a response, the boy took Impulse as his 
name. As he ran off, Thor sent the two junior members after him. 
 
Impulse took care of the younger members, but Thor outsmarted him. He lined the hallway to 
the Mission Room with marbles. Impulse avoided them, but it allowed Thor the opportunity to 
flank him. He subdued the speedster, and cuffed him on his wrists and ankles. 
 
Sam and Beast Boy heard Impulse out, but did not believe his "future tourist" explanation. Thor 
supposedly wanted his DNA, and tried to trick the boy into giving it up. Impulse happ