# 🚀 Koushole RAG Processor (EasyOCR Edition)

**Process NCTB textbooks for RAG using FREE cloud GPU**

**Version:** Stable (EasyOCR) + Voyage AI (Optimized Mode) + **ToC Support**

---

**SETUP:**
1. Go to Runtime → Change runtime type → Select **T4 GPU**
2. Click **Runtime → Restart session** (CRITICAL)
3. Run all cells in order
4. Enter API keys when prompted

In [None]:
#@title 1️⃣ Install Dependencies
# Install the Python packages imported by the later cells: easyocr (OCR),
# pdf2image + Pillow (PDF pages -> images), supabase and voyageai (API clients).
!pip install -q easyocr pdf2image Pillow supabase voyageai
# System package installed alongside pdf2image; presumably the PDF rendering
# backend it shells out to (TODO confirm).
# NOTE(review): there is no `-y` flag here, so apt-get may stop at a
# confirmation prompt if extra packages are required; verify in a fresh runtime.
!apt-get install -q poppler-utils
# Printed unconditionally; it does not check the exit status of the commands above.
print("‚úÖ Dependencies installed!")

In [None]:
#@title 2️⃣ Enter Credentials
from getpass import getpass

# Collect the three secrets interactively. The URL is echoed as typed, while
# both keys are read through getpass so they never appear in the cell output.
print("üîë Enter your credentials (hidden):")
SUPABASE_URL = input("Supabase URL: ")
SUPABASE_KEY = getpass("Supabase Service Key: ")
VOYAGE_API_KEY = getpass("Voyage AI API Key: ")

# Stop right away if any of the three values was left empty.
if not all((SUPABASE_URL, SUPABASE_KEY, VOYAGE_API_KEY)):
    raise ValueError("‚ùå All credentials are required!")
print("‚úÖ Credentials saved.")

In [None]:
#@title 3️⃣ Initialize Clients & Models
from supabase import create_client
import voyageai
import sys
import easyocr

# Step 1: build the Supabase and Voyage AI clients from the credentials
# entered earlier. Any failure is reported and the cell exits with status 1.
try:
    supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
    voyage = voyageai.Client(api_key=VOYAGE_API_KEY)
    print("‚úÖ Clients initialized")
except Exception as client_error:
    print(f"‚ùå Client error: {client_error}")
    sys.exit(1)

# Step 2: load the EasyOCR reader once; the helper functions reuse this
# module-level `reader` for every page they process.
print("\nüîÆ detailed setup for EasyOCR (GPU)...")
try:
    # Languages requested: Bangla ('bn') and English ('en'), with GPU enabled.
    reader = easyocr.Reader(['bn', 'en'], gpu=True)
    print("‚úÖ EasyOCR Model Loaded Successfully")
except Exception as model_error:
    print(f"‚ùå Model Loading Error: {model_error}")
    sys.exit(1)

In [None]:
#@title 4️⃣ Helper Functions (With ToC Logic)
from pdf2image import convert_from_bytes
import requests
from tqdm import tqdm
from PIL import Image
import numpy as np
import time
import re

def extract_toc_easyocr(pdf_bytes, max_pages_to_scan=15):
    """Scan the first pages of a PDF for table-of-contents entries.

    The first ``max_pages_to_scan`` pages are rendered at 150 dpi and read
    with the module-level EasyOCR ``reader``. Every OCR line shaped like
    ``Title ..... 123`` (the page number may use ASCII or Bangla digits) is
    recorded as a chapter.

    Args:
        pdf_bytes: Raw PDF content, as accepted by ``convert_from_bytes``.
        max_pages_to_scan: Number of leading pages to OCR.

    Returns:
        A list of dicts with keys ``chapter_number`` (1-based order in which
        the entry was found), ``title`` and ``start_page`` (int), sorted by
        ``start_page``. Returns ``[]`` if nothing matched or if any step
        raised (the error is printed, not propagated).
    """
    print("\n  üïµÔ∏è Scanning for Table of Contents...")
    try:
        images = convert_from_bytes(pdf_bytes, first_page=1, last_page=max_pages_to_scan, dpi=150)
        chapters = []

        # Matches 'Title ..... 123' or 'Title 123': a lazy title, then a run of
        # dots / whitespace / ellipsis (U+2026), then a page number made of
        # either ASCII digits or Bangla digits (U+09E6..U+09EF).
        # The non-ASCII characters are written as \u escapes so the pattern
        # cannot be damaged by a file-encoding round trip.
        toc_pattern = re.compile(r"^(.*?)[.\s\u2026]+([0-9]+|[\u09E6-\u09EF]+)$")

        # Translation table Bangla digit -> ASCII digit, keyed by code point.
        # Built from code points for the same reason as above: a mis-encoded
        # literal makes str.maketrans() raise (argument lengths differ), which
        # would silently disable ToC detection through the except below.
        bn_map = {0x09E6 + digit: str(digit) for digit in range(10)}

        for page_idx, img in enumerate(images):
            img_np = np.array(img)
            # detail=0 makes readtext return plain strings.
            result = reader.readtext(img_np, detail=0)

            for line in result:
                match = toc_pattern.match(line.strip())
                if match:
                    title = match.group(1).strip()
                    page_str = match.group(2).strip().translate(bn_map)

                    # Basic validation: title of at least 3 chars, page is numeric.
                    if len(title) < 3 or not page_str.isdigit():
                        continue

                    page_num = int(page_str)

                    # No monotonicity check: an entry whose page is lower than
                    # the previous one may be noise or a sub-chapter, and is
                    # accepted as-is.
                    chapters.append({
                        'chapter_number': len(chapters) + 1,
                        'title': title,
                        'start_page': page_num
                    })

        print(f"  ‚úÖ Found {len(chapters)} chapters")
        # Order by page; chapter_number keeps the discovery order.
        chapters.sort(key=lambda x: x['start_page'])
        return chapters
    except Exception as e:
        print(f"  ‚ö†Ô∏è ToC scan failed (skipping): {e}")
        return []

def save_chapters_to_db(book_id, chapters, source_type):
    """Insert chapter rows into ``book_chapters`` and map pages to row ids.

    Args:
        book_id: Id stored in the owning-book column of every row.
        chapters: Iterable of dicts with ``chapter_number``, ``title`` and
            ``start_page`` keys. Falsy input short-circuits to ``{}``.
        source_type: ``'library'`` writes the id to ``library_book_id``;
            any other value writes it to ``resource_id``.

    Returns:
        ``{start_page: id}`` built from the rows returned by the insert, or
        ``{}`` when there is nothing to save or the insert fails (the error
        is printed). Rows sharing a ``start_page`` collapse to the last one.
    """
    if not chapters:
        return {}

    if source_type == 'library':
        id_col = 'library_book_id'
    else:
        id_col = 'resource_id'

    payload = [
        {
            id_col: book_id,
            'chapter_number': entry['chapter_number'],
            'title': entry['title'],
            'start_page': entry['start_page'],
        }
        for entry in chapters
    ]

    try:
        # The insert response carries the stored rows, including their ids.
        response = supabase.table('book_chapters').insert(payload).execute()
        return {row['start_page']: row['id'] for row in (response.data or [])}
    except Exception as exc:
        print(f"  ‚ö†Ô∏è Failed to save chapters: {exc}")
        return {}

def extract_text_and_chunk_with_chapters(pdf_bytes, chapter_map):
    """OCR every page of a PDF, tag it with a chapter, and chunk the text.

    Args:
        pdf_bytes: Raw PDF content, rendered at 200 dpi.
        chapter_map: ``{start_page: chapter_id}``; page numbers are 1-based
            positions in the PDF. May be empty.

    Returns:
        A list of ``{'text': str, 'chapter_id': ...}`` dicts in page order.
        ``chapter_id`` is ``None`` until a chapter start has been reached.
        Returns ``[]`` if any step raises (the error is printed).
    """
    try:
        pages = convert_from_bytes(pdf_bytes, dpi=200)
        collected = []

        # Ascending chapter start pages, used for the fallback lookup below.
        start_pages = sorted(chapter_map.keys())
        active_chapter = None

        progress = tqdm(total=len(pages), desc="OCR & Chunking")

        for page_no, page_image in enumerate(pages, start=1):
            # --- Chapter assignment ---
            if page_no in chapter_map:
                # Exact chapter start: switch to it.
                active_chapter = chapter_map[page_no]
            elif not active_chapter and start_pages:
                # No chapter active yet: fall back to the closest start page
                # at or before this one, if there is any.
                earlier = [p for p in start_pages if p <= page_no]
                if earlier:
                    active_chapter = chapter_map[earlier[-1]]

            # --- OCR ---
            ocr_lines = reader.readtext(np.array(page_image), detail=0, paragraph=True)
            page_text = "\n".join(ocr_lines)

            # --- Chunking ---
            # Each page is chunked on its own, so a chunk never spans two
            # pages (and therefore never spans two chapters).
            if page_text.strip():
                collected.extend(
                    {'text': piece, 'chapter_id': active_chapter}
                    for piece in split_into_chunks(page_text, 1000, 200)
                )

            progress.update(1)

        progress.close()
        return collected

    except Exception as e:
        print(f"Processing Failed: {e}")
        return []

def split_into_chunks(text, chunk_size, overlap):
    """Split ``text`` into overlapping, whitespace-stripped chunks.

    Windows of ``chunk_size`` characters are taken every
    ``chunk_size - overlap`` characters. Splitting stops once the remaining
    text is already fully covered by the previous window. Chunks of 50
    characters or fewer (after stripping) are discarded.

    Args:
        text: The string to split.
        chunk_size: Window length in characters; must be positive.
        overlap: Characters shared between consecutive windows; must be
            smaller than ``chunk_size``.

    Returns:
        List of chunk strings, each longer than 50 characters.

    Raises:
        ValueError: If ``chunk_size <= 0`` or ``overlap >= chunk_size``.
            With such arguments the window never moves forward, so longer
            text used to loop forever.
    """
    if chunk_size <= 0 or overlap >= chunk_size:
        raise ValueError(
            "chunk_size must be positive and overlap must be smaller than chunk_size"
        )

    step = chunk_size - overlap
    text_len = len(text)
    pieces = []
    start = 0
    while start < text_len:
        piece = text[start:start + chunk_size].strip()
        # Tiny fragments carry too little context to be useful.
        if len(piece) > 50:
            pieces.append(piece)
        start += step
        # What is left from here on was already inside the previous window.
        if start >= text_len - overlap:
            break
    return pieces

def generate_embeddings(chunks_with_metadata):
    """Embed the ``'text'`` of every chunk dict with Voyage AI.

    Chunks are sent four at a time, each text cut to 8000 characters, with a
    25-second pause before every batch after the first. A batch that fails
    with a rate-limit error is retried up to three times in total, after
    progressively longer waits; any other error is re-raised immediately.

    Args:
        chunks_with_metadata: List of dicts with a ``'text'`` key.

    Returns:
        List of embeddings in input order; ``[]`` for empty input.

    Raises:
        Exception: If a batch is still rate-limited after all attempts, or
            on any non-rate-limit error from the embedding call.
    """
    if not chunks_with_metadata:
        return []

    batch_size = 4
    max_attempts = 3
    vectors = []

    print(f"  üß† Generating embeddings for {len(chunks_with_metadata)} chunks...")

    for offset in tqdm(range(0, len(chunks_with_metadata), batch_size), desc="Embedding"):
        # Only the text is embedded; the other metadata stays with the caller.
        texts = [
            item['text'][:8000]
            for item in chunks_with_metadata[offset:offset + batch_size]
        ]

        # Fixed pause between batches to stay under the API rate limit.
        if offset > 0:
            time.sleep(25)

        for attempt in range(max_attempts):
            try:
                response = voyage.embed(texts, model="voyage-multilingual-2", input_type="document")
                vectors.extend(response.embeddings)
                break
            except Exception as exc:
                message = str(exc).lower()
                if "rate limit" not in message and "429" not in message:
                    raise
                # Rate limited: wait longer on each successive attempt.
                time.sleep(75 + (attempt * 15))
        else:
            # Loop ended without a break, so every attempt was rate limited.
            raise Exception("‚ùå Max retries exceeded.")

    return vectors

# Confirmation that this cell ran to the end and the helper functions above are defined.
print("‚úÖ Helper functions ready!")

In [None]:
#@title 5️⃣ Fetch Pending Books
print("üì° Fetching pending books from Supabase...")

# Library books whose chunks_generated flag is NULL or false.
lib_res = (
    supabase.table('library_books')
    .select('id, title, file_url')
    .or_('chunks_generated.is.null,chunks_generated.eq.false')
    .execute()
)
lib_books = lib_res.data or []

# Official resources, selected with the same "not yet chunked" filter.
off_res = (
    supabase.table('official_resources')
    .select('id, title, file_url')
    .or_('chunks_generated.is.null,chunks_generated.eq.false')
    .execute()
)
off_books = off_res.data or []

# One work list of (row, source_type) pairs, library books first.
all_books = [
    *((entry, 'library') for entry in lib_books),
    *((entry, 'official') for entry in off_books),
]

print(f"üìö Library books: {len(lib_books)}")
print(f"üìñ Official books: {len(off_books)}")
print(f"üìä Total to process: {len(all_books)}")

In [None]:
#@title 6️⃣ Process Loop 🚀 (Integrated)
import time

def process_book(book, source_type):
    """Run the whole ingest pipeline for one book and report success.

    Steps, in order: delete the book's existing chunks and chapters, download
    the PDF, extract and save chapters, OCR and chunk every page, embed the
    chunks, insert them in batches of 50, then flag the book as processed.

    Args:
        book: Row dict; the keys read here are ``'id'``, ``'title'`` and
            ``'file_url'``.
        source_type: ``'library'`` selects the ``library_book_id`` column and
            the ``library_books`` table; any other value selects
            ``resource_id`` and ``official_resources``.

    Returns:
        ``True`` when the chunks were stored and the status row updated,
        ``False`` if download, OCR, embedding or storage failed. Failures are
        printed, never raised. A failed cleanup only prints a warning.
    """
    title = book['title']
    id_col = 'library_book_id' if source_type == 'library' else 'resource_id'
    print(f"\nüìò Processing: {title}")

    # 0. CLEANUP: drop leftovers of an earlier run so chunks are not duplicated.
    print("  üßπ Cleaning old data...")
    try:
        supabase.table('book_chunks').delete().eq(id_col, book['id']).execute()
        supabase.table('book_chapters').delete().eq(id_col, book['id']).execute()
    except Exception as e:
        print(f"  ‚ö†Ô∏è Cleanup warning: {e}")

    # 1. Download
    try:
        print("  üì• Downloading...")
        # (connect, read) timeout: without one, a single stalled server would
        # block the whole batch forever. A timeout raises and is reported by
        # the handler below like any other download error.
        r = requests.get(book['file_url'], timeout=(10, 120))
        if r.status_code != 200:
            print("  ‚ùå Download failed")
            return False
        pdf_bytes = r.content
    except Exception as e:
        print(f"  ‚ùå Download error: {e}")
        return False

    # 2. ToC Extraction
    print("  üîé Extracting Chapters...")
    chapters = extract_toc_easyocr(pdf_bytes)

    # 3. Save chapters and get the {start_page: chapter_id} lookup
    chapter_map = save_chapters_to_db(book['id'], chapters, source_type)
    if chapter_map:
        print(f"  ‚úÖ Saved {len(chapter_map)} chapters to DB")
    else:
        print("  ‚ÑπÔ∏è No chapters found, proceeding with flat structure.")

    # 4. OCR & Chunking (page-aware)
    start_time = time.time()
    # List of dicts: {'text': '...', 'chapter_id': '...'}
    chunks_data = extract_text_and_chunk_with_chapters(pdf_bytes, chapter_map)
    ocr_time = time.time() - start_time

    if not chunks_data:
        print("  ‚ùå No text extracted")
        return False
    print(f"  ‚úÖ Generated {len(chunks_data)} chunks in {ocr_time:.1f}s")

    # 5. Embed
    try:
        embeddings = generate_embeddings(chunks_data)
        # Every chunk needs its own vector; a mismatch would mis-pair rows.
        if len(embeddings) != len(chunks_data):
            print("  ‚ùå Embeddings mismatch")
            return False
    except Exception as e:
        print(f"  ‚ùå Embedding error: {e}")
        return False

    # 6. Store
    print("  üíæ saving chunks to databse...")
    try:
        batch = []
        for idx, (c_data, emb) in enumerate(zip(chunks_data, embeddings)):
            batch.append({
                id_col: book['id'],
                'chunk_index': idx,
                'chunk_text': c_data['text'],
                'chapter_id': c_data['chapter_id'],  # links the chunk to its chapter row
                'embedding': emb
            })
            # Insert in groups of 50 rows.
            if len(batch) >= 50:
                supabase.table('book_chunks').insert(batch).execute()
                batch = []
        if batch:
            supabase.table('book_chunks').insert(batch).execute()

        # Update status so the book is not picked up as pending again.
        table = 'library_books' if source_type == 'library' else 'official_resources'
        supabase.table(table).update({
            'chunks_generated': True,
            'total_chunks': len(chunks_data),
            'is_processed': True
        }).eq('id', book['id']).execute()

        print("  ‚úÖ SUCCESS!")
        return True

    except Exception as e:
        print(f"  ‚ùå Database error: {e}")
        return False

# --- MAIN LOOP ---
# Process every pending book in turn and tally the outcomes.
success_count = 0
fail_count = 0

for book, source in all_books:
    if not process_book(book, source):
        fail_count += 1
    else:
        success_count += 1
    # Short pause between books.
    time.sleep(1)

print(f"\n{'='*40}")
print(f"üèÅ DONE! Success: {success_count}, Failed: {fail_count}")

## 🎉 All Done!

Check your website chat - the new books should now be searchable!