# 🚀 Koushole RAG Processor (EasyOCR Edition)

**Process NCTB textbooks for RAG using a FREE cloud GPU**

**Version:** Stable (EasyOCR) + Voyage AI (Super Slow Mode)

---

**SETUP:**
1. Go to Runtime → Change runtime type → Select **T4 GPU**
2. Click **Runtime → Restart session** (CRITICAL)
3. Run all cells in order
4. Enter API keys when prompted

In [None]:
#@title 1️⃣ Install Dependencies
!pip install -q easyocr pdf2image Pillow supabase voyageai
!apt-get install -q poppler-utils
print("‚úÖ Dependencies installed!")

In [None]:
#@title 2️⃣ Enter Credentials
from getpass import getpass

print("üîë Enter your credentials (hidden):")
# Strip surrounding whitespace: a pasted value with a stray space or newline
# would otherwise pass the emptiness check below and be used as-is later.
SUPABASE_URL = input("Supabase URL: ").strip()
SUPABASE_KEY = getpass("Supabase Service Key: ").strip()
VOYAGE_API_KEY = getpass("Voyage AI API Key: ").strip()

# Fail fast so the following cells never run with a missing credential.
if not all((SUPABASE_URL, SUPABASE_KEY, VOYAGE_API_KEY)):
    raise ValueError("‚ùå All credentials are required!")
print("‚úÖ Credentials saved.")

In [None]:
#@title 3️⃣ Initialize Clients & Models
from supabase import create_client
import voyageai
import sys
import easyocr

# --- Step 1: service clients ---
# Build the Supabase and Voyage AI clients from the credentials collected
# earlier; stop the cell with exit status 1 if either constructor raises.
try:
    supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
    voyage = voyageai.Client(api_key=VOYAGE_API_KEY)
    print("‚úÖ Clients initialized")
except Exception as client_err:
    print(f"‚ùå Client error: {client_err}")
    sys.exit(1)

# --- Step 2: OCR model ---
print("\nüîÆ detailed setup for EasyOCR (GPU)...")
try:
    # A single reader covering Bangla ('bn') and English ('en'), with GPU requested.
    reader = easyocr.Reader(['bn', 'en'], gpu=True)
    print("‚úÖ EasyOCR Model Loaded Successfully")
except Exception as ocr_err:
    print(f"‚ùå Model Loading Error: {ocr_err}")
    sys.exit(1)

In [None]:
#@title 4️⃣ Helper Functions
from pdf2image import convert_from_bytes
import requests
from tqdm import tqdm
from PIL import Image
import numpy as np
import time

def extract_text_easyocr(pdf_bytes):
    """Extract text from a PDF by rasterising each page and running EasyOCR.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The recognised text: the results for one page are joined with a
        newline and pages are separated by a blank line. Returns an empty
        string when conversion or OCR raises; the error is printed instead
        of being propagated (best-effort behaviour relied on by the caller).
    """
    try:
        # Convert PDF to images (dpi=200 for better OCR accuracy)
        pages = convert_from_bytes(pdf_bytes, dpi=200)
        page_texts = []

        # The context manager closes the progress bar even if OCR raises
        # part-way through, so a failed book does not leave an open bar.
        with tqdm(total=len(pages), desc="OCR Processing") as pbar:
            for page in pages:
                # EasyOCR is given a numpy array rather than the PIL image.
                page_array = np.array(page)

                # detail=0 returns just the list of text strings
                fragments = reader.readtext(page_array, detail=0, paragraph=True)
                page_texts.append("\n".join(fragments))

                pbar.update(1)

        return "\n\n".join(page_texts)

    except Exception as e:
        print(f"OCR Failed: {e}")
        return ""

def chunk_text(text, chunk_size=2000, overlap=200):
    """Split text into overlapping, whitespace-stripped chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Chunking stops as soon as a chunk reaches the end of the text.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (before stripping).
        overlap: Number of characters shared between neighbouring chunks.

    Returns:
        A list of stripped chunks; chunks of 50 characters or fewer after
        stripping are dropped.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        # A non-positive step would make the window stand still or move
        # backwards, which previously looped forever on long inputs.
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    text_len = len(text)
    chunks = []
    for start in range(0, text_len, step):
        end = start + chunk_size
        piece = text[start:end].strip()
        # Very short fragments are skipped.
        if len(piece) > 50:
            chunks.append(piece)
        # This window already covers the tail of the text; a further window
        # would only repeat part of it.
        if end >= text_len:
            break
    return chunks

def generate_embeddings(chunks, *, batch_size=2, batch_delay=35, retries=3,
                        model="voyage-multilingual-2"):
    """Embed text chunks with Voyage AI using deliberately slow, throttled batches.

    The defaults reproduce the original "super slow mode": 2 chunks per
    request with a 35 second pause between requests, which the original
    author sized to stay well under the free-tier rate limit.

    Args:
        chunks: Sequence of text chunks; each is truncated to 8000 characters
            before being sent.
        batch_size: Number of chunks sent per embed request.
        batch_delay: Seconds to sleep before every request except the first.
        retries: Attempts per batch when the error looks like a rate limit.
        model: Voyage AI model name passed to ``voyage.embed``.

    Returns:
        A list with one embedding per chunk, in input order. Empty input
        returns an empty list without contacting the service.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: If a batch is still rate limited after ``retries``
            attempts; any other error from the client is re-raised unchanged.
    """
    if not chunks:
        return []
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    embeddings = []

    print(f"  üß† Generating embeddings for {len(chunks)} chunks...")
    print(f"  üê¢ SUPER SLOW MODE: {batch_size} chunks/batch + {batch_delay}s delay (To save quota)")

    for i in tqdm(range(0, len(chunks), batch_size), desc="Embedding"):
        batch = [c[:8000] for c in chunks[i:i + batch_size]]

        # Throttle between batches; the first request goes out immediately.
        if i > 0:
            time.sleep(batch_delay)

        for attempt in range(retries):
            try:
                res = voyage.embed(batch, model=model, input_type="document")
            except Exception as e:
                err_msg = str(e).lower()
                if "rate limit" not in err_msg and "429" not in err_msg:
                    # Not a rate limit: propagate with the original traceback.
                    raise
                # Back off before the next attempt. After the final attempt
                # there is nothing left to wait for, so fail right away.
                if attempt + 1 < retries:
                    wait_time = 90 + (attempt * 20)
                    print(f"\n  üõë Rate limit hit. Waiting {wait_time}s...")
                    time.sleep(wait_time)
            else:
                embeddings.extend(res.embeddings)
                break
        else:
            raise Exception("‚ùå Max retries exceeded. You are out of quota or hitting strict limits.")

    return embeddings

# Confirms the helper definitions in this cell were executed.
print("‚úÖ Helper functions ready!")

In [None]:
#@title 5️⃣ Fetch Pending Books
print("üì° Fetching pending books from Supabase...")

# Library books whose chunks_generated flag is NULL or false.
lib_res = (
    supabase.table('library_books')
    .select('id, title, file_url')
    .or_('chunks_generated.is.null,chunks_generated.eq.false')
    .execute()
)
lib_books = lib_res.data or []

# Official resources, selected with the same flag filter.
off_res = (
    supabase.table('official_resources')
    .select('id, title, file_url')
    .or_('chunks_generated.is.null,chunks_generated.eq.false')
    .execute()
)
off_books = off_res.data or []

# Pair each row with a tag naming its origin, library rows first.
all_books = [(b, 'library') for b in lib_books]
all_books.extend((b, 'official') for b in off_books)

print(f"üìö Library books: {len(lib_books)}")
print(f"üìñ Official books: {len(off_books)}")
print(f"üìä Total to process: {len(all_books)}")

In [None]:
#@title 6️⃣ Process Loop 🚀
import time

def process_book(book, source_type):
    """Run one book through download, OCR, chunking, embedding and storage.

    Args:
        book: Row dict for the book; the keys 'id', 'title' and 'file_url'
            are read.
        source_type: 'library' selects the library_books table and the
            library_book_id column; any other value selects
            official_resources and resource_id.

    Returns:
        True when the chunks were stored and the book row was updated,
        False on any failure (the reason is printed).
    """
    title = book['title']
    print(f"\nüìò Processing: {title}")

    # 1. Download
    try:
        print("  üì• Downloading...")
        # Without a timeout a stalled server would block the whole run.
        r = requests.get(book['file_url'], timeout=60)
        if r.status_code != 200:
            print("  ‚ùå Download failed")
            return False
        pdf_bytes = r.content
        print(f"  üìÑ Size: {len(pdf_bytes) / 1024 / 1024:.2f} MB")
    except Exception as e:
        print(f"  ‚ùå Download error: {e}")
        return False

    # 2. OCR (Using EasyOCR)
    text = extract_text_easyocr(pdf_bytes)
    if not text or len(text) < 100:
        print(f"  ‚ùå OCR extracted too little text ({len(text)} chars)")
        return False
    print(f"  ‚úÖ Extracted {len(text)} characters")

    # 3. Chunk
    chunks = chunk_text(text)
    print(f"  üì¶ Generated {len(chunks)} chunks")
    if not chunks:
        # Nothing to store: do not wipe existing chunks or mark the book done.
        print("  ‚ùå No usable chunks produced")
        return False

    # 4. Embed
    try:
        embeddings = generate_embeddings(chunks)
    except Exception as e:
        print(f"  ‚ùå Embedding error: {e}")
        return False
    if len(embeddings) != len(chunks):
        # zip() below would silently drop the unmatched chunks, after the
        # old rows had already been deleted.
        print(f"  ‚ùå Embedding count mismatch: {len(embeddings)} embeddings for {len(chunks)} chunks")
        return False

    # 5. Store
    print("  üíæ saving to database...")
    if source_type == 'library':
        id_col, table = 'library_book_id', 'library_books'
    else:
        id_col, table = 'resource_id', 'official_resources'

    try:
        # Clear old chunks
        supabase.table('book_chunks').delete().eq(id_col, book['id']).execute()

        # Insert NEW chunks, 50 rows per request
        rows = [
            {
                id_col: book['id'],
                'chunk_index': idx,
                'chunk_text': txt,
                'embedding': emb,
            }
            for idx, (txt, emb) in enumerate(zip(chunks, embeddings))
        ]
        for start in range(0, len(rows), 50):
            supabase.table('book_chunks').insert(rows[start:start + 50]).execute()

        # Update Status
        supabase.table(table).update({
            'chunks_generated': True,
            'total_chunks': len(chunks),
            'is_processed': True
        }).eq('id', book['id']).execute()

        print("  ‚úÖ SUCCESS!")
        return True

    except Exception as e:
        print(f"  ‚ùå Database error: {e}")
        return False

# --- MAIN LOOP ---
# Work through every pending book in order and tally the outcomes.
success_count, fail_count = 0, 0

for book, source in all_books:
    if not process_book(book, source):
        fail_count += 1
    else:
        success_count += 1
    # One-second pause after each book.
    time.sleep(1)

print(f"\n{'='*40}")
print(f"üèÅ DONE! Success: {success_count}, Failed: {fail_count}")

## 🎉 All Done!

Check your website chat - the new books should now be searchable!