# 🚀 Koushole RAG Processor

**Process NCTB textbooks for RAG using a FREE cloud GPU**

This notebook uses:
- **Surya OCR v0.17+** (best accuracy for Bangla)
- **Voyage AI** (50M free embedding tokens)
- **Supabase** (cloud storage)

---

**SETUP:**
1. Go to Runtime → Change runtime type → Select **T4 GPU**
2. Run all cells in order
3. Enter your API keys when prompted

In [None]:
#@title 1Ô∏è‚É£ Install Dependencies (takes ~3 minutes)
!pip install -q surya-ocr pdf2image Pillow supabase voyageai
!apt-get install -q poppler-utils
print("‚úÖ Dependencies installed!")

In [None]:
#@title 2Ô∏è‚É£ Enter Your API Keys
from getpass import getpass

# Strip surrounding whitespace: values pasted from a dashboard often carry a
# trailing space or newline, which would break the URL or the auth header.
SUPABASE_URL = input("Enter Supabase URL: ").strip()
SUPABASE_KEY = getpass("Enter Supabase Service Key: ").strip()
VOYAGE_API_KEY = getpass("Enter Voyage AI API Key: ").strip()

# Fail here, with a clear message, rather than later inside a client call.
if not (SUPABASE_URL and SUPABASE_KEY and VOYAGE_API_KEY):
    raise ValueError("All three values are required; re-run this cell and fill in every prompt.")

print("‚úÖ Keys saved!")

In [None]:
#@title 3Ô∏è‚É£ Initialize Clients
from supabase import create_client
import voyageai

# Supabase client built from the URL and service key entered in the previous
# cell; the later cells use it for every table read, insert, update and delete.
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
# Voyage AI client; generate_embeddings() calls voyage.embed() on it.
voyage = voyageai.Client(api_key=VOYAGE_API_KEY)

# NOTE(review): this cell only constructs the clients and issues no request that
# is visible here, so the message presumably confirms construction, not that
# the credentials are valid — confirm.
print("‚úÖ Connected to Supabase and Voyage AI!")

In [None]:
#@title 4Ô∏è‚É£ Setup Surya OCR v0.17+
from surya.detection import DetectionPredictor
from surya.foundation import FoundationPredictor
from surya.recognition import RecognitionPredictor

print("Loading Surya OCR models...")
# Text-line detector. Surya expects it to be supplied to the recognizer at
# call time (det_predictor=...), not at construction time.
det_predictor = DetectionPredictor()
# The recognizer is built on the foundation model; passing the detector here
# (as the original code did) is not a supported constructor argument.
rec_predictor = RecognitionPredictor(FoundationPredictor())
print("‚úÖ Surya OCR ready!")

In [None]:
#@title 5Ô∏è‚É£ Helper Functions
from pdf2image import convert_from_bytes
import requests
from tqdm import tqdm
from PIL import Image

def extract_text_surya(pdf_bytes, dpi=150, batch_size=5):
    """Extract text from a PDF using Surya OCR v0.17+.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        dpi: Resolution used when rasterizing the PDF pages to images.
        batch_size: Number of page images passed to the recognizer per call.

    Returns:
        The recognized text as one string: the lines of a page are joined
        with newlines, and pages are separated by a blank line.
    """
    images = convert_from_bytes(pdf_bytes, dpi=dpi)

    all_text = []

    for i in tqdm(range(0, len(images), batch_size), desc="OCR Progress"):
        batch = images[i:i + batch_size]

        # The recognizer needs the detector passed per call to locate text
        # lines; current Surya releases have no `langs` argument.
        rec_results = rec_predictor(batch, det_predictor=det_predictor)

        all_text.extend(
            "\n".join(line.text for line in page_result.text_lines)
            for page_result in rec_results
        )

    return "\n\n".join(all_text)

def chunk_text(text, chunk_size=2000, overlap=200, min_length=50):
    """Split text into overlapping, whitespace-trimmed chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (before trimming).
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.
        min_length: Chunks whose trimmed length is not greater than this are
            dropped.

    Returns:
        The list of trimmed chunks, in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            not smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), stride):
        end = start + chunk_size
        piece = text[start:end].strip()
        if len(piece) > min_length:
            chunks.append(piece)
        # Once a window reaches the end of the text, any further window would
        # only repeat characters already contained in this one.
        if end >= len(text):
            break
    return chunks

def generate_embeddings(chunks):
    """Embed every chunk with Voyage AI and return the vectors in order.

    Chunks are sent in groups of 20, and each chunk is cut to its first
    8000 characters before being sent.
    """
    per_request = 20
    max_chars = 8000
    vectors = []

    offsets = range(0, len(chunks), per_request)
    for offset in tqdm(offsets, desc="Embedding"):
        texts = [piece[:max_chars] for piece in chunks[offset:offset + per_request]]
        response = voyage.embed(
            texts=texts,
            model="voyage-multilingual-2",
            input_type="document",
        )
        vectors += response.embeddings

    return vectors

print("‚úÖ Helper functions ready!")

In [None]:
#@title 6Ô∏è‚É£ Fetch Books Without Embeddings

def _fetch_pending(table_name):
    """Query a table for rows whose chunks_generated is NULL or false."""
    query = supabase.table(table_name).select('id, title, file_url')
    return query.or_('chunks_generated.is.null,chunks_generated.eq.false').execute()

# Library books still waiting for chunks
library_result = _fetch_pending('library_books')
library_books = library_result.data or []

# Official resources still waiting for chunks
official_result = _fetch_pending('official_resources')
official_books = official_result.data or []

print(f"üìö Found {len(library_books)} library books")
print(f"üìñ Found {len(official_books)} official books")
print(f"üìä Total: {len(library_books) + len(official_books)} books to process")

In [None]:
#@title 7Ô∏è‚É£ Process All Books üöÄ
import time

def process_book(book, source_type):
    """Download, OCR, chunk, embed and store a single book.

    Args:
        book: Row dict providing the 'id', 'title' and 'file_url' keys.
        source_type: 'library' selects the library_books table and the
            library_book_id column; any other value selects
            official_resources and resource_id.

    Returns:
        True when the chunks were stored and the book row was flagged as
        processed; False on any failure (the reason is printed).
    """
    title = book['title']
    print(f"\nüìö Processing: {title}")

    try:
        print("  üì• Downloading...")
        # (connect, read) timeouts: without them a stalled server would hang
        # the whole batch run indefinitely.
        response = requests.get(book['file_url'], timeout=(10, 120))
        if response.status_code != 200:
            print(f"  ‚ùå Download failed")
            return False

        pdf_bytes = response.content
        print(f"  üìÑ Size: {len(pdf_bytes) / 1024 / 1024:.2f} MB")

        print("  üîÆ Running Surya OCR...")
        text = extract_text_surya(pdf_bytes)
        print(f"  ‚úÖ Extracted {len(text)} characters")

        if len(text) < 200:
            print("  ‚ùå Not enough text")
            return False

        chunks = chunk_text(text)
        print(f"  üì¶ Created {len(chunks)} chunks")

        if not chunks:
            # Chunking can discard everything (e.g. mostly-whitespace OCR
            # output). Stop before deleting existing rows or flagging the
            # book as processed with zero chunks.
            print("  ‚ùå Not enough text")
            return False

        print("  üî¢ Generating embeddings...")
        embeddings = generate_embeddings(chunks)
        print(f"  ‚úÖ Generated {len(embeddings)} embeddings")

        if len(embeddings) != len(chunks):
            # Reported through the handler below; pairing mismatched lists
            # would silently store chunks without their vectors.
            raise ValueError(
                f"expected {len(chunks)} embeddings, got {len(embeddings)}"
            )

        print("  üíæ Storing in database...")
        id_column = 'library_book_id' if source_type == 'library' else 'resource_id'

        # Remove chunks from any earlier run so re-processing does not duplicate rows.
        supabase.table('book_chunks').delete().eq(id_column, book['id']).execute()

        rows = [
            {
                id_column: book['id'],
                'chunk_index': index,
                'chunk_text': chunk,
                'embedding': embedding,
            }
            for index, (chunk, embedding) in enumerate(zip(chunks, embeddings))
        ]
        # Insert 50 rows per request.
        for start in range(0, len(rows), 50):
            supabase.table('book_chunks').insert(rows[start:start + 50]).execute()

        table = 'library_books' if source_type == 'library' else 'official_resources'
        supabase.table(table).update({
            'chunks_generated': True,
            'total_chunks': len(chunks),
            'is_processed': True
        }).eq('id', book['id']).execute()

        print(f"  ‚úÖ DONE: {len(chunks)} chunks stored!")
        return True

    except Exception as e:
        # Deliberate per-book boundary: one failing book is reported and
        # counted as failed instead of aborting the rest of the run.
        print(f"  ‚ùå Error: {e}")
        return False

# Tally of process_book() outcomes across both sources.
success = 0
failed = 0

# Library books first, then official resources, pausing 2 seconds after each book.
for source_type, pending in (('library', library_books), ('official', official_books)):
    for book in pending:
        if process_book(book, source_type):
            success += 1
        else:
            failed += 1
        time.sleep(2)

print(f"\n{'='*50}")
print(f"üèÅ PROCESSING COMPLETE!")
print(f"‚úÖ Success: {success}")
print(f"‚ùå Failed: {failed}")

---

## ✅ All Done!

Your books are now processed and stored in Supabase.

**Next steps:**
1. Test RAG chat on your website
2. Come back weekly to process new uploads