# 🚀 Koushole RAG Processor

**Process NCTB textbooks for RAG using FREE cloud GPU**

This notebook is designed to be **robust** across Surya OCR versions.

---

**SETUP:**
1. Go to Runtime → Change runtime type → Select **T4 GPU**
2. Run all cells in order
3. If any error occurs, check the debug output

In [None]:
#@title 1️⃣ Install Dependencies
# We install the latest versions to ensure compatibility
!pip install -q --upgrade surya-ocr pdf2image Pillow supabase voyageai transformers torch
!apt-get install -q poppler-utils
print("‚úÖ Dependencies installed!")

In [None]:
#@title 2️⃣ Check Environment & Versions
# Print the installed version of each key dependency so a version mismatch
# is visible before any model is loaded.
#
# importlib.metadata is the standard-library replacement for the
# deprecated pkg_resources API.
import importlib.metadata

# Import name -> distribution name, for packages where the two differ.
DIST_NAMES = {'surya': 'surya-ocr'}

packages = ['surya', 'transformers', 'torch', 'voyageai', 'supabase']
print("üîç Checking versions...")
for package in packages:
    try:
        ver = importlib.metadata.version(DIST_NAMES.get(package, package))
        print(f"  - {package}: {ver}")
    except Exception as e:
        # Deliberately broad: this cell is diagnostic only and must never
        # stop the notebook, whatever the metadata lookup raises.
        print(f"  - {package}: Not found or error ({e})")


In [None]:
#@title 3️⃣ Enter Credentials
from getpass import getpass

# Prompt for the three credentials the notebook needs. The two keys are
# read with getpass so they are not echoed; the URL is read with input().
print("üîë Enter your credentials (hidden):")
# Strip surrounding whitespace: a pasted value with a stray space or newline
# would otherwise pass the emptiness check below and only fail later,
# when the clients are created or first used.
SUPABASE_URL = input("Supabase URL: ").strip()
SUPABASE_KEY = getpass("Supabase Service Key: ").strip()
VOYAGE_API_KEY = getpass("Voyage AI API Key: ").strip()

# Fail fast if anything is missing (including whitespace-only input).
if not (SUPABASE_URL and SUPABASE_KEY and VOYAGE_API_KEY):
    raise ValueError("‚ùå All credentials are required!")
print("‚úÖ Credentials saved.")

In [None]:
#@title 4️⃣ Initialize Clients & Models (Robust Loader)
from supabase import create_client
import voyageai
import inspect
import sys

# 1. Setup Clients
# Creates the module-level `supabase` and `voyage` clients used by the
# later cells. Any failure is reported and stops the cell.
try:
    supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
    voyage = voyageai.Client(api_key=VOYAGE_API_KEY)
    print("‚úÖ Clients initialized")
except Exception as e:
    print(f"‚ùå Client error: {e}")
    sys.exit(1)

# 2. Setup Surya OCR (Dynamic Import)
# Leaves two module-level names for the processing cell:
#   det_predictor - a DetectionPredictor (new API) or None (legacy API)
#   rec_predictor - a RecognitionPredictor (new API), or a LegacyWrapper
#                   exposing .run(images, langs) (legacy API)
print("\nüîÆ setting up Surya OCR...")
det_predictor = None
rec_predictor = None

try:
    # Try new API (v0.5+)
    from surya.detection import DetectionPredictor
    from surya.recognition import RecognitionPredictor

    print("  -> Loading DetectionPredictor...")
    det_predictor = DetectionPredictor()

    print("  -> Loading RecognitionPredictor...")
    # The constructor's arguments differ between Surya releases, so look at
    # its signature instead of guessing.
    params = inspect.signature(RecognitionPredictor.__init__).parameters

    if 'foundation_predictor' in params:
        # Per the Surya README, releases with this parameter expect a
        # FoundationPredictor here. Passing the DetectionPredictor instead
        # fails inside the constructor.
        from surya.foundation import FoundationPredictor

        print("  -> Loading FoundationPredictor for recognition...")
        rec_predictor = RecognitionPredictor(FoundationPredictor())
    elif 'detection_predictor' in params:
        # Some versions require the detector to be passed
        print("  -> Passing detector to recognition initialization...")
        rec_predictor = RecognitionPredictor(det_predictor)
    else:
        rec_predictor = RecognitionPredictor()

    print("‚úÖ Surya OCR Models Loaded (New API)")

except ImportError:
    # Fallback to Old API
    try:
        print("  ‚ö†Ô∏è New API not found, trying legacy API...")
        from surya.ocr import run_ocr
        from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
        from surya.model.recognition.model import load_model as load_rec_model, load_processor as load_rec_processor

        det_model = load_det_model()
        det_processor = load_det_processor()
        rec_model = load_rec_model()
        rec_processor = load_rec_processor()

        # Wrap in a compatibility class
        class LegacyWrapper:
            """Adapter giving the legacy `run_ocr` function a `.run` method."""

            def run(self, images, langs):
                """OCR `images`, applying the same `langs` list to every image."""
                return run_ocr(images, [langs]*len(images), det_model, det_processor, rec_model, rec_processor)

        rec_predictor = LegacyWrapper()
        print("‚úÖ Surya OCR Models Loaded (Legacy API)")
    except Exception as e:
        print(f"‚ùå Failed to load Surya models: {e}")
        print("Try running: !pip install transformers==4.36.2 surya-ocr==0.6.0")
        sys.exit(1)
except Exception as e:
    print(f"‚ùå Model Loading Error: {e}")
    if "bbox_size" in str(e):
        print("\n‚ö†Ô∏è CRITICAL: Transformers version conflict detected.")
        print("Please add a cell above and run: !pip install transformers==4.36.2")
    sys.exit(1)

In [None]:
#@title 5️⃣ Processing Functions
from pdf2image import convert_from_bytes
import requests
from tqdm import tqdm

def _recognize_batch(batch, langs):
    """Run OCR on one batch of page images with the loaded Surya predictor.

    Picks the calling convention that matches whichever predictor the
    loader cell left in the module-level `rec_predictor`.
    """
    import inspect  # local import keeps this block self-contained

    if hasattr(rec_predictor, 'run'):
        # Legacy wrapper: takes the images and one shared language list.
        return rec_predictor.run(batch, langs)

    call_params = inspect.signature(rec_predictor.__call__).parameters
    if 'langs' in call_params:
        # Releases whose __call__ takes `langs` expect one language list per
        # image, followed by the detection predictor object itself (not its
        # results), per the Surya README for those releases.
        return rec_predictor(batch, [langs] * len(batch), det_predictor)

    # Releases without `langs` take only the images plus the detector.
    return rec_predictor(batch, det_predictor=det_predictor)


def extract_text_surya(pdf_bytes):
    """Extract text from a PDF using the loaded Surya predictor.

    The PDF is rendered to page images at 150 DPI and recognised five pages
    at a time with the language codes "bn" and "en".

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        The text of every page, with lines joined by a newline and pages
        joined by a blank line. Returns "" if any step fails; the error is
        printed rather than raised so the caller can skip the book.
    """
    langs = ["bn", "en"]
    batch_size = 5

    try:
        images = convert_from_bytes(pdf_bytes, dpi=150)
        all_text = []

        # The context manager closes the progress bar even when OCR raises.
        with tqdm(total=len(images), desc="OCR Processing") as pbar:
            for i in range(0, len(images), batch_size):
                batch = images[i:i + batch_size]
                for page in _recognize_batch(batch, langs):
                    all_text.append("\n".join(line.text for line in page.text_lines))
                pbar.update(len(batch))

        return "\n\n".join(all_text)

    except Exception as e:
        # Deliberately broad: any failure means "no text" to the caller.
        print(f"OCR Failed: {e}")
        return ""

def chunk_text(text, chunk_size=2000, overlap=200, min_chunk_len=50):
    """Split text into overlapping, whitespace-stripped character chunks.

    Consecutive chunks start `chunk_size - overlap` characters apart, so each
    chunk repeats the last `overlap` characters of the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk (before stripping).
        overlap: Number of characters shared between consecutive chunks.
        min_chunk_len: Chunks whose stripped length is not greater than this
            are dropped.

    Returns:
        A list of chunk strings, in document order.

    Raises:
        ValueError: If `chunk_size` is not positive, or `overlap` is negative
            or not smaller than `chunk_size`. Without this check a
            non-advancing window would loop forever on long text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        piece = text[start:start + chunk_size].strip()
        if piece:
            chunks.append(piece)
        # Once a window reaches the end of the text, any further window
        # would contain only characters already covered by this one.
        if start + chunk_size >= len(text):
            break

    return [c for c in chunks if len(c) > min_chunk_len]

def generate_embeddings(chunks):
    """Embed text chunks with the module-level Voyage client.

    Chunks are sent 20 at a time, each cut to its first 8000 characters,
    using the "voyage-multilingual-2" model with input type "document".

    Args:
        chunks: Sequence of chunk strings.

    Returns:
        A list with the embeddings from every request, in request order.
        Returns an empty list when `chunks` is empty.
    """
    if not chunks:
        return []

    per_request = 20
    vectors = []
    offsets = range(0, len(chunks), per_request)
    for offset in tqdm(offsets, desc="Generating Embeddings"):
        window = chunks[offset:offset + per_request]
        texts = [chunk[:8000] for chunk in window]
        response = voyage.embed(
            texts,
            model="voyage-multilingual-2",
            input_type="document",
        )
        vectors += response.embeddings
    return vectors

In [None]:
#@title 6️⃣ Main Loop
import time

def process_one_book(book, type_tag):
    """Download, OCR, chunk, embed and store one book.

    Args:
        book: Row dict with at least the keys 'id', 'title' and 'file_url'.
        type_tag: 'library' selects the `library_books` table and the
            `library_book_id` column; any other value selects
            `official_resources` and `resource_id`.

    Returns:
        True if the chunks were stored and the book row was updated.
        False if any step failed; the reason is printed.
    """
    print(f"\nüìò [{type_tag}] Processing: {book['title']}")

    # 1. Download
    try:
        # A timeout keeps one unresponsive file host from hanging the run.
        r = requests.get(book['file_url'], timeout=60)
        if r.status_code != 200:
            print("‚ùå Download failed")
            return False
        pdf = r.content
    except Exception as e:
        print(f"‚ùå Download error: {e}")
        return False

    # 2. OCR
    text = extract_text_surya(pdf)
    # Measure the stripped text: pages that OCR to nothing still contribute
    # the "\n\n" page separators, which must not count as content.
    if not text or len(text.strip()) < 100:
        print(f"‚ùå OCR extracting little/no text ({len(text)} chars)")
        return False
    print(f"‚úÖ Extracted {len(text)} characters")

    # 3. Chunk
    chunks = chunk_text(text)
    print(f"üì¶ Extracted {len(chunks)} chunks")
    if not chunks:
        # Stop before the cleanup below deletes existing chunks for nothing.
        print("‚ùå No usable chunks produced")
        return False

    # 4. Embed
    try:
        embeddings = generate_embeddings(chunks)
    except Exception as e:
        print(f"‚ùå Embedding error: {e}")
        return False
    if len(embeddings) != len(chunks):
        # zip() below would silently drop the unmatched chunks.
        print(f"‚ùå Embedding error: got {len(embeddings)} embeddings for {len(chunks)} chunks")
        return False

    # 5. Store
    id_col = 'library_book_id' if type_tag == 'library' else 'resource_id'
    table = 'library_books' if type_tag == 'library' else 'official_resources'

    try:
        # Cleanup old
        # NOTE: delete and insert are separate requests, so a failure part-way
        # leaves the book with partial chunks; it stays pending and is
        # re-processed on the next run.
        supabase.table('book_chunks').delete().eq(id_col, book['id']).execute()

        # Insert new, 50 rows per request
        insert_batch = []
        for idx, (txt, emb) in enumerate(zip(chunks, embeddings)):
            insert_batch.append({
                id_col: book['id'],
                'chunk_index': idx,
                'chunk_text': txt,
                'embedding': emb
            })
            if len(insert_batch) >= 50:
                supabase.table('book_chunks').insert(insert_batch).execute()
                insert_batch = []
        if insert_batch:
            supabase.table('book_chunks').insert(insert_batch).execute()

        # 6. Update Status
        supabase.table(table).update({
            'chunks_generated': True,
            'total_chunks': len(chunks),
            'is_processed': True
        }).eq('id', book['id']).execute()
    except Exception as e:
        # Return False like every other failure path so one bad book does
        # not abort the whole run.
        print(f"‚ùå Database error: {e}")
        return False

    print("‚úÖ Saved to DB!")
    return True

# --- RUNNER ---
# Collect every book whose `chunks_generated` flag is NULL or false from both
# tables, then process them one at a time: library books first, then
# official resources.
print("üì° Fetching pending books...")


def _fetch_pending(table_name):
    """Query `table_name` for rows whose chunks have not been generated yet."""
    query = supabase.table(table_name).select('id, title, file_url')
    return query.or_('chunks_generated.is.null,chunks_generated.eq.false').execute()


q1 = _fetch_pending('library_books')
q2 = _fetch_pending('official_resources')

lib_books = q1.data or []
off_books = q2.data or []

# Pair each row with the tag that tells process_one_book which table it
# came from.
all_books = []
all_books.extend((row, 'library') for row in lib_books)
all_books.extend((row, 'official') for row in off_books)

print(f"üèÅ Found {len(all_books)} books to process.")

for pending_book, source_tag in all_books:
    process_one_book(pending_book, source_tag)
    # One-second pause between books.
    time.sleep(1)