# Local RAG Pipeline Tutorial (Ingestion → Embeddings → Retrieval → Generation)

This notebook demonstrates a minimal **local-first** pipeline using the same building blocks wired into the FastAPI backend:
1. Ingest a PDF with **PyMuPDF** + OCR fallback (**Tesseract**).
2. Chunk & embed pages with **Gemini Embeddings** (or a local fallback) stored in **Chroma**.
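3. Retrieve top-*k* chunks by vector similarity (Chroma uses squared L2 distance by default; set the collection's HNSW space to `cosine` if you want cosine similarity).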
4. Run a structured RAG prompt against **Gemini** (or stub) and validate JSON.
5. Show how these steps map to backend endpoints (/api/uploads, /api/jobs, etc.).

> If you don't set `GOOGLE_API_KEY`, the notebook automatically falls back to deterministic pseudo-embeddings and a stub generator so you can still exercise the flow offline.

In [None]:
# (Optional) Install runtime deps if running standalone.
# You can skip this cell if you've already pip installed in the environment.
# !pip install PyMuPDF pytesseract Pillow chromadb faiss-cpu jsonschema google-generativeai tenacity
import os, math, json, hashlib, random
from dataclasses import dataclass
from typing import List, Dict, Any

try:
    import fitz  # PyMuPDF
except ImportError as e:
    raise RuntimeError('PyMuPDF not installed. Run the pip install cell.') from e

try:
    import pytesseract  # OCR fallback
except ImportError:
    pytesseract = None  # OCR optional

from PIL import Image
from io import BytesIO
import chromadb
from chromadb.utils import embedding_functions
from jsonschema import validate as json_validate
from jsonschema import ValidationError

# Read the Gemini API key from the environment once; it defaults to an empty
# string when the variable is not set.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY', '')
print('Gemini key detected?', bool(GOOGLE_API_KEY))

## 1. Ingestion (extract text + images + OCR fallback)
Mirrors backend/app/services/pdf_extract.py with light adjustments for notebook execution.

In [None]:
def extract_pages(filepath: str):
    """Extract text and embedded images from every page of a PDF.

    Pages that yield no text are rendered at 200 dpi and passed to Tesseract
    when ``pytesseract`` was importable; otherwise their text stays empty.

    Args:
        filepath: Path of the PDF to open with PyMuPDF.

    Returns:
        One dict per page, in page order, with keys ``page_no`` (1-based),
        ``text`` (whitespace-stripped) and ``images`` (a list of
        ``{'page': int, 'image_hex': str}`` entries, where ``image_hex`` is
        the PNG encoding of the image as a hex string).
    """
    pages = []
    # The context manager closes the document (and its file handle) even when
    # extraction raises part-way through.
    with fitz.open(filepath) as doc:
        for index in range(doc.page_count):
            page_no = index + 1
            page = doc.load_page(index)
            text = page.get_text('text').strip()

            extracted_images = []
            for img in page.get_images(full=True):
                xref = img[0]  # first field of the image tuple is its xref
                pix = fitz.Pixmap(doc, xref)
                if pix.n - pix.alpha > 3:  # convert CMYK
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                extracted_images.append(
                    {'page': page_no, 'image_hex': pix.tobytes('png').hex()}
                )

            # OCR fallback: only for pages without extractable text, and only
            # when the optional pytesseract dependency is available.
            if not text and pytesseract is not None:
                page_pix = page.get_pixmap(dpi=200)
                page_image = Image.open(BytesIO(page_pix.tobytes('png')))
                text = pytesseract.image_to_string(page_image).strip()

            pages.append({'page_no': page_no, 'text': text, 'images': extracted_images})
    return pages

# SAMPLE INPUT: point to a local small PDF (replace this path)
SAMPLE_PDF = 'sample.pdf'  # Place a small PDF in the working dir.
if not os.path.exists(SAMPLE_PDF):
    # No sample supplied: write a one-page PDF containing a single line of demo
    # text. The context manager closes the in-memory document once it is saved.
    with fitz.open() as doc:
        page = doc.new_page()
        page.insert_text((72, 72), 'Demo PDF content about photosynthesis and energy conversion.')
        doc.save(SAMPLE_PDF)
pages = extract_pages(SAMPLE_PDF)
# Notebook display: page count and the first 80 characters of page 1.
len(pages), pages[0]['text'][:80]

## 2. Chunking & Embeddings
We'll treat each page as a chunk. For short pages you could merge neighbors.
If a Gemini API key is present we use the real embedding endpoint via `google-generativeai`; else we use a deterministic pseudo-embedding (hash-based).

In [None]:
# google-generativeai is optional: without it `genai` is None and the notebook
# uses its offline fallbacks.
try:
    import google.generativeai as genai
except ImportError:
    genai = None
else:
    # Configure outside the `try` so that an ImportError raised while
    # configuring is not mistaken for "package not installed".
    if GOOGLE_API_KEY:
        genai.configure(api_key=GOOGLE_API_KEY)

def embed_texts(texts: List[str]):
    """Return one embedding vector per input text, in input order.

    When ``genai`` is available and ``GOOGLE_API_KEY`` is non-empty, each text
    is embedded with a separate ``genai.embed_content`` call. Otherwise a
    deterministic hash-based stand-in is returned so the notebook can run
    offline; those vectors are derived from SHA-256 and are NOT suitable for
    production use.

    Args:
        texts: Strings to embed.

    Returns:
        A list with one vector per entry of ``texts``. Fallback vectors have
        32 components (one per SHA-256 digest byte), each scaled into [0, 1].
    """
    if genai and GOOGLE_API_KEY:
        # Gemini embedding model name may evolve; adjust per docs.
        model = 'text-embedding-004'  # or gemini-embedding-001 depending on availability
        return [
            genai.embed_content(model=model, content=text)['embedding']
            for text in texts
        ]
    # Fallback: deterministic pseudo-embedding (NOT for production).
    # A SHA-256 digest is 32 bytes long, so every vector is 32-dimensional
    # (not 128); slicing the digest to 128 bytes would have no effect.
    return [
        [byte / 255.0 for byte in hashlib.sha256(text.encode('utf-8')).digest()]
        for text in texts
    ]

# One chunk per page; a falsy page text is replaced by '' before embedding.
page_texts = [page['text'] if page['text'] else '' for page in pages]
embeddings = embed_texts(page_texts)
# Notebook display: number of vectors and the dimensionality of the first one.
len(embeddings), len(embeddings[0])

## 3. Store in Chroma & Retrieval
Using an ephemeral in-memory Chroma collection (persist directory optional).

In [None]:
# Create the Chroma client and fetch (or create) the 'notes' collection.
client = chromadb.Client()
collection = client.get_or_create_collection('notes')
# Upsert pages one at a time; ids follow the pattern 'page-<n>' with n
# starting at 1.
for idx, (p, emb) in enumerate(zip(pages, embeddings)):
    chunk_id = f'page-{idx+1}'
    collection.upsert(
        ids=[chunk_id],
        embeddings=[emb],
        metadatas=[{'page_no': p['page_no']}],
        documents=[p['text']],
    )
# Notebook display: number of stored chunks.
collection.count()

### Similarity Search Helper

In [None]:
def retrieve(query: str, k: int = 3):
    """Return up to ``k`` stored chunks closest to ``query``.

    The query is embedded with ``embed_texts`` and looked up in the
    module-level ``collection``.

    Args:
        query: Free-text search string.
        k: Number of results requested from the collection.

    Returns:
        A list of dicts with keys ``id``, ``text``, ``metadata`` and
        ``distance``; ``distance`` is ``None`` when the query result carries
        no ``'distances'`` entry.
    """
    query_vector = embed_texts([query])[0]
    res = collection.query(query_embeddings=[query_vector], n_results=k)
    has_distances = 'distances' in res
    # Result fields are nested per query; index 0 selects our single query.
    return [
        {
            'id': res['ids'][0][pos],
            'text': chunk_text,
            'metadata': res['metadatas'][0][pos],
            'distance': res['distances'][0][pos] if has_distances else None,
        }
        for pos, chunk_text in enumerate(res['documents'][0])
    ]

# Quick check of the helper; the notebook displays the returned hits.
retrieve('photosynthesis energy')

## 4. Structured RAG Generation
We craft a strict JSON instruction. If Gemini is not available, a stub returns an example structure.
Schema (simplified):
```json
{ "items": [ { "question": "<string>", "answer": "<string>", "page_references": ["<string>"] } ] }
```

In [None]:
# JSON Schema for the generator output: an object with a required `items`
# array whose entries are question/answer records with string page references.
_QA_ITEM_SCHEMA = {
    'type': 'object',
    'properties': {
        'question': {'type': 'string'},
        'answer': {'type': 'string'},
        'page_references': {'type': 'array', 'items': {'type': 'string'}},
    },
    'required': ['question', 'answer', 'page_references'],
}

SCHEMA = {
    'type': 'object',
    'properties': {
        'items': {'type': 'array', 'items': _QA_ITEM_SCHEMA},
    },
    'required': ['items'],
}

def rag_generate(question: str, k: int = 3):
    ctx_hits = retrieve(question, k=k)
    ctx_block = '

'.join([f
 for h in ctx_hits])
    if genai and GOOGLE_API_KEY:
        instruction = (
            'You are a strict academic assistant. Use ONLY the provided context. '
            'Return compact JSON with key "items" (array). Each item: question, answer, page_references. '
            'If insufficient info, answer with an empty items array.'
        )
        prompt = f
        model = genai.GenerativeModel('gemini-1.5-flash')
        resp = model.generate_content(prompt)
        text = resp.text
    else:
        # Stub JSON for offline demo
        refs = [f