<a href="https://colab.research.google.com/github/Mohammed-Atef2004/ASK-project/blob/master/%F0%9F%8E%93_GP_Doc2Pod_RAG_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title 🚑 Emergency Fix: Force NumPy & Pandas Compatibility
# @markdown Run this cell immediately after "Restart Session".
# @markdown It reinstalls both NumPy and Pandas to ensure they match.

import os

print("🔄 Fixing dependencies... (This takes ~30 seconds)")

# 1. Uninstall existing conflicting versions
os.system("pip uninstall -y numpy pandas")

# 2. Install compatible versions
# We force NumPy < 2.0 and let pip find a Pandas version that matches it.
os.system("pip install \"numpy<2.0\" pandas")

# 3. Verify
import numpy
import pandas
print(f"✅ NumPy Version: {numpy.__version__}")
print(f"✅ Pandas Version: {pandas.__version__}")

if numpy.__version__.startswith("2"):
    print("❌ ERROR: NumPy is still 2.x. Please Restart Session and try again.")
else:
    print("👉 Success! Now run the rest of the notebook.")

🔄 Fixing dependencies... (This takes ~30 seconds)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# @title 🛠️ Step 1: Install Dependencies & System Tools
# @markdown This installs OCR tools (PaddlePaddle), Vector Database (Chroma), and AI models.
# @markdown **Note:** This installs specific nightly builds of PyTorch. It may take 3-5 minutes.

# System Tools (Required for PDF conversion)
!apt-get update -qq
!apt-get install -y poppler-utils -qq
!which pdfinfo  # Verify installation

# PaddleOCR (for extracting text/tables)
!pip install paddlepaddle-gpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ -q
!pip install paddlex==3.3.12 "paddleocr[all]" -q

# Utils (PDF handling, Regex, etc.)
!pip install langchain==0.0.354 pdf2image sentence-transformers regex uuid beautifulsoup4 pylatexenc -q

# PyTorch (Nightly Build )
!pip install torch==2.11.0.dev20260107+cu126 torchvision==0.25.0.dev20260107+cu126 torchaudio==2.11.0.dev20260107+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126 -q
!pip install transformers accelerate --upgrade -q

# Vision Language Model Utils
!pip install qwen-vl-utils -q

# Vector Database & Data Handling
!pip install chromadb pandas -q

# Numpy Fix (Uninstall/Reinstall to prevent version conflicts)
!pip uninstall -y numpy -q
!pip install numpy==1.26.4 -q

print("✅ Step 1 Complete: All dependencies installed.")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package poppler-utils.
(Reading database ... 121852 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.12_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.12) ...
Setting up poppler-utils (22.02.0-2ubuntu0.12) ...
Processing triggers for man-db (2.10.2-1) ...
/usr/bin/pdfinfo
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 GB[0m [31m671.6 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m

In [None]:
# @title 📚 Step 2: Import Libraries
# @markdown We import standard tools, OCR engines, and AI models here.

import os
import gc
import re
import uuid
import html
import json
import glob
import time
import torch
import paddle
import tempfile
import pandas as pd
import chromadb
import numpy as np
from PIL import Image
from tqdm.notebook import tqdm
from typing import Any, List, Dict
from io import StringIO
from pathlib import Path
from bs4 import BeautifulSoup

# AI & Vision Import
from qwen_vl_utils import process_vision_info
from paddleocr import PPStructureV3, PaddleOCRVL
from pylatexenc.latex2text import LatexNodes2Text
from sentence_transformers import SentenceTransformer
from pdf2image import convert_from_path, pdfinfo_from_path
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

print("✅ Step 2 Complete: Libraries loaded successfully.")

[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


✅ Step 2 Complete: Libraries loaded successfully.


In [None]:
# @title 📂 Step 3: Mount Drive & Configure Unified Workspace
# @markdown We set up a comprehensive directory structure to handle the full pipeline:
# @markdown PDF -> Images -> OCR JSON -> Embeddings -> ChromaDB -> Podcast Audio.

import os
from pathlib import Path
from google.colab import drive

# Mount Drive
drive.mount('/content/drive')

# Define Root Path
BASE_DIR = Path("/content/drive/MyDrive/Doc2Pod_System")

# Define Sub-Directories
DATA_DIR = BASE_DIR / "Data"
PDF_DIR = DATA_DIR / "pdfs"                    # Raw PDFs go here
IMAGE_DIR = DATA_DIR / "images"                # Extracted images from slides
JSON_DIR = DATA_DIR / "json"                   # Raw OCR output (per page)
UNIFIED_DIR = DATA_DIR / "unified_json"        # Cleaned/Merged text chunks
EMBEDDING_DIR = DATA_DIR / "embedding_inputs"  # Pre-processed data for Vector DB
DB_DIR = DATA_DIR / "chroma_db"                # The Vector Database (Chroma)
OUTPUT_DIR = BASE_DIR / "episodes"             # Final Scripts & Audio files

# 4. Create All Directories
folders_to_create = [
    BASE_DIR, DATA_DIR, PDF_DIR, IMAGE_DIR, JSON_DIR,
    UNIFIED_DIR, EMBEDDING_DIR, DB_DIR, OUTPUT_DIR
]

for d in folders_to_create:
    d.mkdir(parents=True, exist_ok=True)

print(f"✅ Workspace Ready at: {BASE_DIR}")
print(f"   📂 Data Folder:     {DATA_DIR}")
print(f"       ├── Input PDFs: {PDF_DIR.name}")
print(f"       ├── Vector DB:  {DB_DIR.name}")
print(f"       └── Images:     {IMAGE_DIR.name}")
print(f"   🎙️ Final Output:    {OUTPUT_DIR}")

print("✅ Step 3 Complete: Drive mounted successfully")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Workspace Ready at: /content/drive/MyDrive/Doc2Pod_System
   📂 Data Folder:     /content/drive/MyDrive/Doc2Pod_System/Data
       ├── Input PDFs: pdfs
       ├── Vector DB:  chroma_db
       └── Images:     images
   🎙️ Final Output:    /content/drive/MyDrive/Doc2Pod_System/episodes
✅ Step 3 Complete: Drive mounted successfully


In [None]:
# @title 🧠 Step 4: Model Managers (Lazy Loading)
# @markdown Defines functions to load PaddleOCR and Qwen-VL only when needed to save GPU memory.

def load_PaddleOCRVL():
    """Initializes the PaddleOCR pipeline for text & layout detection."""

    device = "gpu" if torch.cuda.is_available() else "cpu"
    print(f"[*] Loading PaddleOCR on {device}...")

    pipeline = PaddleOCRVL(
        device=device,
        use_layout_detection=True,
        format_block_content=True,
        use_chart_recognition=True
    )
    return pipeline


def load_qwen3_vl_model():
    """Loads Qwen3-VL (4B) for describing images and charts."""

    print("[*] Loading Qwen3-VL (Vision Model)...")
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen3-VL-4B-Instruct",
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    processor = AutoProcessor.from_pretrained(
        "Qwen/Qwen3-VL-4B-Instruct",
        min_pixels=256*28*28,
        max_pixels=768*28*28,
        trust_remote_code=True
    )

    model.eval()
    return model, processor


def load_qwen3_thinking_model():
    """Loads Qwen3-4B-Thinking for script generation."""
    print("[*] Loading Qwen3-4B-Thinking...")
    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen3-4B-Thinking-2507",
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "Qwen/Qwen3-4B-Thinking-2507",
        trust_remote_code=True
    )
    return model, tokenizer

class LazyPipelineManager:
    """Manages models so they aren't loaded until the exact moment they are needed."""

    def __init__(self):
        self._vl_pipeline = None
        self._vl_loaded = False
        self._qwen3_vl = None
        self._processor = None
        self._qwen3_vl_loaded = False
        self._qwen3_thinking = None
        self._thinking_tokenizer = None
        self._thinking_loaded = False

    def get_vl_pipeline(self):
        if not self._vl_loaded:
            self._vl_pipeline = load_PaddleOCRVL()
            self._vl_loaded = True
        return self._vl_pipeline

    def get_qwen3_vl_model(self):
        if not self._qwen3_vl_loaded:
            self._qwen3_vl, self._processor = load_qwen3_vl_model()
            self._qwen3_vl_loaded = True
        return self._qwen3_vl, self._processor

    def get_qwen3_thinking_model(self):
        if not self._thinking_loaded:
            self._qwen3_thinking, self._thinking_tokenizer = load_qwen3_thinking_model()
            self._thinking_loaded = True
        return self._qwen3_thinking, self._thinking_tokenizer

    def unload_all(self):
        """Explicitly unloads all currently loaded models and clears GPU cache."""
        print("[*] Unloading all models from GPU...")
        if self._vl_pipeline:
            del self._vl_pipeline
            self._vl_pipeline = None
            self._vl_loaded = False

        if self._qwen3_vl:
            del self._qwen3_vl
            self._qwen3_vl = None
            del self._processor
            self._processor = None
            self._qwen3_vl_loaded = False

        if self._qwen3_thinking:
            del self._qwen3_thinking
            self._qwen3_thinking = None
            del self._thinking_tokenizer
            self._thinking_tokenizer = None
            self._thinking_loaded = False

        torch.cuda.empty_cache()
        gc.collect()
        print("[*] GPU memory cleared.")

print("✅ Step 4 Complete: Model Managers defined.")


✅ Step 4 Complete: Model Managers defined.


In [None]:
# @title 🧹 Step 5: Text Cleaning & Normalization Utils
# @markdown Helper functions to clean code snippets, convert HTML tables to text, and fix LaTeX formatting.

# Constants for LaTeX normalization
SUPERSCRIPTS = {"0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹", "+": "⁺", "-": "⁻", "=": "⁼", "T": "ᵀ", "n": "ⁿ", "i": "ⁱ"}
SUBSCRIPTS = {"0": "₀", "1": "₁", "2": "₂", "3": "₃", "4": "₄", "5": "₅", "6": "₆", "7": "₇", "8": "₈", "9": "₉", "i": "ᵢ", "j": "ⱼ", "k": "ₖ", "n": "ₙ"}

def normalize_code(raw_code):
    """Cleans up OCR errors common in code blocks (e.g. '0' vs 'O')."""

    if not raw_code.strip():
       return "Code snippet: [empty]"

    lines = raw_code.split('\n')
    cleaned_lines = []

    for line in lines:
        stripped = line.rstrip()
        stripped = re.sub(r'\bO\b', '0', stripped)
        stripped = re.sub(r'\b[lI]\b', '1', stripped)
        stripped = re.sub(r'\bS\b(?=\d)', '5', stripped)
        stripped = re.sub(r'! =', '!=', stripped)
        stripped = re.sub(r'< =', '<=', stripped)
        stripped = re.sub(r'> =', '>=', stripped)
        cleaned_lines.append(stripped)

    normalized = '\n'.join(cleaned_lines).strip()

    if not normalized:
       return "Code snippet: [unrecognized content]"

    return f"Code snippet:\n{normalized}"


def html_table_to_text(html_table):
    """Parses HTML tables returned by PaddleOCR into pipe-separated text."""

    soup = BeautifulSoup(html_table, "html.parser")
    table = soup.find("table")

    if not table:
       return "Table"

    grid, rowspan_map = [], {}
    for tr in table.find_all("tr"):
        row, col_idx = [], 0
        while col_idx in rowspan_map:
            row.append(rowspan_map[col_idx]["text"])
            rowspan_map[col_idx]["rows"] -= 1
            if rowspan_map[col_idx]["rows"] == 0:
               del rowspan_map[col_idx]
            col_idx += 1
        for cell in tr.find_all(["td", "th"], recursive=False):
            text = cell.get_text(" ", strip=True) or "—"
            text = latex_to_unicode(text)
            colspan = int(cell.get("colspan", 1))
            rowspan = int(cell.get("rowspan", 1))
            for _ in range(colspan):
                row.append(text)
                if rowspan > 1:
                   rowspan_map[col_idx] = {"text": text, "rows": rowspan - 1}
                col_idx += 1
        grid.append(row)
    max_cols = max(len(r) for r in grid)
    for r in grid: r.extend([""] * (max_cols - len(r)))
    lines = []
    for row in grid:
        if any(cell.strip() for cell in row):
           lines.append(" | ".join(row))
    return "\n".join(lines)

def normalize_latex(text):
    """Basic cleanup for LaTeX strings."""

    text = re.sub(r"\${1,2}", "", text)
    text = text.replace("\\\\", "\\")
    text = re.sub(r"\\quad|\\qquad|\\,", " ", text)
    text = re.sub(r"\\begin\{.*?\}|\\end\{.*?\}", "", text)
    text = re.sub(r"\b[clr]{1,3}\b", "", text)
    return text

def latex_to_unicode(text):
    """Converts LaTeX math symbols to their Unicode equivalents for better readability."""
    if not text:
      return text
    text = normalize_latex(text)
    try:
       text = LatexNodes2Text().latex_to_text(text)
    except Exception:
       pass

    replacements = {
        r"\\theta": "θ", r"\\alpha": "α", r"\\beta": "β", r"\\gamma": "γ",
        r"\\lambda": "λ", r"\\mu": "μ", r"\\sigma": "σ", r"\\pi": "π",
        r"\\geq": "≥", r"\\leq": "≤", r"\\neq": "≠", r"\\approx": "≈",
        r"\\pm": "±", r"\\cdot": "·", r"\\times": "×", r"\\rightarrow": "→", r"\\infty": "∞"
    }
    for k, v in replacements.items():
        text = re.sub(k, v, text)

    # Handle superscripts/subscripts
    text = re.sub(r"\^\{([^}]+)\}",lambda m: "".join(SUPERSCRIPTS.get(c, c) for c in m.group(1)),text)
    text = re.sub(r"\^\(([^)]+)\)",lambda m: "".join(SUPERSCRIPTS.get(c, c) for c in m.group(1)),text)
    text = re.sub(r"_\{([^}]+)\}",lambda m: "".join(SUBSCRIPTS.get(c, c) for c in m.group(1)),text)
    text = re.sub(r"\^([0-9Tni+\-=])",lambda m: SUPERSCRIPTS.get(m.group(1), m.group(1)),text)
    text = re.sub(r"_([0-9ijkn])",lambda m: SUBSCRIPTS.get(m.group(1), m.group(1)),text)
    text = re.sub(r"\\[a-zA-Z]+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def is_personal_info(text):
    """Detect admin/link-only content to exclude from chunks."""
    if not text.strip():
        return False

    lower_text = text.lower()

    # ❌ Admin content
    admin_keywords = [
        "email", "@", "office hours", "grading", "resources", "credit",
        "references", "contact", "phone", "exam", "midterm", "quiz",
        "practical", "hands-on","proposed course outline", "helpful resources", "grading scheme","resources"
    ]
    if any(kw in lower_text for kw in admin_keywords):
        return True

    # ❌ Link-only content (e.g., YouTube links)
    url_patterns = [r'https?://', r'www\.', r'\.com', r'\.eg']
    if any(re.search(pattern, text) for pattern in url_patterns):
        # If the text is mostly a URL (short + contains link)
        words = text.split()
        if len(words) <= 3 and any(re.search(p, text) for p in url_patterns):
            return True

    # ❌ Email/phone patterns
    email_pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
    phone_pattern = r"\+?\d[\d\s-]{5,}\d"
    office_pattern = r"\b(Room|Building|Office)\b"
    if (re.search(email_pattern, text) or
        re.search(phone_pattern, text) or
        re.search(office_pattern, text)):
        return True

    return False

print("✅ Step 5 Complete: Text utilities loaded.")

✅ Step 5 Complete: Text utilities loaded.


In [None]:
# @title 👁️ Step 6: Visual Understanding (Qwen-VL)
# @markdown Logic to recover text from charts and diagrams using the Vision Language Model.

def recover_image_block_qwen(bbox, page_img, pipeline_manager, max_res=800):
    """
    Crops an image region and sends it to Qwen-VL for description.
    """
    x1, y1, x2, y2 = map(int, bbox)
    crop_w, crop_h = x2-x1, y2-y1

    # Skip tiny noise artifacts
    if crop_w < 50 or crop_h < 50:
        return {"type": "image", "text": "[Tiny image skipped]", "recovered": False}

    # Resize if too large to save token cost/time
    cropped = page_img.crop((x1, y1, x2, y2))
    scale = min(max_res / crop_w, max_res / crop_h, 1.0)
    if scale < 1.0:
        cropped = cropped.resize((int(crop_w*scale), int(crop_h*scale)), Image.Resampling.LANCZOS)

    if pipeline_manager:
        model, processor = pipeline_manager.get_qwen3_vl_model()
        try:
            # Re-crop from original for maximum quality
            cropped = page_img.crop((x1, y1, x2, y2))

            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": cropped},
                    {"type": "text", "text": (
                        "Extract and explain all readable textual and semantic information from this image. "
                        "If it is a chart, diagram, table, or equation, describe it clearly in technical English."
                        "- If absolutely nothing is readable, output exactly: [NO READABLE CONTENT]"
                    )}
                ]
            }]

            text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, _ = process_vision_info(messages)

            inputs = processor(
                text=[text_prompt],
                images=image_inputs,
                return_tensors="pt"
            ).to(model.device)

            # Generate description
            with torch.inference_mode():
                generated_ids = model.generate(**inputs, max_new_tokens=512)

            output_text = processor.batch_decode(
                generated_ids[:, inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            )[0].strip()

            # Cleanup
            del inputs, generated_ids, image_inputs, text_prompt, messages
            cropped.close()
            torch.cuda.empty_cache()
            gc.collect()

            if "NO READABLE CONTENT" in output_text.upper():
              return {"text": "[No content extracted]", "type": "image", "recovered": False}

            return {
                "text": output_text if output_text else "[No content extracted]",
                "type": "image",
                "recovered": bool(output_text)
            }

        except Exception as e:
            try:
                cropped.close()
                del cropped
                torch.cuda.empty_cache()
            except: pass
            print(f"❌ Qwen-VL image recovery failed: {e}")
            return {"text": f"[Image recovery failed: {str(e)}]", "type": "image", "recovered": False}

print("✅ Step 6 Complete: Vision Logic loaded.")

✅ Step 6 Complete: Vision Logic loaded.


In [None]:
# @title 📄 Step 7: Layout Analysis Helper
# @markdown Helper function to determine if two text blocks should be merged based on distance and alignment.

def can_merge(prev_unit, blk, page_no, y_ratio=0.1, indent_ratio=0.05):
    """
    Checks if 'blk' is visually close enough to 'prev_unit' to be considered the same paragraph.
    """
    if prev_unit is None:
       return False
    if prev_unit["page"] != page_no:
       return False

    curr_type = blk.get("label", "text")
    prev_type = prev_unit["type"]

    # Only merge same-type blocks (text with text)
    if curr_type != prev_type:
      return False
    if curr_type not in {"text", "paragraph"}:
      return False

    # Check geometric proximity
    prev_x1, prev_y1, prev_x2, prev_y2 = prev_unit["bbox"]
    curr_x1, curr_y1, curr_x2, curr_y2 = blk["bbox"]

    prev_height = prev_y2 - prev_y1
    page_width = max(prev_x2, curr_x2)

    # Check Vertical Gap
    max_gap = max(prev_height * y_ratio, 10)
    if abs(curr_y1 - prev_y2) > max_gap:
      return False

    # Check Indentation difference
    max_indent_diff = max(page_width * indent_ratio, 10)
    if abs(curr_x1 - prev_x1) > max_indent_diff:
       return False

    # Heuristic: Don't merge if previous line ends with sentence terminator
    prev_text = prev_unit["text"].strip()
    if prev_text.endswith((".", ":", "؟", "!", ";")):
       return False

    # Heuristic: Don't merge bullets
    bullet_pattern = r"^(\-|\*|\u2022|\d+\.)\s+"
    if re.match(bullet_pattern, blk.get("content", "").strip()):
       return False

    # Heuristic: Don't merge headers
    if prev_text.isupper() and len(prev_text.split()) < 6:
       return False

    return True

print("✅ Step 7 Complete: Layout helper loaded.")

✅ Step 7 Complete: Layout helper loaded.


In [None]:
# @title 🧱 Step 8: Raw Block Processing Logic
# @markdown Functions to convert raw OCR blocks into logical units (Paragraphs, Tables, Image Descriptions).

def to_dict(obj: Any) -> Any:
    """Helper to make custom objects JSON serializable."""
    if obj is None: return None
    if hasattr(obj, 'numpy') and callable(obj.numpy):
        try: return obj.numpy().tolist()
        except: return str(obj)
    elif isinstance(obj, np.ndarray): return obj.tolist()
    elif isinstance(obj, Path): return str(obj)
    elif isinstance(obj, (bytes, bytearray)): return obj.decode('utf-8', errors='replace')
    elif isinstance(obj, (tuple, set)): return [to_dict(item) for item in obj]
    elif hasattr(obj, '__dict__') and not isinstance(obj, type): return to_dict(obj.__dict__)
    elif isinstance(obj, dict): return {k: to_dict(v) for k, v in obj.items()}
    elif isinstance(obj, list): return [to_dict(item) for item in obj]
    try:
        json.dumps(obj)
        return obj
    except: return str(obj)

    return True


def is_valid_title(text):
    """Heuristic to check if a text block looks like a Section Header."""
    if not isinstance(text, str):
        return False
    text = text.strip()
    if len(text) < 5 or len(text) > 100:  # ← Increased min length
        return False

    # ❌ Reject LaTeX/math
    latex_patterns = [r'\$.*?\$', r'\\[a-zA-Z]+', r'\{.*?\}']
    if any(re.search(p, text) for p in latex_patterns):
        return False

    # ❌ Reject bullets/lists
    if text.startswith(('•', '-', '—', '*', '●', '·', '→')):
        return False

    # ❌ Reject admin content
    admin_keywords = ["email", "@", "office hours", "grading", "resources", "credit", "references"]
    if any(kw in text.lower() for kw in admin_keywords):
        return False

    # ✅ Accept titles with:
    # - Multiple words (≥2)
    # - No trailing punctuation
    # - Not sentence-like
    words = text.split()
    if len(words) < 2:
        return False
    if text.endswith(('.', '!', ';')) and not text.endswith('...'):
        return False
    structural_keywords = ["comparison", "levels", "deployment", "challenges", "protocols", "domains"]
    if any(kw in text.lower() for kw in structural_keywords):
        return True

    return True

def blocks_to_units(pdf_data, pipeline_manager):
    """
    Iterates through OCR blocks, merges neighbors, parses tables, and recovers images using Qwen-VL.
    """
    units = []
    unit_idx = 0

    for page in pdf_data["pages"]:
        page_no = page["page_number"]
        page_img = Image.open(page["image_path"])
        blocks = page.get("parsing_res_list", [])

        # Filter and Sort Blocks
        valid_blocks = [b for b in blocks if b.get("bbox") and len(b.get("bbox", [])) == 4]
        blocks = sorted(valid_blocks, key=lambda b: (b.get("bbox")[1], b.get("bbox")[0]))

        prev_unit = None
        for blk in blocks:
            text = blk.get("content", "").strip()
            if is_personal_info(text): continue

            t = blk.get("label", "text")
            # Skip noise
            if t == "number" or (t == "text" and len(text) <= 2): continue

            # Handle Tables
            if t in ("table", "chart"):
                text = html_table_to_text(text)
                text = html.unescape(text)

            # Handle Images (The Vision Part)
            if t in ("table","image", "chart") and ((not text) or len(text) <= 2 or (text=="Table")):
                bbox = blk.get("bbox", [0,0,0,0])
                recovered = recover_image_block_qwen(bbox, page_img, pipeline_manager)

                if recovered.get("recovered"):
                    units.append({
                        "text": recovered["text"],
                        "type": "image",
                        "page": page_no,
                        "unit_id": f"u{unit_idx:06d}",
                        "bbox": bbox
                    })
                    unit_idx += 1
                    continue
                else:
                    continue

            if not text: continue

            # Attempt to Merge with previous block (for broken paragraphs)
            if prev_unit and can_merge(prev_unit, blk, page_no):
                prev_unit["text"] += " " + text
                prev_unit["bbox"][2] = max(prev_unit["bbox"][2], blk["bbox"][2])
                prev_unit["bbox"][3] = max(prev_unit["bbox"][3], blk["bbox"][3])
                continue

            # New Unit
            units.append({
                "text": text,
                "type": t,
                "page": page_no,
                "unit_id": f"u{unit_idx:06d}",
                "bbox": blk.get("bbox", [0,0,0,0])
            })
            prev_unit = units[-1]
            unit_idx += 1
    return units

print("✅ Step 8 Complete: Block processors defined.")

✅ Step 8 Complete: Block processors defined.


In [None]:
# @title 🍕 Step 9: Semantic Chunking & Embedding Prep (Admin-Safe + Structural Keywords)
# @markdown
# - Groups text by headers
# - Excludes ENTIRE admin pages (e.g., "Helpful Resources", YouTube links)
# - Promotes structural keywords ("levels", "comparison") to top-level sections
# - Enforces page-aligned subchunking for podcast coherence


# Helper: Detect Level boundaries in text (Level-2, Level-3, etc.) – GENERAL PURPOSE
def detect_level_boundaries(text, units):
        """Returns list of (start_idx, level_name) tuples marking Level-X boundaries."""
        boundaries = []
        # GENERALIZED PATTERN: Matches "Level-2", "Level 3", "Level4", etc. WITHOUT domain keywords
        level_pattern = r'Level[-\s]*(\d+)'
        for idx, u in enumerate(units):
            u_text = u.get("text", "").strip()
            if not u_text:
                continue
            # Check if this unit contains a Level-X marker
            match = re.search(level_pattern, u_text, re.IGNORECASE)
            if match:
                level_num = match.group(1)
                level_name = f"Level-{level_num}"
                boundaries.append((idx, level_name))
        return boundaries

def process_units_to_chunks(units, emb_model):
    """
    Hybrid chunking that works for ANY CS lecture:
    - Uses structural signals (Strategy 1/2) to find candidate titles.
    - Dynamically filters to keep ONLY top-level sections.
    - Preserves intro and merges subtopics.
    - Excludes admin/link-only content AND entire admin pages.
    """
    chunks = []
    main_title_pages = set()
    strategy_1_pages = set()
    LARGE_ELEMENT_TYPES = {"image", "table", "chart", "display_formula", "algorithm"}

    # === PASS 0: PRE-FILTER OUT ADMIN PAGES ENTIRELY ===
    # First, identify pages with admin titles
    page_to_units = {}
    for u in units:
        page_no = u["page"]
        if page_no not in page_to_units:
            page_to_units[page_no] = []
        page_to_units[page_no].append(u)

    admin_pages = set()
    for page_no, page_units in page_to_units.items():
        if page_units:
            first_text = page_units[0].get("text", "").strip()
            if is_personal_info(first_text):
                print(f"⏭️ Skipping entire page {page_no} (admin title): '{first_text}'")
                admin_pages.add(page_no)

    # Filter out all units from admin pages
    filtered_units = [u for u in units if u["page"] not in admin_pages]
    units = filtered_units

    # Rebuild page_to_units with filtered units
    page_to_units = {}
    for u in units:
        page_no = u["page"]
        if page_no not in page_to_units:
            page_to_units[page_no] = []
        page_to_units[page_no].append(u)

    # === PASS 1: Detect ALL title candidates ===
    for page_no, page_units in page_to_units.items():
        if not page_units: continue

        # Unified Strategy 1 detection
        first_unit = page_units[0]
        first_text = first_unit.get("text", "").strip()
        y1 = first_unit.get("bbox", [0,0,0,0])[1]

        # Check if first unit is a valid title candidate
        is_ocr_title = first_unit["type"] == "paragraph_title"
        is_positional_title = (
            (y1 < 300) and
            (5 < len(first_text) < 100) and
            first_unit.get("type") not in LARGE_ELEMENT_TYPES
        )
        is_valid = is_valid_title(first_text)

        if (is_ocr_title or is_positional_title) and is_valid:
            # Calculate content weight (ignoring first unit)
            content_weight = sum(
                5 if u["type"] in LARGE_ELEMENT_TYPES else 1
                for u in page_units[1:]
            )

            # NEW: Check for structural keywords
            structural_keywords = ["comparison", "levels", "deployment", "challenges", "protocols", "domains"]
            has_structural_keyword = any(kw in first_text.lower() for kw in structural_keywords)

            # STRATEGY 1: Single-title OR low content OR structural keyword
            if len(page_units) == 1 or content_weight < 1 or has_structural_keyword:
                main_title_pages.add(page_no)
                strategy_1_pages.add(page_no)
                print(f"✅ [STRATEGY 1] Title detected on Page {page_no}: '{first_text}'")
                continue

            # Strategy 2: Title with substantial content
            else:
                main_title_pages.add(page_no)
                print(f"✅ [STRATEGY 2] Title detected on Page {page_no}: '{first_text}' (IoT-style (with content))")

    # === CRITICAL: Filter to TOP-LEVEL sections only ===
    real_strategy_1_pages = {p for p in strategy_1_pages if p != 1}

    if real_strategy_1_pages:
        main_title_pages = real_strategy_1_pages
        print("🎯 Using all Strategy 1 pages as top-level sections.")
    else:
        print("🎯 Falling back to Strategy 2 titles.")

    if  main_title_pages:
        print(f"🎯 Final title pages used: {sorted(main_title_pages)}")

    # === PASS 2: Build Chunks (USE EACH PAGE ONCE) ===
    first_main_title_idx = None
    for i, u in enumerate(units):
        if u["page"] in main_title_pages and is_valid_title(u.get("text", "").strip()):
            first_main_title_idx = i
            break

    # Handle intro (exclude admin/link content)
    if first_main_title_idx is not None and first_main_title_idx > 0:
        intro_units = [
            u for u in units[:first_main_title_idx]
            if u.get("text", "").strip() and not is_personal_info(u.get("text", ""))
        ]
        if intro_units:
            chunks.append({
                "concept": "Introduction",
                "content": intro_units,
                "page_start": intro_units[0]["page"]
            })
        units = units[first_main_title_idx:]

    # Build chunks (skip admin/link units)
    current_concept = "General Section"
    current_content = []
    current_page_start = None
    initial_chunks = []
    used_title_pages = set()

    for u in units:
        u_text = u.get("text", "").strip()
        if not u_text or is_personal_info(u_text):
            continue  # ← Skip admin/link-only units

        page_no = u["page"]
        y1 = u.get("bbox", [0,0,0,0])[1]
        is_eligible_title = (
            page_no in main_title_pages and
            page_no not in used_title_pages and
            ((u["type"] == "paragraph_title" or (y1 < 300 and len(u_text) < 100 and u["type"] not in LARGE_ELEMENT_TYPES )) and is_valid_title(u_text))
        )

        if is_eligible_title:
            if current_content:
                initial_chunks.append({
                    "concept": current_concept,
                    "content": current_content,
                    "page_start": current_page_start
                })
            current_concept = u_text
            current_content = []
            current_page_start = page_no
            used_title_pages.add(page_no)
            print(f"📌 Starting new chunk: '{current_concept}' (Page {page_no})")
        else:
            current_content.append(u)

    if current_content:
        initial_chunks.append({
            "concept": current_concept,
            "content": current_content,
            "page_start": current_page_start
        })

    print(f"📦 Initial chunk count: {len(initial_chunks)}")

    # === PASS 3: Merge by TITLE Similarity (threshold=0.85) ===
    final_chunks = []
    i = 0
    while i < len(initial_chunks):
        current_chunk = initial_chunks[i]
        merged_content = current_chunk["content"]
        merged_concept = current_chunk["concept"]
        merged_page_start = current_chunk["page_start"]

        j = i + 1
        while j < len(initial_chunks):
            next_chunk = initial_chunks[j]
            next_concept = next_chunk["concept"]

            if not merged_concept.strip() or not next_concept.strip():
                break

            try:
                emb1 = emb_model.encode([merged_concept], normalize_embeddings=True)
                emb2 = emb_model.encode([next_concept], normalize_embeddings=True)
                sim = cosine_similarity(emb1, emb2)[0][0]

                if sim >= 0.85:
                    print(f"🔗 Merging title '{merged_concept}' with '{next_concept}' (sim={sim:.2f})")
                    merged_content.extend(next_chunk["content"])
                    j += 1
                else:
                    print(f"🚫 NOT merging '{merged_concept}' and '{next_concept}' (sim={sim:.2f})")
                    break
            except Exception as e:
                print(f"⚠️ Title similarity failed: {e}")
                break

        final_chunks.append({
            "concept": merged_concept,
            "content": merged_content,
            "page_start": merged_page_start
        })
        i = j

    # Prepend Introduction
    if chunks:
        final_chunks = chunks + final_chunks

    print(f"📦 Final chunk count after merging: {len(final_chunks)}")
    for idx, ch in enumerate(final_chunks):
        print(f"   Chunk {idx+1}: '{ch['concept']}' (Pages: {ch['page_start']}+)")

    return final_chunks



def chunk_to_embedding_text(chunks, max_chars=1800):
    """
    Converts chunks to embedding-ready text with STRICT page integrity:
    - Never splits a page across subchunks
    - Each subchunk contains complete pages only
    - SPECIAL: For chunks containing "Level-X" patterns, splits by Level boundaries BEFORE page alignment
    """
    embeddings = []

    for chunk_idx, chunk in enumerate(chunks):
        concept = chunk.get("concept", "").strip()
        units = chunk.get("content", [])
        if not units:
            print(f"⚠️ Chunk {chunk_idx+1} ('{concept}') has no units. Skipping.")
            continue

        print(f"\n📦 Processing Chunk {chunk_idx+1}: '{concept}'")

        # Clean units (existing logic preserved)
        effective_units = []
        for u in units:
            u_text = u.get("text", "").strip()
            if not u_text: continue
            if u.get("type") in ("image", "chart") and "[Image recovery failed]" in u_text:
                continue
            if "display_formula" in u.get("type", "") or "$" in u_text:
                u_text = latex_to_unicode(u_text)
            elif u.get("type") == "algorithm":
                u_text = normalize_code(u_text)
            u_text = html.unescape(u_text)
            effective_units.append({**u, "effective_text": u_text})

        if not effective_units:
            print(f"⚠️ Chunk {chunk_idx+1} has no valid units after cleaning. Skipping.")
            continue

        full_text = "\n\n".join(u["effective_text"] for u in effective_units)
        total_pages = sorted(set(u["page"] for u in effective_units))
        print(f"   Pages: {total_pages} | Total chars: {len(full_text)}")

        # GENERAL CONDITION: Activate splitting if chunk concept mentions "level" AND has multiple units
        is_levels_chunk = "level" in concept.lower()
        level_subchunks = []

        if is_levels_chunk and len(effective_units) > 1:
            # Detect Level boundaries in the chunk
            boundaries = detect_level_boundaries(full_text, effective_units)
            if boundaries:
                print(f"   🔬 Detected {len(boundaries)} Level boundaries: {[name for _, name in boundaries]}")

                # Ensure first unit is a boundary if it looks like a Level
                first_unit_text = effective_units[0].get("text", "")
                if "Level-" in first_unit_text and not any(idx == 0 for idx, _ in boundaries):
                    # Extract level number from first unit if possible
                    match_first = re.search(r'Level[-\s]*(\d+)', first_unit_text, re.IGNORECASE)
                    if match_first:
                        level_num = match_first.group(1)
                        boundaries.insert(0, (0, f"Level-{level_num}"))
                    else:
                        boundaries.insert(0, (0, "Level-1"))  # Fallback

                # Split chunk into Level-based subchunks
                start_idx = 0
                for boundary_idx, level_name in boundaries:
                    if boundary_idx > start_idx:
                        level_subchunks.append({
                            "concept": f"{concept} ({level_name})",
                            "units": effective_units[start_idx:boundary_idx],
                            "level_name": level_name
                        })
                    start_idx = boundary_idx
                # Add final segment
                if start_idx < len(effective_units):
                    last_level = boundaries[-1][1] if boundaries else "Level-1"
                    level_subchunks.append({
                        "concept": f"{concept} ({last_level}+)",
                        "units": effective_units[start_idx:],
                        "level_name": f"{last_level}+"
                    })
                print(f"   ✂️  Split into {len(level_subchunks)} Level-based segments")
            else:
                # No boundaries detected → treat as single segment
                level_subchunks = [{
                    "concept": concept,
                    "units": effective_units,
                    "level_name": None
                }]
        else:
            # Non-Levels chunk → single segment (existing behavior)
            level_subchunks = [{
                "concept": concept,
                "units": effective_units,
                "level_name": None
            }]

        # Process each segment (Level-based or whole chunk) with page-aligned splitting
        for seg_idx, segment in enumerate(level_subchunks):
            seg_concept = segment["concept"]
            seg_units = segment["units"]
            seg_text = "\n\n".join(u["effective_text"] for u in seg_units)
            seg_pages = sorted(set(u["page"] for u in seg_units))

            if len(seg_text) <= max_chars:
                # Single subchunk (existing logic)
                print(f"   ✅ Single subchunk for '{seg_concept}' (≤{max_chars} chars)")
                embedding_input = f"Concept: {seg_concept}\nRole: section\n\n{seg_text}"
                embeddings.append({
                    "embedding_text": embedding_input,
                    "metadata": {
                        "concept": seg_concept,
                        "page": seg_units[0].get("page") if seg_units else chunk.get("page_start"),
                        "units": seg_units,
                        "total_subchunks": 1
                    }
                })
            else:
                # PAGE-ALIGNED SUBCHUNKING (existing logic preserved)
                print(f"   🔍 Splitting segment '{seg_concept}' (> {max_chars} chars)...")

                # Group units by page
                page_to_units = {}
                for u in seg_units:
                    page_to_units.setdefault(u["page"], []).append(u)

                # Create page-aligned subchunks
                subchunks = []
                current_pages = []
                current_text = ""

                for page_no in sorted(page_to_units.keys()):
                    page_units = page_to_units[page_no]
                    page_text = "\n\n".join(u["effective_text"] for u in page_units)

                    if current_text and (len(current_text) + len(page_text) + 2 > max_chars):
                        # Finalize current subchunk
                        subchunks.append({
                            "text": current_text.strip(),
                            "units": [u for p in current_pages for u in page_to_units[p]]
                        })
                        print(f"   📄 Subchunk {len(subchunks)}: Pages {current_pages} ({len(current_text)} chars)")

                        # Start new subchunk
                        current_pages = [page_no]
                        current_text = page_text
                    else:
                        current_pages.append(page_no)
                        current_text += ("\n\n" + page_text) if current_text else page_text

                # Add last subchunk
                if current_text:
                    subchunks.append({
                        "text": current_text.strip(),
                        "units": [u for p in current_pages for u in page_to_units[p]]
                    })
                    print(f"   📄 Subchunk {len(subchunks)}: Pages {current_pages} ({len(current_text)} chars)")

                print(f"   ✅ Created {len(subchunks)} page-aligned subchunks for '{seg_concept}'")

                # Create embeddings for each subchunk
                for idx, sc in enumerate(subchunks):
                    embedding_input = f"Concept: {seg_concept}\nRole: section\n\n{sc['text']}"
                    embeddings.append({
                        "embedding_text": embedding_input,
                        "metadata": {
                            "concept": seg_concept,
                            "page": sc["units"][0].get("page") if sc["units"] else chunk.get("page_start"),
                            "units": sc["units"],
                            "total_subchunks": len(subchunks)
                        }
                    })

    print(f"\n🎯 Total embedding inputs generated: {len(embeddings)}")
    return embeddings

print("✅ Step 9 Complete: Chunking logic improved for slides.")

✅ Step 9 Complete: Chunking logic improved for slides.


In [None]:
# @title ⚙️ Step 10: Main PDF Processing Loop (Memory Safe)
# @markdown Runs OCR on every page. Optimized to prevent GPU crashes.

def extract_pdf_info(pdf_path, pipeline):
    """
    Main Loop: PDF -> Images -> OCR -> JSON
    """
    pdf_name = pdf_path.stem

    # Use the global directories we set in Step 3
    pdf_image_dir = IMAGE_DIR / pdf_name
    pdf_json_dir = JSON_DIR / pdf_name

    for d in [pdf_image_dir, pdf_json_dir]:
        d.mkdir(parents=True, exist_ok=True)

    final_json_path = pdf_json_dir / f"{pdf_name}.json"

    # Get page count
    try:
        num_pages = pdfinfo_from_path(str(pdf_path))['Pages']
    except:
        print("⚠️ Could not read PDF info. Installing poppler again...")
        os.system("apt-get install -y poppler-utils")
        num_pages = pdfinfo_from_path(str(pdf_path))['Pages']

    pdf_result = {
        "file_name": pdf_path.name,
        "num_pages": num_pages,
        "pages": []
    }

    print(f"[*] Starting OCR for {pdf_name} ({num_pages} pages)...")
    pbar = tqdm(range(1, num_pages + 1), desc="OCR Processing", unit="page")

    for i in pbar:
        try:
            # 1. CLEAN MEMORY BEFORE STARTING (Critical for Colab)
            paddle.device.cuda.empty_cache()
            gc.collect()

            # 2. Convert PDF Page to Image (LOWER RES to save RAM)
            # Changed dpi=150 -> dpi=100 to prevent OOM
            page = convert_from_path(str(pdf_path), dpi=100, first_page=i, last_page=i)
            page_img = page[0]

            if page_img.width == 0 or page_img.height == 0: continue

            page_np = np.array(page_img).astype("uint8")
            image_path = pdf_image_dir / f"page_{i:03d}.png"
            page_img.save(image_path)

            # 3. Run PaddleOCR
            output = pipeline.predict(page_np)
            result = output[0]

            # 4. Parse Results
            parsing_res_list = [to_dict(block) for block in result.get("parsing_res_list", [])]
            layout_det_res = to_dict(result.get("layout_det_res", {}))

            pbar.set_postfix_str(f"Found {len(parsing_res_list)} blocks", refresh=True)

            pdf_result["pages"].append({
                "page_number": i,
                "image_path": str(image_path),
                "parsing_res_list": parsing_res_list,
                "layout_det_res": layout_det_res,
                "height": page_img.height,
                "width": page_img.width
            })

            # Cleanup RAM explicitly
            del page_img, page_np, page, output, result

        except Exception as e:
            print(f"⚠️ Error on page {i}: {e}")
            # If a page fails, we try to skip it to save the rest of the document
            continue

    # Save Final JSON
    with open(final_json_path, "w", encoding="utf-8") as f:
        json.dump(pdf_result, f, ensure_ascii=False, indent=2)

    print(f"✅ OCR Complete. Data saved to: {final_json_path}")
    return pdf_result

print("✅ Step 10 Complete: Extraction loop optimized.")

✅ Step 10 Complete: Extraction loop optimized.


In [None]:
# @title 🧠 Step 11: Embedding Model Helpers
# @markdown Loads the BGE-M3 model to convert text chunks into vectors.

def load_embedding_model():
    """Loads the SentenceTransformer model on GPU."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[*] Loading Embedding Model on {device}...")
    return SentenceTransformer("BAAI/bge-m3", device=device)

def get_embedding(text: str, embModel) -> List[float]:
    """Generates a normalized vector for a given text string."""
    # normalize_embeddings=True improves cosine similarity accuracy
    emb = embModel.encode([text], normalize_embeddings=True)
    return emb[0].tolist()

print("✅ Step 11 Complete: Embedding functions defined.")

✅ Step 11 Complete: Embedding functions defined.


In [None]:
# @title 🧪 TEST: Load Units from Drive & Run Chunking + Embedding Text Generation
# import json
# from pathlib import Path

# # --- CONFIGURE YOUR PDF NAME HERE ---
# PDF_NAME = "[IoT_25] Lecture 1"  # ← Change this if needed

# # --- LOAD EXISTING UNITS ---
# unified_dir = UNIFIED_DIR / PDF_NAME
# units_path = unified_dir / f"{PDF_NAME}_units.json"

# if not units_path.exists():
#     print(f"❌ Units file not found: {units_path}")
#     print("👉 Make sure you've run OCR at least once, or upload your own _units.json")
# else:
#     print(f"[*] Loading units from: {units_path}")
#     with open(units_path, "r", encoding="utf-8") as f:
#         units = json.load(f)
#     print(f"✅ Loaded {len(units)} units.")

#     # --- RUN CHUNKING ---
#     print("\n[*] Running process_units_to_chunks...")
#     embedding_model = load_embedding_model()  # From Step 11
#     chunks = process_units_to_chunks(units, embedding_model)

#     # --- SAVE CHUNKS ---
#     chunks_path = unified_dir / f"{PDF_NAME}_chunks_TEST.json"
#     with open(chunks_path, "w", encoding="utf-8") as f:
#         json.dump({"doc_id": PDF_NAME, "chunks": chunks}, f, ensure_ascii=False, indent=2)
#     print(f"✅ Generated {len(chunks)} chunks.")
#     print(f"💾 Saved chunks to: {chunks_path}")

#     # --- GENERATE EMBEDDING TEXTS (NEW STEP) ---
#     print("\n[*] Generating embedding-ready texts with page-aligned subchunking...")
#     embedding_texts = chunk_to_embedding_text(chunks)

#     # --- SAVE EMBEDDING INPUTS ---
#     embedding_dir = EMBEDDING_DIR / PDF_NAME
#     embedding_dir.mkdir(parents=True, exist_ok=True)
#     embedding_path = embedding_dir / f"{PDF_NAME}_embedding_input_texts_TEST.json"

#     with open(embedding_path, "w", encoding="utf-8") as f:
#         json.dump({
#             "doc_id": PDF_NAME,
#             "inputs": embedding_texts
#         }, f, ensure_ascii=False, indent=2)

#     print(f"✅ Generated {len(embedding_texts)} embedding inputs.")
#     print(f"💾 Saved embedding inputs to: {embedding_path}")

#     # --- PRINT SUMMARY ---
#     print("\n📋 Chunk Concepts:")
#     for i, ch in enumerate(chunks):
#         pages = sorted(set(u["page"] for u in ch.get("content", [])))
#         print(f"  {i+1}. '{ch['concept']}' (Pages: {pages})")

#     total_subchunks = sum(item["metadata"]["total_subchunks"] for item in embedding_texts)
#     print(f"\n📊 Total subchunks for podcast: {total_subchunks} (each ≈6 minutes)")

❌ Units file not found: /content/drive/MyDrive/Doc2Pod_System/Data/unified_json/[IoT_25] Lecture 1/[IoT_25] Lecture 1_units.json
👉 Make sure you've run OCR at least once, or upload your own _units.json


In [None]:
# @title ⚙️ Step 12: Extraction Pipeline (Refactored)

def run_friend_extraction_pipeline(pdf_path):
    """
    Runs the full OCR -> Chunking -> Embedding pipeline on a single PDF.
    Refactored from 'main()' to be modular.
    """
    print(f"🚀 Starting Extraction Pipeline for: {pdf_path.name}")

    pdf_name_raw = pdf_path.stem
    # Sanitize name to be used for collection name and now also for folder names
    pdf_name = re.sub(r'[^a-zA-Z0-9._-]', '_', pdf_name_raw)

    # 1. OCR & Layout Analysis
    pipeline_manager = LazyPipelineManager()
    VL_Pipeline = pipeline_manager.get_vl_pipeline()

    ocr_result = extract_pdf_info(pdf_path, VL_Pipeline)
    print("✅ OCR Complete.")

    # 2. Convert to Units
    units = blocks_to_units(ocr_result, pipeline_manager)

    # **Explicitly unload vision models to free up GPU memory**
    pipeline_manager.unload_all()

    # Use sanitized name for unified_json directory
    unified_dir = UNIFIED_DIR / pdf_name
    unified_dir.mkdir(parents=True, exist_ok=True)

    # Use sanitized name for units file
    units_path = unified_dir / f"{pdf_name}_units.json"
    with open(units_path, "w", encoding="utf-8") as f:
        json.dump(units, f, ensure_ascii=False, indent=2)
    print(f"✅ Created {len(units)} raw units.")

    # 3. Semantic Chunking
    print("[*] Loading Embedding Model...")
    embedding_model = load_embedding_model()

    chunks = process_units_to_chunks(units, embedding_model)
    # Use sanitized name for chunks file
    chunks_path = unified_dir / f"{pdf_name}_chunks.json"
    with open(chunks_path, "w", encoding="utf-8") as f:
        json.dump({"doc_id": pdf_name_raw, "chunks": chunks}, f, ensure_ascii=False, indent=2)
    print(f"✅ Generated {len(chunks)} chunks.")

    # 4. Prepare Embeddings
    # Use sanitized name for embedding_inputs directory
    embedding_dir = EMBEDDING_DIR / pdf_name
    embedding_dir.mkdir(parents=True, exist_ok=True)
    # Use sanitized name for embedding input texts file
    embedding_path = embedding_dir / f"{pdf_name}_embedding_input_texts.json"
    embedding_inputs = []

    for idx, item in enumerate(chunk_to_embedding_text(chunks)):
        text = item["embedding_text"]
        metadata = item["metadata"]
        if not text.strip(): continue

        embedding_inputs.append({
            "embedding_id": f"{pdf_name_raw}_c{idx:04d}", # Use raw name for embedding ID if needed for lookup back to original doc
            "page": metadata.get("page"),
            "text": text,
            "metadata": metadata
        })

    with open(embedding_path, "w", encoding="utf-8") as f:
          json.dump({"doc_id": pdf_name_raw, "inputs": embedding_inputs}, f, ensure_ascii=False, indent=2)

    # 5. Store in ChromaDB (Using Global DB_DIR)
    chroma_client = chromadb.PersistentClient(path=str(DB_DIR))
    collection_name = f"cs_podcast_{pdf_name}"

    try:
        chroma_client.delete_collection(name=collection_name)
    except: pass

    collection = chroma_client.get_or_create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"}
    )

    print(f"[*] Storing {len(embedding_inputs)} vectors in DB...")

    ids, embeddings, documents, metadatas = [], [], [], []

    for item in tqdm(embedding_inputs, desc="Embedding"):
        emb_id = item["embedding_id"]
        text = item["text"]
        meta = item["metadata"]

        emb = get_embedding(text, embedding_model)

        chroma_meta = {
            "doc_id": pdf_name_raw,
            "concept": meta.get("concept", ""),
            "page": meta.get("page", 0),
            "unit_ids": ",".join([u["unit_id"] for u in meta.get("units", [])])
        }

        ids.append(emb_id)
        embeddings.append(emb)
        documents.append(text)
        metadatas.append(chroma_meta)

    collection.add(
        ids=ids,
        embeddings=embeddings,
        documents=documents,
        metadatas=metadatas
    )

    print(f"🎉 Pipeline Success! Data ready for RAG.")

print("✅ Step 12 Complete: extraction pipeline packaged as a function.")


✅ Step 12 Complete: extraction pipeline packaged as a function.


In [None]:
# @title 🔍 Step 13: Retrieval System (Fixed ID Mapping + Debug Mode)
# @markdown Unified retrieval with CORRECT embedding_id mapping + optional debug output
class Doc2PodRetrieval:
    def __init__(self, db_path, collection_name, use_gpu=True, debug=False):
        self.device = "cuda" if use_gpu else "cpu"
        self.debug = debug
        self.client = chromadb.PersistentClient(path=str(db_path))
        self.collection = self.client.get_collection(name=collection_name)
        self.emb_model = SentenceTransformer("BAAI/bge-m3", device=self.device)
        if self.debug:
            print(f"✅ Loaded collection '{collection_name}' with {self.collection.count()} vectors")

    @staticmethod
    def _load_subchunks(embedding_dir, embedding_path):
        """Load all subchunks from embedding_inputs directory."""
        with open(embedding_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data["inputs"]

    def get_context(self, embedding_dir, embedding_path, query=None, page_range=None, k=10):
        """
        Unified retrieval that works with SUBCHUNKS.
        Returns list of {text, concept, total_subs, next_concept} dicts.
        """
        # Load all subchunks (includes embedding_id field)
        all_subchunks = Doc2PodRetrieval._load_subchunks(embedding_dir, embedding_path)

        id_to_subchunk = {
            sc.get("embedding_id"): sc
            for sc in all_subchunks
            if sc.get("embedding_id")
        }

        if self.debug:
            print(f"🔍 Loaded {len(all_subchunks)} subchunks")
            print(f"🔍 ID mapping size: {len(id_to_subchunk)}")
            if id_to_subchunk:
                sample_id = list(id_to_subchunk.keys())[0]
                print(f"🔍 Sample ID format: '{sample_id}'")

        # === SCENARIO: Range Only (No Query) ===
        if not query and page_range:
            start_page, end_page = page_range
            filtered_subchunks = [
                sc for sc in all_subchunks
                if start_page <= sc["metadata"]["page"] <= end_page
            ]
            if self.debug:
                print(f"🔍 Range filter ({start_page}-{end_page}): {len(filtered_subchunks)} matches")
            if not filtered_subchunks:
                return []

            concepts = list(dict.fromkeys([sc["metadata"]["concept"] for sc in filtered_subchunks]))
            results = []
            for sc in filtered_subchunks:
                meta = sc["metadata"]
                next_concept = None
                if meta["concept"] in concepts:
                    idx = concepts.index(meta["concept"])
                    if idx + 1 < len(concepts):
                        next_concept = concepts[idx + 1]
                results.append({
                    "text": sc["text"],
                    "concept": meta["concept"],
                    "total_subs": meta["total_subchunks"],
                    "next_concept": next_concept
                })
            return results

        # === SCENARIO: Query (with/without Range) ===
        if query:
            # Build ChromaDB filter
            where_filter = None
            if page_range:
                start_page, end_page = page_range
                where_filter = {
                    "$and": [
                        {"page": {"$gte": start_page}},
                        {"page": {"$lte": end_page}}
                    ]
                }

            # Query ChromaDB
            query_emb = self.emb_model.encode([query], normalize_embeddings=True)[0].tolist()
            chroma_results = self.collection.query(
                query_embeddings=[query_emb],
                n_results=k,
                where=where_filter,
                include=["documents", "metadatas", "distances"]
            )

            if self.debug:
                print(f"\n🔍 ChromaDB Query: '{query}'")
                print(f"   Filter: {where_filter}")
                print(f"   Raw IDs returned: {chroma_results['ids'][0] if chroma_results['ids'][0] else 'NONE'}")
                print(f"   Available IDs in mapping: {list(id_to_subchunk.keys())[:3]}...")

            if not chroma_results['ids'] or not chroma_results['ids'][0]:
                if self.debug:
                    print("⚠️ ChromaDB returned NO matches")
                return []

            #  MAP USING ACTUAL embedding_id
            retrieved_subchunks = []
            for emb_id in chroma_results['ids'][0]:
                if emb_id in id_to_subchunk:
                    retrieved_subchunks.append(id_to_subchunk[emb_id])
                elif self.debug:
                    print(f"⚠️ ID mismatch: ChromaDB returned '{emb_id}' but not in mapping")

            if self.debug:
                print(f"✅ Mapped {len(retrieved_subchunks)}/{len(chroma_results['ids'][0])} results")

            if not retrieved_subchunks:
                return []
            # SORT BY PAGE ORDER FOR PODCAST FLOW
            retrieved_subchunks.sort(key=lambda x: x["metadata"]["page"])
            if self.debug:
              print(f"✅ Sorted {len(retrieved_subchunks)} results by page:")
              for sc in retrieved_subchunks:
                  print(f"   - Page {sc['metadata']['page']}: '{sc['metadata']['concept']}'")


            # Build continuity-aware results
            concepts = list(dict.fromkeys([sc["metadata"]["concept"] for sc in retrieved_subchunks]))
            results = []
            for sc in retrieved_subchunks:
                meta = sc["metadata"]
                next_concept = None
                if meta["concept"] in concepts:
                    idx = concepts.index(meta["concept"])
                    if idx + 1 < len(concepts):
                        next_concept = concepts[idx + 1]
                results.append({
                    "text": sc["text"],
                    "concept": meta["concept"],
                    "total_subs": meta["total_subchunks"],
                    "next_concept": next_concept
                })
            return results

        return []


In [None]:
# @title ✍️ Step 14: Podcast Script Generator (Upgraded)

class PodcastScriptGen:
    def __init__(self):
        self.langs = {
            "1": {"name": "English", "tone": "Professional, crisp"},
            "2": {"name": "Modern Standard Arabic (MSA)", "tone": "Educational"},
            "3": {"name": "Egyptian Arabic", "tone": "Friendly, conversational"}
        }


    def _load_structure(self, unified_dir):
        """Load clean structure from chunks as an ordered list of section titles (Smart Skeleton)."""
        try:
            chunk_files = list(unified_dir.glob("*_chunks.json"))
            if not chunk_files:
                return "Structure unavailable."

            with open(chunk_files[0], "r", encoding="utf-8") as f:
                data = json.load(f)

            # Extract concept titles in order
            concepts = []
            seen = set()
            for c in data.get("chunks", []):
                concept = c.get("concept", "").strip()
                if (
                    concept and
                    concept not in seen
                ):
                    concepts.append(concept)
                    seen.add(concept)

            if not concepts:
                return "No valid sections found."

            # Format as numbered list (matches your desired output)
            lines = ["This lecture covers these topics in order:"]
            for i, concept in enumerate(concepts, 1):
                lines.append(f"{i}. {concept}")

            return "\n".join(lines)

        except Exception as e:
            return f"Structure unavailable ({str(e)})."

    def build_prompt(self, context, user_topic, lang_choice, unified_dir, sub_idx=1, total_subs=1, next_concept=None):
        """
        Generates a prompt for script generation with Smart Skeleton and continuity cues.

        Args:
            context (str): RAG-retrieved text for current subchunk
            user_topic (str): Current section title (e.g., "IoT Levels")
            lang_choice (str): Language code ("3" for Egyptian Arabic)
            unified_dir (Path): Path to document's unified JSON directory
            sub_idx (int): Current subchunk index (1-based)
            total_subs (int): Total subchunks in current section
            next_concept (str, optional): Next section title for transition
        """
        config = self.langs.get(lang_choice, self.langs["3"])

        # Load clean structure (Smart Skeleton)
        doc_structure = self._load_structure(unified_dir)


        # CONTINUITY INSTRUCTIONS
        continuity_instructions = ""
        if total_subs > 1:
            if sub_idx == 1:
                continuity_instructions = f"Start fresh with a hook question about '{user_topic}'."
            elif sub_idx == total_subs:
                next_topic = next_concept if next_concept else "الموضوع اللي جاي"
                continuity_instructions = f"End with: 'وبكده خلصنا موضوع {user_topic}، وهنشوف دلوقتي إيه في {next_topic}.'"
            else:
                continuity_instructions = "Start with: 'واصلًا من النقطة اللي وقفنا عندها...'"

        prompt = f"""أنت كاتب سيناريو محترف لبرنامج "بودكاست تقني" باللهجة المصرية القاهرية (Cairene Slang).
            هدفنا: شرح مفاهيم علوم الحاسب (CS) لأي حد ماشي في الشارع باستخدام تشبيهات من الحياة اليومية.

            الشخصيات:
            - Speaker 1 (سارة): المذيعة. لسانها مصري جداً، دمها خفيف، بتسأل بذكاء وتقاطع أحمد كل شوية عشان الجمهور يفهم.
            - Speaker 2 (أحمد): الضيف (Senior Engineer). خبير، صوته هادي، **ممنوع يشرح شرح أكاديمي**، لازم يشرح بـ "أمثلة شعبية" (مطبخ، سوق، زحمة، مواصلات).

            ⚠️ قائمة الممنوعات (Negative Constraints) - ممنوع تماماً:
            1. ❌ ممنوع الفصحى نهائياً (لا تكتب: لماذا، سوف، هذا، نعم، حسناً).
            2. ❌ ممنوع الشرح المعقد: لا تشرح تعريفات كتب. اشرح بـ "تخيل إننا في...".
            3. ❌ ممنوع الجمل الطويلة: المذيعين بيقاطعوا بعض، والجمل لازم تكون قصيرة (≤ 10 كلمات).
            4. ❌ ممنوع المؤثرات الصوتية: لا تكتب [موسيقى] أو [ضحك]، اكتب الكلام فقط.

            ✅ أسلوب الكتابة المطلوب (Style Guide):
            1. **مصري 100%:** استخدم (يا دين النبي، يا خبر، ده، عشان، إيه الحلاوة دي، بص بقى).
            2. **الإنجليزية:** المصطلحات التقنية زي ما هي (API, Deadlock, RAM) وسط الكلام.
            3. **التشبيه الإجباري (Mandatory Analogy):**
              - اقرأ الـ"FOCUS CONTENT" جيداً، واستنتج المفهوم الأساسي.
              - اختار تشبيه من الحياة المصرية يناسب **جوهر المفهوم** (مش اسمه).
                - مثال: لو المفهوم عن "تنظيم الموارد" → "زي مدير المطبخ".
                - لو عن "التكرار لتحسين النتيجة" → "زي البائع اللي بيغير سعره".
            4. **الربط المنطقي:**
              - استخدم الـ"GLOBAL CONTEXT" علشان تربط بين الأجزاء:
                - "زي ما فهمنا في الجزء اللي فات عن..."
                - "ده هيخلينا نفكر في الموضوع اللي هنتكلم عنه بعد شوية..."
            5. **الإبداع الموجّه:**
              - استخدم المعلومات اللي مديهالك كمرجع أساسي.
              - **مسموح لك تضيف أمثلة من معرفتك**، بس **متخرجش عن نطاق الـFOCUS CONTENT**.

            ---
            ### KEY ENGLISH TERMS TO PRESERVE (Do NOT translate):
            - Technical terms: API, RAM, IoT, CPU, Algorithm, Protocol, Cloud, Sensor, Actuator
            - File formats: .pdf, .json, .txt
            - Code snippets: if x > 5: print("hello")

            ---
            🌟 المثال الذهبي (اكتب زيه بالظبط):

            Speaker 1 (سارة): طب يا أحمد، أنا بسمع كلمة CPU Scheduling دي وبحس إنها طلاسم، إيه الكلام الكبير ده؟
            Speaker 2 (أحمد): بصي يا ستي، ولا طلاسم ولا حاجة. تخيلي إن الـ CPU ده هو "الشيف" في مطبخ فندق كبير.
            Speaker 1 (سارة): حلو، يعني بيطبخ الداتا؟
            Speaker 2 (أحمد): الله ينور عليكي! أنتي بقى الزبون، وبتطلبي أوردرات كتير: "افتح كروم"، "شغل أغنية"، "نزل ملف". الشيف ده واحد بس، هيعمل كل ده إزاي في نفس الوقت؟
            Speaker 1 (سارة): أكيد هيتجنن! هيسيب الأكل يتحرق ويرد على التليفون.
            Speaker 2 (أحمد): بالظبط! هنا بقى بتيجي الـ Algorithms.. دي "قائمة التعليمات" اللي المدير بيديها للشيف عشان ينظم وقته. يقوله مثلاً: "خلص الطلبات السهلة الأول" (SJF)، أو "اشتغل في كل طلب دقيقة ولف عليهم" (Round Robin).
            Speaker 1 (سارة): يا ابن اللعيبة! يعني الكمبيوتر بيلف علينا كلنا عشان يرضي الجميع؟
            Speaker 2 (أحمد): بالظبط كده. من غير النظام ده، الجهاز "هيهنج" والزبون "هيزهق" ويمشي.

            ---

            ### CONTINUITY INSTRUCTIONS
            {continuity_instructions}

            ### GLOBAL CONTEXT (The Book Map)
            {doc_structure}

            ### FOCUS CONTENT (The Facts)
            {context}

            المطلوب منك الآن:
            - اقرأ الـ"FOCUS CONTENT" وافهم المفهوم الأساسي.
            - استخدم الـ"GLOBAL CONTEXT" علشان تربط بين الأجزاء.
            - اكتب حوار كامل بنفس الروح والأسلوب ده.
            - التزم بالهيكل الدرامي:
              **(1) Hook** ← سارة تبدأ بسؤال مفاجئ
              **(2) Context** ← أحمد يربط بالمفاهيم السابقة
              **(3) Deep Dive** ← التشبيه المصري + شرح مبسط
              **(4) Twist** ← سارة تقاطع بسؤال ذكي
              **(5) Outro** ← أحمد يلخص بجملة مأثورة

            مدة الحوار: حوالي 6 دقائق (≈1200 كلمة).
            ابدأ الحوار فوراً بدون مقدمات طويلة.

            Speaker 1 (سارة):"""

        return prompt.strip()

In [None]:
# @title 🔧 Step 15a: Thinking Extraction Helper (SAFE VERSION)
def extract_clean_script(raw_script):
    """
    SAFELY extracts final answer from Qwen3-Thinking output.
    Strategy:
    1. Remove ONLY explicit <|thinking|>...</|thinking|> blocks
    2. Keep ALL content after last </thinking> marker (dialogue + narrative + transitions)
    3. NEVER filter dialogue lines (preserves script flow)
    4. Only strip obvious English monologue starters at VERY beginning
    """
    # Method 1: Remove explicit thinking blocks (Qwen3 format) - SAFE
    cleaned = re.sub(r'<\|thinking\|>.*?<\|/thinking\|>', '', raw_script, flags=re.DOTALL)

    # Method 2: If thinking tags remain, keep content after last closing tag
    if '<|/thinking|>' in cleaned:
        parts = cleaned.split('<|/thinking|>')
        cleaned = parts[-1].strip()

    # Method 3: ONLY remove English monologue starters at VERY BEGINNING (safe for Egyptian Arabic)
    monologue_starters = [
        r'^Okay,\s*',
        r'^Hmm,\s*',
        r'^First,\s*',
        r'^Let me\s+',
        r'^I should\s+',
        r'^The user\s+',
        r'^This is\s+',
        r'^Alright,\s*',
        r'^So,\s+',
    ]
    for pattern in monologue_starters:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)

    # Final cleanup: Remove excessive newlines at start ONLY
    cleaned = re.sub(r'^\s*\n+', '', cleaned)

    return cleaned.strip()

In [None]:
# @title 🎤 Step 15: Doc2Pod Studio (Optimized + Debug-Enabled)
# @markdown Fully compatible with unified get_context() and handles all edge cases.

def get_pdf_selection():
    pdf_files = list(PDF_DIR.glob("*.pdf"))
    if not pdf_files:
        print(f"\n❌ No PDFs found in {PDF_DIR}!")
        return None
    print("\n[?] Available Documents:")
    for idx, f in enumerate(pdf_files):
        print(f"   {idx + 1}: {f.name}")
    while True:
        choice = input(f"\nSelect Document (1-{len(pdf_files)}): ")
        if choice.isdigit() and 1 <= int(choice) <= len(pdf_files):
            return pdf_files[int(choice) - 1]

def sanitize_filename(text):
    if not text: return "Untitled"
    safe = re.sub(r'[^a-zA-Z0-9._-]', '_', text.strip())
    return safe[:50]

# --- MAIN INTERACTIVE LOOP ---
print("--- Doc2Pod Studio ---")

selected_pdf = get_pdf_selection()
if selected_pdf:
    pdf_name_raw = selected_pdf.stem
    safe_name = re.sub(r'[^a-zA-Z0-9._-]', '_', pdf_name_raw)
    coll_name = f"cs_podcast_{safe_name}"

    # Check for both chunks.json AND embedding_input_texts.json
    expected_chunks_json = UNIFIED_DIR / safe_name / f"{safe_name}_chunks.json"
    expected_embedding_json = EMBEDDING_DIR / safe_name / f"{safe_name}_embedding_input_texts.json"

    if not expected_chunks_json.exists() or not expected_embedding_json.exists():
        print(f"\n[Phase 1] Data missing. Running Extraction Pipeline...")
        run_friend_extraction_pipeline(selected_pdf)
    else:
        print(f"\n[Phase 1] Data found. Using existing database.")

    # 🔍 DEBUG: Show chunk structure
    chunks_path = UNIFIED_DIR / safe_name / f"{safe_name}_chunks.json"
    with open(chunks_path, "r") as f:
        chunks_data = json.load(f)
    print("\n🔍 [DEBUG] Chunk Structure:")
    for i, ch in enumerate(chunks_data["chunks"]):
        pages = sorted(set(u["page"] for u in ch.get("content", [])))
        print(f"  {i+1}. '{ch['concept']}' (Pages: {pages})")

    rag = Doc2PodRetrieval(DB_DIR, coll_name, use_gpu=True, debug=True)
    gen = PodcastScriptGen()
    doc_unified_dir = UNIFIED_DIR / safe_name
    embedding_dir = EMBEDDING_DIR / safe_name
    embedding_path = embedding_dir / f"{safe_name}_embedding_input_texts.json"

    # Initialize Qwen3 ONCE outside loops
    pipeline_manager = LazyPipelineManager()
    model, tokenizer = pipeline_manager.get_qwen3_thinking_model()

    while True:
        print("\n" + "="*40)
        print("1. Ask Specific Question")
        print("2. Filter by Page Range")
        print("3. Auto-Generate Full Podcast (60+ min)")
        print("Type 'exit' to quit.")

        mode = input("\nSelect Mode: ").strip()
        if mode == 'exit':
            # Explicitly unload models
            del model, tokenizer
            pipeline_manager.unload_all()
            torch.cuda.empty_cache()
            gc.collect()
            break

        # Mode 3: Full Podcast (with sorting + filtering)
        if mode == '3':
            print("[*] Generating full 60+ minute podcast...")
            with open(embedding_path, "r") as f:
                embedding_data = json.load(f)
            subchunks = embedding_data["inputs"]

            # Filter empty + sort by page
            original_count = len(subchunks)
            subchunks = [
                sc for sc in subchunks
                if sc["text"].strip() and sc["metadata"]["units"]
            ]
            subchunks.sort(key=lambda x: x["metadata"]["page"])
            print(f"🔍 [DEBUG] Subchunks: {original_count} → {len(subchunks)} after filtering")

            # Load concepts from TRUE document order (not retrieval order)
            chunks_path = UNIFIED_DIR / safe_name / f"{safe_name}_chunks.json"
            with open(chunks_path, "r") as f:
                chunks_data = json.load(f)
            concepts = []
            seen = set()
            for ch in chunks_data["chunks"]:
                concept = ch["concept"]
                if concept not in seen:
                    concepts.append(concept)
                    seen.add(concept)

            full_script_parts = []
            raw_parts = []  # Track raw outputs like in Modes 1 & 2
            for idx, sub in enumerate(subchunks):
                meta = sub["metadata"]
                concept = meta["concept"]
                total_subs = meta["total_subchunks"]
                sub_idx = idx + 1

                next_concept = None
                if concept in concepts:
                    concept_idx = concepts.index(concept)
                    if concept_idx + 1 < len(concepts):
                        next_concept = concepts[concept_idx + 1]

                print(f"→ [DEBUG] Generating Part {sub_idx}/{len(subchunks)}: '{concept}'")
                if next_concept:
                    print(f"   → Next: '{next_concept}'")

                final_prompt = gen.build_prompt(
                    context=sub["text"],
                    user_topic=concept,
                    lang_choice="3",
                    unified_dir=doc_unified_dir,
                    sub_idx=sub_idx,
                    total_subs=total_subs,
                    next_concept=next_concept
                )

                messages = [{"role": "user", "content": final_prompt}]
                text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                inputs = tokenizer([text], return_tensors="pt").to(model.device)
                outputs = model.generate(**inputs, max_new_tokens=2500)
                raw_script = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)

                # ✅ EXTRACT CLEAN VERSION (thinking removed, narrative preserved)
                clean_script = extract_clean_script(raw_script)

                # 🔒 SAFETY CHECK: Prevent over-filtering
                if len(clean_script) < 100:
                    print(f"   ⚠️ WARNING: Clean script too short ({len(clean_script)} chars) - falling back to raw")
                    clean_script = re.sub(r'<\|thinking\|>.*?<\|/thinking\|>', '', raw_script, flags=re.DOTALL).strip()

                # CRITICAL: Append BOTH versions
                full_script_parts.append(clean_script)
                raw_parts.append(raw_script)  # ✅ Track raw output

                # Periodic memory cleanup
                if idx % 3 == 2:
                    torch.cuda.empty_cache()
                    gc.collect()

            # ✅ SAVE FINAL COMBINED SCRIPTS (FIXED)
            full_script = "\n\n".join(full_script_parts)
            raw_full = "\n\n".join(raw_parts)  # ✅ CORRECT WAY
            debug_filename = f"{safe_name}_FULL_60MIN_PODCAST_DEBUG.txt"
            prod_filename = f"{safe_name}_FULL_60MIN_PODCAST.txt"
            debug_path = OUTPUT_DIR / debug_filename
            prod_path = OUTPUT_DIR / prod_filename

            # Save DEBUG version (full raw output)
            with open(debug_path, "w", encoding="utf-8") as f:
                f.write(raw_full)  # ✅ Now saves true raw output

            # Save PRODUCTION version (cleaned)
            with open(prod_path, "w", encoding="utf-8") as f:
                f.write(full_script)

            print(f"   💾 Saved DEBUG to: {debug_path.name}")
            print(f"   ✅ Saved CLEAN to: {prod_path.name}")

            print("\n" + "="*50)
            print(f"🎉 FULL PODCAST SAVED TO: {prod_path}")
            print(f"📊 Total Parts: {len(subchunks)} | Estimated Duration: ~{len(subchunks)*6} minutes")
            print("="*50)
            print(full_script)  # ✅ PRINT CLEANED SCRIPT
            print("="*50)
            continue

        # Mode 2: Page Range (with all 3 updates)
        if mode == '2':
            try:
                p_input = input("Enter Range (e.g. 10-20): ")
                start, end = map(int, p_input.split('-'))
                page_range = (start, end)
                topic = input("Filter Topic (Optional): ").strip()

                file_mode = "Range"
                file_detail = f"Pages_{start}-{end}"
                if topic:
                    file_detail += f"_{sanitize_filename(topic)}"

                if topic:
                    # Scenario: Query + Range → MULTIPLE subchunks
                    query_results = rag.get_context(
                        embedding_dir,
                        embedding_path,
                        query=topic,
                        page_range=page_range,
                        k=10
                    )
                    print(f"Retrieved {len(query_results)} chunks" if query_results else "❌ Still failing - check debug output")
                    if not query_results or (isinstance(query_results, list) and not query_results):
                        print("⚠️ No relevant content found.")
                        continue

                    # Handle result format
                    if isinstance(query_results, tuple):
                        context, concept, sub_idx, total_subs, next_concept = query_results
                        script_parts = [context]
                        concepts_list = [concept]
                        total_subs_list = [total_subs]
                        next_concepts_list = [next_concept]
                    else:
                        script_parts = [r["text"] for r in query_results]
                        concepts_list = [r["concept"] for r in query_results]
                        total_subs_list = [r["total_subs"] for r in query_results]
                        next_concepts_list = [r["next_concept"] for r in query_results]

                    full_script_parts = []
                    raw_parts = []  # Track raw outputs for debug file
                    for idx, (context, concept, total_subs, next_concept) in enumerate(
                        zip(script_parts, concepts_list, total_subs_list, next_concepts_list)
                    ):
                        sub_idx = idx + 1
                        print(f"→ [DEBUG] Retrieved Part {sub_idx}: '{concept}' (Pages: {start}-{end})")
                        if next_concept:
                            print(f"   → Next: '{next_concept}'")

                        final_prompt = gen.build_prompt(
                            context=context,
                            user_topic=concept,
                            lang_choice="3",
                            unified_dir=doc_unified_dir,
                            sub_idx=sub_idx,
                            total_subs=total_subs,
                            next_concept=next_concept
                        )

                        messages = [{"role": "user", "content": final_prompt}]
                        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                        inputs = tokenizer([text], return_tensors="pt").to(model.device)
                        outputs = model.generate(**inputs, max_new_tokens=2500)
                        raw_script = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)

                        # ✅ EXTRACT CLEAN VERSION
                        clean_script = extract_clean_script(raw_script)

                        # 🔒 SAFETY CHECK
                        if len(clean_script) < 100:
                            print(f"   ⚠️ WARNING: Clean script too short ({len(clean_script)} chars) - falling back to raw")
                            clean_script = re.sub(r'<\|thinking\|>.*?<\|/thinking\|>', '', raw_script, flags=re.DOTALL).strip()

                        # Track both versions
                        full_script_parts.append(clean_script)
                        raw_parts.append(raw_script)

                    # ✅ SAVE FINAL COMBINED SCRIPTS
                    full_script = "\n\n".join(full_script_parts)
                    raw_full = "\n\n".join(raw_parts)
                    debug_filename = f"{safe_name}_{file_mode}_{file_detail}_DEBUG.txt"
                    prod_filename = f"{safe_name}_{file_mode}_{file_detail}.txt"
                    debug_path = OUTPUT_DIR / debug_filename
                    prod_path = OUTPUT_DIR / prod_filename

                    with open(debug_path, "w", encoding="utf-8") as f:
                        f.write(raw_full)
                    with open(prod_path, "w", encoding="utf-8") as f:
                        f.write(full_script)

                    print(f"   💾 Saved DEBUG to: {debug_path.name}")
                    print(f"   ✅ Saved CLEAN to: {prod_path.name}")

                    print("\n" + "="*50)
                    print(f"✅ SCRIPT SAVED TO: {prod_path}")
                    print("="*50)
                    print(full_script)  # ✅ PRINT CLEANED SCRIPT
                    print("="*50)

                else:
                    # Scenario: Range Only → full podcast with continuity
                    with open(embedding_path, "r") as f:
                        embedding_data = json.load(f)
                    all_subchunks = embedding_data["inputs"]

                    original_count = len(all_subchunks)
                    filtered_subchunks = [
                        sc for sc in all_subchunks
                        if start <= sc["metadata"]["page"] <= end
                        and sc["text"].strip()
                        and sc["metadata"]["units"]
                    ]
                    filtered_subchunks.sort(key=lambda x: x["metadata"]["page"])
                    print(f"→ [DEBUG] Range {start}-{end}: {original_count} → {len(filtered_subchunks)} subchunks")

                    if not filtered_subchunks:
                        print("⚠️ No valid subchunks found in page range.")
                        continue

                    # Load concepts from TRUE document order
                    chunks_path = UNIFIED_DIR / safe_name / f"{safe_name}_chunks.json"
                    with open(chunks_path, "r") as f:
                        chunks_data = json.load(f)
                    concepts = []
                    seen = set()
                    for ch in chunks_data["chunks"]:
                        concept = ch["concept"]
                        if concept not in seen:
                            concepts.append(concept)
                            seen.add(concept)

                    full_script_parts = []
                    raw_parts = []  # Track raw outputs for debug file
                    for idx, sub in enumerate(filtered_subchunks):
                        meta = sub["metadata"]
                        concept = meta["concept"]
                        total_subs = meta["total_subchunks"]
                        sub_idx = idx + 1

                        next_concept = None
                        if concept in concepts:
                            concept_idx = concepts.index(concept)
                            if concept_idx + 1 < len(concepts):
                                next_concept = concepts[concept_idx + 1]

                        print(f"→ [DEBUG] Generating Part {sub_idx}/{len(filtered_subchunks)}: '{concept}'")
                        if next_concept:
                            print(f"   → Next: '{next_concept}'")

                        final_prompt = gen.build_prompt(
                            context=sub["text"],
                            user_topic=concept,
                            lang_choice="3",
                            unified_dir=doc_unified_dir,
                            sub_idx=sub_idx,
                            total_subs=total_subs,
                            next_concept=next_concept
                        )

                        messages = [{"role": "user", "content": final_prompt}]
                        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                        inputs = tokenizer([text], return_tensors="pt").to(model.device)
                        outputs = model.generate(**inputs, max_new_tokens=2500)
                        raw_script = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)

                        # ✅ EXTRACT CLEAN VERSION (thinking removed, narrative preserved)
                        clean_script = extract_clean_script(raw_script)

                        # 🔒 SAFETY CHECK: Prevent over-filtering
                        if len(clean_script) < 100:
                            print(f"   ⚠️ WARNING: Clean script too short ({len(clean_script)} chars) - falling back to raw")
                            clean_script = re.sub(r'<\|thinking\|>.*?<\|/thinking\|>', '', raw_script, flags=re.DOTALL).strip()

                        # Track both versions
                        full_script_parts.append(clean_script)
                        raw_parts.append(raw_script)

                        if idx % 3 == 2:
                            torch.cuda.empty_cache()
                            gc.collect()

                    # ✅ SAVE FINAL COMBINED SCRIPTS
                    full_script = "\n\n".join(full_script_parts)
                    raw_full = "\n\n".join(raw_parts)
                    debug_filename = f"{safe_name}_{file_mode}_{file_detail}_DEBUG.txt"
                    prod_filename = f"{safe_name}_{file_mode}_{file_detail}.txt"
                    debug_path = OUTPUT_DIR / debug_filename
                    prod_path = OUTPUT_DIR / prod_filename

                    with open(debug_path, "w", encoding="utf-8") as f:
                        f.write(raw_full)
                    with open(prod_path, "w", encoding="utf-8") as f:
                        f.write(full_script)

                    print(f"   💾 Saved DEBUG to: {debug_path.name}")
                    print(f"   ✅ Saved CLEAN to: {prod_path.name}")

                    print("\n" + "="*50)
                    print(f"✅ RANGE PODCAST SAVED TO: {prod_path}")
                    print(f"📊 Total Parts: {len(full_script_parts)} | Estimated Duration: ~{len(full_script_parts)*6} minutes")
                    print("="*50)
                    print(full_script)  # ✅ PRINT CLEANED SCRIPT
                    print("="*50)

                continue

            except Exception as e:
                print(f"❌ Invalid format: {e}")
                continue

        # Mode 1: Single Query (UPDATED)
        elif mode == '1':
            topic = input("Enter Topic: ").strip()
            if not topic: continue
            file_mode = "QnA"
            file_detail = sanitize_filename(topic)

            query_results = rag.get_context(
                embedding_dir,
                embedding_path,
                query=topic
            )

            if not query_results or (isinstance(query_results, list) and not query_results):
                print("⚠️ No relevant data found.")
                continue

            # Handle single result (old format) vs list (new format)
            if isinstance(query_results, tuple):
                # Old format (Range Only path)
                context, concept, sub_idx, total_subs, next_concept = query_results
                script_parts = [context]
                concepts_list = [concept]
                total_subs_list = [total_subs]
                next_concepts_list = [next_concept]
            else:
                # New format: list of results
                script_parts = [r["text"] for r in query_results]
                concepts_list = [r["concept"] for r in query_results]
                total_subs_list = [r["total_subs"] for r in query_results]
                next_concepts_list = [r["next_concept"] for r in query_results]

            lang = input("Language (1:EN, 2:MSA, 3:EGY): ")
            full_script_parts = []
            raw_parts = []  # Track raw outputs for debug file

            # Load concepts from TRUE document order for continuity
            chunks_path = UNIFIED_DIR / safe_name / f"{safe_name}_chunks.json"
            with open(chunks_path, "r") as f:
                chunks_data = json.load(f)
            global_concepts = []
            seen = set()
            for ch in chunks_data["chunks"]:
                concept = ch["concept"]
                if concept not in seen:
                    global_concepts.append(concept)
                    seen.add(concept)

            for idx, (context, concept, total_subs, next_concept) in enumerate(
                zip(script_parts, concepts_list, total_subs_list, next_concepts_list)
            ):
                sub_idx = idx + 1

                # Override next_concept using TRUE document order
                actual_next_concept = None
                if concept in global_concepts:
                    concept_idx = global_concepts.index(concept)
                    if concept_idx + 1 < len(global_concepts):
                        actual_next_concept = global_concepts[concept_idx + 1]

                print(f"→ [DEBUG] Retrieved Part {sub_idx}: '{concept}'")
                if actual_next_concept:
                    print(f"   → Next: '{actual_next_concept}'")

                final_prompt = gen.build_prompt(
                    context=context,
                    user_topic=concept,
                    lang_choice=lang,
                    unified_dir=doc_unified_dir,
                    sub_idx=sub_idx,
                    total_subs=total_subs,
                    next_concept=actual_next_concept
                )

                messages = [{"role": "user", "content": final_prompt}]
                text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                inputs = tokenizer([text], return_tensors="pt").to(model.device)
                outputs = model.generate(**inputs, max_new_tokens=2500)
                raw_script = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)

                # ✅ EXTRACT CLEAN VERSION
                clean_script = extract_clean_script(raw_script)

                # 🔒 SAFETY CHECK
                if len(clean_script) < 100:
                    print(f"   ⚠️ WARNING: Clean script too short ({len(clean_script)} chars) - falling back to raw")
                    clean_script = re.sub(r'<\|thinking\|>.*?<\|/thinking\|>', '', raw_script, flags=re.DOTALL).strip()

                # Track both versions
                full_script_parts.append(clean_script)
                raw_parts.append(raw_script)

                if idx % 3 == 2:
                    torch.cuda.empty_cache()
                    gc.collect()

            # ✅ SAVE FINAL COMBINED SCRIPTS
            full_script = "\n\n".join(full_script_parts)
            raw_full = "\n\n".join(raw_parts)
            debug_filename = f"{safe_name}_{file_mode}_{file_detail}_DEBUG.txt"
            prod_filename = f"{safe_name}_{file_mode}_{file_detail}.txt"
            debug_path = OUTPUT_DIR / debug_filename
            prod_path = OUTPUT_DIR / prod_filename

            with open(debug_path, "w", encoding="utf-8") as f:
                f.write(raw_full)
            with open(prod_path, "w", encoding="utf-8") as f:
                f.write(full_script)

            print(f"   💾 Saved DEBUG to: {debug_path.name}")
            print(f"   ✅ Saved CLEAN to: {prod_path.name}")

            print("\n" + "="*50)
            print(f"✅ SCRIPT SAVED TO: {prod_path}")
            print("="*50)
            print(full_script)  # ✅ PRINT CLEANED SCRIPT
            print("="*50)

--- Doc2Pod Studio ---

[?] Available Documents:
   1: [IoT_25] Lecture 1.pdf
   2: [ML&PR 2024] Lec5 Classification II.pdf

Select Document (1-2): 1


[32mCreating model: ('PP-DocLayoutV2', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/PP-DocLayoutV2`.[0m



[Phase 1] Data missing. Running Extraction Pipeline...
🚀 Starting Extraction Pipeline for: [IoT_25] Lecture 1.pdf
[*] Loading PaddleOCR on gpu...


[32mCreating model: ('PaddleOCR-VL-0.9B', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/PaddleOCR-VL`.[0m
[32mLoading configuration file /root/.paddlex/official_models/PaddleOCR-VL/config.json[0m
[32mLoading weights file /root/.paddlex/official_models/PaddleOCR-VL/model.safetensors[0m
[32muse GQA - num_heads: 16- num_key_value_heads: 2[0m
[32muse GQA - num_heads: 16- num_key_value_heads: 2[0m
[32muse GQA - num_heads: 16- num_key_value_heads: 2[0m
[32muse GQA - num_heads: 16- num_key_value_heads: 2[0m
[32muse GQA - num_heads: 16- num_key_value_heads: 2[0m
[32muse GQA - num_heads: 16- num_key_value_heads: 2[0m
[32muse GQA - num_heads: 16- num_key_value_heads: 2[0m
[32muse GQA - num_heads: 16- num_key_value_heads: 2[0m
[32muse GQA - num_heads: 16- num_key_value_heads: 2[0m
[32muse GQA - num_heads: 16- num_key_value_heads: 2[0m
[32muse GQA - num_heads: 16- num_key

[*] Starting OCR for [IoT_25] Lecture 1 (52 pages)...


OCR Processing:   0%|          | 0/52 [00:00<?, ?page/s]

  return tensor(
Non compatible API. Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/guides/model_convert/convert_from_pytorch/api_difference/torch/torch.max.html first.


✅ OCR Complete. Data saved to: /content/drive/MyDrive/Doc2Pod_System/Data/json/[IoT_25] Lecture 1/[IoT_25] Lecture 1.json
✅ OCR Complete.
[*] Loading Qwen3-VL (Vision Model)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/713 [00:00<?, ?it/s]

[*] Unloading all models from GPU...
[*] GPU memory cleared.
✅ Created 241 raw units.
[*] Loading Embedding Model...
[*] Loading Embedding Model on cuda...


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

⏭️ Skipping entire page 2 (admin title): 'Practical IoT Hacking
The Definitive Guide to Attacking the
Internet of Things

Foreword by
DAVE KENNEDY

Fotios Chantzis and Ioannis Stais
Paulina Calderon, Evangelos Deimantzoglou, and Beau Woods

no starch press'
⏭️ Skipping entire page 9 (admin title): 'The image contains the following readable textual and semantic information:

- The prominent text "IOT" is displayed at the top center, which stands for Internet of Things.

- The image is a conceptual illustration depicting the interconnected ecosystem of IoT, with various icons and symbols representing devices and functions linked together.

- Key elements include:
  - A hand holding a smartphone, indicating user interaction or control.
  - A cloud icon, symbolizing cloud computing or data storage.
  - A bar chart, representing data analytics or monitoring.
  - A camera, suggesting video or image capture.
  - A smart thermostat or similar device, indicating home automation.
  - A traffic l

Embedding:   0%|          | 0/28 [00:00<?, ?it/s]

🎉 Pipeline Success! Data ready for RAG.

🔍 [DEBUG] Chunk Structure:
  1. 'Introduction' (Pages: [1])
  2. 'What is IoT?' (Pages: [6, 7, 8, 10, 12, 13, 14, 15])
  3. 'Comparison between Traditional Network and IoT' (Pages: [16, 17, 18, 19, 20, 21, 22, 23])
  4. 'IoT Application Domains' (Pages: [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36])
  5. 'IoT Communication Protocols' (Pages: [38, 39, 40, 41])
  6. 'IoT Levels & Deployment Templates' (Pages: [44, 45, 47, 48, 50, 51, 52])


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

✅ Loaded collection 'cs_podcast__IoT_25__Lecture_1' with 28 vectors
[*] Loading Qwen3-4B-Thinking...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/398 [00:00<?, ?it/s]


1. Ask Specific Question
2. Filter by Page Range
3. Auto-Generate Full Podcast (60+ min)
Type 'exit' to quit.

Select Mode: 1
Enter Topic: what exactly is iot - level 2 ?
🔍 Loaded 28 subchunks
🔍 ID mapping size: 28
🔍 Sample ID format: '[IoT_25] Lecture 1_c0000'

🔍 ChromaDB Query: 'what exactly is iot - level 2 ?'
   Filter: None
   Raw IDs returned: ['[IoT_25] Lecture 1_c0014', '[IoT_25] Lecture 1_c0015', '[IoT_25] Lecture 1_c0012', '[IoT_25] Lecture 1_c0011', '[IoT_25] Lecture 1_c0020', '[IoT_25] Lecture 1_c0022', '[IoT_25] Lecture 1_c0025', '[IoT_25] Lecture 1_c0018', '[IoT_25] Lecture 1_c0013', '[IoT_25] Lecture 1_c0023']
   Available IDs in mapping: ['[IoT_25] Lecture 1_c0000', '[IoT_25] Lecture 1_c0001', '[IoT_25] Lecture 1_c0002']...
✅ Mapped 10/10 results
✅ Sorted 10 results by page:
   - Page 44: 'IoT Levels & Deployment Templates (Level-2)'
   - Page 44: 'IoT Levels & Deployment Templates (Level-1)'
   - Page 45: 'IoT Levels & Deployment Templates (Level-2)'
   - Page 45: 'Io

In [None]:
!rm -rf /content/drive/MyDrive/Doc2Pod_System/Data/json
!rm -rf /content/drive/MyDrive/Doc2Pod_System/Data/unified_json
!rm -rf /content/drive/MyDrive/Doc2Pod_System/Data/chroma_db
print("🗑️ Old data cleared. Ready for fresh run.")

🗑️ Old data cleared. Ready for fresh run.
