In [None]:
# rag_scipdf_core.py  ── import this file from your app
# -------------------------------------------------------
import os, glob, json, re, textwrap, time, uuid
from typing import Dict, List
import nest_asyncio
import chromadb
import google.generativeai as genai
from llama_parse import LlamaParse

# ─────────── Keys & model names ───────────
GEMINI_API_KEY  = "AIzaSyD-zaE26w-qsu00AyXXjVI7HK7Uhl6-IVQ"   # MUST be set in the host env
LLAMA_API_KEY   = "llx-gOp6mdoPWfKSft1LdDaiDcT2GZl2hEuKXjS5u7E6kWIqXIuu"  # optional
MODEL_GEN       = "models/gemini-1.5-flash-latest"
MODEL_EMB       = "models/text-embedding-004"

if not GEMINI_API_KEY:
    raise RuntimeError("Set GEMINI_API_KEY environment variable first")

# ───────────  Initialise clients ──────────
nest_asyncio.apply()

genai.configure(api_key=GEMINI_API_KEY)
_gemini_gen  = genai.GenerativeModel(MODEL_GEN)

client = chromadb.PersistentClient(path="chroma_scipdfs")
collection = client.get_or_create_collection(
    name="scientific_chunks",
    metadata={"hnsw:space": "cosine"}
)

parser_full = LlamaParse(
    api_key           = LLAMA_API_KEY or None,
    result_type       = "markdown",
    chunk_size        = None,
    allow_nested_data = False,
)

# ───────────── sanitize helpers ──────────────────
def _scalarize(value):
    """Turn list-type metadata into a comma-separated string."""
    if isinstance(value, list):
        return ", ".join(map(str, value))
    return value

def _clean_meta(meta: dict) -> dict:
    """Recursively ensure every metadata value is scalar."""
    clean = {}
    for k, v in meta.items():
        if isinstance(v, dict):                       # unlikely but safe
            clean[k] = _clean_meta(v)
        else:
            clean[k] = _scalarize(v)
    return clean

# ───────────  Internal helpers ────────────
def _gemini_chat(prompt: str, retry: int = 3) -> str:
    for i in range(retry):
        try:
            return _gemini_gen.generate_content(prompt).text.strip()
        except Exception as e:
            if i == retry - 1:
                raise
            time.sleep(1 + i)

def _embed(texts: List[str]) -> List[List[float]]:
    return genai.embed_content(
        model=MODEL_EMB,
        content=texts,
        task_type="retrieval_document"
    )["embedding"]

_META_PROMPT = textwrap.dedent(""" Extract the following fields from the first‑page text of a scientific paper.
    Return ONLY valid **JSON** with these exact keys:
    {
      "title": string,
      "authors": [list of full names],
      "abstract": string,
      "keywords": [list of keywords],
      "Diseases:" [list of diseases used in the paper],                       
      "Methodology": one of ["signal", "image", "graph", "language"]
    }
    If a field is missing, return an empty string or empty list.
    Text:
""")

_QUERY_PROMPT = textwrap.dedent("""
From the user query extract any of these fields (return JSON):
{ "Diseases":[…],"title":string,"authors":[…],"keywords":[…],"methodology":string }
Query:
""")

def _safe_json(raw: str) -> Dict:
    raw = re.sub(r"^```json|```$", "", raw, flags=re.I).strip()
    try: return json.loads(raw)
    except Exception: return {}

# ──────────────────────────────────────────
#  PUBLIC API
# ──────────────────────────────────────────
def ingest_documents(pdf_glob: str,
                     by_page: bool = True,
                     chunk_size: int = 1500) -> None:
    """
    Parse → chunk → embed → store in Chroma.
    You can call this many times – duplicate IDs are skipped.
    """
    paths = glob.glob(pdf_glob)
    if not paths:
        raise FileNotFoundError("No PDF matched the pattern")

    docs = parser_full.load_data(paths)

    for doc, path in zip(docs, paths):
        meta = _safe_json(_gemini_chat(_META_PROMPT + doc.text))
        meta["path"] = path

        # ---- chunking ----
        # --- chunking -----------------------------------------------------
        if by_page and hasattr(doc, "pages") and doc.pages:
            # preferred: one chunk per pdf-page
            chunks = [p.text for p in doc.pages]
        else:                           # fallback: length-based slicing
            txt    = doc.text
            chunks = [txt[i:i + chunk_size]                # ≈1500 chars each
                    for i in range(0, len(txt), chunk_size)]


        embs  = _embed(chunks)
        ids   = [str(uuid.uuid4()) for _ in chunks]
        metas = []

        for _id, chunk in zip(ids, chunks):
            m = _clean_meta(meta)                  # ← ensure scalars only
            m.update({
                "chunk_id": _id,
                "chunk_preview": chunk[:400]
            })
            metas.append(m)


        collection.add(ids     = ids,
                       embeddings = embs,
                       documents  = chunks,
                       metadatas  = metas)
        print(f"✓ stored {len(chunks):3d} chunk(s) from {os.path.basename(path)}")


def query_rag(question: str, top_k: int = 3) -> str:
    """
    Semantic + metadata-aware retrieval, followed by Gemini answer synthesis.
    """
    
    parsed = _safe_json(_gemini_chat(_QUERY_PROMPT + question))
    print(parsed)
    q_vec  = _embed([question])[0]

    where = {k: v for k, v in parsed.items() if v}

    res = collection.query(
        query_embeddings=[q_vec],
        n_results=top_k,
        where=where if where else None,
        include=["documents", "metadatas"]
    )

    ctx = ""
    for i, (doc, meta) in enumerate(zip(res["documents"][0],
                                    res["metadatas"][0]), 1):
        ctx += (
            f"\n### Doc {i}\n"
            f"Path : {meta['path']}\n"
            f"Title: {meta.get('title', '')}\n"
            f"Authors: {meta.get('authors', '')}\n"
            f"Keywords: {meta.get('keywords', '')}\n"
            f"{doc[:1500]}\n"
        )
    print(ctx)

    answer_prompt = textwrap.dedent(f"""
    Use ONLY the information in the chunks below to answer the user question.
    Cite as (Doc 1), (Doc 2)… where appropriate.

    {ctx}

    Question: "{question}"
    """)
    return _gemini_chat(answer_prompt)


In [None]:
ingest_documents("D:\\RAG\\PAPER_BRAZIL.pdf")        # step-1   (once)


Parsing files: 100%|██████████| 1/1 [00:59<00:00, 59.75s/it]


✓ stored  12 chunk(s) from PAPER_BRAZIL.pdf
The authors are Vatsal Shah, Love Fadia, Mohammad Hassanzadeh, Majid Ahmadi, and Jonathan Wu. (Doc 1, Doc 2, Doc 3)


In [8]:
print(query_rag("Who are the authors?"))

{}

### Doc 1
Path : D:\RAG\PAPER_BRAZIL.pdf
Title: Lightweight Vision Transformer for Efficient Influenza Virus Subtype Classification via Genomic Image Processing
Authors: Vatsal Shah, Love Fadia, Mohammad Hassanzadeh, Majid Ahmadi, Jonathan Wu
Keywords: Influenza virus, Genomic Data Analysis, Gramian Angular Summation Field, Convolutional Neural Network, Vision Transformer

### Doc 2
Path : D:\RAG\PAPER_BRAZIL.pdf
Title: Lightweight Vision Transformer for Efficient Influenza Virus Subtype Classification via Genomic Image Processing
Authors: Vatsal Shah, Love Fadia, Mohammad Hassanzadeh, Majid Ahmadi, Jonathan Wu
Keywords: Influenza virus, Genomic Data Analysis

### Doc 3
Path : D:\RAG\PAPER_BRAZIL.pdf
Title: Lightweight Vision Transformer for Efficient Influenza Virus Subtype Classification via Genomic Image Processing
Authors: Vatsal Shah, Love Fadia, Mohammad Hassanzadeh, Majid Ahmadi, Jonathan Wu
Keywords: Influenza virus, Genomic Data Analysis

Vatsal Shah, Love Fadia, Mohammad 

In [13]:
print(query_rag("Which combination got the best accuracy ?"))

The combination of the proposed method and GADF achieved the highest accuracy at 98.34% (Doc 3).


In [10]:
collection.peek(1)

{'ids': ['083ab09e-a9c7-4e02-9721-7b4e9c7eac69'],
 'embeddings': [[-0.06251403,
   -0.004813515,
   -0.010654692,
   -0.011218989,
   0.023100358,
   -0.009354548,
   0.018634932,
   0.0071557746,
   -0.013857004,
   -0.0011912282,
   -0.023723835,
   0.03168933,
   0.011525357,
   0.009895766,
   0.0843007,
   -0.043037754,
   0.064180724,
   0.018642938,
   -0.07346096,
   -0.005303306,
   0.012962045,
   -0.0034852908,
   -0.045039754,
   -0.052248828,
   -0.06599786,
   -0.0057515493,
   0.007632555,
   0.00854894,
   0.03576389,
   -0.008823718,
   0.040653802,
   0.067894466,
   0.027919846,
   -0.033902332,
   0.015172411,
   0.075485624,
   0.020785464,
   0.04456813,
   0.033670675,
   -0.040352914,
   -0.021102076,
   0.0064245933,
   -0.0020456892,
   0.066710666,
   0.0026127526,
   0.010629963,
   0.04391139,
   0.04147557,
   -0.06753652,
   0.045544516,
   -0.008211431,
   -0.015130207,
   -0.031706817,
   0.054952666,
   0.015298931,
   -0.05151809,
   -0.021952659,
   

In [1]:
# rag_scipdf_core.py  ── drop‑in replacement
# ----------------------------------------------------------
import os, glob, json, re, textwrap, time, uuid, sqlite3, shutil
from typing   import Dict, List
import nest_asyncio, chromadb, google.generativeai as genai
from llama_parse import LlamaParse

# ─────────── Keys & model names ───────────
GEMINI_API_KEY = "AIzaSyD-zaE26w-qsu00AyXXjVI7HK7Uhl6-IVQ"
LLAMA_API_KEY  = "llx-gOp6mdoPWfKSft1LdDaiDcT2GZl2hEuKXjS5u7E6kWIqXIuu"
MODEL_GEN      = "models/gemini-1.5-flash-latest"
MODEL_EMB      = "models/text-embedding-004"

if not GEMINI_API_KEY:
    raise RuntimeError("Set GEMINI_API_KEY environment variable first")

# ─────────── Init external clients ─────────
nest_asyncio.apply()
genai.configure(api_key=GEMINI_API_KEY)
_gemini_gen  = genai.GenerativeModel(MODEL_GEN)

client = chromadb.PersistentClient(path="chroma_scipdfs")
collection = client.get_or_create_collection(
    name="scientific_chunks",
    metadata={"hnsw:space": "cosine"}
)

parser_full = LlamaParse(
    api_key           = LLAMA_API_KEY or None,
    result_type       = "markdown",
    chunk_size        = None,
    allow_nested_data = False,
)

# ─────────── local object / SQL stores ──────
IMG_STORE = "obj_store_images"           # directory for figures
os.makedirs(IMG_STORE, exist_ok=True)

SQLITE_DB = "tables.db"
sql_conn  = sqlite3.connect(SQLITE_DB,   check_same_thread=False)
sql_cur   = sql_conn.cursor()
sql_cur.execute("""
CREATE TABLE IF NOT EXISTS tables(
    id        TEXT PRIMARY KEY,
    chunk_id  TEXT,
    raw_md    TEXT,
    summary   TEXT
)""")
sql_conn.commit()

# ─────────── tiny utils ─────────────────────
def _scalarize(v):
    return ", ".join(map(str, v)) if isinstance(v, list) else v

def _clean_meta(meta: dict) -> dict:
    return {k: _scalarize(v) if not isinstance(v, dict) else _clean_meta(v)
            for k,v in meta.items()}

def _gemini_chat(prompt: str, retry: int = 3) -> str:
    for i in range(retry):
        try:
            return _gemini_gen.generate_content(prompt).text.strip()
        except Exception:
            if i == retry-1: raise
            time.sleep(1+i)

def _embed(texts: List[str]) -> List[List[float]]:
    return genai.embed_content(
        model=MODEL_EMB, content=texts,
        task_type="retrieval_document")["embedding"]

def _safe_json(txt: str) -> Dict:
    txt = re.sub(r"^```json|```$", "", txt, flags=re.I).strip()
    try: return json.loads(txt)
    except Exception: return {}

def _save_image(img_bytes: bytes, parent_chunk: str) -> str:
    img_id  = str(uuid.uuid4()) + ".png"
    img_out = os.path.join(IMG_STORE, img_id)
    with open(img_out, "wb") as f: f.write(img_bytes)
    return img_id, img_out

def _summarise(text: str, kind: str) -> str:
    prompt = f"Give a concise (~200 words) summary of this {kind}:\n{text[:3000]}"
    return _gemini_chat(prompt)

# ───────────── extraction regexes ───────────
IMG_RE   = re.compile(r'!\[([^\]]*)\]\((.*?)\)')      # markdown image
TABLE_RE = re.compile(r'((?:\|.*\n)+)', re.M)         # md table block

# ─────────────────────────────────────────────
#  PUBLIC API
# ─────────────────────────────────────────────
def ingest_documents(pdf_glob: str,
                     by_page:   bool = True,
                     chunk_size: int = 1500) -> None:
    """
    Parse → chunk → embed → store in Chroma.
    Creates / updates:
        • Chroma collection  (text chunks)
        • obj_store_images/  (figures)
        • tables.db          (tables)
    """
    paths = glob.glob(pdf_glob)
    if not paths:
        raise FileNotFoundError("No PDF matched pattern")

    docs = parser_full.load_data(paths)

    for doc, path in zip(docs, paths):
        paper_meta = _safe_json(_gemini_chat(_META_PROMPT + doc.text))
        paper_meta["path"] = path

        # ── iterate pages or length‑chunks ──────────────────
        if by_page and getattr(doc, "pages", None):
            chunks_src = [p.text for p in doc.pages]
        else:
            t = doc.text
            chunks_src = [t[i:i+chunk_size] for i in range(0, len(t), chunk_size)]

        embeddings = _embed(chunks_src)
        chunk_ids  = [str(uuid.uuid4()) for _ in chunks_src]

        for id_, txt, emb in zip(chunk_ids, chunks_src, embeddings):
            # ---------- extract figures ---------------------
            img_ids = []
            for alt, ref in IMG_RE.findall(txt):
                # LlamaParse leaves "imageX" placeholders; user is expected to
                # copy the real bytes.  Here we just save the *markdown* name.
                dummy_bytes = alt.encode()                # placeholder
                img_id, img_path = _save_image(dummy_bytes, id_)
                img_sum = _summarise(alt or "figure", "figure")
                img_ids.append(img_id)

                # link summary & path in a .json sidecar
                json.dump({"chunk_id": id_, "summary": img_sum},
                          open(img_path+".json","w",encoding="utf-8"), indent=2)

            # ---------- extract tables ----------------------
            tbl_ids = []
            for tbl_md in TABLE_RE.findall(txt):
                tbl_id = str(uuid.uuid4())
                tbl_sum = _summarise(tbl_md, "table")
                sql_cur.execute(
                    "INSERT OR REPLACE INTO tables VALUES (?,?,?,?)",
                    (tbl_id, id_, tbl_md, tbl_sum))
                tbl_ids.append(tbl_id)
            sql_conn.commit()

            # ---------- write chunk to Chroma ---------------
            meta = _clean_meta(paper_meta) | {
                "chunk_id"   : id_,
                "chunk_preview": txt[:400],
                "has_images" : bool(img_ids),
                "image_ids"  : ",".join(img_ids),
                "has_tables" : bool(tbl_ids),
                "table_ids"  : ",".join(tbl_ids),
            }

            collection.add(ids=[id_], embeddings=[emb],
                           documents=[txt], metadatas=[meta])

        print(f"✓ {len(chunks_src):3d} chunks  |  "
              f"{sum(map(bool,chunk_ids))} pages  |  "
              f"path={os.path.basename(path)}")

# ─────────────────────────────────────────────
def _tables_by_ids(ids: List[str]) -> List[Dict]:
    if not ids: return []
    q = ",".join("?"*len(ids))
    rows = sql_cur.execute(f"SELECT id,summary,raw_md FROM tables WHERE id IN ({q})", ids)
    return [{"id": r[0], "summary": r[1], "raw_md": r[2]} for r in rows]

def _figures_by_ids(ids: List[str]) -> List[Dict]:
    data = []
    for fid in ids:
        path = os.path.join(IMG_STORE, fid)
        if not os.path.exists(path): continue
        summ_path = path + ".json"
        summary = ""
        if os.path.exists(summ_path):
            summary = json.load(open(summ_path, encoding="utf-8"))["summary"]
        data.append({"id": fid, "path": path, "summary": summary})
    return data
# ─────────────────────────────────────────────
def query_rag(question: str, top_k: int = 3) -> str:
    """
    Vector‑search → gather linked figs / tables → Gemini answer.
    """
    parsed = _safe_json(_gemini_chat(_QUERY_PROMPT + question))
    q_vec  = _embed([question])[0]
    where  = {k:v for k,v in parsed.items() if v}

    res = collection.query(query_embeddings=[q_vec], n_results=top_k,
                           where=where or None,
                           include=["documents", "metadatas"])

    ctx_blocks = []
    for i,(doc,meta) in enumerate(zip(res["documents"][0], res["metadatas"][0]),1):
        fig_ids = meta.get("image_ids","").split(",") if meta.get("has_images") else []
        tbl_ids = meta.get("table_ids","").split(",") if meta.get("has_tables") else []

        figs = _figures_by_ids(fig_ids)
        tbls = _tables_by_ids(tbl_ids)

        fig_txt = "\n".join(f"- FIG {f['id']} : {f['summary']}" for f in figs)
        tbl_txt = "\n".join(f"- TABLE {t['id']}: {t['summary']}" for t in tbls)

        ctx_blocks.append(
            f"\n### Doc {i}\n"
            f"Path : {meta['path']}\n"
            f"Title: {meta.get('title','')}\n"
            f"{doc[:1500]}\n"
            f"{fig_txt or ''}\n{tbl_txt or ''}"
        )

    answer_prompt = textwrap.dedent(f"""
    Use **only** the information below to answer the question.
    Cite as (Doc 1), (Fig XYZ), (Table XYZ) where relevant.

    {'-'*80}
    {"".join(ctx_blocks)}
    {'-'*80}

    Question: "{question}"
    """)
    return _gemini_chat(answer_prompt)

# ───────── helper for UI layer (optional) ──────────
def get_table(id_: str)  -> Dict: return _tables_by_ids([id_])[0]
def get_figure(id_: str) -> Dict: return _figures_by_ids([id_])[0]


  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# rag_scipdf_core.py  ── import this file from your app
# -------------------------------------------------------
"""Multimodal Scientific‑PDF RAG core

Architecture (v May‑2025)
=========================
• **Vector DB**  – `scientific_chunks` collection in Chroma → **text‑chunk
  embeddings only**.
• **Object store (images)**  – files copied under `object_store/images/`.
• **Object store (tables)**  – original Markdown tables saved under
  `object_store/tables/` (**one file per table**, preserving layout).
• **SQLite media catalog**  – `media_summaries.db` tracks metadata & 200‑word
  summaries for every image/table and links them to their parent chunk.

Why this split?
• We never embed images or tables ⇒ no vectors inside SQLite.
• SQLite is lightweight and perfect for the small structured catalogue.
• Object‑store folders emulate an S3 bucket locally; swap `shutil.copy2`
  with an upload call if you move to real S3 later.
"""

import os, glob, json, re, textwrap, time, uuid, shutil, sqlite3
from typing import Dict, List, Tuple

import nest_asyncio
import chromadb
import google.generativeai as genai
from llama_parse import LlamaParse

# ─────────── Keys & model names ───────────
GEMINI_API_KEY = "AIzaSyD-zaE26w-qsu00AyXXjVI7HK7Uhl6-IVQ"
LLAMA_API_KEY  = "llx-gOp6mdoPWfKSft1LdDaiDcT2GZl2hEuKXjS5u7E6kWIqXIuu"
MODEL_GEN       = "models/gemini-1.5-flash-latest"
MODEL_EMB       = "models/text-embedding-004"

if not GEMINI_API_KEY:
    raise RuntimeError("Set GEMINI_API_KEY environment variable first")

# ───────────  Initialise clients ──────────
nest_asyncio.apply()

genai.configure(api_key=GEMINI_API_KEY)
_gemini_gen  = genai.GenerativeModel(MODEL_GEN)

# Vector DB ---------------------------------------------------------
client = chromadb.PersistentClient(path="chroma_scipdfs")
collection_txt = client.get_or_create_collection(
    name="scientific_chunks",
    metadata={"hnsw:space": "cosine"},
)

# Object stores -----------------------------------------------------
OBJ_DIR_IMG = "object_store/images"
OBJ_DIR_TBL = "object_store/tables"
os.makedirs(OBJ_DIR_IMG, exist_ok=True)
os.makedirs(OBJ_DIR_TBL, exist_ok=True)

# SQLite catalogue --------------------------------------------------
DB_PATH = "media_summaries.db"
_conn   = sqlite3.connect(DB_PATH)
_conn.execute(
    """CREATE TABLE IF NOT EXISTS image_summaries(
            id TEXT PRIMARY KEY,
            parent_chunk_id TEXT,
            path TEXT,
            alt  TEXT,
            summary TEXT)"""
)
_conn.execute(
    """CREATE TABLE IF NOT EXISTS table_summaries(
            id TEXT PRIMARY KEY,
            parent_chunk_id TEXT,
            path TEXT,
            summary TEXT)"""
)
_conn.commit()

# PDF parser --------------------------------------------------------
parser_full = LlamaParse(
    api_key           = LLAMA_API_KEY or None,
    result_type       = "markdown",
    chunk_size        = None,
    allow_nested_data = False,
)

# ───────────── helpers ────────────────────

def _scalarize(val):
    return ", ".join(map(str, val)) if isinstance(val, list) else val

def _clean_meta(meta: dict) -> dict:
    return {k: _scalarize(v) for k, v in meta.items()}

def _gemini_chat(prompt: str, retry: int = 3) -> str:
    for i in range(retry):
        try:
            return _gemini_gen.generate_content(prompt).text.strip()
        except Exception:
            if i == retry - 1:
                raise
            time.sleep(1 + i)

def _embed(texts: List[str]):
    return genai.embed_content(model=MODEL_EMB, content=texts, task_type="retrieval_document")["embedding"]

# ─────────── prompts ───────────
_META_PROMPT = textwrap.dedent(""" Extract the following fields from the first‑page text of a scientific paper.
Return ONLY valid **JSON** with these exact keys:
{
  "title": string,
  "authors": [list of full names],
  "abstract": string,
  "keywords": [list of keywords],
  "Diseases": [list of diseases used in the paper],
  "Methodology": one of ["signal", "image", "graph", "language"]
}
If a field is missing, return an empty string or empty list.
Text:
""")

_QUERY_PROMPT = textwrap.dedent("""
From the user query extract any of these fields (return JSON):
{ "Diseases":[…], "title":string, "authors":[…], "keywords":[…], "Methodology":string }
Query:
""")

_SUMMARY_PROMPT = "Give a precise 200‑word summary of the following {kind}:\n\n{content}"

# ─────────── utils ───────────

def _safe_json(raw: str) -> Dict:
    raw = re.sub(r"^```json|```$", "", raw, flags=re.I).strip()
    try:
        return json.loads(raw)
    except Exception:
        return {}

# ­­­­­­­­­­­­­­­­­­­­­­­ extraction ­­­­­­­­­­­­­­­­­­­­­­­

def _extract_images(md: str):
    return [(m.group(2), m.group(1)) for m in re.finditer(r"!\[(.*?)\]\((.*?)\)", md)]

def _extract_tables(md: str):
    blocks = md.split("\n\n")
    return [b for b in blocks if re.search(r"\|.*---", b)]

def _summarise(content: str, kind: str):
    return _gemini_chat(_SUMMARY_PROMPT.format(kind=kind, content=content[:5000]))

# ─────────── ingestion ───────────

def ingest_documents(pdf_glob: str, *, by_page=True, chunk_size=1500):
    paths = glob.glob(pdf_glob)
    if not paths:
        raise FileNotFoundError("No PDF matched the pattern")

    docs = parser_full.load_data(paths)

    for doc, path in zip(docs, paths):
        meta = _safe_json(_gemini_chat(_META_PROMPT + doc.text))
        meta["path"] = path

        if by_page and getattr(doc, "pages", None):
            chunks = [(p.text, p.text) for p in doc.pages]
        else:
            txt = doc.text
            chunks = [(txt[i:i+chunk_size], txt[i:i+chunk_size]) for i in range(0, len(txt), chunk_size)]

        for raw_chunk, extract_src in chunks:
            chunk_id = str(uuid.uuid4())
            imgs     = _extract_images(extract_src)
            tbls_md  = _extract_tables(extract_src)

            # 1️⃣ text chunk ➜ Chroma
            collection_txt.add(
                ids        = [chunk_id],
                embeddings = _embed([raw_chunk]),
                documents  = [raw_chunk],
                metadatas  = [_clean_meta({**meta,
                    "chunk_id": chunk_id,
                    "chunk_preview": raw_chunk[:400],
                    "has_image": bool(imgs),
                    "has_table": bool(tbls_md),
                })],
            )

            # 2️⃣ images ➜ object_store/images + SQLite row
            for img_path, alt in imgs:
                # copy (best effort)
                dst_name = f"{uuid.uuid4()}_{os.path.basename(img_path)}"
                dst_path = os.path.join(OBJ_DIR_IMG, dst_name)
                try:
                    shutil.copy2(img_path, dst_path) if os.path.isfile(img_path) else shutil.copy2(img_path, dst_path)
                except Exception:
                    dst_path = img_path  # fallback: keep original ref

                _conn.execute(
                    "INSERT OR IGNORE INTO image_summaries VALUES (?,?,?,?,?)",
                    (str(uuid.uuid4()), chunk_id, dst_path, alt, _summarise(alt or "Image", "image")),
                )

            # 3️⃣ tables ➜ object_store/tables + SQLite row
            for tbl_md in tbls_md:
                tbl_id   = str(uuid.uuid4())
                tbl_file = os.path.join(OBJ_DIR_TBL, f"{tbl_id}.md")
                with open(tbl_file, "w", encoding="utf8") as f:
                    f.write(tbl_md)
                _conn.execute(
                    "INSERT OR IGNORE INTO table_summaries VALUES (?,?,?,?)",
                    (tbl_id, chunk_id, tbl_file, _summarise(tbl_md, "table")),
                )

        _conn.commit()
        print(f"✓ indexed {path} (pages={len(chunks):d})")

# ─────────── retrieval ───────────

def _fetch_media(parent_id: str):
    imgs  = _conn.execute("SELECT summary, path FROM image_summaries WHERE parent_chunk_id=?", (parent_id,)).fetchall()
    tbls  = _conn.execute("SELECT summary, path FROM table_summaries WHERE parent_chunk_id=?", (parent_id,)).fetchall()
    return imgs, tbls


def query_rag(question: str, *, top_k=3):
    parsed = _safe_json(_gemini_chat(_QUERY_PROMPT + question))
    q_vec  = _embed([question])[0]
    where  = {k: v for k, v in parsed.items() if v}

    res = collection_txt.query(
        query_embeddings=[q_vec],
        n_results       = top_k,
        where           = where or None,
        include         = ["documents", "metadatas"],
    )

    ctx = ""
    for i, (doc, meta) in enumerate(zip(res["documents"][0], res["metadatas"][0]), start=1):
        ctx += (
            f"\n### Doc {i}\nPath: {meta.get('path','')}\nTitle: {meta.get('title','')}\nDiseases: {meta.get('Diseases','')}\nMethodology: {meta.get('Methodology','')}\n\n{doc[:1500]}\n"
        )
        imgs, tbls = _fetch_media(meta["chunk_id"])
        for j, (summary, path) in enumerate(imgs, start=1):
            ctx += f"\n**Figure {j} summary** ({path}): {summary}\n"
        for j, (summary, path) in enumerate(tbls, start=1):
            ctx += f"\n**Table {j} summary** ({path}): {summary}\n"

    prompt = textwrap.dedent(f"""
    Use ONLY the information in the text chunks, figure summaries, and table summaries below to answer the question.
    Cite as (Doc 1), (Doc 2)… where appropriate.

    {ctx}

    Question: "{question}"
    """)

    return _gemini_chat(prompt)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pdf_glob_path = "D:\\RAG\\PAPER_BRAZIL.pdf"

# Step 1: Ingest the documents (parse, chunk, extract images/tables, and store everything)
ingest_documents(pdf_glob=pdf_glob_path, by_page=True)


Parsing files: 100%|██████████| 1/1 [00:08<00:00,  8.96s/it]


✓ indexed D:\RAG\PAPER_BRAZIL.pdf (pages=13)


In [3]:
answer = query_rag("Who are the authors?")
print(answer)

The provided text does not name the authors of the paper.


In [5]:
collection_txt.peek(1)

{'ids': ['083ab09e-a9c7-4e02-9721-7b4e9c7eac69'],
 'embeddings': [[-0.06251402944326401,
   -0.004813515115529299,
   -0.010654691606760025,
   -0.01121898926794529,
   0.0231003575026989,
   -0.0093545475974679,
   0.01863493211567402,
   0.007155774626880884,
   -0.01385700423270464,
   -0.001191228162497282,
   -0.02372383512556553,
   0.03168933093547821,
   0.011525357142090797,
   0.009895766153931618,
   0.084300696849823,
   -0.04303775355219841,
   0.06418072432279587,
   0.018642937764525414,
   -0.07346095889806747,
   -0.0053033060394227505,
   0.012962045148015022,
   -0.003485290799289942,
   -0.04503975436091423,
   -0.05224882811307907,
   -0.06599786132574081,
   -0.005751549266278744,
   0.007632554974406958,
   0.008548939600586891,
   0.03576388955116272,
   -0.008823717944324017,
   0.04065380245447159,
   0.06789446622133255,
   0.027919845655560493,
   -0.033902332186698914,
   0.015172410756349564,
   0.07548562437295914,
   0.020785463973879814,
   0.0445681288

In [11]:
docs


NameError: name 'docs' is not defined

In [1]:
# ───────────── (run once, then comment out) ─────────────
# !pip install -q docling pillow pdf2image   # Docling pulls in PDFium/Poppler

# ───────────────── IMPORTS ──────────────────────────────
import pathlib, sys
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions

# ────────────── 1. ASK FOR THE PDF FOLDER ──────────────
pdf_dir = pathlib.Path(input("📂  Folder with PDFs ➜ ").strip()).expanduser()
if not pdf_dir.is_dir():
    sys.exit(f"❌  '{pdf_dir}' is not a directory")

out_dir = pdf_dir / "_pics"
out_dir.mkdir(exist_ok=True)

# ────────────── 2. DOC LING SET-UP ──────────────────────
pdf_opts = PdfPipelineOptions(
    generate_picture_images = True,  # ← turn on figure extraction
    images_scale            = 2.0    # ← 2× default DPI ≈ 144 dpi
)
converter = DocumentConverter(
    format_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)}
)

# ────────────── 3. CONVERT & SAVE PICTURES ─────────────
print("⇣  Extracting pictures...")
for pdf in sorted(pdf_dir.glob("*.pdf")):
    print(f"   · {pdf.name}")
    ddoc = converter.convert(str(pdf)).document
    page_nos = [pic.prov[0].page_no for pic in ddoc.pictures if pic.prov]

#     for idx, pic in enumerate(ddoc.pictures, 1):
#         img = pic.get_image(ddoc)
#         if img is None:                     # tiny vector icon Docling skipped
#             continue
#         fn = out_dir / f"{pdf.stem}pic{idx}.png"
#         img.save(fn, "PNG")

# print(f"✔  Done! {len(list(out_dir.glob('*.png')))} images saved to {out_dir}")

  from .autonotebook import tqdm as notebook_tqdm


⇣  Extracting pictures...
   · PAPER_BRAZIL.pdf


In [2]:
page_nos

[3, 4]