In [None]:
# ============================================================
# Cell 0: Install all dependencies (run once, then restart runtime)
# ============================================================
!pip -q install --no-cache-dir \
  "numpy<2.0" "pandas==2.2.2" \
  pymupdf==1.24.9 tqdm==4.66.5 \
  sentence-transformers==3.0.1 faiss-cpu==1.8.0 \
  langchain==0.2.16 langchain-community==0.2.16 langchain-text-splitters==0.2.2 \
  langchain-openai==0.1.23 \
  rank_bm25==0.2.2 \
  ragas==0.1.21 \
  openai>=1.40

In [17]:
# Mount Google Drive at /content/drive; the project paths configured below
# all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import os

# Project root on the mounted Drive; every other path is derived from it.
BASE_DIR = "/content/drive/MyDrive/Kerala_DIRs"
MANIFEST_PATH = os.path.join(BASE_DIR, "10_manifests", "manifest_final.csv")
PDF_DIR = os.path.join(BASE_DIR, "01_canonical_pdfs")

# Fail fast if the expected Drive layout is not present.
assert os.path.isdir(PDF_DIR), f"Missing PDF folder: {PDF_DIR}"
assert os.path.isfile(MANIFEST_PATH), f"Missing manifest: {MANIFEST_PATH}"

for label, location in (("PDF_DIR", PDF_DIR), ("manifest", MANIFEST_PATH)):
    print(f"✅ Found {label}:", location)


✅ Found PDF_DIR: /content/drive/MyDrive/Kerala_DIRs/01_canonical_pdfs
✅ Found manifest: /content/drive/MyDrive/Kerala_DIRs/10_manifests/manifest_final.csv


In [19]:
import pandas as pd

df = pd.read_csv(MANIFEST_PATH)

# Columns the rest of the notebook cannot work without.
required_cols = {"doc_id", "canonical_filename"}
missing = required_cols.difference(df.columns)
assert not missing, f"manifest_final.csv missing columns: {missing}"

# Rebuild each PDF location from canonical_filename; the manifest's
# canonical_path column holds Windows paths and is ignored here.
df["pdf_path"] = df["canonical_filename"].map(lambda fn: os.path.join(PDF_DIR, str(fn)))
df["pdf_exists"] = df["pdf_path"].map(os.path.isfile)

not_on_drive = ~df["pdf_exists"]

print("Rows in manifest:", len(df))
print("PDFs found:", int(df["pdf_exists"].sum()))
print("PDFs missing:", int(not_on_drive.sum()))

# Show up to 20 manifest rows whose PDF could not be found.
if not_on_drive.any():
    display(df.loc[not_on_drive, ["doc_id", "canonical_filename", "pdf_path"]].head(20))


Rows in manifest: 436
PDFs found: 436
PDFs missing: 0


In [20]:
import fitz  # PyMuPDF
from tqdm import tqdm

def extract_pages(pdf_path: str):
    """Return the plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.

    Returns:
        A list with one ``{"page": <1-based page number>, "text": <stripped text>}``
        dict per page, in page order. Pages without extractable text are kept
        with an empty string.
    """
    extracted = []
    with fitz.open(pdf_path) as doc:
        for page_no in range(1, len(doc) + 1):
            raw = doc.load_page(page_no - 1).get_text("text")
            # get_text may yield a falsy value; normalise that to "".
            cleaned = (raw or "").strip()
            extracted.append({"page": page_no, "text": cleaned})
    return extracted

# Smoke test: extract the first manifest PDF that exists on Drive and preview it.
sample = df.loc[df["pdf_exists"]].iloc[0]
pages = extract_pages(sample["pdf_path"])
first_page_text = pages[0]["text"]
print("Sample doc_id:", sample["doc_id"])
print("Pages:", len(pages))
print("First page chars:", len(first_page_text))
print(first_page_text[:500])


Sample doc_id: KCDI000001
Pages: 12
First page chars: 2236
Vol.:(0123456789)
Journal of Industrial and Business Economics (2020) 47:519–530
https://doi.org/10.1007/s40812-020-00170-x
1 3
A critique of the Indian government’s response 
to the COVID-19 pandemic
Jayati Ghosh1 
Received: 30 May 2020 / Revised: 2 July 2020 / Accepted: 3 July 2020 / Published online: 11 July 2020 
© Associazione Amici di Economia e Politica Industriale 2020
Abstract
The most destructive effects of Covid-19 in India have not been the result of the dis-
ease, but the nature of 


In [21]:
from langchain_core.documents import Document

def safe_get(row, col):
    """Return ``row[col]``, or ``""`` when the column is absent or the value is missing.

    Args:
        row: A mapping-like record (e.g. a pandas Series or a dict).
        col: Key / column name to look up.

    Returns:
        The stored value when ``col`` is in ``row`` and ``pd.notna`` accepts it,
        otherwise an empty string.
    """
    if col not in row:
        return ""
    value = row[col]
    return value if pd.notna(value) else ""

# Pick the label shown for a document: the original filename when the
# manifest has a non-blank one, otherwise the title column.
def choose_display_title(row):
    """Return ``original_filename`` if it is non-blank, else ``title`` (both via safe_get)."""
    original_name = safe_get(row, "original_filename")
    fallback_title = safe_get(row, "title")
    if str(original_name).strip():
        return original_name
    return fallback_title

page_docs = []   # one Document per non-empty page, across all PDFs
failed = []      # (doc_id, pdf_path, error message) for PDFs that raised

rows_with_pdf = df[df["pdf_exists"]]

for _, row in tqdm(rows_with_pdf.iterrows(), total=len(rows_with_pdf)):
    pdf_path = row["pdf_path"]
    try:
        # The metric-bucket columns appear under several (partly truncated)
        # names in the manifest; the first non-empty candidate wins.
        primary_bucket = (
            safe_get(row, "metric_bucket_primary")
            or safe_get(row, "metric_buck_primary")
            or safe_get(row, "metric_bucket")
            or safe_get(row, "metric_buc")
        )
        secondary_bucket = (
            safe_get(row, "metric_bucket_secondary")
            or safe_get(row, "metric_buck_secondary")
            or ""
        )

        # Document-level metadata shared by every page of this PDF.
        base_meta = {
            "doc_id": safe_get(row, "doc_id"),
            "canonical_filename": safe_get(row, "canonical_filename"),
            "display_title": choose_display_title(row),
            "year": safe_get(row, "year"),
            "doi": safe_get(row, "doi"),
            "geo_scope": safe_get(row, "geo_scope"),
            "metadata_confidence": safe_get(row, "metadata_confidence"),
            "metric_bucket_primary": primary_bucket,
            "metric_bucket_secondary": secondary_bucket,
            "sha256_hash": safe_get(row, "sha256_hash"),
            "file_size_bytes": safe_get(row, "file_size_bytes"),
            "source_folder": safe_get(row, "source_folder"),
        }

        for page in extract_pages(pdf_path):
            # Pages with no extractable text are dropped.
            if page["text"]:
                # pdf_path is stored alongside the page number for traceability.
                page_meta = {**base_meta, "page": page["page"], "pdf_path": pdf_path}
                page_docs.append(Document(page_content=page["text"], metadata=page_meta))

    except Exception as err:
        # Keep going on unreadable PDFs, but record which ones failed and why.
        failed.append((safe_get(row, "doc_id"), pdf_path, str(err)))

print("✅ Page-docs created:", len(page_docs))
print("❌ Failed PDFs:", len(failed))
failed[:5]


100%|██████████| 436/436 [00:59<00:00,  7.35it/s]

✅ Page-docs created: 6912
❌ Failed PDFs: 2





[('KCDI000387',
  '/content/drive/MyDrive/Kerala_DIRs/01_canonical_pdfs/kcdi000387_.pdf',
  "Cannot open empty file: filename='/content/drive/MyDrive/Kerala_DIRs/01_canonical_pdfs/kcdi000387_.pdf'."),
 ('KCDI000424',
  '/content/drive/MyDrive/Kerala_DIRs/01_canonical_pdfs/kcdi000424_.pdf',
  "Failed to open file '/content/drive/MyDrive/Kerala_DIRs/01_canonical_pdfs/kcdi000424_.pdf'.")]

In [None]:
# ============================================================
# STEP 1: Improved Chunking (larger chunks for academic papers)
# ============================================================
# Academic papers have dense paragraphs — 1500 chars with 300 overlap
# keeps more context per chunk while still being under token limits.

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Separator order: paragraph, line, sentence (". "), word, then character.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=300,
    separators=["\n\n", "\n", ". ", " ", ""],
)

chunk_docs = splitter.split_documents(page_docs)

# Full page text keyed by (doc_id, page), so each chunk can carry its
# parent page as "parent_text" for parent-child retrieval later.
page_lookup = {
    (page_doc.metadata["doc_id"], page_doc.metadata["page"]): page_doc.page_content
    for page_doc in page_docs
}

for chunk in chunk_docs:
    parent_key = (chunk.metadata["doc_id"], chunk.metadata["page"])
    chunk.metadata["parent_text"] = page_lookup.get(parent_key, "")

total_chunk_chars = sum(len(c.page_content) for c in chunk_docs)

print(f"Chunks created: {len(chunk_docs)}")
print(f"Avg chunk length: {total_chunk_chars / len(chunk_docs):.0f} chars")
print(f"Parent pages indexed: {len(page_lookup)}")

In [None]:
# ============================================================
# STEP 2: Upgraded Embedding Model + FAISS Index
# ============================================================
# bge-base-en-v1.5 is a much stronger retrieval model than MiniLM.
# It uses 768-dim embeddings and was trained specifically for retrieval.
# On Colab GPU this takes ~15-20 min for ~20k chunks.

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

import torch  # installed as a dependency of sentence-transformers

EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"

# Use the GPU when the runtime has one; otherwise fall back to CPU instead of
# failing on a hard-coded "cuda" device.
EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={"device": EMBEDDING_DEVICE},
    encode_kwargs={"normalize_embeddings": True}  # bge models need L2 normalization
)

# Build the FAISS index — this is the slow step (~15-20 min on GPU)
print(f"Embedding {len(chunk_docs)} chunks with {EMBEDDING_MODEL} on {EMBEDDING_DEVICE}...")
vectorstore = FAISS.from_documents(chunk_docs, embeddings)

# Save to Drive so you don't have to re-embed every session
INDEX_DIR = os.path.join(BASE_DIR, "faiss_index_bge_base")
vectorstore.save_local(INDEX_DIR)
print(f"FAISS index saved to: {INDEX_DIR}")

In [None]:
# ============================================================
# STEP 2b: (OPTIONAL) Load saved index instead of re-embedding
# ============================================================
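# After the first run you can skip re-embedding and load the saved index from Drive:
# uncomment the lines below and skip the STEP 2 cell above on subsequent runs.
# (STEP 1 must still be run, because STEP 3 builds the BM25 index from `chunk_docs`.)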

# from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain_community.vectorstores import FAISS
#
# EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"
# embeddings = HuggingFaceEmbeddings(
#     model_name=EMBEDDING_MODEL,
#     model_kwargs={"device": "cuda"},
#     encode_kwargs={"normalize_embeddings": True}
# )
# INDEX_DIR = os.path.join(BASE_DIR, "faiss_index_bge_base")
# vectorstore = FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)
# print(f"Loaded FAISS index from: {INDEX_DIR}")

In [None]:
# ============================================================
# STEP 3: BM25 Retriever + Hybrid Ensemble
# ============================================================
# BM25 catches exact keyword matches (acronyms like MGNREGA, Kudumbashree,
# LSG, CDS) that dense embeddings often miss. EnsembleRetriever combines both.

from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

import re


def _bm25_tokenize(text: str):
    """Tokenize text for BM25: lower-case, then split on non-word characters.

    BM25Retriever's default tokenizer is a plain whitespace split, which is
    case-sensitive and keeps punctuation glued to words, so a query token such
    as "Kerala?" or "kudumbashree" would not match "Kerala" / "Kudumbashree"
    in the corpus. The same function is applied to documents and queries.
    """
    return re.findall(r"\w+", text.lower())


# Build BM25 index over the same chunks
print("Building BM25 index...")
bm25_retriever = BM25Retriever.from_documents(
    chunk_docs,
    k=25,
    preprocess_func=_bm25_tokenize,
)

# Dense retriever from FAISS
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

# Combine: 60% dense + 40% BM25
# (dense gets more weight because semantic matching is usually better,
#  but BM25 rescues keyword-specific queries)
ensemble_retriever = EnsembleRetriever(
    retrievers=[dense_retriever, bm25_retriever],
    weights=[0.6, 0.4]
)

print("Hybrid retriever ready (FAISS 60% + BM25 40%)")

In [None]:
# ============================================================
# STEP 4: Cross-Encoder Reranker
# ============================================================
# The reranker takes the candidates returned by the hybrid retriever and
# re-scores them with a cross-encoder (much more accurate but slower).
# Each underlying retriever is configured with k=25, so the merged candidate
# list may hold more than 25 documents — TODO confirm against EnsembleRetriever.
# We keep only the top-k after reranking.

from sentence_transformers import CrossEncoder
import numpy as np

RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# max_length=512 is presumably the token limit per (query, passage) pair —
# confirm in the sentence-transformers CrossEncoder docs.
reranker = CrossEncoder(RERANKER_MODEL, max_length=512)

def retrieve_and_rerank(query: str, top_k: int = 6):
    """Hybrid retrieve -> cross-encoder rerank -> return the best ``top_k`` chunks.

    Args:
        query: Natural-language question.
        top_k: Maximum number of chunks to return; values <= 0 yield ``[]``.

    Returns:
        A list of at most ``top_k`` Documents ordered by descending reranker
        score. Each returned Document is a fresh copy whose metadata carries an
        extra ``"reranker_score"`` float. The objects held by the retrievers are
        never modified, so results kept from an earlier query are not
        overwritten by a later one.
    """
    # Step 1: candidates from the hybrid (dense + BM25) retriever.
    candidates = ensemble_retriever.invoke(query)

    if not candidates or top_k <= 0:
        return []

    # Step 2: score every (query, passage) pair with the cross-encoder.
    scores = reranker.predict([(query, doc.page_content) for doc in candidates])

    # Step 3: order by score, best first (sorted() is stable, so ties keep
    # the retriever's order).
    ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)

    # Step 4: attach the score on a copy rather than on the shared Document.
    return [
        Document(
            page_content=doc.page_content,
            metadata={**doc.metadata, "reranker_score": float(score)},
        )
        for doc, score in ranked[:top_k]
    ]

# Quick test: run one query through retrieve + rerank and preview each hit.
test_results = retrieve_and_rerank("What is the role of Kudumbashree in poverty reduction?")
print(f"Retrieved and reranked: {len(test_results)} chunks")
for rank, hit in enumerate(test_results, start=1):
    hit_meta = hit.metadata
    print(f"\n[{rank}] score={hit_meta['reranker_score']:.4f}  doc={hit_meta['doc_id']}  p.{hit_meta['page']}")
    print(f"    {hit.page_content[:200]}...")

In [None]:
# ============================================================
# STEP 5: OpenAI LLM Generation with Citations
# ============================================================
import os
from getpass import getpass
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Set your OpenAI API key (will prompt you securely)
# The prompt only appears when OPENAI_API_KEY is not already in the environment,
# so the key is never written into the notebook itself.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# Use gpt-4o-mini for cost efficiency; switch to "gpt-4o" for higher quality
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

# System message used for every question. The [doc_id, p.X] citation format
# requested in rule 1 matches the doc_id / page fields that format_context
# writes into each source header.
SYSTEM_PROMPT = """You are an expert research assistant specializing in Kerala community development, 
local governance, decentralization, and Indian urban/rural development policy.

You answer questions using ONLY the provided source documents. Follow these rules strictly:

1. Base every claim on the provided sources. Cite using [doc_id, p.X] format after each claim.
2. If the sources do not contain enough information to answer, say so explicitly.
3. Synthesize information across multiple sources when relevant.
4. When sources disagree, note the disagreement and cite both sides.
5. Keep answers focused, well-structured, and academic in tone.
6. End with a "Sources Used" section listing all cited documents with their titles."""

def format_context(docs):
    """Render retrieved documents as one context string for the LLM.

    Each document becomes a ``[Source N] doc_id=… | page=… | title=… | year=… | geo=…``
    header line (title cut to 100 characters, ``?`` for absent doc_id / page /
    year / geo_scope keys) followed by the chunk text. Blocks are separated by
    a ``---`` rule.

    Args:
        docs: Iterable of objects exposing ``metadata`` (dict) and ``page_content``.

    Returns:
        The joined context string; ``""`` when ``docs`` is empty.
    """
    def _render(position, doc):
        md = doc.metadata
        fields = [
            f"doc_id={md.get('doc_id','?')}",
            f"page={md.get('page','?')}",
            f"title={str(md.get('display_title',''))[:100]}",
            f"year={md.get('year','?')}",
            f"geo={md.get('geo_scope','?')}",
        ]
        header = f"[Source {position}] " + " | ".join(fields)
        return f"{header}\n{doc.page_content}"

    return "\n\n---\n\n".join(
        _render(position, doc) for position, doc in enumerate(docs, 1)
    )

# Two-message chat prompt: the fixed system rules, then a human turn with two
# placeholders — {context} and {question} — filled in at invoke time.
prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT),
    ("human", """Based on the following source documents, answer the question.

SOURCES:
{context}

QUESTION: {question}

Provide a thorough, well-cited answer:""")
])

# Pipe the rendered prompt into the chat model; rag_query calls chain.invoke(...)
# with a dict holding "context" and "question".
chain = prompt | llm

print("LLM chain ready (gpt-4o-mini)")

In [None]:
# ============================================================
# STEP 6: Full RAG Pipeline (retrieve + rerank + generate)
# ============================================================

def rag_query(question: str, top_k: int = 6, verbose: bool = True):
    """Answer a question end to end: hybrid retrieve -> rerank -> cited LLM answer.

    Args:
        question: The user's question.
        top_k: Number of reranked chunks handed to the LLM.
        verbose: When True, print the question, the answer and the ranked sources.

    Returns:
        A dict with ``"answer"`` (str), ``"sources"`` (one summary dict per
        chunk: doc_id, page, title cut to 120 chars, year, reranker_score) and
        ``"docs"`` (the retrieved Documents). When nothing is retrieved the
        answer is a fixed "No relevant documents found." message and both
        lists are empty.
    """
    docs = retrieve_and_rerank(question, top_k=top_k)
    if not docs:
        return {"answer": "No relevant documents found.", "sources": [], "docs": []}

    # Build the context block and ask the LLM.
    llm_reply = chain.invoke({"context": format_context(docs), "question": question})
    answer = llm_reply.content

    # Compact per-chunk summary for display and downstream inspection.
    sources = [
        {
            "doc_id": d.metadata.get("doc_id"),
            "page": d.metadata.get("page"),
            "title": str(d.metadata.get("display_title", ""))[:120],
            "year": d.metadata.get("year"),
            "reranker_score": d.metadata.get("reranker_score", 0),
        }
        for d in docs
    ]

    if verbose:
        banner = "=" * 80
        print(banner)
        print(f"QUESTION: {question}")
        print(banner)
        print(f"\n{answer}\n")
        print("-" * 40)
        print("Retrieved Sources (ranked by relevance):")
        for position, src in enumerate(sources, 1):
            print(f"  [{position}] {src['doc_id']} p.{src['page']} (score={src['reranker_score']:.3f}) — {src['title']}")

    return {"answer": answer, "sources": sources, "docs": docs}


# --- Try it! ---
# One end-to-end query. verbose defaults to True, so the answer and the ranked
# sources are printed; the returned dict is kept in `result` for inspection.
result = rag_query("What indicators are used to measure municipal service delivery performance in Kerala?")


In [None]:
# ============================================================
# STEP 6b: More example queries to test the pipeline
# ============================================================

# Sample questions used to eyeball the pipeline.
queries = [
    "How does Kudumbashree contribute to women's empowerment and poverty alleviation?",
    "What are the main challenges in solid waste management in Kerala municipalities?",
    "How do studies measure road congestion or traffic conditions in Indian cities?",
    "What role do gram panchayats play in local governance and decentralization?",
    "What indicators measure electricity or water service reliability in Kerala?",
]

# Each call prints its own report (verbose defaults to True); the extra print
# leaves blank lines between consecutive reports.
for example_question in queries:
    rag_query(example_question, top_k=5)
    print("\n\n")

# --- EVALUATION SECTION ---
## Step 7: Build evaluation dataset and run RAGAS metrics

The cells below let you:
1. Define ground-truth Q&A pairs with expected source doc_ids
2. Run retrieval metrics (Hit Rate, MRR)
3. Run generation metrics via RAGAS (faithfulness, relevancy, context precision/recall)

In [None]:
# ============================================================
# STEP 7a: Evaluation Dataset (Ground Truth)
# ============================================================
# Fill in expected_doc_ids with doc_ids you KNOW should be retrieved.
# Start with 10-15 and grow to 30-50 as you review results.
# You can find relevant doc_ids by running rag_query and checking which
# sources are returned — then verify if those are correct.

# Each item is a dict with three keys:
#   "question"            - the query sent through the pipeline
#   "expected_doc_ids"    - doc_ids that should be retrieved; evaluate_retrieval
#                           skips items where this list is empty
#   "ground_truth_answer" - optional reference answer, read by run_ragas_evaluation
eval_dataset = [
    {
        "question": "What indicators are used to measure municipal service delivery performance in Kerala?",
        "expected_doc_ids": [],   # <-- fill in after reviewing results, e.g. ["KCDI000045", "KCDI000112"]
        "ground_truth_answer": "" # <-- optional: write a short reference answer
    },
    {
        "question": "How does Kudumbashree contribute to women's empowerment?",
        "expected_doc_ids": [],
        "ground_truth_answer": ""
    },
    {
        "question": "What are the challenges in solid waste management in Kerala municipalities?",
        "expected_doc_ids": [],
        "ground_truth_answer": ""
    },
    {
        "question": "What role do gram panchayats play in decentralized governance?",
        "expected_doc_ids": [],
        "ground_truth_answer": ""
    },
    {
        "question": "How is literacy rate measured across Kerala districts?",
        "expected_doc_ids": [],
        "ground_truth_answer": ""
    },
    {
        "question": "What are the main sources of revenue for local self-government institutions in Kerala?",
        "expected_doc_ids": [],
        "ground_truth_answer": ""
    },
    {
        "question": "How do studies measure housing quality or housing conditions in Indian cities?",
        "expected_doc_ids": [],
        "ground_truth_answer": ""
    },
    {
        "question": "What is the impact of MGNREGA on rural employment in Kerala?",
        "expected_doc_ids": [],
        "ground_truth_answer": ""
    },
    {
        "question": "How is drinking water access measured at the ward or panchayat level?",
        "expected_doc_ids": [],
        "ground_truth_answer": ""
    },
    {
        "question": "What are the health indicators used in community development indices?",
        "expected_doc_ids": [],
        "ground_truth_answer": ""
    },
]

print(f"Evaluation dataset: {len(eval_dataset)} questions")
print("NOTE: Fill in expected_doc_ids after reviewing initial results!")

In [None]:
# ============================================================
# STEP 7b: Retrieval Metrics (Hit Rate, MRR)
# ============================================================
# These measure whether the correct documents appear in the top-k results.
# Only works once you fill in expected_doc_ids above.

def evaluate_retrieval(eval_data, top_k=6):
    """Compute Hit Rate@k and MRR@k over the labelled evaluation items.

    Items whose ``expected_doc_ids`` list is empty are skipped. For the rest,
    the question is run through ``retrieve_and_rerank`` and the doc_ids of the
    returned chunks are compared with the expected set.

    Args:
        eval_data: Iterable of dicts with ``"question"`` and ``"expected_doc_ids"``.
        top_k: Number of reranked chunks to inspect per question.

    Returns:
        ``{"hit_rate": ..., "mrr": ...}`` (means over the labelled items), or
        ``None`` when no item has expected doc_ids.
    """
    hit_rates, mrrs = [], []

    for item in eval_data:
        expected = set(item["expected_doc_ids"])
        if expected:
            retrieved_ids = [
                d.metadata.get("doc_id")
                for d in retrieve_and_rerank(item["question"], top_k=top_k)
            ]

            # Rank (1-based) of the first expected doc_id, or None if absent.
            first_rank = next(
                (rank for rank, rid in enumerate(retrieved_ids, 1) if rid in expected),
                None,
            )

            # Hit: any expected doc in the top-k. RR: 1 / rank of the first one.
            hit_rates.append(0.0 if first_rank is None else 1.0)
            mrrs.append(0.0 if first_rank is None else 1.0 / first_rank)

    if not hit_rates:
        print("No evaluation items have expected_doc_ids filled in yet!")
        return

    mean_hit_rate = np.mean(hit_rates)
    mean_rr = np.mean(mrrs)

    print(f"Evaluated {len(hit_rates)} questions at top_k={top_k}")
    print(f"  Hit Rate@{top_k}: {mean_hit_rate:.3f}")
    print(f"  MRR@{top_k}:      {mean_rr:.3f}")

    return {"hit_rate": mean_hit_rate, "mrr": mean_rr}

# Run it (will only produce results once expected_doc_ids are filled in).
# Until then the function prints a notice and returns None.
evaluate_retrieval(eval_dataset, top_k=6)

In [None]:
# ============================================================
# STEP 7c: RAGAS Evaluation (Faithfulness, Relevancy, etc.)
# ============================================================
# RAGAS uses an LLM (OpenAI) to automatically evaluate RAG quality.
# It measures:
#   - faithfulness: is the answer grounded in the retrieved context?
#   - answer_relevancy: does it actually answer the question?
#   - context_precision: are the retrieved chunks relevant?
#   - context_recall: did we retrieve all the needed context?
#     (requires ground_truth_answer to be filled in)

from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from datasets import Dataset

def run_ragas_evaluation(eval_data, top_k=6):
    """Run the RAG pipeline on every eval question and score it with RAGAS.

    ``faithfulness`` and ``answer_relevancy`` need no reference answer and are
    always computed. ``context_precision`` and ``context_recall`` compare the
    retrieved context with a reference, so they are computed only when every
    item has a non-empty ``ground_truth_answer``. A missing reference is never
    replaced by the generated answer: that would score the pipeline against
    its own output and make both metrics meaningless.

    Args:
        eval_data: Iterable of dicts with ``"question"`` and, optionally,
            ``"ground_truth_answer"``.
        top_k: Number of reranked chunks passed to the LLM per question.

    Returns:
        Whatever ``ragas.evaluate`` returns for the selected metrics.
    """
    records = []
    for item in eval_data:
        question = item["question"]
        rag_output = rag_query(question, top_k=top_k, verbose=False)
        records.append({
            "question": question,
            "answer": rag_output["answer"],
            "contexts": [d.page_content for d in rag_output["docs"]],
            "ground_truth": (item.get("ground_truth_answer") or "").strip(),
        })

    n_missing = sum(1 for rec in records if not rec["ground_truth"])
    has_all_references = bool(records) and n_missing == 0

    columns = {
        "question": [rec["question"] for rec in records],
        "answer": [rec["answer"] for rec in records],
        "contexts": [rec["contexts"] for rec in records],
    }

    # Reference-free metrics always run; reference-based ones only with real references.
    metrics = [faithfulness, answer_relevancy]
    if has_all_references:
        columns["ground_truth"] = [rec["ground_truth"] for rec in records]
        metrics += [context_precision, context_recall]
    else:
        print(f"NOTE: {n_missing} of {len(records)} questions have no ground_truth_answer; "
              "skipping context_precision and context_recall. "
              "Fill in every ground_truth_answer to enable them.")

    # Build HuggingFace Dataset
    ds = Dataset.from_dict(columns)

    # Run evaluation
    print(f"Running RAGAS evaluation on {len(records)} questions...")
    scores = evaluate(ds, metrics=metrics)

    print("\n--- RAGAS Results ---")
    for metric_name, score in scores.items():
        print(f"  {metric_name}: {score:.4f}")

    return scores

# Run it: every question in eval_dataset goes through rag_query (top_k=6)
# and the collected answers and contexts are scored with RAGAS.
ragas_results = run_ragas_evaluation(eval_dataset, top_k=6)