In [1]:
# NOTE: this is a big install (many optional connectors). Remove packages you don't need.
#!pip install --upgrade --quiet \
#  langchain langchain-community langchain-huggingface \
#  langchain-google-genai google-generativeai \
#  sentence-transformers transformers accelerate huggingface-hub sentencepiece \
#  faiss-cpu \
#  easyocr python-dotenv pandas openpyxl python-docx pypdf pillow jq

In [2]:
# ✅ Install dependencies (run once)
#!pip install -U langchain-text-splitters spacy

# ✅ Download the spaCy model (run once)
#!python -m spacy download en_core_web_sm

# ✅ Correct modern import
from langchain_text_splitters import SpacyTextSplitter

# ✅ Initialize splitter with the SpaCy pipeline name
spacy_splitter = SpacyTextSplitter(pipeline="en_core_web_sm")

# Optional customization
# spacy_splitter = SpacyTextSplitter(pipeline="en_core_web_sm", separator="\n\n", chunk_size=2000)

In [3]:
from tqdm import tqdm
from pathlib import Path
from sentence_transformers import SentenceTransformer, util
from pypdf import PdfReader
from docx import Document
from PIL import Image
import numpy as np
import spacy
import io
import re
import easyocr
from dotenv import load_dotenv
import os
import torch
import math
import sys
import traceback
from dataclasses import dataclass
from typing import List, Dict, Any, Optional

In [4]:
#!pip install spacy
#!pip install -U langchain-text-splitters spacy
#!python -m spacy download en_core_web_sm
#!pip install hf_xet
#!pip install --upgrade langchain langchain-community
#!pip install --upgrade langchain-core
#!pip install langchain-google-genai

In [5]:
# --- Load environment variables ---
load_dotenv()

# ✅ Create a data directory if not exists
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, "data")
os.makedirs(DATA_DIR, exist_ok=True)

# --- Core configuration ---
CONFIG = {
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
    "chunk_min_len": 150,  # min chars before splitting
    "chunk_max_len": 1000, # adaptive splitting upper limit
    "overlap_ratio": 0.15, # contextual overlap for retriever
    "embedding_batch_size": 16,
    "retrieval_top_k": 5,
}

# --- Check essential env keys ---
api_keys = {
    "GOOGLE_API_KEY": os.getenv("GOOGLE_API_KEY"),
    "HUGGINGFACEHUB_API_TOKEN": os.getenv("HUGGINGFACEHUB_API_TOKEN"),
}

print("✅ Environment setup complete")
print(f"💾 Working directory: {BASE_DIR}")
print(f"📁 Data directory: {DATA_DIR}")
print(f"⚙️ Using device: {CONFIG['device']}")
print(f"🔑 Keys loaded: {', '.join([k for k, v in api_keys.items() if v]) or 'None'}")
print(f"📚 Embedding Model: {CONFIG['embedding_model']}")

# --- Quick import test ---
try:
    import langchain
    import langchain_community
    import langchain_huggingface
    import faiss
    from sentence_transformers import SentenceTransformer
    print("✅ Core libraries imported successfully.")
except Exception as e:
    print("❌ Library import error:", e)

✅ Environment setup complete
💾 Working directory: c:\Users\mbkhn\Downloads\Inspired\RAG
📁 Data directory: c:\Users\mbkhn\Downloads\Inspired\RAG\data
⚙️ Using device: cpu
🔑 Keys loaded: GOOGLE_API_KEY, HUGGINGFACEHUB_API_TOKEN
📚 Embedding Model: sentence-transformers/all-MiniLM-L6-v2
✅ Core libraries imported successfully.


In [6]:
# ✅ Correct modern import
from langchain_text_splitters import SpacyTextSplitter

# ✅ Initialize splitter with the SpaCy pipeline name
spacy_splitter = SpacyTextSplitter(pipeline="en_core_web_sm")

# Optional customization
# spacy_splitter = SpacyTextSplitter(pipeline="en_core_web_sm", separator="\n\n", chunk_size=2000)

In [7]:
#del hybrid_chunks

In [8]:
# ===== Cell 6 (FIXED AGAIN) — Hybrid Semantic Chunking (spaCy + transformers) with guardrails =====
import os
import re
from pathlib import Path

import spacy
from sentence_transformers import SentenceTransformer, util

# Optional: if you defined CONFIG earlier, reuse it; otherwise fall back
try:
    DEVICE = CONFIG.get("device", "cpu")
    EMBEDDING_MODEL_NAME = CONFIG.get("embedding_model", "sentence-transformers/all-MiniLM-L6-v2")
except Exception:
    DEVICE = "cpu"
    EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# ---------- Config / guardrails ----------
MIN_CHARS = 150           # minimum chunk length (characters)
MAX_CHARS = 1200          # maximum chunk length (characters)
SIM_THRESHOLD = 0.72      # cosine similarity threshold to continue merging sentences
EMBED_BATCH = 32          # batch size for sentence embedding
DATA_DIR = "data"         # fallback data dir to load files if needed

# ---------- Ensure spaCy model is available and increase max length ----------
try:
    nlp = spacy.load("en_core_web_sm")
except Exception as e:
    raise RuntimeError(
        "spaCy model en_core_web_sm not found. Run in terminal: python -m spacy download en_core_web_sm"
    ) from e

# Increase max length safely (just assign an int)
nlp.max_length = 10_000_000

# ---------- Embedding model (SentenceTransformers) ----------
embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=DEVICE)

# ---------- Utility helpers ----------
BOILERPLATE_PATTERNS = [
    r"^\s*figure\s*\d+", r"^\s*page\s*\d+", r"^\s*table\s*\d+", r"^\s*copyright", r"^\s*.\s*$"
]

def is_boilerplate(s: str) -> bool:
    s_strip = s.strip().lower()
    if len(s_strip) < 8:
        return True
    for p in BOILERPLATE_PATTERNS:
        if re.search(p, s_strip):
            return True
    return False

def looks_like_table(text: str) -> bool:
    if text.count("|") >= 2 or "\t" in text:
        return True
    comma_fraction = sum(1 for tok in text.split() if re.match(r"^[\d,\.%-]+$", tok)) / max(1, len(text.split()))
    if comma_fraction > 0.25 and len(text.split()) > 6:
        return True
    return False

# ---------- Get documents (use existing variables if present) ----------
# Prefer workspace variables created earlier: documents or all_docs
if "hybrid_chunks" in globals():
    raise RuntimeError("hybrid_chunks already exists in the workspace — delete or rename it before re-running this cell.")

docs_source = None
if "documents" in globals() and isinstance(documents, list) and len(documents) > 0:
    docs_source = documents
elif "all_docs" in globals() and isinstance(all_docs, list) and len(all_docs) > 0:
    docs_source = all_docs
else:
    # fallback: load from DATA_DIR using basic loaders (no OCR here)
    from langchain_community.document_loaders import (
        TextLoader, PyPDFLoader, Docx2txtLoader, CSVLoader, UnstructuredExcelLoader, UnstructuredPowerPointLoader
    )
    docs_source = []
    for f in sorted(os.listdir(DATA_DIR)):
        p = os.path.join(DATA_DIR, f)
        ext = os.path.splitext(f)[1].lower()
        try:
            if ext == ".txt":
                docs_source.extend(TextLoader(p).load())
            elif ext == ".pdf":
                docs_source.extend(PyPDFLoader(p).load())
            elif ext == ".docx":
                docs_source.extend(Docx2txtLoader(p).load())
            elif ext == ".csv":
                docs_source.extend(CSVLoader(p).load())
            elif ext in [".xls", ".xlsx"]:
                docs_source.extend(UnstructuredExcelLoader(p).load())
            elif ext in [".ppt", ".pptx"]:
                docs_source.extend(UnstructuredPowerPointLoader(p).load())
            else:
                # skip images (assume OCR handled earlier) or unsupported types
                continue
            print(f"Loaded: {f}")
        except Exception as e:
            print(f"Failed to load {f}: {e}")

if not docs_source:
    raise RuntimeError("No documents found. Place files in ./data or create 'documents' / 'all_docs' variables before running.")

# ---------- Core hybrid chunking algorithm ----------
hybrid_chunks_out = []

for doc_idx, doc in enumerate(docs_source):
    # Accept either langchain Document or simple dict/str
    text = doc.page_content if hasattr(doc, "page_content") else (doc.get("text") if isinstance(doc, dict) else str(doc))
    src_meta = (doc.metadata.get("source") if hasattr(doc, "metadata") else None) or (getattr(doc, "source", None) or f"doc_{doc_idx}")
    if not text or not text.strip():
        continue

    # Normalize newlines
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Heuristic paragraph splitting
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]

    for para in paragraphs:
        # If very short paragraph or looks like table -> keep whole paragraph as chunk
        if looks_like_table(para) or len(para) < MIN_CHARS:
            hybrid_chunks_out.append({
                "text": para,
                "source": src_meta,
                "tokens": len(para.split()),
                "boundary_reason": "table_or_short_para"
            })
            continue

        # Sentence segmentation with spaCy (fallback to naive split if spaCy fails)
        try:
            doc_sp = nlp(para)
            sents = [s.text.strip() for s in doc_sp.sents if s.text.strip()]
        except Exception:
            sents = [s.strip() for s in re.split(r'(?<=[\.\?\!])\s+', para) if s.strip()]

        # Filter boilerplate
        sents = [s for s in sents if not is_boilerplate(s)]
        if not sents:
            continue

        # Embed sentences (vector form) in batch
        sent_embeddings = embed_model.encode(sents, convert_to_tensor=True, batch_size=EMBED_BATCH)

        # Merge neighbor sentences by semantic similarity and size constraints
        cur_indices = []
        cur_len = 0
        cur_emb_sum = None
        for i, sent in enumerate(sents):
            sent_len = len(sent)
            sent_emb = sent_embeddings[i]

            if not cur_indices:
                cur_indices = [i]
                cur_len = sent_len
                cur_emb_sum = sent_emb.clone()
                continue

            cur_mean_emb = (cur_emb_sum / len(cur_indices))
            sim = util.cos_sim(cur_mean_emb, sent_emb).item()

            # Merge if similar and size allows, or if current chunk still too short
            if (sim >= SIM_THRESHOLD and cur_len + sent_len <= MAX_CHARS) or (cur_len < MIN_CHARS):
                cur_indices.append(i)
                cur_len += sent_len + 1
                cur_emb_sum = cur_emb_sum + sent_emb
            else:
                # finalize current chunk
                chunk_text = " ".join([sents[j] for j in cur_indices]).strip()
                if len(chunk_text) >= MIN_CHARS:
                    hybrid_chunks_out.append({
                        "text": chunk_text,
                        "source": src_meta,
                        "tokens": len(chunk_text.split()),
                        "boundary_reason": "semantic_split"
                    })
                # start new chunk
                cur_indices = [i]
                cur_len = sent_len
                cur_emb_sum = sent_emb.clone()

        # finalize last chunk in paragraph
        if cur_indices:
            chunk_text = " ".join([sents[j] for j in cur_indices]).strip()
            if len(chunk_text) >= 1:
                hybrid_chunks_out.append({
                    "text": chunk_text,
                    "source": src_meta,
                    "tokens": len(chunk_text.split()),
                    "boundary_reason": "paragraph_end"
                })

# ---------- Postprocess: dedupe & light filtering ----------
final_chunks = []
seen = set()
for ch in hybrid_chunks_out:
    txt = re.sub(r'\s+', ' ', ch["text"]).strip()
    if len(txt) < 10:
        continue
    if txt in seen:
        continue
    seen.add(txt)
    # mark short but meaningful chunks
    if len(txt) < MIN_CHARS and not looks_like_table(txt):
        ch["boundary_reason"] = ch.get("boundary_reason", "") + "|short"
    ch["text"] = txt
    final_chunks.append(ch)

# Expose to notebook namespace
hybrid_chunks = final_chunks

print(f"✅ Hybrid chunking finished. Created {len(hybrid_chunks)} chunks from {len(docs_source)} documents.")
for i, ex in enumerate(hybrid_chunks[:3]):
    print(f"\n--- CHUNK {i+1} ---\nsource: {ex['source']}\ntokens: {ex['tokens']}\nboundary: {ex['boundary_reason']}\ntext preview: {ex['text'][:300]}...")

Loaded: 20251014-174708_Top AC Repa.csv
Loaded: ICMLA_329_Comment_Response.pdf
Failed to load ICMLA_329_Final.docx: No module named 'docx2txt'
Loaded: Invoice_TNB-18A84296.pdf
✅ Hybrid chunking finished. Created 175 chunks from 107 documents.

--- CHUNK 1 ---
source: data\20251014-174708_Top AC Repa.csv
tokens: 31
boundary: paragraph_end
text preview: ﻿Company Name: Shah AIR Cool Company Name link: https://www.justdial.com/Thane/Shah-AIR-Cool-Palava-City/022PXX22-XX22-231005182331-X6T7_BZDET?trkid=&term=&ncatid=10890481&area=&search=Popular%20AC%20Repair%20&%20Services%20in%20Mumbai&mncatname=AC%20Repair%20%26%20Services&ftterm=&abd_btn=Send%20En...

--- CHUNK 2 ---
source: data\20251014-174708_Top AC Repa.csv
tokens: 28
boundary: paragraph_end
text preview: ﻿Company Name: Cool India Enterprises Company Name link: https://www.justdial.com/Mumbai/Cool-India-Enterprises-Near-Nahar-Amrit-Shakti-Andheri-East/022PXX22-XX22-110831111304-F8X9_BZDET?trkid=&term=&ncatid=10890481&area=&search=Po

In [9]:
# ===== Cell 7 (robust) — Build Embeddings + FAISS vector store (persisted) =====
import os
from pathlib import Path
from tqdm.auto import tqdm

# --- safe imports for Document (works across LangChain versions) ---
try:
    from langchain.schema import Document
except Exception:
    try:
        from langchain_core.documents import Document
    except Exception:
        from dataclasses import dataclass
        @dataclass
        class Document:
            page_content: str
            metadata: dict = None

# --- embeddings & vectorstore imports ---
try:
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS
except Exception as e:
    raise RuntimeError(
        "Could not import langchain_community embeddings/vectorstores. "
        "Install/update langchain-community and restart the kernel."
    ) from e

# Settings
VECTOR_DIR = Path("vectorstore")
VECTOR_DIR.mkdir(exist_ok=True)
EMB_MODEL = os.environ.get("EMB_MODEL", None) or (globals().get("CONFIG", {}).get("embedding_model", "sentence-transformers/all-MiniLM-L6-v2"))
BATCH = globals().get("CONFIG", {}).get("embedding_batch_size", 16)
TOP_K = globals().get("CONFIG", {}).get("retrieval_top_k", 5)

# Ensure hybrid_chunks exists
if "hybrid_chunks" not in globals() or not isinstance(hybrid_chunks, list) or len(hybrid_chunks) == 0:
    raise RuntimeError("hybrid_chunks not found or empty — run the hybrid chunking cell first.")

print(f"Building embeddings for {len(hybrid_chunks)} chunks using model: {EMB_MODEL}")

# Convert to Document objects (with metadata)
doc_objects = [
    Document(
        page_content=ch["text"],
        metadata={
            "source": ch.get("source"),
            "tokens": ch.get("tokens"),
            "boundary_reason": ch.get("boundary_reason"),
            "chunk_id": i,
        },
    )
    for i, ch in enumerate(hybrid_chunks)
]

# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name=EMB_MODEL)

# Build FAISS index
vectorstore = None
try:
    vectorstore = FAISS.from_documents(doc_objects, embedding=embedding_model)
except Exception as e:
    print("⚠️ FAISS.from_documents failed — trying batched ingestion. Error:", e)
    try:
        texts = [d.page_content for d in doc_objects]
        metadatas = [d.metadata if d.metadata else {} for d in doc_objects]
        vectorstore = FAISS.from_texts(texts=texts, embedding=embedding_model, metadatas=metadatas)
    except Exception as e2:
        raise RuntimeError("All FAISS creation attempts failed:\n" + str(e2))

# Persist vectorstore
try:
    vectorstore.save_local(str(VECTOR_DIR))
    print(f"✅ Vectorstore saved to: {VECTOR_DIR.resolve()}")
except Exception as e:
    print("⚠️ Could not persist vectorstore:", e)

# Create retriever
try:
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": TOP_K})
    print("✅ Retriever ready. Top-k =", retriever.search_kwargs.get("k"))
except Exception as e:
    raise RuntimeError("Failed to construct retriever from vectorstore: " + str(e)) from e

# Quick sanity search (robust)
sample_q = "What is the phone number or address of Shah AIR Cool?"
try:
    # Use vectorstore.similarity_search for safety
    hits = vectorstore.similarity_search(sample_q, k=3)
    print(f"\nSample retrieval for: {sample_q}")
    for i, h in enumerate(hits, 1):
        src = h.metadata.get("source") if getattr(h, "metadata", None) else "unknown"
        preview = (h.page_content[:200].replace("\n", " ") if getattr(h, "page_content", None) else str(h)[:200])
        print(f"  {i}. source={src} preview={preview}...")
except Exception as e:
    print("⚠️ Sample retrieval failed (non-fatal):", e)

Building embeddings for 175 chunks using model: sentence-transformers/all-MiniLM-L6-v2


  embedding_model = HuggingFaceEmbeddings(model_name=EMB_MODEL)


✅ Vectorstore saved to: C:\Users\mbkhn\Downloads\Inspired\RAG\vectorstore
✅ Retriever ready. Top-k = 5

Sample retrieval for: What is the phone number or address of Shah AIR Cool?
  1. source=data\20251014-174708_Top AC Repa.csv preview=﻿Company Name: AIR Cool Services Company Name link: https://www.justdial.com/Mumbai/AIR-Cool-Services-Behind-Wadia-School-Near-D-N-Nagar-Andheri-West/022PXX22-XX22-170616171658-Z5P2_BZDET?trkid=&term=...
  2. source=data\20251014-174708_Top AC Repa.csv preview=﻿Company Name: Shah Air Conditioner Company Name link: https://www.justdial.com/Kalyan/Shah-Air-Conditioner-Kalyan-East/022PXX22-XX22-250924155602-Y5B4_BZDET?trkid=&term=&ncatid=10890481&area=&search=...
  3. source=data\20251014-174708_Top AC Repa.csv preview=﻿Company Name: Shah AIR Cool Company Name link: https://www.justdial.com/Thane/Shah-AIR-Cool-Palava-City/022PXX22-XX22-231005182331-X6T7_BZDET?trkid=&term=&ncatid=10890481&area=&search=Popular%20AC%20...


In [10]:
# ===== Cell 8 — Reranker + RAG answerer + verification (robust, full cell) =====
# First, install required packages with specific versions
#!pip install -q google-generativeai==0.3.2 langchain-google-genai==0.0.8 langchain-core==0.1.0
#!pip install -q --upgrade protobuf  # Ensure protobuf is up to date

# Import Google's Generative AI with error handling
try:
    import google.generativeai as genai
    # Configure with your API key
    if "GOOGLE_API_KEY" not in os.environ:
        raise ValueError("GOOGLE_API_KEY environment variable not set")
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
except ImportError:
    raise ImportError("Failed to import google.generativeai. Install with: pip install google-generativeai")

# ---------------- Robust imports for PromptTemplate + LLMChain ----------------
try:
    from langchain.prompts import PromptTemplate
    from langchain.chains import LLMChain
    from langchain.schema import Document
    from langchain_google_genai import ChatGoogleGenerativeAI
    LANGCHAIN_AVAILABLE = True
except ImportError:
    print("Warning: Some LangChain imports failed, using fallback implementations")
    LANGCHAIN_AVAILABLE = False

# Fallback implementations if imports fail
if not LANGCHAIN_AVAILABLE:
    class Document:
        def __init__(self, page_content: str, metadata: dict = None):
            self.page_content = page_content
            self.metadata = metadata or {}

    class PromptTemplate:
        def __init__(self, template: str, input_variables: List[str]):
            self.template = template
            self.input_variables = input_variables
        
        def format(self, **kwargs) -> str:
            return self.template.format(**kwargs)
        
        @classmethod
        def from_template(cls, template: str):
            # Extract input variables from template
            input_vars = re.findall(r'\{([^}]+)\}', template)
            return cls(template=template, input_variables=list(set(input_vars)))

    class LLMChain:
        def __init__(self, llm, prompt):
            self.llm = llm
            self.prompt = prompt
        
        def run(self, inputs: dict) -> str:
            prompt_text = self.prompt.format(**inputs)
            return self.llm(prompt_text)

# ---------------- CrossEncoder (reranker) ----------------
try:
    from sentence_transformers import CrossEncoder
except ImportError:
    raise ImportError(
        "Missing package `sentence-transformers`. Install it with:\n"
        "    pip install -U sentence-transformers\n"
        "Then restart the kernel."
    )

# ---------------- LLM Initialization ----------------
class GeminiLLM:
    """Wrapper for Google's Gemini model with a consistent interface"""
    def __init__(self, model_name: str = "gemini-2.5-flash", temperature: float = 0.4, max_output_tokens: int = 512):
        self.model_name = model_name
        self.temperature = temperature
        self.max_output_tokens = max_output_tokens
        self.model = genai.GenerativeModel(model_name)
    
    def __call__(self, prompt: str) -> str:
        try:
            response = self.model.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=self.temperature,
                    max_output_tokens=self.max_output_tokens
                )
            )
            return response.text
        except Exception as e:
            raise RuntimeError(f"Error calling Gemini API: {str(e)}")

# Initialize LLM
try:
    if LANGCHAIN_AVAILABLE:
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash",  # Using the stable model
            temperature=0.4,
            max_output_tokens=512
        )
        print("Using LangChain's ChatGoogleGenerativeAI")
    else:
        llm = GeminiLLM(model_name="gemini-2.5-flash", temperature=0.4, max_output_tokens=512)
        print("Using direct Gemini API client")
except Exception as e:
    raise RuntimeError(f"Failed to initialize LLM: {str(e)}")

# ---------------- Core RAG Components ----------------
def _extract_text_from_output(out):
    """Extract text from various LLM output formats"""
    if out is None:
        return ""
    if isinstance(out, str):
        return out
    # Handle direct Gemini response
    if hasattr(out, 'text'):
        return out.text
    # Handle dict responses
    if isinstance(out, dict):
        if 'answer' in out:
            return out['answer']
        if 'text' in out:
            return out['text']
        if 'content' in out:
            return out['content']
    # Handle LangChain message objects
    if hasattr(out, 'content'):
        return out.content if isinstance(out.content, str) else str(out.content)
    # Fallback to string representation
    return str(out)

def _call_llm_and_extract(llm_obj, prompt_text: str) -> str:
    """Robust LLM caller that handles different LLM interfaces"""
    try:
        # Try direct call first
        if callable(llm_obj):
            return _extract_text_from_output(llm_obj(prompt_text))
        
        # Try common LLM interfaces
        for method in ['invoke', 'run', 'generate', '__call__', 'predict', 'chat']:
            if hasattr(llm_obj, method):
                try:
                    result = getattr(llm_obj, method)(prompt_text)
                    return _extract_text_from_output(result)
                except Exception:
                    continue
        
        raise RuntimeError("No supported call interface found on the LLM object")
    except Exception as e:
        raise RuntimeError(f"Error calling LLM: {str(e)}")

# ---------------- RAG Implementation ----------------
class RAGSystem:
    def __init__(self, llm, rerank_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.llm = llm
        self.reranker = CrossEncoder(rerank_model, device="cpu")
        self.prompt_template = self._create_prompt_template()
        self.chain = self._create_chain()
        
    def _create_prompt_template(self):
        template = """
        You are an expert assistant. Answer the question using ONLY the numbered context passages below.
        Cite facts inline using the passage numbers like [1], [2]. If the answer cannot be found in the passages, 
        respond exactly: "Information not found in the provided documents."

        Question: {question}

        Passages:
        {context}

        Answer (concise; preserve any formatting needed):
        """
        return PromptTemplate.from_template(template)
    
    def _create_chain(self):
        return LLMChain(llm=self.llm, prompt=self.prompt_template)
    
    def fetch_and_rerank(self, query: str, top_k: int = 20):
        """Fetch and rerank documents based on the query"""
        try:
            candidates = retriever.get_relevant_documents(query)
        except Exception:
            try:
                candidates = vectorstore.similarity_search(query, k=top_k)
            except Exception:
                raise RuntimeError(
                    "Neither `retriever` nor `vectorstore` is available/working in the notebook environment."
                )
        
        if not candidates:
            return []
            
        # Prepare pairs for reranking
        pairs = [(query, doc.page_content[:1000] + ("..." if len(doc.page_content) > 1000 else "")) 
                for doc in candidates[:top_k]]
        
        # Get scores from reranker
        try:
            scores = self.reranker.predict(pairs)
        except Exception as e:
            raise RuntimeError(f"Reranker predict failed: {str(e)}")
        
        # Sort by score
        scored_sorted = sorted(
            [{"doc": d, "score": float(s)} for d, s in zip(candidates, scores)],
            key=lambda x: x["score"], 
            reverse=True
        )
        return scored_sorted
    
    def build_context(self, scored_list, max_docs: int = 4):
        """Build context from scored documents"""
        context_blocks, sources = [], []
        for i, item in enumerate(scored_list[:max_docs], 1):
            doc = item["doc"]
            text = doc.page_content if hasattr(doc, 'page_content') else str(doc)
            text = text[:1000] + ("..." if len(text) > 1000 else "")
            context_blocks.append(f"[{i}] {text}")
            
            metadata = getattr(doc, 'metadata', {}) or {}
            sources.append({
                "id": i,
                "source": metadata.get("source", "unknown"),
                "score": item["score"],
                "snippet": text[:300].replace("\n", " ")
            })
        
        return "\n\n".join(context_blocks), sources
    
    def verify_answer(self, answer_text: str, top_k: int = 5):
        """Verify the facts in the answer against retrieved documents"""
        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', answer_text) if s.strip()]
        verification = []
        
        for sent in sentences:
            try:
                try:
                    sup_docs = retriever.get_relevant_documents(sent)[:top_k]
                except Exception:
                    sup_docs = vectorstore.similarity_search(sent, k=top_k)
                
                supported = False
                matched_sources = set()
                sent_words = set(re.findall(r'\w+', sent.lower()))
                
                for doc in sup_docs:
                    doc_text = doc.page_content if hasattr(doc, 'page_content') else str(doc)
                    doc_words = set(re.findall(r'\w+', doc_text.lower()))
                    
                    if len(sent_words & doc_words) >= max(3, len(sent_words) // 6):
                        supported = True
                        metadata = getattr(doc, 'metadata', {}) or {}
                        if 'source' in metadata:
                            matched_sources.add(metadata['source'])
                
                verification.append({
                    "sentence": sent,
                    "supported": supported,
                    "matched_sources": list(matched_sources)
                })
                
            except Exception as e:
                verification.append({
                    "sentence": sent,
                    "supported": False,
                    "error": str(e)
                })
        
        return verification
    
    def answer_query(self, query: str, rerank_top_k: int = 20, final_k: int = 4):
        """Answer a query using the RAG system"""
        try:
            # Step 1: Fetch and rerank documents
            scored = self.fetch_and_rerank(query, top_k=rerank_top_k)
            if not scored:
                return {
                    "answer": "No relevant documents found.",
                    "sources": [],
                    "confidence": 0.0,
                    "verification": []
                }
            
            # Step 2: Build context
            context, sources = self.build_context(scored, max_docs=final_k)
            
            # Calculate confidence based on scores
            scores = [s["score"] for s in scored[:final_k]]
            if len(scores) > 1:
                min_s, max_s = min(scores), max(scores)
                confidence = ((sum(scores) / len(scores)) - min_s) / (max_s - min_s) if max_s > min_s else 1.0
            else:
                confidence = 1.0
            confidence = max(0.0, min(1.0, confidence))
            
            # Step 3: Generate answer
            try:
                if hasattr(self.chain, 'run'):
                    generated = self.chain.run({"question": query, "context": context})
                else:
                    prompt = self.prompt_template.format(question=query, context=context)
                    generated = _call_llm_and_extract(self.llm, prompt)
                
                # Step 4: Verify the answer
                verification = self.verify_answer(generated)
                
                return {
                    "query": query,
                    "answer": generated.strip(),
                    "confidence": round(confidence, 3),
                    "sources": sources,
                    "verification": verification
                }
                
            except Exception as e:
                return {
                    "answer": f"Error generating answer: {str(e)}",
                    "sources": sources,
                    "confidence": 0.0,
                    "verification": []
                }
                
        except Exception as e:
            return {
                "answer": f"Error processing query: {str(e)}",
                "sources": [],
                "confidence": 0.0,
                "verification": []
            }

# Initialize the RAG system
try:
    rag = RAGSystem(llm=llm)
    print("RAG system initialized successfully")
except Exception as e:
    print(f"Error initializing RAG system: {str(e)}")
    raise

# Example usage
if __name__ == "__main__":
    # Example query
    query = "What is the phone number or address of Shah AIR Cool?"
    
    # Get the answer
    result = rag.answer_query(query)
    
    # Print the results
    print("\n--- QUERY ---")
    print(query)
    
    print("\n--- ANSWER ---")
    print(result["answer"])
    
    print("\n--- CONFIDENCE ---")
    print(f"{result['confidence']:.2f}")
    
    if result["sources"]:
        print("\n--- SOURCES ---")
        for i, source in enumerate(result["sources"], 1):
            print(f"{i}. {source['source']} (score: {source['score']:.3f})")
            print(f"   {source['snippet']}...\n")
    
    if result["verification"]:
        print("\n--- VERIFICATION ---")
        for i, verif in enumerate(result["verification"], 1):
            status = "✅" if verif["supported"] else "❌"
            print(f"{i}. {status} {verif['sentence'][:150]}")
            if verif.get("matched_sources"):
                print(f"   Matched sources: {', '.join(verif['matched_sources'])}")
            if 'error' in verif:
                print(f"   Error: {verif['error']}")

Using direct Gemini API client
RAG system initialized successfully

--- QUERY ---
What is the phone number or address of Shah AIR Cool?

--- ANSWER ---
The phone number for Shah AIR Cool is 09035439250 and its address is Palava City Phase 2 Palava City, Thane [1].

--- CONFIDENCE ---
0.33

--- SOURCES ---
1. data\20251014-174708_Top AC Repa.csv (score: 6.563)
   ﻿Company Name: Shah AIR Cool Company Name link: https://www.justdial.com/Thane/Shah-AIR-Cool-Palava-City/022PXX22-XX22-231005182331-X6T7_BZDET?trkid=&term=&ncatid=10890481&area=&search=Popular%20AC%20Repair%20&%20Services%20in%20Mumbai&mncatname=AC%20Repair%20%26%20Services&ftterm=&abd_btn=Send%20En...

2. data\20251014-174708_Top AC Repa.csv (score: 2.943)
   ﻿Company Name: Shah Air Conditioner Company Name link: https://www.justdial.com/Kalyan/Shah-Air-Conditioner-Kalyan-East/022PXX22-XX22-250924155602-Y5B4_BZDET?trkid=&term=&ncatid=10890481&area=&search=Popular%20AC%20Repair%20&%20Services%20in%20Mumbai&mncatname=AC%20Repair

In [11]:
#!pip install --upgrade langchain-google-genai==0.0.8 google-generativeai==0.3.2 langchain-core==0.1.0