updated pipeline

In [None]:
!pip install -q groq
!pip install -q langchain-groq


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/136.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m133.1/136.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m136.0/136.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Run this in a notebook cell first (only once)
!pip install -U langchain-huggingface langchain-community sentence-transformers faiss-cpu




Complete upgraded pipeline (single cell)

In [None]:
# Upgraded RAG pipeline (category-filtered FAISS indices + LCEL pipeline)
import os
from collections import defaultdict

import numpy as np
import pandas as pd

# LangChain / huggingface imports (for LCEL / Groq setup)
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# LCEL style imports (LangChain 0.3+)
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableMap, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# LLM (Groq)
from langchain_groq import ChatGroq


# ---------------------------
# Config / paths
# ---------------------------
# Embedding model, index cache location, and the input sheet with its encoding.
EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
FAISS_BASE_DIR = "faiss_indices_by_category"   # where per-category indices are saved
CSV_PATH = "/content/GPT_Input_DB(Sheet1).csv"  # update if needed
ENCODING = "latin1"

# The cache directory must exist before any index is written into it.
os.makedirs(FAISS_BASE_DIR, exist_ok=True)


# ---------------------------
# 1) Load CSV & preprocess
# ---------------------------
# Read the sheet and replace missing cells with empty strings in one expression.
df = pd.read_csv(CSV_PATH, encoding=ENCODING).fillna('')

# Normalize column names (strip surrounding whitespace from each header).
df.columns = [name.strip() for name in df.columns]

# Fail fast, naming the first required column that is absent.
required = ['S. No.', 'problem', 'category', 'type', 'data', 'code', 'clause']
missing = [col for col in required if col not in df.columns]
if missing:
    raise ValueError(f"Missing required column: {missing[0]}")


# ---------------------------
# 2) Build documents by category
# ---------------------------
# `Document` is imported from langchain-core, which this cell already depends on.
# The legacy `langchain.docstore.document` path is a deprecated re-export that is
# not guaranteed to exist in newer `langchain` releases (the install cell is unpinned).
from langchain_core.documents import Document

# category name -> list of Documents whose CSV row carries that category
docs_by_cat = defaultdict(list)

for _, row in df.iterrows():
    # Text columns are coerced to str and trimmed. Missing cells were replaced by ''
    # when the CSV was loaded, so an absent value becomes an empty string here.
    problem = str(row['problem']).strip()
    category = str(row['category']).strip()
    typ = str(row['type']).strip()
    desc = str(row['data']).strip()
    code = str(row['code']).strip()
    clause = str(row['clause']).strip()
    sno = row['S. No.']

    # The text that gets embedded: every descriptive field, one per line.
    page_content = (
        f"Problem: {problem}\n"
        f"Category: {category}\n"
        f"Intervention Type: {typ}\n"
        f"Description: {desc}\n"
        f"Code: {code}\n"
        f"Clause: {clause}\n"
    )

    # Structured copy of the row; read later when building the prompt context
    # and when filtering retrieved documents by category.
    metadata = {
        "S. No.": sno,
        "problem": problem,
        "category": category,
        "type": typ,
        "code": code,
        "clause": clause,
        "source_data": desc
    }

    doc = Document(page_content=page_content, metadata=metadata)
    docs_by_cat[category].append(doc)


# ---------------------------
# 3) Instantiate embeddings
# ---------------------------
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)


# ---------------------------
# 4) Build/load FAISS indices by category + centroids
# ---------------------------
category_centroids = {}  # category -> mean embedding of that category's documents
vectorstores = {}        # category -> FAISS vector store for that category

for cat, docs in docs_by_cat.items():

    # Embed the category's documents exactly once. The same vectors feed the
    # centroid and, on a cache miss, the FAISS index (previously the corpus was
    # embedded twice: once inside FAISS.from_documents and once for the centroid).
    texts = [d.page_content for d in docs]
    text_embs = embeddings.embed_documents(texts)

    safe_cat = cat.replace("/", "_").replace(" ", "_")
    idx_dir = os.path.join(FAISS_BASE_DIR, f"faiss_{safe_cat}")

    if os.path.exists(idx_dir) and os.listdir(idx_dir):
        # SECURITY: load_local deserializes pickled data, hence the explicit opt-in
        # flag. Only load index directories that this notebook wrote itself.
        # NOTE(review): a cached index is reused even if the CSV changed after it
        # was written; delete FAISS_BASE_DIR to force a rebuild.
        vs = FAISS.load_local(idx_dir, embeddings, allow_dangerous_deserialization=True)
    else:
        vs = FAISS.from_embeddings(
            list(zip(texts, text_embs)),
            embeddings,
            metadatas=[d.metadata for d in docs],
        )
        vs.save_local(idx_dir)

    vectorstores[cat] = vs

    # compute centroid: the arithmetic mean of the document embeddings
    category_centroids[cat] = np.mean(np.array(text_embs), axis=0)

print(f"Built/loaded FAISS indices for {len(vectorstores)} categories.")


# ---------------------------
# 5) Category selection helper
# ---------------------------
def choose_category_for_query(query, top_n=1):
    """Rank categories by cosine similarity between the query and each centroid.

    Args:
        query: Free-text query; embedded with the module-level ``embeddings``.
        top_n: How many of the best-matching category names to return.

    Returns:
        A tuple ``(top_cats, best_sim)``. ``top_cats`` lists up to ``top_n``
        category names, best match first. ``best_sim`` is the cosine similarity
        of the single best category only, not of every returned category.
        When ``category_centroids`` is empty the result is ``([], 0.0)``.
    """
    cats = list(category_centroids.keys())
    if not cats:
        # np.stack raises on an empty sequence; report "nothing matched" instead.
        return [], 0.0

    q = np.array(embeddings.embed_query(query))
    centroids = np.stack([category_centroids[c] for c in cats], axis=0)

    # The small epsilon keeps the division defined for a zero-length vector.
    q_norm = np.linalg.norm(q) + 1e-12
    cent_norms = np.linalg.norm(centroids, axis=1) + 1e-12

    sims = (centroids @ q) / (cent_norms * q_norm)

    # Negating the scores makes argsort yield indices from highest to lowest.
    order = np.argsort(-sims)
    top_cats = [cats[i] for i in order[:top_n]]

    return top_cats, sims[order[0]]


# ---------------------------
# 6) RAG Pipeline w/ LCEL
# ---------------------------
# Instruction template sent to the LLM. `{context}` receives the formatted
# retrieved documents and `{question}` receives the user's query.
prompt_template = """
You are an expert AI assistant for the National Road Safety Hackathon 2025.
Your SOLE purpose is to answer the user's question using ONLY the 'Provided Context'.

If the context is empty or irrelevant, respond exactly with:
"Based on the provided database, I cannot find a specific intervention for this issue."

**Provided Context:**
{context}

**User's Issue:**
{question}

**Recommended Intervention:**
[Your answer]

**Database Reference:**
- Intervention Type: [from context]
- Code: [from context]
- Clause: [from context]
"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# ChatGroq takes its credential from the GROQ_API_KEY environment variable, which
# nothing else in this notebook sets. Ask for it interactively on a fresh kernel
# rather than failing at client construction; the key is never hard-coded here.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

# temperature=0.0 requests the least random output the model offers.
llm = ChatGroq(model_name="llama-3.1-8b-instant", temperature=0.0)


def format_docs_for_prompt(docs):
    """Render retrieved documents as the plain-text context block of the prompt.

    Each document contributes four lines taken from its metadata (intervention
    type, description, code, clause). Documents are separated by one blank line.
    An empty ``docs`` yields an empty string.
    """
    def _render(meta):
        # One document's block; the lines are joined without a trailing newline.
        return "\n".join((
            f"Intervention Type: {meta['type']}",
            f"Description: {meta['source_data']}",
            f"Code: {meta['code']}",
            f"Clause: {meta['clause']}",
        ))

    return "\n\n".join(_render(doc.metadata) for doc in docs)


# Sentinel answer: returned by this pipeline when nothing matches, and also the
# exact sentence the prompt tells the LLM to emit when the context is irrelevant.
EMPTY_MSG = "Based on the provided database, I cannot find a specific intervention for this issue."


def run_rag_query(query: str, k: int = 3, min_cat_sim: float = 0.1):
    """Answer ``query`` from the best-matching category's documents.

    The three categories closest to the query are tried in order. For each one,
    up to ``k`` documents are retrieved and handed to the LLM. If the LLM replies
    with ``EMPTY_MSG`` the next category is tried, so the fallback categories are
    actually used instead of the search stopping at the first category.

    Args:
        query: The user's issue description. Blank input short-circuits.
        k: Number of documents to retrieve per category.
        min_cat_sim: Minimum similarity of the best category required to proceed.

    Returns:
        On success, a dict with ``result``, ``source_documents``, ``category``
        and ``category_similarity``. The similarity is that of the best-ranked
        category, which can differ from ``category`` when a fallback category
        produced the answer. Otherwise ``{"result": EMPTY_MSG,
        "source_documents": []}``.
    """
    # Nothing to embed or retrieve for a blank query.
    if not query or not query.strip():
        return {"result": EMPTY_MSG, "source_documents": []}

    # 1) Category selection
    top_cats, top_sim = choose_category_for_query(query, top_n=3)

    if top_sim < min_cat_sim:
        return {"result": EMPTY_MSG, "source_documents": []}

    for cat in top_cats:

        vs = vectorstores.get(cat)
        if vs is None:
            # No index was built for this category; try the next one.
            continue

        retriever = vs.as_retriever(search_kwargs={"k": k})
        docs = retriever.invoke(query)

        # Keep only documents whose metadata agrees with the index they came from.
        docs = [d for d in docs if d.metadata["category"] == cat]
        if not docs:
            continue

        # Build context
        context = format_docs_for_prompt(docs)
        if not context.strip():
            continue

        # Run LLM
        final_prompt = PROMPT.format(context=context, question=query)
        response = llm.invoke(final_prompt)

        result = response.content.strip()

        if EMPTY_MSG in result:
            # The LLM judged this category's context irrelevant: fall back to
            # the next-best category rather than giving up immediately.
            continue

        return {
            "result": result,
            "source_documents": docs,
            "category": cat,
            "category_similarity": float(top_sim)
        }

    return {"result": EMPTY_MSG, "source_documents": []}


# ---------------------------
# 7) Test queries
# ---------------------------
# Sample issues used to exercise the whole pipeline end to end.
tests = [
    "The zebra crossing near the school has completely faded. What is the corrective action?",
    "The hospital sign near the clinic has faded and is not visible at night.",
    "The centerline marking on the two-way road is barely visible.",
    "There are deep potholes on the highway.",
]

for question in tests:
    answer = run_rag_query(question)

    # Header fields: .get() is used because the "no match" result omits these keys.
    print("\nQUERY:", question)
    print("CATEGORY:", answer.get("category"))
    print("SIM:", answer.get("category_similarity"))
    print("RESULT:", answer["result"])

    # One line of raw metadata per supporting document.
    print("SOURCES:")
    for source in answer["source_documents"]:
        print(source.metadata)

    print("-" * 80)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Built/loaded FAISS indices for 3 categories.

QUERY: The zebra crossing near the school has completely faded. What is the corrective action?
CATEGORY: Road Marking
SIM: 0.5295431543142993
RESULT: **Recommended Intervention:** 
Intervention Type: Objects adjacent / near to carriageway
Description: Subway piers, abutments, and culvert head walls outside the roadway must have at least six black and white stripes sloping 45 degrees towards traffic. Electrical poles near the carriageway should have horizontal black and white stripes up to 1.25 m high, with each stripe at least 100 mm wide. guard rails, guard stones or drums and trees that are not likely to be hit unless a vehicle runs off the carriageway shall be painted solid white. Trees must be marked up to 1.25 m height, with a 300 mm black band in the middle for visibility. All objects within 2.4 m of the shoulder or kerb must be painted. Object markers, at least 1.2 m high, should be placed in front of such objects to improve visibili

In [None]:
# Gradio Chat Interface for Road Safety RAG Pipeline
import os
from collections import defaultdict

import gradio as gr
import numpy as np
import pandas as pd

# LangChain / huggingface imports
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq

# `Document` lives in langchain-core; the legacy `langchain.docstore` path is kept
# only as a fallback because it is not guaranteed to exist in newer releases.
try:
    from langchain_core.documents import Document
except ImportError:
    from langchain.docstore.document import Document


# ---------------------------
# Config / paths
# ---------------------------
# Embedding model, index cache location, and the input sheet with its encoding.
EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
FAISS_BASE_DIR = "faiss_indices_by_category"
CSV_PATH = "/content/GPT_Input_DB(Sheet1).csv"  # update if needed
ENCODING = "latin1"

# The cache directory must exist before any index is written into it.
os.makedirs(FAISS_BASE_DIR, exist_ok=True)


# ---------------------------
# 1) Load CSV & preprocess
# ---------------------------
print("Loading CSV data...")
# Read the sheet and replace missing cells with empty strings in one expression.
df = pd.read_csv(CSV_PATH, encoding=ENCODING).fillna('')

# Normalize column names (strip surrounding whitespace from each header).
df.columns = [header.strip() for header in df.columns]

# Stop at the first required column that the sheet does not provide.
required = ['S. No.', 'problem', 'category', 'type', 'data', 'code', 'clause']
first_missing = next((col for col in required if col not in df.columns), None)
if first_missing is not None:
    raise ValueError(f"Missing required column: {first_missing}")


# ---------------------------
# 2) Build documents by category
# ---------------------------
print("Building documents by category...")
# category name -> list of Documents whose CSV row carries that category
docs_by_cat = defaultdict(list)

for _, row in df.iterrows():
    # Coerce every text column to str and trim it once, keyed by column name.
    text = {
        col: str(row[col]).strip()
        for col in ('problem', 'category', 'type', 'data', 'code', 'clause')
    }

    # The text that gets embedded: every descriptive field, one per line.
    page_content = "".join([
        f"Problem: {text['problem']}\n",
        f"Category: {text['category']}\n",
        f"Intervention Type: {text['type']}\n",
        f"Description: {text['data']}\n",
        f"Code: {text['code']}\n",
        f"Clause: {text['clause']}\n",
    ])

    # Structured copy of the row; the serial number is stored as read, untrimmed.
    metadata = {
        "S. No.": row['S. No.'],
        "problem": text['problem'],
        "category": text['category'],
        "type": text['type'],
        "code": text['code'],
        "clause": text['clause'],
        "source_data": text['data'],
    }

    docs_by_cat[text['category']].append(
        Document(page_content=page_content, metadata=metadata)
    )


# ---------------------------
# 3) Instantiate embeddings
# ---------------------------
print("Loading embeddings model...")
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)


# ---------------------------
# 4) Build/load FAISS indices by category + centroids
# ---------------------------
print("Building/loading FAISS indices...")
category_centroids = {}  # category -> mean embedding of that category's documents
vectorstores = {}        # category -> FAISS vector store for that category

for cat, docs in docs_by_cat.items():
    # Embed the category's documents exactly once. The same vectors feed the
    # centroid and, on a cache miss, the FAISS index (previously the corpus was
    # embedded twice: once inside FAISS.from_documents and once for the centroid).
    texts = [d.page_content for d in docs]
    text_embs = embeddings.embed_documents(texts)

    safe_cat = cat.replace("/", "_").replace(" ", "_")
    idx_dir = os.path.join(FAISS_BASE_DIR, f"faiss_{safe_cat}")

    if os.path.exists(idx_dir) and os.listdir(idx_dir):
        # SECURITY: load_local deserializes pickled data, hence the explicit opt-in
        # flag. Only load index directories that this notebook wrote itself.
        # NOTE(review): a cached index is reused even if the CSV changed after it
        # was written; delete FAISS_BASE_DIR to force a rebuild.
        vs = FAISS.load_local(idx_dir, embeddings, allow_dangerous_deserialization=True)
    else:
        vs = FAISS.from_embeddings(
            list(zip(texts, text_embs)),
            embeddings,
            metadatas=[d.metadata for d in docs],
        )
        vs.save_local(idx_dir)

    vectorstores[cat] = vs

    # compute centroid: the arithmetic mean of the document embeddings
    category_centroids[cat] = np.mean(np.array(text_embs), axis=0)

print(f"✓ Built/loaded FAISS indices for {len(vectorstores)} categories.")


# ---------------------------
# 5) Category selection helper
# ---------------------------
def choose_category_for_query(query, top_n=1):
    """Rank categories by cosine similarity between the query and each centroid.

    Args:
        query: Free-text query; embedded with the module-level ``embeddings``.
        top_n: How many of the best-matching category names to return.

    Returns:
        A tuple ``(top_cats, best_sim)``. ``top_cats`` lists up to ``top_n``
        category names, best match first. ``best_sim`` is the cosine similarity
        of the single best category only, not of every returned category.
        When ``category_centroids`` is empty the result is ``([], 0.0)``.
    """
    cats = list(category_centroids.keys())
    if not cats:
        # np.stack raises on an empty sequence; report "nothing matched" instead.
        return [], 0.0

    q = np.array(embeddings.embed_query(query))
    centroids = np.stack([category_centroids[c] for c in cats], axis=0)

    # The small epsilon keeps the division defined for a zero-length vector.
    q_norm = np.linalg.norm(q) + 1e-12
    cent_norms = np.linalg.norm(centroids, axis=1) + 1e-12

    sims = (centroids @ q) / (cent_norms * q_norm)

    # Negating the scores makes argsort yield indices from highest to lowest.
    order = np.argsort(-sims)
    top_cats = [cats[i] for i in order[:top_n]]

    return top_cats, sims[order[0]]


# ---------------------------
# 6) RAG Pipeline
# ---------------------------
# Instruction template sent to the LLM. `{context}` receives the formatted
# retrieved documents and `{question}` receives the user's query.
prompt_template = """
You are an expert AI assistant for the National Road Safety Hackathon 2025.
Your SOLE purpose is to answer the user's question using ONLY the 'Provided Context'.

If the context is empty or irrelevant, respond exactly with:
"Based on the provided database, I cannot find a specific intervention for this issue."

**Provided Context:**
{context}

**User's Issue:**
{question}

**Recommended Intervention:**
[Your answer]

**Database Reference:**
- Intervention Type: [from context]
- Code: [from context]
- Clause: [from context]
"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

print("Initializing LLM...")
# ChatGroq takes its credential from the GROQ_API_KEY environment variable, which
# nothing else in this notebook sets. Ask for it interactively on a fresh kernel
# rather than failing at client construction; the key is never hard-coded here.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

# temperature=0.0 requests the least random output the model offers.
llm = ChatGroq(model_name="llama-3.1-8b-instant", temperature=0.0)


def format_docs_for_prompt(docs):
    """Turn retrieved documents into the context text inserted into the prompt.

    For every document, four metadata fields (type, source_data, code, clause)
    are written on consecutive lines; blocks are separated by a blank line.
    Returns an empty string when ``docs`` is empty.
    """
    blocks = [
        f"Intervention Type: {meta['type']}\n"
        f"Description: {meta['source_data']}\n"
        f"Code: {meta['code']}\n"
        f"Clause: {meta['clause']}"
        for meta in (doc.metadata for doc in docs)
    ]
    return "\n\n".join(blocks)


# Sentinel answer: returned by this pipeline when nothing matches, and also the
# exact sentence the prompt tells the LLM to emit when the context is irrelevant.
EMPTY_MSG = "Based on the provided database, I cannot find a specific intervention for this issue."


def run_rag_query(query: str, k: int = 3, min_cat_sim: float = 0.1):
    """Answer ``query`` from the best-matching category's documents.

    The three categories closest to the query are tried in order. For each one,
    up to ``k`` documents are retrieved and handed to the LLM. If the LLM replies
    with ``EMPTY_MSG`` the next category is tried, so the fallback categories are
    actually used instead of the search stopping at the first category.

    Args:
        query: The user's issue description. Blank input short-circuits.
        k: Number of documents to retrieve per category.
        min_cat_sim: Minimum similarity of the best category required to proceed.

    Returns:
        A dict with the keys ``result``, ``source_documents``, ``category`` and
        ``similarity``. On success ``similarity`` is that of the best-ranked
        category, which can differ from ``category`` when a fallback category
        produced the answer. When nothing matches, ``result`` is ``EMPTY_MSG``,
        ``source_documents`` is empty, ``category`` is ``None`` and
        ``similarity`` is ``0.0``.
    """
    def _no_match():
        # A fresh dict per call so callers can never share or mutate one instance.
        return {"result": EMPTY_MSG, "source_documents": [], "category": None, "similarity": 0.0}

    # Nothing to embed or retrieve for a blank query.
    if not query or not query.strip():
        return _no_match()

    # 1) Category selection
    top_cats, top_sim = choose_category_for_query(query, top_n=3)

    if top_sim < min_cat_sim:
        return _no_match()

    for cat in top_cats:
        vs = vectorstores.get(cat)
        if vs is None:
            # No index was built for this category; try the next one.
            continue

        retriever = vs.as_retriever(search_kwargs={"k": k})
        docs = retriever.invoke(query)

        # Keep only documents whose metadata agrees with the index they came from.
        docs = [d for d in docs if d.metadata["category"] == cat]
        if not docs:
            continue

        # Build context
        context = format_docs_for_prompt(docs)
        if not context.strip():
            continue

        # Run LLM
        final_prompt = PROMPT.format(context=context, question=query)
        response = llm.invoke(final_prompt)

        result = response.content.strip()

        if EMPTY_MSG in result:
            # The LLM judged this category's context irrelevant: fall back to
            # the next-best category rather than giving up immediately.
            continue

        return {
            "result": result,
            "source_documents": docs,
            "category": cat,
            "similarity": float(top_sim)
        }

    return _no_match()


# ---------------------------
# 7) Gradio Chat Interface
# ---------------------------
def format_response_with_metadata(result_dict):
    """Format a RAG result as chat-ready Markdown with a metadata footer.

    Args:
        result_dict: Dict containing ``result`` (the answer text) and, optionally,
            ``category``, ``source_documents`` and a similarity score stored
            under ``similarity`` or ``category_similarity``. Both key spellings
            are accepted because the two ``run_rag_query`` definitions in this
            notebook use different names for it.

    Returns:
        The answer text. When a category is present, a metadata section follows
        (category, similarity if available, and up to three source references).
    """
    parts = [result_dict['result']]

    category = result_dict.get('category')
    if category:
        # Add metadata section
        parts.append("\n\n---\n**📊 Metadata:**\n")
        parts.append(f"- **Category:** {category}\n")

        # Look the score up under either key; skip the line when neither is set
        # rather than raising KeyError.
        similarity = result_dict.get('similarity')
        if similarity is None:
            similarity = result_dict.get('category_similarity')
        if similarity is not None:
            parts.append(f"- **Similarity Score:** {similarity:.2%}\n")

        # Add source documents (at most three, numbered from 1)
        sources = result_dict.get('source_documents') or []
        if sources:
            parts.append("\n**📚 Source References:**\n")
            for i, doc in enumerate(sources[:3], 1):
                md = doc.metadata
                parts.append(f"\n{i}. **{md.get('type', 'N/A')}**\n")
                parts.append(f"   - Code: `{md.get('code', 'N/A')}`\n")
                parts.append(f"   - Clause: `{md.get('clause', 'N/A')}`\n")

    return "".join(parts)


def chat_interface(message, history):
    """Answer a single chat message with the RAG pipeline.

    ``history`` is accepted for signature compatibility but is not read.
    A message that is empty or only whitespace yields an empty string;
    otherwise the pipeline result is returned, formatted with its metadata.
    """
    if message.strip():
        # Query the pipeline and render the result in one step.
        return format_response_with_metadata(run_rag_query(message))
    return ""


# Example queries for quick start
# Sample issues offered as one-click examples under the input box.
examples = [
    "The zebra crossing near the school has completely faded. What is the corrective action?",
    "The hospital sign near the clinic has faded and is not visible at night.",
    "The centerline marking on the two-way road is barely visible.",
    "There are deep potholes on the highway.",
    "Speed limit signs are missing on the main road.",
]


# ---------------------------
# 8) Create Gradio Interface
# ---------------------------
print("Launching Gradio interface...")

with gr.Blocks(theme=gr.themes.Soft(), title="Road Safety Assistant") as demo:
    # Page header.
    gr.Markdown("""
    # 🚦 National Road Safety Hackathon 2025 - AI Assistant

    **Welcome!** Describe any road safety issue, and I'll recommend appropriate interventions from the official database.
    """)

    # Conversation pane; history is exchanged as a list of role/content dicts.
    chatbot = gr.Chatbot(
        type="messages",
        label="Road Safety Assistant",
        show_label=True,
        height=500,
    )

    # Input row: a wide text box next to a narrow send button.
    with gr.Row():
        msg = gr.Textbox(
            placeholder="Describe a road safety issue (e.g., 'Faded zebra crossing near school')...",
            show_label=False,
            scale=9,
            container=False
        )
        submit_btn = gr.Button("Send 🚀", scale=1, variant="primary")

    # Button that resets both the text box and the conversation.
    with gr.Row():
        clear = gr.ClearButton([msg, chatbot], value="Clear Chat 🗑️")

    gr.Examples(
        examples=examples,
        inputs=msg,
        label="📋 Try these examples:"
    )

    # Usage notes shown below the examples.
    gr.Markdown("""
    ---
    ### ℹ️ How to use:
    1. Type or select a road safety issue from the examples
    2. Get instant recommendations with official code references
    3. Review the metadata for detailed source information

    **Powered by:** FAISS + LangChain + Groq LLM
    """)

    # Event handlers
    def respond(message, history):
        """Append the user's message and the assistant's reply to ``history``.

        A blank message leaves the history unchanged. The same list object
        that was passed in is returned, extended in place.
        """
        if message.strip():
            reply = format_response_with_metadata(run_rag_query(message))
            history.extend([
                {"role": "user", "content": message},
                {"role": "assistant", "content": reply},
            ])
        return history

    # Pressing Enter and clicking Send behave identically:
    # update the chat, then clear the text box.
    for trigger in (msg.submit, submit_btn.click):
        trigger(respond, [msg, chatbot], [chatbot]).then(
            lambda: "", None, msg
        )


# Launch the interface
if __name__ == "__main__":
    # Gather the server options first so each one carries its own explanation.
    launch_options = {
        "share": True,             # Creates a public link (remove if not needed)
        "debug": True,
        "server_name": "0.0.0.0",  # Makes it accessible on your network
        "server_port": 7860,
    }
    demo.launch(**launch_options)

Loading CSV data...
Building documents by category...
Loading embeddings model...
Building/loading FAISS indices...
✓ Built/loaded FAISS indices for 3 categories.
Initializing LLM...
Launching Gradio interface...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://fe4cd547ebebbe4f59.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
