## DistilBERT and MiniLM

In [None]:
pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
!pip install faiss-cpu



In [None]:
!pip install easyocr




In [None]:
# swimlane_chatbot.py  (generic-image version)

import gradio as gr
import pytesseract
import cv2
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import os, re

# --------------------------------------------------
# Models
# --------------------------------------------------
# Extractive question-answering pipeline; answer_question() feeds it the
# retrieved step text as context.
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-uncased-distilled-squad",
    # Lets the pipeline return a blank answer; answer_question() treats a blank
    # answer as "not confident".
    handle_impossible_answer=True,
)
# Sentence encoder used to embed both the OCR'd steps and the user's question.
embedder = SentenceTransformer("all-MiniLM-L6-v2")


# --------------------------------------------------
# OCR helper
# --------------------------------------------------
def _clean(txt: str) -> str:
    txt = re.sub(r"[\n\r]+", " ", txt)
    txt = re.sub(r"\s{2,}", " ", txt)
    return txt.strip(" .-–")


def parse_diagram(image_np, min_conf=40, min_chars=4, row_height=55):
    """
    Returns a list of step strings extracted from the diagram,
    ordered roughly top-to-bottom then left-to-right.

    Parameters
    ----------
    image_np : numpy.ndarray or None
        Image array. 2-D (already single-channel) input is used as-is,
        4-channel input has its alpha channel dropped, and 3-channel input
        goes through the same BGR->gray conversion as before.
        ``None`` or an empty array yields ``[]`` instead of an OpenCV error.
    min_conf : float, optional
        Words whose OCR confidence is below this value are discarded.
    min_chars : int, optional
        Words shorter than this (after cleaning) are discarded.
    row_height : int, optional
        Height in pixels (of the up-scaled image) of the horizontal bands
        used to group words into rows.

    Returns
    -------
    list of str
        Unique cleaned words, first occurrence kept, in reading order.
    """
    if image_np is None or getattr(image_np, "size", 0) == 0:
        return []

    # 1) a little pre-processing so Tesseract can read light or dark backgrounds
    if image_np.ndim == 2:
        gray = image_np
    elif image_np.shape[2] == 1:
        gray = image_np[:, :, 0]
    elif image_np.shape[2] == 4:
        gray = cv2.cvtColor(image_np, cv2.COLOR_BGRA2GRAY)
    else:
        gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    gray = cv2.resize(gray, None, fx=1.6, fy=1.6, interpolation=cv2.INTER_CUBIC)
    gray = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 31, 15
    )

    data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT, config="--psm 6")

    # collect high-confidence words
    boxes = []
    for conf, raw, left, top in zip(data["conf"], data["text"], data["left"], data["top"]):
        # Confidence may arrive as an int, a float or a numeric string such as
        # "96.28" depending on the Tesseract/pytesseract version; int() would
        # raise on the fractional string form, so parse it as a float.
        try:
            confidence = float(conf)
        except (TypeError, ValueError):
            continue
        if confidence < min_conf:
            continue
        txt = _clean(raw)
        if len(txt) < min_chars:
            continue
        boxes.append((top, left, txt))

    if not boxes:
        return []

    # cluster by row (≈ swim lane) then sort left-to-right
    rows = {}
    for y, x, t in boxes:
        rows.setdefault(y // row_height, []).append((x, t))

    ordered = []
    for r in sorted(rows):
        ordered.extend(t for _, t in sorted(rows[r]))

    # de-duplicate while preserving order (dicts keep insertion order)
    return list(dict.fromkeys(ordered))


# --------------------------------------------------
# QA per-image
# --------------------------------------------------
def answer_question(image, question: str):
    """Answer `question` using only the text OCR'd from `image`.

    The diagram's steps are embedded, the (up to) three steps closest to the
    question are joined into a context string, and the extractive QA pipeline
    picks the answer span from that context. A fallback message is returned
    when no steps were read or when the QA result is blank or scores under 0.05.
    """
    steps = parse_diagram(image)
    if not steps:
        return "I couldn’t read any steps; try a higher-resolution image."

    # build FAISS on-the-fly
    step_vectors = embedder.encode(steps, convert_to_tensor=True).cpu().numpy()
    dim = step_vectors.shape[1]
    step_index = faiss.IndexFlatL2(dim)
    step_index.add(step_vectors)

    question_vector = embedder.encode([question], convert_to_tensor=True).cpu().numpy()
    top_k = min(3, len(steps))
    _distances, neighbours = step_index.search(question_vector, top_k)
    nearest_steps = [steps[pos] for pos in neighbours[0]]
    context = ". ".join(nearest_steps)

    result = qa_pipeline(question=question, context=context)
    low_score = result.get("score", 0) < 0.05
    if low_score or not result["answer"].strip():
        return "I’m not confident. Try rephrasing the question."
    return result["answer"].strip()


# --------------------------------------------------
# Gradio / CLI wrappers
# --------------------------------------------------
def chatbot_interface(image, question):
    """Gradio callback: validate the inputs, then answer the question.

    Returns a prompt message when no image was uploaded or the question is
    missing/blank, so that an empty submission does not reach the OCR step or
    the QA pipeline. Otherwise returns the result of `answer_question`.
    """
    if image is None or not question or not question.strip():
        return "Please provide both a diagram image and a question."
    return answer_question(image, question)


def cli_mode():
    """Interactive terminal loop: load one diagram, then answer questions.

    Prompts for an image path, then reads questions until the user types
    'exit' (case-insensitive, surrounding whitespace ignored) or input ends.
    """
    print("CLI mode – type 'exit' to quit")
    path = input("Diagram path: ").strip()
    if not os.path.isfile(path):
        print("File not found.")
        return
    img = cv2.imread(path)
    # cv2.imread signals an unreadable/unsupported file by returning None
    # rather than raising, so check before entering the question loop.
    if img is None:
        print("Could not read the file as an image.")
        return
    while True:
        try:
            q = input("> ").strip()
        except EOFError:
            # stdin closed (e.g. piped input exhausted): leave the loop quietly
            break
        if q.lower() == "exit":
            break
        if not q:
            continue
        print(answer_question(img, q))


if __name__ == "__main__":
    import sys

    # The web UI is the default; the terminal loop only runs when the first
    # command-line argument is exactly "cli".
    if sys.argv[1:2] != ["cli"]:
        gr.Interface(
            title="Swimlane Diagram Chatbot",
            description="Upload any swim-lane diagram, then ask questions about its steps.",
            fn=chatbot_interface,
            inputs=[
                gr.Image(type="numpy", label="Upload Swim-lane Diagram"),
                gr.Textbox(label="Question", placeholder="Ask about the workflow…"),
            ],
            outputs=gr.Textbox(label="Answer"),
        ).launch()
    else:
        cli_mode()


Device set to use cpu


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9eff65d1778bea9247.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
pip install --upgrade gradio

Collecting gradio
  Downloading gradio-5.34.2-py3-none-any.whl.metadata (16 kB)
Collecting gradio-client==1.10.3 (from gradio)
  Downloading gradio_client-1.10.3-py3-none-any.whl.metadata (7.1 kB)
Downloading gradio-5.34.2-py3-none-any.whl (54.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.3/54.3 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gradio_client-1.10.3-py3-none-any.whl (323 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.6/323.6 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gradio-client, gradio
  Attempting uninstall: gradio-client
    Found existing installation: gradio_client 1.10.1
    Uninstalling gradio_client-1.10.1:
      Successfully uninstalled gradio_client-1.10.1
  Attempting uninstall: gradio
    Found existing installation: gradio 5.31.0
    Uninstalling gradio-5.31.0:
      Successfully uninstalled gradio-5.31.0
Successfully installed gradio-5.34.2 gradio-clien

## RoBERTa Base SQuAD2

In [37]:

import logging
import os
import re
from collections import defaultdict
from typing import List, Dict, Tuple

import cv2
import easyocr
import faiss
import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline


# --- OCR, QA and embedding models (loaded once at import time) ---
auto_gpu = False  # set True if you have a CUDA GPU and EasyOCR was compiled w/ GPU
# English-only EasyOCR reader; parse_diagram() calls readtext() on it.
ocr_reader = easyocr.Reader(["en"], gpu=auto_gpu)
# Extractive QA pipeline used as the fallback when rule-based reasoning
# returns nothing.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# Sentence encoder for the background passages, the diagram text and questions.
embedder = SentenceTransformer("all-mpnet-base-v2")


# Hand-written background passages describing an order workflow; they are
# retrieved alongside the text read from the uploaded diagram.
doc_chunks = [
    "To place an order, the customer submits a request via the UI.",
    "The Sales team confirms receipt and checks if the product is in stock.",
    "If not in stock, the order is canceled.",
    "If in stock, the system checks credit card validity before processing payment.",
    "Finance handles payment and initiates order delivery.",
]

# Pre-computed embeddings of doc_chunks (reused by answer_question()).
doc_emb = embedder.encode(doc_chunks, convert_to_numpy=True)
# NOTE(review): answer_question() builds its own temporary index per call and
# does not appear to read faiss_index in the visible code - confirm whether
# this index is still needed.
faiss_index = faiss.IndexFlatL2(doc_emb.shape[1])
faiss_index.add(doc_emb)


# Maps an alternative wording of a step (already lower-cased, punctuation-free)
# to the canonical step name. Written as canonical -> aliases groups and
# flattened, so each canonical name appears once.
STEP_ALIASES = {
    alias: canonical
    for canonical, aliases in (
        # ordering/step aliases
        (
            "place order",
            (
                "submit order",
                "order submitted",
                "order is submitted",
                "buy now",
                "order placed",
                "checkout",
            ),
        ),
        ("check inventory", ("verify inventory", "stock check", "inventory check")),
        ("processing the payment", ("charge card", "make payment", "payment")),
        ("deliver the order", ("delivery", "shipment", "dispatch")),
        ("cancel the order", ("refund",)),
    )
    for alias in aliases
}

# Question-intent patterns (case-insensitive).
# Keywords are anchored on word boundaries so that, for example, "end" no
# longer fires inside "send" / "vendor" / "pending" and "start" no longer fires
# inside "restart"; the common inflections are listed explicitly instead.
FIRST_REGEX = re.compile(r"\b(?:first steps?|start(?:s|ed|ing)?|begin(?:s|ning)?)\b", re.I)
LAST_REGEX = re.compile(r"\b(?:last steps?|final(?:ly)?|finish(?:es|ed|ing)?|end(?:s|ed|ing)?)\b", re.I)
# Group 1 captures everything following "after"/"before" (minus a leading "the").
AFTER_REGEX = re.compile(r"\bafter\s+(?:the\s+)?(.*)", re.I)
BEFORE_REGEX = re.compile(r"\bbefore\s+(?:the\s+)?(.*)", re.I)
WHO_REGEX = re.compile(r"\b(?:who|which team|which lane|responsible)\b", re.I)


def _norm(txt: str) -> str:
    txt = txt.lower().strip()
    txt = re.sub(r"[^a-z0-9 ]", "", txt)
    return STEP_ALIASES.get(txt, txt)


def _cluster_rows(coords: List[int], eps: int = 40) -> Dict[int, List[int]]:
    """Group y‑coordinates using a simple 1‑D DBSCAN‑like clustering."""
    if not coords:
        return {}
    coords = sorted(coords)
    clusters, current = defaultdict(list), 0
    clusters[current].append(coords[0])
    for y in coords[1:]:
        if abs(y - clusters[current][-1]) <= eps:
            clusters[current].append(y)
        else:
            current += 1
            clusters[current].append(y)
    return clusters


def parse_diagram(image: np.ndarray) -> Tuple[List[str], str]:
    """OCR the diagram and return its steps in reading order.

    Each detection of at least two characters is placed at the first corner
    of its bounding box. Detections are grouped into rows by y-coordinate,
    rows are read top to bottom and each row left to right, every label is
    normalised with `_norm`, and repeated or sub-two-character labels are
    dropped.

    Returns a pair: the list of normalised step names and a
    "Step N: Title" listing of those steps, one per line.
    """
    # (top, left, label) for every usable OCR detection
    nodes: List[Tuple[int, int, str]] = []
    for bbox, raw_text, _conf in ocr_reader.readtext(image):
        label = raw_text.strip()
        if len(label) >= 2:
            left, top = int(bbox[0][0]), int(bbox[0][1])
            nodes.append((top, left, label))

    # y-coordinate -> row id
    clusters = _cluster_rows([top for top, _, _ in nodes])
    row_of = {top: row for row, members in clusters.items() for top in members}

    by_row: Dict[int, List[Tuple[int, str]]] = defaultdict(list)
    for top, left, label in nodes:
        by_row[row_of[top]].append((left, label))

    normalised = [
        _norm(label)
        for row in sorted(by_row.keys())
        for _, label in sorted(by_row[row])
    ]

    # First occurrence wins; labels that normalise to < 2 characters are noise.
    dedup_steps = list(dict.fromkeys(s for s in normalised if len(s) >= 2))

    lines = [f"Step {i+1}: {step.title()}" for i, step in enumerate(dedup_steps)]
    return dedup_steps, "\n".join(lines)


def _flow_reasoning(steps: List[str], question: str) -> str:
    q_norm = question.lower()
    # First / Last
    if FIRST_REGEX.search(q_norm):
        return steps[0]
    if LAST_REGEX.search(q_norm):
        return steps[-1]

    # After X / Before X
    m_after = AFTER_REGEX.search(q_norm)
    if m_after:
        ref = _norm(m_after.group(1))
        if ref in steps and steps.index(ref) < len(steps) - 1:
            return steps[steps.index(ref) + 1]
    m_before = BEFORE_REGEX.search(q_norm)
    if m_before:
        ref = _norm(m_before.group(1))
        if ref in steps and steps.index(ref) > 0:
            return steps[steps.index(ref) - 1]
    return ""


def answer_question(image: np.ndarray, question: str) -> str:
    """Answer a question about the uploaded diagram.

    Rule-based flow reasoning is tried first. If it yields nothing, the
    background passages plus the diagram's step listing are searched for the
    passages closest to the question and the QA model extracts an answer
    from them. A fixed apology string is returned when no answer is found
    or the QA model fails (the failure is logged rather than hidden).
    """
    steps, flow_text = parse_diagram(image)

    # Rule-based reasoning needs at least one step to work with.
    rule_ans = _flow_reasoning(steps, question) if steps else ""
    if rule_ans:
        return rule_ans.capitalize()

    # Build temp FAISS index; only add the flow text when OCR produced some,
    # otherwise an empty string would be indexed as a passage.
    passages = list(doc_chunks)
    passage_emb = doc_emb
    if flow_text:
        flow_emb = embedder.encode([flow_text], convert_to_numpy=True)
        passages.append(flow_text)
        passage_emb = np.vstack([doc_emb, flow_emb])

    temp_index = faiss.IndexFlatL2(passage_emb.shape[1])
    temp_index.add(passage_emb)

    # retrieval - never request more neighbours than there are passages
    q_emb = embedder.encode([question], convert_to_numpy=True)
    top_k = min(4, len(passages))
    _, hits = temp_index.search(q_emb, top_k)
    context = "\n".join(passages[i] for i in hits[0] if i >= 0)

    # LLM QA
    try:
        res = qa_model(question=question, context=context)
    except Exception:
        # Keep the best-effort behaviour (the user still gets a reply) but
        # record why the model call failed instead of swallowing it.
        logging.getLogger(__name__).exception("QA model failed for question %r", question)
    else:
        if res.get("answer"):
            return res["answer"].capitalize()
    return "Sorry, I couldn't find that in the current workflow."


def chatbot_interface(image, question):
    """Gradio callback: answer only when both an image and a question exist.

    A missing image or a blank (whitespace-only) question yields a prompt
    asking for both inputs; otherwise the call is forwarded to
    `answer_question`.
    """
    has_image = image is not None
    if has_image and question.strip() != "":
        return answer_question(image, question)
    return "Please provide both a diagram image and a question."


def cli_mode():
    """Interactive terminal loop: load one diagram, then answer questions.

    Prompts for an image path, then reads questions until the user types
    'exit' (case-insensitive, surrounding whitespace ignored) or input ends.
    """
    print("CLI Chatbot Mode  |  type 'exit' to quit")
    img_path = input("Image path: ").strip()
    # isfile (not exists) so that a directory path is rejected here.
    if not os.path.isfile(img_path):
        print("Image file not found.")
        return
    img = cv2.imread(img_path)
    # cv2.imread returns None for files it cannot decode; without this check
    # every question would be answered with the "please provide both" prompt.
    if img is None:
        print("Could not read the file as an image.")
        return

    while True:
        try:
            q = input("Q: ")
        except EOFError:
            # stdin closed (e.g. piped input exhausted): leave the loop quietly
            break
        if q.strip().lower() == "exit":
            break
        print("A:", chatbot_interface(img, q))


if __name__ == "__main__":
    import sys

    # The web UI is the default; the terminal loop only runs when the first
    # command-line argument is exactly "cli".
    if sys.argv[1:2] != ["cli"]:
        gr.Interface(
            title="Swim‑lane Diagram Chatbot (Robust)",
            description="Upload a process swim‑lane diagram then ask questions like 'What happens after the order is submitted?' or 'Who handles payment?'",
            fn=chatbot_interface,
            inputs=[
                gr.Image(type="numpy", label="Swim‑lane diagram"),
                gr.Textbox(placeholder="Ask a question about the workflow…", label="Question"),
            ],
            outputs=gr.Textbox(label="Answer"),
        ).launch(debug=True)
    else:
        cli_mode()


Device set to use cpu


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://6bcbc71262246f275b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7876 <> https://6bcbc71262246f275b.gradio.live
