In [5]:
import re
from PyPDF2 import PdfReader

# =========================
# 1Ô∏è‚É£ Load PDF and extract text
# =========================

pdf_path = r"C:\Users\hamza\Desktop\Medical-Chatbot\data\about-brain-tumors-a-primer-1.pdf"   # use uploaded PDF

reader = PdfReader(pdf_path)
raw_text = ""

for i, page in enumerate(reader.pages):
    text = page.extract_text()
    if text:
        raw_text += text + "\n"
    else:
        print(f"‚ö†Ô∏è Warning: Page {i} returned no text (image-based?)")

print("Raw text length:", len(raw_text))


# =========================
# 2Ô∏è‚É£ Clean text
# =========================

def clean_text(text):
    # Normalize spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove common page numbers like:
    # "12", "Page 12", "12 AMERICAN BRAIN..." etc.
    text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d+\s+AMERICAN BRAIN TUMOR ASSOCIATION\b', '', text)
    text = re.sub(r'\b\d+\s+www\.abta\.org\b', '', text)

    # Remove stray line-number-only lines
    text = re.sub(r'\n?\b\d{1,3}\b\n?', ' ', text)

    # Remove repeated headers/footers from the uploaded PDF
    patterns = [
        r'AMERICAN BRAIN TUMOR ASSOCIATION.*?www\.abta\.org',
        r'ABOUT BRAIN TUMORS\s+.*?Caregivers'
    ]

    for p in patterns:
        text = re.sub(p, '', text, flags=re.IGNORECASE)

    # Final trim
    return text.strip()


cleaned_text = clean_text(raw_text)
print("Cleaned text length:", len(cleaned_text))

# =========================
# 3Ô∏è‚É£ Save cleaned text
# =========================
with open("cleaned_book.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print("Cleaned text saved to cleaned_book.txt")

# =========================
# Custom text splitter for RAG
# =========================
def split_text_custom(text, chunk_size=4000, overlap=800):
    chunks = []
    start = 0
    length = len(text)

    while start < length:
        end = min(start + chunk_size, length)
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap

    return chunks

# =========================
# Load cleaned text
# =========================
with open("cleaned_book.txt", "r", encoding="utf-8") as f:
    cleaned_text = f.read()

# =========================
# Split into chunks
# =========================
chunks = split_text_custom(cleaned_text)

print(f"Total chunks created: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:500], "...")


Raw text length: 216167
Cleaned text length: 109897
Cleaned text saved to cleaned_book.txt
Total chunks created: 35
Sample chunk:
 a primer for patients and caregivers about braintumors  a primer for patients and caregiversabout braintumors 8550 W. Bryn Mawr Avenue, Suite   Chicago, IL 60631 CareLine:  - -ABTA (ÓÄ≤ÓÄ≤ ÓÄ≤) Email: info@abta.org Website: www.abta.org ABOUT THE  . Information contained in this publication was originally published in two volumes as Brain T umor Primer: A Comprehensive Introduction to Brain T umors, 9th Edition ; and Living with a Brain T umor: A Guide for Newly Diagnosed Patients and Their Families . ...


In [19]:
import langchain
print(langchain.__version__)

1.1.2


In [4]:
# =========================
# Custom text splitter for RAG
# =========================
def split_text_custom(text, chunk_size=4000, overlap=800):
    """
    Splits text into chunks of chunk_size characters with overlap.
    Approx 1000 tokens ~ 4000 characters.
    """
    chunks = []
    start = 0
    text_length = len(text)
    
    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap  # move start by chunk_size - overlap
    
    return chunks

# =========================
# Load cleaned text
# =========================
with open("cleaned_book.txt", "r", encoding="utf-8") as f:
    cleaned_text = f.read()

# =========================
# Split text into chunks
# =========================
chunks = split_text_custom(cleaned_text, chunk_size=4000, overlap=800)

print(f"Total chunks created: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:500], "...")


Total chunks created: 0


IndexError: list index out of range

In [6]:
# ------------------------------
# 0Ô∏è‚É£ Imports (keep your existing ones)
# ------------------------------
import psycopg
from psycopg import Cursor
import ollama
from pathlib import Path

# ------------------------------
# 1Ô∏è‚É£ Variables
# ------------------------------
EMBED_MODEL = "embeddinggemma"  # Your Ollama embedding model
db_connection_str = "dbname=medical_rag user=postgres password=1803 host=localhost port=5432"

# If your chunks are already in memory
# chunks = [...]  # list of strings from the previous splitting step

# ------------------------------
# 2Ô∏è‚É£ Helper functions (reuse yours)
# ------------------------------

def calculate_embeddings(corpus: str) -> list[float]:
    response = ollama.embeddings(EMBED_MODEL, corpus)
    return response["embedding"]

def to_pgvector(vec: list[float]) -> str:
    return "[" + ",".join(str(v) for v in vec) + "]"

def save_embedding(corpus: str, embedding: list[float], cursor: Cursor) -> None:
    pg_vec = to_pgvector(embedding)
    cursor.execute(
        """
        INSERT INTO embeddings (corpus, embedding)
        VALUES (%s, %s::vector)
        """,
        (corpus, pg_vec),
    )

def similar_corpus(input_corpus: str, k: int, cursor: Cursor):
    embedding = calculate_embeddings(input_corpus)
    pg_vec = to_pgvector(embedding)

    cursor.execute(
        """
        SELECT id, corpus, embedding <=> %s::vector AS distance
        FROM embeddings
        ORDER BY distance ASC
        LIMIT %s
        """,
        (pg_vec, k),
    )

    return cursor.fetchall()

# ------------------------------
# 3Ô∏è‚É£ Store chunk embeddings in PostgreSQL
# ------------------------------
with psycopg.connect(db_connection_str) as conn:
    conn.autocommit = True

    with conn.cursor() as cur:
        # Drop old table if exists
        cur.execute("DROP TABLE IF EXISTS embeddings")

        # Create extension pgvector
        cur.execute("CREATE EXTENSION IF NOT EXISTS vector")

        # Create embeddings table
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS embeddings (
                id SERIAL PRIMARY KEY,
                corpus TEXT,
                embedding VECTOR(768)
            );
            """
        )

        # Iterate through your chunks
        for i, chunk in enumerate(chunks):
            emb = calculate_embeddings(chunk)
            save_embedding(chunk, emb, cur)
            if i % 50 == 0:
                print(f"Processed chunk {i+1}/{len(chunks)}")

        conn.commit()

        # Optional: test similarity search
        print("\n--- Test similarity ---")
        test_results = similar_corpus("What causes inflammation?", 3, cur)
        for r in test_results:
            print(r)


Processed chunk 1/35

--- Test similarity ---
(10, 'ing  - -ABTA (2282). THE BRAIN TUMOR GUIDE For Newly Diagnosed Patients and Their Families  INTRODUCTION Causes and risk factors can be environmental , such as being exposed to poisonous substances in the home or at work; eating or not eating certain foods; or whether or not we exercise, smoke cigarettes or drink alcohol. They can also be genetic , such as being born with a gene mutation or susceptibility that one inherits from parents. These genetic mutations/susceptibilities may also accumulate over time, as one grows older. Unfortunately , no risk factor accounting for the majority of brain tumors has been identified, even though many environmental and genetic factors have been and are currently being studied. ENVIRONMENTAl f ACTORS Many studies have looked at a wide spectrum of environmental factors as possible causes of brain tumors including but not limited to: ‚Ä¢ Being exposed to air pollution, residential power lines, second 

In [None]:
import psycopg
import ollama
from groq import Groq
import os
import base64
from datetime import datetime

# ------------------------------
# Variables
# ------------------------------
EMBED_MODEL = "embeddinggemma"
LLM_MODEL = "llama3"
VISION_MODEL = "llama3.2-vision"
GROQ_MODEL = "llama-3.3-70b-versatile"
GROQ_API_KEY = "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" 

db_connection_str = "dbname=rag_chatbot user=postgres password=1803 host=localhost port=5432"
TOP_K = 5

groq_client = Groq(api_key=GROQ_API_KEY)

# Ensure output folder exists
if not os.path.exists("outputs"):
    os.makedirs("outputs")

# ------------------------------
# Helper functions
# ------------------------------

def calculate_embeddings(corpus: str) -> list[float]:
    response = ollama.embeddings(EMBED_MODEL, corpus)
    return response["embedding"]

def to_pgvector(vec: list[float]) -> str:
    return "[" + ",".join(str(v) for v in vec) + "]"

def retrieve_chunks(query: str, k: int = TOP_K):
    embedding = calculate_embeddings(query)
    pg_vec = to_pgvector(embedding)

    with psycopg.connect(db_connection_str) as conn:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT corpus, embedding <=> %s::vector AS distance
                FROM embeddings
                ORDER BY distance ASC
                LIMIT %s
                """,
                (pg_vec, k)
            )
            results = cur.fetchall()

    return [r[0] for r in results]

def encode_image_to_base64(image_path: str) -> str:
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

# ------------------------------
# AGENT 1: Vision Agent
# ------------------------------

def vision_agent(image_path: str) -> dict:
    print("   üëÅÔ∏è  Agent 1 ‚Äî Analyzing visual features (non-diagnostic)...")

    try:
        if not os.path.exists(image_path):
            return {"success": False, "error": f"Image not found: {image_path}"}

        print(f"   üìÇ File: {os.path.basename(image_path)}")
        img = encode_image_to_base64(image_path)

        response = ollama.chat(
            model=VISION_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": """
Describe ONLY what is visually visible.
NO diagnosis.
""",
                    "images": [img]
                }
            ]
        )

        desc = response["message"]["content"].strip()
        return {"success": True, "description": desc}

    except Exception as e:
        return {"success": False, "error": str(e)}

# ------------------------------
# AGENT 2: Textbook Retrieval
# ------------------------------

def textbook_retrieval_agent(vision_description: str, top_k: int = TOP_K) -> dict:
    print("   üîé Agent 2 ‚Äî Retrieving textbook passages...")

    try:
        q = "medical imaging features " + vision_description[:800]
        chunks = retrieve_chunks(q, top_k)

        if not chunks:
            return {"success": False, "error": "No textbook passages retrieved.", "retrieved_chunks": []}

        return {"success": True, "retrieved_chunks": chunks, "num_chunks": len(chunks)}

    except Exception as e:
        return {"success": False, "error": str(e), "retrieved_chunks": []}

# ------------------------------
# AGENT 3: Enrichment (Groq)
# ------------------------------

def enrichment_agent(chunks: list[str], vision_description: str) -> dict:
    print("   üß† Agent 3 ‚Äî Enriching final output...")

    try:
        joined = "\n\n-----\n\n".join(chunks)
        if len(joined) > 4000:
            joined = joined[:4000] + "\n\n[TRUNCATED]"

        system_msg = {
            "role": "system",
            "content": "Provide ONLY educational explanation. NO diagnosis. Must end with disclaimer."
        }

        user_msg = {
            "role": "user",
            "content": f"""
Vision description:
{vision_description}

Textbook Passages:
{joined}

Task: produce an educational explanation, suggest typical clinical workflows (non-prescriptive), and end with a disclaimer.
"""
        }

        response = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[system_msg, user_msg],
            max_tokens=1200,
            temperature=0.2
        )

        final = response.choices[0].message.content.strip()
        return {"success": True, "final_text": final}

    except Exception as e:
        return {"success": False, "error": str(e)}

# ------------------------------
# PIPELINE
# ------------------------------

def analyze_pipeline(image_path: str) -> dict:
    print("\n===================== AI IMAGE ANALYSIS PIPELINE =====================\n")

    # Agent 1
    print("‚îÄ‚îÄ Agent 1: Vision ‚îÄ‚îÄ")
    v = vision_agent(image_path)
    if not v["success"]:
        return {"success": False, "error": v["error"]}
    vision_desc = v["description"]
    print("Vision description obtained.\n")

    # Agent 2
    print("‚îÄ‚îÄ Agent 2: Retrieval ‚îÄ‚îÄ")
    t = textbook_retrieval_agent(vision_desc, TOP_K)
    if not t["success"]:
        return {"success": False, "error": t["error"]}
    chunks = t["retrieved_chunks"]
    print(f"Retrieved {t['num_chunks']} textbook passages.\n")

    # Agent 3
    print("‚îÄ‚îÄ Agent 3: Enrichment ‚îÄ‚îÄ")
    e = enrichment_agent(chunks, vision_desc)
    if not e["success"]:
        return {"success": False, "error": e["error"]}
    print("Enrichment complete.\n")

    return {
        "success": True,
        "vision_analysis": vision_desc,
        "retrieved_chunks": chunks,
        "final_synthesis": e["final_text"],
        "sources": t["num_chunks"]
    }

# ------------------------------
# CLI
# ------------------------------

def main():
    print("\n================== MEDICAL MULTI-AGENT SYSTEM ==================\n")

    while True:
        cmd = input("üí¨ Command: ").strip()

        if cmd.lower() in ("exit", "quit"):
            print("\nüëã Exiting.\n")
            break

        if cmd.startswith("analyze "):
            image_path = cmd.split(" ", 1)[1].strip()

            print(f"\nüîç Starting analysis for: {image_path}\n")
            result = analyze_pipeline(image_path)

            if not result["success"]:
                print("‚ùå ERROR:", result["error"])
                continue

            # BUILD FILE NAME
            timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
            out_path = f"outputs/final_synthesis_{timestamp}.txt"

            # WRITE AGENT 3 OUTPUT TO FILE
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(result["final_synthesis"])

            print("\n================== ANALYSIS COMPLETE ==================\n")
            print("üëÅÔ∏è  AGENT 1 ‚Äî Visual Description (preview):\n", result["vision_analysis"][:400], "...\n")
            print("üìö  AGENT 2 ‚Äî Passages Retrieved:", result["sources"])
            print("üí°  AGENT 3 ‚Äî Full enriched output saved to:")
            print("   üìÑ", out_path)
            print("\n=========================================================\n")

            input("Press ENTER to continue...")

        else:
            print("Unknown command. Use: analyze <path> or exit.\n")

if __name__ == "__main__":
    main()





üîç Starting analysis for: C:\Users\hamza\Desktop\Medical-Chatbot\data\Y10.jpg



‚îÄ‚îÄ Agent 1: Vision ‚îÄ‚îÄ
   üëÅÔ∏è  Agent 1 ‚Äî Analyzing visual features (non-diagnostic)...
   üìÇ File: Y10.jpg
Vision description obtained.

‚îÄ‚îÄ Agent 2: Retrieval ‚îÄ‚îÄ
   üîé Agent 2 ‚Äî Retrieving textbook passages...
Retrieved 5 textbook passages.

‚îÄ‚îÄ Agent 3: Enrichment ‚îÄ‚îÄ
   üß† Agent 3 ‚Äî Enriching final output...
Enrichment complete.



üëÅÔ∏è  AGENT 1 ‚Äî Visual Description (preview):
 The image depicts a cross-sectional view of the human brain, with the left hemisphere facing the viewer. The brain's surface is predominantly gray, with a prominent white border surrounding the entire circumference. A notable feature is a dark, irregularly shaped region, approximately one-third the size of the brain's surface, located on the right side. This region is characterized by a lighter gr ...

üìö  AGENT 2 ‚Äî Passages Retrieved: 5
üí°  AGENT 3 ‚Äî Full enriched output save