# ANEETA Evaluation: Local RAG and Model Comparisons
This notebook provides a reproducible baseline evaluation of the ANEETA Doubt Solver using local models via Ollama and the existing ANEETA vector stores. It also verifies repository setup and supports simple model comparisons for group reporting.

In [59]:
# 1) Verify Local Repos and Paths
import os, sys, subprocess, shutil
from pathlib import Path
ROOT = Path.cwd().resolve()
aneeta_dir = ROOT.parent / "ANEETA"  # sibling repo at same level
combo_dir = ROOT  # this notebook lives in 469ProjectCombined
print("Current working dir:", ROOT)
print("Expecting ANEETA at:", aneeta_dir)
print("Expecting 469ProjectCombined at:", combo_dir)

# Default ANEETA repository URL (override with env ANEETA_GIT_URL if needed)
ANEETA_GIT_URL = os.getenv("ANEETA_GIT_URL", "https://github.com/BenjaminLohDW/ANEETA.git")

def ensure_repo(path: Path, git_url: str | None = None):
    if path.exists():
        print(f"Found: {path}")
        return
    if git_url is None:
        raise SystemExit(f"Missing {path}. Please clone it or provide git_url.")
    print(f"Cloning {git_url} into {path} ...")
    subprocess.run(["git", "clone", git_url, str(path)], check=True)

ensure_repo(combo_dir)
ensure_repo(aneeta_dir, ANEETA_GIT_URL)
print("Repos verified.")

Current working dir: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\notebooks
Expecting ANEETA at: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\ANEETA
Expecting 469ProjectCombined at: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\notebooks
Found: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\notebooks
Found: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\ANEETA
Repos verified.


In [60]:
# 2) Create and Validate Python Environment
import platform, sys, subprocess, os
from pathlib import Path
print("Python:", sys.version)
assert sys.version_info >= (3,10), "Python >= 3.10 required"
print("Executable:", sys.executable)
print("Platform:", platform.platform())

# Optionally create venv instructions (not auto-creating to avoid permission issues)
venv_dir = ROOT / ".venv"
print("Suggested venv path:", venv_dir)
print("If you need one, run in PowerShell:")
print("python -m venv .venv; .\\.venv\\Scripts\\Activate.ps1; python -m pip install -U pip")

# CUDA check (optional)
try:
    import torch
    print("Torch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
except Exception as e:
    print("Torch not installed or CUDA not available:", e)

Python: 3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Executable: c:\Users\yanji\anaconda3\envs\is217_env\python.exe
Platform: Windows-10-10.0.26100-SP0
Suggested venv path: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\notebooks\.venv
If you need one, run in PowerShell:
python -m venv .venv; .\.venv\Scripts\Activate.ps1; python -m pip install -U pip
Torch version: 2.8.0+cpu
CUDA available: False


In [61]:
# 3) Install and Import Dependencies (lean and fast on Windows)
import sys, subprocess, os

# Speed: upgrade pip tooling first
subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], check=True)

# Minimal core deps; avoid heavy extras here
# Use binary wheels where possible to speed up installs
packages = [
    "typing_extensions>=4.12.2",
    "pydantic<2.0.0",        # compatible with chromadb 0.4.x
    "numpy==1.26.4",         # stable wheels
    "pandas==2.2.2",         # pin to a wheel-friendly version
    "requests",
    "tqdm>=4.65.0",
    "jsonlines",
    "chromadb==0.4.24",      # no ONNX default embedder
    # Minimal ANEETA runtime deps (pure-Python)
    "langchain", "langchain-community", "langchain-ollama", "langgraph"
]

print("Installing core packages (binary wheels preferred)...")
subprocess.run([sys.executable, "-m", "pip", "install", "--only-binary", ":all:", *packages], check=False)

# Imports (defer MLflow to services cell and make it optional)
import chromadb, pandas as pd, numpy as np, requests  # noqa: F401
print("Core imports OK (lean setup)")

Installing core packages (binary wheels preferred)...
Core imports OK (lean setup)
Core imports OK (lean setup)


In [62]:
# 4) Configure Providers (Ollama first; optional OpenAI)
import os, json, requests
from dataclasses import dataclass
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

@dataclass
class Provider:
    kind: str  # 'ollama' or 'openai'
    model: str

def get_available_models():
    try:
        r = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        if r.ok:
            return {m.get('name') for m in r.json().get('models', []) if 'name' in m}
    except Exception:
        pass
    return set()

def pick_provider(prefer_ollama: bool=True) -> Provider:
    if prefer_ollama:
        installed = get_available_models()
        # Prefer a tools-capable local model by default
        order = [os.getenv('LLM_MODEL'), 'llama3.1:8b', 'phi3.5', 'gemma2:2b', 'deepseek-r1:7b']
        for m in order:
            if m and ((not installed) or (m in installed)):
                return Provider('ollama', m)
    if OPENAI_API_KEY:
        return Provider('openai', os.getenv('OPENAI_MODEL', 'gpt-4o-mini'))
    # Fallback
    return Provider('ollama', 'llama3.1:8b')

provider = pick_provider(True)
print("Using provider:", provider)

Using provider: Provider(kind='ollama', model='llama3.1:8b')


In [63]:
# 5) Start Services: Ollama, optional MLflow, ChromaDB
import time, requests, os, subprocess, sys, pathlib
from pathlib import Path

# Silence Chroma telemetry noise (set before importing chromadb)
os.environ.setdefault("ANONYMIZED_TELEMETRY", "false")
os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")


def check_ollama(url=OLLAMA_URL):
    try:
        r = requests.get(url)
        return r.status_code in (200, 404)  # 404 is OK for root
    except Exception:
        return False


if not check_ollama():
    print("Ollama not reachable at", OLLAMA_URL)
    print("- Install/start Ollama, then run: ollama serve")
else:
    print("Ollama reachable at", OLLAMA_URL)
    print("Pull these tags for the evaluations (pick any/all):")
    print(" - ollama pull llama3.1:8b   # tools-capable local chat model (baseline)")
    print(" - ollama pull gemma2:9b     # higher quality candidate (more RAM)")
    print(" - ollama pull mistral-nemo:12b  # higher quality candidate (heavier)")
    print(" - ollama pull nomic-embed-text  # embeddings needed for vector DB")

# Optional MLflow setup (skip if not installed)
try:
    import mlflow  # noqa: F401
    mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000"))
    print("MLflow tracking URI:", mlflow.get_tracking_uri())
    MLFLOW_OK = True
except Exception as e:
    print("MLflow not installed/available (optional):", e)
    MLFLOW_OK = False

# Initialize a persistent Chroma client, reusing any existing kernel client to avoid settings conflicts
import chromadb
persist_dir = str((ROOT / ".chroma").resolve())

reuse = False
if "client" in globals() and getattr(globals().get("client"), "get_or_create_collection", None):
    # Reuse existing client to avoid "already exists with different settings" errors
    print("Reusing existing ChromaDB client. Persisting at:", persist_dir)
    reuse = True

if not reuse:
    try:
        from chromadb.config import Settings
        settings = Settings(persist_directory=persist_dir, anonymized_telemetry=False)
        client = chromadb.Client(settings)
        print("ChromaDB client created via Settings. Persisting at:", persist_dir)
    except Exception as e:
        print("Settings-based client failed, falling back to PersistentClient:", e)
        client = chromadb.PersistentClient(path=persist_dir)
        print("ChromaDB PersistentClient created. Persisting at:", persist_dir)

try:
    _ = client.get_or_create_collection("smoke_test")
    print("ChromaDB OK.")
except ValueError as e:
    if "already exists" in str(e):
        print("Chroma singleton already initialized with different settings; reusing existing client.")
    else:
        print("ChromaDB init failed:", e)
        raise

Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Ollama reachable at http://localhost:11434
Pull these tags for the evaluations (pick any/all):
 - ollama pull llama3.1:8b   # tools-capable local chat model (baseline)
 - ollama pull gemma2:9b     # higher quality candidate (more RAM)
 - ollama pull mistral-nemo:12b  # higher quality candidate (heavier)
 - ollama pull nomic-embed-text  # embeddings needed for vector DB
MLflow tracking URI: http://127.0.0.1:5000
Reusing existing ChromaDB client. Persisting at: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\notebooks\.chroma
ChromaDB OK.


### Troubleshooting Ollama model pulls (Windows)
If `ollama pull` fails with `pull model manifest: file does not exist`:

1) Confirm the service is running
- In a new PowerShell: `ollama serve` (if you see `bind: Only one usage...`, it means Ollama is already running; skip this.)
- Check: `Invoke-WebRequest http://localhost:11434/ -UseBasicParsing`

2) Use valid tags for your version
- Try: `ollama pull llama3.1:8b`  
- Or: `ollama pull gemma2:2b`
- Or: `ollama pull mistral-nemo:12b`

3) Update Ollama
- Download the latest from https://ollama.com/download and reinstall.

4) Clear local cache (optional)
- Stop service; delete `%LOCALAPPDATA%\Ollama\models` cautiously; then `ollama serve` and pull again.

Note: For ANEETA Doubt Solver, you DO need the embedding model `nomic-embed-text` to open the existing Chroma vector stores. Pull it with: `ollama pull nomic-embed-text`. Also prefer a tools-capable chat model like `llama3.1:8b`. 

In [64]:
# 6) Load ANEETA Dataset → normalize to DataFrame (repo data only)
import pandas as pd, json, os, re
from pathlib import Path

# Local convenience path (optional)
local_processed = (ROOT / "datasets" / "aneeta" / "processed").resolve()
# Authoritative repo path
repo_processed = (aneeta_dir / "Processed Data").resolve()

rows = []
source_used = None

# 1) Prefer local test.jsonl if present (explicit test set)
test_path = local_processed / "test.jsonl"
if test_path.exists():
    import jsonlines
    with jsonlines.open(test_path, 'r') as reader:
        rows = list(reader)
    source_used = str(test_path)

# Helper: specialized parser for NEET solved papers (extract Q and numeric answer)
def parse_solved_mcqs(json_path: Path):
    try:
        data = json.loads(json_path.read_text(encoding='utf-8'))
    except Exception:
        return []
    # Concatenate pages and search for patterns like: "1. <question/body> ... Answer (3)"
    text = "\n\n".join([str(obj.get("page_content", "")) for obj in data])
    text = text.replace("\r", "")
    # Regex: capture question number, question block (non-greedy), and the numeric answer in parentheses
    pat = re.compile(r"(?:^|\n)\s*(\d{1,3})\.\s*(.*?)(?:\n\s*Answer\s*\((\d+)\))", re.S)
    out = []
    used_ids = set()
    for i, m in enumerate(pat.finditer(text)):
        qn = m.group(1) or str(i)
        body = (m.group(2) or "").strip()
        # Truncate at a solution marker if present to keep the stem/options only
        body = body.split("\nSol.")[0].strip()
        # Normalize whitespace
        body = re.sub(r"[ \t]+", " ", body)
        ans_num = m.group(3)
        # Build a stable id even if question numbers repeat across sections
        uid = f"neet:{qn}:{i}"
        if uid in used_ids:
            continue
        used_ids.add(uid)
        # Keep the whole body as the question (includes options like (1)...(4) when present)
        out.append({
            "id": uid,
            "question": body,
            "context": "",
            "answer": f"({ans_num})"
        })
    return out

# 2) Otherwise, strictly load from ANEETA repo's Processed Data
if not rows:
    if not repo_processed.exists():
        raise SystemExit(f"ANEETA processed data folder not found at: {repo_processed}")
    print("Using ANEETA repo processed data at:", repo_processed)

    # First, try specialized parse from solved_question_papers.json for MCQ Q/A pairs
    solved_path = repo_processed / "solved_question_papers.json"
    if solved_path.exists():
        parsed = parse_solved_mcqs(solved_path)
        if parsed:
            rows = parsed
            source_used = str(solved_path)

    # If still empty, fall back to generic extraction from other processed files (context only)
    if not rows:
        candidates = [
            "mentor_data.json",
            "processed_biology_chunks.json",
            "processed_chemistry_chunks.json",
            "processed_physics_chunks.json",
        ]
        def load_json(path: Path):
            try:
                data = json.loads(path.read_text(encoding='utf-8'))
                if isinstance(data, dict):
                    # flatten dict-of-lists if needed
                    flat = []
                    for v in data.values():
                        if isinstance(v, list):
                            flat.extend(v)
                    return flat or [data]
                return data if isinstance(data, list) else [data]
            except Exception:
                return []
        def get_ci(d: dict, keys: list[str]):
            # case-insensitive get
            lk = {k.lower(): k for k in d.keys()}
            for k in keys:
                if k.lower() in lk:
                    return d.get(lk[k.lower()])
            return None
        extracted = []
        used_file = None
        for name in candidates:
            f = repo_processed / name
            if not f.exists():
                continue
            data = load_json(f)
            for i, rec in enumerate(data):
                if not isinstance(rec, dict):
                    continue
                # These processed files generally have 'page_content' text chunks
                ctx = get_ci(rec, ["page_content","content","text","chunk"]) or ""
                if isinstance(ctx, (dict, list)):
                    ctx = json.dumps(ctx, ensure_ascii=False)
                if isinstance(ctx, str) and len(ctx.strip()) >= 20:
                    # Use the chunk as context and prompt the agent with a generic instruction
                    q_text = ctx.strip().split("\n")[0]
                    extracted.append({
                        "id": f"{name}:{i}",
                        "question": q_text[:500],
                        "context": ctx[:2000],
                        "answer": ""
                    })
            if extracted:
                rows = extracted
                used_file = f
                break
        source_used = str(used_file) if used_file else source_used

# Build DataFrame
import pandas as pd

df = pd.DataFrame(rows)
if df.empty:
    raise SystemExit("Empty dataset after parsing ANEETA data.")
print("Loaded rows:", len(df), "from:", source_used)

# Normalize columns and order
df = df.rename(columns={"sample_id":"id"})
for col in ["id","question","context","answer"]:
    if col not in df.columns:
        df[col] = None
# Cap dataset size for quick iteration
max_n = int(os.getenv("ANEETA_MAX_RECORDS", "250"))
df = df[["id","question","context","answer"]].head(max_n)
df.head(3)

Using ANEETA repo processed data at: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\ANEETA\Processed Data
Loaded rows: 8890 from: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\ANEETA\Processed Data\solved_question_papers.json
Loaded rows: 8890 from: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\ANEETA\Processed Data\solved_question_papers.json


Unnamed: 0,id,question,context,answer
0,neet:1:0,Two objects of mass 10 kg and 20 kg respective...,,(3)
1,neet:2:1,Match List-I with List-II \n List-I \n(Electro...,,(1)
2,neet:3:2,The energy that will be ideally radiated by a ...,,(2)


In [46]:
# HOTFIX: ensure jsonlines is available
import sys, subprocess
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "jsonlines"], check=True)
import jsonlines; print("jsonlines installed and imported")

jsonlines installed and imported


## Baseline ANEETA Doubt Solver (non‑DSPy)
The following cells call ANEETA’s mcq_question_solver_agent through the LangGraph workflow to provide a baseline Doubt Solver run without DSPy.

In [65]:
# Configure ANEETA environment (local-only)
import os, sys, re

# Ensure env is set BEFORE any aneeta imports
non_tools = {"phi3.5", "gemma2:2b", "deepseek-r1:7b"}
if os.getenv("LLM_MODEL") in non_tools or not os.getenv("LLM_MODEL"):
    os.environ["LLM_MODEL"] = "llama3.1:8b"
if os.getenv("CREATIVE_LLM_MODEL") in non_tools or not os.getenv("CREATIVE_LLM_MODEL"):
    os.environ["CREATIVE_LLM_MODEL"] = "llama3.1:8b"
# Embeddings are required to open the persisted Chroma vector stores; use the model that matches the DBs
os.environ.setdefault("EMBEDDING_MODEL", "nomic-embed-text")
# Use the ANEETA repo's persisted Chroma DBs
os.environ.setdefault("VECTORDB_BASE_PATH", str((aneeta_dir / "src" / "aneeta" / "vectordb").resolve()))

print("LLM_MODEL:", os.environ["LLM_MODEL"]) 
print("CREATIVE_LLM_MODEL:", os.environ["CREATIVE_LLM_MODEL"]) 
print("EMBEDDING_MODEL:", os.environ["EMBEDDING_MODEL"])  # tip: if missing locally, run: ollama pull nomic-embed-text
print("ANEETA VECTORDB_BASE_PATH:", os.environ["VECTORDB_BASE_PATH"])

# Add ANEETA src to path for imports
sys.path.insert(0, str((aneeta_dir / "src").resolve()))
print("PYTHONPATH +:", (aneeta_dir/"src").resolve())

# Hard reset: clear Streamlit cache and fully purge ANEETA modules so model changes take effect
import importlib
try:
    import streamlit as st
    try:
        st.cache_resource.clear()
        print("Cleared streamlit cache_resource")
    except Exception:
        pass
except Exception:
    st = None  # type: ignore

# Purge ALL aneeta modules to avoid stale singletons (llm, resources, workflow, etc.)
for mod in list(sys.modules.keys()):
    if mod == "aneeta" or mod.startswith("aneeta."):
        sys.modules.pop(mod, None)

# Import in a clean order: LLM -> resources -> agents/router -> workflow
import aneeta.llm.llm as llm_mod
importlib.reload(llm_mod)

import aneeta.core.resources as resources_mod
importlib.reload(resources_mod)

import aneeta.nodes.agents as agents_mod
import aneeta.nodes.router as router_mod
importlib.reload(agents_mod)
importlib.reload(router_mod)

import aneeta.graph.workflow as workflow_mod
importlib.reload(workflow_mod)

# Print resolved model from ANEETA resources for verification
try:
    from aneeta.core.resources import llm as current_llm
    resolved_model = getattr(current_llm, "model", None) or getattr(current_llm, "model_name", None) or getattr(current_llm, "model_id", None)
    print("ANEETA resources -> llm.model:", resolved_model)
except Exception as e:
    print("Could not read ANEETA resources llm model:", e)

# Define a simple whitespace normalizer to clean streamed outputs
def normalize_whitespace(text: str) -> str:
    t = text.replace("\r", "")
    # Trim spaces around newlines
    t = re.sub(r"[ \t]+\n", "\n", t)
    t = re.sub(r"\n[ \t]+", "\n", t)
    # Collapse 3+ newlines to 2
    t = re.sub(r"\n{3,}", "\n\n", t)
    # Collapse runs of spaces
    t = re.sub(r" {2,}", " ", t)
    # Remove space before punctuation like , . ; : ! ?
    t = re.sub(r"\s+([,.;:!?])", r"\1", t)
    # Fix spaced MCQ options like ( A ) -> (A)
    t = re.sub(r"\(\s*([A-D1-4])\s*\)", r"(\1)", t)
    return t.strip()

# Define baseline solve_mcq using ANEETA LangGraph
from langchain_core.messages import HumanMessage

def solve_mcq(question: str, timeout: int = 120, language: str | None = None):
    try:
        graph = workflow_mod.get_graph()
        init_state = {
            "messages": [HumanMessage(content=question)],
            "user_explanation_language": language or os.getenv("ANEETA_EXPLANATION_LANG", "English"),
        }
        # Stream and collect final text (supports either messages or response_stream)
        final_text = []
        for update in graph.stream(init_state):
            for _, v in update.items():
                if isinstance(v, dict):
                    # If agent provided a streaming generator, consume it
                    rs = v.get("response_stream")
                    if rs is not None:
                        try:
                            for chunk in rs:
                                t = str(getattr(chunk, "content", chunk)).strip()
                                if t:
                                    final_text.append(t)
                        except Exception:
                            pass
                    # Also collect from messages if present
                    if "messages" in v:
                        msgs = v.get("messages") or []
                        if msgs and hasattr(msgs[-1], "content"):
                            final_text.append(str(msgs[-1].content))
        raw_text = "\n".join(final_text).strip()
        return {"text": normalize_whitespace(raw_text)}
    except Exception as e:
        return {"text": f"[baseline error: {e}]"}

# Smoke test
test_q = "Which organelle is the powerhouse of the cell? Options: (A) Ribosome (B) Mitochondria (C) Golgi (D) Lysosome"
print("Smoke test:")
print(solve_mcq(test_q).get("text", "" )[:300])

LLM_MODEL: llama3.1:8b
CREATIVE_LLM_MODEL: llama3.1:8b
EMBEDDING_MODEL: nomic-embed-text
ANEETA VECTORDB_BASE_PATH: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\ANEETA\src\aneeta\vectordb
PYTHONPATH +: C:\Users\yanji\Desktop\IS469\Project\469ProjectCombined\ANEETA\src
ANEETA resources -> llm.model: llama3.1:8b
Smoke test:
ANEETA resources -> llm.model: llama3.1:8b
Smoke test:
Answer:
(B)
Mit
ochond
ria
Explanation:
The
mitochond
ria
are
often
referred
to
as
the
"
power
house
"
of
the
cell
because
they
generate
most
of
the
cell
's
supply
of
ad
enos
ine
tri
ph
osphate
(
AT
P
),
which
is
used
as
a
source
of
chemical
energy.
In
addition
to
generating
ATP,
mitochond
ria
are
i
Answer:
(B)
Mit
ochond
ria
Explanation:
The
mitochond
ria
are
often
referred
to
as
the
"
power
house
"
of
the
cell
because
they
generate
most
of
the
cell
's
supply
of
ad
enos
ine
tri
ph
osphate
(
AT
P
),
which
is
used
as
a
source
of
chemical
energy.
In
addition
to
generating
ATP,
mitochond
ria
are
i


In [66]:
# Verify 'aneeta' import source and Ollama model availability
import os, importlib, requests
import aneeta
print("ANEETA package path:", getattr(aneeta, "__file__", aneeta))
print("LLM_MODEL (env):", os.environ.get("LLM_MODEL"))
try:
    from aneeta.llm.llm import get_llm
    llm = get_llm()
    # Many LangChain chat models expose .model_name, .model, or .model_id
    actual_model = getattr(llm, "model", None) or getattr(llm, "model_name", None) or getattr(llm, "model_id", None)
    print("LLM (resolved by ANEETA):", actual_model)
except Exception as e:
    print("Could not instantiate LLM to inspect model:", e)

try:
    r = requests.get(f"{OLLAMA_URL}/api/tags", timeout=10)
    if r.ok:
        available = {m.get('name') for m in r.json().get('models', []) if 'name' in m}
    else:
        available = set()
    shown = sorted(list(available))[:8]
    suffix = f"...+{max(0, len(available)-8)} more" if len(available) > 8 else ""
    print("Ollama installed models:", shown, suffix)
    need = os.environ.get("LLM_MODEL")
    if need and (available and need not in available):
        print(f"Note: model '{need}' not found in Ollama. Pull it: ollama pull {need}")
except Exception as e:
    print("Could not query Ollama tags:", e)

# Quick LLM connection smoke (will fail fast if model/server not ready)
try:
    from aneeta.llm.llm import get_llm
    _ = get_llm().invoke("hello")
    print("LLM connection OK.")
except Exception as e:
    print("LLM connection check failed:", e)

ANEETA package path: None
LLM_MODEL (env): llama3.1:8b
LLM (resolved by ANEETA): llama3.1:8b
Ollama installed models: ['Student:latest', 'deepseek-r1:7b', 'gemma2:2b', 'gemma2:9b', 'gemma3:1b', 'llama3.1:8b', 'mistral-nemo:12b', 'nomic-embed-text:latest'] ...+2 more
LLM (resolved by ANEETA): llama3.1:8b
Ollama installed models: ['Student:latest', 'deepseek-r1:7b', 'gemma2:2b', 'gemma2:9b', 'gemma3:1b', 'llama3.1:8b', 'mistral-nemo:12b', 'nomic-embed-text:latest'] ...+2 more
LLM connection OK.
LLM connection OK.


In [67]:
# Quick model switch micro-benchmark (local)
# This is optional; uses installed models only
try:
    r = requests.get(f"{OLLAMA_URL}/api/tags", timeout=10)
    names = [m.get('name') for m in r.json().get('models', [])]
except Exception:
    names = []

# Shortlist: baseline + two higher-quality candidates
model_shortlist = ["llama3.1:8b", "gemma2:9b", "mistral-nemo:12b"]
preferred = [m for m in model_shortlist if (not names) or (m in names)]
if not preferred:
    preferred = ["llama3.1:8b"]  # safe default

benchmarks = []
for name in preferred:
    os.environ["LLM_MODEL"] = name
    s = time.time(); _ = solve_mcq("What is the capital of France? Options: (A) Rome (B) Paris (C) Berlin (D) Madrid"); dt = (time.time()-s)*1000
    benchmarks.append({"model": name, "latency_ms": dt})

pd.DataFrame(benchmarks).sort_values("latency_ms").reset_index(drop=True)

Unnamed: 0,model,latency_ms
0,llama3.1:8b,38550.438643
1,gemma2:9b,47156.552792
2,mistral-nemo:12b,48115.099192


In [None]:
# Evaluate 3 MCQs across different local models and (optionally) log to MLflow
import os, numpy as np, pandas as pd, time, requests

# Quick reachability check to avoid confusing errors
def _ollama_ok(url: str) -> bool:
    try:
        r = requests.get(url, timeout=5)
        return r.status_code in (200, 404)
    except Exception:
        return False

if not _ollama_ok(OLLAMA_URL):
    print("Ollama not reachable at", OLLAMA_URL)
    print("Start the service in a PowerShell:")
    print("ollama serve    # then rerun this cell")
else:
    # Helper: get available Ollama tags to filter the test set
    available = set()
    try:
        r = requests.get(f"{OLLAMA_URL}/api/tags", timeout=10)
        if r.ok:
            for m in r.json().get('models', []):
                if 'name' in m:
                    available.add(m['name'])
    except Exception:
        pass

    # Shortlist for one-click comparisons
    model_shortlist = ["llama3.1:8b", "gemma2:9b", "mistral-nemo:12b"]
    models_to_test = [m for m in model_shortlist if (not available) or (m in available)]
    missing = [m for m in model_shortlist if m not in models_to_test]
    if missing:
        print("Missing models not found in Ollama; pull as needed:")
        for m in missing:
            print(f" - ollama pull {m}")
    if not models_to_test:
        models_to_test = ["llama3.1:8b"]  # safe default

    # Self-contained MCQ examples
    mcq_examples = [
        "A body moving in a circle of radius r with speed v has centripetal acceleration? Options: (A) v^2/r (B) r/v^2 (C) v/r^2 (D) r^2/v",
        "If 2 mol of an ideal gas at constant temperature compress to half the volume, what happens to pressure? Options: (A) doubles (B) halves (C) same (D) zero",
        "Which organelle is the powerhouse of the cell? Options: (A) Ribosome (B) Mitochondria (C) Golgi (D) Lysosome",
    ]

    def eval_model(model_name: str):
        os.environ["LLM_MODEL"] = model_name
        results, latencies = [], []
        for q in mcq_examples:
            t0 = time.time()
            out = solve_mcq(q)
            latencies.append((time.time()-t0)*1000)
            results.append(out.get("text",""))
        return {
            "model": model_name,
            "latency_p50_ms": float(np.percentile(latencies, 50)),
            "latency_p95_ms": float(np.percentile(latencies, 95)),
            "answers": results,
        }

    # Optional MLflow logging
    try:
        import mlflow
        mlflow.set_experiment("aneeta-baseline-doubtsolver")
        rows = []
        for m in models_to_test:
            with mlflow.start_run(run_name=f"baseline_{m}") as run:
                r = eval_model(m)
                mlflow.log_params({"model": m, "agent": "mcq_question_solver"})
                mlflow.log_metrics({"latency_p50_ms": r["latency_p50_ms"], "latency_p95_ms": r["latency_p95_ms"]})
                rows.append(r)
        pd.DataFrame(rows)
    except Exception:
        # If mlflow isn't installed/available, just run and show a table
        rows = [eval_model(m) for m in models_to_test]
        pd.DataFrame(rows)