In [None]:
# === Strict NLU Intent Evaluation (no heuristics) ===
# Evaluates 5 intent classes: RAG, RAG+SQL_tool, RAG+CV_tool, CV_tool, SQL_tool

# --- Auto-install required packages if missing ---
import importlib
import subprocess
import sys

# pip distribution names of everything this cell needs.
required_packages = [
    "numpy",
    "pandas",
    "scikit-learn",
    "matplotlib",
    "sentence-transformers"
]

# Distribution names that differ from the importable module name. Probing with
# the distribution name ("scikit-learn", "sentence-transformers") can never
# succeed, so those packages were reinstalled on every run even when present.
_IMPORT_NAMES = {
    "scikit-learn": "sklearn",
    "sentence-transformers": "sentence_transformers",
}

for pkg in required_packages:
    try:
        # Probe with the module name; fall back to the pip name when they match.
        importlib.import_module(_IMPORT_NAMES.get(pkg, pkg))
    except ImportError:
        print(f"üîß Installing missing package: {pkg} ...")
        # Install into the interpreter running this cell, not whatever `pip`
        # happens to be first on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        # Make the running import system notice the freshly installed files.
        importlib.invalidate_caches()

# --- Imports (after ensuring they exist) ---
from pathlib import Path
from typing import Optional
import importlib.util
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

# The five intent labels under evaluation. The same list is passed as `labels=`
# to the classification report and confusion matrix below, so its order fixes
# the row/column order of both.
INTENTS = ["RAG", "RAG+SQL_tool", "RAG+CV_tool", "CV_tool", "SQL_tool"]

def synthesize_examples() -> pd.DataFrame:
    """Build the synthetic intent-evaluation dataset.

    Returns:
        A DataFrame with one row per query and the columns ``text``,
        ``image_uri`` and ``label``. Each of the five intents contributes
        25 queries, in the order RAG, RAG+SQL_tool, RAG+CV_tool, CV_tool,
        SQL_tool. ``image_uri`` is a placeholder file URI for the two
        image-based intents and ``None`` for the others.
    """
    sample_image = "file://example_image.jpg"

    # Each entry: (intent label, image URI shared by the group, queries).
    intent_groups = (
        # RAG: knowledge / SOP questions answered from documents.
        ("RAG", None, (
            "What are the recommended maintenance steps for turf in spring?",
            "Summarize best practices for playground equipment inspection.",
            "List safety precautions for lawn mower operation.",
            "How do we prepare fields after heavy rain?",
            "Give me the SOP for trimming shrubs near walkways.",
            "What materials are needed for seeding bare patches?",
            "Explain the weekly inspection checklist for park benches.",
            "When should fertilizer be applied to soccer fields?",
            "What is the difference between dethatching and aeration?",
            "How do we restore compacted soil in high-traffic areas?",
            "Provide guidance on safe use of herbicides around playgrounds.",
            "Outline the steps to winterize irrigation systems.",
            "What are the typical signs of fungal disease on turf?",
            "How often should gravel paths be graded?",
            "What is the recommended height for grass cutting in summer?",
            "How to handle invasive species along the fence line?",
            "Create a checklist for post-storm cleanup in small parks?",
            "What are the PPE requirements for chainsaw use?",
            "Give me maintenance intervals for wooden seating areas?",
            "How to reduce trip hazards on uneven paths?",
            "What should we document during routine inspections?",
            "List common causes of poor drainage in park lawns?",
            "How to manage clippings for sustainability?",
            "Describe key steps for seedbed preparation?",
            "What are typical warning signs of equipment wear?",
        )),
        # RAG+SQL_tool: structured analysis plus explanation / citation.
        ("RAG+SQL_tool", None, (
            "Show monthly mowing labor cost for Kitsilano in 2024, and explain reasons based on SOPs.",
            "Trend of total maintenance costs from May to August 2023 with relevant policy notes.",
            "Which park had the highest mowing expense last month? Cite guidance for cost control.",
            "Compare trimming costs between Stanley Park and Queen Elizabeth Park for Q2 2024 and provide context.",
            "When was the last mowing date for Trout Lake Park, and what is the recommended frequency?",
            "Break down July costs by park for line marking; include recommended thresholds.",
            "Top-5 parks by maintenance hours in September 2024 with high-level interpretations.",
            "Average weekly mowing cost for community parks this summer and reference staffing SOPs.",
            "What is the cost trend for fertilization in 2023 and relevant scheduling guidance?",
            "List parks with above-average irrigation repair costs in June and cite possible causes.",
            "How many hours were logged for hedge trimming in April and what is typical?",
            "What is the median mowing cost per park in Q3 2024, with normative ranges?",
            "Monthly cost comparison for baseball vs soccer fields in 2024 and maintenance insights?",
            "Give me the maintenance cost by park and month for October with brief commentary.",
            "Which two parks had the lowest total cost in March and are they under-serviced?",
            "Find anomalies in mowing cost for August 2023 and discuss likely drivers.",
            "Return cost breakdown by activity for Stanley Park, May 2024, with notes.",
            "Compute YOY change in mowing labor cost for July and give interpretation.",
            "List parks with zero mowing activity in June and reference schedule expectations.",
            "Which park‚Äôs cost trend is increasing fastest this quarter and potential reasons?",
            "How many visits occurred for irrigation fixes last month and suggested follow-up?",
            "Total cost across all parks for Q1 2024 with short narrative.",
            "Which park has the longest time since last mowing and SOP expectations?",
            "Give me a monthly histogram of costs for field renovation with brief reading.",
            "Return the last three maintenance dates for playgrounds in July, plus scheduling note.",
        )),
        # RAG+CV_tool: image input combined with textual knowledge.
        ("RAG+CV_tool", sample_image, (
            "Given this photo, assess the turf condition and suggest maintenance steps.",
            "Use the image to identify field type and recommend actions.",
            "Analyze the attached picture for safety hazards and cite SOP steps.",
            "From this image, diagnose drainage issues and propose remedies.",
            "Check the photo: is the grass height within recommended range?",
            "Identify bare patches in the image and suggest treatment.",
            "Review this field photo and list likely fungal issues with references.",
            "From the image, does the infield surface require grading?",
            "Using the picture, evaluate line-mark visibility and next actions.",
            "Assess the goalmouth wear in the image and provide SOP-based guidance.",
            "Review the image for debris hazards and outline cleanup steps.",
            "From the photo, is dethatching needed? Provide rationale.",
            "Does the picture indicate irrigation leaks? Provide evidence.",
            "Check if mower scalping occurred in this image and recommend fix.",
            "Assess weed prevalence in the photo and cite herbicide precautions.",
            "Evaluate compaction from the image and recommend aeration schedule.",
            "Review surface evenness from this picture and list remediation steps.",
            "From the image, is there standing water? Provide maintenance plan.",
            "Check fence-line overgrowth in the photo and SOP guidance.",
            "Verify if seed establishment is adequate based on the image.",
            "From this image, prioritize safety issues and immediate actions.",
            "Assess turf color variability and potential nutrient deficiency.",
            "Check if field markings meet visibility standards in the photo.",
            "From the image, estimate thatch level and next steps.",
            "Review this picture for sign of pest damage and cite sources.",
        )),
        # CV_tool: vision only.
        ("CV_tool", sample_image, (
            "From this image, rate the field condition from 1 to 5.",
            "Detect hazards in the attached photo and output as JSON.",
            "Classify the field type in this image.",
            "Identify visible maintenance needs from the photo only.",
            "Assess grass height from the image and return a numeric rating.",
            "Detect bare patches and estimate area coverage percentage.",
            "Identify standing water regions in the photo.",
            "Check for line-mark visibility solely from the image.",
            "Classify surface evenness from the picture.",
            "Detect debris or foreign objects in the field image.",
            "Rate weed presence in the image on a scale of 0‚Äì3.",
            "Identify mower scalping in the photo if present.",
            "Detect fence-line overgrowth in the picture.",
            "Identify soil compaction indicators visually.",
            "Rate overall playability from the image only.",
            "Detect irrigation leakage indicators in the photo.",
            "Estimate thatch accumulation from the image.",
            "Classify surface type (turf, dirt, gravel) from the photo.",
            "Detect fungal disease signs visually.",
            "Identify safety-critical issues from the image alone.",
            "Classify marking completeness visually.",
            "Detect areas requiring immediate remediation.",
            "Rate turf density from the image.",
            "Identify pest damage patterns visually.",
            "Provide a visual-only condition summary.",
        )),
        # SQL_tool: tables / numbers only, no explanation.
        ("SQL_tool", None, (
            "Return a table of monthly mowing labor cost for Kitsilano in 2024. Only numbers.",
            "DuckDB SQL: last mowing date by park for July. Output as rows only.",
            "Compute median mowing cost per park in Q3 2024. No explanation.",
            "List top-5 parks by total maintenance hours in September 2024; table only.",
            "Total cost by activity for May 2024; return CSV.",
            "Histogram bins of monthly costs for field renovation; counts only.",
            "Show parks with zero mowing activity in June; one column.",
            "YOY change in mowing labor cost for July 2023 vs 2024; numeric output.",
            "Maintenance cost by park and month for October; data frame only.",
            "Visits for irrigation fixes last month; count per park.",
            "Breakdown of July costs for line marking; two columns (park, cost).",
            "Compare trimming hours between two parks in Q2 2024; numbers only.",
            "Median and IQR of mowing cost for community parks; table.",
            "Total cost across all parks for Q1 2024; single scalar.",
            "Longest time since last mowing per park; days as integer.",
            "Mean weekly mowing cost this summer; numeric with std.",
            "List anomalies in mowing cost for August 2023; park and z-score.",
            "Return last three maintenance dates per playground; rows only.",
            "Monthly cost trend for fertilization in 2023; series output.",
            "Top-10 parks by irrigation repair cost in June; table.",
            "Cost comparison for baseball vs soccer fields in 2024; pivot table.",
            "Count of visits for hedge trimming in April; integer only.",
            "Above-average irrigation repair costs in June; filter and list.",
            "Maintenance hours boxplot stats for July; quartiles only.",
            "Parks with lowest total cost in March; list two names only.",
        )),
    )

    # Flatten the groups into row dicts, preserving group and query order.
    records = [
        {"text": query, "image_uri": uri, "label": intent}
        for intent, uri, queries in intent_groups
        for query in queries
    ]
    return pd.DataFrame(records)

df = synthesize_examples()

# --- Load the project's nlu.py straight from disk so its nlu_parse can be called ---
# __file__ exists when this runs as a script; fall back to the working
# directory otherwise (e.g. when executed as a notebook cell).
CURRENT_DIR = Path(__file__).parent if "__file__" in globals() else Path.cwd()
# The notebook lives in experiment/; nlu.py sits one directory above it.
NLU_PATH = (CURRENT_DIR.parent / "nlu.py").resolve()
print("‚úÖ NLU path resolved to:", NLU_PATH)

# Fail early with the resolved path in the message instead of an error raised
# from deep inside the import machinery.
if not NLU_PATH.is_file():
    raise FileNotFoundError(f"nlu.py not found at {NLU_PATH}")

spec = importlib.util.spec_from_file_location("user_nlu", NLU_PATH)
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot build an import spec for {NLU_PATH}")
module = importlib.util.module_from_spec(spec)
# Register the module before executing it (as in the importlib recipe for
# importing a source file directly) so code inside nlu.py that resolves its own
# module through sys.modules works during import.
sys.modules[spec.name] = module
try:
    spec.loader.exec_module(module)
except BaseException:
    # Do not leave a half-initialised module registered.
    sys.modules.pop(spec.name, None)
    raise
# Explicit raise rather than assert: asserts are stripped under `python -O`.
if not hasattr(module, "nlu_parse"):
    raise AttributeError("nlu_parse not found in nlu.py")

def predict_intent_strict(text: str, image_uri: Optional[str]) -> str:
    """Return the intent that the loaded ``nlu_parse`` assigns to one query.

    No fallback heuristics are applied: the result of ``module.nlu_parse`` is
    used as-is.

    Args:
        text: The user query.
        image_uri: URI of an attached image, or ``None`` when there is none.

    Returns:
        The ``intent`` value of the parse result, converted with ``str``.

    Raises:
        ValueError: If the result is neither a dict containing an ``"intent"``
            key nor an object exposing an ``intent`` attribute.
    """
    parsed = module.nlu_parse(text=text, image_uri=image_uri)

    # Mapping-style results take precedence over attribute-style ones.
    if isinstance(parsed, dict) and "intent" in parsed:
        intent = parsed["intent"]
    elif hasattr(parsed, "intent"):
        intent = parsed.intent
    else:
        raise ValueError("nlu_parse must return a dict/object with an 'intent' field.")
    return str(intent)

# Run the strict predictor on every row (query text plus optional image URI).
df["pred_intent"] = df.apply(
    lambda row: predict_intent_strict(row["text"], row["image_uri"]),
    axis=1,
)

# --- Metrics ---
y_true = df["label"].values
y_pred = df["pred_intent"].values
overall_accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", round(overall_accuracy, 4))
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, labels=INTENTS, zero_division=0))

# --- Confusion matrix ---
# Rows are true labels and columns are predictions, both ordered as INTENTS.
cm = confusion_matrix(y_true, y_pred, labels=INTENTS)
tick_positions = np.arange(len(INTENTS))

fig, ax = plt.subplots(figsize=(6.8, 5.6))
ax.imshow(cm, interpolation='nearest')
ax.set_title('NLU Intent Confusion Matrix (5 intents) ‚Äî Strict nlu_parse')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_xticks(tick_positions)
ax.set_xticklabels(INTENTS, rotation=45, ha='right')
ax.set_yticks(tick_positions)
ax.set_yticklabels(INTENTS)
# Write each count into its cell; x is the predicted column, y the true row.
for true_idx, row_counts in enumerate(cm):
    for pred_idx, count in enumerate(row_counts):
        ax.text(pred_idx, true_idx, str(count), ha='center', va='center')
fig.tight_layout()
plt.show()

# --- Export the misclassified rows (and the full dataset) for debugging ---
is_misclassified = df["label"] != df["pred_intent"]
errors = df[is_misclassified][["text", "image_uri", "label", "pred_intent"]]
errors.to_csv("nlu_eval_misclassified_strict.csv", index=False)
df.to_csv("nlu_eval_dataset_strict.csv", index=False)
print("Saved: nlu_eval_dataset_strict.csv, nlu_eval_misclassified_strict.csv")

üîß Installing missing package: scikit-learn ...
üîß Installing missing package: sentence-transformers ...


NameError: name 'Path' is not defined