## RUNNING THE EVALUATION

In [8]:
# =========================
# Imports
# =========================
from evaluation.eval_runner import run_daily_eval
from evaluation.daily_cases import DAILY_CASES
from llm_core.agent_process import agent_process

from collections import Counter

# =========================
# Run DAILY evaluation
# =========================
def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0.0


# Run the daily cases through the evaluation runner. It returns a mapping of
# named counters and a mapping of quality label -> number of cases.
daily_stats, quality_dist = run_daily_eval(DAILY_CASES)

total_cases = len(DAILY_CASES)

print("=== DAILY EVALUATION (Layer A) ===\n")

# JSON validity: share of all cases counted under "json_valid".
# Guarded so an empty case list reports 0.00% instead of raising
# ZeroDivisionError.
json_valid_rate = _ratio(daily_stats.get("json_valid", 0), total_cases)
print(f"JSON validity rate: {json_valid_rate:.2%}")

# Clarification metrics (confusion-matrix counters).
# Counters are read with .get(..., 0) so a counter that was never incremented
# counts as zero and reading it never adds a key to the stats mapping.
tp = daily_stats.get("clar_tp", 0)
fp = daily_stats.get("clar_fp", 0)
fn = daily_stats.get("clar_fn", 0)
tn = daily_stats.get("clar_tn", 0)

clar_precision = _ratio(tp, tp + fp)
clar_recall = _ratio(tp, tp + fn)
false_clar_rate = _ratio(fp, fp + tn)

print(f"Clarification precision: {clar_precision:.2%}")
print(f"Clarification recall:    {clar_recall:.2%}")
print(f"False clarification rate:{false_clar_rate:.2%}")

# Task ID correctness (per the original note: only SUCCESS cases are counted
# inside the eval runner — confirm against run_daily_eval).
task_valid = daily_stats.get("task_id_valid", 0)
task_invalid = daily_stats.get("task_id_invalid", 0)
task_id_correctness = _ratio(task_valid, task_valid + task_invalid)

print(f"Task ID correctness: {task_id_correctness:.2%}")

# Quality distribution (diagnostic): each label's share of ALL cases, so the
# shares need not add up to 100% if some cases carry no quality label.
print("\nQuality distribution:")
for quality, count in quality_dist.items():
    share = _ratio(count, total_cases)
    print(f"  {quality}: {share:.2%}")

print("\nRaw stats:")
print(dict(daily_stats))

=== DAILY EVALUATION (Layer A) ===

JSON validity rate: 93.75%
Clarification precision: 62.50%
Clarification recall:    83.33%
False clarification rate:33.33%
Task ID correctness: 85.71%

Quality distribution:
  DETAIL_OK: 56.25%
  GREAT: 12.50%
  TOO_SHORT: 18.75%
  EMPTY: 6.25%

Raw stats:
{'action_success': 7, 'clar_tn': 6, 'json_valid': 15, 'task_id_valid': 6, 'action_ask_clarification': 8, 'clar_tp': 5, 'clar_fp': 3, 'task_id_invalid': 1, 'clar_fn': 1, 'runtime_error': 1}


In [3]:
# =========================
# ANALYTICS EVALUATION
# =========================
from evaluation.eval_runner import run_analytics_eval
from evaluation.analytics_cases import ANALYTICS_CASES

# Score the analytics cases; the runner hands back a mapping of named counters.
analytics_stats = run_analytics_eval(ANALYTICS_CASES)
total_cases = len(ANALYTICS_CASES)  # not read again in this cell

print("=== ANALYTICS EVALUATION (Layer A) ===\n")

# Intent accuracy: correct / (correct + wrong); a missing counter counts as 0.
intent_correct, intent_wrong = (
    analytics_stats.get(name, 0) for name in ("intent_correct", "intent_wrong")
)
intent_scored = intent_correct + intent_wrong
if intent_scored > 0:
    intent_accuracy = intent_correct / intent_scored
else:
    intent_accuracy = 0.0

# Unsupported-intent correctness, computed the same way from its own counters.
unsupported_correct, unsupported_wrong = (
    analytics_stats.get(name, 0)
    for name in ("unsupported_correct", "unsupported_wrong")
)
unsupported_scored = unsupported_correct + unsupported_wrong
if unsupported_scored > 0:
    unsupported_accuracy = unsupported_correct / unsupported_scored
else:
    unsupported_accuracy = 0.0

print(f"Intent accuracy (supported): {intent_accuracy:.2%}")
print(f"Unsupported intent correctness: {unsupported_accuracy:.2%}")

# Dump the untouched counters for manual inspection.
print("\nRaw stats:", dict(analytics_stats), sep="\n")

=== ANALYTICS EVALUATION (Layer A) ===

Intent accuracy (supported): 92.31%
Unsupported intent correctness: 100.00%

Raw stats:
{'intent_correct': 12, 'intent_wrong': 1, 'unsupported_correct': 5}
