## 1. Setup

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import precision_score, recall_score, f1_score
import seaborn as sns
import json
from tqdm.notebook import tqdm
import shutil
from google.colab import files



# Name of the current experiment; it determines the output directory name.
EXPERIMENT_NAME = "Dataset_Original"
# Directory that receives every figure, report and CSV written by this notebook.
RESULTS_DIR = f"results_{EXPERIMENT_NAME}"

# Create the output directory up front. `exist_ok=True` replaces the
# check-then-create pattern (os.path.exists followed by os.makedirs), which
# can race with another process creating the same directory.
os.makedirs(RESULTS_DIR, exist_ok=True)

## 2. Model loading & Config

In [None]:
# Load the sentence-embedding model used for every semantic comparison below.
# A load failure is only printed; `sbert_model` is then left undefined.
try:
    sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
except Exception as load_error:
    print(load_error)

# Hierarchy map for relaxed evaluation: fine-grained label -> parent label.
HIERARCHY_MAP = dict(
    Trigger="Condition",
    Precondition="Condition",
    System_response="Action",
    Main_actor="Entity",
)

# Annotation fields read from both the ground truth and the predictions.
TARGET_COLUMNS = [
    "Purpose",
    "Condition",
    "Main_actor",
    "Entity",
    "System_response",
    "Action",
    "Precondition",
    "Trigger",
]

## 3. Ground Truth Loader (JSON List Format)

In [None]:
def normalize_field(value):
    """Coerce a raw annotation field into a list of clean strings.

    * A string is split on the ``&-&`` separator; each piece is stripped and
      empty pieces are dropped.
    * A list keeps only its truthy elements; each is converted with ``str``
      and stripped, and elements that become empty are dropped.
    * Anything else (including ``None``) yields an empty list.
    """
    if isinstance(value, str):
        pieces = (chunk.strip() for chunk in value.split('&-&'))
        return [piece for piece in pieces if piece]

    if isinstance(value, list):
        cleaned = []
        for element in value:
            if not element:
                # Falsy elements (None, "", 0, ...) are discarded.
                continue
            text = str(element).strip()
            if text:
                cleaned.append(text)
        return cleaned

    return []

def load_and_normalize_ground_truth(file_path):
    """Load the ground-truth JSON file and index it by requirement id.

    The file must contain a JSON list of objects. Each object needs an ``id``;
    its ``Text`` value and every field named in ``TARGET_COLUMNS`` are read,
    and the fields are normalized with ``normalize_field``.

    Args:
        file_path: Path of the ground-truth JSON file.

    Returns:
        A dict mapping ``"req_<id>"`` to an entry of the form
        ``{'text': <str>, <column>: [<str>, ...], ...}``. An empty dict is
        returned (and the reason printed) when the file is missing, is not
        valid JSON, or does not contain a top-level list.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"[ground truth] File not found: {file_path}")
        return {}
    except json.JSONDecodeError as err:
        print(f"[ground truth] Invalid JSON in {file_path}: {err}")
        return {}

    if not isinstance(data, list):
        print(f"[ground truth] Expected a JSON list in {file_path}, "
              f"got {type(data).__name__}")
        return {}

    normalized_data = {}
    skipped = 0

    for item in tqdm(data, desc="Parsing JSON"):
        # A stray non-object element must not abort the whole load.
        if not isinstance(item, dict):
            skipped += 1
            continue

        rid = item.get('id')
        if rid is None:
            skipped += 1
            continue

        # `or ''` also covers an explicit null, so consumers can always
        # treat the text as a string.
        entry = {'text': item.get('Text') or ''}
        for col in TARGET_COLUMNS:
            entry[col] = normalize_field(item.get(col))

        normalized_data[f"req_{rid}"] = entry

    if skipped:
        print(f"[ground truth] Skipped {skipped} entries without a usable 'id'")

    return normalized_data

# --- EXECUTION ---
# Ground-truth annotation file, resolved relative to the working directory.
GT_FILENAME = 'requirements.json'

# Maps "req_<id>" to the normalized annotations; empty dict if loading failed.
ground_truth_data = load_and_normalize_ground_truth(GT_FILENAME)

## 4. Prediction Loader

In [None]:
# Prediction files, one per prompting strategy / architecture.
# Each is passed to load_predictions_json below, which returns an empty dict
# for a file that cannot be loaded.
ZERO_SHOT_FILE = 'zero_shot_predictions.json'
ONE_SHOT_FILE = 'one_shot_predictions.json'
FEW_SHOT_FILE = 'few_shot_predictions.json'
MULTI_AGENT_FILE = 'multi_agent_predictions.json'
MULTI_AGENT_FILE_1 = 'multi_agent_predictions1.json'

def load_predictions_json(file_path):
    """Load one model's predictions and index them by requirement id.

    The file must contain a JSON list of objects with an ``id``. The predicted
    fields are read from the nested ``prediction`` object when that key is
    present, otherwise from the entry itself. Every field named in
    ``TARGET_COLUMNS`` is normalized with ``normalize_field``.

    Malformed entries (non-object items, or a ``prediction`` that is not an
    object) are skipped one by one. Previously a single such entry raised
    inside a broad ``except`` and silently discarded the whole file.

    Args:
        file_path: Path of the predictions JSON file.

    Returns:
        A dict mapping ``"req_<id>"`` to ``{<column>: [<str>, ...], ...}``.
        An empty dict is returned (and the reason printed) when the file
        cannot be read or parsed, or holds no top-level list.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"[predictions] File not found: {file_path}")
        return {}
    except Exception as err:
        # Best effort, as before: any read/parse problem yields "no data",
        # but the cause is now reported instead of being swallowed.
        print(f"[predictions] Could not load {file_path}: {err}")
        return {}

    preds = {}

    if not isinstance(data, list):
        print(f"[predictions] Expected a JSON list in {file_path}, "
              f"got {type(data).__name__}")
        return preds

    skipped = 0
    for item in data:
        if not isinstance(item, dict):
            skipped += 1
            continue

        rid = item.get('id')
        if rid is None:
            continue

        source = item.get('prediction', item)
        if not isinstance(source, dict):
            # e.g. the model returned free text instead of a JSON object.
            skipped += 1
            continue

        preds[f"req_{rid}"] = {
            col: normalize_field(source.get(col)) for col in TARGET_COLUMNS
        }

    if skipped:
        print(f"[predictions] Skipped {skipped} malformed entries in {file_path}")

    return preds

# --- EXECUTION ---
def _load_and_announce(path, label):
    """Load one prediction file and print `label` when it yielded any data."""
    loaded = load_predictions_json(path)
    if loaded:
        print(label)
    return loaded


zero_shot_preds = _load_and_announce(ZERO_SHOT_FILE, "zero shot")
one_shot_preds = _load_and_announce(ONE_SHOT_FILE, "one shot")
few_shot_preds = _load_and_announce(FEW_SHOT_FILE, "few shot")
multi_agent_preds = _load_and_announce(MULTI_AGENT_FILE, "multi")
multi_agent_preds_1 = _load_and_announce(MULTI_AGENT_FILE_1, "multi_1")

## 5. Core Evaluation Engine (Semantic Similarity)

In [None]:
def evaluate_single_prediction(gt_list, pred_list, threshold):
    """Greedily match predicted items to ground-truth items.

    Predictions are processed in input order. A prediction that equals a
    remaining ground-truth string is paired with it at similarity 1.0.
    Otherwise it is paired with the remaining ground-truth string of highest
    SBERT cosine similarity. A pair counts as a true positive only when its
    similarity reaches ``threshold``; a matched ground-truth item is then
    removed so it cannot be matched twice.

    Ground-truth embeddings are computed at most once per call and only when
    a semantic comparison is needed. Each string is still encoded on its own,
    so the similarity values are the same as when re-encoding every time.

    Args:
        gt_list: Ground-truth strings for one field (not modified).
        pred_list: Predicted strings for the same field (not modified).
        threshold: Minimum cosine similarity for a match.

    Returns:
        Tuple ``(tp, fp, fn)``: matched pairs, unmatched predictions
        (hallucinations) and unmatched ground-truth items (omissions).
    """
    remaining_gt = list(gt_list)
    # Lazily filled; kept index-aligned with `remaining_gt` once created.
    gt_embeddings = None
    tp = 0
    unmatched_preds = 0

    for pred in pred_list:
        best_match_idx = -1
        best_sim = -1

        if pred in remaining_gt:
            best_sim = 1.0
            best_match_idx = remaining_gt.index(pred)
        elif remaining_gt:
            if gt_embeddings is None:
                gt_embeddings = [
                    sbert_model.encode(g, convert_to_tensor=True)
                    for g in remaining_gt
                ]
            emb_p = sbert_model.encode(pred, convert_to_tensor=True)

            for i, emb_g in enumerate(gt_embeddings):
                sim = util.cos_sim(emb_p, emb_g).item()
                if sim > best_sim:
                    best_sim = sim
                    best_match_idx = i

        if best_match_idx != -1 and best_sim >= threshold:
            tp += 1
            # Remove the matched GT item (and its embedding) to avoid
            # double counting.
            remaining_gt.pop(best_match_idx)
            if gt_embeddings is not None:
                gt_embeddings.pop(best_match_idx)
        else:
            unmatched_preds += 1

    fp = unmatched_preds      # hallucinations
    fn = len(remaining_gt)    # omissions
    return tp, fp, fn

def run_full_evaluation_optimized(gt_data, pred_data, threshold):
    """Score one prediction set against the ground truth.

    TP/FP/FN counts are pooled over every requirement and every column in
    ``TARGET_COLUMNS`` before the ratios are taken, i.e. the returned metrics
    are micro-averaged. A requirement with no prediction contributes all of
    its ground-truth items as false negatives. Predictions whose id is not in
    ``gt_data`` are never visited.

    Returns:
        ``{"precision": ..., "recall": ..., "f1": ...}``; a ratio whose
        denominator is zero is reported as 0.
    """
    tp_sum = fp_sum = fn_sum = 0

    for req_id, gt_entry in gt_data.items():
        if req_id in pred_data:
            pred_entry = pred_data[req_id]
            for col in TARGET_COLUMNS:
                tp, fp, fn = evaluate_single_prediction(
                    gt_entry.get(col, []),
                    pred_entry.get(col, []),
                    threshold,
                )
                tp_sum += tp
                fp_sum += fp
                fn_sum += fn
        else:
            # Nothing was predicted for this requirement: all GT items are missed.
            fn_sum += sum(len(gt_entry.get(col, [])) for col in TARGET_COLUMNS)

    predicted_total = tp_sum + fp_sum
    expected_total = tp_sum + fn_sum

    precision = tp_sum / predicted_total if predicted_total > 0 else 0
    recall = tp_sum / expected_total if expected_total > 0 else 0

    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

    return {"precision": precision, "recall": recall, "f1": f1}

## 6. Threshold Optimization (Precision / Recall / F1 Sweep)

In [None]:
# Candidate cosine-similarity thresholds: 15 evenly spaced values in [0.15, 0.99].
thresholds = np.linspace(0.15, 0.99, 15)
# Metrics per threshold, index-aligned with `thresholds`.
f1_scores = []
precisions = []
recalls = []

# Pick the prediction set used for tuning: the first non-empty one in this
# fixed priority order (Multi-Agent_1, Multi-Agent, Few-, One-, Zero-Shot).
if multi_agent_preds_1:
    tuning_preds = multi_agent_preds_1
    tuning_name = "Multi-Agent_1"
    print(f"MultiAgent_1")
elif multi_agent_preds:
    tuning_preds = multi_agent_preds
    tuning_name = "Multi-Agent"
    print(f"MultiAgent")
elif few_shot_preds:
    tuning_preds = few_shot_preds
    tuning_name = "Few-Shot"
    print(f"few-shot")
elif one_shot_preds:
    tuning_preds = one_shot_preds
    tuning_name = "One-Shot"
    print(f"one-shot")
elif zero_shot_preds:
    tuning_preds = zero_shot_preds
    tuning_name = "Zero-Shot"
    print(f"zeroshot")
else:
    tuning_preds = {}
    tuning_name = "None"

if tuning_preds:
    # Re-run the full evaluation once per candidate threshold.
    # NOTE(review): the threshold is tuned on the same ground truth that is
    # later used to report results — confirm this is intended.
    for t in tqdm(thresholds, desc="Testing Thresholds"):
        metrics = run_full_evaluation_optimized(ground_truth_data, tuning_preds, threshold=t)
        f1_scores.append(metrics['f1'])
        precisions.append(metrics['precision'])
        recalls.append(metrics['recall'])

    # --- Visualization ---
    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, f1_scores, marker='o', label='F1 Score', linewidth=3, color='#1f77b4')
    plt.plot(thresholds, precisions, linestyle='--', label='Precision', color='#2ca02c')
    plt.plot(thresholds, recalls, linestyle=':', label='Recall', color='#d62728')

    plt.title(f"Threshold Optimization Analysis ({tuning_name})", fontsize=14)
    plt.xlabel("Cosine Similarity Threshold", fontsize=12)
    plt.ylabel("Score", fontsize=12)
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Save before plt.show() so the figure is written while it is still current.
    save_path = f"{RESULTS_DIR}/1_threshold_tuning_{tuning_name}.png"
    plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

    # --- Select Best Threshold ---
    # np.argmax returns the first maximum, so ties go to the lowest threshold.
    if f1_scores:
        best_idx = np.argmax(f1_scores)
        best_threshold = thresholds[best_idx]
        print(f"Optimal Threshold Found: {best_threshold:.2f} (Max F1: {f1_scores[best_idx]:.3f})")
    else:
        best_threshold = 0.70
else:
    # No predictions were loaded: fall back to a fixed default threshold.
    best_threshold = 0.70

## 7. Comparative Report & Visualization

In [None]:
def calculate_exact_match_score(gt_data, pred_data):
    """Return the micro-averaged F1 under case-insensitive exact matching.

    Strings are lower-cased and stripped before comparison. Each ground-truth
    item can be consumed by at most one identical prediction. Requirements
    with no prediction contribute all their ground-truth items as misses.
    The result is 0 when precision + recall is zero.
    """
    tp_sum = fp_sum = fn_sum = 0

    for req_id, gt_entry in gt_data.items():
        if req_id not in pred_data:
            fn_sum += sum(len(gt_entry.get(col, [])) for col in TARGET_COLUMNS)
            continue

        pred_entry = pred_data[req_id]

        for col in TARGET_COLUMNS:
            unmatched_gt = [s.lower().strip() for s in gt_entry.get(col, [])]
            normalized_preds = [s.lower().strip() for s in pred_entry.get(col, [])]

            hits = 0
            for candidate in normalized_preds:
                if candidate in unmatched_gt:
                    unmatched_gt.remove(candidate)
                    hits += 1

            tp_sum += hits
            fp_sum += len(normalized_preds) - hits
            fn_sum += len(unmatched_gt)

    predicted_total = tp_sum + fp_sum
    expected_total = tp_sum + fn_sum
    precision = tp_sum / predicted_total if predicted_total > 0 else 0
    recall = tp_sum / expected_total if expected_total > 0 else 0

    pr_sum = precision + recall
    if pr_sum > 0:
        return 2 * (precision * recall) / pr_sum
    return 0

def calculate_category_breakdown(gt_data, pred_data, threshold):
    """Return ``{column: F1}`` for every column in ``TARGET_COLUMNS``.

    For each column the TP/FP/FN counts from ``evaluate_single_prediction``
    (SBERT matching at ``threshold``) are pooled over all requirements.
    A requirement without a prediction adds its ground-truth items for that
    column as false negatives. F1 is 0 when it is undefined.
    """

    def _column_f1(col):
        tp_sum = fp_sum = fn_sum = 0
        for req_id, gt_entry in gt_data.items():
            gt_vals = gt_entry.get(col, [])
            if req_id not in pred_data:
                fn_sum += len(gt_vals)
                continue
            pred_vals = pred_data[req_id].get(col, [])
            tp, fp, fn = evaluate_single_prediction(gt_vals, pred_vals, threshold)
            tp_sum += tp
            fp_sum += fp
            fn_sum += fn

        precision = tp_sum / (tp_sum + fp_sum) if (tp_sum + fp_sum) > 0 else 0
        recall = tp_sum / (tp_sum + fn_sum) if (tp_sum + fn_sum) > 0 else 0
        if (precision + recall) > 0:
            return 2 * (precision * recall) / (precision + recall)
        return 0

    return {col: _column_f1(col) for col in TARGET_COLUMNS}

# Registry of every candidate model. The evaluation loop below reads "data"
# (the loaded predictions; an empty dict means the model is skipped), "name",
# "strategy" and "color". "id" is not read anywhere in the visible code.
potential_models = [
    {
        "id": "zero_shot",
        "data": zero_shot_preds,
        "name": "Llama-3 Zero-Shot",
        "strategy": "Baseline",
        "color": "#444444" # Grey
    },
    {
        "id": "one_shot",
        "data": one_shot_preds,
        "name": "Llama-3 One-Shot",
        "strategy": "In-Context Learning",
        "color": "#FF991C" # Orange
    },
    {
        "id": "few_shot",
        "data": few_shot_preds,
        "name": "Llama-3 Few-Shot",
        "strategy": "In-Context (5-Shot)",
        "color": "#1f77b4" # Blue
    },
    {
        "id": "multi_agent",
        "data": multi_agent_preds,
        "name": "Llama-3 Multi-Agent",
        "strategy": "Agentic Workflow",
        "color": "#55aa22" # Green
    },
    {
        "id": "multi_agent_1",
        "data": multi_agent_preds_1,
        "name": "Llama-3 Multi-Agent_1",
        "strategy": "Agentic Workflow",
        "color": "#fd3db5" # Magenta
    }
]

# One summary row per evaluated model, plus the matching bar colours
# (kept index-aligned with `results_list`).
results_list = []
colors_used = []

# Best model by semantic F1; reused by the follow-up analyses.
best_model_data = {}
best_model_name = ""
max_f1 = -1

for model in potential_models:
    if not model["data"]:
        # Say which model is missing; the message used to be identical for
        # every model, so it was impossible to tell which file failed to load.
        print(f"No data loaded for {model['name']}")
        continue

    metrics = run_full_evaluation_optimized(ground_truth_data, model["data"], threshold=best_threshold)

    results_list.append({
        "Model Architecture": model["name"],
        "Strategy": model["strategy"],
        "Precision": metrics['precision'],
        "Recall": metrics['recall'],
        "F1-Score": metrics['f1']
    })
    colors_used.append(model["color"])

    # Strict '>' keeps the earliest model on ties.
    if metrics['f1'] > max_f1:
        max_f1 = metrics['f1']
        best_model_data = model["data"]
        best_model_name = model["name"]

if results_list:
    # Tabular summary of all evaluated models.
    results_df = pd.DataFrame(results_list)

    print("\nEXPERIMENTAL RESULTS SUMMARY")

    # NOTE(review): `display` is not imported in the visible code; presumably
    # the IPython/Jupyter built-in — confirm when running outside a notebook.
    display(results_df.style.background_gradient(cmap='Greens', subset=['F1-Score']))

    # --- Figure 2: F1 per architecture ---
    plt.figure(figsize=(10, 6))
    bars = plt.bar(results_df['Model Architecture'], results_df['F1-Score'], color=colors_used)

    # Annotate each bar with its value.
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2.0, height,
                 f'{height:.3f}', ha='center', va='bottom', fontsize=12, fontweight='bold')

    plt.title('F1-Score Comparison by Architecture', fontsize=14)
    plt.ylabel('F1-Score')
    plt.ylim(0, 1.1)
    plt.grid(axis='y', linestyle='--', alpha=0.5)
    plt.xticks(rotation=0)



    plt.tight_layout()
    plt.savefig(f"{RESULTS_DIR}/2_model_comparison_f1.png", dpi=300, bbox_inches='tight')
    plt.show()

    if best_model_data:
        print(f"{best_model_name}")

        # Compare strict string matching with SBERT matching on the best model.
        exact_f1 = calculate_exact_match_score(ground_truth_data, best_model_data)
        sbert_f1 = max_f1

        print(f"\n1. WHY SBERT?")
        print(f"Exact String Match F1:  {exact_f1:.3f}")
        print(f"SBERT Semantic F1:      {sbert_f1:.3f}")
        # NOTE(review): the value is a difference in F1 points (x100) with a
        # hard-coded '+' prefix, so a negative difference prints as "+-x.x%".
        print(f"- Improvement: +{(sbert_f1 - exact_f1)*100:.1f}% thanks to semantic evaluation.")

        # --- Figure 3: exact match vs semantic match ---
        plt.figure(figsize=(6, 4))
        plt.bar(['Exact Match', 'SBERT AI'], [exact_f1, sbert_f1], color=['#999999', '#1f77b4'])
        plt.title("Impact of Semantic Evaluation Metrics")
        plt.ylabel("F1 Score")
        plt.ylim(0, 1.0)
        for i, v in enumerate([exact_f1, sbert_f1]):
            plt.text(i, v + 0.02, f"{v:.3f}", ha='center', fontweight='bold')

        plt.tight_layout()
        plt.savefig(f"{RESULTS_DIR}/3_exact_vs_semantic.png", dpi=300, bbox_inches='tight')
        plt.show()


        print(f"\n2. PERFORMANCE BY CATEGORY")
        # Per-column F1 of the best model, sorted from best to worst.
        cat_scores = calculate_category_breakdown(ground_truth_data, best_model_data, best_threshold)
        sorted_cats = dict(sorted(cat_scores.items(), key=lambda item: item[1], reverse=True))

        # --- Figure 4: F1 per category ---
        plt.figure(figsize=(10, 6))
        colors_cat = plt.cm.viridis(np.linspace(0, 0.9, len(sorted_cats)))
        c_bars = plt.bar(sorted_cats.keys(), sorted_cats.values(), color=colors_cat)
        plt.title(f"F1-Score by Category ({best_model_name})")
        plt.ylabel("F1 Score")
        plt.ylim(0, 1.1)
        plt.xticks(rotation=45, ha='right')

        for bar in c_bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{height:.2f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

        plt.tight_layout()
        plt.savefig(f"{RESULTS_DIR}/4_category_breakdown.png", dpi=300, bbox_inches='tight')
        plt.show()

else:
    # No model produced results; `results_df` is left undefined in this case.
    print("check Loader.")

## 8. Qualitative Error Analysis
Exports specific examples of hallucinations and omissions to validate metrics with human review.

In [None]:
def analyze_qualitative_errors(gt_data, pred_data, threshold, num_examples=2, model_name="Model"):
    """Print and save example errors for each annotation category.

    For every column in ``TARGET_COLUMNS`` the requirements are scanned in
    ``gt_data`` order. The first ``num_examples`` requirements whose
    prediction has at least one false positive (hallucination) or false
    negative (omission) at ``threshold`` are reported. Requirements without
    a prediction are skipped.

    The report is printed and written to
    ``<RESULTS_DIR>/qualitative_errors_<model_name>.txt`` (spaces in the model
    name become underscores). A write failure is reported, not raised.

    Args:
        gt_data: Ground truth, ``{"req_<id>": {'text': ..., <column>: [...]}}``.
        pred_data: Predictions, ``{"req_<id>": {<column>: [...]}}``.
        threshold: Similarity threshold passed to evaluate_single_prediction.
        num_examples: Maximum number of examples reported per category.
        model_name: Label used in the header and the output file name.

    Returns:
        None.
    """
    report_lines = []

    def _emit(text):
        # Everything that goes into the saved report is also shown on screen,
        # exactly once.
        print(text)
        report_lines.append(text)

    _emit(f"QUALITATIVE ANALYSIS REPORT: {model_name}\n" + "="*60)

    for col in TARGET_COLUMNS:
        errors_found = 0

        _emit(f"\n\nCategory: {col}\n" + "-"*40)

        for req_id, gt_entry in gt_data.items():
            if errors_found >= num_examples:
                break
            if req_id not in pred_data:
                continue

            gt_vals = gt_entry.get(col, [])
            pred_vals = pred_data[req_id].get(col, [])

            tp, fp, fn = evaluate_single_prediction(gt_vals, pred_vals, threshold)
            if fp == 0 and fn == 0:
                continue

            # Guard against a missing/null text so the preview never fails.
            text_content = gt_entry.get('text', '') or ''
            text_preview = text_content[:80] + "..." if len(text_content) > 80 else text_content

            error_block = [
                f"\n[ID: {req_id}]",
                f"Context: \"{text_preview}\"",
                f"Ground Truth: {gt_vals}",
                f"Prediction:   {pred_vals}",
            ]
            if fp > 0:
                error_block.append(f"   Type: HALLUCINATION (Extracted info not in GT)")
            if fn > 0:
                error_block.append(f"   Type: OMISSION (Missed info present in GT)")

            _emit("\n".join(error_block))
            # Console-only separator, printed after the example it closes.
            print("-" * 40)

            errors_found += 1

        if errors_found == 0:
            _emit("\n   (No errors found in this category for the checked samples)")

    filename = f"qualitative_errors_{model_name.replace(' ', '_')}.txt"
    save_path = os.path.join(RESULTS_DIR, filename)

    try:
        with open(save_path, "w", encoding="utf-8") as f:
            f.write("\n".join(report_lines))
    except Exception as e:
        # Best effort: the report was already printed, so only warn.
        print(f"Could not write qualitative report to {save_path}: {e}")

# --- EXECUTION ---

# Analyse the first prediction set that actually contains data, trying the
# architectures in this fixed priority order.
target_preds = {}
target_name = ""

for _candidate_preds, _candidate_name in (
    (multi_agent_preds_1, "Multi-Agent_1"),
    (multi_agent_preds, "Multi-Agent"),
    (few_shot_preds, "Few-Shot"),
    (one_shot_preds, "One-Shot"),
    (zero_shot_preds, "Zero-Shot"),
):
    if _candidate_preds:
        target_preds = _candidate_preds
        target_name = _candidate_name
        break

if not target_preds:
    print("No predictions available")
else:
    analyze_qualitative_errors(
        ground_truth_data,
        target_preds,
        best_threshold,
        num_examples=3,
        model_name=target_name,
    )

## 9. Error Distribution

In [None]:
def check_semantic_match_visual(text1, text2, threshold):
    """Tell whether two texts are semantically similar enough.

    Returns False when either text is empty/falsy. Otherwise both texts are
    encoded with the SBERT model and the result is whether their cosine
    similarity is at least ``threshold``.
    """
    if not (text1 and text2):
        return False

    first, second = (
        sbert_model.encode(text, convert_to_tensor=True)
        for text in (text1, text2)
    )
    similarity = util.cos_sim(first, second).item()
    return similarity >= threshold

def plot_dual_confusion_matrix(gt_data, pred_data, threshold, model_name="Model"):
    """Plot raw and row-normalized category confusion matrices.

    For each ground-truth value, the predicted columns are scanned in
    ``TARGET_COLUMNS`` order and the value is attributed to the first column
    holding a prediction whose SBERT cosine similarity reaches ``threshold``.
    Ground-truth values with no such prediction are not counted, so each
    normalized row sums over matched items only.

    NOTE(review): attribution uses the *first* column that matches, not the
    best-scoring one, which favours columns listed earlier in
    ``TARGET_COLUMNS`` — confirm this is the intended definition.

    Embeddings are cached per requirement so each distinct string is encoded
    once instead of once per (ground truth, prediction) pair. Each string is
    still encoded on its own, so similarity values are unchanged. Values are
    assumed to be strings, as produced by ``normalize_field``.

    Both figures are shown and saved under ``RESULTS_DIR``.

    Args:
        gt_data: Ground truth, ``{"req_<id>": {<column>: [<str>, ...]}}``.
        pred_data: Predictions in the same layout.
        threshold: Minimum cosine similarity for a match.
        model_name: Label used in the titles and output file names.

    Returns:
        None.
    """
    cols = TARGET_COLUMNS
    matrix = np.zeros((len(cols), len(cols)))

    print(f"Confusion matrix for {model_name}...")

    def _embed(text, cache):
        # Encode `text` once per requirement and reuse the tensor afterwards.
        if text not in cache:
            cache[text] = sbert_model.encode(text, convert_to_tensor=True)
        return cache[text]

    def _is_match(gt_val, p_val, cache):
        # Same rule as check_semantic_match_visual, with cached embeddings.
        if not gt_val or not p_val:
            return False
        sim = util.cos_sim(_embed(gt_val, cache), _embed(p_val, cache)).item()
        return sim >= threshold

    def _first_matching_column(gt_val, pred_entry, cache):
        # Index of the first predicted column containing a match, or -1.
        for j, pred_col in enumerate(cols):
            for p_val in pred_entry.get(pred_col, []):
                if _is_match(gt_val, p_val, cache):
                    return j
        return -1

    for req_id, gt_entry in tqdm(gt_data.items(), desc="Processing Matrix"):
        if req_id not in pred_data:
            continue
        pred_entry = pred_data[req_id]
        embedding_cache = {}  # per requirement, so memory stays bounded

        for i, gt_col in enumerate(cols):
            for gt_val in gt_entry.get(gt_col, []):
                j = _first_matching_column(gt_val, pred_entry, embedding_cache)
                if j != -1:
                    matrix[i, j] += 1

    plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt='g', xticklabels=cols, yticklabels=cols, cmap='Oranges', cbar=False)
    plt.title(f"Confusion Matrix (RAW COUNTS) - {model_name}", fontsize=14, fontweight='bold', pad=15)
    plt.xlabel("Predicted Category", fontsize=11)
    plt.ylabel("Ground Truth Category", fontsize=11)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.savefig(f"{RESULTS_DIR}/5_confusion_matrix_raw_{model_name}.png", dpi=300, bbox_inches='tight')
    plt.show()

    row_sums = matrix.sum(axis=1)[:, np.newaxis]
    norm_matrix = matrix / (row_sums + 1e-10)  # epsilon to avoid division by zero

    plt.figure(figsize=(10, 8))
    sns.heatmap(norm_matrix, annot=True, fmt='.1%', xticklabels=cols, yticklabels=cols, cmap='Blues', vmin=0, vmax=1)
    plt.title(f"Confusion Matrix (NORMALIZED) - {model_name}", fontsize=14, fontweight='bold', pad=15)
    plt.xlabel("Predicted Category", fontsize=11)
    plt.ylabel("Ground Truth Category", fontsize=11)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.savefig(f"{RESULTS_DIR}/6_confusion_matrix_norm_{model_name}.png", dpi=300, bbox_inches='tight')
    plt.show()

MODEL_TO_ANALYZE = 'multi_1' # zero || one || few || multi || multi_1

# Selector key -> (predictions, display name). Zero-Shot is not listed here:
# it is the fallback for 'zero', for an unknown key, and for a requested
# model whose predictions are empty.
_selectable_models = {
    'multi_1': (multi_agent_preds_1, "Multi-Agent_1"),
    'multi': (multi_agent_preds, "Multi-Agent"),
    'few': (few_shot_preds, "Few-Shot"),
    'one': (one_shot_preds, "One-Shot"),
}

target_preds = {}
target_name = ""

_requested = _selectable_models.get(MODEL_TO_ANALYZE)
if _requested is not None and _requested[0]:
    target_preds, target_name = _requested
elif zero_shot_preds:
    target_preds = zero_shot_preds
    target_name = "Zero-Shot"

if not target_preds:
    print("No data")
else:
    plot_dual_confusion_matrix(ground_truth_data, target_preds, best_threshold, target_name)

## 10. Final Executive Summary & Efficiency Analysis

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Total times taken from Time.docx
# (presumably seconds: the latency axis further down is labelled "s/req").
total_time_zero, total_time_one, total_time_few = 2456.34, 3217.84, 3188.60
total_time_multi, total_time_multi_1 = 3198.90, 5218.89

# Number of requirements used to average the totals; falls back to 254 when
# no ground truth is loaded.
if 'ground_truth_data' in globals() and ground_truth_data:
    num_reqs = len(ground_truth_data)
else:
    num_reqs = 254

# Average time per requirement for each architecture.
(
    avg_time_zero,
    avg_time_one,
    avg_time_few,
    avg_time_multi,
    avg_time_multi_1,
) = (
    total / num_reqs
    for total in (
        total_time_zero,
        total_time_one,
        total_time_few,
        total_time_multi,
        total_time_multi_1,
    )
)

# Index-aligned series filled in below for the trade-off plot.
models_to_plot, times_to_plot, f1_to_plot = [], [], []


def get_f1(model_name_part):
    """Look up the F1-Score of the model whose name contains ``model_name_part``.

    The match is case-insensitive and literal, and the matched text must not
    be followed by a letter, digit or underscore. This keeps "Multi-Agent"
    from also matching "Multi-Agent_1"; with a plain substring test the F1 of
    the wrong model could be returned.

    Args:
        model_name_part: Text to look for in the 'Model Architecture' column
            of the module-level ``results_df``.

    Returns:
        The F1-Score of the first matching row, or None when ``results_df``
        does not exist, nothing matches, or the lookup fails.
    """
    import re  # local import: this function is the only user

    if 'results_df' not in globals():
        return None

    try:
        pattern = re.compile(re.escape(model_name_part) + r'(?!\w)', re.IGNORECASE)
        names = results_df['Model Architecture']
        # Non-string names (e.g. NaN) never match.
        mask = names.map(
            lambda name: isinstance(name, str) and pattern.search(name) is not None
        ).astype(bool)
        row = results_df[mask]
        if not row.empty:
            return row['F1-Score'].values[0]
    except Exception as err:
        # Best effort: report the problem and treat it as "no score".
        print(f"get_f1: lookup failed for {model_name_part!r}: {err}")
    return None

# Data Retrieval (Logic: if F1 exists, add the model to the plot)
# Labels are looked up in this order, which is also the plotting order.
_f1_by_label = {}
for _label, _avg_time in (
    ("Zero-Shot", avg_time_zero),
    ("One-Shot", avg_time_one),
    ("Few-Shot", avg_time_few),
    ("Multi-Agent", avg_time_multi),
    ("Multi-Agent_1", avg_time_multi_1),
):
    _score = get_f1(_label)
    _f1_by_label[_label] = _score
    if _score is not None:
        models_to_plot.append(_label)
        times_to_plot.append(_avg_time)
        f1_to_plot.append(_score)

# Keep the individual per-model names available to later cells.
f1_z = _f1_by_label["Zero-Shot"]
f1_o = _f1_by_label["One-Shot"]
f1_f = _f1_by_label["Few-Shot"]
f1_m = _f1_by_label["Multi-Agent"]
f1_m1 = _f1_by_label["Multi-Agent_1"]

# Dual-axis figure: F1 as bars (left axis), average latency as a line (right axis).
if models_to_plot:
    color_f1 = '#4682B4'
    color_time = '#B22222'

    fig, ax1 = plt.subplots(figsize=(10, 6))

    ax1.set_xlabel('Architecture', fontsize=12, fontweight='bold', labelpad=10)
    ax1.set_ylabel('F1 Score', color=color_f1, fontsize=12, fontweight='bold')

    bars = ax1.bar(models_to_plot, f1_to_plot, color=color_f1, alpha=0.6, width=0.5, label='F1 Score')

    # Dynamic scale for F1 (20% headroom above the tallest bar).
    # NOTE(review): this rebinds the module-level `max_f1` that the model
    # comparison cell also assigns.
    max_f1 = max(f1_to_plot) if f1_to_plot else 1.0
    ax1.set_ylim(0, max_f1 * 1.2)
    ax1.tick_params(axis='y', labelcolor=color_f1)
    ax1.grid(visible=True, axis='y', linestyle='--', alpha=0.3) # Light grid for F1 only

    # Value labels on the bars; zero-height bars are left unlabelled.
    for bar in bars:
        height = bar.get_height()
        if height > 0:
            ax1.text(bar.get_x() + bar.get_width()/2., height,
                     f'{height:.3f}',
                     ha='center', va='bottom', color=color_f1, fontweight='bold', fontsize=11,
                     bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=1))

    # Second y-axis sharing the same x-axis, used for latency.
    ax2 = ax1.twinx()
    ax2.set_ylabel('Avg Latency (s/req)', color=color_time, fontsize=12, fontweight='bold')

    ax2.plot(models_to_plot, times_to_plot, color=color_time, marker='D', markersize=8, linewidth=2.5, label='Latency', zorder=10)

    # Dynamic scale for Time
    max_time = max(times_to_plot) if times_to_plot else 10
    ax2.set_ylim(0, max_time * 1.3)
    ax2.tick_params(axis='y', labelcolor=color_time)

    # Value labels on the latency points (x position is the category index).
    for i, txt in enumerate(times_to_plot):
        ax2.text(i, txt, f'{txt:.1f}s',
                 ha='center', va='bottom', color=color_time, fontweight='bold', fontsize=11,
                 bbox=dict(facecolor='white', alpha=0.8, edgecolor='none', pad=1))

    plt.title("Model Quality vs. Computational Cost", fontsize=14, pad=20)

    plt.tight_layout()
    plt.savefig(f"{RESULTS_DIR}/7_tradeoff_analysis.png", dpi=300, bbox_inches='tight')
    plt.show()

# --- TEXTUAL REPORT ---
# Assemble the executive summary line by line, then print and save it.

report_lines = []
report_lines.append("="*60)
report_lines.append("FINAL EXECUTIVE SUMMARY")
report_lines.append("="*60)

report_lines.append("\n1. PERFORMANCE METRICS (Quality)")
if 'results_df' in globals():
    report_lines.append(results_df[['Model Architecture', 'Precision', 'Recall', 'F1-Score']].to_string(index=False))
else:
    report_lines.append("No metric data available.")

report_lines.append(f"\n2. OPTIMIZATION PARAMETERS")
if 'best_threshold' in globals():
    report_lines.append(f"- Optimal Semantic Threshold (SBERT): {best_threshold:.2f}")

report_lines.append("\n3. COST/BENEFIT ANALYSIS")
try:
    if len(f1_to_plot) >= 2:
        # Compare Best vs Baseline
        # The baseline is whichever model was plotted first, i.e. the first
        # architecture in plotting order for which an F1 was found.
        base_idx = 0
        best_idx = np.argmax(f1_to_plot)

        base_name = models_to_plot[base_idx]
        best_name = models_to_plot[best_idx]

        # Relative F1 change versus the baseline, in percent.
        # NOTE(review): a baseline F1 of 0 is not guarded here, unlike the
        # latency computation below — confirm how that case should be reported.
        f1_gain = ((f1_to_plot[best_idx] - f1_to_plot[base_idx]) / f1_to_plot[base_idx]) * 100

        if times_to_plot[base_idx] > 0:
            time_change = ((times_to_plot[best_idx] - times_to_plot[base_idx]) / times_to_plot[base_idx]) * 100
        else:
            time_change = 0.0

        trend_time = "increase" if time_change > 0 else "decrease"

        report_lines.append(f"- Best Model (Quality): {best_name}")
        report_lines.append(f"- Quality Impact: Moving from {base_name} to {best_name}")
        report_lines.append(f"  resulted in an F1-Score improvement of {f1_gain:+.1f}%.")

        # Latency changes of 0.1% or less are reported as unavailable/identical.
        if abs(time_change) > 0.1:
            report_lines.append(f"- Cost Impact: This resulted in a latency {trend_time} of {abs(time_change):.1f}%")
            report_lines.append(f"  ({times_to_plot[base_idx]:.1f}s -> {times_to_plot[best_idx]:.1f}s per requirement).")
        else:
            report_lines.append(f"- Cost Impact: Latency data unavailable or identical.")

    else:
        report_lines.append("Insufficient data for comparative trade-off calculation.")

except Exception as e:
    # Any failure in the comparison is recorded in the report instead of raised.
    report_lines.append(f"Note: Unable to calculate detailed trade-off ({e})")

report_lines.append("="*60)

# Join everything into a single text
full_report_text = "\n".join(report_lines)

# A. Print to screen
print(full_report_text)

# B. Save to a TXT file
report_path = f"{RESULTS_DIR}/executive_summary.txt"
with open(report_path, "w", encoding="utf-8") as f:
    f.write(full_report_text)

## 11. Final Export

In [None]:
# Show which experiment and output directory are being exported.
print(EXPERIMENT_NAME)
print(RESULTS_DIR)

# Export the summary table, if the comparison cell produced one.
if 'results_df' in globals():
    # CSV
    csv_path = f"{RESULTS_DIR}/summary_metrics.csv"
    results_df.to_csv(csv_path, index=False)

    # LaTeX
    # NOTE(review): opened without an explicit encoding, unlike the other
    # text files written by this notebook (which use utf-8).
    latex_path = f"{RESULTS_DIR}/latex_table.txt"
    with open(latex_path, "w") as f:
        f.write(results_df.to_latex(index=False, float_format="%.3f"))



# Per-requirement breakdown for one prediction set. Any failure in this part
# is printed and does not stop the export.
try:
    # Despite its name, `best_model_key` holds a predictions dict. It is the
    # first non-empty set in a fixed priority order, not the best by F1.
    # NOTE(review): zero-shot is not part of this chain, unlike the selection
    # chains in the earlier cells — confirm whether that is deliberate.
    best_model_key = None
    if 'multi_agent_preds_1' in globals() and multi_agent_preds_1: best_model_key = multi_agent_preds_1
    elif 'multi_agent_preds' in globals() and multi_agent_preds: best_model_key = multi_agent_preds
    elif 'few_shot_preds' in globals() and few_shot_preds: best_model_key = few_shot_preds
    elif 'one_shot_preds' in globals() and one_shot_preds: best_model_key = one_shot_preds

    if best_model_key:
        detailed_rows = []

        for req_id, gt_entry in ground_truth_data.items():
            if req_id in best_model_key:
                pred_entry = best_model_key[req_id]

                # Pool the items of all columns: matching here ignores which
                # category an item was annotated or predicted under.
                gt_all = []; pred_all = []
                for col in TARGET_COLUMNS:
                    gt_all.extend(gt_entry.get(col, []))
                    pred_all.extend(pred_entry.get(col, []))

                tp, fp, fn = evaluate_single_prediction(gt_all, pred_all, best_threshold)

                # Per-requirement metrics; an undefined ratio is reported as 0.
                p = tp / (tp + fp) if (tp + fp) > 0 else 0
                r = tp / (tp + fn) if (tp + fn) > 0 else 0
                f1 = 2 * (p * r) / (p + r) if (p + r) > 0 else 0

                detailed_rows.append({
                    "Req_ID": req_id,
                    "F1_Score": f1,
                    "Precision": p,
                    "Recall": r,
                    "GT_Items": len(gt_all),
                    "Pred_Items": len(pred_all)
                })

        det_path = f"{RESULTS_DIR}/detailed_analysis_per_req.csv"
        pd.DataFrame(detailed_rows).to_csv(det_path, index=False)
        print(f"saved to: {det_path}")

except Exception as e:
    print(e)

# Bundle the whole results directory into "<zip_filename>.zip" in the working
# directory.
zip_filename = f"results_{EXPERIMENT_NAME}"
shutil.make_archive(zip_filename, 'zip', RESULTS_DIR)

print(f"\n ready {zip_filename}.zip")
# `files` comes from google.colab (imported at the top of the notebook), so
# this download step requires a Colab runtime.
files.download(f"{zip_filename}.zip")