In [1]:
#!/usr/bin/env python3
"""
orchestrator_x_method_table.py

Creates ONE paper-ready table showing how the 5 methods perform across the 3 orchestrators
using all_sessions_cleaned.csv.

Output (default): 15-row table (Orchestrator × Method) with:
N, N_Passed, Pass%, SAT, PCT, Combined, ORT_scaled (each mean ± sd), and mean issue counts (Total/Crit/Major/Minor)

Also prints an optional wide pivot (Orchestrator rows, Method columns) for Pass% / ORT / Combined.
"""

import pandas as pd
import numpy as np

# -----------------------------
# Config
# -----------------------------
# Absolute path of the cleaned per-session results CSV that this script reads.
CSV_PATH = "/Users/abubakarialidu/Desktop/Data Result/all_sessions_cleaned.csv"

# The five method labels kept in the analysis. Rows whose derived "Method" is
# not in this list are dropped, and the list order is used as the display
# order of methods in the printed tables.
METHOD_ORDER = [
    "Direct (Non-Reasoning)",
    "Prompt2DAG (Template)",
    "Prompt2DAG (LLM)",
    "Prompt2DAG (Hybrid)",
    "Direct (Reasoning)",
]

# Orchestrator labels in display order. They are lower-case because the
# "Orchestrator" column is lower-cased and stripped after loading.
ORCH_ORDER = ["airflow", "dagster", "prefect"]

# ORT penalty weights (match your main analysis)
# Each weight multiplies the matching per-row issue count (critical, major,
# minor); the weighted sum is subtracted from the base score to get ORT_raw.
ALPHA_CRIT = 2.0
BETA_MAJOR = 1.0
GAMMA_MINOR = 0.25

# Raise pandas' console limits so the wide summary tables are less likely to
# be truncated or wrapped when printed.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 220)

# -----------------------------
# Load
# -----------------------------
# Read the cleaned session-level results; every later step works on this frame.
df = pd.read_csv(CSV_PATH)
print(f"Loaded {len(df):,} rows, {len(df.columns)} cols")

# Normalize orchestrator labels
if "Orchestrator" not in df.columns:
    raise ValueError("Missing column: Orchestrator")
df["Orchestrator"] = df["Orchestrator"].astype(str).str.lower().str.strip()

# Ensure Passed is boolean-ish
if "Passed" not in df.columns:
    raise ValueError("Missing column: Passed")
if df["Passed"].dtype != bool:
    # Text forms accepted as pass/fail flags. "1.0"/"0.0" are included because
    # a 0/1 column that contains blanks is read as float64 and stringifies
    # that way; without them every row would silently map to NaN.
    passed_lookup = {
        "true": True, "false": False,
        "1": True, "0": False,
        "1.0": True, "0.0": False,
    }
    passed_raw = df["Passed"]
    # Strip before lower-casing so padded values such as " True" still match.
    passed_parsed = passed_raw.astype(str).str.strip().str.lower().map(passed_lookup)

    # Values that were present but not recognized stay missing (as before),
    # but are reported so they cannot distort pass rates unnoticed.
    unparsed = passed_parsed.isna() & passed_raw.notna()
    if unparsed.any():
        examples = passed_raw[unparsed].astype(str).unique()[:5].tolist()
        print(
            f"WARNING: {int(unparsed.sum()):,} rows have an unrecognized Passed value "
            f"(e.g. {examples}); they are treated as missing"
        )
    df["Passed"] = passed_parsed

# -----------------------------
# 5-method classification
# -----------------------------
def classify_method(row):
    """Map one session row onto its method label.

    ``row`` is anything with a dict-style ``.get`` exposing "Workflow" and
    "Strategy". The "Direct" and "Reasoning" workflows map to the two Direct
    labels. "Prompt2DAG" is split by the first of "template", "llm" or
    "hybrid" found in the lower-cased strategy text; if none is found the raw
    strategy value is embedded in the label. Any other workflow value is
    returned unchanged.
    """
    workflow = row.get("Workflow", "")
    # A missing/falsy strategy is treated as empty text for the substring tests.
    strategy_text = str(row.get("Strategy") or "").lower()

    if workflow == "Direct":
        return "Direct (Non-Reasoning)"
    if workflow == "Reasoning":
        return "Direct (Reasoning)"

    if workflow == "Prompt2DAG":
        # Checked in this order; the first keyword present wins.
        keyword_labels = (
            ("template", "Prompt2DAG (Template)"),
            ("llm", "Prompt2DAG (LLM)"),
            ("hybrid", "Prompt2DAG (Hybrid)"),
        )
        for keyword, label in keyword_labels:
            if keyword in strategy_text:
                return label
        return f"Prompt2DAG ({row.get('Strategy','Unknown')})"

    return workflow

# Label every row, then keep only rows whose label is one of the five methods.
df["Method"] = df.apply(classify_method, axis=1)
is_known_method = df["Method"].isin(METHOD_ORDER)
df = df.loc[is_known_method].copy()

# -----------------------------
# Ensure issues + scores exist
# -----------------------------
# The three score columns are mandatory; fail on the first one that is absent.
absent_scores = [
    score_col
    for score_col in ("Static_Score", "Compliance_Score", "Combined_Score")
    if score_col not in df.columns
]
if absent_scores:
    raise ValueError(f"Missing column: {absent_scores[0]}")

# Issue counts are optional: an absent column is created as all zeros, and
# missing values in an existing column are filled with zero.
for issue_col in ("Critical_Issues", "Major_Issues", "Minor_Issues"):
    if issue_col in df.columns:
        df[issue_col] = df[issue_col].fillna(0)
    else:
        df[issue_col] = 0

# Per-row total across the three severities.
df["Total_Issues"] = df["Critical_Issues"].add(df["Major_Issues"]).add(df["Minor_Issues"])

# -----------------------------
# ORT (match your earlier approach: raw then min-max scaled to [0,10])
# -----------------------------
# Base score: the combined score for rows that passed, zero for all others.
passed_mask = df["Passed"] == True
df["Base_Score"] = np.where(passed_mask, df["Combined_Score"], 0.0)

# Weighted issue penalty (critical, then major, then minor).
critical_part = df["Critical_Issues"].mul(ALPHA_CRIT)
major_part = df["Major_Issues"].mul(BETA_MAJOR)
minor_part = df["Minor_Issues"].mul(GAMMA_MINOR)
df["Penalty"] = critical_part.add(major_part).add(minor_part)

df["ORT_raw"] = df["Base_Score"].sub(df["Penalty"])

# Min-max rescale ORT_raw onto [0, 10]; when there is no spread to scale by,
# every row gets 0.0.
ort_min, ort_max = df["ORT_raw"].min(), df["ORT_raw"].max()
if ort_max > ort_min:
    df["ORT_scaled"] = 10.0 * (df["ORT_raw"] - ort_min) / (ort_max - ort_min)
else:
    df["ORT_scaled"] = 0.0

# -----------------------------
# Helpers
# -----------------------------
def mean_sd_str(x):
    """Format ``x`` as "mean ± sd" with two decimals, or "NA" if nothing is numeric.

    Values are coerced to numbers (unparseable entries become missing and are
    dropped) before the mean and the sample standard deviation (ddof=1) are
    taken.
    """
    numeric = pd.to_numeric(x, errors="coerce").dropna()
    if numeric.empty:
        return "NA"
    center = numeric.mean()
    spread = numeric.std(ddof=1)
    return f"{center:.2f} ± {spread:.2f}"

def pct_str(x):
    """Format the mean of ``x`` as a percentage with one decimal, or "NA".

    Values are coerced to numbers first; entries that cannot be parsed are
    dropped, and "NA" is returned when nothing numeric remains.
    """
    numeric = pd.to_numeric(x, errors="coerce").dropna()
    if numeric.empty:
        return "NA"
    share = numeric.mean()
    return f"{100*share:.1f}%"

# -----------------------------
# SINGLE TABLE (15 rows): Orchestrator × Method
# -----------------------------
# One group per (Orchestrator, Method) pair; make_wide reuses this grouping.
grp = df.groupby(["Orchestrator", "Method"], observed=True)

# Named aggregations: output column -> (source column, reducer).
summary_spec = {
    "N": ("Passed", "size"),
    "N_Passed": ("Passed", "sum"),
    "Pass_Rate": ("Passed", "mean"),
    "SAT_mean": ("Static_Score", "mean"),
    "SAT_sd": ("Static_Score", "std"),
    "PCT_mean": ("Compliance_Score", "mean"),
    "PCT_sd": ("Compliance_Score", "std"),
    "Combined_mean": ("Combined_Score", "mean"),
    "Combined_sd": ("Combined_Score", "std"),
    "ORT_mean": ("ORT_scaled", "mean"),
    "ORT_sd": ("ORT_scaled", "std"),
    "TotalIssues_mean": ("Total_Issues", "mean"),
    "Crit_mean": ("Critical_Issues", "mean"),
    "Major_mean": ("Major_Issues", "mean"),
    "Minor_mean": ("Minor_Issues", "mean"),
}
table = grp.agg(**summary_spec).reset_index()

# Pass rate as a percentage rounded to one decimal.
table["Pass%"] = table["Pass_Rate"].mul(100).round(1)

# "mean ± sd" display columns: (display column, prefix of its *_mean/*_sd pair).
display_pairs = (
    ("SAT", "SAT"),
    ("PCT", "PCT"),
    ("Combined", "Combined"),
    ("ORT_scaled", "ORT"),
)
for display_col, prefix in display_pairs:
    table[display_col] = table.apply(
        lambda rec, prefix=prefix: f"{rec[prefix + '_mean']:.2f} ± {rec[prefix + '_sd']:.2f}",
        axis=1,
    )

# Columns that go into the paper table, in print order.
paper_columns = [
    "Orchestrator", "Method", "N", "N_Passed", "Pass%",
    "SAT", "PCT", "Combined", "ORT_scaled",
    "TotalIssues_mean", "Crit_mean", "Major_mean", "Minor_mean",
]
table_out = table[paper_columns].copy()

# Ordered categoricals make the sort follow ORCH_ORDER / METHOD_ORDER rather
# than alphabetical order.
table_out["Orchestrator"] = pd.Categorical(
    table_out["Orchestrator"], categories=ORCH_ORDER, ordered=True
)
table_out["Method"] = pd.Categorical(
    table_out["Method"], categories=METHOD_ORDER, ordered=True
)
table_out = table_out.sort_values(["Orchestrator", "Method"]).reset_index(drop=True)

rule = "=" * 110
print("\n" + rule)
print("TABLE O1: Orchestrator × Method (5-method classification; aggregated over LLMs)")
print(rule)
print(table_out.to_string(index=False))

# -----------------------------
# OPTIONAL: wide pivot versions (nice for a compact paper table)
# -----------------------------
def make_wide(metric_col, fmt=None):
    """Build an Orchestrator × Method grid of formatted values for one metric.

    Uses the module-level ``grp`` grouping. Each cell is "mean ± sd" with two
    decimals by default; when ``fmt`` is given it is applied to the group mean
    instead. Rows and columns are reindexed to ORCH_ORDER and METHOD_ORDER.
    """
    stats = grp[metric_col].agg(["mean", "std", "size"]).reset_index()
    if fmt is not None:
        stats["val"] = stats["mean"].map(fmt)
    else:
        stats["val"] = stats.apply(
            lambda rec: f"{rec['mean']:.2f} ± {rec['std']:.2f}", axis=1
        )
    grid = stats.pivot(index="Orchestrator", columns="Method", values="val")
    return grid.reindex(index=ORCH_ORDER, columns=METHOD_ORDER)

section_rule = "=" * 110
print("\n" + section_rule)
print("OPTIONAL WIDE TABLES (Orchestrator rows × Method cols)")
print(section_rule)

# Pass rate per group, rendered as a percentage string with one decimal and
# pivoted to Orchestrator rows × Method columns in the configured order.
pass_rates = grp["Passed"].mean().reset_index()
pass_rates["val"] = pass_rates["Passed"].mul(100).map(lambda rate: f"{rate:.1f}%")
pass_wide = pass_rates.pivot(index="Orchestrator", columns="Method", values="val")
pass_wide = pass_wide.reindex(index=ORCH_ORDER, columns=METHOD_ORDER)

print("\nPass rate (%):")
print(pass_wide.to_string())

# The remaining wide tables use make_wide's default "mean ± sd" cell format.
for heading, metric_name in (
    ("\nORT_scaled (mean ± sd):", "ORT_scaled"),
    ("\nCombined_Score (mean ± sd):", "Combined_Score"),
):
    print(heading)
    print(make_wide(metric_name).to_string())

Loaded 8,742 rows, 94 cols

TABLE O1: Orchestrator × Method (5-method classification; aggregated over LLMs)
Orchestrator                 Method   N  N_Passed  Pass%         SAT         PCT    Combined  ORT_scaled  TotalIssues_mean  Crit_mean  Major_mean  Minor_mean
     airflow Direct (Non-Reasoning) 798       322   40.4 4.02 ± 2.22 4.62 ± 2.67 4.35 ± 2.43 5.75 ± 1.84          7.338346   0.497494    1.897243    4.943609
     airflow  Prompt2DAG (Template) 681        60    8.8 0.55 ± 1.78 0.68 ± 2.18 0.62 ± 1.99 5.25 ± 1.09          2.334802   1.116006    0.615272    0.603524
     airflow       Prompt2DAG (LLM) 681       369   54.2 5.30 ± 2.55 4.10 ± 3.78 4.72 ± 2.77 6.48 ± 1.64          6.183554   0.547724    1.461087    4.174743
     airflow    Prompt2DAG (Hybrid) 681       507   74.4 4.78 ± 2.22 5.30 ± 3.13 5.08 ± 2.55 6.98 ± 1.34          6.640235   0.503671    1.356828    4.779736
     airflow     Direct (Reasoning) 228       171   75.0 5.77 ± 1.46 6.68 ± 2.03 6.31 ± 1.72 7.09 ± 1.