In [1]:
#!/usr/bin/env python3
"""
s1_issue_normalization_by_complexity.py

Goal:
- Address the "higher S1 => more issues" anomaly by normalizing issue counts
  by pipeline complexity proxies (nodes, edges, components). Max pipeline
  depth is summarized for context but is not used as a denominator.
- Re-run correlations:
    S1 semantic vs (issues per node)
    S1 graph vs (issues per node)
  and compare to raw issue correlations.

Recommended reporting:
- Show raw correlations vs normalized correlations (all runs + passed-only + failed-only).
"""

import os
import warnings

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
warnings.filterwarnings("ignore")

# -----------------------
# Config
# -----------------------
# Location of the cleaned sessions CSV. The default is the original author's
# local path; set the S1_SESSIONS_CSV environment variable to run the script
# against a copy stored elsewhere without editing this file.
CSV_PATH = os.environ.get(
    "S1_SESSIONS_CSV",
    "/Users/abubakarialidu/Desktop/Data Result/all_sessions_cleaned.csv",
)

# Use Prompt2DAG only because S1 metrics are complete there
FILTER_PROMPT2DAG_ONLY = True

# Choose which issue severities to normalize.
# "Total_Issues" is derived by the script as Critical + Major + Minor.
ISSUE_COLS = ["Critical_Issues", "Major_Issues", "Minor_Issues", "Total_Issues"]

# Complexity proxies (edit if your column names differ).
# All four are summarized by describe_complexity(); only nodes, edges and
# components are used as denominators for the normalized issue rates.
COMPLEXITY_COLS = [
    "S1_Graph_total_nodes_in_flow",
    "S1_Graph_total_edges",
    "S1_Graph_total_components",
    "S1_Graph_max_pipeline_depth",
]

# Core S1 metrics to correlate (you can expand this list)
S1_SEM_COLS = [
    "S1_Sem_BERT_f1",
    "S1_Sem_ROUGE1_norm",
    "S1_Sem_KeyTerm_rate",
    "S1_Sem_tok_overlap_ratio",
]

# NOTE(review): "S1_Graph_total_issues" is itself an issue count, so its
# correlation with the issue targets may be partly definitional -- confirm
# that it belongs among the predictors.
S1_GRAPH_COLS = [
    "S1_Graph_overall_score",
    "S1_Graph_total_issues",
    "S1_Graph_Node_Connectivity_score",
    "S1_Graph_Structural_Integrity_score",
]

# -----------------------
# Helpers
# -----------------------
def safe_divide(numer, denom):
    """
    Element-wise numer / denom with zero denominators turned into NaN.

    `denom` must offer a pandas-style `.replace`; entries equal to 0 are
    swapped for NaN before dividing, so those positions come out as NaN
    rather than +/-inf.
    """
    return numer / denom.replace(to_replace=0, value=np.nan)

def corr_table(df, x_cols, y_cols, label, min_n=50):
    """
    Print a table of Spearman correlations between x_cols and y_cols.

    Parameters
    ----------
    df : DataFrame containing every column named in x_cols and y_cols.
    x_cols : column names, one table row each.
    y_cols : column names, one table column each.
    label : text shown in the banner above the table.
    min_n : minimum number of rows where both columns are non-missing;
        pairs with fewer rows show 'n<min' instead of a coefficient.

    Each cell shows the signed coefficient to three decimals followed by
    significance stars: *** p < 0.001, ** p < 0.01, * p < 0.05.
    Returns None; the table is written to stdout.
    """
    # Format widths are minimums: a name longer than its field widens the
    # header but not the value cells, which misaligns the table. Size the
    # fields from the actual names (2-space gutter), never narrower than the
    # historical 40/18 so tables with short names look the same as before.
    x_width = max([40] + [len(x) + 2 for x in x_cols])
    y_width = max([18] + [len(y) + 2 for y in y_cols])

    print("\n" + "=" * 110)
    print(f"CORRELATIONS: {label}")
    print("=" * 110)

    header = f"{'X (S1 metric)':<{x_width}}" + "".join(
        f"{y:>{y_width}}" for y in y_cols
    )
    print(header)
    print("-" * len(header))

    for x in x_cols:
        row = f"{x:<{x_width}}"
        for y in y_cols:
            # Pairwise deletion: only rows where both columns are present.
            sub = df[[x, y]].dropna()
            if len(sub) < min_n:
                row += f"{'n<min':>{y_width}}"
                continue
            r, p = spearmanr(sub[x], sub[y])
            sig = "***" if p < 1e-3 else "**" if p < 1e-2 else "*" if p < 5e-2 else ""
            row += f"{r:+.3f}{sig:>3}".rjust(y_width)
        print(row)

def describe_complexity(df):
    """
    Print summary statistics for the complexity proxy columns.

    For each name in COMPLEXITY_COLS that is a column of `df`, prints the
    non-missing count plus mean, median, min and max. Names absent from
    `df` are skipped silently. Returns None.
    """
    rule = "=" * 110
    print("\n" + rule)
    print("COMPLEXITY DISTRIBUTIONS (Prompt2DAG)")
    print(rule)

    present = [name for name in COMPLEXITY_COLS if name in df.columns]
    for name in present:
        values = df[name].dropna()
        count = len(values)
        avg, med = values.mean(), values.median()
        lo, hi = values.min(), values.max()
        print(f"{name:<30} N={count:>5}  mean={avg:>8.3f}  median={med:>8.3f}  "
              f"min={lo:>8.3f}  max={hi:>8.3f}")

# -----------------------
# Load
# -----------------------
df = pd.read_csv(CSV_PATH)
print(f"Loaded {len(df):,} rows, {len(df.columns)} cols")

if FILTER_PROMPT2DAG_ONLY:
    # .copy() gives an independent frame, so the column assignments below
    # do not operate on a slice of the full table.
    df = df[df["Workflow"] == "Prompt2DAG"].copy()
    print(f"Filtered to Prompt2DAG: {len(df):,} rows")

# Ensure issues: a severity column that is absent, or a cell that is missing,
# is treated as zero issues.
for col in ["Critical_Issues", "Major_Issues", "Minor_Issues"]:
    if col not in df.columns:
        df[col] = 0
    df[col] = df[col].fillna(0)

# Always recomputed from the three severities (overrides any Total_Issues
# column that came in with the CSV).
df["Total_Issues"] = df["Critical_Issues"] + df["Major_Issues"] + df["Minor_Issues"]

# Ensure Passed numeric.
# A plain astype(int) raises when any Passed cell is missing. Going through
# the nullable Boolean/Int64 dtypes yields the same 0/1 values and keeps
# missing flags as <NA> instead of aborting the run.
df["Passed_num"] = df["Passed"].astype("boolean").astype("Int64")

# Check required columns exist
missing_complex = [c for c in COMPLEXITY_COLS if c not in df.columns]
if missing_complex:
    print("\nWARNING: missing complexity columns:", missing_complex)

available_complex = [c for c in COMPLEXITY_COLS if c in df.columns]
# Only S1 metrics actually present in the CSV are correlated later.
available_s1 = [c for c in (S1_SEM_COLS + S1_GRAPH_COLS) if c in df.columns]
if not available_s1:
    raise RuntimeError("No S1 columns found. Check your CSV column names.")

describe_complexity(df)

# -----------------------
# Create normalized issue rates
# -----------------------
# Denominators: node, edge and component counts. A proxy column that is absent
# from the frame is replaced by an all-NaN series, so every rate derived from
# it is NaN as well.
nodes, edges, comps = (
    df[proxy] if proxy in df.columns else pd.Series(np.nan, index=df.index)
    for proxy in (
        "S1_Graph_total_nodes_in_flow",
        "S1_Graph_total_edges",
        "S1_Graph_total_components",
    )
)

# One rate column per (issue severity, denominator) pair, e.g.
# "Total_Issues_per_node"; zero denominators become NaN via safe_divide.
denominators = {"node": nodes, "edge": edges, "comp": comps}
for issue in ISSUE_COLS:
    if issue in df.columns:
        for suffix, denom in denominators.items():
            df[f"{issue}_per_{suffix}"] = safe_divide(df[issue], denom)

# Optional "nonblocking issues" measure: everything except critical issues.
df["NonCritical_Issues"] = df["Major_Issues"].add(df["Minor_Issues"])
df["NonCritical_Issues_per_node"] = safe_divide(df["NonCritical_Issues"], nodes)

# -----------------------
# Correlations: raw vs normalized
# -----------------------
raw_targets = ["Total_Issues", "Critical_Issues", "Major_Issues", "Minor_Issues"]
norm_targets = [
    "Total_Issues_per_node",
    "Critical_Issues_per_node",
    "NonCritical_Issues_per_node",
    "Total_Issues_per_edge",
    "Total_Issues_per_comp",
]

# Outcome subsets; rows whose Passed value equals neither True nor False
# land in neither subset.
df_pass = df[df["Passed"] == True].copy()
df_fail = df[df["Passed"] == False].copy()

# For each population print the raw-count table first, then the normalized
# one, so the two can be compared side by side.
for subset_name, subset in (
    ("All Prompt2DAG runs", df),
    ("Passed-only", df_pass),
    ("Failed-only", df_fail),
):
    corr_table(subset, available_s1, raw_targets,
               label=f"RAW issue counts ({subset_name})")
    corr_table(subset, available_s1, [t for t in norm_targets if t in df.columns],
               label=f"NORMALIZED issue rates ({subset_name})")

print("\nDone.")

Loaded 8,742 rows, 94 cols
Filtered to Prompt2DAG: 5,664 rows

COMPLEXITY DISTRIBUTIONS (Prompt2DAG)
S1_Graph_total_nodes_in_flow   N= 5664  mean=   3.968  median=   4.000  min=   0.000  max=  25.000
S1_Graph_total_edges           N= 5664  mean=   3.455  median=   3.000  min=   0.000  max=  25.000
S1_Graph_total_components      N= 5664  mean=   3.779  median=   4.000  min=   0.000  max=  16.000
S1_Graph_max_pipeline_depth    N= 5664  mean=   3.123  median=   3.000  min=   0.000  max=  10.000

CORRELATIONS: RAW issue counts (All Prompt2DAG runs)
X (S1 metric)                                 Total_Issues   Critical_Issues      Major_Issues      Minor_Issues
----------------------------------------------------------------------------------------------------------------
S1_Sem_BERT_f1                                   +0.116***         -0.089***         +0.045***         +0.178***
S1_Sem_ROUGE1_norm                               +0.286***         -0.168***         +0.180***         +0.334*