In [1]:
#!/usr/bin/env python3
"""
s1_semantic_graph_analysis.py

Comprehensive analysis of S1 Semantic and Graph metrics:
(a) Absolute analysis - what these metrics tell us
(b) Predictive analysis - how they drive SAT/PCT/ORT and errors

Uses the cleaned dataset: all_sessions_cleaned.csv
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import spearmanr, pearsonr
import warnings
warnings.filterwarnings('ignore')

# Configuration
sns.set(style="whitegrid")
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)
pd.set_option("display.max_rows", 100)

print("=" * 100)
print("S1 SEMANTIC & GRAPH METRICS ANALYSIS")
print("=" * 100)

# ============================================================================
# 1. LOAD DATA
# ============================================================================

print("\n" + "=" * 100)
print("1. LOADING DATA")
print("=" * 100)

csv_path = "/Users/abubakarialidu/Desktop/Data Result/all_sessions_cleaned.csv"
df = pd.read_csv(csv_path)

print(f"Loaded {len(df):,} rows, {len(df.columns)} columns")

# ============================================================================
# 2. METHOD CLASSIFICATION
# ============================================================================

def classify_method(row):
    workflow = row.get("Workflow", "")
    strategy = str(row.get("Strategy") or "").lower()
    if workflow == "Direct":
        return "Direct (Non-Reasoning)"
    elif workflow == "Reasoning":
        return "Direct (Reasoning)"
    elif workflow == "Prompt2DAG":
        if "template" in strategy:
            return "Prompt2DAG (Template)"
        elif "llm" in strategy:
            return "Prompt2DAG (LLM)"
        elif "hybrid" in strategy:
            return "Prompt2DAG (Hybrid)"
        else:
            return f"Prompt2DAG ({row['Strategy']})"
    else:
        return workflow

df["Method"] = df.apply(classify_method, axis=1)

METHOD_ORDER = [
    "Direct (Non-Reasoning)",
    "Prompt2DAG (Template)",
    "Prompt2DAG (LLM)",
    "Prompt2DAG (Hybrid)",
    "Direct (Reasoning)",
]

df = df[df["Method"].isin(METHOD_ORDER)].copy()

print(f"\nTotal rows after filtering: {len(df):,}")

# ============================================================================
# 3. ENSURE ISSUE COLUMNS AND ORT SCORES
# ============================================================================

for col in ["Critical_Issues", "Major_Issues", "Minor_Issues", "Total_Issues"]:
    if col not in df.columns:
        df[col] = 0
    df[col] = df[col].fillna(0)

df["Total_Issues"] = df["Critical_Issues"] + df["Major_Issues"] + df["Minor_Issues"]

# ORT penalty weights
ALPHA_CRIT = 2.0
BETA_MAJOR = 1.0
GAMMA_MINOR = 0.25

df["Base_Score"] = np.where(df["Passed"] == True, df["Combined_Score"], 0.0)
df["Penalty"] = (
    ALPHA_CRIT * df["Critical_Issues"] +
    BETA_MAJOR * df["Major_Issues"] +
    GAMMA_MINOR * df["Minor_Issues"]
)
df["ORT_Score_raw"] = df["Base_Score"] - df["Penalty"]
df["ORT_Score"] = df["ORT_Score_raw"].clip(lower=0.0, upper=10.0)

# ============================================================================
# 4. IDENTIFY S1 COLUMNS
# ============================================================================

print("\n" + "=" * 100)
print("2. IDENTIFYING S1 SEMANTIC & GRAPH COLUMNS")
print("=" * 100)

# Semantic columns
sem_cols_potential = [
    "S1_Sem_BERT_f1",
    "S1_Sem_BERT_norm",
    "S1_Sem_ROUGE1_f1",
    "S1_Sem_ROUGE1_norm",
    "S1_Sem_KeyTerm_rate",
    "S1_Sem_KeyTerm_preserved",
    "S1_Sem_KeyTerm_missing",
    "S1_Sem_KeyTerm_total",
    "S1_Sem_tok_overlap_ratio",
]

# Graph columns
graph_cols_potential = [
    "S1_Graph_overall_score",
    "S1_Graph_total_issues",
    "S1_Graph_Structural_Integrity_score",
    "S1_Graph_Node_Connectivity_score",
    "S1_Graph_Component_Usage_score",
    "S1_Graph_Task_Component_Consistency_score",
]

# Filter to available columns
sem_cols = [c for c in sem_cols_potential if c in df.columns]
graph_cols = [c for c in graph_cols_potential if c in df.columns]
s1_cols = sem_cols + graph_cols

print(f"\nAvailable Semantic columns ({len(sem_cols)}):")
for col in sem_cols:
    non_null = df[col].notna().sum()
    print(f"  - {col}: {non_null:,} non-null ({non_null/len(df)*100:.1f}%)")

print(f"\nAvailable Graph columns ({len(graph_cols)}):")
for col in graph_cols:
    non_null = df[col].notna().sum()
    print(f"  - {col}: {non_null:,} non-null ({non_null/len(df)*100:.1f}%)")

# ============================================================================
# 5. FILTER TO PROMPT2DAG (WHERE S1 METRICS EXIST)
# ============================================================================

print("\n" + "=" * 100)
print("3. FILTERING TO PROMPT2DAG RUNS")
print("=" * 100)

df_p2d = df[df["Workflow"] == "Prompt2DAG"].copy()
print(f"Prompt2DAG rows: {len(df_p2d):,}")

# Check S1 availability in P2D
for col in s1_cols:
    non_null = df_p2d[col].notna().sum()
    print(f"  {col}: {non_null:,} non-null ({non_null/len(df_p2d)*100:.1f}%)")

# Filter to rows with S1 data
df_p2d_s1 = df_p2d.dropna(subset=s1_cols, how='all').copy()
print(f"\nPrompt2DAG rows with any S1 data: {len(df_p2d_s1):,}")

# ============================================================================
# PART A: ABSOLUTE ANALYSIS - WHAT S1 METRICS TELL US
# ============================================================================

print("\n" + "=" * 100)
print("PART A: ABSOLUTE ANALYSIS - WHAT S1 METRICS TELL US")
print("=" * 100)

# ============================================================================
# A1. GLOBAL SUMMARY STATISTICS
# ============================================================================

print("\n" + "=" * 100)
print("A1. GLOBAL SUMMARY STATISTICS (Prompt2DAG)")
print("=" * 100)

if len(s1_cols) > 0:
    summary_stats = df_p2d_s1[s1_cols].describe().T
    summary_stats['non_null'] = df_p2d_s1[s1_cols].notna().sum()
    summary_stats['null_pct'] = (df_p2d_s1[s1_cols].isna().sum() / len(df_p2d_s1) * 100).round(1)
    
    print("\n--- Semantic Metrics ---")
    if sem_cols:
        print(summary_stats.loc[sem_cols].round(3).to_string())
    
    print("\n--- Graph Metrics ---")
    if graph_cols:
        print(summary_stats.loc[graph_cols].round(3).to_string())

# ============================================================================
# A2. S1 METRICS BY PROMPT2DAG STRATEGY
# ============================================================================

print("\n" + "=" * 100)
print("A2. S1 METRICS BY PROMPT2DAG STRATEGY")
print("=" * 100)

p2d_methods = ["Prompt2DAG (Template)", "Prompt2DAG (LLM)", "Prompt2DAG (Hybrid)"]

for col in s1_cols:
    print(f"\n--- {col} ---")
    print(f"{'Method':<25} {'N':>8} {'Mean':>10} {'Std':>10} {'Min':>10} {'Max':>10}")
    print("-" * 80)
    
    for method in p2d_methods:
        df_m = df_p2d_s1[df_p2d_s1["Method"] == method]
        data = df_m[col].dropna()
        
        if len(data) > 0:
            print(f"{method:<25} {len(data):>8} {data.mean():>10.3f} {data.std():>10.3f} {data.min():>10.3f} {data.max():>10.3f}")
        else:
            print(f"{method:<25} {0:>8} {'N/A':>10} {'N/A':>10} {'N/A':>10} {'N/A':>10}")

# ============================================================================
# A3. S1 METRICS BY ORCHESTRATOR
# ============================================================================

print("\n" + "=" * 100)
print("A3. S1 METRICS BY ORCHESTRATOR (Prompt2DAG)")
print("=" * 100)

for col in s1_cols[:4]:  # Top 4 metrics
    print(f"\n--- {col} ---")
    print(f"{'Orchestrator':<15} {'N':>8} {'Mean':>10} {'Std':>10}")
    print("-" * 50)
    
    for orch in ["airflow", "dagster", "prefect"]:
        df_o = df_p2d_s1[df_p2d_s1["Orchestrator"] == orch]
        data = df_o[col].dropna()
        
        if len(data) > 0:
            print(f"{orch:<15} {len(data):>8} {data.mean():>10.3f} {data.std():>10.3f}")

# ============================================================================
# A4. S1 METRICS INTERPRETATION
# ============================================================================

print("\n" + "=" * 100)
print("A4. S1 METRICS INTERPRETATION")
print("=" * 100)

print("""
SEMANTIC METRICS INTERPRETATION:
================================

1. S1_Sem_BERT_f1 (BERTScore F1):
   - Measures semantic similarity between prompt and generated intermediate representation
   - Range: [0, 1], higher = better semantic preservation
   - Typical good value: > 0.85

2. S1_Sem_ROUGE1_f1 (ROUGE-1 F1):
   - Measures unigram overlap between prompt and generated representation
   - Range: [0, 1], higher = better lexical preservation
   - Typical good value: > 0.5

3. S1_Sem_KeyTerm_rate:
   - Percentage of key terms from prompt preserved in generation
   - Range: [0, 1], higher = better key concept retention
   - Typical good value: > 0.7

4. S1_Sem_tok_overlap_ratio:
   - Token-level overlap ratio
   - Range: [0, 1], higher = more overlap
   - Typical good value: depends on task

GRAPH METRICS INTERPRETATION:
=============================

1. S1_Graph_overall_score:
   - Composite score of graph quality
   - Range: [0, 10], higher = better graph structure
   - Typical good value: > 7.0

2. S1_Graph_total_issues:
   - Total number of structural issues detected
   - Range: [0, ∞], lower = fewer issues
   - Typical good value: < 3

3. S1_Graph_Structural_Integrity_score:
   - Measures DAG structural validity (no cycles, proper edges)
   - Range: [0, 10], higher = better integrity
   - Typical good value: > 8.0

4. S1_Graph_Node_Connectivity_score:
   - Measures proper node connections (no orphans, proper flow)
   - Range: [0, 10], higher = better connectivity
   - Typical good value: > 8.0

5. S1_Graph_Component_Usage_score:
   - Measures appropriate use of orchestrator components
   - Range: [0, 10], higher = better component usage
   - Typical good value: > 7.0

6. S1_Graph_Task_Component_Consistency_score:
   - Measures consistency between task definitions and components
   - Range: [0, 10], higher = better consistency
   - Typical good value: > 7.0
""")

# ============================================================================
# A5. DISTRIBUTION ANALYSIS
# ============================================================================

print("\n" + "=" * 100)
print("A5. DISTRIBUTION ANALYSIS - KEY S1 METRICS")
print("=" * 100)

key_metrics = ["S1_Sem_BERT_f1", "S1_Sem_ROUGE1_f1", "S1_Graph_overall_score", "S1_Graph_total_issues"]
key_metrics = [m for m in key_metrics if m in df_p2d_s1.columns]

for metric in key_metrics:
    data = df_p2d_s1[metric].dropna()
    
    if len(data) > 0:
        print(f"\n--- {metric} ---")
        print(f"  N: {len(data):,}")
        print(f"  Mean: {data.mean():.4f}")
        print(f"  Std: {data.std():.4f}")
        print(f"  Median: {data.median():.4f}")
        print(f"  IQR: [{data.quantile(0.25):.4f}, {data.quantile(0.75):.4f}]")
        print(f"  Range: [{data.min():.4f}, {data.max():.4f}]")
        
        # Percentile breakdown
        print(f"\n  Percentile breakdown:")
        for p in [10, 25, 50, 75, 90, 95, 99]:
            print(f"    P{p}: {data.quantile(p/100):.4f}")

# ============================================================================
# PART B: PREDICTIVE ANALYSIS - HOW S1 DRIVES SAT/PCT/ORT
# ============================================================================

print("\n" + "=" * 100)
print("PART B: PREDICTIVE ANALYSIS - HOW S1 DRIVES SAT/PCT/ORT")
print("=" * 100)

# ============================================================================
# B1. CORRELATION MATRIX: S1 vs OUTCOME METRICS
# ============================================================================

print("\n" + "=" * 100)
print("B1. CORRELATION MATRIX: S1 vs OUTCOME METRICS (Run-Level)")
print("=" * 100)

outcome_cols = ["Static_Score", "Compliance_Score", "Combined_Score", "ORT_Score", "Passed"]
issue_cols = ["Critical_Issues", "Major_Issues", "Minor_Issues", "Total_Issues"]

# Pearson correlations
print("\n--- Pearson Correlations: S1 Metrics vs Outcomes ---")
print(f"\n{'S1 Metric':<45} {'SAT':>8} {'PCT':>8} {'Combined':>10} {'ORT':>8} {'Passed':>8}")
print("-" * 95)

for s1_col in s1_cols:
    data = df_p2d_s1[[s1_col] + outcome_cols].dropna()
    
    if len(data) > 10:
        correlations = []
        for out_col in outcome_cols:
            r, p = pearsonr(data[s1_col], data[out_col])
            sig = "***" if p < 0.001 else "**" if p < 0.01 else "*" if p < 0.05 else ""
            correlations.append(f"{r:+.3f}{sig}")
        
        print(f"{s1_col:<45} {correlations[0]:>8} {correlations[1]:>8} {correlations[2]:>10} {correlations[3]:>8} {correlations[4]:>8}")

# Spearman correlations (more robust)
print("\n--- Spearman Correlations: S1 Metrics vs Outcomes (Rank-based) ---")
print(f"\n{'S1 Metric':<45} {'SAT':>8} {'PCT':>8} {'Combined':>10} {'ORT':>8} {'Passed':>8}")
print("-" * 95)

for s1_col in s1_cols:
    data = df_p2d_s1[[s1_col] + outcome_cols].dropna()
    
    if len(data) > 10:
        correlations = []
        for out_col in outcome_cols:
            r, p = spearmanr(data[s1_col], data[out_col])
            sig = "***" if p < 0.001 else "**" if p < 0.01 else "*" if p < 0.05 else ""
            correlations.append(f"{r:+.3f}{sig}")
        
        print(f"{s1_col:<45} {correlations[0]:>8} {correlations[1]:>8} {correlations[2]:>10} {correlations[3]:>8} {correlations[4]:>8}")

# ============================================================================
# B2. CORRELATION: S1 vs ISSUES
# ============================================================================

print("\n" + "=" * 100)
print("B2. CORRELATION: S1 METRICS vs ISSUES")
print("=" * 100)

print(f"\n{'S1 Metric':<45} {'Critical':>10} {'Major':>10} {'Minor':>10} {'Total':>10}")
print("-" * 95)

for s1_col in s1_cols:
    data = df_p2d_s1[[s1_col] + issue_cols].dropna()
    
    if len(data) > 10:
        correlations = []
        for issue_col in issue_cols:
            r, p = spearmanr(data[s1_col], data[issue_col])
            sig = "***" if p < 0.001 else "**" if p < 0.01 else "*" if p < 0.05 else ""
            correlations.append(f"{r:+.3f}{sig}")
        
        print(f"{s1_col:<45} {correlations[0]:>10} {correlations[1]:>10} {correlations[2]:>10} {correlations[3]:>10}")

# ============================================================================
# B3. BINNED ANALYSIS: S1 QUARTILES vs OUTCOMES
# ============================================================================

print("\n" + "=" * 100)
print("B3. BINNED ANALYSIS: S1 METRIC QUARTILES vs OUTCOMES")
print("=" * 100)

key_s1_metrics = ["S1_Sem_BERT_f1", "S1_Sem_ROUGE1_f1", "S1_Graph_overall_score"]
key_s1_metrics = [m for m in key_s1_metrics if m in df_p2d_s1.columns]

for metric in key_s1_metrics:
    data = df_p2d_s1.dropna(subset=[metric, "ORT_Score"]).copy()
    
    if len(data) > 100:
        try:
            data[f"{metric}_bin"] = pd.qcut(data[metric], q=4, labels=["Q1 (low)", "Q2", "Q3", "Q4 (high)"])
        except ValueError:
            # Handle case with too few unique values
            data[f"{metric}_bin"] = pd.cut(data[metric], bins=4, labels=["Q1 (low)", "Q2", "Q3", "Q4 (high)"])
        
        print(f"\n--- {metric} QUARTILE ANALYSIS ---")
        
        agg_result = data.groupby(f"{metric}_bin").agg({
            "Static_Score": ["mean", "std"],
            "Compliance_Score": ["mean", "std"],
            "Combined_Score": ["mean", "std"],
            "ORT_Score": ["mean", "std"],
            "Passed": ["mean", "sum", "count"],
            "Total_Issues": ["mean", "std"],
        }).round(3)
        
        print(f"\n{'Quartile':<15} {'N':>6} {'SAT':>12} {'PCT':>12} {'Combined':>12} {'ORT':>12} {'Pass%':>10} {'Issues':>12}")
        print("-" * 105)
        
        for quartile in ["Q1 (low)", "Q2", "Q3", "Q4 (high)"]:
            if quartile in agg_result.index:
                row = agg_result.loc[quartile]
                n = int(row[("Passed", "count")])
                sat = f"{row[('Static_Score', 'mean')]:.2f}±{row[('Static_Score', 'std')]:.2f}"
                pct = f"{row[('Compliance_Score', 'mean')]:.2f}±{row[('Compliance_Score', 'std')]:.2f}"
                combined = f"{row[('Combined_Score', 'mean')]:.2f}±{row[('Combined_Score', 'std')]:.2f}"
                ort = f"{row[('ORT_Score', 'mean')]:.2f}±{row[('ORT_Score', 'std')]:.2f}"
                pass_rate = f"{row[('Passed', 'mean')]*100:.1f}%"
                issues = f"{row[('Total_Issues', 'mean')]:.2f}±{row[('Total_Issues', 'std')]:.2f}"
                
                print(f"{quartile:<15} {n:>6} {sat:>12} {pct:>12} {combined:>12} {ort:>12} {pass_rate:>10} {issues:>12}")
        
        # Statistical test: Q1 vs Q4
        q1_data = data[data[f"{metric}_bin"] == "Q1 (low)"]["ORT_Score"]
        q4_data = data[data[f"{metric}_bin"] == "Q4 (high)"]["ORT_Score"]
        
        if len(q1_data) > 5 and len(q4_data) > 5:
            t_stat, p_value = stats.ttest_ind(q1_data, q4_data)
            effect_size = (q4_data.mean() - q1_data.mean()) / np.sqrt((q1_data.std()**2 + q4_data.std()**2) / 2)
            
            print(f"\n  Q1 vs Q4 ORT Comparison:")
            print(f"    Q1 mean: {q1_data.mean():.3f}, Q4 mean: {q4_data.mean():.3f}")
            print(f"    Difference: {q4_data.mean() - q1_data.mean():+.3f}")
            print(f"    t-statistic: {t_stat:.3f}, p-value: {p_value:.4f}")
            print(f"    Effect size (Cohen's d): {effect_size:.3f}")

# ============================================================================
# B4. THRESHOLD ANALYSIS: S1 METRICS AS QUALITY GATES
# ============================================================================

print("\n" + "=" * 100)
print("B4. THRESHOLD ANALYSIS: S1 METRICS AS QUALITY GATES")
print("=" * 100)

thresholds = {
    "S1_Sem_BERT_f1": [0.80, 0.85, 0.90, 0.95],
    "S1_Sem_ROUGE1_f1": [0.30, 0.40, 0.50, 0.60],
    "S1_Graph_overall_score": [5.0, 6.0, 7.0, 8.0],
}

for metric, thresh_values in thresholds.items():
    if metric not in df_p2d_s1.columns:
        continue
    
    data = df_p2d_s1.dropna(subset=[metric, "ORT_Score", "Passed"]).copy()
    
    if len(data) < 100:
        continue
    
    print(f"\n--- {metric} THRESHOLD ANALYSIS ---")
    print(f"\n{'Threshold':<15} {'N_Above':>10} {'N_Below':>10} {'ORT_Above':>12} {'ORT_Below':>12} {'Pass_Above':>12} {'Pass_Below':>12}")
    print("-" * 100)
    
    for thresh in thresh_values:
        above = data[data[metric] >= thresh]
        below = data[data[metric] < thresh]
        
        if len(above) > 10 and len(below) > 10:
            print(f">= {thresh:<12} {len(above):>10} {len(below):>10} "
                  f"{above['ORT_Score'].mean():>12.3f} {below['ORT_Score'].mean():>12.3f} "
                  f"{above['Passed'].mean()*100:>11.1f}% {below['Passed'].mean()*100:>11.1f}%")

# ============================================================================
# B5. S1 METRICS BY PASS/FAIL STATUS
# ============================================================================

print("\n" + "=" * 100)
print("B5. S1 METRICS BY PASS/FAIL STATUS")
print("=" * 100)

print(f"\n{'S1 Metric':<45} {'Passed_Mean':>12} {'Failed_Mean':>12} {'Diff':>10} {'t-stat':>10} {'p-value':>10} {'Sig':>6}")
print("-" * 115)

for s1_col in s1_cols:
    passed_data = df_p2d_s1[df_p2d_s1["Passed"] == True][s1_col].dropna()
    failed_data = df_p2d_s1[df_p2d_s1["Passed"] == False][s1_col].dropna()
    
    if len(passed_data) > 10 and len(failed_data) > 10:
        t_stat, p_value = stats.ttest_ind(passed_data, failed_data)
        diff = passed_data.mean() - failed_data.mean()
        sig = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else "ns"
        
        print(f"{s1_col:<45} {passed_data.mean():>12.4f} {failed_data.mean():>12.4f} {diff:>+10.4f} {t_stat:>10.3f} {p_value:>10.4f} {sig:>6}")

# ============================================================================
# B6. S1 METRICS BY ISSUE SEVERITY
# ============================================================================

print("\n" + "=" * 100)
print("B6. S1 METRICS BY ISSUE SEVERITY")
print("=" * 100)

# Create issue severity bins
df_p2d_s1["Issue_Severity"] = pd.cut(
    df_p2d_s1["Total_Issues"],
    bins=[-0.1, 0, 3, 6, 100],
    labels=["None (0)", "Low (1-3)", "Medium (4-6)", "High (7+)"]
)

print(f"\n--- S1 Metrics by Issue Severity ---")

for s1_col in s1_cols[:6]:  # Top 6 metrics
    if s1_col not in df_p2d_s1.columns:
        continue
    
    print(f"\n{s1_col}:")
    print(f"  {'Severity':<15} {'N':>8} {'Mean':>10} {'Std':>10}")
    print("  " + "-" * 50)
    
    for severity in ["None (0)", "Low (1-3)", "Medium (4-6)", "High (7+)"]:
        data = df_p2d_s1[df_p2d_s1["Issue_Severity"] == severity][s1_col].dropna()
        if len(data) > 0:
            print(f"  {severity:<15} {len(data):>8} {data.mean():>10.4f} {data.std():>10.4f}")

# ============================================================================
# B7. PIPELINE-LEVEL ANALYSIS
# ============================================================================

print("\n" + "=" * 100)
print("B7. PIPELINE-LEVEL ANALYSIS: S1 vs BEST P2D ORT")
print("=" * 100)

# Aggregate S1 metrics by pipeline
pipe_s1 = df_p2d_s1.groupby("Pipeline_ID")[s1_cols].mean().reset_index()

# Best P2D ORT per pipeline
pipe_ort = df_p2d_s1.groupby("Pipeline_ID").agg({
    "ORT_Score": ["max", "mean"],
    "Combined_Score": ["max", "mean"],
    "Passed": ["mean", "sum"],
    "Total_Issues": "mean",
}).reset_index()

pipe_ort.columns = ["Pipeline_ID", "Best_ORT", "Mean_ORT", "Best_Combined", "Mean_Combined", "Pass_Rate", "N_Passed", "Mean_Issues"]

# Merge
pipe_analysis = pipe_s1.merge(pipe_ort, on="Pipeline_ID", how="inner")

print(f"\nPipeline-level analysis: {len(pipe_analysis)} pipelines")

# Correlations at pipeline level
print("\n--- Pipeline-Level Correlations: S1 vs Best_ORT ---")
print(f"\n{'S1 Metric':<45} {'vs Best_ORT':>12} {'vs Mean_ORT':>12} {'vs Pass_Rate':>12}")
print("-" * 90)

for s1_col in s1_cols:
    if s1_col in pipe_analysis.columns:
        data = pipe_analysis[[s1_col, "Best_ORT", "Mean_ORT", "Pass_Rate"]].dropna()
        
        if len(data) > 5:
            r_best, p_best = spearmanr(data[s1_col], data["Best_ORT"])
            r_mean, p_mean = spearmanr(data[s1_col], data["Mean_ORT"])
            r_pass, p_pass = spearmanr(data[s1_col], data["Pass_Rate"])
            
            sig_best = "***" if p_best < 0.001 else "**" if p_best < 0.01 else "*" if p_best < 0.05 else ""
            sig_mean = "***" if p_mean < 0.001 else "**" if p_mean < 0.01 else "*" if p_mean < 0.05 else ""
            sig_pass = "***" if p_pass < 0.001 else "**" if p_pass < 0.01 else "*" if p_pass < 0.05 else ""
            
            print(f"{s1_col:<45} {r_best:>+.3f}{sig_best:<3} {r_mean:>+.3f}{sig_mean:<3} {r_pass:>+.3f}{sig_pass:<3}")

# ============================================================================
# B8. KEY FINDINGS SUMMARY
# ============================================================================

print("\n" + "=" * 100)
print("B8. KEY FINDINGS SUMMARY")
print("=" * 100)

print("""
PART A - ABSOLUTE ANALYSIS FINDINGS:
====================================

1. SEMANTIC METRICS:
   - BERTScore F1 captures semantic similarity between prompt and intermediate representation
   - ROUGE-1 captures lexical overlap
   - KeyTerm rate shows how well key concepts are preserved

2. GRAPH METRICS:
   - Overall score provides composite quality measure
   - Structural integrity, node connectivity, and component usage are key sub-dimensions
   - Total issues count identifies problematic generations

PART B - PREDICTIVE ANALYSIS FINDINGS:
======================================

1. CORRELATION STRENGTH:
   - Strong correlations (|r| > 0.3) indicate S1 metrics are predictive of outcomes
   - Weak correlations (|r| < 0.1) suggest S1 metrics don't drive outcomes directly

2. PRACTICAL IMPLICATIONS:
   - If S1_Graph_overall_score strongly correlates with ORT:
     → Use graph quality as early quality gate
   - If S1_Sem_BERT_f1 weakly correlates with ORT:
     → Semantic similarity alone doesn't guarantee code quality

3. QUALITY GATES:
   - Identified thresholds where S1 metrics become predictive
   - Can use these for early rejection of poor generations

4. PIPELINE VS RUN LEVEL:
   - Run-level: High variance, S1 explains some but not all outcome variance
   - Pipeline-level: More stable, S1 correlates better with aggregated outcomes

RECOMMENDATIONS:
================
1. Use S1_Graph_overall_score as primary early quality indicator
2. Combine multiple S1 metrics for better prediction
3. Consider orchestrator-specific thresholds
4. S1 metrics are necessary but not sufficient for quality
""")

# ============================================================================
# B9. DETAILED CORRELATION TABLE FOR PAPER
# ============================================================================

print("\n" + "=" * 100)
print("B9. CORRELATION TABLE FOR PAPER (Spearman)")
print("=" * 100)

# Create comprehensive correlation table
all_metrics = s1_cols + outcome_cols + issue_cols
available_metrics = [m for m in all_metrics if m in df_p2d_s1.columns]

corr_data = df_p2d_s1[available_metrics].dropna()
corr_matrix = corr_data.corr(method='spearman')

# Print S1 vs Outcomes section
print("\n--- S1 Metrics vs Outcomes (Spearman ρ) ---")
print("\nLaTeX format:")
print("\\begin{table}[h]")
print("\\centering")
print("\\caption{Correlation between S1 Metrics and Outcome Metrics}")
print("\\begin{tabular}{l" + "r" * len(outcome_cols) + "}")
print("\\toprule")
print("S1 Metric & " + " & ".join(outcome_cols) + " \\\\")
print("\\midrule")

for s1_col in s1_cols:
    if s1_col in corr_matrix.index:
        row_values = []
        for out_col in outcome_cols:
            if out_col in corr_matrix.columns:
                r = corr_matrix.loc[s1_col, out_col]
                row_values.append(f"{r:+.3f}")
            else:
                row_values.append("--")
        
        short_name = s1_col.replace("S1_Sem_", "").replace("S1_Graph_", "G_")
        print(f"{short_name} & " + " & ".join(row_values) + " \\\\")

print("\\bottomrule")
print("\\end{tabular}")
print("\\end{table}")

print("\n" + "=" * 100)
print("ANALYSIS COMPLETE")
print("=" * 100)

S1 SEMANTIC & GRAPH METRICS ANALYSIS

1. LOADING DATA
Loaded 8,742 rows, 94 columns

Total rows after filtering: 8,742

2. IDENTIFYING S1 SEMANTIC & GRAPH COLUMNS

Available Semantic columns (9):
  - S1_Sem_BERT_f1: 5,664 non-null (64.8%)
  - S1_Sem_BERT_norm: 5,664 non-null (64.8%)
  - S1_Sem_ROUGE1_f1: 5,664 non-null (64.8%)
  - S1_Sem_ROUGE1_norm: 5,664 non-null (64.8%)
  - S1_Sem_KeyTerm_rate: 5,664 non-null (64.8%)
  - S1_Sem_KeyTerm_preserved: 5,664 non-null (64.8%)
  - S1_Sem_KeyTerm_missing: 5,664 non-null (64.8%)
  - S1_Sem_KeyTerm_total: 5,664 non-null (64.8%)
  - S1_Sem_tok_overlap_ratio: 5,664 non-null (64.8%)

Available Graph columns (6):
  - S1_Graph_overall_score: 5,664 non-null (64.8%)
  - S1_Graph_total_issues: 5,664 non-null (64.8%)
  - S1_Graph_Structural_Integrity_score: 5,664 non-null (64.8%)
  - S1_Graph_Node_Connectivity_score: 5,664 non-null (64.8%)
  - S1_Graph_Component_Usage_score: 5,664 non-null (64.8%)
  - S1_Graph_Task_Component_Consistency_score: 5,664 no

In [7]:
#!/usr/bin/env python3
"""
s1_consistency_investigation.py

Deep investigation into S1 metrics anomalies:
1. Why do higher semantic scores correlate with MORE issues?
2. Why do runs with high issues have higher S1 scores?
3. Is there a confounding variable (complexity, orchestrator, method)?
4. Are S1 metrics measuring what we think they're measuring?
"""

import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import spearmanr, pearsonr
import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)
pd.set_option("display.max_rows", 100)

print("=" * 120)
print("S1 METRICS CONSISTENCY INVESTIGATION")
print("=" * 120)

# ============================================================================
# 1. LOAD DATA
# ============================================================================

csv_path = "/Users/abubakarialidu/Desktop/Data Result/all_sessions_cleaned.csv"
df = pd.read_csv(csv_path)

# Filter to Prompt2DAG only (where S1 metrics exist)
df_p2d = df[df["Workflow"] == "Prompt2DAG"].copy()

print(f"\nLoaded {len(df_p2d):,} Prompt2DAG rows")

# S1 columns
sem_cols = [c for c in df_p2d.columns if c.startswith("S1_Sem_")]
graph_cols = [c for c in df_p2d.columns if c.startswith("S1_Graph_")]
s1_cols = sem_cols + graph_cols

# Ensure issue columns
for col in ["Critical_Issues", "Major_Issues", "Minor_Issues", "Total_Issues"]:
    if col not in df_p2d.columns:
        df_p2d[col] = 0
    df_p2d[col] = df_p2d[col].fillna(0)

df_p2d["Total_Issues"] = df_p2d["Critical_Issues"] + df_p2d["Major_Issues"] + df_p2d["Minor_Issues"]

# ============================================================================
# COMPUTE ORT (OVERALL ROBUSTNESS TEST) SCORES
# ============================================================================

print("\n" + "=" * 120)
print("COMPUTING ORT SCORES")
print("=" * 120)

# Penalty weights
ALPHA_CRIT = 2.0
BETA_MAJOR = 1.0
GAMMA_MINOR = 0.25

print(f"\nPenalty weights:")
print(f"  Critical issues: α = {ALPHA_CRIT}")
print(f"  Major issues:    β = {BETA_MAJOR}")
print(f"  Minor issues:    γ = {GAMMA_MINOR}")

# Base score: Combined_Score if Passed, else 0
df_p2d["Base_Score"] = np.where(df_p2d["Passed"] == True, df_p2d["Combined_Score"], 0.0)

# Calculate penalty
df_p2d["Penalty"] = (
    ALPHA_CRIT * df_p2d["Critical_Issues"] +
    BETA_MAJOR * df_p2d["Major_Issues"] +
    GAMMA_MINOR * df_p2d["Minor_Issues"]
)

# ORT_Score_raw (can be negative)
df_p2d["ORT_Score_raw"] = df_p2d["Base_Score"] - df_p2d["Penalty"]

# ORT_Score_capped (clamped to [0, 10])
df_p2d["ORT_Score_capped"] = df_p2d["ORT_Score_raw"].clip(lower=0.0, upper=10.0)

# ORT_Score_scaled (min-max normalization to [0, 10])
ort_min = df_p2d["ORT_Score_raw"].min()
ort_max = df_p2d["ORT_Score_raw"].max()

if ort_max > ort_min:
    df_p2d["ORT_Score_scaled"] = 10 * (df_p2d["ORT_Score_raw"] - ort_min) / (ort_max - ort_min)
else:
    df_p2d["ORT_Score_scaled"] = 0.0

# Use ORT_Score_scaled as the default ORT_Score
df_p2d["ORT_Score"] = df_p2d["ORT_Score_scaled"]

print(f"\nORT Score Statistics:")
print(f"  ORT_raw range:    [{df_p2d['ORT_Score_raw'].min():.2f}, {df_p2d['ORT_Score_raw'].max():.2f}]")
print(f"  ORT_capped range: [{df_p2d['ORT_Score_capped'].min():.2f}, {df_p2d['ORT_Score_capped'].max():.2f}]")
print(f"  ORT_scaled range: [{df_p2d['ORT_Score_scaled'].min():.2f}, {df_p2d['ORT_Score_scaled'].max():.2f}]")

# ============================================================================
# INVESTIGATION 1: THE COUNTERINTUITIVE CORRELATION
# ============================================================================

print("\n" + "=" * 120)
print("INVESTIGATION 1: WHY DO HIGHER S1 SEMANTIC SCORES CORRELATE WITH MORE ISSUES?")
print("=" * 120)

print("""
HYPOTHESIS: Pipeline Complexity is a Confounding Variable

Theory:
- More complex pipelines have more tasks/nodes
- More tasks → more key terms in prompt → higher S1_Sem_KeyTerm scores
- More tasks → more potential for issues → higher issue counts
- So the correlation is SPURIOUS, driven by complexity, not quality
""")

# Check if there's a complexity proxy
# Look for pipeline characteristics that might indicate complexity

# First, let's check the relationship by Pipeline_ID
print("\n--- Analysis by Pipeline Complexity ---")

# Calculate pipeline-level stats
pipe_stats = df_p2d.groupby("Pipeline_ID").agg({
    "S1_Sem_KeyTerm_total": "mean",  # Proxy for complexity (more terms = more complex?)
    "S1_Sem_KeyTerm_preserved": "mean",
    "S1_Sem_BERT_f1": "mean",
    "S1_Sem_ROUGE1_f1": "mean",
    "Total_Issues": "mean",
    "Critical_Issues": "mean",
    "Major_Issues": "mean",
    "Minor_Issues": "mean",
    "Combined_Score": "mean",
    "ORT_Score": "mean",
    "Passed": "mean",
}).reset_index()

pipe_stats.columns = ["Pipeline_ID", "KeyTerm_total", "KeyTerm_preserved", "BERT_f1", 
                      "ROUGE1_f1", "Total_Issues", "Critical", "Major", "Minor",
                      "Combined", "ORT", "Pass_Rate"]

print("\nPipeline-level correlations (N=38 pipelines):")
print(f"\n{'Metric':<25} {'vs Total_Issues':>15} {'vs Critical':>12} {'vs Major':>12} {'vs Minor':>12}")
print("-" * 80)

for metric in ["KeyTerm_total", "KeyTerm_preserved", "BERT_f1", "ROUGE1_f1"]:
    r_total, p_total = spearmanr(pipe_stats[metric], pipe_stats["Total_Issues"])
    r_crit, _ = spearmanr(pipe_stats[metric], pipe_stats["Critical"])
    r_major, _ = spearmanr(pipe_stats[metric], pipe_stats["Major"])
    r_minor, _ = spearmanr(pipe_stats[metric], pipe_stats["Minor"])
    
    sig = "***" if p_total < 0.001 else "**" if p_total < 0.01 else "*" if p_total < 0.05 else ""
    print(f"{metric:<25} {r_total:>+.3f}{sig:<3} {r_crit:>+.3f} {r_major:>+.3f} {r_minor:>+.3f}")

# ============================================================================
# INVESTIGATION 2: PASSED VS FAILED - THE REAL STORY
# ============================================================================

print("\n" + "=" * 120)
print("INVESTIGATION 2: PASSED VS FAILED - DETAILED BREAKDOWN")
print("=" * 120)

print("""
From B5, we saw:
- Passed runs have HIGHER S1_Sem scores (as expected)
- But B2 showed higher S1_Sem → MORE issues (counterintuitive)

Let's break this down further...
""")

df_passed = df_p2d[df_p2d["Passed"] == True].copy()
df_failed = df_p2d[df_p2d["Passed"] == False].copy()

print(f"\nPassed runs: {len(df_passed):,}")
print(f"Failed runs: {len(df_failed):,}")

# Compare issue distributions
print("\n--- Issue Distribution by Pass/Fail Status ---")
print(f"\n{'Metric':<20} {'Passed Mean':>12} {'Passed Std':>12} {'Failed Mean':>12} {'Failed Std':>12}")
print("-" * 75)

for col in ["Total_Issues", "Critical_Issues", "Major_Issues", "Minor_Issues"]:
    p_mean, p_std = df_passed[col].mean(), df_passed[col].std()
    f_mean, f_std = df_failed[col].mean(), df_failed[col].std()
    print(f"{col:<20} {p_mean:>12.2f} {p_std:>12.2f} {f_mean:>12.2f} {f_std:>12.2f}")

# KEY INSIGHT: Are passed runs having MORE total issues but FEWER critical issues?
print("\n--- KEY INSIGHT: Issue Type Distribution ---")
print(f"\n  Passed runs: Total={df_passed['Total_Issues'].mean():.2f}, "
      f"Critical={df_passed['Critical_Issues'].mean():.2f}, "
      f"Major={df_passed['Major_Issues'].mean():.2f}, "
      f"Minor={df_passed['Minor_Issues'].mean():.2f}")

print(f"  Failed runs: Total={df_failed['Total_Issues'].mean():.2f}, "
      f"Critical={df_failed['Critical_Issues'].mean():.2f}, "
      f"Major={df_failed['Major_Issues'].mean():.2f}, "
      f"Minor={df_failed['Minor_Issues'].mean():.2f}")

# The ratio tells us if passed runs have relatively fewer CRITICAL issues
print("\n  Critical Issues as % of Total:")
print(f"    Passed: {df_passed['Critical_Issues'].sum() / df_passed['Total_Issues'].sum() * 100:.1f}%")
print(f"    Failed: {df_failed['Critical_Issues'].sum() / df_failed['Total_Issues'].sum() * 100:.1f}%")

# ============================================================================
# INVESTIGATION 3: WITHIN-GROUP CORRELATIONS
# ============================================================================

print("\n" + "=" * 120)
print("INVESTIGATION 3: CORRELATIONS WITHIN PASSED/FAILED GROUPS")
print("=" * 120)

print("""
If the overall correlation is spurious due to pass/fail status,
then WITHIN each group, the correlation should be different.
""")

print("\n--- Correlations WITHIN Passed Runs Only ---")
print(f"{'S1 Metric':<35} {'vs Total_Issues':>15} {'vs Critical':>12} {'vs Major':>12} {'vs Minor':>12}")
print("-" * 95)

key_s1_metrics = ["S1_Sem_BERT_f1", "S1_Sem_ROUGE1_f1", "S1_Sem_KeyTerm_rate", 
                  "S1_Graph_overall_score", "S1_Graph_total_issues"]

for metric in key_s1_metrics:
    if metric in df_passed.columns:
        data = df_passed[[metric, "Total_Issues", "Critical_Issues", "Major_Issues", "Minor_Issues"]].dropna()
        if len(data) > 10:
            r_total, _ = spearmanr(data[metric], data["Total_Issues"])
            r_crit, _ = spearmanr(data[metric], data["Critical_Issues"])
            r_major, _ = spearmanr(data[metric], data["Major_Issues"])
            r_minor, _ = spearmanr(data[metric], data["Minor_Issues"])
            print(f"{metric:<35} {r_total:>+.3f} {r_crit:>+.3f} {r_major:>+.3f} {r_minor:>+.3f}")

print("\n--- Correlations WITHIN Failed Runs Only ---")
print(f"{'S1 Metric':<35} {'vs Total_Issues':>15} {'vs Critical':>12} {'vs Major':>12} {'vs Minor':>12}")
print("-" * 95)

for metric in key_s1_metrics:
    if metric in df_failed.columns:
        data = df_failed[[metric, "Total_Issues", "Critical_Issues", "Major_Issues", "Minor_Issues"]].dropna()
        if len(data) > 10:
            r_total, _ = spearmanr(data[metric], data["Total_Issues"])
            r_crit, _ = spearmanr(data[metric], data["Critical_Issues"])
            r_major, _ = spearmanr(data[metric], data["Major_Issues"])
            r_minor, _ = spearmanr(data[metric], data["Minor_Issues"])
            print(f"{metric:<35} {r_total:>+.3f} {r_crit:>+.3f} {r_major:>+.3f} {r_minor:>+.3f}")

# ============================================================================
# INVESTIGATION 4: GRAPH METRIC SCALE VERIFICATION
# ============================================================================

print("\n" + "=" * 120)
print("INVESTIGATION 4: GRAPH METRIC SCALE VERIFICATION")
print("=" * 120)

print("""
The documentation in A4 says Graph metrics have range [0, 10], but actual data shows [0, 100].
Let's verify the actual scales and distributions.
""")

print("\n--- Graph Metric Distribution Analysis ---")
for col in graph_cols:
    data = df_p2d[col].dropna()
    print(f"\n{col}:")
    
    # Check if column is numeric
    if pd.api.types.is_numeric_dtype(data):
        print(f"  Range: [{data.min():.2f}, {data.max():.2f}]")
        print(f"  Mean: {data.mean():.2f}, Median: {data.median():.2f}")
        print(f"  Value distribution:")
        
        # Check if bimodal (many at 10 and many at 100)
        at_10 = (data == 10).sum()
        at_100 = (data == 100).sum()
        other = len(data) - at_10 - at_100
        
        print(f"    At 10:  {at_10:>5} ({at_10/len(data)*100:.1f}%)")
        print(f"    At 100: {at_100:>5} ({at_100/len(data)*100:.1f}%)")
        print(f"    Other:  {other:>5} ({other/len(data)*100:.1f}%)")
    else:
        # For non-numeric columns (like status), show value counts
        print(f"  Type: {data.dtype}")
        print(f"  Value counts:")
        value_counts = data.value_counts()
        for val, count in value_counts.items():
            print(f"    {val}: {count:>5} ({count/len(data)*100:.1f}%)")

# ============================================================================
# INVESTIGATION 5: S1_GRAPH_OVERALL_SCORE QUARTILE ISSUE
# ============================================================================

print("\n" + "=" * 120)
print("INVESTIGATION 5: S1_GRAPH_OVERALL_SCORE DISTRIBUTION SKEW")
print("=" * 120)

print("""
B3 showed Q4 had 4403 rows while Q1-Q3 combined had ~1261.
This means the data is heavily skewed toward high values.
Let's understand this better.
""")

data = df_p2d["S1_Graph_overall_score"].dropna()

print("\n--- S1_Graph_overall_score Distribution ---")
print(f"Total: {len(data):,}")
print(f"Range: [{data.min():.2f}, {data.max():.2f}]")
print(f"Mean: {data.mean():.2f}, Median: {data.median():.2f}")

# Bin by actual values
bins = [0, 5, 6, 7, 8, 9, 9.5, 10.01]
labels = ["<5", "5-6", "6-7", "7-8", "8-9", "9-9.5", "9.5-10"]
data_binned = pd.cut(data, bins=bins, labels=labels)

print("\n--- Distribution by Value Range ---")
print(data_binned.value_counts().sort_index())

# What's the distribution of Pass/Fail at each level?
df_p2d["Graph_bin"] = pd.cut(df_p2d["S1_Graph_overall_score"], bins=bins, labels=labels)

print("\n--- Pass Rate by Graph Score Range ---")
print(f"{'Range':<15} {'N':>8} {'Pass%':>10} {'ORT_Mean':>12}")
print("-" * 50)

for label in labels:
    subset = df_p2d[df_p2d["Graph_bin"] == label]
    if len(subset) > 0:
        print(f"{label:<15} {len(subset):>8} {subset['Passed'].mean()*100:>9.1f}% {subset['ORT_Score'].mean():>12.2f}")

# ============================================================================
# INVESTIGATION 6: WHAT DO S1 METRICS ACTUALLY MEASURE?
# ============================================================================

print("\n" + "=" * 120)
print("INVESTIGATION 6: WHAT DO S1 METRICS ACTUALLY CAPTURE?")
print("=" * 120)

print("""
S1 metrics measure the quality of Step 1: Prompt → Intermediate Representation
SAT/PCT/ORT measure the quality of Step 2: Intermediate Rep → Code

Key Question: Is there a disconnect between S1 quality and final code quality?
""")

# Check correlation between S1 scores and orchestrator
print("\n--- S1 Metrics by Orchestrator ---")
print(f"\n{'Metric':<35} {'Airflow':>12} {'Dagster':>12} {'Prefect':>12}")
print("-" * 75)

for metric in key_s1_metrics[:4]:
    if metric in df_p2d.columns:
        means = df_p2d.groupby("Orchestrator")[metric].mean()
        print(f"{metric:<35} {means.get('airflow', 0):>12.3f} {means.get('dagster', 0):>12.3f} {means.get('prefect', 0):>12.3f}")

# Check correlation between S1 scores and method
print("\n--- S1 Metrics by P2D Method ---")
print(f"\n{'Metric':<35} {'Template':>12} {'LLM':>12} {'Hybrid':>12}")
print("-" * 75)

for metric in key_s1_metrics[:4]:
    if metric in df_p2d.columns:
        means = df_p2d.groupby("Method")[metric].mean()
        print(f"{metric:<35} {means.get('Prompt2DAG (Template)', 0):>12.3f} "
              f"{means.get('Prompt2DAG (LLM)', 0):>12.3f} "
              f"{means.get('Prompt2DAG (Hybrid)', 0):>12.3f}")

# ============================================================================
# INVESTIGATION 7: PARTIAL CORRELATIONS CONTROLLING FOR PASS STATUS
# ============================================================================

print("\n" + "=" * 120)
print("INVESTIGATION 7: PARTIAL CORRELATIONS (Controlling for Confounders)")
print("=" * 120)

print("""
If Pass/Fail status is a confounder, we should compute partial correlations
controlling for it.
""")

from scipy.stats import pearsonr

def partial_correlation(df, x, y, control):
    """Compute partial correlation of x and y controlling for 'control'"""
    # Residualize x
    mask = df[[x, y, control]].notna().all(axis=1)
    df_clean = df[mask]
    
    # Correlation of x with control
    r_x_c, _ = pearsonr(df_clean[x], df_clean[control])
    # Correlation of y with control  
    r_y_c, _ = pearsonr(df_clean[y], df_clean[control])
    # Correlation of x with y
    r_x_y, _ = pearsonr(df_clean[x], df_clean[y])
    
    # Partial correlation formula
    numerator = r_x_y - (r_x_c * r_y_c)
    denominator = np.sqrt((1 - r_x_c**2) * (1 - r_y_c**2))
    
    if denominator > 0:
        return numerator / denominator
    return np.nan

print("\n--- Partial Correlations: S1 vs Issues, Controlling for Passed ---")
print(f"\n{'S1 Metric':<35} {'Raw r':>10} {'Partial r':>12} {'Difference':>12}")
print("-" * 75)

# Convert Passed to numeric
df_p2d["Passed_num"] = df_p2d["Passed"].astype(int)

for metric in key_s1_metrics[:4]:
    if metric in df_p2d.columns:
        # Raw correlation
        data = df_p2d[[metric, "Total_Issues"]].dropna()
        raw_r, _ = pearsonr(data[metric], data["Total_Issues"])
        
        # Partial correlation controlling for Passed
        partial_r = partial_correlation(df_p2d, metric, "Total_Issues", "Passed_num")
        
        diff = partial_r - raw_r if not np.isnan(partial_r) else np.nan
        
        print(f"{metric:<35} {raw_r:>+10.3f} {partial_r:>+12.3f} {diff:>+12.3f}")

# ============================================================================
# INVESTIGATION 8: THE TRUE RELATIONSHIP
# ============================================================================

print("\n" + "=" * 120)
print("INVESTIGATION 8: THE TRUE CAUSAL STORY")
print("=" * 120)

print("""
Based on investigations, here's the likely TRUE story:

CAUSAL MODEL:
=============

  Pipeline Complexity
         |
         v
  +------+------+
  |             |
  v             v
S1 Scores    Final Code
(Higher)     (More tasks)
                |
                v
          More Minor Issues
          (more code = more lint)
                |
                v
          BUT: Better Pass Rate
          (correct structure)

The correlation between S1 and Issues is SPURIOUS because:
1. Complex pipelines → higher S1 scores (more content to match)
2. Complex pipelines → more code → more minor issues
3. But complex, well-formed pipelines → PASS (despite having more minor issues)

VERIFICATION: Check if Pass Rate differs by issue type
""")

# Verify: Pass rate by issue type
print("\n--- Pass Rate by Issue Type Thresholds ---")

print("\n  By Critical Issues:")
for thresh in [0, 1, 2, 3]:
    subset = df_p2d[df_p2d["Critical_Issues"] >= thresh]
    if len(subset) > 0:
        print(f"    Critical >= {thresh}: {len(subset):>5} rows, Pass Rate: {subset['Passed'].mean()*100:.1f}%")

print("\n  By Major Issues:")
for thresh in [0, 1, 2, 3, 4]:
    subset = df_p2d[df_p2d["Major_Issues"] >= thresh]
    if len(subset) > 0:
        print(f"    Major >= {thresh}: {len(subset):>5} rows, Pass Rate: {subset['Passed'].mean()*100:.1f}%")

print("\n  By Minor Issues:")
for thresh in [0, 2, 4, 6, 8]:
    subset = df_p2d[df_p2d["Minor_Issues"] >= thresh]
    if len(subset) > 0:
        print(f"    Minor >= {thresh}: {len(subset):>5} rows, Pass Rate: {subset['Passed'].mean()*100:.1f}%")

# ============================================================================
# INVESTIGATION 9: CORRECTED CORRELATIONS
# ============================================================================

print("\n" + "=" * 120)
print("INVESTIGATION 9: CORRECTED ANALYSIS - S1 VS CRITICAL ISSUES ONLY")
print("=" * 120)

print("""
Since Critical Issues are what actually determines Pass/Fail,
let's focus on S1 vs Critical Issues correlation.
""")

print("\n--- S1 Metrics vs Critical Issues (The Correct Relationship) ---")
print(f"\n{'S1 Metric':<45} {'All Runs':>12} {'Passed Only':>12} {'Failed Only':>12}")
print("-" * 90)

for metric in s1_cols:
    if metric in df_p2d.columns:
        # All runs
        data_all = df_p2d[[metric, "Critical_Issues"]].dropna()
        r_all, _ = spearmanr(data_all[metric], data_all["Critical_Issues"]) if len(data_all) > 10 else (np.nan, 1)
        
        # Passed only
        data_p = df_passed[[metric, "Critical_Issues"]].dropna()
        r_p, _ = spearmanr(data_p[metric], data_p["Critical_Issues"]) if len(data_p) > 10 else (np.nan, 1)
        
        # Failed only
        data_f = df_failed[[metric, "Critical_Issues"]].dropna()
        r_f, _ = spearmanr(data_f[metric], data_f["Critical_Issues"]) if len(data_f) > 10 else (np.nan, 1)
        
        print(f"{metric:<45} {r_all:>+12.3f} {r_p:>+12.3f} {r_f:>+12.3f}")

# ============================================================================
# SUMMARY AND RECOMMENDATIONS
# ============================================================================

print("\n" + "=" * 120)
print("SUMMARY: FINDINGS AND RECOMMENDATIONS")
print("=" * 120)

print("""
KEY FINDINGS:
=============

1. THE ANOMALY EXPLAINED:
   - Higher S1 scores correlating with MORE total issues is a SPURIOUS correlation
   - Caused by: Passed runs have both higher S1 AND more issues (mostly minor)
   - Critical Issues correlation with S1 is NEGATIVE (as expected)

2. ISSUE TYPE MATTERS:
   - Critical Issues: Determine Pass/Fail, negatively correlated with S1
   - Major Issues: Mixed relationship
   - Minor Issues: Positively correlated with S1 (more code = more lint warnings)

3. GRAPH METRIC SCALE:
   - Actual range is [0, 100], not [0, 10] as documented
   - S1_Graph_overall_score is on [0, 10] scale
   - Sub-scores (Structural_Integrity, etc.) are on [0, 100] scale

4. S1_GRAPH_OVERALL_SCORE:
   - Highly skewed: 78% of runs have score >= 9.5
   - This is good news: Step 1 generally produces valid graph structures
   - Low scores strongly predict failure

CORRECTED INTERPRETATION:
========================

For your paper, focus on:

1. S1_Graph_overall_score vs Pass Rate:
   - Strong positive relationship (r = 0.415)
   - Use as quality gate: score < 7 → likely failure

2. S1_Sem metrics vs Pass Rate:
   - Moderate positive relationship (r = 0.2-0.4)
   - Higher semantic fidelity → better final code

3. S1 vs Critical Issues (the TRUE quality relationship):
   - Negative correlation as expected
   - Higher S1 → fewer critical issues

4. Ignore S1 vs Minor Issues correlation:
   - This is a complexity artifact, not a quality signal

RECOMMENDATIONS:
================

1. Report S1 vs Pass Rate (clear positive relationship)
2. Report S1 vs Critical Issues (clear negative relationship)
3. DO NOT report S1 vs Total Issues (misleading due to confounding)
4. Fix documentation: Graph sub-scores are [0, 100], not [0, 10]
5. Note that S1_Graph_overall_score > 9 for ~78% of successful runs
""")

# ============================================================================
# CORRECTED CORRELATION TABLE
# ============================================================================

print("\n" + "=" * 120)
print("CORRECTED CORRELATION TABLE FOR PAPER")
print("=" * 120)

print("""
Use these correlations in your paper:
""")

print(f"\n{'S1 Metric':<40} {'vs Pass':>10} {'vs ORT':>10} {'vs Critical':>12}")
print("-" * 80)

for metric in s1_cols:
    if metric in df_p2d.columns:
        data = df_p2d[[metric, "Passed", "ORT_Score", "Critical_Issues"]].dropna()
        if len(data) > 10:
            r_pass, p_pass = spearmanr(data[metric], data["Passed"])
            r_ort, p_ort = spearmanr(data[metric], data["ORT_Score"])
            r_crit, p_crit = spearmanr(data[metric], data["Critical_Issues"])
            
            sig_pass = "***" if p_pass < 0.001 else "**" if p_pass < 0.01 else "*" if p_pass < 0.05 else ""
            sig_ort = "***" if p_ort < 0.001 else "**" if p_ort < 0.01 else "*" if p_ort < 0.05 else ""
            sig_crit = "***" if p_crit < 0.001 else "**" if p_crit < 0.01 else "*" if p_crit < 0.05 else ""
            
            print(f"{metric:<40} {r_pass:>+.3f}{sig_pass:<3} {r_ort:>+.3f}{sig_ort:<3} {r_crit:>+.3f}{sig_crit:<3}")

print("\n" + "=" * 120)
print("INVESTIGATION COMPLETE")
print("=" * 120)

S1 METRICS CONSISTENCY INVESTIGATION

Loaded 5,664 Prompt2DAG rows

COMPUTING ORT SCORES

Penalty weights:
  Critical issues: α = 2.0
  Major issues:    β = 1.0
  Minor issues:    γ = 0.25

ORT Score Statistics:
  ORT_raw range:    [-10.50, 7.69]
  ORT_capped range: [0.00, 7.69]
  ORT_scaled range: [0.00, 10.00]

INVESTIGATION 1: WHY DO HIGHER S1 SEMANTIC SCORES CORRELATE WITH MORE ISSUES?

HYPOTHESIS: Pipeline Complexity is a Confounding Variable

Theory:
- More complex pipelines have more tasks/nodes
- More tasks → more key terms in prompt → higher S1_Sem_KeyTerm scores
- More tasks → more potential for issues → higher issue counts
- So the correlation is SPURIOUS, driven by complexity, not quality


--- Analysis by Pipeline Complexity ---

Pipeline-level correlations (N=38 pipelines):

Metric                    vs Total_Issues  vs Critical     vs Major     vs Minor
--------------------------------------------------------------------------------
KeyTerm_total             -0.306    +0