# Steering Analysis Results: Statistical Significance Testing

This notebook analyzes all human-model comparison metrics from the comprehensive steering experiments, calculating p-values and statistical significance for model performance differences.

## Overview:
- **Data Source**: JSON metrics files from `outputs/feature_classification/metrics/`
- **Statistical Tests**: Two-sided binomial tests of accuracy against chance level (0.5); 95% confidence intervals for phi correlations via the Fisher z-transformation
- **Output**: Comprehensive table with p-values and significance indicators

In [None]:
# Import Required Libraries
import json
import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats
from scipy.stats import binom, norm
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style: matplotlib's 'default' style sheet first, then
# seaborn's "husl" colour palette on top of it.
# NOTE(review): the order looks deliberate -- presumably applying the style
# sheet after the palette would reset the colour cycle; confirm before
# reordering these two calls.
plt.style.use('default')
sns.set_palette("husl")

# Visible confirmation that the import/setup cell ran to completion.
print("Libraries imported successfully!")

In [None]:
# Load Metrics Files
# The directory is resolved relative to the notebook's working directory;
# the absolute path is printed so a wrong working directory is easy to spot.
metrics_dir = Path('../outputs/feature_classification/metrics')
print(f"Looking for metrics files in: {metrics_dir.absolute()}")

# Find all JSON metrics files.
# glob() yields entries in arbitrary, filesystem-dependent order, so the list
# is sorted once here: every later cell (table rows, bar order in the plots)
# then sees the experiments in the same order on every run.
metrics_files = sorted(metrics_dir.glob('*_metrics.json'))
print(f"Found {len(metrics_files)} metrics files:")
for metrics_path in metrics_files:
    print(f"  - {metrics_path.name}")

if not metrics_files:
    print("\n⚠️  No metrics files found! Make sure to run the steering analysis experiments first.")
    print("   Run: python src/fsrl/scripts/alignment_steering_analysis.py --run_all_experiments")
else:
    print(f"\n✅ Ready to analyze {len(metrics_files)} experiment results!")

In [None]:
# Parse and Structure Data

# Column order of the results DataFrame. Passing it explicitly guarantees the
# frame has these columns even when no metrics file could be loaded.
RECORD_COLUMNS = [
    'filename', 'mode', 'model_name', 'samples', 'accuracy', 'phi_correlation',
    'tp', 'tn', 'fp', 'fn', 'timestamp', 'human_labels_path',
    'model_classifications_path',
]


def _parse_experiment_name(stem):
    """Split a metrics-file stem into ``(mode, model_name)``.

    Expected stem format: ``<human part>_vs_<model part>_metrics``, e.g.
    ``12-gemmascope-res-65k__l0-21_human_labels_alignment_vs_``
    ``12-gemmascope-res-65k__l0-21_alignment_classified_deepseek-v3-0324_metrics``.

    Args:
        stem: File name without directory and extension.

    Returns:
        A ``(mode, model_name)`` tuple. ``mode`` is ``'alignment'``,
        ``'formatting'`` or ``'unknown'``; ``model_name`` is the text after
        the last ``_classified_`` marker of the model part (the whole model
        part when the marker is absent). A stem that does not split into
        exactly two parts on ``_vs_`` yields ``('unknown', stem)``.
    """
    parts = stem.replace('_metrics', '').split('_vs_')
    if len(parts) != 2:
        return 'unknown', stem

    human_part, model_part = parts
    if '_alignment' in human_part:
        mode = 'alignment'
    elif '_formatting' in human_part:
        mode = 'formatting'
    else:
        mode = 'unknown'

    # split() returns the whole string when the marker is missing, so this
    # covers both the "classified" and the plain naming scheme.
    model_name = model_part.split('_classified_')[-1]
    return mode, model_name


all_metrics = []

for metrics_file in metrics_files:
    try:
        with metrics_file.open('r', encoding='utf-8') as f:
            data = json.load(f)

        # Extract key information from filename
        filename = metrics_file.stem
        mode, model_name = _parse_experiment_name(filename)

        # Structure the data; missing counts default to 0, missing scores to None
        all_metrics.append({
            'filename': filename,
            'mode': mode,
            'model_name': model_name,
            'samples': data.get('samples', 0),
            'accuracy': data.get('accuracy', None),
            'phi_correlation': data.get('phi', None),
            'tp': data.get('tp', 0),
            'tn': data.get('tn', 0),
            'fp': data.get('fp', 0),
            'fn': data.get('fn', 0),
            'timestamp': data.get('timestamp', ''),
            'human_labels_path': data.get('human_labels', ''),
            'model_classifications_path': data.get('model_classifications', ''),
        })

    except Exception as e:
        # Deliberately best-effort: one unreadable or malformed file is
        # reported and skipped instead of aborting the whole analysis.
        print(f"Error processing {metrics_file.name}: {e}")

# Create DataFrame
df = pd.DataFrame(all_metrics, columns=RECORD_COLUMNS)
print(f"\nLoaded {len(df)} experiment results:")
if len(df) > 0:
    print(f"Modes: {df['mode'].value_counts().to_dict()}")
    print(f"Models: {df['model_name'].nunique()} unique models")
    print(f"Sample sizes: {df['samples'].min()} - {df['samples'].max()}")

    # Display first few rows (only when there is something to show; the
    # unguarded column selection used to fail on an empty frame)
    print("\nFirst few records:")
    display(df[['mode', 'model_name', 'samples', 'accuracy', 'phi_correlation']].head())
else:
    print("No experiment results could be loaded.")

In [None]:
# Calculate Statistical Tests
def calculate_accuracy_pvalue(tp, tn, fp, fn, null_accuracy=0.5):
    """Two-sided exact binomial p-value for the observed accuracy.

    Tests whether the number of correct classifications (``tp + tn``) out of
    all classified samples is compatible with a true accuracy of
    ``null_accuracy``.

    Args:
        tp, tn, fp, fn: Confusion-matrix counts.
        null_accuracy: Accuracy under the null hypothesis (0.5 = chance).

    Returns:
        The p-value as a float in [0, 1], or ``None`` when there are no
        samples at all.
    """
    total = tp + tn + fp + fn
    correct = tp + tn
    if total == 0:
        return None

    # Both tails must include the observed count itself:
    #   lower tail = P(X <= correct), upper tail = P(X >= correct).
    # sf(k) is P(X > k), hence the "- 1". Using 1 - cdf(correct) for the upper
    # tail would drop the observed outcome and report p = 0 for a perfect score.
    lower_tail = binom.cdf(correct, total, null_accuracy)
    upper_tail = binom.sf(correct - 1, total, null_accuracy)

    # Doubling the smaller tail can exceed 1 near the centre of the
    # distribution, so cap the result.
    return float(min(1.0, 2 * min(lower_tail, upper_tail)))

def calculate_phi_confidence_interval(phi, n, confidence=0.95):
    """Confidence interval for a phi coefficient via the Fisher z-transformation.

    Args:
        phi: Observed phi coefficient, or ``None``.
        n: Number of samples the coefficient was computed from.
        confidence: Coverage of the interval (0.95 = 95% CI).

    Returns:
        ``(lower, upper)`` as floats, or ``(None, None)`` when no interval can
        be computed: ``phi`` is missing/NaN, ``|phi| >= 0.999`` (the transform
        diverges towards +/-1), or ``n <= 3`` (the standard error
        ``1 / sqrt(n - 3)`` is undefined).
    """
    if phi is None or n <= 3:
        return None, None
    # Written as "not <" so that a NaN phi is rejected as well.
    if not abs(phi) < 0.999:
        return None, None

    # Fisher z-transformation: z = atanh(phi) = 0.5 * ln((1 + phi) / (1 - phi))
    z = np.arctanh(phi)
    se = 1 / np.sqrt(n - 3)
    alpha = 1 - confidence
    z_crit = norm.ppf(1 - alpha / 2)

    # Transform the interval bounds back to the correlation scale;
    # tanh is the inverse of the Fisher transform.
    ci_lower = np.tanh(z - z_crit * se)
    ci_upper = np.tanh(z + z_crit * se)

    return float(ci_lower), float(ci_upper)

def significance_level(p_value):
    """Map a p-value to the usual star notation.

    Returns '***' below 0.001, '**' below 0.01, '*' below 0.05 and an empty
    string otherwise, including for a missing (None / NaN) p-value.
    """
    is_missing = p_value is None or pd.isna(p_value)
    if is_missing:
        return ''

    # Thresholds are checked from strictest to loosest; the first hit wins.
    for cutoff, stars in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
        if p_value < cutoff:
            return stars
    return ''

# Apply statistical tests
# One two-sided binomial p-value per experiment, computed from its confusion
# matrix. Missing results (no samples) become NaN so the column stays numeric.
df['accuracy_pvalue'] = pd.Series(
    [
        calculate_accuracy_pvalue(tp, tn, fp, fn)
        for tp, tn, fp, fn in zip(df['tp'], df['tn'], df['fp'], df['fn'])
    ],
    index=df.index,
    dtype=float,
)

# One phi confidence interval per experiment. The bounds are stored as float
# columns (NaN = no interval) rather than object columns holding None, so
# later numeric operations such as the error bars work on plain floats.
phi_intervals = [
    calculate_phi_confidence_interval(phi, n)
    for phi, n in zip(df['phi_correlation'], df['samples'])
]
df['phi_ci_lower'] = pd.Series([lower for lower, _ in phi_intervals], index=df.index, dtype=float)
df['phi_ci_upper'] = pd.Series([upper for _, upper in phi_intervals], index=df.index, dtype=float)

# Add significance indicators
df['accuracy_sig'] = df['accuracy_pvalue'].apply(significance_level)
# A correlation is marked significant when its CI excludes zero on EITHER
# side (entirely above or entirely below), matching the legend printed with
# the results table. Comparisons with NaN are False, so experiments without
# an interval stay unmarked.
df['phi_significant'] = [
    '✓' if (lower > 0 or upper < 0) else ''
    for lower, upper in zip(df['phi_ci_lower'], df['phi_ci_upper'])
]

print("Statistical tests completed!")
print(f"Significant accuracy results: {(df['accuracy_sig'] != '').sum()}/{len(df)}")
print(f"Significant phi correlations: {(df['phi_significant'] == '✓').sum()}/{len(df)}")

In [None]:
# Create Comprehensive Results Table
def format_number(x, decimals=3):
    """Render *x* with a fixed number of decimals; missing values become 'N/A'."""
    is_missing = pd.isna(x) or x is None
    return 'N/A' if is_missing else format(x, f'.{decimals}f')

def format_ci(lower, upper, decimals=3):
    """Render an interval as '[lower, upper]'; 'N/A' if either bound is missing."""
    if any(pd.isna(bound) or bound is None for bound in (lower, upper)):
        return 'N/A'
    spec = f'.{decimals}f'
    return '[' + format(lower, spec) + ', ' + format(upper, spec) + ']'

# Prepare display table
# Sorted copy of the results: modes in ascending order and, within each mode,
# the most accurate model first. `df` itself is left untouched.
results_table = df.copy().sort_values(['mode', 'accuracy'], ascending=[True, False])


def _confusion_summary(row):
    """Render the four confusion-matrix counts of one row as a single string."""
    return f'TP:{row["tp"]}, TN:{row["tn"]}, FP:{row["fp"]}, FN:{row["fn"]}'


# Create formatted columns: each score is followed by its significance marker
accuracy_text = results_table['accuracy'].apply(lambda value: format_number(value, 3))
results_table['Accuracy'] = accuracy_text + ' ' + results_table['accuracy_sig']

phi_text = results_table['phi_correlation'].apply(lambda value: format_number(value, 3))
results_table['Phi (r)'] = phi_text + ' ' + results_table['phi_significant']

# format_number already maps a missing p-value to 'N/A'
results_table['Accuracy p-value'] = results_table['accuracy_pvalue'].apply(
    lambda p: format_number(p, 4)
)
results_table['Phi 95% CI'] = results_table.apply(
    lambda row: format_ci(row['phi_ci_lower'], row['phi_ci_upper']),
    axis=1,
)
results_table['Confusion Matrix'] = results_table.apply(_confusion_summary, axis=1)

# Select columns for display: source column -> heading shown in the table
column_titles = {
    'mode': 'Mode',
    'model_name': 'Model',
    'samples': 'N',
    'Accuracy': 'Accuracy',
    'Accuracy p-value': 'Acc p-value',
    'Phi (r)': 'Phi Correlation',
    'Phi 95% CI': 'Phi 95% CI',
    'Confusion Matrix': 'Confusion Matrix',
}
display_columns = list(column_titles)

final_table = results_table[display_columns].rename(columns=column_titles)

banner = "=" * 80
print("📊 COMPREHENSIVE STEERING ANALYSIS RESULTS")
print(banner)
print("Statistical Significance: *** p<0.001, ** p<0.01, * p<0.05")
print("Phi Correlation: ✓ indicates 95% CI excludes zero")
print(banner)

# Display with nice formatting: show every column, no width limit,
# cell text capped at 30 characters
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 30),
):
    pd.set_option(option_name, option_value)

display(final_table)

In [None]:
# Visualize Model Performance
def _experiment_tick_labels(frame):
    """Build one x-tick label per row of *frame*.

    Each label is the experiment mode, a line break, and the model name
    truncated to 15 characters so that neighbouring labels do not overlap.
    """
    return [
        f"{mode}\n{model_name[:15]}"
        for mode, model_name in zip(frame['mode'], frame['model_name'])
    ]


if len(df) > 0:
    # 2x2 overview figure: accuracy, phi correlation, accuracy-vs-phi, sample sizes
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Steering Analysis Results: Model Performance Overview', fontsize=16, fontweight='bold')

    # 1. Accuracy by Mode and Model
    ax1 = axes[0, 0]
    df_plot = df[df['accuracy'].notna()]
    if len(df_plot) > 0:
        positions = range(len(df_plot))
        # Color by significance
        colors = ['red' if sig != '' else 'lightblue' for sig in df_plot['accuracy_sig']]
        ax1.bar(positions, df_plot['accuracy'], color=colors, alpha=0.7)
        ax1.set_xticks(positions)
        ax1.set_xticklabels(_experiment_tick_labels(df_plot), rotation=45, ha='right')
        ax1.set_ylabel('Accuracy')
        ax1.set_title('Accuracy by Model (Red = Significant)')
        ax1.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5, label='Chance level')
        ax1.legend()

        # Add significance annotations (stars) just above each significant bar
        for i, (accuracy, sig) in enumerate(zip(df_plot['accuracy'], df_plot['accuracy_sig'])):
            if sig:
                ax1.annotate(sig, (i, accuracy + 0.01), ha='center', fontweight='bold')

    # 2. Phi Correlation by Mode and Model
    ax2 = axes[0, 1]
    df_phi = df[df['phi_correlation'].notna()]
    if len(df_phi) > 0:
        positions = range(len(df_phi))
        # Coerce to float so the error-bar arithmetic never runs on
        # object-dtype columns (the CI bounds may hold None for missing).
        phi_values = df_phi['phi_correlation'].astype(float)
        # A missing bound falls back to phi itself, i.e. a zero-length error bar.
        ci_lower = pd.to_numeric(df_phi['phi_ci_lower'], errors='coerce').fillna(phi_values)
        ci_upper = pd.to_numeric(df_phi['phi_ci_upper'], errors='coerce').fillna(phi_values)

        # Color by significance
        colors = ['red' if sig == '✓' else 'lightblue' for sig in df_phi['phi_significant']]
        ax2.bar(positions, phi_values, color=colors, alpha=0.7)

        # Add error bars for confidence intervals (asymmetric: distance from
        # phi down to the lower bound and up to the upper bound)
        errors_lower = (phi_values - ci_lower).to_numpy()
        errors_upper = (ci_upper - phi_values).to_numpy()
        ax2.errorbar(positions, phi_values,
                     yerr=[errors_lower, errors_upper], fmt='none', color='black', capsize=3, alpha=0.7)

        ax2.set_xticks(positions)
        ax2.set_xticklabels(_experiment_tick_labels(df_phi), rotation=45, ha='right')
        ax2.set_ylabel('Phi Correlation')
        ax2.set_title('Phi Correlation with 95% CI (Red = Significant)')
        ax2.axhline(y=0, color='gray', linestyle='--', alpha=0.5)

    # 3. Accuracy vs Phi Correlation Scatter
    ax3 = axes[1, 0]
    df_scatter = df[(df['accuracy'].notna()) & (df['phi_correlation'].notna())]
    if len(df_scatter) > 0:
        # One scatter series per mode so each mode gets its own colour and legend entry
        for mode in df_scatter['mode'].unique():
            mode_data = df_scatter[df_scatter['mode'] == mode]
            ax3.scatter(mode_data['accuracy'], mode_data['phi_correlation'],
                        label=mode, alpha=0.7, s=80)

        ax3.set_xlabel('Accuracy')
        ax3.set_ylabel('Phi Correlation')
        ax3.set_title('Accuracy vs Phi Correlation by Mode')
        ax3.legend()
        ax3.grid(True, alpha=0.3)

    # 4. Sample Sizes (df is known to be non-empty in this branch)
    ax4 = axes[1, 1]
    sample_counts = df.groupby(['mode', 'model_name'])['samples'].first().reset_index()
    positions = range(len(sample_counts))
    ax4.bar(positions, sample_counts['samples'], alpha=0.7, color='lightgreen')
    ax4.set_xticks(positions)
    ax4.set_xticklabels(_experiment_tick_labels(sample_counts), rotation=45, ha='right')
    ax4.set_ylabel('Sample Size')
    ax4.set_title('Sample Sizes by Experiment')

    plt.tight_layout()
    plt.show()

    # Summary statistics per mode
    print("\n📈 SUMMARY STATISTICS")
    print("=" * 40)
    summary_stats = df.groupby('mode').agg({
        'accuracy': ['count', 'mean', 'std', 'min', 'max'],
        'phi_correlation': ['mean', 'std', 'min', 'max'],
        'samples': ['mean', 'min', 'max']
    }).round(3)

    display(summary_stats)

else:
    print("No data available for visualization.")

In [None]:
# Export Results
def _best_result(frame, metric):
    """Describe the row of *frame* with the highest value of *metric*.

    Returns a dict with keys 'value', 'model' and 'mode'. All three are None
    when every value of the metric is missing, so the JSON summary never
    contains a bare NaN token.
    """
    if frame[metric].isna().all():
        return {'value': None, 'model': None, 'mode': None}

    best_idx = frame[metric].idxmax()  # skips missing values
    return {
        'value': float(frame.loc[best_idx, metric]),
        'model': frame.loc[best_idx, 'model_name'],
        'mode': frame.loc[best_idx, 'mode'],
    }


if len(df) > 0:
    # Save comprehensive results
    output_dir = Path('../outputs/feature_classification/analysis')
    output_dir.mkdir(parents=True, exist_ok=True)

    # One timestamp shared by all three files of this export
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Full results with all columns
    full_results_file = output_dir / f'steering_analysis_full_results_{timestamp}.csv'
    df.to_csv(full_results_file, index=False)

    # Summary table
    summary_file = output_dir / f'steering_analysis_summary_{timestamp}.csv'
    final_table.to_csv(summary_file, index=False)

    # JSON summary for programmatic access.
    # Counts are converted to built-in int explicitly: pandas/numpy integer
    # scalars are not JSON-serialisable and would otherwise fall through to
    # `default=str` and be written as strings ("3" instead of 3).
    json_summary = {
        'timestamp': timestamp,
        'total_experiments': len(df),
        'modes': {str(mode): int(count) for mode, count in df['mode'].value_counts().items()},
        'models': int(df['model_name'].nunique()),
        'significant_accuracy': int((df['accuracy_sig'] != '').sum()),
        'significant_phi': int((df['phi_significant'] == '✓').sum()),
        'best_accuracy': _best_result(df, 'accuracy'),
        'best_phi': _best_result(df, 'phi_correlation'),
    }

    json_file = output_dir / f'steering_analysis_summary_{timestamp}.json'
    with json_file.open('w', encoding='utf-8') as f:
        # default=str is kept only as a last-resort fallback for unexpected types
        json.dump(json_summary, f, indent=2, default=str)

    print(f"\n💾 RESULTS EXPORTED")
    print(f"Full results: {full_results_file}")
    print(f"Summary table: {summary_file}")
    print(f"JSON summary: {json_file}")

    # Display final summary
    print(f"\n🎯 KEY FINDINGS:")
    if json_summary['best_accuracy']['model']:
        print(f"Best Accuracy: {json_summary['best_accuracy']['value']:.3f} ({json_summary['best_accuracy']['model']}, {json_summary['best_accuracy']['mode']})")
    if json_summary['best_phi']['model']:
        print(f"Best Phi Correlation: {json_summary['best_phi']['value']:.3f} ({json_summary['best_phi']['model']}, {json_summary['best_phi']['mode']})")
    print(f"Statistically Significant Results: {json_summary['significant_accuracy']}/{len(df)} (accuracy), {json_summary['significant_phi']}/{len(df)} (phi)")

else:
    print("\n⚠️  No results to export. Run steering analysis experiments first.")