In [7]:
"""
Statistical Significance, Stability, and Interpretability Analysis
Run this after Stage 1 and Stage 2 are complete
"""

import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
import re

# Read the run configuration from config.json
with open('config.json') as config_file:
    CONFIG = json.load(config_file)

# Dataset slug: the dataset file name without its extension, with every run
# of non-word characters collapsed to a single underscore
dataset_file = os.path.basename(CONFIG["dataset_path"])
dataset_slug = re.sub(r'\W+', '_', os.path.splitext(dataset_file)[0])

# Input/output locations, all nested under metrics/<dataset_slug>
base_dir = os.path.join("metrics", dataset_slug)
fold_metrics_dir, analysis_dir = (
    os.path.join(base_dir, sub_dir)
    for sub_dir in ("fold_wise", "statistical_analysis")
)
os.makedirs(analysis_dir, exist_ok=True)

# Three-line banner
print("="*80, "STATISTICAL ANALYSIS SUITE", "="*80, sep="\n")

# =============================================================================
# 1. LOAD DATA
# =============================================================================
print("\n[1] Loading data...")

# Per-fold metric tables (read in order: Stage 1, then Stage 2)
stage1_fold_metrics, stage2_fold_metrics = (
    pd.read_csv(os.path.join(fold_metrics_dir, csv_name))
    for csv_name in ("stage1_fold_metrics.csv", "stage2_fold_metrics.csv")
)

# Per-configuration result tables
stage1_results = pd.read_csv(os.path.join(base_dir, CONFIG['results_csv_path']))
stage2_results = pd.read_csv(os.path.join(base_dir, "best_meta_model_results.csv"))

# Baseline metrics are optional; None marks their absence
baseline_path = os.path.join(CONFIG["predictions_dir"], dataset_slug, "baseline_model_metrics.csv")
baseline_metrics = None
if not os.path.exists(baseline_path):
    print("⚠ No baseline metrics found")
else:
    baseline_metrics = pd.read_csv(baseline_path, index_col=0)
    print(f"✓ Loaded baseline metrics")

# Row counts of everything that was loaded
print(f"✓ Stage 1 configurations: {len(stage1_results)}")
print(f"✓ Stage 2 configurations: {len(stage2_results)}")
print(f"✓ Stage 1 fold records: {len(stage1_fold_metrics)}")
print(f"✓ Stage 2 fold records: {len(stage2_fold_metrics)}")

# =============================================================================
# 2. STATISTICAL SIGNIFICANCE TESTS
# =============================================================================
print("\n" + "="*80)
print("[2] STATISTICAL SIGNIFICANCE TESTING")
print("="*80)

# One dict per test; written to significance_tests.csv at the end of the section
significance_results = []


def _ttest_mean_greater_than_zero(r2_folds):
    """One-sample t-test of H0: mean R² = 0 against H1: mean R² > 0.

    Args:
        r2_folds: 1-D sequence of per-fold R² values.

    Returns:
        (t_statistic, one_tailed_p_value). Both are NaN when fewer than two
        values are supplied or the statistic is undefined.
    """
    if len(r2_folds) < 2:
        return np.nan, np.nan
    t_value, p_two_sided = stats.ttest_1samp(r2_folds, 0)
    if np.isnan(t_value):
        return t_value, np.nan
    # ttest_1samp reports a two-sided p-value. Halving it is only the correct
    # "greater than" p-value when t is positive; for t <= 0 the upper tail is
    # the complement, otherwise a strongly negative R² would look significant.
    if t_value > 0:
        return t_value, p_two_sided / 2
    return t_value, 1 - p_two_sided / 2


# 2.1: Test if models are significantly better than baseline (R² > 0)
print("\n2.1: One-Sample t-test (H0: R² = 0)")
print("-" * 60)

# Best Stage 1 model: the configuration with the highest Test_R2
best_stage1 = stage1_results.loc[stage1_results['Test_R2'].idxmax()]
stage1_config = f"{best_stage1['FS_Method']}_{best_stage1['Model']}_Top{int(best_stage1['Features_Used'])}"
# Per-fold R² of that configuration (matched on FS_Method, Model, Features_Used)
stage1_r2_folds = stage1_fold_metrics[
    (stage1_fold_metrics['FS_Method'] == best_stage1['FS_Method']) &
    (stage1_fold_metrics['Model'] == best_stage1['Model']) &
    (stage1_fold_metrics['Features_Used'] == best_stage1['Features_Used'])
]['R2'].values

t_stat, p_value_one_tailed = _ttest_mean_greater_than_zero(stage1_r2_folds)

print(f"\nBest Stage 1: {stage1_config}")
print(f"  Mean R²: {np.mean(stage1_r2_folds):.4f} ± {np.std(stage1_r2_folds, ddof=1):.4f}")
print(f"  t-statistic: {t_stat:.4f}")
print(f"  p-value (one-tailed): {p_value_one_tailed:.6f}")
print(f"  Result: {'✓ Significant (p < 0.05)' if p_value_one_tailed < 0.05 else '✗ Not significant'}")

significance_results.append({
    'Test': 'One-sample t-test',
    'Comparison': 'Best Stage 1 vs R²=0',
    'Model': stage1_config,
    'Mean_R2': np.mean(stage1_r2_folds),
    'Std_R2': np.std(stage1_r2_folds, ddof=1),
    't_statistic': t_stat,
    'p_value': p_value_one_tailed,
    'Significant': p_value_one_tailed < 0.05
})

# Best Stage 2 model: the configuration with the highest Test_R2
best_stage2 = stage2_results.loc[stage2_results['Test_R2'].idxmax()]
stage2_config = f"{best_stage2['Method']}_TopK{int(best_stage2['Top_K'])}"

if best_stage2['Method'] == 'FS_MetaModel':
    # FS_MetaModel rows are matched on the full configuration key
    stage2_r2_folds = stage2_fold_metrics[
        (stage2_fold_metrics['Method'] == best_stage2['Method']) &
        (stage2_fold_metrics['Meta_Model'] == best_stage2['Meta_Model']) &
        (stage2_fold_metrics['FS_Method'] == best_stage2['FS_Method']) &
        (stage2_fold_metrics['Top_K'] == best_stage2['Top_K']) &
        (stage2_fold_metrics['Best_n'] == best_stage2['Best_n'])
    ]['R2'].values
else:
    # Every other method is matched on Method and Top_K only
    stage2_r2_folds = stage2_fold_metrics[
        (stage2_fold_metrics['Method'] == best_stage2['Method']) &
        (stage2_fold_metrics['Top_K'] == best_stage2['Top_K'])
    ]['R2'].values

t_stat, p_value_one_tailed = _ttest_mean_greater_than_zero(stage2_r2_folds)

print(f"\nBest Stage 2: {stage2_config}")
print(f"  Mean R²: {np.mean(stage2_r2_folds):.4f} ± {np.std(stage2_r2_folds, ddof=1):.4f}")
print(f"  t-statistic: {t_stat:.4f}")
print(f"  p-value (one-tailed): {p_value_one_tailed:.6f}")
print(f"  Result: {'✓ Significant (p < 0.05)' if p_value_one_tailed < 0.05 else '✗ Not significant'}")

significance_results.append({
    'Test': 'One-sample t-test',
    'Comparison': 'Best Stage 2 vs R²=0',
    'Model': stage2_config,
    'Mean_R2': np.mean(stage2_r2_folds),
    'Std_R2': np.std(stage2_r2_folds, ddof=1),
    't_statistic': t_stat,
    'p_value': p_value_one_tailed,
    'Significant': p_value_one_tailed < 0.05
})

# 2.2: Paired t-test: Stage 2 vs Stage 1
print("\n2.2: Paired t-test (Stage 2 vs Best Stage 1)")
print("-" * 60)

# NOTE(review): pairing is positional — this assumes both fold-metric files
# list the folds in the same order; confirm against the Stage 1/2 writers.
if len(stage1_r2_folds) != len(stage2_r2_folds):
    print("⚠ Cannot perform paired t-test: fold counts don't match")
elif len(stage1_r2_folds) < 2:
    # A paired test needs at least two pairs to estimate the variance
    print("⚠ Cannot perform paired t-test: fewer than 2 folds available")
else:
    fold_diffs = stage2_r2_folds - stage1_r2_folds
    # Two-sided test: is the mean per-fold difference different from zero?
    t_stat, p_value = stats.ttest_rel(stage2_r2_folds, stage1_r2_folds)

    print(f"\nStage 1 R²: {np.mean(stage1_r2_folds):.4f} ± {np.std(stage1_r2_folds, ddof=1):.4f}")
    print(f"Stage 2 R²: {np.mean(stage2_r2_folds):.4f} ± {np.std(stage2_r2_folds, ddof=1):.4f}")
    print(f"Difference: {np.mean(fold_diffs):.4f}")
    print(f"t-statistic: {t_stat:.4f}")
    print(f"p-value: {p_value:.4f}")
    print(f"Result: {'✓ Significant difference (p < 0.05)' if p_value < 0.05 else '✗ No significant difference'}")

    # For this row Mean_R2 / Std_R2 hold the mean / std of the fold differences
    significance_results.append({
        'Test': 'Paired t-test',
        'Comparison': 'Stage 2 vs Stage 1',
        'Model': f"{stage2_config} vs {stage1_config}",
        'Mean_R2': np.mean(fold_diffs),
        'Std_R2': np.std(fold_diffs, ddof=1),
        't_statistic': t_stat,
        'p_value': p_value,
        'Significant': p_value < 0.05
    })

# Save significance results
sig_df = pd.DataFrame(significance_results)
sig_df.to_csv(os.path.join(analysis_dir, "significance_tests.csv"), index=False)
print(f"\n✓ Saved significance tests to: {analysis_dir}/significance_tests.csv")

# =============================================================================
# 3. STABILITY ANALYSIS
# =============================================================================
# Accumulator for the per-model stability rows collected in this section
stability_results = list()

# Section banner: blank line, rule, title, rule
print("\n" + "="*80, "[3] STABILITY ANALYSIS", "="*80, sep="\n")

# 3.1: Coefficient of Variation
print("\n3.1: Coefficient of Variation (CV%)", "-" * 60, sep="\n")

def coefficient_of_variation(values):
    """Return the coefficient of variation of *values*, in percent.

    Computed as the sample standard deviation (ddof=1) divided by the
    absolute value of the mean, times 100. The absolute mean keeps the
    result a non-negative dispersion measure even when the mean is negative
    (R² can be), so values from different models can be averaged without
    cancelling each other out.

    Args:
        values: 1-D sequence of numbers.

    Returns:
        The CV as a float percentage, or NaN when fewer than two values are
        given (sample std undefined) or the mean is exactly zero.
    """
    arr = np.asarray(values, dtype=float)
    if arr.size < 2:
        return float("nan")
    mean = arr.mean()
    if mean == 0:
        # Ratio is undefined; NaN is easier to filter downstream than inf
        return float("nan")
    return float((np.std(arr, ddof=1) / abs(mean)) * 100)

def _average_cv(stage):
    """Mean CV% over the finite entries recorded for *stage*; NaN if none."""
    finite_cvs = [
        r['CV_Percent'] for r in stability_results
        if r['Stage'] == stage and np.isfinite(r['CV_Percent'])
    ]
    return np.mean(finite_cvs) if finite_cvs else np.nan


# Get top 5 Stage 1 models
top5_stage1 = stage1_results.nlargest(5, 'Test_R2')

print("\nTop 5 Stage 1 Models:")
for idx, row in top5_stage1.iterrows():
    model_id = f"{row['FS_Method']}_{row['Model']}_Top{int(row['Features_Used'])}"
    folds = stage1_fold_metrics[
        (stage1_fold_metrics['FS_Method'] == row['FS_Method']) &
        (stage1_fold_metrics['Model'] == row['Model']) &
        (stage1_fold_metrics['Features_Used'] == row['Features_Used'])
    ]['R2'].values

    # np.min / np.max raise on an empty array, so skip configurations that
    # have no fold records (same guard as the Stage 2 loop below)
    if len(folds) == 0:
        print(f"  {model_id}: ⚠ no fold records found, skipped")
        continue

    cv_pct = coefficient_of_variation(folds)
    print(f"  {model_id}: CV = {cv_pct:.2f}%")

    stability_results.append({
        'Stage': 'Stage 1',
        'Model': model_id,
        'Mean_R2': np.mean(folds),
        'Std_R2': np.std(folds, ddof=1),
        'CV_Percent': cv_pct,
        'Min_R2': np.min(folds),
        'Max_R2': np.max(folds),
        'Range_R2': np.max(folds) - np.min(folds)
    })

# Average CV for Stage 1
avg_cv_stage1 = _average_cv('Stage 1')

print("\nTop 5 Stage 2 Models:")
top5_stage2 = stage2_results.nlargest(5, 'Test_R2')

for idx, row in top5_stage2.iterrows():
    model_id = f"{row['Method']}_TopK{int(row['Top_K'])}"

    if row['Method'] == 'FS_MetaModel':
        # FS_MetaModel rows are matched on the full configuration key
        folds = stage2_fold_metrics[
            (stage2_fold_metrics['Method'] == row['Method']) &
            (stage2_fold_metrics['Meta_Model'] == row['Meta_Model']) &
            (stage2_fold_metrics['FS_Method'] == row['FS_Method']) &
            (stage2_fold_metrics['Top_K'] == row['Top_K']) &
            (stage2_fold_metrics['Best_n'] == row['Best_n'])
        ]['R2'].values
    else:
        # Every other method is matched on Method and Top_K only
        folds = stage2_fold_metrics[
            (stage2_fold_metrics['Method'] == row['Method']) &
            (stage2_fold_metrics['Top_K'] == row['Top_K'])
        ]['R2'].values

    if len(folds) == 0:
        print(f"  {model_id}: ⚠ no fold records found, skipped")
        continue

    cv_pct = coefficient_of_variation(folds)
    print(f"  {model_id}: CV = {cv_pct:.2f}%")

    stability_results.append({
        'Stage': 'Stage 2',
        'Model': model_id,
        'Mean_R2': np.mean(folds),
        'Std_R2': np.std(folds, ddof=1),
        'CV_Percent': cv_pct,
        'Min_R2': np.min(folds),
        'Max_R2': np.max(folds),
        'Range_R2': np.max(folds) - np.min(folds)
    })

avg_cv_stage2 = _average_cv('Stage 2')

print(f"\nAverage CV% - Stage 1: {avg_cv_stage1:.2f}%")
print(f"Average CV% - Stage 2: {avg_cv_stage2:.2f}%")
# Relative reduction in average CV%; undefined when either average is missing
# or the Stage 1 average is zero
if np.isfinite(avg_cv_stage1) and np.isfinite(avg_cv_stage2) and avg_cv_stage1 != 0:
    print(f"Stability improvement: {((avg_cv_stage1 - avg_cv_stage2) / avg_cv_stage1 * 100):.1f}%")
else:
    print("Stability improvement: n/a")

# 3.2: Variance comparison test
print("\n3.2: Levene's Test for Variance Equality")
print("-" * 60)

# Get all Stage 1 R2 values: fold R² of every configuration listed in
# stage1_results, pooled into one flat list
all_stage1_r2 = []
for idx, row in stage1_results.iterrows():
    folds = stage1_fold_metrics[
        (stage1_fold_metrics['FS_Method'] == row['FS_Method']) &
        (stage1_fold_metrics['Model'] == row['Model']) &
        (stage1_fold_metrics['Features_Used'] == row['Features_Used'])
    ]['R2'].values
    all_stage1_r2.extend(folds)

# Get all Stage 2 R2 values
all_stage2_r2 = stage2_fold_metrics['R2'].values

if len(all_stage1_r2) < 2 or len(all_stage2_r2) < 2:
    # Levene's test needs observations in both groups
    print("⚠ Cannot run Levene's test: not enough fold records")
else:
    stat, p_value = stats.levene(all_stage1_r2, all_stage2_r2)

    print(f"Levene's statistic: {stat:.4f}")
    print(f"p-value: {p_value:.4f}")
    print(f"Result: {'✓ Variances significantly different (p < 0.05)' if p_value < 0.05 else '✗ No significant difference'}")

# Save stability results
stab_df = pd.DataFrame(stability_results)
stab_df.to_csv(os.path.join(analysis_dir, "stability_analysis.csv"), index=False)
print(f"\n✓ Saved stability analysis to: {analysis_dir}/stability_analysis.csv")

# 3.3: Visualization - Box plots
print("\n3.3: Creating stability visualizations...")


def _finish_axis(ax, title):
    """Apply the title, y-label, tick rotation and grid shared by both panels."""
    ax.set_title(title)
    ax.set_ylabel('R² Score')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)


fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Stage 1 box plot: one box per top-5 configuration (empty ones included)
stage1_data, stage1_labels = [], []
for _, cfg in top5_stage1.iterrows():
    same_config = (
        (stage1_fold_metrics['FS_Method'] == cfg['FS_Method'])
        & (stage1_fold_metrics['Model'] == cfg['Model'])
        & (stage1_fold_metrics['Features_Used'] == cfg['Features_Used'])
    )
    stage1_data.append(stage1_fold_metrics.loc[same_config, 'R2'].values)
    stage1_labels.append(f"{cfg['Model']}\n{cfg['FS_Method']}")

bp1 = axes[0].boxplot(stage1_data, labels=stage1_labels, patch_artist=True)
for box in bp1['boxes']:
    box.set_facecolor('lightblue')
_finish_axis(axes[0], 'Stage 1: Top 5 Models - R² Distribution')

# Stage 2 box plot: configurations without fold records are left out
stage2_data, stage2_labels = [], []
for _, cfg in top5_stage2.iterrows():
    # Method and Top_K always take part in the match ...
    same_config = (
        (stage2_fold_metrics['Method'] == cfg['Method'])
        & (stage2_fold_metrics['Top_K'] == cfg['Top_K'])
    )
    # ... FS_MetaModel rows are additionally matched on the rest of their key
    if cfg['Method'] == 'FS_MetaModel':
        same_config = (
            same_config
            & (stage2_fold_metrics['Meta_Model'] == cfg['Meta_Model'])
            & (stage2_fold_metrics['FS_Method'] == cfg['FS_Method'])
            & (stage2_fold_metrics['Best_n'] == cfg['Best_n'])
        )
    fold_r2 = stage2_fold_metrics.loc[same_config, 'R2'].values

    if fold_r2.size:
        stage2_data.append(fold_r2)
        stage2_labels.append(f"{cfg['Method']}\nK={int(cfg['Top_K'])}")

bp2 = axes[1].boxplot(stage2_data, labels=stage2_labels, patch_artist=True)
for box in bp2['boxes']:
    box.set_facecolor('lightgreen')
_finish_axis(axes[1], 'Stage 2: Top 5 Models - R² Distribution')

plt.tight_layout()
plt.savefig(os.path.join(analysis_dir, "stability_boxplots.png"), dpi=300, bbox_inches='tight')
print(f"✓ Saved box plots to: {analysis_dir}/stability_boxplots.png")
plt.close()

# =============================================================================
# 4. INTERPRETABILITY ANALYSIS
# =============================================================================
print("\n" + "="*80)
print("[4] INTERPRETABILITY ANALYSIS")
print("="*80)

# Only analyze if Stage 2 used Linear Regression meta-model
lr_models = stage2_results[stage2_results['Meta_Model'] == 'LinearRegression']

if len(lr_models) == 0:
    print("⚠ No Linear Regression meta-models found in Stage 2")
else:
    print(f"\n4.1: Meta-Model Weight Analysis ({len(lr_models)} configurations)")
    print("-" * 60)

    # Load Stage 1 predictions; column "true" is the target, every other
    # column is used as a base-model prediction feature
    cv_train_preds = pd.read_csv(
        os.path.join(CONFIG["predictions_dir"], dataset_slug, "cv_train_predictions.csv")
    )

    X_train_stage1 = cv_train_preds.drop(columns=["true"])
    y_train = cv_train_preds["true"]

    # Get best Linear Regression model (highest Test_R2)
    best_lr = lr_models.loc[lr_models['Test_R2'].idxmax()]
    fs_method = best_lr['FS_Method']
    # Best_n may be missing (NaN) on rows without a feature-selection step;
    # int(NaN) would raise, so carry None instead
    best_n = int(best_lr['Best_n']) if pd.notna(best_lr['Best_n']) else None

    print(f"\nBest LR Meta-Model:")
    print(f"  Method: {best_lr['Method']}")
    print(f"  Top-K: {int(best_lr['Top_K'])}")
    print(f"  FS Method: {fs_method}")
    print(f"  Best n: {best_n if best_n is not None else 'n/a'}")
    print(f"  Test R²: {best_lr['Test_R2']:.4f}")

    # Get the top-k models used: the Top_K Stage 1 configurations by CV_R2,
    # mapped to their prediction column names
    top_k_models = stage1_results.sort_values("CV_R2", ascending=False).head(int(best_lr['Top_K']))
    top_k_cols = [
        f"pred_{row['FS_Method']}_{row['Model']}_Top{int(row['Features_Used'])}"
        for _, row in top_k_models.iterrows()
    ]

    X_train_topk = X_train_stage1[top_k_cols]

    # Feature selection — intended to mirror Stage 2.
    # NOTE(review): confirm estimators/seeds match the Stage 2 implementation.
    known_fs_methods = ('RF', 'XGB', 'RFE_RF', 'RF_XGB')
    if best_n is None or fs_method not in known_fs_methods:
        # Without this fallback selected_feats would be undefined below
        print(f"⚠ No usable feature selection recorded (FS Method: {fs_method}); "
              f"using all {len(top_k_cols)} top-K predictions")
        selected_feats = X_train_topk.columns
    else:
        # Import per branch need, so every branch has its estimators in scope
        # and xgboost is only required when actually used
        if fs_method in ('RF', 'RFE_RF', 'RF_XGB'):
            from sklearn.ensemble import RandomForestRegressor
        if fs_method in ('XGB', 'RF_XGB'):
            from xgboost import XGBRegressor

        if fs_method == 'RF':
            rf = RandomForestRegressor(random_state=42, n_jobs=-1)
            rf.fit(X_train_topk, y_train)
            scores = rf.feature_importances_
            # Keep the best_n columns with the highest importance
            selected_feats = X_train_topk.columns[np.argsort(scores)[::-1][:best_n]]
        elif fs_method == 'XGB':
            xgb = XGBRegressor(random_state=42, n_jobs=-1)
            xgb.fit(X_train_topk, y_train)
            scores = xgb.feature_importances_
            selected_feats = X_train_topk.columns[np.argsort(scores)[::-1][:best_n]]
        elif fs_method == 'RFE_RF':
            from sklearn.feature_selection import RFE
            rfe = RFE(estimator=RandomForestRegressor(random_state=42),
                      n_features_to_select=best_n)
            rfe.fit(X_train_topk, y_train)
            selected_feats = X_train_topk.columns[rfe.support_]
        else:  # 'RF_XGB': rank by the mean of both importance vectors
            rf = RandomForestRegressor(random_state=42, n_jobs=-1)
            xgb = XGBRegressor(random_state=42, n_jobs=-1)
            rf.fit(X_train_topk, y_train)
            xgb.fit(X_train_topk, y_train)
            avg_scores = (rf.feature_importances_ + xgb.feature_importances_) / 2
            selected_feats = X_train_topk.columns[np.argsort(avg_scores)[::-1][:best_n]]

    X_train_selected = X_train_topk[selected_feats]

    # Train final Linear Regression on the selected base-model predictions
    meta_model = LinearRegression()
    meta_model.fit(X_train_selected, y_train)

    # Extract weights
    weights = meta_model.coef_
    intercept = meta_model.intercept_

    # Create weights DataFrame, largest (signed) weight first
    weights_df = pd.DataFrame({
        'Base_Model': [feat.replace('pred_', '') for feat in selected_feats],
        'Weight': weights
    }).sort_values('Weight', ascending=False)

    print("\nMeta-Model Weights:")
    print(weights_df.to_string(index=False))
    print(f"\nIntercept: {intercept:.4f}")
    print(f"Sum of weights: {np.sum(weights):.4f}")
    # "Dominant" = first row after the descending sort, i.e. largest signed weight
    print(f"Dominant model: {weights_df.iloc[0]['Base_Model']} (weight={weights_df.iloc[0]['Weight']:.4f})")

    # Save weights
    weights_df.to_csv(os.path.join(analysis_dir, "meta_model_weights.csv"), index=False)

    # Visualization: horizontal bars, green for positive and red otherwise
    fig, ax = plt.subplots(figsize=(10, 6))
    colors = ['green' if w > 0 else 'red' for w in weights_df['Weight']]
    ax.barh(range(len(weights_df)), weights_df['Weight'], color=colors, alpha=0.7)
    ax.set_yticks(range(len(weights_df)))
    ax.set_yticklabels(weights_df['Base_Model'])
    ax.axvline(0, color='black', linewidth=0.8)
    ax.set_xlabel('Weight Coefficient')
    ax.set_title('Linear Regression Meta-Model: Base Learner Weights')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(analysis_dir, "meta_model_weights.png"), dpi=300, bbox_inches='tight')
    print(f"\n✓ Saved weight visualization to: {analysis_dir}/meta_model_weights.png")
    plt.close()

    # Interpretation insights
    print("\n4.2: Interpretation Insights")
    print("-" * 60)

    # Split the weights by sign; |weight| < 0.05 is the "near zero" threshold
    positive_weights = weights_df[weights_df['Weight'] > 0]
    negative_weights = weights_df[weights_df['Weight'] < 0]
    near_zero = weights_df[abs(weights_df['Weight']) < 0.05]

    print(f"\nPositive contributors: {len(positive_weights)} models")
    print(f"Negative contributors: {len(negative_weights)} models (bias correction)")
    print(f"Near-zero weights (<0.05): {len(near_zero)} models (redundant)")

    if len(negative_weights) > 0:
        print(f"\nNegative weights suggest these models correct systematic biases:")
        print(negative_weights.to_string(index=False))

    if len(near_zero) > 0:
        print(f"\nNear-zero weights indicate redundancy with other learners:")
        print(near_zero.to_string(index=False))

# =============================================================================
# 5. GENERATE SUMMARY REPORT
# =============================================================================
print("\n" + "="*80)
print("[5] GENERATING SUMMARY REPORT")
print("="*80)

# Report lines; joined with newlines, printed, and written to disk below
summary = []

summary.append("="*80)
summary.append("STATISTICAL ANALYSIS SUMMARY REPORT")
summary.append(f"Dataset: {dataset_slug}")
summary.append("="*80)

# Rows 0 and 1 of sig_df are the one-sample tests for Stage 1 and Stage 2
summary.append("\n--- 1. STATISTICAL SIGNIFICANCE ---")
summary.append(f"Best Stage 1 Model: {stage1_config}")
summary.append(f"  Mean CV R²: {np.mean(stage1_r2_folds):.4f} ± {np.std(stage1_r2_folds, ddof=1):.4f}")
summary.append(f"  Significantly better than baseline (R²=0): {'YES (p < 0.05)' if sig_df.iloc[0]['Significant'] else 'NO'}")

summary.append(f"\nBest Stage 2 Model: {stage2_config}")
summary.append(f"  Mean CV R²: {np.mean(stage2_r2_folds):.4f} ± {np.std(stage2_r2_folds, ddof=1):.4f}")
summary.append(f"  Significantly better than baseline (R²=0): {'YES (p < 0.05)' if sig_df.iloc[1]['Significant'] else 'NO'}")

# A third row exists only when the paired t-test was actually run
if len(sig_df) > 2:
    summary.append(f"\nStage 2 vs Stage 1 (Paired t-test):")
    summary.append(f"  Difference in R²: {sig_df.iloc[2]['Mean_R2']:.4f}")
    summary.append(f"  p-value: {sig_df.iloc[2]['p_value']:.4f}")
    summary.append(f"  Significantly different: {'YES (p < 0.05)' if sig_df.iloc[2]['Significant'] else 'NO'}")

summary.append("\n--- 2. STABILITY ANALYSIS ---")
summary.append(f"Average CV% - Stage 1: {avg_cv_stage1:.2f}%")
summary.append(f"Average CV% - Stage 2: {avg_cv_stage2:.2f}%")
# The relative improvement is undefined when either average is NaN/inf or
# the Stage 1 average is zero; report "n/a" instead of dividing
if np.isfinite(avg_cv_stage1) and np.isfinite(avg_cv_stage2) and avg_cv_stage1 != 0:
    summary.append(f"Stability improvement: {((avg_cv_stage1 - avg_cv_stage2) / avg_cv_stage1 * 100):.1f}%")
else:
    summary.append("Stability improvement: n/a")

# The interpretability figures exist only when section 4 found LR meta-models
if len(lr_models) > 0:
    summary.append("\n--- 3. INTERPRETABILITY ---")
    summary.append(f"Meta-Model Type: Linear Regression")
    summary.append(f"Number of base learners: {len(selected_feats)}")
    summary.append(f"Dominant model: {weights_df.iloc[0]['Base_Model']} (weight={weights_df.iloc[0]['Weight']:.4f})")
    summary.append(f"Positive contributors: {len(positive_weights)}")
    summary.append(f"Negative contributors: {len(negative_weights)} (bias correction)")
    summary.append(f"Redundant models: {len(near_zero)}")

summary_text = "\n".join(summary)
print(summary_text)

# The report contains non-ASCII characters (R², ±); write it as UTF-8 rather
# than relying on the platform's default encoding
with open(os.path.join(analysis_dir, "summary_report.txt"), 'w', encoding='utf-8') as f:
    f.write(summary_text)

print(f"\n✓ Saved summary report to: {analysis_dir}/summary_report.txt")

print("\n" + "="*80)
print("ANALYSIS COMPLETE!")
print(f"All results saved to: {analysis_dir}/")
print("="*80)

STATISTICAL ANALYSIS SUITE

[1] Loading data...
⚠ No baseline metrics found
✓ Stage 1 configurations: 54
✓ Stage 2 configurations: 36
✓ Stage 1 fold records: 540
✓ Stage 2 fold records: 360

[2] STATISTICAL SIGNIFICANCE TESTING

2.1: One-Sample t-test (H0: R² = 0)
------------------------------------------------------------

Best Stage 1: RFE_RF_RF_Top5
  Mean R²: 0.7325 ± 0.0776
  t-statistic: 29.8590
  p-value (one-tailed): 0.000000
  Result: ✓ Significant (p < 0.05)

Best Stage 2: TopKAverage_TopK50
  Mean R²: 0.7508 ± 0.0674
  t-statistic: 35.2012
  p-value (one-tailed): 0.000000
  Result: ✓ Significant (p < 0.05)

2.2: Paired t-test (Stage 2 vs Best Stage 1)
------------------------------------------------------------

Stage 1 R²: 0.7325 ± 0.0776
Stage 2 R²: 0.7508 ± 0.0674
Difference: 0.0183
t-statistic: 0.4674
p-value: 0.6513
Result: ✗ No significant difference

✓ Saved significance tests to: metrics/PRF_Landsat5_9/statistical_analysis/significance_tests.csv

[3] STABILITY ANALY