# RSNA Aneurysm Detection Evaluation - exp0001

**目的**: Out-Of-Fold (OOF) 予測の分析と品質チェック

**分析内容**:
- CV性能の詳細分析
- Fold間の一貫性チェック
- リーク検出・品質監査
- 閾値最適化
- 特徴重要度分析

In [None]:
# 基本ライブラリ
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import json
from pathlib import Path
import warnings

# 評価指標
from sklearn.metrics import (
    roc_auc_score, accuracy_score, f1_score, precision_score, recall_score,
    roc_curve, precision_recall_curve, confusion_matrix,
    classification_report, average_precision_score
)
from sklearn.calibration import calibration_curve
import scipy.stats as stats

# Suppress all warnings and apply one plotting style to every figure below.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load the experiment configuration.
# YAML streams are Unicode by specification, so decode explicitly as UTF-8
# instead of relying on the platform's locale encoding (which breaks on
# non-ASCII values such as a Japanese description on non-UTF-8 systems).
with open('config.yaml', 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

print(f"Evaluating experiment: {cfg['experiment']['id']}")
print(f"Description: {cfg['experiment']['description']}")

In [None]:
# Load the result files: the OOF prediction table and the metrics dictionary.
oof_df = pd.read_csv('oof_predictions.csv')
metrics = json.loads(Path('metrics.json').read_text())

# Quick sanity print of what was loaded.
print(f"OOF predictions shape: {oof_df.shape}")
print(f"Metrics loaded: {list(metrics)}")
print("\nOOF DataFrame:")
print(oof_df.head())

In [None]:
# Headline summary of the experiment results
banner = "=" * 50
print(banner)
print("EXPERIMENT RESULTS SUMMARY")
print(banner)

print(f"CV Mean AUC: {metrics['cv_mean_auc']:.6f} ± {metrics['cv_std_auc']:.6f}")
print(f"OOF AUC: {metrics['oof_auc']:.6f}")
print("\nFold-wise AUC scores:")
for fold_no, fold_auc in enumerate(metrics['fold_scores'], start=1):
    print(f"  Fold {fold_no}: {fold_auc:.6f}")

# Fold-to-fold consistency: spread of the per-fold AUCs (np.std, ddof=0).
# `fold_std` is read again by later cells.
fold_std = np.std(metrics['fold_scores'])
print(f"\nFold Score Standard Deviation: {fold_std:.6f}")
print(
    "⚠️  WARNING: High variance between folds (potential data leakage or instability)"
    if fold_std > 0.02
    else "✅ Good: Consistent performance across folds"
)

# Gap between the mean of the fold AUCs and the AUC over the pooled OOF rows.
# `cv_oof_diff` is read again by later cells.
cv_oof_diff = abs(metrics['cv_mean_auc'] - metrics['oof_auc'])
print(f"\nCV-OOF AUC Difference: {cv_oof_diff:.6f}")
print(
    "⚠️  WARNING: Large discrepancy between CV and OOF AUC"
    if cv_oof_diff > 0.01
    else "✅ Good: CV and OOF AUC are consistent"
)

In [None]:
# Per-fold ROC analysis
def _plot_roc_panel(ax, y_true, y_score, title, label_prefix='AUC', **line_kwargs):
    """Draw one ROC curve plus the chance diagonal on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        y_true: pandas Series of ground-truth labels.
        y_score: Predicted scores aligned with ``y_true``.
        title: Panel title.
        label_prefix: Text placed before ``= <auc>`` in the legend entry.
        **line_kwargs: Extra keyword arguments for the ROC line (e.g. ``linewidth``).

    When ``y_true`` holds fewer than two distinct classes the ROC curve is
    undefined, so the panel is annotated instead of aborting the whole figure.
    """
    if y_true.nunique() >= 2:
        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc = roc_auc_score(y_true, y_score)
        ax.plot(fpr, tpr, label=f'{label_prefix} = {auc:.4f}', **line_kwargs)
        ax.legend()
    else:
        ax.text(0.5, 0.5, 'ROC undefined (single class)',
                ha='center', va='center', transform=ax.transAxes)
    ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)


# One panel per fold plus one for the pooled OOF curve. The grid is sized from
# the configured fold count instead of a fixed 2x3 layout, so the overall panel
# never overwrites a fold panel and no index falls outside the grid.
n_folds = cfg['cv']['n_folds']
n_panels = n_folds + 1
n_cols = 3
n_rows = -(-n_panels // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 6 * n_rows), squeeze=False)
axes = axes.flatten()

for fold in range(n_folds):
    fold_data = oof_df[oof_df['fold'] == fold]
    _plot_roc_panel(axes[fold], fold_data['y_true'], fold_data['y_pred'],
                    f'ROC Curve - Fold {fold+1}')

# Overall ROC curve over all pooled OOF rows
_plot_roc_panel(axes[n_folds], oof_df['y_true'], oof_df['y_pred'],
                'Overall ROC Curve', label_prefix='Overall AUC', linewidth=2)

# Hide grid cells that received no panel.
for unused_ax in axes[n_panels:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig('roc_curves_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Prediction distribution analysis (2x2 figure)
def _finish_axes(ax, xlabel, ylabel, title):
    """Apply the shared labels, title, legend and grid to one panel."""
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)


fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_by_class, ax_by_fold), (ax_pr, ax_calibration) = axes

# Top-left: predicted probability histograms, one per true class
for class_value, class_name in ((0, 'Negative'), (1, 'Positive')):
    class_scores = oof_df.loc[oof_df['y_true'] == class_value, 'y_pred']
    ax_by_class.hist(class_scores, bins=50, alpha=0.7, label=class_name, density=True)
_finish_axes(ax_by_class, 'Predicted Probability', 'Density', 'Prediction Distribution by Class')

# Top-right: predicted probability histograms, one per fold
for fold in range(cfg['cv']['n_folds']):
    fold_scores = oof_df.loc[oof_df['fold'] == fold, 'y_pred']
    ax_by_fold.hist(fold_scores, bins=30, alpha=0.5, label=f'Fold {fold+1}', density=True)
_finish_axes(ax_by_fold, 'Predicted Probability', 'Density', 'Prediction Distribution by Fold')

# Bottom-left: precision-recall curve over the pooled OOF predictions
pr_precision, pr_recall, _ = precision_recall_curve(oof_df['y_true'], oof_df['y_pred'])
pr_average = average_precision_score(oof_df['y_true'], oof_df['y_pred'])
ax_pr.plot(pr_recall, pr_precision, label=f'Average Precision = {pr_average:.4f}')
_finish_axes(ax_pr, 'Recall', 'Precision', 'Precision-Recall Curve')

# Bottom-right: calibration curve (10 bins) against the ideal diagonal
observed_rate, predicted_mean = calibration_curve(
    oof_df['y_true'], oof_df['y_pred'], n_bins=10
)
ax_calibration.plot(predicted_mean, observed_rate, "s-", label='Model')
ax_calibration.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
_finish_axes(ax_calibration, 'Mean Predicted Probability', 'Fraction of Positives', 'Calibration Curve')

plt.tight_layout()
plt.savefig('prediction_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# 最適閾値決定
def find_optimal_threshold(y_true, y_pred, method='youden'):
    """Return the decision threshold on ``y_pred`` selected by ``method``.

    Args:
        y_true: Binary ground-truth labels (passed to scikit-learn's curve
            functions unchanged).
        y_pred: Predicted scores for the positive class.
        method: Selection rule, one of:
            ``'youden'`` - maximise Youden's J statistic (TPR - FPR) on the
            ROC curve;
            ``'f1'`` - maximise F1 along the precision-recall curve;
            ``'precision_recall_balance'`` - the break-even point, i.e. the
            threshold where ``|precision - recall|`` is smallest.

    Returns:
        float: The selected threshold, or ``0.5`` when the curve offers no
        usable candidate threshold.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    supported = ('youden', 'f1', 'precision_recall_balance')
    # Fail loudly: an unknown method used to fall through and return None.
    if method not in supported:
        raise ValueError(f"Unknown method {method!r}; expected one of {supported}")

    if method == 'youden':
        fpr, tpr, thresholds = roc_curve(y_true, y_pred)
        if len(thresholds) < 2:
            return 0.5
        # The first ROC point stands for "predict no positives"; its threshold
        # is artificial (np.inf in recent scikit-learn) and must never be
        # returned as a usable cut-off, so it is excluded from the search.
        youden_j = (tpr - fpr)[1:]
        return float(thresholds[1 + np.argmax(youden_j)])

    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    if len(thresholds) == 0:
        return 0.5
    # precision/recall carry one extra trailing point (precision=1, recall=0)
    # that has no threshold; drop it so every index maps to a real threshold.
    precision, recall = precision[:-1], recall[:-1]

    if method == 'f1':
        # The epsilon avoids 0/0 where precision and recall are both zero.
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
        return float(thresholds[np.argmax(f1_scores)])

    # 'precision_recall_balance': previously a copy of the F1 rule. Use the
    # break-even point, ignoring degenerate points where either value is zero
    # (precision == recall == 0 would otherwise look perfectly "balanced").
    usable = (precision > 0) & (recall > 0)
    if not usable.any():
        return 0.5
    gap = np.where(usable, np.abs(precision - recall), np.inf)
    return float(thresholds[np.argmin(gap)])

# Compute the optimal threshold for each method and report metrics at it
def _print_threshold_report(title, cutoff, y_true, y_score):
    """Print accuracy, F1, precision and recall of ``y_score`` binarised at ``cutoff``."""
    hard_labels = (y_score >= cutoff).astype(int)
    report_rows = (
        ("Accuracy: ", accuracy_score(y_true, hard_labels)),
        ("F1 Score: ", f1_score(y_true, hard_labels)),
        ("Precision:", precision_score(y_true, hard_labels)),
        ("Recall:   ", recall_score(y_true, hard_labels)),
    )
    print(f"\n{title} Method:")
    print(f"  Threshold: {cutoff:.4f}")
    for row_label, row_value in report_rows:
        print(f"  {row_label} {row_value:.4f}")


# `thresholds` (method name -> threshold) is read again by later cells.
thresholds = {
    method_name: find_optimal_threshold(oof_df['y_true'], oof_df['y_pred'], method_name)
    for method_name in ('youden', 'f1', 'precision_recall_balance')
}

for method_name, method_cutoff in thresholds.items():
    _print_threshold_report(method_name.upper(), method_cutoff, oof_df['y_true'], oof_df['y_pred'])

# Reference point: the conventional 0.5 cut-off
_print_threshold_report("DEFAULT (0.5)", 0.5, oof_df['y_true'], oof_df['y_pred'])

In [None]:
# Confusion matrix at the selected threshold (the Youden threshold is adopted).
# `optimal_threshold` and `y_pred_optimal` are read again by later cells.
optimal_threshold = thresholds['youden']
y_pred_optimal = oof_df['y_pred'].ge(optimal_threshold).astype(int)

cm = confusion_matrix(oof_df['y_true'], y_pred_optimal)

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Negative', 'Predicted Positive'],
    yticklabels=['Actual Negative', 'Actual Positive'],
    ax=cm_ax,
)
cm_ax.set_title(f'Confusion Matrix (Threshold: {optimal_threshold:.4f})')
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')
cm_fig.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Per-class precision / recall / F1 table at the same threshold
print("\nDetailed Classification Report:")
print("=" * 50)
class_report = classification_report(
    oof_df['y_true'], y_pred_optimal, target_names=['Negative', 'Positive']
)
print(class_report)

In [None]:
# Leak detection / data quality audit
audit_banner = "=" * 50
print(audit_banner)
print("DATA QUALITY AUDIT")
print(audit_banner)

# 1. Consistency of the positive rate across folds.
#    `fold_target_rates` and `target_rate_std` are read again by later cells.
fold_target_rates = []
for fold in range(cfg['cv']['n_folds']):
    fold_labels = oof_df.loc[oof_df['fold'] == fold, 'y_true']
    fold_target_rates.append(fold_labels.mean())
    print(
        f"Fold {fold+1} positive rate: {fold_target_rates[-1]:.4f} "
        f"({fold_labels.sum()}/{len(fold_labels)})"
    )

target_rate_std = np.std(fold_target_rates)
print(f"\nTarget rate std across folds: {target_rate_std:.6f}")
print(
    "⚠️  WARNING: Inconsistent target distribution across folds"
    if target_rate_std > 0.05
    else "✅ Good: Consistent target distribution across folds"
)

# 2. Share of extreme predicted probabilities (> 0.99 or < 0.01).
#    `extreme_ratio` is read again by later cells.
extreme_high = oof_df['y_pred'].gt(0.99).sum()
extreme_low = oof_df['y_pred'].lt(0.01).sum()
print("\nExtreme predictions:")
print(f"  Very confident positive (>0.99): {extreme_high} samples")
print(f"  Very confident negative (<0.01): {extreme_low} samples")

total_extreme = extreme_high + extreme_low
extreme_ratio = total_extreme / len(oof_df)
print(
    f"⚠️  WARNING: {extreme_ratio:.2%} of predictions are extreme (potential overfitting)"
    if extreme_ratio > 0.1
    else f"✅ Good: Only {extreme_ratio:.2%} extreme predictions"
)

# 3. CV reliability indicators (for later comparison against the leaderboard).
# NOTE(review): every component is 1 minus a small spread/difference, so the
# mean presumably stays close to 1 - confirm the tier cut-offs below are
# meaningful for this score.
fold_consistency = 1 - fold_std
cv_oof_agreement = 1 - cv_oof_diff
target_stability = 1 - target_rate_std

print("\n=== CV Reliability Metrics ===")
print("Expected CV-LB correlation indicators:")
print(f"  Fold score consistency: {fold_consistency:.4f} (higher is better)")
print(f"  CV-OOF agreement: {cv_oof_agreement:.4f} (higher is better)")
print(f"  Target distribution stability: {target_stability:.4f} (higher is better)")

# `cv_reliability_score` is read again by later cells.
cv_reliability_score = np.mean([fold_consistency, cv_oof_agreement, target_stability])
print(f"\n📊 Overall CV Reliability Score: {cv_reliability_score:.4f}")

# The first tier whose floor is strictly exceeded wins; otherwise "Poor".
reliability_tiers = (
    (0.9, "✅ Excellent: High confidence in CV"),
    (0.8, "✅ Good: Reasonable confidence in CV"),
    (0.7, "⚠️  Fair: Some concerns about CV reliability"),
)
for tier_floor, tier_message in reliability_tiers:
    if cv_reliability_score > tier_floor:
        print(tier_message)
        break
else:
    print("❌ Poor: Low confidence in CV - consider strategy revision")

In [None]:
# Collect the detailed evaluation metrics and write them to JSON
pooled_labels = oof_df['y_true']

# Metrics of the hard predictions at the recommended threshold
# (key order is preserved in the JSON output).
threshold_metric_fns = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'precision': precision_score,
    'recall': recall_score,
}

evaluation_metrics = {}
evaluation_metrics['experiment_id'] = cfg['experiment']['id']
evaluation_metrics['evaluation_date'] = pd.Timestamp.now().isoformat()

# Headline performance figures
for metric_key in ('oof_auc', 'cv_mean_auc', 'cv_std_auc'):
    evaluation_metrics[metric_key] = float(metrics[metric_key])
evaluation_metrics['average_precision'] = float(
    average_precision_score(pooled_labels, oof_df['y_pred'])
)

# Threshold selection results
evaluation_metrics['optimal_thresholds'] = {
    method_name: float(method_cutoff) for method_name, method_cutoff in thresholds.items()
}
evaluation_metrics['recommended_threshold'] = float(optimal_threshold)
evaluation_metrics['optimal_threshold_metrics'] = {
    metric_name: float(metric_fn(pooled_labels, y_pred_optimal))
    for metric_name, metric_fn in threshold_metric_fns.items()
}

# Quality audit results
evaluation_metrics['quality_audit'] = {
    'fold_consistency_score': float(1 - fold_std),
    'cv_oof_agreement': float(1 - cv_oof_diff),
    'target_distribution_stability': float(1 - target_rate_std),
    'cv_reliability_score': float(cv_reliability_score),
    'extreme_prediction_ratio': float(extreme_ratio),
}

# Per-fold details; output keys are 1-based, the `fold` column is 0-based
fold_details = {}
for fold_idx in range(cfg['cv']['n_folds']):
    fold_details[f'fold_{fold_idx+1}'] = {
        'auc': float(metrics['fold_scores'][fold_idx]),
        'target_rate': float(fold_target_rates[fold_idx]),
        'n_samples': int((oof_df['fold'] == fold_idx).sum()),
    }
evaluation_metrics['fold_details'] = fold_details

# Persist the evaluation results
with open('evaluation_metrics.json', 'w') as f:
    json.dump(evaluation_metrics, f, indent=2)

print("\n📁 Evaluation results saved:")
for artifact_name in (
    'evaluation_metrics.json',
    'roc_curves_analysis.png',
    'prediction_analysis.png',
    'confusion_matrix.png',
):
    print(f"- {artifact_name}")

In [None]:
# Build the Markdown summary used to update the experiment notes
print("=" * 60)
print("EXPERIMENT SUMMARY FOR NOTES")
print("=" * 60)

# Same cut-offs as the checks printed in the results summary cell.
folds_stable = fold_std <= 0.02
cv_oof_agree = cv_oof_diff <= 0.01

fold_observation = (
    "Stable cross-validation with consistent performance across folds"
    if folds_stable
    else "High variance between folds - investigate potential issues"
)
# CV-OOF AUC agreement says nothing about probability calibration, so this
# observation describes what was actually measured instead of claiming the
# model is (or is not) well calibrated.
agreement_observation = (
    "CV mean AUC and pooled OOF AUC agree closely"
    if cv_oof_agree
    else "CV mean AUC and pooled OOF AUC diverge - score scales may differ between folds"
)

# Tolerate a missing or null `augmentation` section / `strength` value;
# both fall back to 'light', matching the previous default.
augmentation_cfg = cfg.get('augmentation') or {}
augmentation_strength = augmentation_cfg.get('strength', 'light')
if augmentation_strength is None:
    augmentation_strength = 'light'

architecture_advice = (
    'Try different model architectures'
    if metrics['oof_auc'] < 0.8
    else 'Focus on ensemble methods and fine-tuning'
)
augmentation_advice = (
    'Investigate data augmentation strategies'
    if 'light' in str(augmentation_strength)
    else 'Consider regularization techniques'
)
cv_advice = (
    'Review CV strategy if reliability issues persist'
    if cv_reliability_score < 0.8
    else 'Current CV strategy appears reliable'
)

summary_text = f"""## Experiment {cfg['experiment']['id']} - Results Summary

**Model**: {cfg['model']['architecture']}
**Date**: {pd.Timestamp.now().strftime('%Y-%m-%d')}

### Performance
- CV AUC: {metrics['cv_mean_auc']:.6f} ± {metrics['cv_std_auc']:.6f}
- OOF AUC: {metrics['oof_auc']:.6f}
- Average Precision: {average_precision_score(oof_df['y_true'], oof_df['y_pred']):.6f}

### Optimal Threshold: {optimal_threshold:.4f} (Youden method)
- Accuracy: {accuracy_score(oof_df['y_true'], y_pred_optimal):.4f}
- F1 Score: {f1_score(oof_df['y_true'], y_pred_optimal):.4f}
- Precision: {precision_score(oof_df['y_true'], y_pred_optimal):.4f}
- Recall: {recall_score(oof_df['y_true'], y_pred_optimal):.4f}

### Quality Assessment
- CV Reliability Score: {cv_reliability_score:.4f}
- Fold Consistency: {'✅ Good' if folds_stable else '⚠️ Concerning'}
- CV-OOF Agreement: {'✅ Good' if cv_oof_agree else '⚠️ Concerning'}

### Key Observations
- {fold_observation}
- {agreement_observation}
- Extreme predictions: {extreme_ratio:.2%} of samples

### Recommendations for Next Experiments
1. {architecture_advice}
2. {augmentation_advice}
3. {cv_advice}
"""

# Append to notes.md. Opening with 'w' would erase any existing notes, and the
# summary contains emoji, so the file is written explicitly as UTF-8.
notes_path = Path('notes.md')
has_previous_notes = notes_path.exists() and notes_path.stat().st_size > 0
with notes_path.open('a', encoding='utf-8') as f:
    if has_previous_notes:
        # Keep the new "##" heading separated from whatever precedes it.
        f.write("\n\n")
    f.write(summary_text)

print(summary_text)
print("\n📝 Summary saved to notes.md")

print("\n🎯 Evaluation completed successfully!")
print("Next step: Run inference.ipynb for test predictions and submission")