# Titanic Evaluation Notebook - exp0001

**OOF分析・CV品質チェック・リーク監査**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import json
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix
from sklearn.calibration import calibration_curve
import warnings

# Silence every library warning so the notebook output stays readable.
# NOTE(review): this also hides sklearn/pandas/numpy warnings that could be
# relevant during an audit; consider narrowing the filter.
warnings.filterwarnings("ignore")

# Load the experiment configuration and the recorded metrics.
# The encoding is pinned to UTF-8 so non-ASCII content is decoded the same way
# on every platform instead of depending on the locale default.
with open("config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

with open("metrics.json", "r", encoding="utf-8") as f:
    metrics = json.load(f)

# OOF table and CV fold table used by the analysis below.
oof_df = pd.read_parquet("oof.parquet")
cv_folds_df = pd.read_parquet("cv_folds.parquet")

print(f"実験ID: {cfg['experiment']['id']}")
print(f"OOF形状: {oof_df.shape}")
print(f"CV分割形状: {cv_folds_df.shape}")

In [None]:
# Headline statistics: recorded CV results first, then scores recomputed on the OOF table.
cv_summary = metrics["cv"]
print("=== CV Results ===")
print(f"CV AUC: {cv_summary['mean']:.6f} ± {cv_summary['std']:.6f}")
print(f"Per-fold AUC: {cv_summary['per_fold']}")
print(f"学習時間: {metrics['train_time_sec']:.1f}秒")

# Scores over the whole OOF table; accuracy uses hard labels cut at a 0.5 threshold.
oof_labels = oof_df["y_true"]
oof_scores = oof_df["y_pred"]
oof_auc = roc_auc_score(oof_labels, oof_scores)
oof_hard_labels = (oof_scores > 0.5).astype(int)
oof_acc = accuracy_score(oof_labels, oof_hard_labels)

print("\n=== OOF Scores ===")
print(f"OOF AUC: {oof_auc:.6f}")
print(f"OOF Accuracy: {oof_acc:.6f}")

In [None]:
# Per-fold score analysis: four diagnostic panels on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. AUC per fold
# Folds without any OOF rows are skipped, so the ids of the folds that were
# actually scored are recorded next to the scores. Plotting against those ids
# keeps each bar under its own fold number even when a fold is missing.
fold_ids = []
fold_aucs = []
for fold in range(cfg["cv"]["n_splits"]):
    fold_data = oof_df[oof_df["fold"] == fold]
    if len(fold_data) > 0:
        fold_auc = roc_auc_score(fold_data["y_true"], fold_data["y_pred"])
        fold_ids.append(fold)
        fold_aucs.append(fold_auc)

mean_fold_auc = np.mean(fold_aucs)
axes[0, 0].bar(fold_ids, fold_aucs)
axes[0, 0].set_xticks(fold_ids)
axes[0, 0].axhline(y=mean_fold_auc, color="r", linestyle="--", label=f"Mean: {mean_fold_auc:.4f}")
axes[0, 0].set_title("Fold別AUC")
axes[0, 0].set_xlabel("Fold")
axes[0, 0].set_ylabel("AUC")
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Distribution of predicted values, split by true class
axes[0, 1].hist(oof_df[oof_df["y_true"] == 0]["y_pred"], bins=30, alpha=0.7, label="Survived=0", density=True)
axes[0, 1].hist(oof_df[oof_df["y_true"] == 1]["y_pred"], bins=30, alpha=0.7, label="Survived=1", density=True)
axes[0, 1].set_title("予測値分布（クラス別）")
axes[0, 1].set_xlabel("予測確率")
axes[0, 1].set_ylabel("密度")
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Calibration curve (10 bins) against the diagonal reference line
fraction_of_positives, mean_predicted_value = calibration_curve(oof_df["y_true"], oof_df["y_pred"], n_bins=10)
axes[1, 0].plot(mean_predicted_value, fraction_of_positives, "s-", label="Model")
axes[1, 0].plot([0, 1], [0, 1], "k:", label="Perfect calibration")
axes[1, 0].set_title("Calibration Curve")
axes[1, 0].set_xlabel("Mean predicted probability")
axes[1, 0].set_ylabel("Fraction of positives")
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Confusion matrix for hard labels cut at a 0.5 threshold
cm = confusion_matrix(oof_df["y_true"], (oof_df["y_pred"] > 0.5).astype(int))
sns.heatmap(cm, annot=True, fmt="d", ax=axes[1, 1], cmap="Blues")
axes[1, 1].set_title("Confusion Matrix (threshold=0.5)")
axes[1, 1].set_xlabel("Predicted")
axes[1, 1].set_ylabel("True")

plt.tight_layout()
plt.show()

In [None]:
# Threshold optimisation
from sklearn.metrics import average_precision_score, precision_recall_curve, f1_score

# Derive the F1-maximising decision threshold from the precision-recall curve.
precision, recall, thresholds = precision_recall_curve(oof_df["y_true"], oof_df["y_pred"])
# The small epsilon keeps the ratio defined where precision + recall is zero.
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
# precision/recall have one more entry than thresholds (the final point has no
# threshold), so only the entries that map onto `thresholds` are searched.
# This guarantees the index is valid for all three arrays.
best_threshold_idx = np.argmax(f1_scores[:-1])
best_threshold = thresholds[best_threshold_idx]
best_f1 = f1_scores[best_threshold_idx]

print(f"最適閾値 (F1-score): {best_threshold:.4f}")
print(f"最適F1-score: {best_f1:.4f}")

# Performance as a function of the threshold
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# PR curve. The legend reports average precision, which summarises this curve;
# ROC AUC is a different metric and must not be shown as the PR curve's area.
pr_average_precision = average_precision_score(oof_df["y_true"], oof_df["y_pred"])
axes[0].plot(recall, precision, label=f"PR Curve (AP={pr_average_precision:.3f})")
axes[0].scatter(
    recall[best_threshold_idx],
    precision[best_threshold_idx],
    color="red",
    s=100,
    label=f"Best threshold: {best_threshold:.3f}",
)
axes[0].set_xlabel("Recall")
axes[0].set_ylabel("Precision")
axes[0].set_title("Precision-Recall Curve")
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# F1-score vs threshold (the last F1 entry has no matching threshold and is dropped)
axes[1].plot(thresholds, f1_scores[:-1])
axes[1].axvline(x=best_threshold, color="red", linestyle="--", label=f"Best: {best_threshold:.3f} (F1={best_f1:.3f})")
axes[1].set_xlabel("Threshold")
axes[1].set_ylabel("F1-score")
axes[1].set_title("F1-score vs Threshold")
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# CV quality checks (leak audit)
print("=== CV品質チェック ===")

# 1. Spread of the per-fold AUC values (population standard deviation via np.std)
fold_auc_std = np.std(fold_aucs)
print(f"Fold間AUC標準偏差: {fold_auc_std:.6f}")
print(
    "⚠️  WARNING: Fold間のスコア分散が大きい（リーク疑い）"
    if fold_auc_std > 0.02
    else "✅ Fold間のスコア分散は正常範囲"
)

# 2. Row count, positive rate and AUC for every fold that has OOF rows
fold_stats = []
for fold in range(cfg["cv"]["n_splits"]):
    fold_data = oof_df[oof_df["fold"] == fold]
    if len(fold_data) == 0:
        # Nothing to summarise for a fold without rows.
        continue
    fold_targets = fold_data["y_true"]
    fold_stats.append(
        dict(
            fold=fold,
            size=len(fold_data),
            positive_rate=fold_targets.mean(),
            auc=roc_auc_score(fold_targets, fold_data["y_pred"]),
        )
    )

fold_stats_df = pd.DataFrame(fold_stats)
print("\nFold別統計:")
print(fold_stats_df)

# 3. Uniformity of the target rate across folds (pandas .std(), i.e. sample standard deviation)
target_std = fold_stats_df["positive_rate"].std()
print(f"\nFold間target分布標準偏差: {target_std:.6f}")
print(
    "⚠️  WARNING: Fold間のtarget分布が不均一"
    if target_std > 0.05
    else "✅ Fold間のtarget分布は均一"
)

In [None]:
# Outlier / anomaly analysis
print("=== 外れ値分析 ===")

# Flag predicted values that fall outside the 1.5 * IQR fences.
pred_quartiles = oof_df["y_pred"].quantile([0.25, 0.75])
q1 = pred_quartiles.iloc[0]
q3 = pred_quartiles.iloc[1]
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outside_fences = oof_df["y_pred"].lt(lower_bound) | oof_df["y_pred"].gt(upper_bound)
outliers = oof_df[outside_fences]
print(f"予測確率外れ値: {len(outliers)}件 ({len(outliers) / len(oof_df) * 100:.2f}%)")

# Rows with the largest absolute gap between label and prediction.
# Note: this adds an "abs_error" column to oof_df in place.
oof_df["abs_error"] = (oof_df["y_true"] - oof_df["y_pred"]).abs()
worst_rows = oof_df.nlargest(10, "abs_error")
top_errors = worst_rows[["index", "fold", "y_true", "y_pred", "abs_error"]]
print("\n最大予測誤差サンプル（上位10件）:")
print(top_errors)

# Error distribution: histogram on the left, error against prediction on the right.
mean_abs_error = oof_df["abs_error"].mean()
error_fig, (hist_ax, scatter_ax) = plt.subplots(1, 2, figsize=(12, 4))

hist_ax.hist(oof_df["abs_error"], bins=50, alpha=0.7, edgecolor="black")
hist_ax.axvline(mean_abs_error, color="red", linestyle="--", label=f"Mean: {mean_abs_error:.3f}")
hist_ax.set_xlabel("絶対誤差")
hist_ax.set_ylabel("頻度")
hist_ax.set_title("予測誤差分布")
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

scatter_ax.scatter(oof_df["y_pred"], oof_df["abs_error"], alpha=0.6)
scatter_ax.set_xlabel("予測確率")
scatter_ax.set_ylabel("絶対誤差")
scatter_ax.set_title("予測確率 vs 絶対誤差")
scatter_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# CV vs LB correlation analysis (meaningful once real LB scores exist)
print("=== CV vs LB相関監視 ===")
print("※ 実際のLBスコア取得後に更新")

# Load the experiment ledger, if present, and visualise how CV relates to LB.
experiments_file = "../../experiments.csv"
try:
    experiments_df = pd.read_csv(experiments_file)
    if len(experiments_df) > 1:  # more than one experiment recorded
        # Keep only experiments that have both a CV mean and a public LB score.
        valid_experiments = experiments_df.dropna(subset=["cv_mean", "lb_public"])
        n_valid = len(valid_experiments)

        # A correlation needs at least two points. With fewer it is NaN, and
        # `abs(nan) < 0.7` is False, which would wrongly report a good correlation.
        if n_valid >= 2:
            corr = valid_experiments[["cv_mean", "lb_public"]].corr().iloc[0, 1]
        else:
            corr = np.nan

        if n_valid == 0:
            print("LBスコアがある実験がまだありません")
        elif pd.isna(corr):
            # Either a single valid experiment or no variation in one of the scores.
            print("CV-LB相関を計算できません（LBスコア付きの実験が2件以上、かつスコアにばらつきが必要です）")
        else:
            # Scatter plot of CV against LB
            plt.figure(figsize=(10, 6))
            plt.scatter(valid_experiments["cv_mean"], valid_experiments["lb_public"], s=100, alpha=0.7)

            # Least-squares regression line (degree-1 polynomial fit)
            z = np.polyfit(valid_experiments["cv_mean"], valid_experiments["lb_public"], 1)
            p = np.poly1d(z)
            plt.plot(valid_experiments["cv_mean"], p(valid_experiments["cv_mean"]), "r--", alpha=0.8)

            plt.xlabel("CV AUC")
            plt.ylabel("LB Public Score")
            plt.title(f"CV vs LB相関 (r={corr:.3f})")
            plt.grid(True, alpha=0.3)

            # Label each point with its experiment id
            for _, row in valid_experiments.iterrows():
                plt.annotate(
                    row["exp_id"], (row["cv_mean"], row["lb_public"]), xytext=(5, 5), textcoords="offset points", fontsize=8
                )

            plt.show()

            if abs(corr) < 0.7:
                print(f"⚠️  WARNING: CV-LB相関が低い (r={corr:.3f})")
                print("CVスキームの見直しを検討してください")
            else:
                print(f"✅ CV-LB相関は良好 (r={corr:.3f})")
    else:
        print("比較可能な実験がまだありません")

except FileNotFoundError:
    print("実験台帳がまだありません")
except Exception as e:
    # Best-effort cell: any other ledger problem is reported rather than raised
    # so the rest of the notebook can still run.
    print(f"実験台帳読み込みエラー: {e}")

In [None]:
# Report summary: key scores, quality-check flags and suggested follow-ups.
separator = "=" * 50
print("\n" + separator)
print("EVALUATION SUMMARY")
print(separator)

print(f"実験ID: {cfg['experiment']['id']}")
print(f"CV AUC: {metrics['cv']['mean']:.6f} ± {metrics['cv']['std']:.6f}")
print(f"OOF AUC: {oof_auc:.6f}")
print(f"最適閾値 (F1): {best_threshold:.4f}")
print(f"最適F1スコア: {best_f1:.4f}")

print("\n品質チェック結果:")
score_spread_high = fold_auc_std > 0.02
print(f"- Fold間スコア分散: {fold_auc_std:.6f} {'⚠️' if score_spread_high else '✅'}")
target_rate_uneven = target_std > 0.05
print(f"- Target分布均一性: {target_std:.6f} {'⚠️' if target_rate_uneven else '✅'}")
outlier_count = len(outliers)
total_rows = len(oof_df)
print(f"- 予測外れ値: {outlier_count}件 ({outlier_count / total_rows * 100:.1f}%)")

print("\n次のアクション:")
# Each entry pairs a trigger condition with the follow-up it suggests; order is the print order.
# The last condition deliberately keeps the `<=` form so a NaN statistic does not count as healthy.
next_actions = [
    (score_spread_high, "- Fold間スコア分散が大きい → リーク調査・CVスキーム見直し"),
    (target_rate_uneven, "- Target分布が不均一 → 分割方法の改善"),
    (outlier_count > total_rows * 0.1, "- 外れ値が多い → 前処理・特徴量見直し"),
    (fold_auc_std <= 0.02 and target_std <= 0.05, "- CV品質は良好 → ハイパーパラメータ調整・特徴量追加"),
]
for triggered, message in next_actions:
    if triggered:
        print(message)

print(separator)