In [2]:
import os
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from tqdm import tqdm
from fttransformer_mask import FTTransformer
from math import log

# ============ Settings ============
# Semicolon-separated CSV read by main(); it must contain "id", "cardio" and
# every feature column listed below.
DATA_PATH = "cardio_train.csv"
# Fraction of rows held out as the test split.
TEST_RATIO = 0.2
categorical_features = ["gender", "cholesterol", "gluc", "smoke", "alco", "active"]
continuous_features = ["age", "height", "weight", "ap_hi", "ap_lo"]
# Categorical features whose slots are True in MASK_BASE; main() flips exactly
# one of them to False per sample for the "improved" prediction.
additional_features = ["cholesterol", "gluc", "smoke", "alco", "active"]
# Boolean mask handed to the model as its third argument. Slots 2..6 line up
# with additional_features (main() indexes them as 2 + position).
# NOTE(review): presumably True means "hide this token" and slot 0 is a CLS
# token followed by the 6 categorical and 5 continuous features -- confirm
# against fttransformer_mask.
MASK_BASE = [False, False] + [True] * 5 + [False] * 5
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# script does not crash at model.to(DEVICE) on CPU-only hosts.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Entropy helper
def entropy(p: float) -> float:
    """Return the binary entropy (in nats) of a Bernoulli probability ``p``.

    Values at or outside the open interval (0, 1) yield 0.0, which avoids
    evaluating ``log(0)`` for fully confident predictions.
    """
    is_degenerate = p <= 0 or p >= 1
    if is_degenerate:
        return 0.0
    q = 1 - p
    return - (p * log(p) + q * log(q))

# Checkpoint path for each fold, keyed by the 1-based fold number.
# The validation accuracy is part of the file name, so it is kept as a string
# to reproduce the exact digits.
fold_to_model_path = {
    fold: f"./save_mask_models/best_model_epoch300_fold{fold}_ValAcc{val_acc}.pth"
    for fold, val_acc in enumerate(
        ("0.732", "0.739", "0.731", "0.737", "0.735"), start=1
    )
}

# ============ Main processing ============
def _mask_tensor(mask):
    """Wrap a list of booleans in a (1, len(mask)) bool tensor on DEVICE."""
    return torch.tensor([mask], dtype=torch.bool).to(DEVICE)


def _confidence(model, x_cat, x_cont, mask):
    """Return sigmoid(model(x_cat, x_cont, mask)) as a float, without gradients."""
    with torch.no_grad():
        return torch.sigmoid(model(x_cat, x_cont, mask)).item()


def _save_confusion_matrices(fold, results, base_acc, imp_acc):
    """Plot the base and improved confusion matrices side by side into fold{fold}_cms.png."""
    y_true = [r['answer'] for r in results]
    # Pin the label set so the matrices stay 2x2 and agree with display_labels
    # even if one class never occurs in the labels or predictions.
    cm_base = confusion_matrix(y_true, [r['base_pred'] for r in results], labels=[0, 1])
    cm_imp = confusion_matrix(y_true, [r['improved_pred'] for r in results], labels=[0, 1])
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    ConfusionMatrixDisplay(cm_base, display_labels=[0, 1]).plot(ax=axes[0], cmap='Blues', colorbar=False)
    axes[0].set_title('Base CM')
    ConfusionMatrixDisplay(cm_imp, display_labels=[0, 1]).plot(ax=axes[1], cmap='Blues', colorbar=False)
    axes[1].set_title('Improved CM')
    fig.suptitle(f'Fold{fold} Confusion Matrices')
    fig.text(0.5, -0.05, f'Base Acc: {base_acc:.4f}    Improved Acc: {imp_acc:.4f}', ha='center')
    fig.tight_layout()
    fig.savefig(f'fold{fold}_cms.png', bbox_inches='tight')
    # Close this specific figure so figures do not pile up across folds.
    plt.close(fig)


def main():
    """Evaluate every fold checkpoint on the held-out test split.

    For each test sample the model is first run with MASK_BASE ("base"
    prediction). Then, for each feature in additional_features, the expected
    entropy of the prediction is estimated by substituting every value that
    feature takes in the train/val split, weighted by its empirical frequency,
    with that feature's mask slot set to False. The feature with the largest
    entropy reduction is selected, and the model is re-run on the sample's real
    values with only that slot set to False ("improved" prediction).

    Side effects per fold: prints both accuracies, writes ``fold{fold}_cms.png``
    and ``fold{fold}_results.csv``. Folds whose checkpoint file is missing are
    reported and skipped.

    Raises:
        ValueError: if no candidate feature yields a comparable entropy
            reduction (e.g. the model outputs NaN).
    """
    # Load the data and split it
    data = pd.read_csv(DATA_PATH, sep=';')
    ids = data["id"].values
    X = data.drop(columns=["cardio"])
    y = data["cardio"]
    X_trainval, X_test, _, y_test, _, id_test = \
        train_test_split(X, y, ids, test_size=TEST_RATIO, random_state=42, stratify=y)
    # Work on explicit copies: columns are overwritten below, and assigning
    # into the frames returned by the split triggers SettingWithCopy warnings.
    X_trainval = X_trainval.copy()
    X_test = X_test.copy()

    # Encode categorical variables (fit on train/val only)
    for col in categorical_features:
        le = LabelEncoder()
        X_trainval[col] = le.fit_transform(X_trainval[col])
        X_test[col] = le.transform(X_test[col])

    # Scale continuous variables (fit on train/val only)
    scaler = StandardScaler()
    X_trainval[continuous_features] = scaler.fit_transform(X_trainval[continuous_features])
    X_test[continuous_features] = scaler.transform(X_test[continuous_features])

    # Empirical probability of each encoded value, per maskable feature
    feature_value_probs = {
        feat: X_trainval[feat].value_counts(normalize=True).sort_index().to_dict()
        for feat in additional_features
    }

    # Fold-invariant inputs, extracted once instead of once per sample.
    cat_values = X_test[categorical_features].values
    cont_values = X_test[continuous_features].values.astype(np.float32)
    labels = y_test.values
    n_test = len(X_test)
    categories = [X[cat].nunique() for cat in categorical_features]

    # Run every fold
    for fold, model_path in fold_to_model_path.items():
        print(f"=== Fold{fold} モデル読み込み: {model_path} ===")
        if not os.path.exists(model_path):
            print(f"❌ モデルが存在しません: {model_path}")
            continue

        # Load the model
        model = FTTransformer(
            categories=categories,
            num_continuous=len(continuous_features),
            dim=64, depth=6, heads=8, ff_dropout=0.2, attn_dropout=0.2
        ).to(DEVICE)
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.eval()

        # Device tensors are built only after a checkpoint was found, so a run
        # with no checkpoints never touches DEVICE.
        cat_all = torch.tensor(cat_values, dtype=torch.long).to(DEVICE)
        cont_all = torch.tensor(cont_values, dtype=torch.float32).to(DEVICE)
        base_mask = _mask_tensor(MASK_BASE)
        # feature -> (mask with that feature's slot set to False, its column in x_cat)
        reveal = {}
        for offset, feat in enumerate(additional_features):
            mask = MASK_BASE.copy()
            mask[2 + offset] = False
            reveal[feat] = (_mask_tensor(mask), categorical_features.index(feat))

        results = []
        # Inference loop
        for i in tqdm(range(n_test), desc=f"Fold{fold} 推論", unit="件"):
            # Slicing keeps the leading batch dimension of size 1.
            x_cat = cat_all[i:i + 1]
            x_cont = cont_all[i:i + 1]

            # Base inference + entropy
            base_conf = _confidence(model, x_cat, x_cont, base_mask)
            base_ent = entropy(base_conf)
            base_pred = int(base_conf > 0.5)

            # Select the feature with the largest expected entropy reduction
            best_reduction = float("-inf")
            best_feat = None
            for feat in additional_features:
                feat_mask, cat_idx = reveal[feat]
                expected_ent = 0.0
                for val, prob in feature_value_probs[feat].items():
                    # Substitute a hypothetical value; clone so the real row is untouched.
                    x_alt = x_cat.clone()
                    x_alt[0, cat_idx] = int(val)
                    conf_alt = _confidence(model, x_alt, x_cont, feat_mask)
                    expected_ent += entropy(conf_alt) * prob
                reduction = base_ent - expected_ent
                if reduction > best_reduction:
                    best_reduction = reduction
                    best_feat = feat
            if best_feat is None:
                # Only reachable when every reduction is NaN; fail with a clear message.
                raise ValueError(
                    f"no feature could be selected for sample id {int(id_test[i])}"
                )

            # Improved inference: real values, selected feature's slot set to False
            best_conf = _confidence(model, x_cat, x_cont, reveal[best_feat][0])
            improved_pred = int(best_conf > 0.5)

            results.append({
                'id': int(id_test[i]),
                'answer': int(labels[i]),
                'base_pred': base_pred,
                'improved_pred': improved_pred,
            })

        # Compute accuracy
        base_acc = sum(r['base_pred'] == r['answer'] for r in results) / len(results)
        imp_acc = sum(r['improved_pred'] == r['answer'] for r in results) / len(results)
        print(f"Fold{fold} - base accuracy: {base_acc:.4f}, improved accuracy: {imp_acc:.4f}")

        # Plot the confusion matrices side by side
        _save_confusion_matrices(fold, results, base_acc, imp_acc)

        # Save the CSV
        pd.DataFrame(results).sort_values('id').to_csv(f"fold{fold}_results.csv", index=False)
        print(f"✅ fold{fold}_results.csv saved")

# Run the evaluation only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()



=== Fold1 モデル読み込み: ./save_mask_models/best_model_epoch300_fold1_ValAcc0.732.pth ===


Fold1 推論: 100%|██████████| 14000/14000 [16:43<00:00, 13.94件/s]


Fold1 - base accuracy: 0.6943, improved accuracy: 0.7056
✅ fold1_results.csv saved
=== Fold2 モデル読み込み: ./save_mask_models/best_model_epoch300_fold2_ValAcc0.739.pth ===


Fold2 推論: 100%|██████████| 14000/14000 [16:52<00:00, 13.82件/s]


Fold2 - base accuracy: 0.6929, improved accuracy: 0.7050
✅ fold2_results.csv saved
=== Fold3 モデル読み込み: ./save_mask_models/best_model_epoch300_fold3_ValAcc0.731.pth ===


Fold3 推論: 100%|██████████| 14000/14000 [16:55<00:00, 13.79件/s]


Fold3 - base accuracy: 0.6954, improved accuracy: 0.7054
✅ fold3_results.csv saved
=== Fold4 モデル読み込み: ./save_mask_models/best_model_epoch300_fold4_ValAcc0.737.pth ===


Fold4 推論: 100%|██████████| 14000/14000 [16:49<00:00, 13.86件/s]


Fold4 - base accuracy: 0.6964, improved accuracy: 0.7013
✅ fold4_results.csv saved
=== Fold5 モデル読み込み: ./save_mask_models/best_model_epoch300_fold5_ValAcc0.735.pth ===


Fold5 推論: 100%|██████████| 14000/14000 [16:52<00:00, 13.82件/s]


Fold5 - base accuracy: 0.6899, improved accuracy: 0.7025
✅ fold5_results.csv saved
