In [None]:
import pandas as pd
import torch
import numpy as np
from itertools import product
from tqdm import tqdm
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import Dataset, DataLoader
from tab_transformer_pytorch import TabTransformer
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

# --- Shared configuration ---
# Compute device: CUDA when torch reports it as available, otherwise the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Training hyper-parameters used by every run below.
EPOCHS = 50        # passes over the training data per fitted model
BATCH_SIZE = 256   # mini-batch size for every DataLoader
K_FOLDS = 5        # number of cross-validation folds

# Column roles in the input frame.
TARGET = "cardio"                 # label column
BASE_CATEG = ["gender"]           # categorical feature that is always used
BASE_CONT = [                     # continuous features that are always used
    "age_years",
    "height",
    "weight",
    "ap_hi",
    "ap_lo",
]
OPTIONAL = [                      # categorical features toggled per experiment
    "cholesterol",
    "gluc",
    "smoke",
    "alco",
    "active",
]

# --- Data loading & preprocessing ---
# The CSV is semicolon-separated.
data = pd.read_csv("cardio_train.csv", sep=";")

# Derive `age_years` from `age` by dividing by 365 and truncating to an int
# (presumably `age` is stored in days -- confirm against the dataset docs).
data["age_years"] = data["age"].div(365).astype(int)

# Re-code every categorical column (always-on + optional) as integer labels,
# using an independent encoder per column.
for col in [*BASE_CATEG, *OPTIONAL]:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])

# Standardise the continuous columns in place.
# NOTE(review): the scaler is fitted on the complete frame, i.e. before any
# train/test or cross-validation split is made, so held-out rows contribute
# to the scaling statistics -- confirm whether this leakage is acceptable.
scaler = StandardScaler()
scaler.fit(data[BASE_CONT])
data[BASE_CONT] = scaler.transform(data[BASE_CONT])

# --- Dataset ---
class CardioDataset(Dataset):
    """Map-style dataset yielding ``(categorical, continuous, label)`` tensors.

    Args:
        df: Frame containing the ``TARGET`` column plus all feature columns.
        categ_cols: Names of the categorical columns (cast to int64).
        cont_cols: Names of the continuous columns (cast to float32).
    """

    def __init__(self, df, categ_cols, cont_cols):
        # Materialise everything as NumPy arrays once, up front, so that
        # __getitem__ only has to index into them.
        labels = df[TARGET].to_numpy()
        categorical = df[categ_cols].to_numpy()
        continuous = df[cont_cols].to_numpy()

        self.y = labels.astype(np.float32)
        self.categ = categorical.astype(np.int64)
        self.cont = continuous.astype(np.float32)

    def __len__(self):
        """Return the number of rows."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return row ``idx`` as ``(int64 categ, float32 cont, float32 label)``."""
        categ_t = torch.tensor(self.categ[idx], dtype=torch.int64)
        cont_t = torch.tensor(self.cont[idx], dtype=torch.float32)
        label_t = torch.tensor(self.y[idx], dtype=torch.float32)
        return categ_t, cont_t, label_t

# --- Model construction ---
def build_model(n_categories, n_continuous):
    """Create a TabTransformer with a single output logit, placed on ``DEVICE``.

    Args:
        n_categories: Iterable with the number of distinct values of each
            categorical feature, in feature order.
        n_continuous: Number of continuous features.

    Returns:
        The constructed ``TabTransformer`` module, moved to ``DEVICE``.
    """
    # One (mean, std) = (0, 1) row per continuous feature.
    mean_std_rows = [[0., 1.]] * n_continuous

    hyperparams = dict(
        categories=tuple(n_categories),
        num_continuous=n_continuous,
        dim=32,
        dim_out=1,
        depth=4,
        heads=4,
        attn_dropout=0.1,
        ff_dropout=0.1,
        mlp_hidden_mults=(4, 2),
        mlp_act=nn.ReLU(),
        continuous_mean_std=torch.tensor(mean_std_rows).to(DEVICE),
    )
    model = TabTransformer(**hyperparams).to(DEVICE)

    # Explicitly re-bind the category offsets on DEVICE.
    # NOTE(review): probably redundant if the library registers this tensor
    # as a buffer (then `.to(DEVICE)` above already moved it) -- confirm.
    model.categories_offset = model.categories_offset.to(DEVICE)
    return model

# --- Shared training / evaluation helpers ---
def _fit_model(df_train, categ_cols, cont_cols):
    """Train a freshly built model on ``df_train`` for ``EPOCHS`` epochs and return it."""
    loader = DataLoader(CardioDataset(df_train, categ_cols, cont_cols),
                        batch_size=BATCH_SIZE, shuffle=True)

    # Category counts are read from the module-level `data` frame, not from
    # `df_train` (presumably so that a subset lacking some category value
    # still gets a full-size embedding table -- confirm).
    model = build_model(
        n_categories=[data[c].nunique() for c in categ_cols],
        n_continuous=len(cont_cols),
    )
    opt = optim.AdamW(model.parameters(), lr=1e-3)
    crit = nn.BCEWithLogitsLoss()

    for _ in range(EPOCHS):
        model.train()
        for x_cat, x_cont, y in loader:
            x_cat, x_cont, y = x_cat.to(DEVICE), x_cont.to(DEVICE), y.to(DEVICE)
            opt.zero_grad()
            # The model emits shape (batch, 1); drop the trailing dim to match y.
            loss = crit(model(x_cat, x_cont).squeeze(1), y)
            loss.backward()
            opt.step()
    return model


def _eval_accuracy(model, df_eval, categ_cols, cont_cols):
    """Return the accuracy of ``model`` on ``df_eval`` using a 0.5 probability threshold."""
    loader = DataLoader(CardioDataset(df_eval, categ_cols, cont_cols),
                        batch_size=BATCH_SIZE, shuffle=False)
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for x_cat, x_cont, y in loader:
            x_cat, x_cont = x_cat.to(DEVICE), x_cont.to(DEVICE)
            out = model(x_cat, x_cont).squeeze(1)
            preds.extend((torch.sigmoid(out) > 0.5).int().cpu().tolist())
            trues.extend(y.int().tolist())
    return accuracy_score(trues, preds)


# --- Validation accuracy via K-fold cross-validation ---
def cv_score(df, categ_cols, cont_cols):
    """Return the mean validation accuracy over ``K_FOLDS`` shuffled folds.

    A new model is trained from scratch on each fold's training part and
    scored on that fold's validation part.

    Args:
        df: Frame holding the features and the ``TARGET`` column.
        categ_cols: Categorical feature column names.
        cont_cols: Continuous feature column names.

    Returns:
        The mean of the per-fold accuracies (``np.mean`` of the fold scores).
    """
    kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
    scores = []
    for tr_idx, va_idx in kf.split(df):
        df_tr = df.iloc[tr_idx].reset_index(drop=True)
        df_va = df.iloc[va_idx].reset_index(drop=True)
        model = _fit_model(df_tr, categ_cols, cont_cols)
        scores.append(_eval_accuracy(model, df_va, categ_cols, cont_cols))
    return np.mean(scores)


# --- Hold-out test evaluation ---
def test_score(df_trval, df_test, categ_cols, cont_cols):
    """Train on ``df_trval`` and return the accuracy on ``df_test``.

    Args:
        df_trval: Frame used for training (all of it; no validation split).
        df_test: Frame used only for the final evaluation.
        categ_cols: Categorical feature column names.
        cont_cols: Continuous feature column names.

    Returns:
        Accuracy on ``df_test`` as returned by ``accuracy_score``.
    """
    model = _fit_model(df_trval, categ_cols, cont_cols)
    return _eval_accuracy(model, df_test, categ_cols, cont_cols)

# --- Experiment loop (with progress bar) ---
results = []
# Hold out 20% of the rows as the final test set (fixed seed).
trainval, test = train_test_split(data, test_size=0.2, random_state=42)

# Every on/off combination of the optional features is tried once.
n_combinations = 2 ** len(OPTIONAL)
flag_grid = product([0, 1], repeat=len(OPTIONAL))

for flags in tqdm(flag_grid, total=n_combinations, desc="Feature combinations"):
    selected = [name for name, enabled in zip(OPTIONAL, flags) if enabled]
    categ_cols = BASE_CATEG + selected
    cont_cols = BASE_CONT

    # Collect both the mean cross-validation accuracy and the test accuracy.
    val_acc = cv_score(trainval, categ_cols, cont_cols)
    test_acc = test_score(trainval, test, categ_cols, cont_cols)

    # One result row: a check mark under each enabled feature, then the scores.
    row = {name: ("✓" if name in selected else "") for name in OPTIONAL}
    row["val_acc"] = val_acc
    row["test_acc"] = test_acc
    results.append(row)

# --- Summarise & sort results ---
# Best test accuracy first, with a fresh 0..n-1 index.
df_res = (
    pd.DataFrame(results)
    .sort_values("test_acc", ascending=False)
    .reset_index(drop=True)
)

# Render both accuracy columns as strings with four decimal places.
for metric in ("val_acc", "test_acc"):
    df_res[metric] = df_res[metric].map("{:.4f}".format)

print(df_res)
# Optionally persist the table:
# df_res.to_csv("results_with_val_test.csv", index=False)


Feature combinations: 100%|██████████| 32/32 [1:33:46<00:00, 175.82s/it]

   cholesterol gluc smoke alco active val_acc test_acc
0            ✓    ✓     ✓           ✓  0.6872   0.6961
1            ✓                      ✓  0.6864   0.6951
2            ✓    ✓     ✓    ✓         0.6833   0.6948
3            ✓                         0.6845   0.6944
4            ✓    ✓                 ✓  0.6832   0.6936
5            ✓          ✓    ✓      ✓  0.6840   0.6936
6            ✓          ✓              0.6848   0.6936
7            ✓               ✓      ✓  0.6887   0.6931
8            ✓          ✓           ✓  0.6861   0.6931
9            ✓    ✓                    0.6844   0.6919
10           ✓    ✓     ✓    ✓      ✓  0.6807   0.6911
11           ✓          ✓    ✓         0.6859   0.6900
12           ✓    ✓          ✓         0.6847   0.6890
13           ✓               ✓         0.6862   0.6887
14           ✓    ✓          ✓      ✓  0.6874   0.6879
15           ✓    ✓     ✓              0.6846   0.6874
16                ✓          ✓         0.6749   0.6845
17        


