In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import roc_auc_score
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer

In [2]:
# Read and split the data
train_origin = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')

In [3]:
train = train_origin.sample(frac=0.01, random_state = 42).set_index('id').astype(str)
#train = train_origin.set_index('id').astype(str)

X_train, X_test = train_test_split(train, test_size=0.2, random_state=0, stratify=train['Response'])
y_train = X_train.pop('Response')
y_test = X_test.pop('Response')

enc = TargetEncoder(random_state=0)
X_train = pd.DataFrame(enc.fit_transform(X_train, y_train), 
                       index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(enc.transform(X_test), index=X_test.index, 
                      columns=X_test.columns)
 
model = TabNetClassifier()
model.fit(X_train.values, y_train.values,
          batch_size=65536, max_epochs=5,
          compute_importance=False)
roc_auc_score(y_test, model.predict_proba(X_test.values)[:,1])



epoch 0  | loss: 0.71052 |  0:00:13s
epoch 1  | loss: 0.44921 |  0:00:32s
epoch 2  | loss: 0.36358 |  0:00:44s
epoch 3  | loss: 0.32392 |  0:00:52s
epoch 4  | loss: 0.30741 |  0:01:01s


0.8072773615824427

In [8]:
# 사전 훈련을 위한 파라미터 설정
pretrain_params = {
    "optimizer_fn": torch.optim.Adam,
    "optimizer_params": dict(lr=2e-2),
    "mask_type": "entmax",  # "sparsemax"
    "verbose": 1
}

# 사전 훈련 모델 초기화
pretrainer = TabNetPretrainer(**pretrain_params)

# 사전 훈련 실행
pretrainer.fit(X_train.values, max_epochs=5)



epoch 0  | loss: 12267809.8613|  0:00:16s
epoch 1  | loss: 162680.52657|  0:00:33s
epoch 2  | loss: 123542.84819|  0:00:51s
epoch 3  | loss: 59452.34725|  0:01:10s
epoch 4  | loss: 48788.60158|  0:01:24s


In [11]:
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,  # 최적화 함수는 필요에 따라 설정
    optimizer_params=dict(lr=1e-3),  # 학습률은 주 훈련에 맞게 조정
    scheduler_params={"step_size": 50, "gamma": 0.9},  # 학습 스케줄러 설정
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    verbose=1
)

# 탭넷 분류 모델 학습
clf.fit(X_train.values, y_train.values, eval_set=[(X_test.values, y_test.values)], patience=3, max_epochs=5, eval_name=['test'], eval_metric=['auc'])
y_pred_proba = clf.predict_proba(X_test.values)[:,1]
roc_auc_score(y_test, y_pred_proba)



epoch 0  | loss: 0.41122 | test_auc: 0.76135 |  0:00:17s
epoch 1  | loss: 0.30034 | test_auc: 0.79994 |  0:00:32s
epoch 2  | loss: 0.2865  | test_auc: 0.82827 |  0:00:56s
epoch 3  | loss: 0.28001 | test_auc: 0.84083 |  0:01:16s
epoch 4  | loss: 0.27656 | test_auc: 0.84687 |  0:01:35s
Stop training because you reached max_epochs = 5 with best_epoch = 4 and best_test_auc = 0.84687




0.8468741471702276