In [19]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import roc_auc_score

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier

In [20]:
# Read the train data
train_origin = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')

In [35]:
train = train_origin.set_index('id').astype(str)
#train = train_origin.sample(frac=0.01, random_state = 42)
#train = train.set_index('id').astype(str)

X_train, X_valid = train_test_split(train, test_size=0.01, random_state=0, stratify=train['Response'])
y_train = X_train.pop('Response')
y_valid = X_valid.pop('Response')

enc = TargetEncoder(random_state=0)
X_train = pd.DataFrame(enc.fit_transform(X_train, y_train), index=X_train.index, columns=X_train.columns)
X_valid = pd.DataFrame(enc.transform(X_valid), index=X_valid.index, columns=X_valid.columns)


In [36]:
# 사전 훈련을 위한 파라미터 설정
pretrain_params = {
    "optimizer_fn": torch.optim.Adam,
    "optimizer_params": dict(lr=2e-2),
    "mask_type": "entmax",  # "sparsemax"
    "verbose": 1
}

# 사전 훈련 모델 초기화
pretrainer = TabNetPretrainer(**pretrain_params)

# 사전 훈련 실행
pretrainer.fit(X_train.values, max_epochs=7)



epoch 0  | loss: 197188.03535|  0:00:35s
epoch 1  | loss: 1512.47923|  0:01:02s
epoch 2  | loss: 1734.45615|  0:01:26s
epoch 3  | loss: 628.81769|  0:01:46s
epoch 4  | loss: 637.37872|  0:02:16s
epoch 5  | loss: 510.30608|  0:02:39s
epoch 6  | loss: 240.90775|  0:03:00s
epoch 7  | loss: 290.35662|  0:03:28s
epoch 8  | loss: 197.94813|  0:03:54s
epoch 9  | loss: 257.16839|  0:04:24s


In [38]:
TabNet_model = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,  # 최적화 함수는 필요에 따라 설정
    optimizer_params=dict(lr=1.5e-3),  # 학습률은 주 훈련에 맞게 조정
    scheduler_params={"step_size": 10, "gamma": 0.8},  # 학습 스케줄러 설정
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    verbose=1
)

TabNet_model.fit(
        X_train.values, y_train.values,
        eval_set=[(X_valid.values, y_valid.values)],
        eval_name=['test'],
        eval_metric=['auc'],
        max_epochs=10,  # 한 번에 한 epoch만 훈련
        patience=3
)

#import subprocess
#subprocess.run(['say', '-v', 'Yuna', '모델학습이, 드디어 끝났도다'])



epoch 0  | loss: 0.39213 | test_auc: 0.77351 |  0:00:24s
epoch 1  | loss: 0.29332 | test_auc: 0.79266 |  0:00:48s
epoch 2  | loss: 0.28262 | test_auc: 0.82356 |  0:01:12s
epoch 3  | loss: 0.2774  | test_auc: 0.82987 |  0:01:37s
epoch 4  | loss: 0.2744  | test_auc: 0.83312 |  0:01:57s
epoch 5  | loss: 0.27284 | test_auc: 0.83485 |  0:02:16s
epoch 6  | loss: 0.27088 | test_auc: 0.83688 |  0:02:56s
epoch 7  | loss: 0.26986 | test_auc: 0.83785 |  0:03:48s
epoch 8  | loss: 0.26911 | test_auc: 0.83722 |  0:04:30s
epoch 9  | loss: 0.26808 | test_auc: 0.83778 |  0:04:48s
epoch 10 | loss: 0.26749 | test_auc: 0.8399  |  0:05:09s
epoch 11 | loss: 0.26675 | test_auc: 0.84093 |  0:05:27s
epoch 12 | loss: 0.26653 | test_auc: 0.84112 |  0:05:48s
epoch 13 | loss: 0.26618 | test_auc: 0.84256 |  0:06:04s
epoch 14 | loss: 0.26569 | test_auc: 0.84207 |  0:06:24s
epoch 15 | loss: 0.26565 | test_auc: 0.84115 |  0:06:43s
epoch 16 | loss: 0.26507 | test_auc: 0.84155 |  0:07:02s
epoch 17 | loss: 0.26486 | test



CompletedProcess(args=['say', '-v', 'Yuna', '모델학습이, 드디어 끝났도다'], returncode=0)

In [30]:
# x_train_pred 예측 생성(StackNN 학습용)
y_train_pred = TabNet_model.predict_proba(X_train.values)[:,1]
train_pred = pd.DataFrame({'id': X_train.index, 'Response': y_train_pred })

# 예측을 CSV 파일로 저장
train_pred .to_csv('train_pred_tabnet.csv', index=False)
print("Predictions saved to 'train_pred_tabnet.csv'")

Predictions saved to 'train_pred_tabnet.csv'


In [None]:
#test 데이터 로드
test_origin = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/test.csv')

#인덱스 제외
X_test = test_origin.set_index('id').astype(str)

# 변수 타겟 인코딩
X_test = pd.DataFrame(enc.transform(X_test), index=X_test.index, columns=X_test.columns)

In [None]:
# 예측 생성
y_test_pred = TabNet_model.predict_proba(X_test.values)[:,1]

# 'id'와 'Response' 열이 있는 DataFrame 생성
submission = pd.DataFrame({'id': X_test.index, 'Response': y_test_pred})

# 예측을 CSV 파일로 저장
submission.to_csv('tabnet_predictions.csv', index=False)
print("Predictions saved to 'tabnet_predictions.csv'")