base(Python 3.8.3) 커널 선택

In [None]:
#!pip install torch torchvision pytorch-tabnet

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer


In [6]:
# 데이터 로드
train = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')
train = train.sample(frac=0.01, random_state = 42)

# 범주형 변수를 팩터로 변환 (카테고리형)
train.iloc[:,[1,3,4,5,6,7,9]] = train.iloc[:,[1,3,4,5,6,7,9]].astype('category')

# 최소-최대 정규화 (Standard 스케일링)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train.iloc[:,[2,8,10]] = scaler.fit_transform(train.iloc[:,[2,8,10]])

# 이분변수 생성: "Annual_Premium" == 2630.0 인 경우
train['Annual_Premium_Binary'] = (train['Annual_Premium'] == 2630.0).astype('category')

# 로그 변환된 "Annual_Premium" 변수 생성
train['Annual_Premium_Log'] = np.where(train['Annual_Premium'] > 0, np.log1p(train['Annual_Premium']), 0)

# 예측에 필요 없는 'id'와 'Annual_Premium' 변수를 드롭
train = train.drop(columns=['id', 'Annual_Premium'])

# 원-핫 인코딩 (One-Hot Encoding)
category_columns = ['Gender', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel', 'Annual_Premium_Binary']
train = pd.get_dummies(train, columns=category_columns, drop_first=True, dtype=int)

# XGBoost에서 발생하는 문제 해결
train.columns = train.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '')

# 특징과 레이블 분리
X = train.drop(columns=['Response'])
y = train['Response']

# 훈련 세트와 테스트 세트로 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [7]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [9]:
# 사전 훈련을 위한 파라미터 설정
pretrain_params = {
    "optimizer_fn": torch.optim.Adam,
    "optimizer_params": dict(lr=2e-2),
    "mask_type": "entmax",  # "sparsemax"
    "verbose": 1
}

# 사전 훈련 모델 초기화
pretrainer = TabNetPretrainer(**pretrain_params)

# 사전 훈련 실행
pretrainer.fit(X_train=X_train, max_epochs=2)



epoch 0  | loss: 41.89587|  0:00:29s
epoch 1  | loss: 0.63375 |  0:00:56s
epoch 2  | loss: 0.59554 |  0:01:18s
epoch 3  | loss: 0.58189 |  0:01:46s
epoch 4  | loss: 0.57577 |  0:02:13s
epoch 5  | loss: 0.57751 |  0:02:36s
epoch 6  | loss: 0.57401 |  0:02:59s
epoch 7  | loss: 0.56716 |  0:03:24s
epoch 8  | loss: 0.56276 |  0:03:51s
epoch 9  | loss: 0.55913 |  0:04:13s


In [30]:
# 탭넷 분류 모델 초기화
#clf = TabNetClassifier()

clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,  # 최적화 함수는 필요에 따라 설정
    optimizer_params=dict(lr=1e-3),  # 학습률은 주 훈련에 맞게 조정
    scheduler_params={"step_size": 50, "gamma": 0.9},  # 학습 스케줄러 설정
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    verbose=1
)



In [33]:
# 탭넷 분류 모델 학습
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], patience=3, max_epochs=5, eval_name=['test'], eval_metric=['accuracy'])
roc_auc_score(y_test, y_pred_proba)

epoch 0  | loss: 0.61286 | test_accuracy: 0.87731 |  0:01:11s
epoch 1  | loss: 0.48734 | test_accuracy: 0.87727 |  0:02:15s
epoch 2  | loss: 0.4286  | test_accuracy: 0.87731 |  0:03:41s
epoch 3  | loss: 0.37945 | test_accuracy: 0.8774  |  0:04:59s
epoch 4  | loss: 0.34294 | test_accuracy: 0.87731 |  0:06:12s
epoch 5  | loss: 0.32534 | test_accuracy: 0.87723 |  0:07:48s
epoch 6  | loss: 0.3137  | test_accuracy: 0.87558 |  0:09:05s
epoch 7  | loss: 0.30458 | test_accuracy: 0.87618 |  0:10:25s
epoch 8  | loss: 0.29685 | test_accuracy: 0.87649 |  0:12:10s

Early stopping occurred at epoch 8 with best_epoch = 3 and best_test_accuracy = 0.8774




In [28]:
import joblib

#모델 저장
filepath1 = "pretrainer.joblib"
joblib.dump(pretrainer, filepath1)

filepath2 = "model_weights.joblib"
joblib.dump(clf, filepath2)

['model_weights.joblib']

In [41]:
#모델 로드
filepath = "model_weights.joblib"
clf_loaded = joblib.load(filepath)

In [44]:
#모델 로드
filepath = "model_weights.joblib"
clf_loaded = joblib.load(filepath)

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc

# 모델 예측
y_pred = clf_loaded.predict(X_test)

# 모델 예측 확률
y_pred_proba = clf_loaded.predict_proba(X_test)[:, 1]

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
print("\nTest accuracy:", accuracy)

# 혼동행렬 계산
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

# ROC AUC 계산
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("\nROC AUC Score:", roc_auc)


Test accuracy: 0.8774011299435028

Confusion Matrix:
 [[20187     0]
 [ 2821     2]]

ROC AUC Score: 0.7785392990698149
