In [5]:
# ✅ TabNetClassifier 기반의 모델 학습 및 평가 함수들 (pytorch-tabnet 사용)
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, ParameterSampler
import numpy as np
import pandas as pd
import time
import random
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Malgun Gothic'
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 예시 확인용
from sklearn.datasets import make_classification

In [202]:
# ✅ TabNetClassifier 학습 함수
# 하이퍼파라미터(tabnet_params)와 수동 가중치(manual_weights)를 포함해 모델을 학습하고, validation 성능 기준으로 조기 종료

def train_tabnet_classifier(X_train, y_train, X_val, y_val,
                             manual_weights=None,
                             tabnet_params=None,
                             max_epochs=100, patience=10,
                             batch_size=512, device_name='auto'):
    """Fit a TabNetClassifier on standardized features, validating on (X_val, y_val).

    Args:
        X_train, y_train: Training features and labels (passed to ``clf.fit``).
        X_val, y_val: Validation features and labels, used as the single ``eval_set``.
        manual_weights: If not None, stored on the classifier as
            ``clf.manual_weights``. Nothing in this function passes it to
            ``fit``; it is only kept for custom extensions.
        tabnet_params: Extra keyword arguments for the ``TabNetClassifier``
            constructor. ``None`` means no extra arguments.
        max_epochs, patience, batch_size: Forwarded unchanged to ``clf.fit``.
        device_name: Forwarded unchanged to the ``TabNetClassifier`` constructor.

    Returns:
        The fitted classifier. The ``StandardScaler`` fitted on ``X_train`` is
        attached to it as ``clf.scaler_`` so later evaluation can apply the
        same transformation.
    """
    # NOTE: the previous version overwrote `device_name` with 'cuda' whenever
    # CUDA was available, which made an explicit device_name='cpu' impossible.
    # The caller's value is now honoured as-is.
    if tabnet_params is None:
        tabnet_params = {}

    # Standardize using statistics from the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    clf = TabNetClassifier(
        seed=42,      # fixed seed for reproducibility
        verbose=0,    # log level during training
        device_name=device_name,
        **tabnet_params
    )

    if manual_weights is not None:
        clf.manual_weights = manual_weights  # hook for custom weighting logic

    clf.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=['auc', 'logloss'],
        max_epochs=max_epochs,
        patience=patience,
        batch_size=batch_size,
    )

    # Keep the training-time scaler with the model: evaluating with a scaler
    # refit on other data would feed the network differently scaled inputs.
    clf.scaler_ = scaler
    return clf


# ✅ TabNetClassifier evaluation function
# Computes ROC AUC and PR AUC (average precision).

def evaluate_tabnet_classifier(clf, X, y, scaler=None):
    """Score a fitted classifier on (X, y) and return ``(roc_auc, pr_auc)``.

    Args:
        clf: Fitted classifier exposing ``predict_proba``.
        X: Unscaled features.
        y: True binary labels.
        scaler: Optional fitted scaler to apply to ``X``. When omitted, the
            scaler stored on the model as ``clf.scaler_`` is used if that
            attribute exists.

    Returns:
        Tuple ``(roc, pr)`` from ``roc_auc_score`` and ``average_precision_score``
        computed on the positive-class probabilities (column 1).
    """
    if scaler is None:
        scaler = getattr(clf, 'scaler_', None)

    if scaler is not None:
        X = scaler.transform(X)
    else:
        # Legacy fallback (no training scaler available): standardize with the
        # statistics of X itself, as the original implementation always did.
        X = StandardScaler().fit_transform(X)

    preds = clf.predict_proba(X)[:, 1]
    roc = roc_auc_score(y, preds)
    pr = average_precision_score(y, preds)
    return roc, pr


# ✅ TabNetClassifier 기반 교차검증 함수
# 지정된 수동 가중치(manual_weights)와 TabNet 하이퍼파라미터(tabnet_params)를 사용하여 K겹 교차검증 수행

def cross_validate_tabnet_classifier(X, y,
                                     manual_weights=None,
                                     tabnet_params=None,
                                     k=5,
                                     max_epochs=200,
                                     patience=30,
                                     batch_size=512,
                                     device_name='auto'):
    """Run stratified k-fold cross-validation with one fixed TabNet configuration.

    ``X`` and ``y`` are indexed positionally with the fold index arrays
    (``X[idx]``). Each fold trains a fresh model with
    ``train_tabnet_classifier`` and scores it on the held-out part with
    ``evaluate_tabnet_classifier``.

    Prints mean ± std of both metrics and returns ``(roc_list, pr_list)``,
    one entry per fold.
    """
    # Stratified splitting keeps the class ratio in every fold.
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    # Identical training options are used for every fold.
    fit_options = dict(
        manual_weights=manual_weights,
        tabnet_params=tabnet_params,
        max_epochs=max_epochs,
        patience=patience,
        batch_size=batch_size,
        device_name=device_name,
    )

    fold_scores = []  # one (roc, pr) pair per fold
    for fit_rows, holdout_rows in splitter.split(X, y):
        X_val, y_val = X[holdout_rows], y[holdout_rows]
        fold_model = train_tabnet_classifier(
            X[fit_rows], y[fit_rows], X_val, y_val, **fit_options
        )
        fold_scores.append(evaluate_tabnet_classifier(fold_model, X_val, y_val))

    roc_list = [roc for roc, _ in fold_scores]
    pr_list = [pr for _, pr in fold_scores]

    print(f"ROC AUC (mean ± std): {np.mean(roc_list):.4f} ± {np.std(roc_list):.4f}")
    print(f"PR  AUC (mean ± std): {np.mean(pr_list):.4f} ± {np.std(pr_list):.4f}")
    return roc_list, pr_list


# ✅ TabNetClassifier용 랜덤 서치 함수
# PR AUC를 기준으로 하이퍼파라미터 조합 중 상위 5개를 출력하고, 최고 조합 반환

def random_search_tabnet(X, y, param_dist, n_iter=10, k=3,
                          max_epochs=300, patience=50,
                          batch_size=512, device_name='auto'):
    """Random hyperparameter search for TabNet, ranked by mean PR AUC.

    Draws ``n_iter`` configurations from ``param_dist`` with
    ``ParameterSampler``, cross-validates each one with
    ``cross_validate_tabnet_classifier`` (``k`` folds), prints the five best
    by mean PR AUC and returns the sampled parameter dict of the winner.

    Whenever a sample contains ``n_d``, its ``n_a`` is overwritten with the
    same value.
    """
    # Only these sampled entries are forwarded to the TabNetClassifier constructor.
    constructor_keys = (
        "n_d", "n_a", "n_steps", "gamma",
        "lambda_sparse", "optimizer_params",
        "virtual_batch_size", "momentum",
    )
    trials = []  # (mean PR AUC, mean ROC AUC, sampled params)

    for params in ParameterSampler(param_dist, n_iter=n_iter, random_state=None):
        # Force n_a to follow n_d.
        if 'n_d' in params:
            params['n_a'] = params['n_d']

        print(f"Testing params: {params}")

        roc_scores, pr_scores = cross_validate_tabnet_classifier(
            X, y,
            manual_weights=params.get("manual_weights"),
            tabnet_params={name: params[name] for name in constructor_keys if name in params},
            k=k,
            max_epochs=max_epochs,
            patience=patience,
            batch_size=batch_size,
            device_name=device_name,
        )
        trials.append((np.mean(pr_scores), np.mean(roc_scores), params))

    # Best mean PR AUC first; keep at most five for the report.
    top_results = sorted(trials, key=lambda trial: trial[0], reverse=True)[:5]

    print("\nTop 5 PR AUC results:")
    for i, (pr_score, roc_score, params) in enumerate(top_results, 1):
        print(f"{i}. ROC AUC = {roc_score:.4f}, PR AUC = {pr_score:.4f} with params: {params}")

    return top_results[0][2]

In [7]:
# ✅ TabNetClassifier 기반의 모델 학습 및 평가 함수들 (DataFrame 친화 버전)
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold, ParameterSampler
from sklearn.preprocessing import StandardScaler
import numpy as np
import time

# ✅ TabNetClassifier 학습 함수 (DataFrame 버전)
def train_tabnet_classifier_df(X_train, y_train, X_val, y_val,
                                manual_weights=None,
                                tabnet_params=None,
                                max_epochs=100, patience=10,
                                batch_size=1024, device_name='auto'):
    """Fit a TabNetClassifier on standardized features and return it with its scaler.

    Accepts pandas objects or plain array-likes for both features and labels;
    everything is converted to NumPy arrays before being handed to ``clf.fit``.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels, used as the single ``eval_set``.
        manual_weights: If not None, stored on the classifier as
            ``clf.manual_weights``; it is not passed to ``fit`` here.
        tabnet_params: Extra keyword arguments for the ``TabNetClassifier``
            constructor. ``None`` means no extra arguments.
        max_epochs, patience, batch_size: Forwarded unchanged to ``clf.fit``.
        device_name: Forwarded unchanged to the ``TabNetClassifier`` constructor.

    Returns:
        ``(clf, scaler)``: the fitted classifier and the ``StandardScaler``
        fitted on ``X_train`` (needed to transform data at evaluation time).
    """
    if tabnet_params is None:
        tabnet_params = {}

    # Scale with training statistics only. The earlier version re-wrapped the
    # scaled arrays in DataFrames and immediately unwrapped them with `.values`;
    # np.asarray yields the same arrays without requiring `.columns`.
    scaler = StandardScaler()
    X_train_scaled = np.asarray(scaler.fit_transform(X_train))
    X_val_scaled = np.asarray(scaler.transform(X_val))

    # Works for Series and ndarrays alike (no dependency on `.values`).
    y_train_arr = np.asarray(y_train)
    y_val_arr = np.asarray(y_val)

    clf = TabNetClassifier(
        seed=42,
        verbose=0,
        device_name=device_name,
        **tabnet_params
    )

    if manual_weights is not None:
        clf.manual_weights = manual_weights  # hook for custom weighting logic

    clf.fit(
        X_train=X_train_scaled, y_train=y_train_arr,
        eval_set=[(X_val_scaled, y_val_arr)],
        eval_metric=['auc', 'logloss'],
        max_epochs=max_epochs,
        patience=patience,
        batch_size=batch_size,
    )
    return clf, scaler

# ✅ TabNetClassifier 평가 함수 (DataFrame 버전)
def evaluate_tabnet_classifier_df(clf, X, y, scaler):
    """Score a fitted classifier on (X, y) using the training-time scaler.

    Args:
        clf: Fitted classifier exposing ``predict_proba``.
        X: Unscaled features (DataFrame or array-like).
        y: True binary labels.
        scaler: The already-fitted scaler returned by training; only
            ``transform`` is called on it.

    Returns:
        Tuple ``(roc, pr)`` from ``roc_auc_score`` and ``average_precision_score``
        computed on the positive-class probabilities (column 1).
    """
    # The earlier version wrapped the scaled array in a DataFrame (requiring
    # X.columns) and then took `.values` again; np.asarray gives the same array
    # and also accepts plain ndarrays.
    X_scaled = np.asarray(scaler.transform(X))

    preds = clf.predict_proba(X_scaled)[:, 1]
    roc = roc_auc_score(y, preds)
    pr = average_precision_score(y, preds)
    return roc, pr

# ✅ 교차검증 함수

def cross_validate_tabnet_classifier_df(X, y,
                                        manual_weights=None,
                                        tabnet_params=None,
                                        k=5,
                                        max_epochs=200,
                                        patience=30,
                                        batch_size=1024,
                                        device_name='auto'):
    """Stratified k-fold cross-validation for pandas inputs.

    ``X`` and ``y`` are sliced with ``.iloc`` using the fold indices. Each
    fold trains with ``train_tabnet_classifier_df`` and is scored on its
    held-out rows with ``evaluate_tabnet_classifier_df``, reusing the scaler
    returned by training.

    Prints mean ± std of both metrics and returns ``(roc_list, pr_list)``,
    one entry per fold.
    """
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    # Identical training options are used for every fold.
    fit_options = dict(
        manual_weights=manual_weights,
        tabnet_params=tabnet_params,
        max_epochs=max_epochs,
        patience=patience,
        batch_size=batch_size,
        device_name=device_name,
    )

    fold_scores = []  # one (roc, pr) pair per fold
    for fit_rows, holdout_rows in splitter.split(X.values, y):
        X_val, y_val = X.iloc[holdout_rows], y.iloc[holdout_rows]
        fold_model, fold_scaler = train_tabnet_classifier_df(
            X.iloc[fit_rows], y.iloc[fit_rows], X_val, y_val, **fit_options
        )
        fold_scores.append(
            evaluate_tabnet_classifier_df(fold_model, X_val, y_val, fold_scaler)
        )

    roc_list = [roc for roc, _ in fold_scores]
    pr_list = [pr for _, pr in fold_scores]

    print(f"ROC AUC (mean ± std): {np.mean(roc_list):.4f} ± {np.std(roc_list):.4f}")
    print(f"PR  AUC (mean ± std): {np.mean(pr_list):.4f} ± {np.std(pr_list):.4f}")
    return roc_list, pr_list

# ✅ 랜덤서치 함수

def random_search_tabnet_df(X, y, param_dist, n_iter=10, k=3,
                             max_epochs=100, patience=10,
                             batch_size=1024, device_name='auto'):
    """Random hyperparameter search for TabNet on pandas inputs, ranked by mean PR AUC.

    Draws ``n_iter`` configurations from ``param_dist`` with
    ``ParameterSampler``, cross-validates each with
    ``cross_validate_tabnet_classifier_df`` (``k`` folds), prints the five
    best by mean PR AUC and returns the sampled parameter dict of the winner.

    Whenever a sample contains ``n_d``, its ``n_a`` is overwritten with the
    same value.
    """
    # Sampled entries outside this whitelist never reach the TabNetClassifier constructor.
    forwarded = (
        "n_d", "n_a", "n_steps", "gamma",
        "lambda_sparse", "optimizer_params",
        "virtual_batch_size", "momentum",
    )
    scoreboard = []  # (mean PR AUC, mean ROC AUC, sampled params)

    for params in ParameterSampler(param_dist, n_iter=n_iter, random_state=None):
        # Force n_a to follow n_d.
        if 'n_d' in params:
            params['n_a'] = params['n_d']
        print(f"Testing params: {params}")

        model_kwargs = {}
        for name in forwarded:
            if name in params:
                model_kwargs[name] = params[name]

        roc_scores, pr_scores = cross_validate_tabnet_classifier_df(
            X, y,
            manual_weights=params.get("manual_weights"),
            tabnet_params=model_kwargs,
            k=k,
            max_epochs=max_epochs,
            patience=patience,
            batch_size=batch_size,
            device_name=device_name,
        )
        scoreboard.append((np.mean(pr_scores), np.mean(roc_scores), params))

    # Best mean PR AUC first; keep at most five for the report.
    top_results = sorted(scoreboard, key=lambda entry: entry[0], reverse=True)[:5]

    print("\nTop 5 PR AUC results:")
    for i, (pr_score, roc_score, params) in enumerate(top_results, 1):
        print(f"{i}. ROC AUC = {roc_score:.4f}, PR AUC = {pr_score:.4f} with params: {params}")

    return top_results[0][2]


In [9]:
# ✅ 1. 시드 고정 함수 정의
def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) with ``seed``.

    Also switches cuDNN to deterministic mode and turns its benchmark
    autotuner off.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# ✅ 2. Fix the seeds for this notebook session
seed_everything(42)

In [None]:
# ✅ TabNet example code (smoke test to confirm the library works)
# NOTE(review): this cell rebinds the notebook-level names X, y, X_train, X_test,
# y_train, y_test, clf and roc.

# ✅ 1. Generate dummy data (binary classification)
X, y = make_classification(
    n_samples=1000, n_features=20,
    n_informative=10, n_redundant=5,
    random_state=42
)

# ✅ 2. Train/test split (stratified on y, 20% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ✅ 3. Initialize and train TabNetClassifier
clf = TabNetClassifier(
    seed=42,
    verbose=0,
    device_name='auto'  # auto-detect: use the GPU when one is available
)

# NOTE(review): the test split doubles as the early-stopping eval_set here, so
# the scores printed below are measured on data that influenced training length.
clf.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=['auc'],
    max_epochs=100,
    patience=10,
    batch_size=256
)

# ✅ 4. Predict: positive-class probabilities and hard labels
preds_proba = clf.predict_proba(X_test)[:, 1]
preds_label = clf.predict(X_test)

# ✅ 5. Report performance
roc = roc_auc_score(y_test, preds_proba)
acc = accuracy_score(y_test, preds_label)

print(f"ROC AUC: {roc:.4f}")
print(f"Accuracy: {acc:.4f}")

In [11]:
# Load the road dataset and build the feature matrix / target.
# The selected columns are passed through pd.get_dummies before use.
data = pd.read_csv('data/(최종)_서울열선_광진도로.csv')
X = pd.get_dummies(data[['도로 종류', '도로폭', '경사각', '최근접_시설의_평균거리', '종합_평균_기온', '생활인구', '최근접_시설들_최소거리', '최근접_시설들_최대거리']])
y = data['열선']

# Stratified 80/20 split, then cast every feature column to float.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train = X_train.astype('float')
X_test = X_test.astype('float')
X = X.astype('float')
# 💡 Reset the indices (per the original author: without this an error can occur
# when batch_size equals the full dataset size)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [57]:
# Single fixed TabNet configuration (scalar values, not lists to sample from);
# the training cell passes this dict as `tabnet_params`.
param_dist = {
    # Representation width (affects expressive capacity)
    "n_d": 64,
    "n_a": 64,  # usually set to the same value as n_d
    # Model depth (deeper means more complexity and longer training)
    "n_steps": 3,
    # Sparsity-related hyperparameters (how strongly feature selection is enforced)
    "gamma": 2.0,
    "lambda_sparse": 0.15,
    # Optimizer learning rate
    "optimizer_params": dict(lr=0.001)
}

In [59]:
# ✅ 3. Initialize and train TabNetClassifier with the fixed configuration
# NOTE(review): X_test/y_test are passed as the validation set, so the same split
# is used for early stopping and for the scores printed below.
model, scaler = train_tabnet_classifier_df(X_train=X_train, y_train=y_train, X_val=X_test, y_val=y_test,patience=250, max_epochs=500, tabnet_params=param_dist)

# ✅ 4. Predict and evaluate, reusing the scaler returned by training
roc, pr = evaluate_tabnet_classifier_df(model, X_test, y_test, scaler)
print(f"ROC AUC: {roc:.4f}")
print(f"PR  AUC: {pr:.4f}")

# 5-fold (default k) cross-validation on the training split with the same configuration;
# as the last expression of the cell, the returned (roc_list, pr_list) is displayed.
cross_validate_tabnet_classifier_df(X_train, y_train, batch_size=1024, tabnet_params=param_dist, max_epochs=200, patience=60)

Stop training because you reached max_epochs = 500 with best_epoch = 474 and best_val_0_logloss = 0.17709
Best weights from best epoch are automatically used!
ROC AUC: 0.9402
PR  AUC: 0.8579
Stop training because you reached max_epochs = 200 with best_epoch = 190 and best_val_0_logloss = 0.21651
Best weights from best epoch are automatically used!
Stop training because you reached max_epochs = 200 with best_epoch = 196 and best_val_0_logloss = 0.20004
Best weights from best epoch are automatically used!
Stop training because you reached max_epochs = 200 with best_epoch = 149 and best_val_0_logloss = 0.23165
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 172 with best_epoch = 112 and best_val_0_logloss = 0.1998
Best weights from best epoch are automatically used!
Stop training because you reached max_epochs = 200 with best_epoch = 185 and best_val_0_logloss = 0.21332
Best weights from best epoch are automatically used!
ROC AUC (mean ± std): 0.9198

([0.917670902716915,
  0.9325007721984339,
  0.9088652093385974,
  0.9312399473607254,
  0.908533167617098],
 [0.7792528661717667,
  0.819999157915361,
  0.759417458367711,
  0.8105622880393578,
  0.7974581902997006])

In [13]:
# Search space for the random search: every value is a list of candidates.
# NOTE: this rebinds `param_dist`, which an earlier cell defines as a fixed configuration.
param_dist = {
    # Representation width (affects expressive capacity)
    "n_d": [32, 64],
    "n_a": [32, 64],  # usually set to the same value as n_d

    # Model depth (deeper means more complexity and longer training)
    "n_steps": [3, 4, 5],

    # Sparsity-related hyperparameters (how strongly feature selection is enforced)
    "gamma": [1.5, 1.7, 1.8, 2.0],
    "lambda_sparse": [1e-2, 5e-2, 1e-1],

    # Optimizer learning rate
    "optimizer_params": [
        dict(lr=0.001),
        dict(lr=0.005),
        dict(lr=0.01)
    ],

    # # Virtual batch size (intended to stabilize batch normalization)
    # "virtual_batch_size": [128, 256],

    # # Ghost BN momentum
    # "momentum": [0.02, 0.05],

    # Manual feature weights, reserved for future extension
    "manual_weights": [None]
}


In [23]:
# Random search on the training split: 5 sampled configurations, 3-fold CV each,
# with batch_size set to the number of rows in X_train. The returned best-parameter
# dict is displayed as the cell output.
random_search_tabnet_df(X_train, y_train, param_dist=param_dist, n_iter=5, k=3, batch_size=X_train.shape[0], max_epochs=300, patience=150)

Testing params: {'optimizer_params': {'lr': 0.01}, 'n_steps': 3, 'n_d': 32, 'n_a': 32, 'manual_weights': None, 'lambda_sparse': 0.01, 'gamma': 1.5}
Stop training because you reached max_epochs = 300 with best_epoch = 163 and best_val_0_logloss = 0.20262
Best weights from best epoch are automatically used!
Stop training because you reached max_epochs = 300 with best_epoch = 271 and best_val_0_logloss = 0.20769
Best weights from best epoch are automatically used!
Stop training because you reached max_epochs = 300 with best_epoch = 191 and best_val_0_logloss = 0.20679
Best weights from best epoch are automatically used!
ROC AUC (mean ± std): 0.9238 ± 0.0048
PR  AUC (mean ± std): 0.8092 ± 0.0028
Testing params: {'optimizer_params': {'lr': 0.01}, 'n_steps': 4, 'n_d': 64, 'n_a': 64, 'manual_weights': None, 'lambda_sparse': 0.05, 'gamma': 1.7}
Stop training because you reached max_epochs = 300 with best_epoch = 195 and best_val_0_logloss = 0.20762
Best weights from best epoch are automaticall

{'optimizer_params': {'lr': 0.01},
 'n_steps': 4,
 'n_d': 32,
 'n_a': 32,
 'manual_weights': None,
 'lambda_sparse': 0.1,
 'gamma': 1.5}

In [126]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): this rebinds the notebook-level `scaler` (also assigned by the
# training cell) and reuses ONE scaler object for two fits.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # scaled with statistics of the full X
X_train_scaled = scaler.fit_transform(X_train)   # fit + transform (refits: scaler now holds X_train statistics)
X_val_scaled = scaler.transform(X_test)  # X_test transformed with the X_train statistics

In [21]:
# Number of rows in the training split (displayed as the cell output).
X_train.shape[0]

5727