In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier

from sklearn.model_selection import train_test_split

import optuna
from sklearn.model_selection import KFold

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('./Tabnet_Raw_final.csv')

In [3]:
df.replace((np.inf, -np.inf), np.nan, inplace=True)
df.dropna(inplace=True)

In [9]:
train = df[df['회계년도'] <= 2017]
test = df[df['회계년도'] > 2017]

X_train = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1)
X_test = test.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1)

y_train = train['부실']
y_test = test['부실']

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X = X_train.values
y = y_train.values

def Objective(trial):
    mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
    n_da = trial.suggest_int("n_da", 56, 64, step=4)
    n_steps = trial.suggest_int("n_steps", 1, 3, step=1)
    gamma = trial.suggest_float("gamma", 1., 1.4, step=0.2)
    n_shared = trial.suggest_int("n_shared", 1, 3)
    lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)
    tabnet_params = dict(n_d=n_da, n_a=n_da, n_steps=n_steps, gamma=gamma,
                    lambda_sparse=lambda_sparse, optimizer_fn=torch.optim.Adam,
                    optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                    mask_type=mask_type, n_shared=n_shared,
                    scheduler_params=dict(mode="min",
                                        patience=trial.suggest_int("patienceScheduler",low=3,high=10), # changing sheduler patience to be lower than early stopping patience 
                                        min_lr=1e-5,
                                        factor=0.5,),
                    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                    verbose=0,
                    ) #early stopping
    kf = KFold(n_splits=3, random_state=42, shuffle=True)
    CV_score_array    =[]
    for train_index, test_index in kf.split(X):
        X_train, X_valid = X[train_index], X[test_index]
        y_train, y_valid = y[train_index], y[test_index]
        clf = TabNetClassifier(**tabnet_params)
        clf.fit(X_train=X_train, y_train=y_train,
                eval_set=[(X_valid, y_valid)],
                patience=trial.suggest_int("patience",low=15,high=30), max_epochs=trial.suggest_int('epochs', 1, 100),
                eval_metric=['auc'])
        CV_score_array.append(clf.best_cost)
    avg = np.mean(CV_score_array)
    return avg

In [10]:
study = optuna.create_study(direction="maximize", study_name='TabNet optimization')
study.optimize(Objective, n_trials=20) # timeout=6*60

[32m[I 2023-04-28 21:45:15,199][0m A new study created in memory with name: TabNet optimization[0m


Stop training because you reached max_epochs = 72 with best_epoch = 71 and best_val_0_auc = 0.81428




Stop training because you reached max_epochs = 72 with best_epoch = 67 and best_val_0_auc = 0.85482





Early stopping occurred at epoch 65 with best_epoch = 47 and best_val_0_auc = 0.86366


[32m[I 2023-04-28 21:50:04,633][0m Trial 0 finished with value: 0.8442519738092339 and parameters: {'mask_type': 'sparsemax', 'n_da': 64, 'n_steps': 3, 'gamma': 1.2, 'n_shared': 2, 'lambda_sparse': 1.653323480984695e-05, 'patienceScheduler': 3, 'patience': 18, 'epochs': 72}. Best is trial 0 with value: 0.8442519738092339.[0m


Stop training because you reached max_epochs = 43 with best_epoch = 30 and best_val_0_auc = 0.82774




Stop training because you reached max_epochs = 43 with best_epoch = 38 and best_val_0_auc = 0.84249





Early stopping occurred at epoch 35 with best_epoch = 17 and best_val_0_auc = 0.84529


[32m[I 2023-04-28 21:52:07,797][0m Trial 1 finished with value: 0.8385051415723365 and parameters: {'mask_type': 'sparsemax', 'n_da': 64, 'n_steps': 2, 'gamma': 1.2, 'n_shared': 2, 'lambda_sparse': 0.0003646795370874319, 'patienceScheduler': 3, 'patience': 18, 'epochs': 43}. Best is trial 0 with value: 0.8442519738092339.[0m


Stop training because you reached max_epochs = 23 with best_epoch = 2 and best_val_0_auc = 0.73396




Stop training because you reached max_epochs = 23 with best_epoch = 10 and best_val_0_auc = 0.78068




Stop training because you reached max_epochs = 23 with best_epoch = 21 and best_val_0_auc = 0.84291


[32m[I 2023-04-28 21:53:21,778][0m Trial 2 finished with value: 0.785851014051676 and parameters: {'mask_type': 'entmax', 'n_da': 64, 'n_steps': 3, 'gamma': 1.2, 'n_shared': 1, 'lambda_sparse': 3.0276883793141853e-05, 'patienceScheduler': 8, 'patience': 28, 'epochs': 23}. Best is trial 0 with value: 0.8442519738092339.[0m


Stop training because you reached max_epochs = 49 with best_epoch = 48 and best_val_0_auc = 0.81207




Stop training because you reached max_epochs = 49 with best_epoch = 48 and best_val_0_auc = 0.76968


[32m[I 2023-04-28 21:54:18,475][0m Trial 3 finished with value: 0.7937882850699701 and parameters: {'mask_type': 'sparsemax', 'n_da': 60, 'n_steps': 1, 'gamma': 1.4, 'n_shared': 1, 'lambda_sparse': 7.099637609814594e-06, 'patienceScheduler': 7, 'patience': 21, 'epochs': 49}. Best is trial 0 with value: 0.8442519738092339.[0m



Early stopping occurred at epoch 28 with best_epoch = 7 and best_val_0_auc = 0.79962
Stop training because you reached max_epochs = 42 with best_epoch = 41 and best_val_0_auc = 0.81767




Stop training because you reached max_epochs = 42 with best_epoch = 41 and best_val_0_auc = 0.7973





Early stopping occurred at epoch 29 with best_epoch = 13 and best_val_0_auc = 0.84961


[32m[I 2023-04-28 21:56:24,363][0m Trial 4 finished with value: 0.8215287477681498 and parameters: {'mask_type': 'sparsemax', 'n_da': 60, 'n_steps': 3, 'gamma': 1.2, 'n_shared': 2, 'lambda_sparse': 0.00022680327808937193, 'patienceScheduler': 7, 'patience': 16, 'epochs': 42}. Best is trial 0 with value: 0.8442519738092339.[0m


Stop training because you reached max_epochs = 72 with best_epoch = 61 and best_val_0_auc = 0.85054





Early stopping occurred at epoch 19 with best_epoch = 1 and best_val_0_auc = 0.75786





Early stopping occurred at epoch 58 with best_epoch = 40 and best_val_0_auc = 0.86545


[32m[I 2023-04-28 21:58:23,933][0m Trial 5 finished with value: 0.8246162054930664 and parameters: {'mask_type': 'entmax', 'n_da': 64, 'n_steps': 1, 'gamma': 1.0, 'n_shared': 3, 'lambda_sparse': 2.455053258025464e-05, 'patienceScheduler': 6, 'patience': 18, 'epochs': 72}. Best is trial 0 with value: 0.8442519738092339.[0m


Stop training because you reached max_epochs = 74 with best_epoch = 70 and best_val_0_auc = 0.86507




Stop training because you reached max_epochs = 74 with best_epoch = 67 and best_val_0_auc = 0.83109




Stop training because you reached max_epochs = 74 with best_epoch = 63 and best_val_0_auc = 0.85858


[32m[I 2023-04-28 22:02:09,536][0m Trial 6 finished with value: 0.8515794855668214 and parameters: {'mask_type': 'entmax', 'n_da': 60, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 6.810295174050659e-05, 'patienceScheduler': 5, 'patience': 23, 'epochs': 74}. Best is trial 6 with value: 0.8515794855668214.[0m


Stop training because you reached max_epochs = 37 with best_epoch = 34 and best_val_0_auc = 0.79269




Stop training because you reached max_epochs = 37 with best_epoch = 32 and best_val_0_auc = 0.84089




Stop training because you reached max_epochs = 37 with best_epoch = 36 and best_val_0_auc = 0.82082


[32m[I 2023-04-28 22:03:39,313][0m Trial 7 finished with value: 0.8181331813918441 and parameters: {'mask_type': 'sparsemax', 'n_da': 64, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 1, 'lambda_sparse': 1.2661942471788473e-05, 'patienceScheduler': 6, 'patience': 27, 'epochs': 37}. Best is trial 6 with value: 0.8515794855668214.[0m


Stop training because you reached max_epochs = 27 with best_epoch = 26 and best_val_0_auc = 0.85165




Stop training because you reached max_epochs = 27 with best_epoch = 7 and best_val_0_auc = 0.73776




Stop training because you reached max_epochs = 27 with best_epoch = 5 and best_val_0_auc = 0.77769


[32m[I 2023-04-28 22:04:29,387][0m Trial 8 finished with value: 0.789034481730908 and parameters: {'mask_type': 'entmax', 'n_da': 60, 'n_steps': 1, 'gamma': 1.0, 'n_shared': 3, 'lambda_sparse': 0.0003985644058373171, 'patienceScheduler': 10, 'patience': 28, 'epochs': 27}. Best is trial 6 with value: 0.8515794855668214.[0m


Stop training because you reached max_epochs = 5 with best_epoch = 2 and best_val_0_auc = 0.74402




Stop training because you reached max_epochs = 5 with best_epoch = 3 and best_val_0_auc = 0.75328




Stop training because you reached max_epochs = 5 with best_epoch = 4 and best_val_0_auc = 0.77317


[32m[I 2023-04-28 22:04:48,622][0m Trial 9 finished with value: 0.7568244518899094 and parameters: {'mask_type': 'sparsemax', 'n_da': 64, 'n_steps': 2, 'gamma': 1.0, 'n_shared': 3, 'lambda_sparse': 2.5854930796372623e-06, 'patienceScheduler': 8, 'patience': 27, 'epochs': 5}. Best is trial 6 with value: 0.8515794855668214.[0m


Stop training because you reached max_epochs = 100 with best_epoch = 86 and best_val_0_auc = 0.89197




Stop training because you reached max_epochs = 100 with best_epoch = 99 and best_val_0_auc = 0.83408





Early stopping occurred at epoch 56 with best_epoch = 33 and best_val_0_auc = 0.85313


[32m[I 2023-04-28 22:08:42,988][0m Trial 10 finished with value: 0.8597263305499867 and parameters: {'mask_type': 'entmax', 'n_da': 56, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 0.0001021180196306919, 'patienceScheduler': 4, 'patience': 23, 'epochs': 100}. Best is trial 10 with value: 0.8597263305499867.[0m


Stop training because you reached max_epochs = 97 with best_epoch = 84 and best_val_0_auc = 0.88647





Early stopping occurred at epoch 25 with best_epoch = 2 and best_val_0_auc = 0.76484





Early stopping occurred at epoch 30 with best_epoch = 7 and best_val_0_auc = 0.82558


[32m[I 2023-04-28 22:11:05,326][0m Trial 11 finished with value: 0.8256312410202135 and parameters: {'mask_type': 'entmax', 'n_da': 56, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 0.00010901264761156722, 'patienceScheduler': 4, 'patience': 23, 'epochs': 97}. Best is trial 10 with value: 0.8597263305499867.[0m



Early stopping occurred at epoch 94 with best_epoch = 71 and best_val_0_auc = 0.89627





Early stopping occurred at epoch 26 with best_epoch = 3 and best_val_0_auc = 0.75136




Stop training because you reached max_epochs = 98 with best_epoch = 94 and best_val_0_auc = 0.85763


[32m[I 2023-04-28 22:14:24,814][0m Trial 12 finished with value: 0.8350877305712818 and parameters: {'mask_type': 'entmax', 'n_da': 56, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 8.59802289339536e-05, 'patienceScheduler': 5, 'patience': 23, 'epochs': 98}. Best is trial 10 with value: 0.8597263305499867.[0m


Stop training because you reached max_epochs = 77 with best_epoch = 57 and best_val_0_auc = 0.89854





Early stopping occurred at epoch 27 with best_epoch = 2 and best_val_0_auc = 0.71522




Stop training because you reached max_epochs = 77 with best_epoch = 57 and best_val_0_auc = 0.85509


[32m[I 2023-04-28 22:17:06,687][0m Trial 13 finished with value: 0.8229502008903568 and parameters: {'mask_type': 'entmax', 'n_da': 56, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 7.790162756543873e-05, 'patienceScheduler': 5, 'patience': 25, 'epochs': 77}. Best is trial 10 with value: 0.8597263305499867.[0m



Early stopping occurred at epoch 77 with best_epoch = 56 and best_val_0_auc = 0.88123




Stop training because you reached max_epochs = 86 with best_epoch = 85 and best_val_0_auc = 0.82657




Stop training because you reached max_epochs = 86 with best_epoch = 71 and best_val_0_auc = 0.83771


[32m[I 2023-04-28 22:19:13,168][0m Trial 14 finished with value: 0.8485039268121906 and parameters: {'mask_type': 'entmax', 'n_da': 56, 'n_steps': 1, 'gamma': 1.4, 'n_shared': 2, 'lambda_sparse': 6.0984648548947156e-05, 'patienceScheduler': 4, 'patience': 21, 'epochs': 86}. Best is trial 10 with value: 0.8597263305499867.[0m


Stop training because you reached max_epochs = 61 with best_epoch = 57 and best_val_0_auc = 0.87508




Stop training because you reached max_epochs = 61 with best_epoch = 59 and best_val_0_auc = 0.83657




Stop training because you reached max_epochs = 61 with best_epoch = 40 and best_val_0_auc = 0.84977


[32m[I 2023-04-28 22:23:17,247][0m Trial 15 finished with value: 0.8538077674620679 and parameters: {'mask_type': 'entmax', 'n_da': 60, 'n_steps': 3, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 0.0008717257299357214, 'patienceScheduler': 5, 'patience': 30, 'epochs': 61}. Best is trial 10 with value: 0.8597263305499867.[0m



Early stopping occurred at epoch 53 with best_epoch = 27 and best_val_0_auc = 0.87528





Early stopping occurred at epoch 43 with best_epoch = 17 and best_val_0_auc = 0.79157




Stop training because you reached max_epochs = 58 with best_epoch = 33 and best_val_0_auc = 0.85427


[32m[I 2023-04-28 22:25:50,461][0m Trial 16 finished with value: 0.8403739340050769 and parameters: {'mask_type': 'entmax', 'n_da': 56, 'n_steps': 3, 'gamma': 1.2, 'n_shared': 2, 'lambda_sparse': 0.0008700460338616535, 'patienceScheduler': 4, 'patience': 26, 'epochs': 58}. Best is trial 10 with value: 0.8597263305499867.[0m


Stop training because you reached max_epochs = 58 with best_epoch = 55 and best_val_0_auc = 0.86309




Stop training because you reached max_epochs = 58 with best_epoch = 57 and best_val_0_auc = 0.80657




Stop training because you reached max_epochs = 58 with best_epoch = 32 and best_val_0_auc = 0.85776


[32m[I 2023-04-28 22:29:38,118][0m Trial 17 finished with value: 0.8424732532092859 and parameters: {'mask_type': 'entmax', 'n_da': 60, 'n_steps': 3, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 0.000699329444227171, 'patienceScheduler': 3, 'patience': 30, 'epochs': 58}. Best is trial 10 with value: 0.8597263305499867.[0m



Early stopping occurred at epoch 36 with best_epoch = 21 and best_val_0_auc = 0.86316





Early stopping occurred at epoch 40 with best_epoch = 25 and best_val_0_auc = 0.85477





Early stopping occurred at epoch 26 with best_epoch = 11 and best_val_0_auc = 0.85043


[32m[I 2023-04-28 22:31:32,682][0m Trial 18 finished with value: 0.8561214738048589 and parameters: {'mask_type': 'entmax', 'n_da': 60, 'n_steps': 3, 'gamma': 1.4, 'n_shared': 2, 'lambda_sparse': 0.0001992484862238816, 'patienceScheduler': 10, 'patience': 15, 'epochs': 86}. Best is trial 10 with value: 0.8597263305499867.[0m


Stop training because you reached max_epochs = 88 with best_epoch = 86 and best_val_0_auc = 0.86788





Early stopping occurred at epoch 39 with best_epoch = 24 and best_val_0_auc = 0.75725





Early stopping occurred at epoch 19 with best_epoch = 4 and best_val_0_auc = 0.82934


[32m[I 2023-04-28 22:33:22,683][0m Trial 19 finished with value: 0.8181553505376636 and parameters: {'mask_type': 'entmax', 'n_da': 56, 'n_steps': 2, 'gamma': 1.2, 'n_shared': 2, 'lambda_sparse': 0.00019042501568447304, 'patienceScheduler': 10, 'patience': 15, 'epochs': 88}. Best is trial 10 with value: 0.8597263305499867.[0m


In [11]:
TabNet_params = study.best_params

In [12]:
print(TabNet_params)

{'mask_type': 'entmax', 'n_da': 56, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 0.0001021180196306919, 'patienceScheduler': 4, 'patience': 23, 'epochs': 100}


In [13]:
final_params = dict(n_d=TabNet_params['n_da'], n_a=TabNet_params['n_da'], n_steps=TabNet_params['n_steps'], gamma=TabNet_params['gamma'],
                    lambda_sparse=TabNet_params['lambda_sparse'], optimizer_fn=torch.optim.Adam,
                    optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                    mask_type=TabNet_params['mask_type'], n_shared=TabNet_params['n_shared'],
                    scheduler_params=dict(mode="min",
                                        patience=TabNet_params['patienceScheduler'],
                                        min_lr=1e-5,
                                        factor=0.5,),
                    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                    verbose=0,
                    )
epochs = TabNet_params['epochs']

In [14]:
clf = TabNetClassifier(**final_params)
clf.fit(X_train=X, y_train=y,
          patience=TabNet_params['patience'], max_epochs=epochs,
          eval_metric=['auc'])



In [15]:
# train
X_train_re = X_train.values
dt1_pred_train = clf.predict(X_train_re)

# test
X_test_re = X_test.values
dt1_pred = clf.predict(X_test_re)

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

rdict={'model':[], "acc_train":[], "auc_train":[], 'acc_test':[],'precision':[],'recall':[],'f1_score':[], 'AUC_test':[]}

results_train  = (round(accuracy_score(y_train,dt1_pred_train),2),round(precision_score(y_train,dt1_pred_train),2), round(recall_score(y_train,dt1_pred_train),2), round(roc_auc_score(y_train,dt1_pred_train),2))

results = (round(accuracy_score(y_test,dt1_pred),2),
                round(precision_score(y_test,dt1_pred),2),
                round(recall_score(y_test,dt1_pred),2),
                round(f1_score(y_test,dt1_pred),2),
                round(roc_auc_score(y_test,dt1_pred),2))

rdict['model'].append(clf)
rdict['acc_train'].append(results_train[0])
rdict['auc_train'].append(results_train[1])
rdict['acc_test'].append(results[0])
rdict['precision'].append(results[1])
rdict['recall'].append(results[2])
rdict['f1_score'].append(results[3])
rdict['AUC_test'].append(results[4])

print(results_train)
rdf_tabnet = pd.DataFrame(data=rdict)
rdf_tabnet

(0.97, 0.84, 0.38, 0.69)


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,"TabNetClassifier(n_d=56, n_a=56, n_steps=2, ga...",0.97,0.84,0.96,0.33,0.14,0.2,0.56
