In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('./Tabnet_Raw_final.csv')

In [3]:
df.replace((np.inf, -np.inf), np.nan, inplace=True)
df.dropna(inplace=True)

In [4]:
# With 'train_test split by years' data

train = df[df['회계년도'] <= 2015]
test = df[df['회계년도'] > 2015]

X_train = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values
X_test = test.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values

y_train = train['부실'].values
y_test = test['부실'].values

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1)

clf = TabNetClassifier()
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)




epoch 0  | loss: 0.4327  | val_0_auc: 0.58151 |  0:00:30s
epoch 1  | loss: 0.25713 | val_0_auc: 0.54818 |  0:00:31s
epoch 2  | loss: 0.24835 | val_0_auc: 0.54512 |  0:00:33s
epoch 3  | loss: 0.19785 | val_0_auc: 0.55456 |  0:00:34s
epoch 4  | loss: 0.18815 | val_0_auc: 0.62922 |  0:00:35s
epoch 5  | loss: 0.17074 | val_0_auc: 0.70187 |  0:00:37s
epoch 6  | loss: 0.15904 | val_0_auc: 0.72469 |  0:00:38s
epoch 7  | loss: 0.15643 | val_0_auc: 0.74569 |  0:00:40s
epoch 8  | loss: 0.16361 | val_0_auc: 0.73602 |  0:00:41s
epoch 9  | loss: 0.15598 | val_0_auc: 0.79541 |  0:00:43s
epoch 10 | loss: 0.14196 | val_0_auc: 0.77802 |  0:00:44s
epoch 11 | loss: 0.14099 | val_0_auc: 0.78336 |  0:00:45s
epoch 12 | loss: 0.14081 | val_0_auc: 0.79199 |  0:00:47s
epoch 13 | loss: 0.13793 | val_0_auc: 0.81193 |  0:00:48s
epoch 14 | loss: 0.1429  | val_0_auc: 0.78983 |  0:00:49s
epoch 15 | loss: 0.13843 | val_0_auc: 0.7886  |  0:00:50s
epoch 16 | loss: 0.13454 | val_0_auc: 0.79515 |  0:00:50s
epoch 17 | los



Confusion matrix:
 [[4790    6]
 [ 180    6]]
Accuracy: 0.962665596146126
Precision: 0.5
Recall: 0.03225806451612903
F1-score: 0.06060606060606061
{'자기자본구성비율': 0.0008080892251332609, '설비투자효율': 0.0009330250660546965, '총자본투자효율': 2.5077186086357095e-05, '이자보상배율(이자비용)': 0.022491964482881002, '유동비율': 0.04098272510919903, '당좌비율': 0.007880806493676258, '부채비율': 0.026749017217086345, '총자본정상영업이익률': 8.531247133723453e-05, '매출액정상영업이익률': 0.016002351825675856, '매출액순이익률': 0.03216719070502078, '자기자본순이익률': 0.06195354965929166, '매출채권회전률': 0.02636616853374424, '재고자산회전률': 0.04973132360652812, '총자본회전률': 0.007630245779718587, '순운전자본비율': 0.16475490304719126, '매출액증가율': 0.07488827141956922, '총자본증가율': 0.033282353164258226, '유동자산증가율': 0.0002729360645514547, '유형자산증가율': 0.007710189925954246, '영업이익증가율': 0.026741313489286128, '순이익증가율': 0.017781308132499268, 'RETA': 0.061667804783938754, 'EBTA': 0.00017181801990063711, 'OM': 0.0009783749939243588, '종업원수증가율': 0.00018323762580364767, '영업이익변화율': 4.761290636460907e-05, '

In [5]:
# No hyperparameter tuning with raw data splitted by train_test_split method

X = df.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1)
y = df['부실']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

#scaler = StandardScaler()
#scaler.fit(X_train)
#scaler.fit(X_test)
#X_train = scaler.transform(X_train) 
#X_test = scaler.transform(X_test)

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
X_train, y_train, test_size=0.2, random_state=1)

clf = TabNetClassifier()
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.40034 | val_0_auc: 0.46156 |  0:00:00s
epoch 1  | loss: 0.21533 | val_0_auc: 0.50309 |  0:00:01s
epoch 2  | loss: 0.19749 | val_0_auc: 0.62057 |  0:00:02s
epoch 3  | loss: 0.17417 | val_0_auc: 0.55469 |  0:00:02s
epoch 4  | loss: 0.15966 | val_0_auc: 0.71219 |  0:00:03s
epoch 5  | loss: 0.15208 | val_0_auc: 0.763   |  0:00:04s
epoch 6  | loss: 0.15089 | val_0_auc: 0.76233 |  0:00:04s
epoch 7  | loss: 0.14718 | val_0_auc: 0.79313 |  0:00:05s
epoch 8  | loss: 0.14119 | val_0_auc: 0.77597 |  0:00:05s
epoch 9  | loss: 0.13731 | val_0_auc: 0.78541 |  0:00:06s
epoch 10 | loss: 0.13361 | val_0_auc: 0.78602 |  0:00:07s
epoch 11 | loss: 0.12759 | val_0_auc: 0.81955 |  0:00:07s
epoch 12 | loss: 0.14331 | val_0_auc: 0.83164 |  0:00:08s
epoch 13 | loss: 0.13703 | val_0_auc: 0.8333  |  0:00:09s
epoch 14 | loss: 0.13534 | val_0_auc: 0.8299  |  0:00:09s
epoch 15 | loss: 0.12712 | val_0_auc: 0.8491  |  0:00:10s
epoch 16 | loss: 0.13394 | val_0_auc: 0.8585  |  0:00:11s
epoch 17 | los



Confusion matrix:
 [[1947    2]
 [  76    3]]
Accuracy: 0.9615384615384616
Precision: 0.6
Recall: 0.0379746835443038
F1-score: 0.07142857142857144
{'자기자본구성비율': 0.002841075290054547, '설비투자효율': 0.0008899565943526075, '총자본투자효율': 7.012592897536443e-05, '이자보상배율(이자비용)': 0.00032876851497568753, '유동비율': 0.00045656213794516347, '당좌비율': 0.0005359530255079969, '부채비율': 0.012427779512210597, '총자본정상영업이익률': 0.00303521941239578, '매출액정상영업이익률': 0.0008637038078991044, '매출액순이익률': 0.027494607368683487, '자기자본순이익률': 0.28884986771706755, '매출채권회전률': 0.18611284620334587, '재고자산회전률': 0.0009445219893499821, '총자본회전률': 0.005078601912416747, '순운전자본비율': 0.0006180638790743157, '매출액증가율': 0.0, '총자본증가율': 0.10704979422341988, '유동자산증가율': 6.67223745636649e-08, '유형자산증가율': 0.00041207122036000197, '영업이익증가율': 0.0005887820453415095, '순이익증가율': 5.082344811099751e-05, 'RETA': 0.008861586106631725, 'EBTA': 0.15839268488559274, 'OM': 0.0008579370415106809, '종업원수증가율': 0.012374921760111603, '영업이익변화율': 0.0014409445636766225, '매출액변화율': 0.

In [18]:
# Data Resampling case no.1 (with weight to minority)

import torch
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight

train = df[df['회계년도'] <= 2017]
test = df[df['회계년도'] > 2017]

X_train = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values
X_test = test.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values

y_train = train['부실'].values
y_test = test['부실'].values

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1)

# Resample using SMOTE
smote = SMOTE(sampling_strategy='minority' ,k_neighbors=5)
#smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = smote.fit_resample(X_test, y_test)

# Resample using Random Under Sampler
#rus = RandomUnderSampler()
#X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = rus.fit_resample(X_test, y_test)

X_train = X_resampled
X_test = X_test
y_train = y_resampled
y_test = y_test

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
X_train, y_train, test_size=0.2, random_state=1)

# Compute class weights based on the training data
class_weights = compute_class_weight('balanced', [0, 1], y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Increase the weight of the minority class
class_weights[1] *= 3.5

# Define custom loss function with class weights
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Train the TabNet classifier with the custom loss function
clf = TabNetClassifier()
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], loss_fn=criterion)
preds = clf.predict(X_test)

# Find the optimal threshold for F1 score
#y_pred_proba = clf.predict_proba(X_test)
#thresholds = np.linspace(0, 1, 100)
#f1_scores = [f1_score(y_test, y_pred_proba[:, 1] > t) for t in thresholds]
#optimal_threshold = thresholds[np.argmax(f1_scores)]

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.64017 | val_0_auc: 0.71455 |  0:00:01s
epoch 1  | loss: 0.38516 | val_0_auc: 0.80538 |  0:00:02s
epoch 2  | loss: 0.32628 | val_0_auc: 0.84263 |  0:00:03s
epoch 3  | loss: 0.29912 | val_0_auc: 0.85046 |  0:00:04s
epoch 4  | loss: 0.29477 | val_0_auc: 0.87588 |  0:00:05s
epoch 5  | loss: 0.28715 | val_0_auc: 0.8972  |  0:00:06s
epoch 6  | loss: 0.28614 | val_0_auc: 0.90747 |  0:00:07s
epoch 7  | loss: 0.28025 | val_0_auc: 0.90969 |  0:00:08s
epoch 8  | loss: 0.27768 | val_0_auc: 0.90804 |  0:00:09s
epoch 9  | loss: 0.26938 | val_0_auc: 0.90734 |  0:00:09s
epoch 10 | loss: 0.26534 | val_0_auc: 0.91248 |  0:00:10s
epoch 11 | loss: 0.26252 | val_0_auc: 0.92167 |  0:00:11s
epoch 12 | loss: 0.25678 | val_0_auc: 0.92085 |  0:00:12s
epoch 13 | loss: 0.25119 | val_0_auc: 0.92318 |  0:00:13s
epoch 14 | loss: 0.24817 | val_0_auc: 0.92896 |  0:00:14s
epoch 15 | loss: 0.23957 | val_0_auc: 0.93201 |  0:00:15s
epoch 16 | loss: 0.23755 | val_0_auc: 0.93208 |  0:00:16s
epoch 17 | los



Confusion matrix:
 [[1837  668]
 [  20   67]]
Accuracy: 0.7345679012345679
Precision: 0.09115646258503401
Recall: 0.7701149425287356
F1-score: 0.16301703163017034
{'자기자본구성비율': 0.0018806617480432526, '설비투자효율': 0.0, '총자본투자효율': 0.0, '이자보상배율(이자비용)': 0.0, '유동비율': 6.130755186486271e-06, '당좌비율': 1.2222070310678652e-05, '부채비율': 0.09995807209970252, '총자본정상영업이익률': 2.0646298154725486e-05, '매출액정상영업이익률': 0.0770805197880308, '매출액순이익률': 0.027735767533401793, '자기자본순이익률': 0.2603696032449207, '매출채권회전률': 0.011026483266035189, '재고자산회전률': 0.0, '총자본회전률': 0.001362664690671877, '순운전자본비율': 0.022060860212485003, '매출액증가율': 0.0011563895867826785, '총자본증가율': 0.002105277362745831, '유동자산증가율': 0.0, '유형자산증가율': 0.010082481066257995, '영업이익증가율': 0.0, '순이익증가율': 0.0, 'RETA': 0.06113630268683869, 'EBTA': 0.09446786620546291, 'OM': 0.021231514713407645, '종업원수증가율': 0.02730454231891689, '영업이익변화율': 0.0002694557279012431, '매출액변화율': 0.007696230363738419, '당기순이익변화율': 0.01669296672033391, 'DOL': 0.02131264686452287, 'DFL': 0.0007832

In [57]:
# Extra for case no.1

X = df.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1)
y = df['부실']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

# Resample using SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_resampled_test, y_resampled_test = smote.fit_resample(X_test, y_test)

#sm = BorderlineSMOTE(random_state=42, sampling_strategy='minority')
#X_res, y_res = sm.fit_resample(X_train, y_train)
#X_res_test, y_res_test = sm.fit_resample(X_test, y_test)

# Resample using Random Under Sampler
#rus = RandomUnderSampler()
#X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = rus.fit_resample(X_test, y_test)

X_train = X_resampled.values
X_test = X_resampled_test.values
y_train = y_resampled.values
y_test = y_resampled_test.values


feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
X_train, y_train, test_size=0.2, random_state=1)

clf = TabNetClassifier()
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.6668  | val_0_auc: 0.66477 |  0:00:01s
epoch 1  | loss: 0.48715 | val_0_auc: 0.68082 |  0:00:02s
epoch 2  | loss: 0.44063 | val_0_auc: 0.79115 |  0:00:03s
epoch 3  | loss: 0.42207 | val_0_auc: 0.85135 |  0:00:05s
epoch 4  | loss: 0.41402 | val_0_auc: 0.87017 |  0:00:06s
epoch 5  | loss: 0.40044 | val_0_auc: 0.87128 |  0:00:07s
epoch 6  | loss: 0.39695 | val_0_auc: 0.8889  |  0:00:09s
epoch 7  | loss: 0.39604 | val_0_auc: 0.89705 |  0:00:10s
epoch 8  | loss: 0.38764 | val_0_auc: 0.89493 |  0:00:11s
epoch 9  | loss: 0.38411 | val_0_auc: 0.89852 |  0:00:13s
epoch 10 | loss: 0.37878 | val_0_auc: 0.90353 |  0:00:14s
epoch 11 | loss: 0.37435 | val_0_auc: 0.90778 |  0:00:15s
epoch 12 | loss: 0.364   | val_0_auc: 0.90858 |  0:00:16s
epoch 13 | loss: 0.35821 | val_0_auc: 0.90838 |  0:00:18s
epoch 14 | loss: 0.35383 | val_0_auc: 0.91143 |  0:00:19s
epoch 15 | loss: 0.34963 | val_0_auc: 0.91753 |  0:00:20s
epoch 16 | loss: 0.35145 | val_0_auc: 0.92037 |  0:00:22s
epoch 17 | los



Confusion matrix:
 [[1597  352]
 [ 177 1772]]
Accuracy: 0.8642893791688046
Precision: 0.8342749529190208
Recall: 0.9091841970241149
F1-score: 0.8701203044438988
{'자기자본구성비율': 0.02554252494362828, '설비투자효율': 0.009969267920895052, '총자본투자효율': 0.00015142872846318943, '이자보상배율(이자비용)': 0.0, '유동비율': 0.0, '당좌비율': 0.00011943201185828064, '부채비율': 0.19775674414336428, '총자본정상영업이익률': 0.0869541793689616, '매출액정상영업이익률': 0.0006333398297035835, '매출액순이익률': 0.05180883077035805, '자기자본순이익률': 0.10567379043143872, '매출채권회전률': 0.0, '재고자산회전률': 0.0, '총자본회전률': 0.013643499277014404, '순운전자본비율': 0.0, '매출액증가율': 0.0, '총자본증가율': 2.0025579137308933e-05, '유동자산증가율': 0.0, '유형자산증가율': 6.35931065617222e-06, '영업이익증가율': 0.007944402370875249, '순이익증가율': 2.4567867731420298e-05, 'RETA': 0.11927032295205475, 'EBTA': 0.043717410155811605, 'OM': 1.4770960831933228e-05, '종업원수증가율': 0.01649126832809473, '영업이익변화율': 0.0, '매출액변화율': 0.00016379079438307335, '당기순이익변화율': 0.021083538716762527, 'DOL': 0.005599023726448556, 'DFL': 7.216179656674863e-07

In [14]:
# Extra for case no.2

train = df[df['회계년도'] <= 2017]
test = df[df['회계년도'] > 2017]

X_train = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values
X_test = test.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values

y_train = train['부실'].values
y_test = test['부실'].values

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

# Resample using SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_resampled_test, y_resampled_test = smote.fit_resample(X_test, y_test)

#sm = BorderlineSMOTE(random_state=42, sampling_strategy='minority')
#X_res, y_res = sm.fit_resample(X_train, y_train)
#X_res_test, y_res_test = sm.fit_resample(X_test, y_test)

# Resample using Random Under Sampler
#rus = RandomUnderSampler()
#X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = rus.fit_resample(X_test, y_test)

X_train = X_resampled
X_test = X_resampled_test
y_train = y_resampled
y_test = y_resampled_test
#X_test = X_test
#y_test = y_test

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
X_train, y_train, test_size=0.2, random_state=1)

clf = TabNetClassifier()
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.70121 | val_0_auc: 0.65153 |  0:00:01s
epoch 1  | loss: 0.52434 | val_0_auc: 0.717   |  0:00:02s
epoch 2  | loss: 0.45251 | val_0_auc: 0.83716 |  0:00:03s
epoch 3  | loss: 0.4174  | val_0_auc: 0.84204 |  0:00:04s
epoch 4  | loss: 0.40776 | val_0_auc: 0.88255 |  0:00:05s
epoch 5  | loss: 0.39427 | val_0_auc: 0.89413 |  0:00:06s
epoch 6  | loss: 0.38297 | val_0_auc: 0.89289 |  0:00:08s
epoch 7  | loss: 0.37587 | val_0_auc: 0.89555 |  0:00:09s
epoch 8  | loss: 0.37906 | val_0_auc: 0.90345 |  0:00:10s
epoch 9  | loss: 0.3706  | val_0_auc: 0.90286 |  0:00:11s
epoch 10 | loss: 0.37091 | val_0_auc: 0.90259 |  0:00:12s
epoch 11 | loss: 0.36953 | val_0_auc: 0.90349 |  0:00:13s
epoch 12 | loss: 0.36686 | val_0_auc: 0.90729 |  0:00:14s
epoch 13 | loss: 0.36236 | val_0_auc: 0.90763 |  0:00:16s
epoch 14 | loss: 0.36125 | val_0_auc: 0.91215 |  0:00:17s
epoch 15 | loss: 0.3582  | val_0_auc: 0.91393 |  0:00:18s
epoch 16 | loss: 0.35434 | val_0_auc: 0.91306 |  0:00:19s
epoch 17 | los



Confusion matrix:
 [[1784  721]
 [ 288 2217]]
Accuracy: 0.7986027944111777
Precision: 0.7545949625595644
Recall: 0.8850299401197604
F1-score: 0.8146242880764284
{'자기자본구성비율': 0.015891374992008045, '설비투자효율': 0.0045242446632410335, '총자본투자효율': 0.0, '이자보상배율(이자비용)': 0.05185259401072577, '유동비율': 0.022996984382098042, '당좌비율': 0.016972575501744545, '부채비율': 0.034091570015875976, '총자본정상영업이익률': 0.0, '매출액정상영업이익률': 0.08881435575290814, '매출액순이익률': 0.006690544722667798, '자기자본순이익률': 0.19039009294946668, '매출채권회전률': 7.74683043573838e-06, '재고자산회전률': 0.0, '총자본회전률': 0.00033547288851654735, '순운전자본비율': 0.0006930729652971988, '매출액증가율': 0.012524567760493682, '총자본증가율': 0.0002542533595306181, '유동자산증가율': 0.0, '유형자산증가율': 0.0, '영업이익증가율': 0.0014866190326346318, '순이익증가율': 7.64040343186483e-06, 'RETA': 0.048075953791446295, 'EBTA': 0.16127269610709316, 'OM': 0.07045175382385123, '종업원수증가율': 0.0042413277011958996, '영업이익변화율': 0.021801976057722737, '매출액변화율': 6.611939482877365e-05, '당기순이익변화율': 0.0, 'DOL': 0.0, 'DFL': 0.0009

In [78]:
# Data Resampling case no.2 with SMOTE

X = df.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1)
y = df['부실']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

# Resample using SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Resample using Random Under Sampler
#rus = RandomUnderSampler()
#X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = rus.fit_resample(X_test, y_test)

X_train = X_resampled.values
X_test = X_test.values
y_train = y_resampled.values
y_test = y_test.values

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
X_train, y_train, test_size=0.2, random_state=1)

clf = TabNetClassifier()
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.69775 | val_0_auc: 0.73531 |  0:00:01s
epoch 1  | loss: 0.4982  | val_0_auc: 0.87557 |  0:00:02s
epoch 2  | loss: 0.45362 | val_0_auc: 0.85568 |  0:00:03s
epoch 3  | loss: 0.43319 | val_0_auc: 0.8973  |  0:00:05s
epoch 4  | loss: 0.4229  | val_0_auc: 0.8873  |  0:00:06s
epoch 5  | loss: 0.40234 | val_0_auc: 0.89579 |  0:00:07s
epoch 6  | loss: 0.3912  | val_0_auc: 0.90327 |  0:00:08s
epoch 7  | loss: 0.38062 | val_0_auc: 0.91259 |  0:00:10s
epoch 8  | loss: 0.37995 | val_0_auc: 0.91294 |  0:00:11s
epoch 9  | loss: 0.37335 | val_0_auc: 0.91595 |  0:00:12s
epoch 10 | loss: 0.36993 | val_0_auc: 0.9155  |  0:00:14s
epoch 11 | loss: 0.35829 | val_0_auc: 0.91654 |  0:00:15s
epoch 12 | loss: 0.35746 | val_0_auc: 0.92822 |  0:00:16s
epoch 13 | loss: 0.35346 | val_0_auc: 0.92652 |  0:00:17s
epoch 14 | loss: 0.34541 | val_0_auc: 0.92756 |  0:00:19s
epoch 15 | loss: 0.3381  | val_0_auc: 0.92104 |  0:00:20s
epoch 16 | loss: 0.33688 | val_0_auc: 0.92669 |  0:00:21s
epoch 17 | los



Confusion matrix:
 [[1594  355]
 [  23   56]]
Accuracy: 0.8136094674556213
Precision: 0.1362530413625304
Recall: 0.7088607594936709
F1-score: 0.22857142857142854
{'자기자본구성비율': 0.003712294488269945, '설비투자효율': 0.004307703696655136, '총자본투자효율': 0.02433753121364975, '이자보상배율(이자비용)': 0.24027886129880713, '유동비율': 1.7522549303713885e-06, '당좌비율': 0.007554617780065908, '부채비율': 0.0, '총자본정상영업이익률': 0.10925399993491428, '매출액정상영업이익률': 0.0, '매출액순이익률': 0.059757011702196855, '자기자본순이익률': 0.1622812966154025, '매출채권회전률': 0.005381674206391948, '재고자산회전률': 0.0, '총자본회전률': 0.0038837028140515776, '순운전자본비율': 0.010020141063410737, '매출액증가율': 0.0, '총자본증가율': 0.00025299646475380035, '유동자산증가율': 0.0038530444349346355, '유형자산증가율': 0.002649229795278509, '영업이익증가율': 1.369699849541446e-06, '순이익증가율': 0.0, 'RETA': 0.03555725331481567, 'EBTA': 0.048213721297163975, 'OM': 0.05019281904918675, '종업원수증가율': 0.0, '영업이익변화율': 4.334990362899189e-06, '매출액변화율': 0.0, '당기순이익변화율': 0.035300705057068824, 'DOL': 0.0, 'DFL': 0.0010438932081858953, '

In [51]:
# Data Resampling case no.3 with BorderlineSMOTE

from imblearn.over_sampling import BorderlineSMOTE

X = df.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1)
y = df['부실']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

sm = BorderlineSMOTE(random_state=42, sampling_strategy='minority')
X_res, y_res = sm.fit_resample(X_train, y_train)

# Resample using Random Under Sampler
#rus = RandomUnderSampler()
#X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = rus.fit_resample(X_test, y_test)

X_train = X_res.values
X_test = X_test.values
#y_train = y_resampled.values
#y_test = y_resampled_test.values
y_train = y_res.values
y_test = y_test.values

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
X_train, y_train, test_size=0.2, random_state=1)

clf = TabNetClassifier(mask_type="sparsemax")
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
preds = clf.predict(X_test)

# Find the optimal threshold for F1 score
y_pred_proba = clf.predict_proba(X_test)
thresholds = np.linspace(0, 1, 100)
f1_scores = [f1_score(y_test, y_pred_proba[:, 1] > t) for t in thresholds]
optimal_threshold = thresholds[np.argmax(f1_scores)]

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.65257 | val_0_auc: 0.75657 |  0:00:01s
epoch 1  | loss: 0.47745 | val_0_auc: 0.79371 |  0:00:02s
epoch 2  | loss: 0.4158  | val_0_auc: 0.81363 |  0:00:03s
epoch 3  | loss: 0.39094 | val_0_auc: 0.85404 |  0:00:05s
epoch 4  | loss: 0.37627 | val_0_auc: 0.8702  |  0:00:06s
epoch 5  | loss: 0.3685  | val_0_auc: 0.87425 |  0:00:07s
epoch 6  | loss: 0.35904 | val_0_auc: 0.88859 |  0:00:08s
epoch 7  | loss: 0.36157 | val_0_auc: 0.89371 |  0:00:09s
epoch 8  | loss: 0.3427  | val_0_auc: 0.89865 |  0:00:11s
epoch 9  | loss: 0.33641 | val_0_auc: 0.89185 |  0:00:12s
epoch 10 | loss: 0.32129 | val_0_auc: 0.89527 |  0:00:13s
epoch 11 | loss: 0.31155 | val_0_auc: 0.91829 |  0:00:14s
epoch 12 | loss: 0.31312 | val_0_auc: 0.91811 |  0:00:16s
epoch 13 | loss: 0.30565 | val_0_auc: 0.91197 |  0:00:17s
epoch 14 | loss: 0.3024  | val_0_auc: 0.91245 |  0:00:18s
epoch 15 | loss: 0.29261 | val_0_auc: 0.9159  |  0:00:19s
epoch 16 | loss: 0.30514 | val_0_auc: 0.92449 |  0:00:20s
epoch 17 | los



Confusion matrix:
 [[1683  266]
 [  31   48]]
Accuracy: 0.8535502958579881
Precision: 0.15286624203821655
Recall: 0.6075949367088608
F1-score: 0.24427480916030533
{'자기자본구성비율': 0.0, '설비투자효율': 4.543846886102898e-06, '총자본투자효율': 0.0, '이자보상배율(이자비용)': 0.1517336717152607, '유동비율': 9.634882630671797e-06, '당좌비율': 0.0007334255116842103, '부채비율': 0.045946003184816085, '총자본정상영업이익률': 0.04248568291237684, '매출액정상영업이익률': 0.026456344123703412, '매출액순이익률': 0.008324985911078362, '자기자본순이익률': 0.09088179799059345, '매출채권회전률': 0.0019503855806486071, '재고자산회전률': 0.0009167766462150305, '총자본회전률': 6.036754802568913e-05, '순운전자본비율': 0.005193571658961957, '매출액증가율': 0.0, '총자본증가율': 0.006063050718078008, '유동자산증가율': 0.0, '유형자산증가율': 0.020694832828190967, '영업이익증가율': 0.01696937216859189, '순이익증가율': 4.915787893483473e-05, 'RETA': 0.0, 'EBTA': 0.06026741247603145, 'OM': 0.11899166144538936, '종업원수증가율': 1.3790367978274393e-05, '영업이익변화율': 0.0, '매출액변화율': 0.0, '당기순이익변화율': 4.467763971619613e-06, 'DOL': 0.004660698767887182, 'DFL': 0.00

In [9]:
# Threshold optimization

X = df.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1)
y = df['부실']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
X_train, y_train, test_size=0.2, random_state=1)

clf = TabNetClassifier()
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

# Find the optimal threshold for F1 score
y_pred_proba = clf.predict_proba(X_test)
thresholds = np.linspace(0, 1, 100)
f1_scores = [f1_score(y_test, y_pred_proba[:, 1] > t) for t in thresholds]
optimal_threshold = thresholds[np.argmax(f1_scores)]

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)




epoch 0  | loss: 0.40034 | val_0_auc: 0.46156 |  0:00:00s
epoch 1  | loss: 0.21533 | val_0_auc: 0.50309 |  0:00:01s
epoch 2  | loss: 0.19749 | val_0_auc: 0.62057 |  0:00:02s
epoch 3  | loss: 0.17417 | val_0_auc: 0.55469 |  0:00:02s
epoch 4  | loss: 0.15966 | val_0_auc: 0.71219 |  0:00:03s
epoch 5  | loss: 0.15208 | val_0_auc: 0.763   |  0:00:03s
epoch 6  | loss: 0.15089 | val_0_auc: 0.76233 |  0:00:04s
epoch 7  | loss: 0.14718 | val_0_auc: 0.79313 |  0:00:05s
epoch 8  | loss: 0.14119 | val_0_auc: 0.77597 |  0:00:05s
epoch 9  | loss: 0.13731 | val_0_auc: 0.78541 |  0:00:06s
epoch 10 | loss: 0.13361 | val_0_auc: 0.78602 |  0:00:07s
epoch 11 | loss: 0.12759 | val_0_auc: 0.81955 |  0:00:07s
epoch 12 | loss: 0.14331 | val_0_auc: 0.83164 |  0:00:08s
epoch 13 | loss: 0.13703 | val_0_auc: 0.8333  |  0:00:08s
epoch 14 | loss: 0.13534 | val_0_auc: 0.8299  |  0:00:09s
epoch 15 | loss: 0.12712 | val_0_auc: 0.8491  |  0:00:10s
epoch 16 | loss: 0.13394 | val_0_auc: 0.8585  |  0:00:10s
epoch 17 | los



Confusion matrix:
 [[1720  229]
 [  28   51]]
Accuracy: 0.8732741617357002
Precision: 0.18214285714285713
Recall: 0.6455696202531646
F1-score: 0.2841225626740947
{'자기자본구성비율': 0.002841075290054547, '설비투자효율': 0.0008899565943526075, '총자본투자효율': 7.012592897536443e-05, '이자보상배율(이자비용)': 0.00032876851497568753, '유동비율': 0.00045656213794516347, '당좌비율': 0.0005359530255079969, '부채비율': 0.012427779512210597, '총자본정상영업이익률': 0.00303521941239578, '매출액정상영업이익률': 0.0008637038078991044, '매출액순이익률': 0.027494607368683487, '자기자본순이익률': 0.28884986771706755, '매출채권회전률': 0.18611284620334587, '재고자산회전률': 0.0009445219893499821, '총자본회전률': 0.005078601912416747, '순운전자본비율': 0.0006180638790743157, '매출액증가율': 0.0, '총자본증가율': 0.10704979422341988, '유동자산증가율': 6.67223745636649e-08, '유형자산증가율': 0.00041207122036000197, '영업이익증가율': 0.0005887820453415095, '순이익증가율': 5.082344811099751e-05, 'RETA': 0.008861586106631725, 'EBTA': 0.15839268488559274, 'OM': 0.0008579370415106809, '종업원수증가율': 0.012374921760111603, '영업이익변화율': 0.001440944563676622

In [10]:
# Feature selection

from sklearn.feature_selection import SelectKBest, mutual_info_classif


X = df.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1)
y = df['부실']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
X_train, y_train, test_size=0.2, random_state=1)

# Select top 10 features based on mutual information
selector = SelectKBest(mutual_info_classif, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Get the selected feature names
#selected_feature_indices = selector.get_support(indices=True)
#feature_names = np.array(feature_names)
#selected_feature_names = feature_names[selected_feature_indices]

#print("Selected Features:", selected_feature_names)

clf = TabNetClassifier()
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.40034 | val_0_auc: 0.46156 |  0:00:00s
epoch 1  | loss: 0.21533 | val_0_auc: 0.50309 |  0:00:01s
epoch 2  | loss: 0.19749 | val_0_auc: 0.62057 |  0:00:01s
epoch 3  | loss: 0.17417 | val_0_auc: 0.55469 |  0:00:02s
epoch 4  | loss: 0.15966 | val_0_auc: 0.71219 |  0:00:03s
epoch 5  | loss: 0.15208 | val_0_auc: 0.763   |  0:00:03s
epoch 6  | loss: 0.15089 | val_0_auc: 0.76233 |  0:00:04s
epoch 7  | loss: 0.14718 | val_0_auc: 0.79313 |  0:00:05s
epoch 8  | loss: 0.14119 | val_0_auc: 0.77597 |  0:00:05s
epoch 9  | loss: 0.13731 | val_0_auc: 0.78541 |  0:00:06s
epoch 10 | loss: 0.13361 | val_0_auc: 0.78602 |  0:00:06s
epoch 11 | loss: 0.12759 | val_0_auc: 0.81955 |  0:00:07s
epoch 12 | loss: 0.14331 | val_0_auc: 0.83164 |  0:00:08s
epoch 13 | loss: 0.13703 | val_0_auc: 0.8333  |  0:00:08s
epoch 14 | loss: 0.13534 | val_0_auc: 0.8299  |  0:00:09s
epoch 15 | loss: 0.12712 | val_0_auc: 0.8491  |  0:00:10s
epoch 16 | loss: 0.13394 | val_0_auc: 0.8585  |  0:00:10s
epoch 17 | los



Confusion matrix:
 [[1947    2]
 [  76    3]]
Accuracy: 0.9615384615384616
Precision: 0.6
Recall: 0.0379746835443038
F1-score: 0.07142857142857144
{'자기자본구성비율': 0.002841075290054547, '설비투자효율': 0.0008899565943526075, '총자본투자효율': 7.012592897536443e-05, '이자보상배율(이자비용)': 0.00032876851497568753, '유동비율': 0.00045656213794516347, '당좌비율': 0.0005359530255079969, '부채비율': 0.012427779512210597, '총자본정상영업이익률': 0.00303521941239578, '매출액정상영업이익률': 0.0008637038078991044, '매출액순이익률': 0.027494607368683487, '자기자본순이익률': 0.28884986771706755, '매출채권회전률': 0.18611284620334587, '재고자산회전률': 0.0009445219893499821, '총자본회전률': 0.005078601912416747, '순운전자본비율': 0.0006180638790743157, '매출액증가율': 0.0, '총자본증가율': 0.10704979422341988, '유동자산증가율': 6.67223745636649e-08, '유형자산증가율': 0.00041207122036000197, '영업이익증가율': 0.0005887820453415095, '순이익증가율': 5.082344811099751e-05, 'RETA': 0.008861586106631725, 'EBTA': 0.15839268488559274, 'OM': 0.0008579370415106809, '종업원수증가율': 0.012374921760111603, '영업이익변화율': 0.0014409445636766225, '매출액변화율': 0.

In [29]:
df = pd.DataFrame(y_test)
df.value_counts()

0    1949
1      79
dtype: int64

In [7]:
from sklearn.model_selection import GridSearchCV

X = df.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1)
y = df['부실']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

# Resample using SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

X_train = X_resampled.values
X_test = X_test.values
y_train = y_resampled.values
y_test = y_test.values

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
X_train, y_train, test_size=0.2, random_state=1)

param_grid = {
    'n_d': [8, 16, 32],
    'n_a': [8, 16, 32],
    'n_steps': [3, 5, 10],
    'gamma': [1.3, 1.4, 1.5],
    'n_independent': [1, 2, 4],
    'n_shared': [1, 2, 4],
}

search = GridSearchCV(TabNetClassifier(), param_grid, scoring='f1', cv=5, n_jobs=-1)
search.fit(X_train, y_train)
best_params = search.best_params_
clf = TabNetClassifier(**best_params)
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



In [None]:
X = df.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1)
y = df['부실']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

# Resample using SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

X_train = X_resampled.values
X_test = X_test.values
y_train = y_resampled.values
y_test = y_test.values

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
X_train, y_train, test_size=0.2, random_state=1)

from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest classifier on the resampled data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_resampled, y_resampled)

# Use the Random Forest as a base estimator in a TabNetClassifier ensemble
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetMultiTaskClassifier

clf = TabNetClassifier(
    n_d=32, n_a=32,
    n_steps=3, gamma=1.3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params=dict(mode="min", patience=10, min_lr=1e-5, factor=0.5, ),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type="entmax",
    device_name="auto",
    verbose=1,
    seed=42,
    cat_idxs=[],
    cat_dims=[],
)
clf.fit(
    X_resampled,
    y_resampled,
    eval_set=[(X_valid, y_valid)],
    max_epochs=100,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    weights=0.5,  # Weight assigned to the positive class in the loss
    from_unsupervised=rf,
)
preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)

In [6]:
# Hyperparameter tuning by GridsearchCV

from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# 하이퍼파라미터 튜닝
hyperparameters = {
    "n_d": [8, 16, 32],
    "n_a": [8, 16, 32],
    "n_steps": [3, 5],
    "gamma": [1.3, 1.4],
    "lambda_sparse": [0, 0.0001, 0.001]
}

# TabNet classifier 생성
clf = TabNetClassifier()

# GridSearchCV 생성
grid_search = GridSearchCV(
    clf,
    hyperparameters,
    cv=5,  # number of cross-validation folds
    n_jobs=-1,  # number of CPU cores to use (-1 means all available cores)
    scoring="recall",  # metric to optimize
    verbose=10  # level of verbosity
)

# GridSearchCV train data에 적용
grid_search.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

# 베스트 하이퍼파라미터
print("Best hyperparameters:", grid_search.best_params_)

# 베스트 하이퍼파라미터 test data에 적용
best_clf = grid_search.best_estimator_
preds = best_clf.predict(X_test)

# 평가 지표 도출
cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
epoch 0  | loss: 0.92107 | val_0_auc: 0.45271 |  0:00:00s
epoch 1  | loss: 0.38988 | val_0_auc: 0.52385 |  0:00:00s
epoch 2  | loss: 0.23659 | val_0_auc: 0.6177  |  0:00:01s
epoch 3  | loss: 0.22708 | val_0_auc: 0.58813 |  0:00:01s
epoch 4  | loss: 0.21963 | val_0_auc: 0.68821 |  0:00:02s
epoch 5  | loss: 0.18548 | val_0_auc: 0.73219 |  0:00:02s
epoch 6  | loss: 0.16471 | val_0_auc: 0.69541 |  0:00:03s
epoch 7  | loss: 0.17668 | val_0_auc: 0.69511 |  0:00:03s
epoch 8  | loss: 0.16385 | val_0_auc: 0.74799 |  0:00:03s
epoch 9  | loss: 0.16667 | val_0_auc: 0.76184 |  0:00:04s
epoch 10 | loss: 0.15705 | val_0_auc: 0.77656 |  0:00:04s
epoch 11 | loss: 0.1598  | val_0_auc: 0.7491  |  0:00:05s
epoch 12 | loss: 0.15705 | val_0_auc: 0.68843 |  0:00:05s
epoch 13 | loss: 0.15405 | val_0_auc: 0.74547 |  0:00:06s
epoch 14 | loss: 0.15123 | val_0_auc: 0.77154 |  0:00:06s
epoch 15 | loss: 0.15112 | val_0_auc: 0.85643 |  0:00:06s
epoch 16 



Best hyperparameters: {'gamma': 1.3, 'lambda_sparse': 0.0001, 'n_a': 8, 'n_d': 8, 'n_steps': 3}
Confusion matrix:
 [[3660    1]
 [ 139    1]]
Accuracy: 0.9631675874769797
Precision: 0.5
Recall: 0.007142857142857143
F1-score: 0.014084507042253521


In [7]:
# Hyperparameter tuning by BayesSearchCV, case no.1

from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV


search_space = {
    'n_d': [4, 8, 16, 32],
    'n_a': [4, 8, 16, 32],
    'n_steps': [1, 2, 3, 4],
    'gamma': [1.0, 1.1, 1.2, 1.3, 1.4],
    'lambda_sparse': [0, 0.0001, 0.001, 0.01]
}


clf = TabNetClassifier()


bayes_search = BayesSearchCV(
    clf,
    search_space,
    cv=5,  
    n_iter=50,  
    scoring='f1',  
    n_jobs=-1, 
    verbose=10  
)

# Fit BayesSearchCV to the training data
bayes_search.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

# Print best hyperparameters found
print("Best hyperparameters:", bayes_search.best_params_)

# Use best estimator to make predictions on test data
best_clf = bayes_search.best_estimator_
preds = best_clf.predict(X_test)

# Compute evaluation metrics
cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




epoch 0  | loss: 0.53501 | val_0_auc: 0.52334 |  0:00:00s
epoch 1  | loss: 0.31185 | val_0_auc: 0.66771 |  0:00:01s
epoch 2  | loss: 0.22119 | val_0_auc: 0.66224 |  0:00:02s
epoch 3  | loss: 0.19116 | val_0_auc: 0.56885 |  0:00:03s
epoch 4  | loss: 0.20851 | val_0_auc: 0.61162 |  0:00:04s
epoch 5  | loss: 0.19698 | val_0_auc: 0.62452 |  0:00:05s
epoch 6  | loss: 0.17428 | val_0_auc: 0.67337 |  0:00:05s
epoch 7  | loss: 0.2016  | val_0_auc: 0.72645 |  0:00:06s
epoch 8  | loss: 0.19585 | val_0_auc: 0.60787 |  0:00:07s
epoch 9  | loss: 0.17284 | val_0_auc: 0.75224 |  0:00:08s
epoch 10 | loss: 0.15508 | val_0_auc: 0.80195 |  0:00:09s
epoch 11 | loss: 0.16229 | val_0_auc: 0.81586 |  0:00:10s
epoch 12 | loss: 0.16369 | val_0_auc: 0.84053 |  0:00:10s
epoch 13 | loss: 0.14072 | val_0_auc: 0.8628  |  0:00:11s
epoch 14 | loss: 0.15576 | val_0_auc: 0.88049 |  0:00:12s
epoch 15 | loss: 0.14825 | val_0_auc: 0.8872  |  0:00:13s
epoch 16 | loss: 0.15181 | val_0_auc: 0.92165 |  0:00:13s
epoch 17 | los



Best hyperparameters: OrderedDict([('gamma', 1.1), ('lambda_sparse', 0.0), ('n_a', 32), ('n_d', 32), ('n_steps', 4)])
Confusion matrix:
 [[3643   18]
 [ 134    6]]
Accuracy: 0.9600105235464351
Precision: 0.25
Recall: 0.04285714285714286
F1-score: 0.07317073170731707


In [47]:
# Hyperparameter tuning by RandomizedSearchCV

from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform


param_distributions = {
    'n_d': randint(4, 64),
    'n_a': randint(4, 64),
    'n_steps': randint(1, 5),
    'gamma': uniform(1.0, 0.5),
    'lambda_sparse': uniform(0.0, 0.01)
}


clf = TabNetClassifier()


random_search = RandomizedSearchCV(
    clf,
    param_distributions,
    n_iter=50,  
    cv=5,  
    scoring='f1',  
    n_jobs=-1,  
    verbose=10,  
    random_state=42  
)


random_search.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])


print("Best hyperparameters:", random_search.best_params_)


best_clf = random_search.best_estimator_
preds = best_clf.predict(X_test)


cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)




Fitting 5 folds for each of 50 candidates, totalling 250 fits


KeyboardInterrupt: 

In [9]:
# Hyperparameter tuning by BayesSearchCV, case no.2

from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer


param_dist = {
    'n_d': Integer(4, 64),
    'n_a': Integer(4, 64),
    'n_steps': Integer(1, 5),
    'gamma': Real(1.0, 1.5),
    'lambda_sparse': Real(0.0, 0.01)
}


clf = TabNetClassifier()


bayes_search = BayesSearchCV(
    clf,
    param_dist,
    n_iter=50,  
    cv=5,  
    scoring='f1',  
    n_jobs=-1,  
    verbose=10,  
    random_state=42,  
)


bayes_search.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])


print("Best hyperparameters:", bayes_search.best_params_)


best_clf = bayes_search.best_estimator_
preds = best_clf.predict(X_test)


cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




epoch 0  | loss: 0.80243 | val_0_auc: 0.47296 |  0:00:00s
epoch 1  | loss: 0.50996 | val_0_auc: 0.48818 |  0:00:01s
epoch 2  | loss: 0.4785  | val_0_auc: 0.5528  |  0:00:02s
epoch 3  | loss: 0.36685 | val_0_auc: 0.67669 |  0:00:03s
epoch 4  | loss: 0.27676 | val_0_auc: 0.71056 |  0:00:04s
epoch 5  | loss: 0.22776 | val_0_auc: 0.67946 |  0:00:05s
epoch 6  | loss: 0.22465 | val_0_auc: 0.74394 |  0:00:06s
epoch 7  | loss: 0.22704 | val_0_auc: 0.78297 |  0:00:07s
epoch 8  | loss: 0.20818 | val_0_auc: 0.62129 |  0:00:08s
epoch 9  | loss: 0.19869 | val_0_auc: 0.67692 |  0:00:09s
epoch 10 | loss: 0.20308 | val_0_auc: 0.65017 |  0:00:10s
epoch 11 | loss: 0.19193 | val_0_auc: 0.73042 |  0:00:10s
epoch 12 | loss: 0.18463 | val_0_auc: 0.78886 |  0:00:11s
epoch 13 | loss: 0.17926 | val_0_auc: 0.83275 |  0:00:12s
epoch 14 | loss: 0.19301 | val_0_auc: 0.71906 |  0:00:13s
epoch 15 | loss: 0.21283 | val_0_auc: 0.56548 |  0:00:14s
epoch 16 | loss: 0.19331 | val_0_auc: 0.4635  |  0:00:15s
epoch 17 | los



Best hyperparameters: OrderedDict([('gamma', 1.2511469928933538), ('lambda_sparse', 0.0063456874545047785), ('n_a', 4), ('n_d', 50), ('n_steps', 5)])
Confusion matrix:
 [[3649   12]
 [ 135    5]]
Accuracy: 0.9613259668508287
Precision: 0.29411764705882354
Recall: 0.03571428571428571
F1-score: 0.06369426751592357


In [22]:
importance_list = {'자기자본구성비율': 0.015891374992008045, '설비투자효율': 0.0045242446632410335, '총자본투자효율': 0.0, '이자보상배율(이자비용)': 0.05185259401072577, 
                   '유동비율': 0.022996984382098042, '당좌비율': 0.016972575501744545, '부채비율': 0.034091570015875976, '총자본정상영업이익률': 0.0, 
                   '매출액정상영업이익률': 0.08881435575290814, '매출액순이익률': 0.006690544722667798, '자기자본순이익률': 0.19039009294946668, '매출채권회전률': 7.74683043573838e-06,
                   '재고자산회전률': 0.0, '총자본회전률': 0.00033547288851654735, '순운전자본비율': 0.0006930729652971988, '매출액증가율': 0.012524567760493682, 
                   '총자본증가율': 0.0002542533595306181, '유동자산증가율': 0.0, '유형자산증가율': 0.0, '영업이익증가율': 0.0014866190326346318, 
                   '순이익증가율': 7.64040343186483e-06, 'RETA': 0.048075953791446295, 'EBTA': 0.16127269610709316, 'OM': 0.07045175382385123, 
                   '종업원수증가율': 0.0042413277011958996, '영업이익변화율': 0.021801976057722737, '매출액변화율': 6.611939482877365e-05, '당기순이익변화율': 0.0, 
                   'DOL': 0.0, 'DFL': 0.0009503108883668936, 'EV/EBITDA': 0.0039413797407759354, '영업활동으로 인한 현금흐름': 0.02564916035405224, 
                   '금융비용부담률': 0.057867099532558065, '고정비율': 0.00048291392289406327, 'R&D비율': 0.0, '채무부담비율': 0.0, 
                   '거래량회전율': 0.0, '로그시가총액': 0.0, '수정거래량': 0.04251240124551997, '거래량증가율': 0.002735341869645074, 
                   '시가총액증가율': 0.07615808087531853, '시가총액': 0.036259774463654824}

my_series = pd.Series(importance_list)

importance = pd.DataFrame(my_series)

importance.sort_values(by=0,ascending=False)



Unnamed: 0,0
자기자본순이익률,0.19039
EBTA,0.161273
매출액정상영업이익률,0.088814
시가총액증가율,0.076158
OM,0.070452
금융비용부담률,0.057867
이자보상배율(이자비용),0.051853
RETA,0.048076
수정거래량,0.042512
시가총액,0.03626
