In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('./Tabnet_Raw_final.csv')

In [3]:
df.replace((np.inf, -np.inf), np.nan, inplace=True)
df.dropna(inplace=True)

In [4]:
## 이상치의 값들을 출력하는 함수 생성
## 매개변수 2개 데이터프레임, 컬럼들들의 이름

def outliers_iqr(df, cols):
    outliers = pd.DataFrame()
    for col in cols:
        ## 1사분위수, 3사분위수 변수에 대입
        quartile_1, quartile_3 = np.percentile(df[col], [25, 75])
        ## IQR 값을 대입
        iqr = quartile_3 - quartile_1
        ## 하단 경계 값, 상단 경계 값
        lower_whis = quartile_1 - (1.5 * iqr)
        upper_whis = quartile_3 + (1.5 * iqr)
        print(col, lower_whis, upper_whis)
        ## 데이터프레임에서 상단의 경계보다 크거나 하단의 경계보다 작은 데이터를 출력
        outliers = df[(df[col] > upper_whis) | (df[col] < lower_whis)]
    return outliers


In [5]:
## 이상치를 확인하는 함수를 호출
outliers = outliers_iqr(df, ['자기자본구성비율', '설비투자효율', '총자본투자효율', '이자보상배율(이자비용)',
       '유동비율', '당좌비율', '부채비율', '총자본정상영업이익률', '매출액정상영업이익률', '매출액순이익률',
       '자기자본순이익률', '매출채권회전률', '재고자산회전률', '총자본회전률', '순운전자본비율', '매출액증가율',
       '총자본증가율', '유동자산증가율', '유형자산증가율', '영업이익증가율', '순이익증가율', 'RETA', 'EBTA',
       'OM', '종업원수증가율', '영업이익변화율', '매출액변화율', '당기순이익변화율', 'DOL', 'DFL',
       'EV/EBITDA', '영업활동으로 인한 현금흐름', '금융비용부담률', '고정비율', 'R&D비율', '채무부담비율',
       '거래량회전율', '로그시가총액', '수정거래량', '거래량증가율', '시가총액증가율', '시가총액'])
outliers

자기자본구성비율 -6.836250000000014 118.45375000000001
설비투자효율 -5.908750000000005 120.40125
총자본투자효율 -13.18 55.32
이자보상배율(이자비용) -1499999995.6 2499999995.76
유동비율 -120.85624999999999 465.41375
당좌비율 -118.54250000000002 379.9175
부채비율 -93.76749999999998 231.6325
총자본정상영업이익률 -8.909999999999998 16.369999999999997
매출액정상영업이익률 -9.945 18.335
매출액순이익률 -12.60625 17.72375
자기자본순이익률 -20.09125 27.558749999999996
매출채권회전률 -3.4099999999999993 10.43
재고자산회전률 -47.617500000000014 140.8025
총자본회전률 -0.2300000000000002 2.33
순운전자본비율 -33.26 77.08
매출액증가율 -0.3738512821221669 0.46692003645899677
총자본증가율 -0.2510565504764261 0.35926653764190364
유동자산증가율 -0.43776800587089293 0.5374501136557726
유형자산증가율 -0.29767798064125073 0.3886468620871835
영업이익증가율 -2.616178178594921 2.234060942621259
순이익증가율 -2.6462152800968095 2.2564719974811016
RETA -0.4506295047773456 0.9530648439025948
EBTA -0.12093881169054703 0.18101058466619938
OM -0.14174144500938185 0.21032145824834317
종업원수증가율 -0.23666097171929423 0.27532991636490967
영업이익변화율 -2.875427658463425

Unnamed: 0,회사명,거래소코드,회계년도,자기자본구성비율,설비투자효율,총자본투자효율,이자보상배율(이자비용),유동비율,당좌비율,부채비율,...,고정비율,R&D비율,채무부담비율,거래량회전율,로그시가총액,수정거래량,거래량증가율,시가총액증가율,시가총액,부실
6,(주)CMG제약,58820,2017,89.39,64.09,33.01,1.000000e+09,746.28,673.66,11.11,...,16.522212,4.323740,88.585907,10.245732,12.712526,23548.907778,0.001166,0.811200,5.158533e+12,0
7,(주)CMG제약,58820,2018,90.91,67.33,33.79,1.000000e+09,957.86,856.94,8.17,...,16.607260,1.559812,48.558556,8.811970,12.808381,16242.253593,-0.139937,0.246966,6.432514e+12,0
8,(주)CMG제약,58820,2019,93.71,72.60,32.47,1.000000e+09,1618.07,1520.84,5.17,...,13.387896,3.400411,47.833673,6.889406,12.766702,22742.366441,-0.002737,-0.091507,5.843891e+12,0
83,(주)강스템바이오텍,217730,2019,60.47,0.00,-223.93,-3.942000e+01,268.99,267.69,29.98,...,45.430546,206.765081,231.539492,4.222263,12.582402,640.905243,-0.126843,0.052018,3.822983e+12,0
120,(주)경방,50,2016,52.22,19.66,33.31,5.290000e+00,68.20,38.72,22.86,...,46.407880,0.001930,42.348996,0.199982,12.659407,32.928300,6.764822,7.763158,4.564642e+12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10391,현대제철(주),4020,2019,50.15,37.59,14.59,1.190000e+00,149.19,64.84,37.25,...,119.320025,0.000000,22.007090,0.565282,12.701934,1999.590643,-0.456565,-0.263543,5.034242e+12,0
10455,환인제약(주),16580,2015,87.89,53.56,39.23,1.000000e+09,658.97,572.60,11.49,...,25.367759,0.000000,62.943580,1.140909,12.599928,99.163486,0.369844,0.268148,3.980415e+12,0
10458,환인제약(주),16580,2018,90.23,51.88,40.05,1.000000e+09,794.01,684.53,9.11,...,15.131443,0.418821,40.444932,1.477700,12.586533,132.459383,-0.188496,0.084967,3.859515e+12,0
10484,휴젤(주),145020,2016,93.08,21.47,70.84,1.000000e+09,760.09,692.05,7.27,...,21.471858,3.273205,23.935906,3.005742,12.923972,3.861906,10.662331,1.574018,8.394052e+12,0


In [6]:
# 이상치 제거
drop_outliers = df.drop(outliers.index)
# 이상치 삭제 확인
print('원본데이터 크기:', df.shape)
print('이상치 제거 데이터 크기:', drop_outliers.shape)

원본데이터 크기: (10136, 46)
이상치 제거 데이터 크기: (9281, 46)


In [7]:
df_2 = drop_outliers

In [8]:
df_2.isna().sum()

회사명               0
거래소코드             0
회계년도              0
자기자본구성비율          0
설비투자효율            0
총자본투자효율           0
이자보상배율(이자비용)      0
유동비율              0
당좌비율              0
부채비율              0
총자본정상영업이익률        0
매출액정상영업이익률        0
매출액순이익률           0
자기자본순이익률          0
매출채권회전률           0
재고자산회전률           0
총자본회전률            0
순운전자본비율           0
매출액증가율            0
총자본증가율            0
유동자산증가율           0
유형자산증가율           0
영업이익증가율           0
순이익증가율            0
RETA              0
EBTA              0
OM                0
종업원수증가율           0
영업이익변화율           0
매출액변화율            0
당기순이익변화율          0
DOL               0
DFL               0
EV/EBITDA         0
영업활동으로 인한 현금흐름    0
금융비용부담률           0
고정비율              0
R&D비율             0
채무부담비율            0
거래량회전율            0
로그시가총액            0
수정거래량             0
거래량증가율            0
시가총액증가율           0
시가총액              0
부실                0
dtype: int64

In [9]:
df_2['부실'].value_counts()

0    8877
1     404
Name: 부실, dtype: int64

In [12]:
df_0 = df_2[df_2['부실'] == 0]

In [13]:
df_1 = df_2[df_2['부실'] == 1]

In [17]:
df_0.columns

Index(['회사명', '거래소코드', '회계년도', '자기자본구성비율', '설비투자효율', '총자본투자효율', '이자보상배율(이자비용)',
       '유동비율', '당좌비율', '부채비율', '총자본정상영업이익률', '매출액정상영업이익률', '매출액순이익률',
       '자기자본순이익률', '매출채권회전률', '재고자산회전률', '총자본회전률', '순운전자본비율', '매출액증가율',
       '총자본증가율', '유동자산증가율', '유형자산증가율', '영업이익증가율', '순이익증가율', 'RETA', 'EBTA',
       'OM', '종업원수증가율', '영업이익변화율', '매출액변화율', '당기순이익변화율', 'DOL', 'DFL',
       'EV/EBITDA', '영업활동으로 인한 현금흐름', '금융비용부담률', '고정비율', 'R&D비율', '채무부담비율',
       '거래량회전율', '로그시가총액', '수정거래량', '거래량증가율', '시가총액증가율', '시가총액', '부실'],
      dtype='object')

In [22]:
pd.options.display.float_format = '{:.5f}'.format

In [23]:
df_0[['수정거래량', '거래량증가율', '거래량회전율', '부실']].describe()

Unnamed: 0,수정거래량,거래량증가율,거래량회전율,부실
count,8877.0,8877.0,8877.0,8877.0
mean,6742.82722,1.00534,3.71658,0.0
std,34656.36254,7.31643,7.40589,0.0
min,0.0,-1.0,0.0,0.0
25%,160.90148,-0.42302,0.81021,0.0
50%,657.206,-0.0484,1.92216,0.0
75%,2843.77964,0.72064,4.27138,0.0
max,1345120.49058,492.17653,457.03561,0.0


In [24]:
df_1[['수정거래량', '거래량증가율', '거래량회전율', '부실']].describe()

Unnamed: 0,수정거래량,거래량증가율,거래량회전율,부실
count,404.0,404.0,404.0,404.0
mean,29798.02406,1.75029,7.22845,1.0
std,98005.64738,7.40877,13.99498,0.0
min,0.0,-1.0,0.0,1.0
25%,872.41311,-0.43265,1.77832,1.0
50%,4119.14263,0.07724,4.04691,1.0
75%,22194.55891,1.48201,7.58188,1.0
max,1329816.21494,122.0587,202.08361,1.0


In [53]:
# Oversampling using SMOTE on train dataset, with hyperparameters set by optuna

train = df[df['회계년도'] <= 2017]
test = df[df['회계년도'] > 2017]

#X_train = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values
#X_test = test.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values

X_train = train[['자기자본순이익률', '시가총액', '금융비용부담률', 'EBTA', '수정거래량']].values
X_test = test[['자기자본순이익률', '시가총액', '금융비용부담률', 'EBTA', '수정거래량']].values

y_train = train['부실'].values
y_test = test['부실'].values

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

# Resample using SMOTE
smote = SMOTE(sampling_strategy='minority' ,k_neighbors=6)
#smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = smote.fit_resample(X_test, y_test)

# Resample using Random Under Sampler
#rus = RandomUnderSampler()
#X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = rus.fit_resample(X_test, y_test)

X_train = X_resampled
X_test = X_test
y_train = y_resampled
y_test = y_test

feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1)

clf = TabNetClassifier(
        n_d=46,
        n_a=38,
        n_steps=5,
        gamma=1.5515070542008145,
        lambda_sparse=0.5371050923936614,
        optimizer_params={"lr": 1e-3},
        verbose=0
    )

class_weights = {0: 1, 1: len(y_train[y_train == 0])/len(y_train[y_train == 1])}
clf.class_weights = class_weights

# Train TabNetClassifier on training data
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_name=["val"],
    eval_metric=["auc"],
    max_epochs=10,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
auc = roc_auc_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)
print("Auc score:", auc)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)


Stop training because you reached max_epochs = 10 with best_epoch = 7 and best_val_auc = 0.88917




Confusion matrix:
 [[1176 1329]
 [  11   76]]
Accuracy: 0.48302469135802467
Precision: 0.05409252669039146
Recall: 0.8735632183908046
F1-score: 0.10187667560321716
Auc score: 0.671512148117558
{'자기자본구성비율': 0.2454485088693349, '설비투자효율': 0.23582817725974473, '총자본투자효율': 0.09693661895675995, '이자보상배율(이자비용)': 0.17634549953586964, '유동비율': 0.2454411953782908}


In [50]:
df.columns

Index(['회사명', '거래소코드', '회계년도', '자기자본구성비율', '설비투자효율', '총자본투자효율', '이자보상배율(이자비용)',
       '유동비율', '당좌비율', '부채비율', '총자본정상영업이익률', '매출액정상영업이익률', '매출액순이익률',
       '자기자본순이익률', '매출채권회전률', '재고자산회전률', '총자본회전률', '순운전자본비율', '매출액증가율',
       '총자본증가율', '유동자산증가율', '유형자산증가율', '영업이익증가율', '순이익증가율', 'RETA', 'EBTA',
       'OM', '종업원수증가율', '영업이익변화율', '매출액변화율', '당기순이익변화율', 'DOL', 'DFL',
       'EV/EBITDA', '영업활동으로 인한 현금흐름', '금융비용부담률', '고정비율', 'R&D비율', '채무부담비율',
       '거래량회전율', '로그시가총액', '수정거래량', '거래량증가율', '시가총액증가율', '시가총액', '부실'],
      dtype='object')

In [29]:
train = df[df['회계년도'] <= 2017]
test = df[df['회계년도'] > 2017]

#X_train = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values
#X_test = test.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values

X_train = train.drop(['회사명', '거래소코드', '회계년도'], axis=1).values
X_test = test.drop(['회사명', '거래소코드', '회계년도'], axis=1).values

y_train = train['부실'].values
y_test = test['부실'].values

feature_names = ['자기자본구성비율', '설비투자효율', '총자본투자효율', '이자보상배율(이자비용)',
       '유동비율', '당좌비율', '부채비율', '총자본정상영업이익률', '매출액정상영업이익률', '매출액순이익률',
       '자기자본순이익률', '매출채권회전률', '재고자산회전률', '총자본회전률', '순운전자본비율', '매출액증가율',
       '총자본증가율', '유동자산증가율', '유형자산증가율', '영업이익증가율', '순이익증가율', 'RETA', 'EBTA',
       'OM', '종업원수증가율', '영업이익변화율', '매출액변화율', '당기순이익변화율', 'DOL', 'DFL',
       'EV/EBITDA', '영업활동으로 인한 현금흐름', '금융비용부담률', '고정비율', 'R&D비율', '채무부담비율',
       '거래량회전율', '로그시가총액', '수정거래량', '거래량증가율', '시가총액증가율', '시가총액']

#feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

# Resample using SMOTE
smote = SMOTE(sampling_strategy='minority' ,k_neighbors=6)
#smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = smote.fit_resample(X_test, y_test)

# Resample using Random Under Sampler
#rus = RandomUnderSampler()
#X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = rus.fit_resample(X_test, y_test)

X_train = X_resampled
X_test = X_test
y_train = y_resampled
y_test = y_test


X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1)

clf = TabNetClassifier(
        n_d=56,
        n_a=56,
        n_steps=2,
        n_shared=3,
        gamma=1.4,
        lambda_sparse=0.0001021180196306919,
        optimizer_params={"lr": 1e-3},
        verbose=0,
        mask_type='sparsemax'
    )

#{'mask_type': 'entmax', 'n_da': 56, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 0.0001021180196306919, 'patienceScheduler': 4, 'patience': 23, 'epochs': 100}

#class_weights = {0: 1, 1: len(y_train[y_train == 0])/len(y_train[y_train == 1])}
#clf.class_weights = class_weights

# Train TabNetClassifier on training data
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_name=["val"],
    eval_metric=["auc"],
    max_epochs=10,
    patience=23,
    batch_size=1024,
    virtual_batch_size=128
)

preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)

Stop training because you reached max_epochs = 10 with best_epoch = 8 and best_val_auc = 0.99898




Confusion matrix:
 [[2478   27]
 [   2   85]]
Accuracy: 0.9888117283950617
Precision: 0.7589285714285714
Recall: 0.9770114942528736
F1-score: 0.8542713567839196
{'자기자본구성비율': 0.008880312497652323, '설비투자효율': 0.005130097761845933, '총자본투자효율': 0.00068389136760587, '이자보상배율(이자비용)': 0.002011284743625872, '유동비율': 0.002239878765571378, '당좌비율': 0.00517395225278928, '부채비율': 0.018867925029400816, '총자본정상영업이익률': 0.01314960924077827, '매출액정상영업이익률': 0.007079350070861711, '매출액순이익률': 0.0082588730116213, '자기자본순이익률': 0.0069817546156278896, '매출채권회전률': 0.007415998764642206, '재고자산회전률': 0.027701363210748106, '총자본회전률': 0.00040348915852457204, '순운전자본비율': 0.023058041998405835, '매출액증가율': 0.002142815132262591, '총자본증가율': 0.0029850195109245286, '유동자산증가율': 0.00472717947210003, '유형자산증가율': 0.0005642029213295793, '영업이익증가율': 0.006058510577228073, '순이익증가율': 0.0006263934461270727, 'RETA': 0.07601709848886395, 'EBTA': 0.13732207781676017, 'OM': 0.09497569681227323, '종업원수증가율': 0.032501442796050425, '영업이익변화율': 0.002026243713414

In [51]:
# adding more data to test dataset with the same option above 

train = df[df['회계년도'] <= 2016]
test = df[df['회계년도'] > 2016]

#X_train = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values
#X_test = test.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).values

X_train = train.drop(['회사명', '거래소코드', '회계년도', '부실'], axis=1).values
X_test = test.drop(['회사명', '거래소코드', '회계년도', '부실'], axis=1).values

y_train = train['부실'].values
y_test = test['부실'].values

feature_names = ['자기자본구성비율', '설비투자효율', '총자본투자효율', '이자보상배율(이자비용)',
       '유동비율', '당좌비율', '부채비율', '총자본정상영업이익률', '매출액정상영업이익률', '매출액순이익률',
       '자기자본순이익률', '매출채권회전률', '재고자산회전률', '총자본회전률', '순운전자본비율', '매출액증가율',
       '총자본증가율', '유동자산증가율', '유형자산증가율', '영업이익증가율', '순이익증가율', 'RETA', 'EBTA',
       'OM', '종업원수증가율', '영업이익변화율', '매출액변화율', '당기순이익변화율', 'DOL', 'DFL',
       'EV/EBITDA', '영업활동으로 인한 현금흐름', '금융비용부담률', '고정비율', 'R&D비율', '채무부담비율',
       '거래량회전율', '로그시가총액', '수정거래량', '거래량증가율', '시가총액증가율', '시가총액']

#feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

# Resample using SMOTE
smote = SMOTE(sampling_strategy='minority' ,k_neighbors=6)
#smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = smote.fit_resample(X_test, y_test)

# Resample using Random Under Sampler
#rus = RandomUnderSampler()
#X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = rus.fit_resample(X_test, y_test)

X_train = X_resampled
X_test = X_test
y_train = y_resampled
y_test = y_test


X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1)

clf = TabNetClassifier(
        n_d=56,
        n_a=56,
        n_steps=2,
        n_shared=3,
        gamma=1.4,
        lambda_sparse=0.0001021180196306919,
        optimizer_params={"lr": 1e-3},
        verbose=0,
        mask_type='sparsemax'
    )

#{'mask_type': 'entmax', 'n_da': 56, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 0.0001021180196306919, 'patienceScheduler': 4, 'patience': 23, 'epochs': 100}

#class_weights = {0: 1, 1: len(y_train[y_train == 0])/len(y_train[y_train == 1])}
#clf.class_weights = class_weights

# Train TabNetClassifier on training data
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_name=["val"],
    eval_metric=["auc"],
    max_epochs=10,
    patience=23,
    batch_size=1024,
    virtual_batch_size=128
)

preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)

Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_auc = 0.71331




Confusion matrix:
 [[2983  678]
 [  65   75]]
Accuracy: 0.8045251249671139
Precision: 0.099601593625498
Recall: 0.5357142857142857
F1-score: 0.16797312430011196
{'자기자본구성비율': 0.027970501706049658, '설비투자효율': 0.030350182328200775, '총자본투자효율': 0.005614583887533744, '이자보상배율(이자비용)': 0.0038012142333936436, '유동비율': 0.0005986887907009765, '당좌비율': 0.005282192128820619, '부채비율': 0.015927778842861406, '총자본정상영업이익률': 0.00022912988817210686, '매출액정상영업이익률': 0.03821932591852601, '매출액순이익률': 0.08366169667413555, '자기자본순이익률': 0.011170639706088595, '매출채권회전률': 0.02257562638469738, '재고자산회전률': 0.018939483895337794, '총자본회전률': 0.052583300704769124, '순운전자본비율': 0.08473322156505694, '매출액증가율': 0.007944327420595948, '총자본증가율': 0.003508951309230041, '유동자산증가율': 0.0036259400089062557, '유형자산증가율': 0.001138669266600429, '영업이익증가율': 0.001367376666633314, '순이익증가율': 0.009147730296967029, 'RETA': 0.0689375261197117, 'EBTA': 0.030741351093114933, 'OM': 0.047794987087667454, '종업원수증가율': 0.010883955234447297, '영업이익변화율': 0.0099378811511

In [52]:
# After data preprocessing, without hyperparameters set by optuna

train = df_2[df_2['회계년도'] <= 2017]
test = df_2[df_2['회계년도'] > 2017]

X_train = train.drop(['부실','회사명', '회계년도', '거래소코드', '시가총액'], axis=1).values
X_test = test.drop(['부실', '회사명','회계년도', '거래소코드', '시가총액'], axis=1).values

#X_train = train[['자기자본구성비율', '유동비율', '당좌비율','부채비율','총자본회전률','순운전자본비율','총자본증가율', 
#'유동자산증가율', '유형자산증가율', '영업이익증가율', 'RETA', '매출액변화율', '시가총액증가율', '수정거래량']].values
#X_test = test[['자기자본구성비율', '유동비율', '당좌비율','부채비율','총자본회전률','순운전자본비율','총자본증가율', 
#'유동자산증가율', '유형자산증가율', '영업이익증가율', 'RETA', '매출액변화율', '시가총액증가율', '수정거래량']].values

#X_train = train[['수정거래량']].values
#X_test = test[['수정거래량']].values

y_train = train['부실'].values
y_test = test['부실'].values

feature_names = ['자기자본구성비율', '유동비율', '당좌비율','부채비율','총자본회전률','순운전자본비율','총자본증가율', 
'유동자산증가율', '유형자산증가율', '영업이익증가율', 'RETA', '매출액변화율', '시가총액증가율','수정거래량']

#feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

# Resample using SMOTE
smote = SMOTE(sampling_strategy='minority' ,k_neighbors=2)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = smote.fit_resample(X_test, y_test)

# Resample using Random Under Sampler
#rus = RandomUnderSampler()
#X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = rus.fit_resample(X_test, y_test)

X_train = X_resampled
X_test = X_test
y_train = y_resampled
y_test = y_test


X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, random_state=300)

clf = TabNetClassifier()

#clf = TabNetClassifier(
#        n_d=56,
#        n_a=56,
#        n_steps=2,
#        n_shared=3,
#        gamma=1.4,
#        lambda_sparse=0.0001021180196306919,
#        optimizer_params={"lr": 1e-3},
#        verbose=0,
#        mask_type='entmax'
#    )

#{'mask_type': 'entmax', 'n_da': 56, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 0.0001021180196306919, 'patienceScheduler': 4, 'patience': 23, 'epochs': 100}

#class_weights = {0: 1, 1: len(y_train[y_train == 0])/len(y_train[y_train == 1])}
#clf.class_weights = class_weights

# Train TabNetClassifier on training data

clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

#clf.fit(
#    X_train,
#    y_train,
#    eval_set=[(X_valid, y_valid)],
#    eval_name=["val"],
#    eval_metric=["auc"],
#    max_epochs=10,
#    patience=23,
#    batch_size=1024,
#    virtual_batch_size=128
#)

preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
auc = roc_auc_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)
print("Auc score:", auc)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.74439 | val_0_auc: 0.58768 |  0:00:01s
epoch 1  | loss: 0.56999 | val_0_auc: 0.609   |  0:00:02s
epoch 2  | loss: 0.50165 | val_0_auc: 0.80445 |  0:00:03s
epoch 3  | loss: 0.46512 | val_0_auc: 0.86234 |  0:00:04s
epoch 4  | loss: 0.43376 | val_0_auc: 0.83736 |  0:00:05s
epoch 5  | loss: 0.41801 | val_0_auc: 0.87936 |  0:00:06s
epoch 6  | loss: 0.4071  | val_0_auc: 0.86725 |  0:00:07s
epoch 7  | loss: 0.39753 | val_0_auc: 0.8979  |  0:00:08s
epoch 8  | loss: 0.3969  | val_0_auc: 0.89035 |  0:00:10s
epoch 9  | loss: 0.39021 | val_0_auc: 0.8868  |  0:00:11s
epoch 10 | loss: 0.38951 | val_0_auc: 0.89548 |  0:00:12s
epoch 11 | loss: 0.37993 | val_0_auc: 0.89578 |  0:00:13s
epoch 12 | loss: 0.38498 | val_0_auc: 0.90021 |  0:00:14s
epoch 13 | loss: 0.37704 | val_0_auc: 0.89259 |  0:00:15s
epoch 14 | loss: 0.37407 | val_0_auc: 0.89985 |  0:00:16s
epoch 15 | loss: 0.36206 | val_0_auc: 0.90064 |  0:00:17s
epoch 16 | loss: 0.36159 | val_0_auc: 0.91248 |  0:00:18s
epoch 17 | los



Confusion matrix:
 [[1744  451]
 [  22   59]]
Accuracy: 0.7921792618629174
Precision: 0.11568627450980393
Recall: 0.7283950617283951
F1-score: 0.1996615905245347
Auc score: 0.7614640456705757
{'자기자본구성비율': 1.0112812338077407e-05, '유동비율': 0.00011593830003722763, '당좌비율': 0.011229519276232782, '부채비율': 0.01915395341288318, '총자본회전률': 0.06461771639478257, '순운전자본비율': 0.0, '총자본증가율': 0.22156787892750185, '유동자산증가율': 0.07199107088486213, '유형자산증가율': 0.01910203824873215, '영업이익증가율': 0.014596395005500133, 'RETA': 1.455325995548049e-05, '매출액변화율': 0.00021722885422585994, '시가총액증가율': 3.280470770375977e-05, '수정거래량': 3.8669248233662185e-08}


In [53]:
# After train, test dataset splited / Train, test dataset resampled by SMOTE 

train = df_2[df_2['회계년도'] <= 2017]
test = df_2[df_2['회계년도'] > 2017]

X_train = train.drop(['부실','회사명', '회계년도', '거래소코드', '시가총액'], axis=1).values
X_test = test.drop(['부실', '회사명','회계년도', '거래소코드', '시가총액'], axis=1).values

y_train = train['부실'].values
y_test = test['부실'].values

feature_names = ['자기자본구성비율', '설비투자효율', '총자본투자효율', '이자보상배율(이자비용)',
       '유동비율', '당좌비율', '부채비율', '총자본정상영업이익률', '매출액정상영업이익률', '매출액순이익률',
       '자기자본순이익률', '매출채권회전률', '재고자산회전률', '총자본회전률', '순운전자본비율', '매출액증가율',
       '총자본증가율', '유동자산증가율', '유형자산증가율', '영업이익증가율', '순이익증가율', 'RETA', 'EBTA',
       'OM', '종업원수증가율', '영업이익변화율', '매출액변화율', '당기순이익변화율', 'DOL', 'DFL',
       'EV/EBITDA', '영업활동으로 인한 현금흐름', '금융비용부담률', '고정비율', 'R&D비율', '채무부담비율',
       '거래량회전율', '로그시가총액', '수정거래량', '거래량증가율', '시가총액증가율', '시가총액']

#feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

# Resample using SMOTE
smote = SMOTE(sampling_strategy='minority' ,k_neighbors=5)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_resampled_test, y_resampled_test = smote.fit_resample(X_test, y_test)

X_train = X_resampled
X_test = X_resampled_test
y_train = y_resampled
y_test = y_resampled_test


X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=300)

clf = TabNetClassifier()

# Train TabNetClassifier on training data

clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.73576 | val_0_auc: 0.6858  |  0:00:01s
epoch 1  | loss: 0.5515  | val_0_auc: 0.79525 |  0:00:02s
epoch 2  | loss: 0.47323 | val_0_auc: 0.78115 |  0:00:03s
epoch 3  | loss: 0.4498  | val_0_auc: 0.86786 |  0:00:04s
epoch 4  | loss: 0.43883 | val_0_auc: 0.87601 |  0:00:05s
epoch 5  | loss: 0.43012 | val_0_auc: 0.88508 |  0:00:06s
epoch 6  | loss: 0.42658 | val_0_auc: 0.8914  |  0:00:07s
epoch 7  | loss: 0.42544 | val_0_auc: 0.8897  |  0:00:08s
epoch 8  | loss: 0.41724 | val_0_auc: 0.89403 |  0:00:09s
epoch 9  | loss: 0.40219 | val_0_auc: 0.89205 |  0:00:10s
epoch 10 | loss: 0.40161 | val_0_auc: 0.90153 |  0:00:11s
epoch 11 | loss: 0.39909 | val_0_auc: 0.90354 |  0:00:12s
epoch 12 | loss: 0.39801 | val_0_auc: 0.89897 |  0:00:13s
epoch 13 | loss: 0.39134 | val_0_auc: 0.89557 |  0:00:14s
epoch 14 | loss: 0.3901  | val_0_auc: 0.88648 |  0:00:15s
epoch 15 | loss: 0.38234 | val_0_auc: 0.88391 |  0:00:16s
epoch 16 | loss: 0.38544 | val_0_auc: 0.8999  |  0:00:17s
epoch 17 | los



Confusion matrix:
 [[1658  537]
 [ 454 1741]]
Accuracy: 0.774259681093394
Precision: 0.7642669007901668
Recall: 0.7931662870159454
F1-score: 0.7784484685893137
{'자기자본구성비율': 0.020492703046638375, '설비투자효율': 0.0016127746057385296, '총자본투자효율': 0.017777130485258648, '이자보상배율(이자비용)': 3.364235434236954e-05, '유동비율': 3.44867435771246e-05, '당좌비율': 0.057013575985215496, '부채비율': 0.12651298610873915, '총자본정상영업이익률': 0.045731056786025084, '매출액정상영업이익률': 0.02936051499201158, '매출액순이익률': 0.0, '자기자본순이익률': 0.026495164470319865, '매출채권회전률': 0.0001616545052003242, '재고자산회전률': 0.03527456346862133, '총자본회전률': 0.0, '순운전자본비율': 6.373079623812728e-06, '매출액증가율': 0.0, '총자본증가율': 0.0020254109862074617, '유동자산증가율': 0.0, '유형자산증가율': 0.005561264758950418, '영업이익증가율': 0.00015842424623885588, '순이익증가율': 0.006837045833347332, 'RETA': 0.04893937237402302, 'EBTA': 0.14708946860269478, 'OM': 0.11854895461771188, '종업원수증가율': 5.411674449555128e-05, '영업이익변화율': 0.0003193923881344992, '매출액변화율': 9.504089899783909e-07, '당기순이익변화율': 0.00023232989

In [54]:
# Resampling data using SMOTE

X = df_2.drop(['부실', '회사명', '회계년도', '거래소코드', '시가총액'], axis=1)
y = df_2['부실']

smote = SMOTE(sampling_strategy='minority' ,k_neighbors=5)

X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=24)

#train = df_2[df_2['회계년도'] <= 2017]
#test = df_2[df_2['회계년도'] > 2017]

#X_train = train.drop(['부실','회사명', '회계년도', '거래소코드', '시가총액'], axis=1).values
#X_test = test.drop(['부실', '회사명','회계년도', '거래소코드', '시가총액'], axis=1).values

#X_train = train[['자기자본구성비율', '유동비율', '당좌비율','부채비율','총자본회전률','순운전자본비율','총자본증가율', 
#'유동자산증가율', '유형자산증가율', '영업이익증가율', 'RETA', '매출액변화율', '시가총액증가율', '수정거래량']].values
#X_test = test[['자기자본구성비율', '유동비율', '당좌비율','부채비율','총자본회전률','순운전자본비율','총자본증가율', 
#'유동자산증가율', '유형자산증가율', '영업이익증가율', 'RETA', '매출액변화율', '시가총액증가율', '수정거래량']].values

#X_train = train[['수정거래량']].values
#X_test = test[['수정거래량']].values

#y_train = train['부실'].values
#y_test = test['부실'].values

feature_names = ['자기자본구성비율', '설비투자효율', '총자본투자효율', '이자보상배율(이자비용)',
       '유동비율', '당좌비율', '부채비율', '총자본정상영업이익률', '매출액정상영업이익률', '매출액순이익률',
       '자기자본순이익률', '매출채권회전률', '재고자산회전률', '총자본회전률', '순운전자본비율', '매출액증가율',
       '총자본증가율', '유동자산증가율', '유형자산증가율', '영업이익증가율', '순이익증가율', 'RETA', 'EBTA',
       'OM', '종업원수증가율', '영업이익변화율', '매출액변화율', '당기순이익변화율', 'DOL', 'DFL',
       'EV/EBITDA', '영업활동으로 인한 현금흐름', '금융비용부담률', '고정비율', 'R&D비율', '채무부담비율',
       '거래량회전율', '로그시가총액', '수정거래량', '거래량증가율', '시가총액증가율', '시가총액']

#feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

# Resample using SMOTE
#smote = SMOTE(sampling_strategy='minority' ,k_neighbors=5)

#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = smote.fit_resample(X_test, y_test)

# Resample using Random Under Sampler
#rus = RandomUnderSampler()
#X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = rus.fit_resample(X_test, y_test)

X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values


#X_train, X_valid, y_train, y_valid = train_test_split(
#    X_train, y_train, test_size=0.1, random_state=300)

clf = TabNetClassifier()

#clf = TabNetClassifier(
#        n_d=56,
#        n_a=56,
#        n_steps=2,
#        n_shared=3,
#        gamma=1.4,
#        lambda_sparse=0.0001021180196306919,
#        optimizer_params={"lr": 1e-3},
#        verbose=0,
#        mask_type='entmax'
#    )

#{'mask_type': 'entmax', 'n_da': 56, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 3, 'lambda_sparse': 0.0001021180196306919, 'patienceScheduler': 4, 'patience': 23, 'epochs': 100}

#class_weights = {0: 1, 1: len(y_train[y_train == 0])/len(y_train[y_train == 1])}
#clf.class_weights = class_weights

# Train TabNetClassifier on training data

clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

#clf.fit(
#    X_train,
#    y_train,
#    eval_set=[(X_valid, y_valid)],
#    eval_name=["val"],
#    eval_metric=["auc"],
#    max_epochs=10,
#    patience=23,
#    batch_size=1024,
#    virtual_batch_size=128
#)

preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.71897 | val_0_auc: 0.68898 |  0:00:01s
epoch 1  | loss: 0.54881 | val_0_auc: 0.80846 |  0:00:02s
epoch 2  | loss: 0.46825 | val_0_auc: 0.8671  |  0:00:04s
epoch 3  | loss: 0.43751 | val_0_auc: 0.87409 |  0:00:05s
epoch 4  | loss: 0.42703 | val_0_auc: 0.87495 |  0:00:06s
epoch 5  | loss: 0.41549 | val_0_auc: 0.89039 |  0:00:08s
epoch 6  | loss: 0.4114  | val_0_auc: 0.89218 |  0:00:09s
epoch 7  | loss: 0.40216 | val_0_auc: 0.90435 |  0:00:10s
epoch 8  | loss: 0.39711 | val_0_auc: 0.90506 |  0:00:12s
epoch 9  | loss: 0.38917 | val_0_auc: 0.90931 |  0:00:13s
epoch 10 | loss: 0.38773 | val_0_auc: 0.89574 |  0:00:14s
epoch 11 | loss: 0.38432 | val_0_auc: 0.88806 |  0:00:17s
epoch 12 | loss: 0.37665 | val_0_auc: 0.90039 |  0:00:18s
epoch 13 | loss: 0.37093 | val_0_auc: 0.90281 |  0:00:20s
epoch 14 | loss: 0.37093 | val_0_auc: 0.89906 |  0:00:22s
epoch 15 | loss: 0.36761 | val_0_auc: 0.89487 |  0:00:23s
epoch 16 | loss: 0.36179 | val_0_auc: 0.90072 |  0:00:24s
epoch 17 | los



Confusion matrix:
 [[1390  372]
 [  14 1775]]
Accuracy: 0.8912982258518727
Precision: 0.8267349790405216
Recall: 0.9921743991056456
F1-score: 0.901930894308943
{'자기자본구성비율': 0.08076761783683871, '설비투자효율': 0.0, '총자본투자효율': 0.09559798687640032, '이자보상배율(이자비용)': 0.0, '유동비율': 0.0017966894455916395, '당좌비율': 0.0, '부채비율': 0.04537035509997105, '총자본정상영업이익률': 0.054090657762159086, '매출액정상영업이익률': 0.05924417139951517, '매출액순이익률': 0.0, '자기자본순이익률': 0.0, '매출채권회전률': 0.07045967046377487, '재고자산회전률': 2.0101019795955405e-06, '총자본회전률': 0.0, '순운전자본비율': 6.6358878621123e-07, '매출액증가율': 9.723075711631875e-07, '총자본증가율': 0.0, '유동자산증가율': 0.0, '유형자산증가율': 0.007823812759860254, '영업이익증가율': 3.2426252829395255e-05, '순이익증가율': 0.0, 'RETA': 0.11550801224647418, 'EBTA': 0.0, 'OM': 0.05746249660140571, '종업원수증가율': 0.05159812290649592, '영업이익변화율': 0.005739600662521513, '매출액변화율': 0.0009473631423236088, '당기순이익변화율': 0.0, 'DOL': 0.0001724892534143815, 'DFL': 0.0007317246137371732, 'EV/EBITDA': 0.013462785572442557, '영업활동으로 인한 현금흐름': 1.9

In [51]:
# Resampling data using BorderlineSMOTE

X = df_2.drop(['부실', '회사명', '회계년도', '거래소코드', '시가총액'], axis=1)
y = df_2['부실']

bsmote = BorderlineSMOTE(sampling_strategy='minority' ,k_neighbors=5, m_neighbors=10)

X_bresampled, y_bresampled = bsmote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_bresampled, y_bresampled, test_size=0.2, random_state=24)

feature_names = ['자기자본구성비율', '설비투자효율', '총자본투자효율', '이자보상배율(이자비용)',
       '유동비율', '당좌비율', '부채비율', '총자본정상영업이익률', '매출액정상영업이익률', '매출액순이익률',
       '자기자본순이익률', '매출채권회전률', '재고자산회전률', '총자본회전률', '순운전자본비율', '매출액증가율',
       '총자본증가율', '유동자산증가율', '유형자산증가율', '영업이익증가율', '순이익증가율', 'RETA', 'EBTA',
       'OM', '종업원수증가율', '영업이익변화율', '매출액변화율', '당기순이익변화율', 'DOL', 'DFL',
       'EV/EBITDA', '영업활동으로 인한 현금흐름', '금융비용부담률', '고정비율', 'R&D비율', '채무부담비율',
       '거래량회전율', '로그시가총액', '수정거래량', '거래량증가율', '시가총액증가율', '시가총액']


X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

clf = TabNetClassifier()

# Train TabNetClassifier on training data

clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

preds = clf.predict(X_test)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
auc = roc_auc_score(y_test, preds)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)
print("AUC score:", auc)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.6949  | val_0_auc: 0.81694 |  0:00:01s
epoch 1  | loss: 0.46227 | val_0_auc: 0.82654 |  0:00:02s
epoch 2  | loss: 0.38723 | val_0_auc: 0.83912 |  0:00:04s
epoch 3  | loss: 0.37251 | val_0_auc: 0.87548 |  0:00:05s
epoch 4  | loss: 0.36082 | val_0_auc: 0.87969 |  0:00:07s
epoch 5  | loss: 0.34632 | val_0_auc: 0.88594 |  0:00:08s
epoch 6  | loss: 0.34507 | val_0_auc: 0.89087 |  0:00:09s
epoch 7  | loss: 0.33687 | val_0_auc: 0.88912 |  0:00:11s
epoch 8  | loss: 0.33691 | val_0_auc: 0.8868  |  0:00:12s
epoch 9  | loss: 0.32744 | val_0_auc: 0.88907 |  0:00:13s
epoch 10 | loss: 0.3244  | val_0_auc: 0.89867 |  0:00:15s
epoch 11 | loss: 0.32995 | val_0_auc: 0.90151 |  0:00:16s
epoch 12 | loss: 0.32308 | val_0_auc: 0.90389 |  0:00:17s
epoch 13 | loss: 0.31833 | val_0_auc: 0.90773 |  0:00:19s
epoch 14 | loss: 0.32492 | val_0_auc: 0.91064 |  0:00:20s
epoch 15 | loss: 0.31648 | val_0_auc: 0.91915 |  0:00:22s
epoch 16 | loss: 0.31616 | val_0_auc: 0.92349 |  0:00:23s
epoch 17 | los



Confusion matrix:
 [[1540  222]
 [  61 1728]]
Accuracy: 0.9203041396789636
Precision: 0.8861538461538462
Recall: 0.965902738960313
F1-score: 0.9243113131853438
AUC score: 0.9199547747014959
{'자기자본구성비율': 0.0, '설비투자효율': 0.004830561344302666, '총자본투자효율': 0.0504100151965996, '이자보상배율(이자비용)': 0.0, '유동비율': 0.0, '당좌비율': 0.08930537856958294, '부채비율': 0.10727197053774694, '총자본정상영업이익률': 0.0623404862028463, '매출액정상영업이익률': 0.04750126568640788, '매출액순이익률': 2.6209508094196232e-06, '자기자본순이익률': 0.008046889612153472, '매출채권회전률': 0.017738058479264646, '재고자산회전률': 0.026386774141551536, '총자본회전률': 0.01663061273239922, '순운전자본비율': 0.0, '매출액증가율': 0.0, '총자본증가율': 0.008556554087974923, '유동자산증가율': 0.0, '유형자산증가율': 0.0, '영업이익증가율': 0.114868384238731, '순이익증가율': 0.00017572785629322123, 'RETA': 0.0, 'EBTA': 0.09447071054009759, 'OM': 0.03890304386991518, '종업원수증가율': 4.392630919508245e-05, '영업이익변화율': 0.0009435443883283981, '매출액변화율': 0.0006397757433577077, '당기순이익변화율': 0.07208572404037247, 'DOL': 1.4767206828966522e-05, 'DFL': 0.

In [41]:
# Resampling data using BorderlineSMOTE

train = df_2[df_2['회계년도'] <= 2017]
test = df_2[df_2['회계년도'] > 2017]

X_train = train.drop(['부실','회사명', '회계년도', '거래소코드', '시가총액'], axis=1).values
X_test = test.drop(['부실', '회사명','회계년도', '거래소코드', '시가총액'], axis=1).values

y_train = train['부실'].values
y_test = test['부실'].values

feature_names = ['자기자본구성비율', '설비투자효율', '총자본투자효율', '이자보상배율(이자비용)',
       '유동비율', '당좌비율', '부채비율', '총자본정상영업이익률', '매출액정상영업이익률', '매출액순이익률',
       '자기자본순이익률', '매출채권회전률', '재고자산회전률', '총자본회전률', '순운전자본비율', '매출액증가율',
       '총자본증가율', '유동자산증가율', '유형자산증가율', '영업이익증가율', '순이익증가율', 'RETA', 'EBTA',
       'OM', '종업원수증가율', '영업이익변화율', '매출액변화율', '당기순이익변화율', 'DOL', 'DFL',
       'EV/EBITDA', '영업활동으로 인한 현금흐름', '금융비용부담률', '고정비율', 'R&D비율', '채무부담비율',
       '거래량회전율', '로그시가총액', '수정거래량', '거래량증가율', '시가총액증가율', '시가총액']

#feature_names = train.drop(['부실', '회사명', '회계년도', '거래소코드'], axis=1).columns.tolist()

# Resample using SMOTE
bsmote = BorderlineSMOTE(sampling_strategy='minority' ,k_neighbors=9)

X_resampled, y_resampled = bsmote.fit_resample(X_train, y_train)
#X_resampled_test, y_resampled_test = smote.fit_resample(X_test, y_test)

X_train = X_resampled
X_test = X_test
y_train = y_resampled
y_test = y_test

clf = TabNetClassifier()

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=300)

# Train TabNetClassifier on training data

clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

preds = clf.predict(X_test)
preds_train = clf.predict(X_train)

cm = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

cm_t = confusion_matrix(y_train, preds_train)
acc_t = accuracy_score(y_train, preds_train)
prec_t = precision_score(y_train, preds_train)
rec_t = recall_score(y_train, preds_train)
f1_t = f1_score(y_train, preds_train)

print("confusion_matrix_train:\n", cm_t)
print("Accuracy_train:", acc_t)
print("Precision_train:", prec_t)
print("Recall_train:", rec_t)
print("F1-score_train:", f1_t)

print("Confusion matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

# Feature importance
feat_importances = clf.feature_importances_
feature_importances_dict = dict(zip(feature_names, feat_importances))

# Print the feature importance scores
print(feature_importances_dict)



epoch 0  | loss: 0.74488 | val_0_auc: 0.70749 |  0:00:01s
epoch 1  | loss: 0.51745 | val_0_auc: 0.74147 |  0:00:02s
epoch 2  | loss: 0.42985 | val_0_auc: 0.78973 |  0:00:03s
epoch 3  | loss: 0.39826 | val_0_auc: 0.84963 |  0:00:04s
epoch 4  | loss: 0.38247 | val_0_auc: 0.86947 |  0:00:05s
epoch 5  | loss: 0.3703  | val_0_auc: 0.86741 |  0:00:06s
epoch 6  | loss: 0.36505 | val_0_auc: 0.8693  |  0:00:07s
epoch 7  | loss: 0.35404 | val_0_auc: 0.89969 |  0:00:08s
epoch 8  | loss: 0.35663 | val_0_auc: 0.91487 |  0:00:09s
epoch 9  | loss: 0.34654 | val_0_auc: 0.92078 |  0:00:10s
epoch 10 | loss: 0.33928 | val_0_auc: 0.9221  |  0:00:11s
epoch 11 | loss: 0.3427  | val_0_auc: 0.92365 |  0:00:12s
epoch 12 | loss: 0.33319 | val_0_auc: 0.92809 |  0:00:13s
epoch 13 | loss: 0.32413 | val_0_auc: 0.93319 |  0:00:14s
epoch 14 | loss: 0.32112 | val_0_auc: 0.9333  |  0:00:16s
epoch 15 | loss: 0.32863 | val_0_auc: 0.93074 |  0:00:17s
epoch 16 | loss: 0.32297 | val_0_auc: 0.92985 |  0:00:18s
epoch 17 | los



confusion_matrix_train:
 [[3769 1619]
 [ 255 5048]]
Accuracy_train: 0.8247123748947713
Precision_train: 0.7571621418929053
Recall_train: 0.9519140109372054
F1-score_train: 0.8434419381787802
Confusion matrix:
 [[1539  656]
 [  12   69]]
Accuracy: 0.7065026362038664
Precision: 0.09517241379310344
Recall: 0.8518518518518519
F1-score: 0.17121588089330023
{'자기자본구성비율': 0.02082202880958761, '설비투자효율': 2.7299612007015244e-05, '총자본투자효율': 0.056635161038811374, '이자보상배율(이자비용)': 0.0013169015395198612, '유동비율': 0.04290402500919893, '당좌비율': 0.0029335040586050258, '부채비율': 0.04664832903939047, '총자본정상영업이익률': 0.10774961450722506, '매출액정상영업이익률': 1.8330078658680317e-05, '매출액순이익률': 0.06163381304482785, '자기자본순이익률': 0.1918323519087091, '매출채권회전률': 8.144985102750286e-05, '재고자산회전률': 0.00298680171906, '총자본회전률': 0.040440514576869174, '순운전자본비율': 0.002992856152468653, '매출액증가율': 0.005616318030652966, '총자본증가율': 0.019151401674294657, '유동자산증가율': 4.44113721091653e-06, '유형자산증가율': 0.0017285271696580095, '영업이익증가율': 0.0, '순이익증