# **라이브러리**

In [51]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import *
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score, recall_score, precision_score

import warnings
# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

import matplotlib.pyplot as plt

# Default font family for all matplotlib text.
# NOTE(review): presumably chosen so Korean labels render (Malgun Gothic is a
# Windows font) — confirm it is available on other platforms.
plt.rcParams['font.family'] = 'malgun gothic'

# **함수모음**

## **모델링**

In [71]:
def modeling(model, X, y, test, threshold=0.4):
    """Fit ``model`` on ``(X, y)`` and return thresholded 0/1 predictions.

    The decision uses the first column of ``model.predict_proba(test)``.
    For scikit-learn style estimators that column belongs to the first
    entry of ``classes_`` (label 0 when the labels are 0/1). A row is
    labelled 0 when that probability is at least ``threshold`` and 1
    otherwise.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(test)``.
        X: Training features.
        y: Training labels.
        test: Features to predict on.
        threshold: Minimum first-column probability required to predict 0.
            Defaults to 0.4, the cutoff that used to be hard-coded here
            (the previous inline comment mentioned 10%, which did not
            match the code).

    Returns:
        Integer NumPy array of 0/1 predictions, one per row of ``test``.
    """
    model.fit(X, y)
    proba = model.predict_proba(test)
    # Strict "<" keeps the original boundary: a probability exactly equal
    # to the threshold is still labelled 0.
    return (proba[:, 0] < threshold).astype(int)

## **스코어**

In [54]:
def scoring(y_true, y_pred):
    """Print macro-averaged F1, recall and precision for ``y_pred``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    # Printed in this fixed order: F1, recall, precision.
    report = (
        ('f1-Score:', f1_score),
        ('Recall:', recall_score),
        ('Precision:', precision_score),
    )
    for caption, metric in report:
        print(caption, metric(y_true, y_pred, average='macro'))

In [89]:
# Load the final dataset; the first CSV column becomes the row index.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# the file exists at this exact location.
DATASET_CSV = r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Data\최종데이터셋\Finaldataset_ver7.3.csv'
dataset = pd.read_csv(DATASET_CSV, index_col=0)

In [95]:
# Number of rows for each value of the '년' (year) column.
dataset['년'].value_counts()

2020.0    1422
2019.0    1417
2018.0    1416
2017.0    1389
2016.0    1351
2008.0    1333
2015.0    1313
2009.0    1309
2010.0    1279
2014.0    1278
2011.0    1255
2013.0    1254
2007.0    1247
2012.0    1247
2006.0    1196
2005.0    1168
2003.0    1139
2004.0    1130
2002.0    1040
2001.0     936
2021.0       5
Name: 년, dtype: int64

In [93]:
# Distribution of the '부도' label among rows whose '년' value is 2019.
rows_2019 = dataset['년'] == 2019
dataset.loc[rows_2019, '부도'].value_counts()

1.0    1406
0.0      11
Name: 부도, dtype: int64

In [94]:
# Distribution of the '부도' label among rows whose '년' value is 2020.
rows_2020 = dataset['년'] == 2020
dataset.loc[rows_2020, '부도'].value_counts()

1.0    1417
0.0       5
Name: 부도, dtype: int64

In [27]:
# Show the column labels of the loaded dataset.
dataset.columns

Index(['회사명', '거래소코드', '회계년도', '소속코드', '상장폐지일', '감사의견코드', '자기자본배당률',
       '금융비용 대 부채비율', '자기자본배율', '유동부채비율', '매입채무회전률', '유동자산회전률', '총자본투자효율',
       '순운전자본비율', '누적수익성비율', '총자산영업이익률', '총자산회전율', '년', 'key', '부도', '공시발생일',
       '정기공시제목', '수시공시제목', '대표이사 변경', '최대주주 변경', '회계처리위반', '횡령배임', '폐지사유요약합본',
       '신종채권', '영업조업중단', '출자목적_투자', '출자목적_경영권', '출자목적_영업이익', '기타'],
      dtype='object')

In [55]:
# Columns excluded from every feature matrix; this includes the '부도'
# column, which is used as the target.
# NOTE(review): '영업조업중단' is listed in dataset.columns but appears in
# neither list below, so it stays in both feature sets — confirm intended.
drop_feature = [
    '회사명',
    '거래소코드',
    '회계년도',
    '소속코드',
    '상장폐지일',
    '감사의견코드',
    '년',
    'key',
    '폐지사유요약합본',
    '공시발생일',
    '정기공시제목',
    '수시공시제목',
    '부도',
    '기타',
]

# Columns additionally removed when building the financial-only features.
text_col = [
    '대표이사 변경',
    '최대주주 변경',
    '회계처리위반',
    '횡령배임',
    '신종채권',
    '출자목적_투자',
    '출자목적_경영권',
    '출자목적_영업이익',
]

In [56]:
# Target: the '부도' column, bound under one name per experiment.
ajy = dataset['부도']
y = dataset['부도']

# ajX: every column not listed in drop_feature.
ajX = dataset.drop(columns=drop_feature)
# X: ajX with the text_col columns removed as well.
X = ajX.drop(columns=text_col)

# **재무데이터 + 비재무데이터**

In [80]:
# Stratified 80/20 split of the full feature set.
aj_split = train_test_split(
    ajX,
    ajy,
    test_size=0.2,
    stratify=ajy,
    random_state=44,
)
ajX_train, ajX_test, ajy_train, ajy_test = aj_split

# Undersample the training portion only; the test portion is left untouched.
aj_sampler = RandomUnderSampler(random_state=0)
ajX_train, ajy_train = aj_sampler.fit_resample(ajX_train, ajy_train)

In [81]:
def _predict_all_features(estimator):
    """Fit ``estimator`` on the resampled full-feature train set and predict the test set."""
    return modeling(estimator, ajX_train, ajy_train, ajX_test)


# NOTE: despite the ``_clf`` suffix, each name holds the 0/1 prediction array
# returned by ``modeling``, not the fitted classifier.
ajdt_clf = _predict_all_features(DecisionTreeClassifier())
ajrf_clf = _predict_all_features(RandomForestClassifier())
ajada_clf = _predict_all_features(AdaBoostClassifier())
ajlg_clf = _predict_all_features(LogisticRegression())
ajknn_clf = _predict_all_features(KNeighborsClassifier())
ajsvc_clf = _predict_all_features(SVC(probability=True))
ajlgmb_clf = _predict_all_features(LGBMClassifier())
ajcat_clf = _predict_all_features(CatBoostClassifier(silent=True))

In [82]:
# Print macro F1 / recall / precision for the decision-tree predictions
# on the held-out split of the full feature set.
scoring(ajy_test , ajdt_clf)

f1-Score: 0.5113805276848756
Recall: 0.8371145096056622
Precision: 0.5331335403726708


In [83]:
# Confusion matrix for the decision-tree predictions
# (rows: true label, columns: predicted label).
# NOTE(review): the trailing "59 62" does not match the output recorded
# below; presumably left over from an earlier run — confirm or remove.
confusion_matrix(ajy_test, ajdt_clf)  # 59 62

array([[  69,   11],
       [ 931, 4014]], dtype=int64)

# **재무데이터only**

In [84]:
# Stratified 80/20 split of the financial-only feature set.
# NOTE(review): random_state is 46 here but 44 for the full-feature split, so
# the two experiments are scored on different test rows — confirm intended.
fin_split = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=46,
)
X_train, X_test, y_train, y_test = fin_split

# Undersample the training portion only; the test portion is left untouched.
fin_sampler = RandomUnderSampler(random_state=0)
X_train, y_train = fin_sampler.fit_resample(X_train, y_train)

In [85]:
def _predict_financial_only(estimator):
    """Fit ``estimator`` on the resampled financial-only train set and predict the test set."""
    return modeling(estimator, X_train, y_train, X_test)


# NOTE: despite the ``_clf`` suffix, each name holds the 0/1 prediction array
# returned by ``modeling``, not the fitted classifier.
dt_clf = _predict_financial_only(DecisionTreeClassifier())
rf_clf = _predict_financial_only(RandomForestClassifier())
ada_clf = _predict_financial_only(AdaBoostClassifier())
lg_clf = _predict_financial_only(LogisticRegression())
knn_clf = _predict_financial_only(KNeighborsClassifier())
svc_clf = _predict_financial_only(SVC(probability=True))
lgmb_clf = _predict_financial_only(LGBMClassifier())
cat_clf = _predict_financial_only(CatBoostClassifier(silent=True))

In [86]:
# Print macro F1 / recall / precision for the decision-tree predictions
# on the held-out split of the financial-only feature set.
scoring(y_test , dt_clf)

f1-Score: 0.4950981095784231
Recall: 0.8063321536905965
Precision: 0.5277791531535992


In [87]:
# Confusion matrix for the decision-tree predictions
# (rows: true label, columns: predicted label).
# NOTE(review): the trailing "42 79" does not match the output recorded
# below; presumably left over from an earlier run — confirm or remove.
confusion_matrix(y_test, dt_clf) # 42 79

array([[  66,   14],
       [1050, 3895]], dtype=int64)