# **라이브러리**

In [1]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import *
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score, recall_score, precision_score

import warnings
warnings.filterwarnings(action='ignore')

import matplotlib.pyplot as plt

plt.rcParams['font.family'] = 'malgun gothic'

# **함수모음**

## **모델링**

In [2]:
def modeling(model, X, y, test):    
    model.fit(X, y)
    pred = model.predict_proba(test)
    pred = (pred[:,0] < 0.4)*1         # threshold 부도기업일 확률이 10% 이상이면 부도로 판단해라.
    return pred

## **스코어**

In [3]:
def scoring(y_true, y_pred):
    print('f1-Score:', f1_score(y_true, y_pred, average='macro'))
    print('Recall:',recall_score(y_true, y_pred, average='macro'))
    print('Precision:',precision_score(y_true, y_pred, average='macro'))   

In [4]:
dataset = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Data\최종데이터셋\Finaldataset_ver7.5.csv',index_col=0)

In [5]:
dataset['부도'].value_counts()

0.0    26430
1.0      402
Name: 부도, dtype: int64

In [6]:
dataset.columns

Index(['회사명', '거래소코드', '회계년도', '유동비율', '당좌비율', '유동부채비율', '비유동비율', '순운전자본비율',
       '부채비율', '차입금의존도', '비유동장기적합률', '매출채권_대_매입채무비율', '자기자본순이익률', '총자본순이익률',
       '자기자본영업이익률', '총자본영업이익률', '경영자본순이익률', '경영자본영업이익률', '매출액순이익률', '금융비용부담률',
       '매출액영업이익률', '당좌자산회전률', '재고자산회전률', '자기자본회전률', '경영자본회전률', '비유동자산회전률',
       '매출채권회전률', '매입채무회전률', '유형자산회전율', '설비투자효율', '총자본투자효율', '자기자본증가율',
       '매출액증가율', '영업이익증가율', '순이익증가율', '유형자산증가율', '유동자산증가율', 'EBITDA_매출액',
       '부채구성비율', '청산가치율', '이익잉여금비율', '자본잉여금비율', '경영자산비율', '이자보상비율', '매출액판관비율',
       '매출원가율', '부가가치율', '재고자산보유기간', '매출채권회수기간', '매입채무지급기간', 'EBITDA이자보상비율',
       'EBITDA유동부채비율', 'EBITDA단기차입비율', 'EBITDA총자산비율', 'EBITDA총부채비율',
       '영업이익대비영업현금흐름비율', '영업이익대비EBITDA비율', '자기자본배당률', '금융비용_대_부채비율', '자기자본배율',
       '누적수익성비율', '총자산영업이익률', '총자산회전율', '정기공시제목', '수시공시제목', '대표이사_변경',
       '최대주주_변경', '회계처리위반', '횡령배임', '신종채권', '영업조업중단', '출자목적_투자', '출자목적_경영권',
       '출자목적_영업이익', '기타', '외국인_주식분포비율', '종가', '종가변동률', '년', 'key', '부도'],
      dtype='object'

In [55]:
drop_feature = ['회사명', '거래소코드', '회계년도', '소속코드', '상장폐지일', '감사의견코드', '년', 'key','폐지사유요약합본','공시발생일', '정기공시제목', '수시공시제목', '부도',
'기타', '외국인_주식분포비율', '종가','유동비율', '당좌비율', '순운전자본비율',
       '부채비율', '차입금의존도', '비유동장기적합률', '매출채권_대_매입채무비율', '총자본순이익률',
       '자기자본영업이익률', '총자본영업이익률', '경영자본영업이익률', '매출액순이익률', '금융비용부담률',
       '매출액영업이익률', '당좌자산회전률', '재고자산회전률', '자기자본회전률', '비유동자산회전률',
       '매출채권회전률', '매입채무회전률', '유형자산회전율',  '자기자본증가율',
       '매출액증가율', '순이익증가율', '유형자산증가율', '유동자산증가율', 'EBITDA_매출액',
       '부채구성비율', '청산가치율', '이익잉여금비율', '자본잉여금비율', '경영자산비율', '이자보상비율', '매출액판관비율',
       '매출원가율', '재고자산보유기간', '매입채무지급기간', 'EBITDA이자보상비율',
       'EBITDA유동부채비율', 'EBITDA총자산비율', 'EBITDA총부채비율',
       '영업이익대비영업현금흐름비율', '영업이익대비EBITDA비율', '자기자본배당률', '금융비용_대_부채비율', '자기자본배율',
       '누적수익성비율', '총자산영업이익률', '총자산회전율']
text_col = ['대표이사_변경', '최대주주_변경', '회계처리위반', '횡령배임','신종채권','출자목적_투자','영업조업중단', '출자목적_경영권', '출자목적_영업이익','종가변동률']

In [56]:
ajX = dataset.drop(drop_feature, axis=1)
X = ajX.drop(text_col, axis = 1)
ajy = dataset['부도']
y = dataset['부도']

# **재무데이터 + 비재무데이터**

In [80]:
ajX_train, ajX_test, ajy_train, ajy_test = train_test_split(ajX, ajy, test_size=0.2, stratify=ajy, random_state = 44)
ajX_train , ajy_train = RandomUnderSampler(random_state=0).fit_resample(ajX_train , ajy_train)

In [81]:
ajdt_clf = modeling(DecisionTreeClassifier(), ajX_train, ajy_train, ajX_test)
ajrf_clf = modeling(RandomForestClassifier(),ajX_train, ajy_train, ajX_test)
ajada_clf = modeling(AdaBoostClassifier(), ajX_train, ajy_train, ajX_test)
ajlg_clf = modeling(LogisticRegression(), ajX_train, ajy_train, ajX_test)
ajknn_clf = modeling(KNeighborsClassifier(), ajX_train, ajy_train, ajX_test)
ajsvc_clf = modeling(SVC(probability=True), ajX_train, ajy_train, ajX_test)
ajlgmb_clf = modeling(LGBMClassifier(), ajX_train, ajy_train, ajX_test)
ajcat_clf = modeling(CatBoostClassifier(silent=True),ajX_train, ajy_train, ajX_test)

In [82]:
scoring(ajy_test , ajdt_clf)

f1-Score: 0.5113805276848756
Recall: 0.8371145096056622
Precision: 0.5331335403726708


In [83]:
confusion_matrix(ajy_test, ajdt_clf)  # 59 62

array([[  69,   11],
       [ 931, 4014]], dtype=int64)

# **재무데이터only**

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state = 46)
X_train, y_train = RandomUnderSampler(random_state=0).fit_resample(X_train, y_train)

In [85]:
dt_clf = modeling(DecisionTreeClassifier(), X_train, y_train, X_test)
rf_clf = modeling(RandomForestClassifier(), X_train, y_train, X_test)
ada_clf = modeling(AdaBoostClassifier(), X_train, y_train, X_test)
lg_clf = modeling(LogisticRegression(), X_train, y_train, X_test)
knn_clf = modeling(KNeighborsClassifier(), X_train, y_train, X_test)
svc_clf = modeling(SVC(probability=True), X_train, y_train, X_test)
lgmb_clf = modeling(LGBMClassifier(), X_train, y_train, X_test)
cat_clf = modeling(CatBoostClassifier(silent=True),X_train, y_train, X_test)

In [86]:
scoring(y_test , dt_clf)

f1-Score: 0.4950981095784231
Recall: 0.8063321536905965
Precision: 0.5277791531535992


In [87]:
confusion_matrix(y_test, dt_clf) # 42 79

array([[  66,   14],
       [1050, 3895]], dtype=int64)