# **라이브러리**

In [1]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, confusion_matrix, roc_auc_score
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

import matplotlib.pyplot as plt

# Use the 'malgun gothic' font family for all matplotlib text
# (presumably so Korean labels render correctly -- confirm on non-Windows hosts).
plt.rcParams['font.family'] = 'malgun gothic'

# **함수모음**

## **모델링**

In [2]:
def modeling(model, X, y, test, threshold=0.5):
    """Fit ``model`` on ``(X, y)`` and return hard 0/1 predictions for ``test``.

    A row is labelled 1 when the probability in the FIRST ``predict_proba``
    column is strictly below ``threshold``; otherwise it is labelled 0.
    For a scikit-learn style binary classifier the first column belongs to
    ``model.classes_[0]``, so lowering P(class 0) below the cut-off means
    predicting the other class.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(test)``.
        X: Training features.
        y: Training labels.
        test: Features to predict on.
        threshold: Cut-off applied to the first probability column.
            Defaults to 0.5, the value this function always used.
            NOTE: the cut-off is on the class-0 column, so for a binary
            problem "flag a row when P(class 1) exceeds 10%" corresponds to
            ``threshold=0.9``, not ``0.1``.

    Returns:
        Integer array of 0/1 labels, one per row of ``test``.
    """
    model.fit(X, y)
    proba = model.predict_proba(test)
    # Multiplying the boolean mask by 1 converts it to integers.
    return (proba[:, 0] < threshold) * 1

def modeling_(model, X, y, test, threshold=None):
    """Fit ``model`` and return 0/1 predictions using a tuned cut-off.

    A row is labelled 1 when the probability in the first ``predict_proba``
    column is strictly below the cut-off; otherwise it is labelled 0.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(test)``.
        X: Training features.
        y: Training labels.
        test: Features to predict on.
        threshold: Cut-off applied to the first probability column. When
            omitted, the module-level ``optimal_threshold`` is used, which
            is how this function always behaved.

    Returns:
        Integer array of 0/1 labels, one per row of ``test``.

    Raises:
        NameError: If ``threshold`` is omitted and ``optimal_threshold`` has
            not been assigned at module level.
    """
    if threshold is None:
        # Resolve the global before fitting so a missing value fails fast
        # instead of after a potentially long training run.
        threshold = optimal_threshold
    model.fit(X, y)
    proba = model.predict_proba(test)
    # Multiplying the boolean mask by 1 converts it to integers.
    return (proba[:, 0] < threshold) * 1

######################################## Split train, test
def splittrain(data):
    """Return the rows of ``data`` whose '년' value is below 2015.

    The frame is first sorted by '년' and its index renumbered from 0, so
    the returned rows are in ascending '년' order and keep those new index
    labels. The input frame itself is not modified.
    """
    ordered = data.sort_values(by='년')
    ordered = ordered.reset_index(drop=True)
    before_cutoff = ordered['년'] < 2015
    return ordered.loc[before_cutoff]

def splittesst(data):
    """Return the rows of ``data`` whose '년' value is 2015 or later.

    The frame is sorted by '년' and its index renumbered from 0 before the
    filter is applied, so the returned rows keep those post-sort labels.

    Side effect: the result is also stored in the module-level name
    ``test``, which is overwritten on every call.
    """
    global test
    ordered = data.sort_values(by='년')
    ordered = ordered.reset_index(drop=True)
    from_cutoff = ordered['년'] >= 2015
    test = ordered.loc[from_cutoff]
    return test

## **스코어**

In [3]:
def scoring(y_true, y_pred):
    """Print a short classification report for ``y_pred`` against ``y_true``.

    Prints accuracy, then F1 / recall / precision (each computed with
    ``average='weighted'``), then the confusion matrix. Nothing is returned.
    """
    print('accuracy:', accuracy_score(y_true,y_pred))
    weighted_metrics = (
        ('f1-Score:', f1_score),
        ('Recall:', recall_score),
        ('Precision:', precision_score),
    )
    for label, metric in weighted_metrics:
        print(label, metric(y_true, y_pred, average='weighted'))
    print(confusion_matrix(y_true, y_pred))

## **귀찮다 귀찮아**

# **데이터 나누기**

## **재무데이터**

In [4]:
# Set 1 (financial data): five CSV snapshots. The file names suggest data taken
# 0-4 years before a reference year -- TODO confirm against the data pipeline.
# NOTE(review): the paths are absolute and machine-specific.
dataset = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Code\JH\data\ver2\0years_ago.csv')
dataset1 = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Code\JH\data\ver2\1years_ago.csv')
dataset2 = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Code\JH\data\ver2\2years_ago.csv')
dataset3 = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Code\JH\data\ver2\3years_ago.csv')
dataset4 = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Code\JH\data\ver2\4years_ago.csv')

# Columns dropped from the Set 1 frames when the feature matrices are built.
# '부도' is in this list because it is extracted separately as the target.
drop_features = ['회사명', '거래소코드', '회계년도','년', 'key', '부도','최대주주_변경', '회계처리위반', '횡령배임', '신종채권', '영업조업중단',
'출자목적_투자','출자목적_경영권', '출자목적_영업이익','외국인_주식분포비율', '종가','종가변동률','신종채권_운영', '신종채권_시설', '신종채권_기타']

# Bare expression: displays the column index when run as a notebook cell.
dataset.columns

Index(['회사명', '거래소코드', '회계년도', '순운전자본비율', '총자본순이익률', '경영자본순이익률', '경영자본회전률',
       '매입채무회전률', '설비투자효율', '총자본투자효율', '자기자본증가율', '부채구성비율', '재고자산보유기간',
       '매출채권회수기간', '최대주주_변경', '회계처리위반', '횡령배임', '신종채권', '영업조업중단', '출자목적_투자',
       '출자목적_경영권', '출자목적_영업이익', '외국인_주식분포비율', '종가', '종가변동률', '년', 'key', '부도',
       '신종채권_운영', '신종채권_시설', '신종채권_기타'],
      dtype='object')

## **재무데이터 + 비재무데이터**

In [5]:
# Set 2 (financial + non-financial data): the '*step3.csv' counterparts of the
# Set 1 files, one per lag (0-4 as suggested by the file names -- TODO confirm).
# NOTE(review): the paths are absolute and machine-specific.
ajdataset = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Code\JH\data\ver2\0years_agostep3.csv')
ajdataset1 = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Code\JH\data\ver2\1years_agostep3.csv')
ajdataset2 = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Code\JH\data\ver2\2years_agostep3.csv')
ajdataset3 = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Code\JH\data\ver2\3years_agostep3.csv')
ajdataset4 = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Code\JH\data\ver2\4years_agostep3.csv')

# Columns dropped from the Set 2 frames when the feature matrices are built.
# Shorter than `drop_features`: every remaining column is kept as a feature.
# '부도' is listed because it is extracted separately as the target.
drop_feature = ['회사명', '거래소코드', '회계년도','년', 'key', '부도']

# Bare expression: displays the column index when run as a notebook cell.
ajdataset.columns

Index(['회사명', '거래소코드', '회계년도', '설비투자효율', '총자본투자효율', '부채구성비율', '비유동장기적합률',
       '재고자산보유기간', '매출채권회수기간', '매입채무회전률', '경영자본회전률', '경영자본순이익률', '자기자본순이익률',
       '자기자본증가율', '년', 'key', '부도', '대표이사_변경', '최대주주_변경', '회계처리위반', '횡령배임',
       '영업조업중단', '종가변동률', '출자목적_투자', '출자목적_경영권', '출자목적_영업이익', '신종채권_운영',
       '신종채권_시설', '신종채권_기타'],
      dtype='object')

In [6]:
# Split every frame into a training part (splittrain) and a hold-out part
# (splittesst). The train call always runs before the hold-out call for the
# same frame; each splittesst call also overwrites the module-level `test`.

# Set 1: financial data.
dataset_, test_ = splittrain(dataset), splittesst(dataset)
dataset1_, test1_ = splittrain(dataset1), splittesst(dataset1)
dataset2_, test2_ = splittrain(dataset2), splittesst(dataset2)
dataset3_, test3_ = splittrain(dataset3), splittesst(dataset3)
dataset4_, test4_ = splittrain(dataset4), splittesst(dataset4)

# Set 2: financial + non-financial data.
ajdataset_, ajtest_ = splittrain(ajdataset), splittesst(ajdataset)
ajdataset1_, ajtest1_ = splittrain(ajdataset1), splittesst(ajdataset1)
ajdataset2_, ajtest2_ = splittrain(ajdataset2), splittesst(ajdataset2)
ajdataset3_, ajtest3_ = splittrain(ajdataset3), splittesst(ajdataset3)
ajdataset4_, ajtest4_ = splittrain(ajdataset4), splittesst(ajdataset4)


## **세트 1. 재무데이터 train,test 분리**

In [7]:
# Set 1 training features: every column except those in `drop_features`.
X, X1, X2, X3, X4 = (
    frame.drop(drop_features, axis=1)
    for frame in (dataset_, dataset1_, dataset2_, dataset3_, dataset4_)
)

# Set 1 training targets: the '부도' column.
y, y1, y2, y3, y4 = (
    frame['부도']
    for frame in (dataset_, dataset1_, dataset2_, dataset3_, dataset4_)
)

# Set 1 hold-out features, built the same way from the hold-out frames.
X_test, X1_test, X2_test, X3_test, X4_test = (
    frame.drop(drop_features, axis=1)
    for frame in (test_, test1_, test2_, test3_, test4_)
)

# Set 1 hold-out targets.
y_test, y1_test, y2_test, y3_test, y4_test = (
    frame['부도']
    for frame in (test_, test1_, test2_, test3_, test4_)
)

# Sanity check: feature rows and target rows must line up.
print(X3_test.shape, y3_test.shape)


(8381, 11) (8381,)


## **세트2 재무 + 비재무데이터 train, test 분리**

In [8]:
# Set 2 training features: every column except those in `drop_feature`.
ajX, ajX1, ajX2, ajX3, ajX4 = (
    frame.drop(drop_feature, axis=1)
    for frame in (ajdataset_, ajdataset1_, ajdataset2_, ajdataset3_, ajdataset4_)
)

# Set 2 training targets: the '부도' column.
ajy, ajy1, ajy2, ajy3, ajy4 = (
    frame['부도']
    for frame in (ajdataset_, ajdataset1_, ajdataset2_, ajdataset3_, ajdataset4_)
)

# Set 2 hold-out features, built the same way from the hold-out frames.
ajX_test, ajX1_test, ajX2_test, ajX3_test, ajX4_test = (
    frame.drop(drop_feature, axis=1)
    for frame in (ajtest_, ajtest1_, ajtest2_, ajtest3_, ajtest4_)
)

# Set 2 hold-out targets.
ajy_test, ajy1_test, ajy2_test, ajy3_test, ajy4_test = (
    frame['부도']
    for frame in (ajtest_, ajtest1_, ajtest2_, ajtest3_, ajtest4_)
)

# Sanity check: feature rows and target rows must line up.
print(ajX3_test.shape, ajy3_test.shape)

(8381, 23) (8381,)


# **YEAR**

# **데이터 검증 및 언더샘플링**

In [9]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Display names of the candidate models. The order matches the `models` list
# built inside run_skfold and the prediction lists assembled later, so entries
# line up by position in the summary table.
mdn = ['Decision Tree','Random Forest','Adaboost','logistic', 'KNN','SVC','lightGBM','Catboost','MLP','Xgboost']

def run_skfold(feature, target, n_splits=4, random_state=1):
    """Cross-validate every candidate classifier on a randomly under-sampled set.

    The data is first balanced with ``RandomUnderSampler(random_state=1)``,
    then each model is scored with stratified K-fold cross-validation using
    ``modeling`` to fit and predict.

    Args:
        feature: Feature matrix (array-like, one row per sample).
        target: Class labels aligned with ``feature``.
        n_splits: Number of stratified folds. Defaults to 4, the value that
            was previously hard-coded.
        random_state: Seed for the fold shuffling. With a fixed seed every
            model is evaluated on the same folds and reruns are reproducible;
            previously the folds were reshuffled unseeded for each model.

    Returns:
        List of mean validation accuracies, one per model, in this order:
        decision tree, random forest, AdaBoost, logistic regression, KNN,
        SVC, LightGBM, CatBoost, MLP, XGBoost.
    """
    X, y = RandomUnderSampler(random_state=1).fit_resample(feature, target) # RandomUnderSampling
    # The fold indices are positional. Plain arrays guarantee positional
    # selection whatever container fit_resample returned; a pandas Series
    # would look integer keys up as index labels, and a DataFrame as columns.
    X, y = np.asarray(X), np.asarray(y)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    models = [DecisionTreeClassifier(),RandomForestClassifier(),AdaBoostClassifier(),LogisticRegression(),KNeighborsClassifier(),
    SVC(probability=True),LGBMClassifier(),CatBoostClassifier(silent=True),MLPClassifier(),XGBClassifier()]

    arr = [] # mean validation accuracy per model
    for model in models:
        fold_scores = [
            accuracy_score(
                y[val_index],
                modeling(model, X[train_index], y[train_index], X[val_index]),
            )
            for train_index, val_index in skf.split(X, y)
        ]
        arr.append(sum(fold_scores) / n_splits)
    return arr # each model's accuracy

In [10]:
# NOTE: this cell is a template. To evaluate another lag, the right-hand sides
# below presumably have to be edited to the numbered variants (X1/y1 ... X4/y4,
# ajX1/ajy1 ...) -- that is how the original note reads; confirm with the author.

# run_skfold indexes its inputs positionally, so the feature frames are
# converted to plain arrays first.
X = np.array(X)
ajX = np.array(ajX)

# Self-assignments: no effect as written; placeholders for the swap above.
y = y
ajy = ajy

ajX_test = ajX_test
X_test = X_test

y_test = y_test
ajy_test = ajy_test

# Mean cross-validated accuracy per model: Set 1, then Set 2.
vanval = run_skfold(X, y)
ajval = run_skfold(ajX,ajy)



## **테스트 셋 검증**

In [11]:
# Balance the Set 1 training data; X and y are overwritten by the resampled
# versions. The hold-out set X_test is left as is.
X, y = RandomUnderSampler(random_state=1).fit_resample(X, y)
########################################################################################## Dataset 1
# Each *_clf name holds the 0/1 prediction array returned by modeling(),
# not a fitted estimator.
dt_clf = modeling(DecisionTreeClassifier(), X, y, X_test)
rf_clf = modeling(RandomForestClassifier(),X, y, X_test)
ada_clf = modeling(AdaBoostClassifier(), X, y, X_test)
lg_clf = modeling(LogisticRegression(), X, y, X_test)
knn_clf = modeling(KNeighborsClassifier(), X, y, X_test)
svc_clf = modeling(SVC(probability=True), X, y, X_test)
lgmb_clf = modeling(LGBMClassifier(), X, y, X_test)
cat_clf = modeling(CatBoostClassifier(silent=True),X, y, X_test)
mlp_clf = modeling(MLPClassifier(),X, y, X_test)
xgb_clf = modeling(XGBClassifier(),X, y, X_test)

# Same procedure for Set 2: balance the training data, then fit and predict.
ajX, ajy = RandomUnderSampler(random_state=1).fit_resample(ajX, ajy)
########################################################################################## Dataset 2
ajdt_clf = modeling(DecisionTreeClassifier(), ajX, ajy, ajX_test)
ajrf_clf = modeling(RandomForestClassifier(),ajX, ajy, ajX_test)
ajada_clf = modeling(AdaBoostClassifier(), ajX, ajy, ajX_test)
ajlg_clf = modeling(LogisticRegression(), ajX, ajy, ajX_test)
ajknn_clf = modeling(KNeighborsClassifier(), ajX, ajy, ajX_test)
ajsvc_clf = modeling(SVC(probability=True), ajX, ajy, ajX_test)
ajlgmb_clf = modeling(LGBMClassifier(), ajX, ajy, ajX_test)
ajcat_clf = modeling(CatBoostClassifier(silent=True),ajX, ajy, ajX_test)
ajmlp_clf = modeling(MLPClassifier(),ajX, ajy, ajX_test)
ajxgb_clf = modeling(XGBClassifier(),ajX, ajy, ajX_test)



In [227]:
# Disabled code: the whole cell is a bare string literal, so nothing inside it
# runs; the notebook merely echoes the string.
# NOTE(review): if re-enabled, the second loop scores `ajarr` against `y_test`
# rather than `ajy_test` -- confirm which is intended.
""" for i in arr:
    print(scoring(y_test , i))
    print('-'*60)

for i in ajarr:
    print(scoring(y_test , i))
    print('-'*60) """

" for i in arr:\n    print(scoring(y_test , i))\n    print('-'*60)\n\nfor i in ajarr:\n    print(scoring(y_test , i))\n    print('-'*60) "

In [12]:
# Set 1 test-set prediction arrays, in the same order as the names in `mdn`.
arr = [
    dt_clf, rf_clf, ada_clf, lg_clf, knn_clf,
    svc_clf, lgmb_clf, cat_clf, mlp_clf, xgb_clf,
]
# Set 2 test-set prediction arrays, same order.
ajarr = [
    ajdt_clf, ajrf_clf, ajada_clf, ajlg_clf, ajknn_clf,
    ajsvc_clf, ajlgmb_clf, ajcat_clf, ajmlp_clf, ajxgb_clf,
]

In [40]:
def test_acc(y_true, arr_):
    """Return the accuracy of each prediction vector, rounded to 4 decimals.

    ``arr_`` is an iterable of prediction vectors; each one is compared with
    ``y_true`` and the scores are returned as a list in the same order.
    """
    return [round(accuracy_score(y_true, pred), 4) for pred in arr_]

# Hold-out accuracy per model: Set 1 (ogntest) and Set 2 (ajdtest).
ogntest = test_acc(y_test, arr)
ajdtest = test_acc(ajy_test, ajarr)

def get_f1(y_true, arr_):
    """Return the F1 score of each prediction vector, rounded to 4 decimals.

    ``f1_score`` is called without an ``average`` argument, so its default
    averaging applies (not ``average='weighted'``). Scores come back as a
    list in the same order as ``arr_``.
    """
    return [round(f1_score(y_true, pred), 4) for pred in arr_]
     
# Hold-out F1 per model: Set 1 (ognf1) and Set 2 (ajdf1).
ognf1 = get_f1(y_test, arr)
ajdf1 = get_f1(ajy_test, ajarr)

def get_rc(y_true, arr_):
    """Return the recall of each prediction vector, rounded to 4 decimals.

    ``recall_score`` is called with its default settings. Scores come back
    as a list in the same order as ``arr_``.
    """
    return [round(recall_score(y_true, pred), 4) for pred in arr_]

# Hold-out recall per model: Set 1 (ognrc) and Set 2 (ajdrc).
ognrc = get_rc(y_test, arr)
ajdrc = get_rc(ajy_test, ajarr)

def get_pc(y_true, arr_):
    """Return the precision of each prediction vector, rounded to 4 decimals.

    ``precision_score`` is called with its default settings. Scores come
    back as a list in the same order as ``arr_``.
    """
    return [round(precision_score(y_true, pred), 4) for pred in arr_]

# Hold-out precision per model: Set 1 (ognpc) and Set 2 (ajdpc).
ognpc = get_pc(y_test, arr)
ajdpc = get_pc(ajy_test, ajarr)


In [41]:
# One row per model: cross-validation accuracy plus hold-out metrics for
# Set 1, followed by the same columns for Set 2.
subsetone = pd.DataFrame({
    'model': mdn,
    'SET1 val Accuracy': vanval,
    'SET1 test Accuracy' : ogntest,
    'SET1 test f1-Score': ognf1,
    'SET1 test Precision': ognpc,
    'SET1 test Recall': ognrc,
    'SET2 val Accuracy' : ajval,
    'SET2 test Accuracy' : ajdtest,
    'SET2 test f1_Score': ajdf1,
    'SET2 test Precision': ajdpc,
    'SET2 test Recall': ajdrc,
})
# Written relative to the current working directory, without the index column.
subsetone.to_csv('0년차검증.csv', index=False)

In [42]:
# Reload the saved summary table from an absolute path and display it.
# NOTE(review): the table is written to the relative path '0년차검증.csv' with
# index=False, yet the displayed output contains an 'Unnamed: 0' column -- the
# file read here may not be the one just written; confirm the paths agree.
asd = pd.read_csv(r'C:\Users\JH\Desktop\Final\Ubion-CorporateBankruptcyAnalysis\Code\JH\code\modeling\0년차검증.csv')
asd

Unnamed: 0,model,SET1 val Accuracy,SET1 test Accuracy,SET1 test f1-Score,SET1 test Precision,SET1 test Recall,SET2 val Accuracy,SET2 test Accuracy,SET2 test f1_Score,SET2 test Precision,SET2 test Recall
0,Decision Tree,0.756404,0.7072,0.0326,0.0167,0.7925,0.782241,0.7302,0.0337,0.0172,0.7547
1,Random Forest,0.835255,0.8243,0.0508,0.0263,0.7547,0.855296,0.8537,0.0633,0.0329,0.7925
2,Adaboost,0.797964,0.8162,0.0487,0.0252,0.7547,0.839507,0.8271,0.0553,0.0286,0.8113
3,logistic,0.79945,0.8762,0.064,0.0336,0.6792,0.810944,0.8307,0.0489,0.0253,0.6981
4,KNN,0.806527,0.8498,0.0534,0.0278,0.6792,0.823768,0.855,0.0552,0.0288,0.6792
5,SVC,0.551716,0.0063,0.0124,0.0062,1.0,0.5589,0.0062,0.0124,0.0062,1.0
6,lightGBM,0.82234,0.8008,0.0472,0.0243,0.7925,0.866765,0.8368,0.0583,0.0303,0.8113
7,Catboost,0.830936,0.8288,0.0533,0.0276,0.7736,0.863957,0.8495,0.0643,0.0335,0.8302
8,MLP,0.755115,0.7301,0.0345,0.0176,0.7736,0.765057,0.856,0.0584,0.0304,0.717
9,Xgboost,0.820788,0.8074,0.0488,0.0252,0.7925,0.860985,0.8353,0.054,0.028,0.7547


In [36]:
# Print the scoring() report for the Set 1 CatBoost hold-out predictions.
scoring(y_test, cat_clf)

accuracy: 0.8287864534336783
f1-Score: 0.9005688514146519
Recall: 0.8287864534336783
Precision: 0.9922407205629455
[[7007 1444]
 [  12   41]]


In [37]:
# Set 1: rank the models by hold-out F1 (best first), with accuracy alongside.
asd[['model','SET1 test Accuracy','SET1 test f1-Score']].sort_values(by='SET1 test f1-Score', ascending=False)

Unnamed: 0,model,SET1 test Accuracy,SET1 test f1-Score
3,logistic,0.8762,0.4989
4,KNN,0.8498,0.4859
7,Catboost,0.8288,0.4796
1,Random Forest,0.8243,0.477
2,Adaboost,0.8162,0.4735
9,Xgboost,0.8074,0.4708
6,lightGBM,0.8008,0.468
8,MLP,0.7301,0.4388
0,Decision Tree,0.7072,0.4301
5,SVC,0.0063,0.0063


In [38]:
# Set 2: rank the models by hold-out F1 (best first), with accuracy alongside.
# The Set 2 column is spelled 'f1_Score' (underscore), unlike Set 1's 'f1-Score'.
asd[['model','SET2 test Accuracy','SET2 test f1_Score']].sort_values(by='SET2 test f1_Score', ascending=False)

Unnamed: 0,model,SET2 test Accuracy,SET2 test f1_Score
1,Random Forest,0.8537,0.492
7,Catboost,0.8495,0.4912
8,MLP,0.856,0.4902
4,KNN,0.855,0.4883
6,lightGBM,0.8368,0.4845
9,Xgboost,0.8353,0.4819
2,Adaboost,0.8271,0.4801
3,logistic,0.8307,0.478
0,Decision Tree,0.7302,0.4385
5,SVC,0.0062,0.0062
