In [84]:
# 라이브러리 로드

import pandas as pd 
import numpy as np 

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids, EditedNearestNeighbours
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,confusion_matrix,roc_auc_score

import warnings
# Silence every warning raised for the rest of the session.
# NOTE(review): this also hides any warnings emitted by the models and metrics
# used further down — confirm that is intended.
warnings.simplefilter('ignore')

from sklearn.preprocessing import StandardScaler
# Single scaler instance; the scaling cells below re-fit it for every feature set.
std_sc = StandardScaler()


In [85]:
def over(df, target, method='random', sampling_strategy=1.0,
         random_state=None, categorical_features=None):
    """Oversample a dataset and return features and target as one DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix, passed to the sampler as ``X``.
    target : pandas.Series
        Class labels for ``df``, passed to the sampler as ``y``.
    method : str, default 'random'
        Sampler to use: 'random', 'smote', 'adasyn', 'Borderline-SMOTE'
        or 'SMOTENC'.
    sampling_strategy : float or str, default 1.0
        Forwarded unchanged to the selected imbalanced-learn sampler.
    random_state : int or None, default None
        Seed forwarded to the sampler so a run can be reproduced.
    categorical_features : optional
        Forwarded to ``SMOTENC``; required when ``method == 'SMOTENC'``
        and ignored by every other method.

    Returns
    -------
    pandas.DataFrame
        Resampled features with the resampled target appended as the last
        column.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names, or if 'SMOTENC' is
        requested without ``categorical_features``.
    """
    if method == 'random':
        oversampler = RandomOverSampler(sampling_strategy=sampling_strategy,
                                        random_state=random_state)

    elif method == 'smote':
        oversampler = SMOTE(sampling_strategy=sampling_strategy,
                            random_state=random_state)

    elif method == 'adasyn':
        oversampler = ADASYN(sampling_strategy=sampling_strategy,
                             random_state=random_state)

    elif method == 'Borderline-SMOTE':
        oversampler = BorderlineSMOTE(sampling_strategy=sampling_strategy,
                                      random_state=random_state)

    elif method == 'SMOTENC':
        # SMOTENC cannot be constructed without knowing which columns are
        # categorical; fail early with an actionable message.
        if categorical_features is None:
            raise ValueError(
                "method='SMOTENC' requires the categorical_features argument"
            )
        oversampler = SMOTENC(categorical_features=categorical_features,
                              sampling_strategy=sampling_strategy,
                              random_state=random_state)

    else:
        # Previously an unknown name fell through and crashed later with an
        # UnboundLocalError on `oversampler`.
        raise ValueError(
            f"Unknown oversampling method {method!r}; expected one of "
            "'random', 'smote', 'adasyn', 'Borderline-SMOTE', 'SMOTENC'"
        )

    # Run the oversampling.
    X_resampled, y_resampled = oversampler.fit_resample(df, target)

    # Put features and target back into a single frame (target is the last column).
    df_over = pd.concat([X_resampled, y_resampled], axis=1)

    return df_over


In [86]:
def under(df, target, method='random', sampling_strategy=1.0, random_state=None):
    """Undersample a dataset and return features and target as one DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix, passed to the sampler as ``X``.
    target : pandas.Series
        Class labels for ``df``, passed to the sampler as ``y``.
    method : str, default 'random'
        Sampler to use: 'random', 'tomek', 'NearMiss', 'cluster_centroids'
        or 'edited_nn'.
    sampling_strategy : float or str, default 1.0
        Forwarded to the selected sampler. 'tomek' and 'edited_nn' are
        cleaning samplers that do not accept a numeric ratio; for them a
        numeric value is replaced by ``'auto'`` and a warning is issued.
    random_state : int or None, default None
        Seed forwarded to the samplers that accept one ('random' and
        'cluster_centroids').

    Returns
    -------
    pandas.DataFrame
        Resampled features with the resampled target appended as the last
        column.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'random':
        undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy,
                                          random_state=random_state)

    elif method in ('tomek', 'edited_nn'):
        from numbers import Real

        # Cleaning samplers reject a float ratio, so the function's own
        # default (1.0) used to make these two branches fail.
        if isinstance(sampling_strategy, Real):
            warnings.warn(
                f"method={method!r} does not support a numeric "
                f"sampling_strategy ({sampling_strategy!r}); using 'auto' instead"
            )
            sampling_strategy = 'auto'
        sampler_cls = TomekLinks if method == 'tomek' else EditedNearestNeighbours
        undersampler = sampler_cls(sampling_strategy=sampling_strategy)

    elif method == 'NearMiss':
        undersampler = NearMiss(sampling_strategy=sampling_strategy)

    elif method == 'cluster_centroids':
        undersampler = ClusterCentroids(sampling_strategy=sampling_strategy,
                                        random_state=random_state)

    else:
        # Previously an unknown name fell through and crashed later with an
        # UnboundLocalError on `undersampler`.
        raise ValueError(
            f"Unknown undersampling method {method!r}; expected one of "
            "'random', 'tomek', 'NearMiss', 'cluster_centroids', 'edited_nn'"
        )

    # Run the undersampling.
    X_resampled, y_resampled = undersampler.fit_resample(df, target)

    # Put features and target back into a single frame (target is the last column).
    df_under = pd.concat([X_resampled, y_resampled], axis=1)

    return df_under

In [87]:
def model_basic(x_train, y_train, x_test, y_test):
    """Fit five default-configured classifiers and tabulate their test metrics.

    Each model is fitted on ``(x_train, y_train)`` and scored on
    ``(x_test, y_test)``. The confusion matrix of every model is printed as a
    side effect, in the order the models are listed.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model`` (the fitted estimator),
        ``accuracy``, ``precision``, ``recall``, ``f1_score`` and
        ``auc_score``; every metric is rounded to 4 decimals.
    """
    estimators = [
        LogisticRegression(),
        LinearDiscriminantAnalysis(),
        GaussianNB(),
        RandomForestClassifier(),
        XGBClassifier(),
    ]

    # Metrics computed from hard label predictions, keyed by output column.
    label_metrics = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1_score': f1_score,
    }

    table = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}

    for estimator in estimators:
        fitted = estimator.fit(x_train, y_train)
        y_pred = fitted.predict(x_test)
        # Probability of the second class (column index 1), reshaped to a column vector.
        positive_proba = fitted.predict_proba(x_test)[:, 1].reshape(-1, 1)

        table['model'].append(fitted)
        for column, metric in label_metrics.items():
            table[column].append(round(metric(y_test, y_pred), 4))
        table['auc_score'].append(round(roc_auc_score(y_test, positive_proba), 4))

        print(confusion_matrix(y_test, y_pred))

    return pd.DataFrame(data=table)

In [88]:
# Load the train / test CSV files; the first CSV column becomes the row index.
train = pd.read_csv(
    "./datasets/통계검증완료/코스닥_이상치+통계검증_train.csv",
    index_col=0,
)
test = pd.read_csv(
    "./datasets/통계검증완료/코스닥_이상치+통계검증_test.csv",
    index_col=0,
)

In [89]:
# Cast the fiscal-year and exchange-code columns to object dtype so that
# select_dtypes("object") below picks them up together with the other
# non-numeric columns.
train = train.astype({"회계년도": "object", "거래소코드": "object"})
test = test.astype({"회계년도": "object", "거래소코드": "object"})

# Keep the object-typed columns of each split in their own frame.
train_obj = train.select_dtypes("object")
test_obj = test.select_dtypes("object")

print(train_obj.columns)
print(test_obj.columns)

Index(['회사명', '거래소코드', '회계년도', '산업군'], dtype='object')
Index(['회사명', '거래소코드', '회계년도', '산업군'], dtype='object')


In [90]:
# Final feature lists; 최종_col_N is the feature set used with target_N below.
최종_col_1 = [
    "대주주지분율",
    "순운전자본비율",
    "EBITDA마진율",
    "누적수익성비율",
    "자본금회전률",
    "TMD",
    "총자본증가율",
    "자기자본증가율",
    "정상영업이익증가율",
    "자기자본순이익률",
    "ROA변화율",
    "이자부담률",
]

최종_col_2 = [
    "대주주지분율",
    "순운전자본비율",
    "매출액총이익률",
    "누적수익성비율",
    "자본금회전률",
    "당기전기영업손익",
    "금융비용부담률",
    "TMD",
    "정상영업이익증가율",
    "자기자본순이익률",
    "이자부담률",
]

최종_col_3 = [
    "자기자본순이익률",
    "EBITDA마진율",
    "누적수익성비율",
    "당기전기영업손익",
    "총자본증가율",
    "abs(영업현금흐름-당기순이익)/매출액",
    "TMD",
]

In [91]:
# Slice the three feature subsets out of each split.
train_x_1, train_x_2, train_x_3 = (
    train[cols] for cols in (최종_col_1, 최종_col_2, 최종_col_3)
)

test_x_1, test_x_2, test_x_3 = (
    test[cols] for cols in (최종_col_1, 최종_col_2, 최종_col_3)
)



---
### 스케일링 train, test

In [92]:
# Standardize each training feature set with the shared scaler (re-fitted per set).
#
# The scaled arrays are wrapped back into DataFrames that keep the ORIGINAL row
# index. `train` was loaded with index_col=0, and the pd.concat(axis=1) that
# later joins these frames with train_obj and the target column aligns rows by
# index; a fresh 0..n-1 RangeIndex would misalign rows (or introduce NaNs)
# whenever the CSV index is not exactly 0..n-1.
train_x_1_std = pd.DataFrame(
    std_sc.fit_transform(train_x_1),
    columns=train_x_1.columns,
    index=train_x_1.index,
)
train_x_2_std = pd.DataFrame(
    std_sc.fit_transform(train_x_2),
    columns=train_x_2.columns,
    index=train_x_2.index,
)
train_x_3_std = pd.DataFrame(
    std_sc.fit_transform(train_x_3),
    columns=train_x_3.columns,
    index=train_x_3.index,
)


In [93]:
# Scale the test features with statistics learned from the matching TRAIN
# feature set only (the scaler is re-fitted on train before each transform).
#
# The scaled arrays are wrapped back into DataFrames that keep the ORIGINAL row
# index. `test` was loaded with index_col=0, and the pd.concat(axis=1) that
# later joins these frames with test_obj and the target column aligns rows by
# index; a fresh 0..n-1 RangeIndex would misalign rows (or introduce NaNs)
# whenever the CSV index is not exactly 0..n-1.
test_x_1_std = pd.DataFrame(
    std_sc.fit(train_x_1).transform(test_x_1),
    columns=test_x_1.columns,
    index=test_x_1.index,
)
test_x_2_std = pd.DataFrame(
    std_sc.fit(train_x_2).transform(test_x_2),
    columns=test_x_2.columns,
    index=test_x_2.index,
)
test_x_3_std = pd.DataFrame(
    std_sc.fit(train_x_3).transform(test_x_3),
    columns=test_x_3.columns,
    index=test_x_3.index,
)

In [94]:
# Assemble one frame per target and split: object (identifier) columns,
# then the scaled features, then the target column. Rows are aligned by index.
train_1 = pd.concat(
    objs=[train_obj, train_x_1_std, train[["target_1"]]],
    axis="columns",
)
train_2 = pd.concat(
    objs=[train_obj, train_x_2_std, train[["target_2"]]],
    axis="columns",
)
train_3 = pd.concat(
    objs=[train_obj, train_x_3_std, train[["target_3"]]],
    axis="columns",
)

test_1 = pd.concat(
    objs=[test_obj, test_x_1_std, test[["target_1"]]],
    axis="columns",
)
test_2 = pd.concat(
    objs=[test_obj, test_x_2_std, test[["target_2"]]],
    axis="columns",
)
test_3 = pd.concat(
    objs=[test_obj, test_x_3_std, test[["target_3"]]],
    axis="columns",
)

### target_1

In [95]:
# Baseline for target_1: raw (not resampled) train -> raw test
model_basic(
    x_train=train_1[최종_col_1],
    y_train=train_1["target_1"],
    x_test=test_1[최종_col_1],
    y_test=test_1["target_1"],
)

[[2795   22]
 [  36   39]]
[[2775   42]
 [  29   46]]
[[2665  152]
 [  12   63]]
[[2802   15]
 [  23   52]]
[[2801   16]
 [  28   47]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9799,0.6393,0.52,0.5735,0.9637
1,LinearDiscriminantAnalysis(),0.9754,0.5227,0.6133,0.5644,0.9097
2,GaussianNB(),0.9433,0.293,0.84,0.4345,0.9675
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9869,0.7761,0.6933,0.7324,0.9665
4,"XGBClassifier(base_score=None, booster=None, c...",0.9848,0.746,0.6267,0.6812,0.96


In [96]:
# Resampled variants for target_1: Borderline-SMOTE oversampled train,
# randomly undersampled train, and a randomly undersampled copy of the test set.
# NOTE(review): no seed is passed, so these frames presumably differ between runs.
# NOTE(review): scores on test_1_under are measured at an artificial class ratio
# and are not comparable with scores on the raw test set.
train_1_over = over(
    df=train_1[최종_col_1],
    target=train_1["target_1"],
    method='Borderline-SMOTE',
    sampling_strategy=0.5,
)
train_1_under = under(
    df=train_1[최종_col_1],
    target=train_1["target_1"],
    method='random',
    sampling_strategy=0.5,
)
test_1_under = under(
    df=test_1[최종_col_1],
    target=test_1["target_1"],
    method='random',
    sampling_strategy=0.5,
)

In [97]:
# target_1: oversampled train -> raw test
model_basic(
    x_train=train_1_over[최종_col_1],
    y_train=train_1_over["target_1"],
    x_test=test_1[최종_col_1],
    y_test=test_1["target_1"],
)

[[2656  161]
 [  13   62]]
[[2683  134]
 [  12   63]]
[[2553  264]
 [   8   67]]
[[2743   74]
 [  16   59]]
[[2748   69]
 [  15   60]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9398,0.278,0.8267,0.4161,0.9602
1,LinearDiscriminantAnalysis(),0.9495,0.3198,0.84,0.4632,0.9657
2,GaussianNB(),0.9059,0.2024,0.8933,0.33,0.9678
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9689,0.4436,0.7867,0.5673,0.9748
4,"XGBClassifier(base_score=None, booster=None, c...",0.971,0.4651,0.8,0.5882,0.9652


In [98]:
# target_1: oversampled train -> undersampled test
model_basic(
    x_train=train_1_over[최종_col_1],
    y_train=train_1_over["target_1"],
    x_test=test_1_under[최종_col_1],
    y_test=test_1_under["target_1"],
)

[[142   8]
 [ 13  62]]
[[144   6]
 [ 12  63]]
[[142   8]
 [  8  67]]
[[147   3]
 [ 16  59]]
[[148   2]
 [ 15  60]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9067,0.8857,0.8267,0.8552,0.9628
1,LinearDiscriminantAnalysis(),0.92,0.913,0.84,0.875,0.9672
2,GaussianNB(),0.9289,0.8933,0.8933,0.8933,0.9726
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9156,0.9516,0.7867,0.8613,0.9754
4,"XGBClassifier(base_score=None, booster=None, c...",0.9244,0.9677,0.8,0.8759,0.9707


In [99]:
# target_1: undersampled train -> raw test
model_basic(
    x_train=train_1_under[최종_col_1],
    y_train=train_1_under["target_1"],
    x_test=test_1[최종_col_1],
    y_test=test_1["target_1"],
)

[[2611  206]
 [  10   65]]
[[2673  144]
 [  16   59]]
[[2560  257]
 [   8   67]]
[[2553  264]
 [   8   67]]
[[2566  251]
 [  12   63]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9253,0.2399,0.8667,0.3757,0.9629
1,LinearDiscriminantAnalysis(),0.9447,0.2906,0.7867,0.4245,0.9489
2,GaussianNB(),0.9084,0.2068,0.8933,0.3358,0.9625
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9059,0.2024,0.8933,0.33,0.9683
4,"XGBClassifier(base_score=None, booster=None, c...",0.9091,0.2006,0.84,0.3239,0.954


In [100]:
# target_1: undersampled train -> undersampled test
model_basic(
    x_train=train_1_under[최종_col_1],
    y_train=train_1_under["target_1"],
    x_test=test_1_under[최종_col_1],
    y_test=test_1_under["target_1"],
)

[[142   8]
 [ 10  65]]
[[144   6]
 [ 16  59]]
[[143   7]
 [  8  67]]
[[137  13]
 [  7  68]]
[[137  13]
 [ 12  63]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.92,0.8904,0.8667,0.8784,0.964
1,LinearDiscriminantAnalysis(),0.9022,0.9077,0.7867,0.8429,0.9476
2,GaussianNB(),0.9333,0.9054,0.8933,0.8993,0.9694
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9111,0.8395,0.9067,0.8718,0.9756
4,"XGBClassifier(base_score=None, booster=None, c...",0.8889,0.8289,0.84,0.8344,0.9615


---
### target_2 

In [101]:
# Baseline for target_2: raw (not resampled) train -> raw test
model_basic(
    x_train=train_2[최종_col_2],
    y_train=train_2["target_2"],
    x_test=test_2[최종_col_2],
    y_test=test_2["target_2"],
)

[[2803   16]
 [  32   41]]
[[2769   50]
 [  23   50]]
[[2658  161]
 [  10   63]]
[[2804   15]
 [  26   47]]
[[2791   28]
 [  30   43]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9834,0.7193,0.5616,0.6308,0.9699
1,LinearDiscriminantAnalysis(),0.9748,0.5,0.6849,0.578,0.9678
2,GaussianNB(),0.9409,0.2812,0.863,0.4242,0.9712
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9858,0.7581,0.6438,0.6963,0.9851
4,"XGBClassifier(base_score=None, booster=None, c...",0.9799,0.6056,0.589,0.5972,0.9713


In [102]:
# Resampled variants for target_2: Borderline-SMOTE oversampled train,
# randomly undersampled train, and a randomly undersampled copy of the test set.
# NOTE(review): no seed is passed, so these frames presumably differ between runs.
# NOTE(review): scores on test_2_under are measured at an artificial class ratio
# and are not comparable with scores on the raw test set.
train_2_over = over(
    df=train_2[최종_col_2],
    target=train_2["target_2"],
    method='Borderline-SMOTE',
    sampling_strategy=0.5,
)
train_2_under = under(
    df=train_2[최종_col_2],
    target=train_2["target_2"],
    method='random',
    sampling_strategy=0.5,
)
test_2_under = under(
    df=test_2[최종_col_2],
    target=test_2["target_2"],
    method='random',
    sampling_strategy=0.5,
)

In [103]:
# target_2: oversampled train -> raw test
model_basic(
    x_train=train_2_over[최종_col_2],
    y_train=train_2_over["target_2"],
    x_test=test_2[최종_col_2],
    y_test=test_2["target_2"],
)

[[2569  250]
 [  10   63]]
[[2566  253]
 [   5   68]]
[[2450  369]
 [   5   68]]
[[2735   84]
 [  20   53]]
[[2729   90]
 [  20   53]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9101,0.2013,0.863,0.3264,0.9648
1,LinearDiscriminantAnalysis(),0.9108,0.2118,0.9315,0.3452,0.9701
2,GaussianNB(),0.8707,0.1556,0.9315,0.2667,0.9701
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.964,0.3869,0.726,0.5048,0.9719
4,"XGBClassifier(base_score=None, booster=None, c...",0.962,0.3706,0.726,0.4907,0.9679


In [104]:
# target_2: oversampled train -> undersampled test
model_basic(
    x_train=train_2_over[최종_col_2],
    y_train=train_2_over["target_2"],
    x_test=test_2_under[최종_col_2],
    y_test=test_2_under["target_2"],
)

[[127  19]
 [ 10  63]]
[[130  16]
 [  5  68]]
[[124  22]
 [  5  68]]
[[140   6]
 [ 23  50]]
[[139   7]
 [ 20  53]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.8676,0.7683,0.863,0.8129,0.9509
1,LinearDiscriminantAnalysis(),0.9041,0.8095,0.9315,0.8662,0.9597
2,GaussianNB(),0.8767,0.7556,0.9315,0.8344,0.9597
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.8676,0.8929,0.6849,0.7752,0.9641
4,"XGBClassifier(base_score=None, booster=None, c...",0.8767,0.8833,0.726,0.797,0.9631


In [105]:
# target_2: undersampled train -> raw test
model_basic(
    x_train=train_2_under[최종_col_2],
    y_train=train_2_under["target_2"],
    x_test=test_2[최종_col_2],
    y_test=test_2["target_2"],
)

[[2534  285]
 [   9   64]]
[[2581  238]
 [  13   60]]
[[2555  264]
 [   8   65]]
[[2453  366]
 [   4   69]]
[[2433  386]
 [  10   63]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.8983,0.1834,0.8767,0.3033,0.9622
1,LinearDiscriminantAnalysis(),0.9132,0.2013,0.8219,0.3235,0.9469
2,GaussianNB(),0.9059,0.1976,0.8904,0.3234,0.9678
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.8721,0.1586,0.9452,0.2717,0.9639
4,"XGBClassifier(base_score=None, booster=None, c...",0.8631,0.1403,0.863,0.2414,0.9522


In [106]:
# target_2: undersampled train -> undersampled test
model_basic(
    x_train=train_2_under[최종_col_2],
    y_train=train_2_under["target_2"],
    x_test=test_2_under[최종_col_2],
    y_test=test_2_under["target_2"],
)

[[126  20]
 [  9  64]]
[[132  14]
 [ 13  60]]
[[131  15]
 [  8  65]]
[[121  25]
 [  4  69]]
[[120  26]
 [ 10  63]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.8676,0.7619,0.8767,0.8153,0.9518
1,LinearDiscriminantAnalysis(),0.8767,0.8108,0.8219,0.8163,0.9353
2,GaussianNB(),0.895,0.8125,0.8904,0.8497,0.9589
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.8676,0.734,0.9452,0.8263,0.9543
4,"XGBClassifier(base_score=None, booster=None, c...",0.8356,0.7079,0.863,0.7778,0.9398


---
### target_3

In [107]:
# Baseline for target_3: raw (not resampled) train -> raw test
model_basic(
    x_train=train_3[최종_col_3],
    y_train=train_3["target_3"],
    x_test=test_3[최종_col_3],
    y_test=test_3["target_3"],
)

[[2801   19]
 [  32   40]]
[[2781   39]
 [  25   47]]
[[2671  149]
 [  16   56]]
[[2801   19]
 [  26   46]]
[[2796   24]
 [  28   44]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9824,0.678,0.5556,0.6107,0.961
1,LinearDiscriminantAnalysis(),0.9779,0.5465,0.6528,0.5949,0.9667
2,GaussianNB(),0.9429,0.2732,0.7778,0.4043,0.9633
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9844,0.7077,0.6389,0.6715,0.9584
4,"XGBClassifier(base_score=None, booster=None, c...",0.982,0.6471,0.6111,0.6286,0.938


In [108]:
# Resampled variants for target_3: Borderline-SMOTE oversampled train,
# randomly undersampled train, and a randomly undersampled copy of the test set.
# NOTE(review): no seed is passed, so these frames presumably differ between runs.
# NOTE(review): scores on test_3_under are measured at an artificial class ratio
# and are not comparable with scores on the raw test set.
train_3_over = over(
    df=train_3[최종_col_3],
    target=train_3["target_3"],
    method='Borderline-SMOTE',
    sampling_strategy=0.5,
)
train_3_under = under(
    df=train_3[최종_col_3],
    target=train_3["target_3"],
    method='random',
    sampling_strategy=0.5,
)
test_3_under = under(
    df=test_3[최종_col_3],
    target=test_3["target_3"],
    method='random',
    sampling_strategy=0.5,
)

In [109]:
# target_3: oversampled train -> raw test
model_basic(
    x_train=train_3_over[최종_col_3],
    y_train=train_3_over["target_3"],
    x_test=test_3[최종_col_3],
    y_test=test_3["target_3"],
)

[[2657  163]
 [  14   58]]
[[2687  133]
 [  14   58]]
[[2608  212]
 [   8   64]]
[[2732   88]
 [  19   53]]
[[2742   78]
 [  17   55]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9388,0.2624,0.8056,0.3959,0.9603
1,LinearDiscriminantAnalysis(),0.9492,0.3037,0.8056,0.4411,0.9635
2,GaussianNB(),0.9239,0.2319,0.8889,0.3678,0.9655
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.963,0.3759,0.7361,0.4977,0.9628
4,"XGBClassifier(base_score=None, booster=None, c...",0.9672,0.4135,0.7639,0.5366,0.9557


In [110]:
# target_3: oversampled train -> undersampled test
model_basic(
    x_train=train_3_over[최종_col_3],
    y_train=train_3_over["target_3"],
    x_test=test_3_under[최종_col_3],
    y_test=test_3_under["target_3"],
)

[[132  12]
 [ 14  58]]
[[135   9]
 [ 14  58]]
[[133  11]
 [  8  64]]
[[138   6]
 [ 21  51]]
[[136   8]
 [ 17  55]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.8796,0.8286,0.8056,0.8169,0.9568
1,LinearDiscriminantAnalysis(),0.8935,0.8657,0.8056,0.8345,0.963
2,GaussianNB(),0.912,0.8533,0.8889,0.8707,0.9663
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.875,0.8947,0.7083,0.7907,0.9598
4,"XGBClassifier(base_score=None, booster=None, c...",0.8843,0.873,0.7639,0.8148,0.9481


In [111]:
# target_3: undersampled train -> raw test
model_basic(
    x_train=train_3_under[최종_col_3],
    y_train=train_3_under["target_3"],
    x_test=test_3[최종_col_3],
    y_test=test_3["target_3"],
)

[[2622  198]
 [  12   60]]
[[2744   76]
 [  19   53]]
[[2688  132]
 [  15   57]]
[[2553  267]
 [   9   63]]
[[2531  289]
 [  12   60]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9274,0.2326,0.8333,0.3636,0.9599
1,LinearDiscriminantAnalysis(),0.9672,0.4109,0.7361,0.5274,0.9628
2,GaussianNB(),0.9492,0.3016,0.7917,0.4368,0.9656
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9046,0.1909,0.875,0.3134,0.9612
4,"XGBClassifier(base_score=None, booster=None, c...",0.8959,0.1719,0.8333,0.285,0.9456


In [112]:
# target_3: undersampled train -> undersampled test
model_basic(
    x_train=train_3_under[최종_col_3],
    y_train=train_3_under["target_3"],
    x_test=test_3_under[최종_col_3],
    y_test=test_3_under["target_3"],
)

[[132  12]
 [ 12  60]]
[[141   3]
 [ 19  53]]
[[137   7]
 [ 15  57]]
[[125  19]
 [ 11  61]]
[[127  17]
 [ 12  60]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.8889,0.8333,0.8333,0.8333,0.9556
1,LinearDiscriminantAnalysis(),0.8981,0.9464,0.7361,0.8281,0.9641
2,GaussianNB(),0.8981,0.8906,0.7917,0.8382,0.966
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.8611,0.7625,0.8472,0.8026,0.9513
4,"XGBClassifier(base_score=None, booster=None, c...",0.8657,0.7792,0.8333,0.8054,0.9317
