In [83]:
import pandas as pd 
import numpy as np


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel,SequentialFeatureSelector

from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,confusion_matrix,roc_auc_score

import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so library warnings raised by the cells
# below are hidden as well — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

In [84]:
def model_basic(x_train, y_train, x_test, y_test, random_state=None):
    """Fit five baseline classifiers and tabulate their scores on an evaluation set.

    The models, in order, are LogisticRegression, LinearDiscriminantAnalysis,
    GaussianNB, RandomForestClassifier and XGBClassifier, all with default
    hyper-parameters.

    Parameters
    ----------
    x_train, x_test : feature matrices used for fitting and for evaluation.
    y_train, y_test : binary labels; a 1-D sequence or a single-column frame is
        accepted (both are flattened to 1-D before use). The probability taken
        for the AUC is column 1 of ``predict_proba``.
    random_state : optional seed forwarded to the two stochastic models
        (random forest and XGBoost). The default ``None`` leaves them unseeded,
        which is the previous behaviour; pass an int for reproducible tables.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model`` (the fitted estimator object),
        ``accuracy``, ``precision``, ``recall``, ``f1_score`` and ``auc_score``,
        each score rounded to 4 decimals.

    Side effects
    ------------
    Prints the confusion matrix of every model, in the model order given above.
    """
    models = [
        LogisticRegression(),
        LinearDiscriminantAnalysis(),
        GaussianNB(),
        RandomForestClassifier(random_state=random_state),
        XGBClassifier(random_state=random_state),
    ]

    # Estimators and metrics expect 1-D labels; flatten explicitly instead of
    # relying on an implicit conversion of a single-column DataFrame.
    y_fit = np.ravel(y_train)
    y_eval = np.ravel(y_test)

    rdict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}

    for clf in models:
        clf = clf.fit(x_train, y_fit)
        pred = clf.predict(x_test)
        # Probability of the positive class (second column), kept 1-D for roc_auc_score.
        positive_proba = clf.predict_proba(x_test)[:, 1]

        rdict['model'].append(clf)
        rdict['accuracy'].append(round(accuracy_score(y_eval, pred), 4))
        rdict['precision'].append(round(precision_score(y_eval, pred), 4))
        rdict['recall'].append(round(recall_score(y_eval, pred), 4))
        rdict['f1_score'].append(round(f1_score(y_eval, pred), 4))
        rdict['auc_score'].append(round(roc_auc_score(y_eval, positive_proba), 4))

        print(confusion_matrix(y_eval, pred))

    return pd.DataFrame(data=rdict)

In [85]:
# Read the train and test CSVs from the same folder; the first CSV column becomes the index.
train, test = (
    pd.read_csv(f'./datasets/통계검증완료/코스닥_standard_{split_name}.csv', index_col=0)
    for split_name in ('train', 'test')
)



RandomState(MT19937) at 0x17314CC0340

---
# 산업별 더미변수 - 제조업, 건설업, 정보통신업, 그 외
---

In [86]:
# Collapse every industry that is not manufacturing / construction / ICT into a
# single '그 외' ("other") bucket. One regex replaces the three chained
# str.contains calls; na=False sends rows with a missing industry to "other".
is_major_industry = train['산업군'].str.contains('제조업|건설업|정보통신업', na=False)
train.loc[~is_major_industry, '산업군'] = '그 외'
dummy_df = pd.get_dummies(train['산업군'])
# Drop dummy columns left over from an earlier run of this cell first, so that
# re-running it does not append duplicate columns.
train = pd.concat([train.drop(columns=dummy_df.columns, errors='ignore'), dummy_df], axis=1)

In [87]:
# Apply the same industry bucketing to the test set: anything that is not
# manufacturing / construction / ICT becomes '그 외' ("other"); na=False sends
# rows with a missing industry to "other" as well.
is_major_industry = test['산업군'].str.contains('제조업|건설업|정보통신업', na=False)
test.loc[~is_major_industry, '산업군'] = '그 외'
dummy_df = pd.get_dummies(test['산업군'])
# Drop dummy columns left over from an earlier run of this cell first, so that
# re-running it does not append duplicate columns.
test = pd.concat([test.drop(columns=dummy_df.columns, errors='ignore'), dummy_df], axis=1)

----

In [88]:
# List the training columns after the industry dummy columns were appended.
train.columns

Index(['회사명', '거래소코드', '회계년도', '산업군', '총자본순이익률', 'EBIT/총자산', '총자본정상영업이익률',
       '현금흐름 대 자산', '현금흐름/총부채비율', '차입금의존도', '현금흐름 대 매출액', '순운전자본비율',
       'EBITDA마진율', '타인자본회전률', '누적수익성비율', 'abs(영업현금흐름-당기순이익)/매출액', '총자본회전률',
       '금융비용부담률', '당기전기영업손익', '자본금회전률', '영업현금흐름-단기차입금', 'FINDEP', '총자본증가율',
       '자기자본증가율', 'TMD', '매출액총이익률', '자기자본순이익률', '매출액증가율', '대주주지분율', '외국인지분율',
       'ROA변화율', '정상영업이익증가율', '이자부담률', 'target_1', 'target_2', 'target_3',
       '건설업', '그 외', '정보통신업', '제조업'],
      dtype='object')

In [89]:
# Candidate model features: the 29 numeric ratio columns. Identifier columns,
# the raw industry label, the target_* columns and the industry dummies are
# deliberately left out.
col_int = [
    '총자본순이익률',
    'EBIT/총자산',
    '총자본정상영업이익률',
    '현금흐름 대 자산',
    '현금흐름/총부채비율',
    '차입금의존도',
    '현금흐름 대 매출액',
    '순운전자본비율',
    'EBITDA마진율',
    '타인자본회전률',
    '누적수익성비율',
    'abs(영업현금흐름-당기순이익)/매출액',
    '총자본회전률',
    '금융비용부담률',
    '당기전기영업손익',
    '자본금회전률',
    '영업현금흐름-단기차입금',
    'FINDEP',
    '총자본증가율',
    '자기자본증가율',
    'TMD',
    '매출액총이익률',
    '자기자본순이익률',
    '매출액증가율',
    '대주주지분율',
    '외국인지분율',
    'ROA변화율',
    '정상영업이익증가율',
    '이자부담률',
]

---
#target1
---

In [90]:
# Features are the col_int columns; the label is kept as a one-column frame for target_1.
x_train, y_train = train.loc[:, col_int], train.loc[:, ['target_1']]
x_test, y_test = test.loc[:, col_int], test.loc[:, ['target_1']]

In [91]:
# Class counts of the target_1 labels in the test set.
y_test.value_counts()

target_1
0           3503
1             89
dtype: int64

In [92]:
# Empty table that will collect the selected features: one row per candidate
# feature, one column per selection method (columns are added by later cells).
df_select = pd.DataFrame(index=x_train.columns)

---
# lasso - 임베디드기법
---

In [93]:
# Embedded selection: L1-penalised logistic regression at three values of C.
# Each fit stores a boolean mask over the candidate features in df_select.
for lasso_label, lasso_c in (("lasso_0.01", 0.01), ("lasso_0.05", 0.05), ("lasso_0.1", 0.1)):
    l1_logit = LogisticRegression(penalty='l1', solver='liblinear', C=lasso_c)
    selector = SelectFromModel(estimator=l1_logit).fit(x_train, y_train)
    df_select[lasso_label] = selector.get_support()


---
# stepwise - 래퍼기법
---

In [94]:
# Wrapper selection with an L1 logistic regression as the base learner:
# sequential search in both directions, each scored with 5-fold CV.
# NOTE(review): C=0.1 here, while the target_2 / target_3 cells use C=0.05 — confirm this is intended.
estimator = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
for sfs_direction, sfs_label in (('forward', 'logit_fwd'), ('backward', 'logit_bwd')):
    selector = SequentialFeatureSelector(estimator, direction=sfs_direction, cv=5, n_jobs=-1)
    selector.fit(x_train, y_train)
    df_select[sfs_label] = selector.get_support().tolist()

In [95]:
# Wrapper selection with linear discriminant analysis as the base learner:
# sequential search in both directions, each scored with 5-fold CV.
estimator = LinearDiscriminantAnalysis(solver='svd')
for sfs_direction, sfs_label in (('forward', 'lda_fwd'), ('backward', 'lda_bwd')):
    selector = SequentialFeatureSelector(estimator, direction=sfs_direction, cv=5, n_jobs=-1)
    selector.fit(x_train, y_train)
    df_select[sfs_label] = selector.get_support().tolist()

---
## 비모수적 방법
---

In [96]:
# NOTE(review): the RandomForest-based sequential selection below is disabled
# (commented out) and adds no column to df_select; the reason is not recorded
# here — re-enable it or delete this cell.
# # RandomForest_forward
# estimator = RandomForestClassifier()
# selector = SequentialFeatureSelector(estimator,direction='forward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select['RFC_fwd'] = selector.get_support().tolist()
# # RandomForest _ backward
# selector = SequentialFeatureSelector(estimator,direction='backward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select['RFC_bwd'] = selector.get_support().tolist()

---
## 최종 select
---

In [97]:
# Turn the boolean selection masks into 0/1 votes and count the votes per feature.
# 'total' is excluded from the vote columns so that re-running this cell cannot
# rewrite a previous total and fold it back into the sum.
vote_cols = [name for name in df_select.columns if name != 'total']
df_select[vote_cols] = df_select[vote_cols].astype('int64')
df_select['total'] = df_select[vote_cols].sum(axis=1)
# Features chosen by at least 5 of the selection methods.
df_select[df_select['total']>=5].index

Index(['EBIT/총자산', '현금흐름 대 자산', '누적수익성비율', 'abs(영업현금흐름-당기순이익)/매출액', '총자본회전률',
       '금융비용부담률', 'TMD', '자기자본순이익률', 'ROA변화율'],
      dtype='object')

In [98]:
# Final feature list for target_1: every feature with at least 5 votes.
enough_votes = df_select['total'] >= 5
최종_col = df_select.loc[enough_votes].index.to_list()

# Restrict both splits to the selected features.
x_train_1, x_test_1 = x_train[최종_col], x_test[최종_col]

In [99]:
# In-sample check: the training data is passed both as the fit set and as the
# evaluation set, so these scores describe fit to the training data only.
model_basic(x_train_1,y_train,x_train_1,y_train)

[[5066   20]
 [ 104   70]]
[[5059   27]
 [ 104   70]]
[[4902  184]
 [  68  106]]
[[5086    0]
 [   0  174]]
[[5086    0]
 [   0  174]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9764,0.7778,0.4023,0.5303,0.9366
1,LinearDiscriminantAnalysis(),0.9751,0.7216,0.4023,0.5166,0.9304
2,GaussianNB(),0.9521,0.3655,0.6092,0.4569,0.9205
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [100]:
# Hold-out evaluation: fit on the selected training features, score on the test set.
model_basic(x_train_1,y_train,x_test_1,y_test)

[[3396  107]
 [  43   46]]
[[3332  171]
 [  34   55]]
[[3110  393]
 [  20   69]]
[[3415   88]
 [  39   50]]
[[3437   66]
 [  46   43]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9582,0.3007,0.5169,0.3802,0.9259
1,LinearDiscriminantAnalysis(),0.9429,0.2434,0.618,0.3492,0.9278
2,GaussianNB(),0.885,0.1494,0.7753,0.2505,0.9185
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9646,0.3623,0.5618,0.4405,0.9355
4,"XGBClassifier(base_score=None, booster=None, c...",0.9688,0.3945,0.4831,0.4343,0.9293


-----
# target2

In [101]:
# Same candidate features (col_int); the label switches to target_2.
x_train, y_train = train.loc[:, col_int], train.loc[:, ['target_2']]
x_test, y_test = test.loc[:, col_int], test.loc[:, ['target_2']]

# Empty table that will collect the selected features for target_2:
# one row per candidate feature, one column per selection method.
df_select_2 = pd.DataFrame(index=x_train.columns)

In [102]:
# --- Embedded selection: L1-penalised logistic regression at three values of C ---
for lasso_label, lasso_c in (("lasso_0.01", 0.01), ("lasso_0.05", 0.05), ("lasso_0.1", 0.1)):
    l1_logit = LogisticRegression(penalty='l1', solver='liblinear', C=lasso_c)
    selector = SelectFromModel(estimator=l1_logit).fit(x_train, y_train)
    df_select_2[lasso_label] = selector.get_support()

# --- Wrapper selection: forward then backward sequential search, 5-fold CV ---
# Columns are written in the order logit_fwd, logit_bwd, lda_fwd, lda_bwd.
stepwise_runs = (
    ('logit', LogisticRegression(penalty='l1', solver='liblinear', C=0.05)),
    ('lda', LinearDiscriminantAnalysis(solver='svd')),
)
for sfs_prefix, estimator in stepwise_runs:
    for sfs_direction, sfs_suffix in (('forward', 'fwd'), ('backward', 'bwd')):
        selector = SequentialFeatureSelector(estimator, direction=sfs_direction, cv=5, n_jobs=-1)
        selector.fit(x_train, y_train)
        df_select_2[f'{sfs_prefix}_{sfs_suffix}'] = selector.get_support().tolist()

# Disabled variant, kept for reference only: the same forward/backward search with
# RandomForestClassifier() as estimator and cv=2, writing 'RFC_fwd' / 'RFC_bwd'.

In [103]:
# Turn the boolean selection masks into 0/1 votes and count the votes per feature.
# 'total' is excluded from the vote columns so that re-running this cell cannot
# rewrite a previous total and fold it back into the sum.
vote_cols = [name for name in df_select_2.columns if name != 'total']
df_select_2[vote_cols] = df_select_2[vote_cols].astype('int64')
df_select_2['total'] = df_select_2[vote_cols].sum(axis=1)
# Features chosen by at least 5 of the selection methods.
df_select_2[df_select_2['total']>=5].index



Index(['총자본정상영업이익률', '순운전자본비율', '누적수익성비율', 'abs(영업현금흐름-당기순이익)/매출액', '금융비용부담률',
       'TMD', '매출액총이익률'],
      dtype='object')

In [104]:
# Final feature list for target_2: every feature with at least 5 votes.
enough_votes = df_select_2['total'] >= 5
최종_col = df_select_2.loc[enough_votes].index.to_list()

# Restrict both splits to the selected features.
x_train_2, x_test_2 = x_train[최종_col], x_test[최종_col]

In [105]:
# In-sample check: the training data is passed both as the fit set and as the
# evaluation set, so these scores describe fit to the training data only.
model_basic(x_train_2,y_train,x_train_2,y_train)

[[5022   21]
 [ 149   68]]
[[4995   48]
 [ 139   78]]
[[4854  189]
 [ 110  107]]
[[5043    0]
 [   0  217]]
[[5043    0]
 [   0  217]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9677,0.764,0.3134,0.4444,0.9025
1,LinearDiscriminantAnalysis(),0.9644,0.619,0.3594,0.4548,0.9058
2,GaussianNB(),0.9432,0.3615,0.4931,0.4172,0.8818
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [106]:
# Hold-out evaluation: fit on the selected training features, score on the test set.
model_basic(x_train_2,y_train,x_test_2,y_test)

[[3371  134]
 [  40   47]]
[[3296  209]
 [  36   51]]
[[3105  400]
 [  27   60]]
[[3415   90]
 [  43   44]]
[[3412   93]
 [  45   42]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9516,0.2597,0.5402,0.3507,0.8398
1,LinearDiscriminantAnalysis(),0.9318,0.1962,0.5862,0.2939,0.8435
2,GaussianNB(),0.8811,0.1304,0.6897,0.2194,0.8526
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.963,0.3284,0.5057,0.3982,0.8669
4,"XGBClassifier(base_score=None, booster=None, c...",0.9616,0.3111,0.4828,0.3784,0.8575


----
# target3

In [107]:
# Same candidate features (col_int); the label switches to target_3.
x_train, y_train = train.loc[:, col_int], train.loc[:, ['target_3']]
x_test, y_test = test.loc[:, col_int], test.loc[:, ['target_3']]

# Empty table that will collect the selected features for target_3:
# one row per candidate feature, one column per selection method.
df_select_3 = pd.DataFrame(index=x_train.columns)

In [108]:
# --- Embedded selection: L1-penalised logistic regression at three values of C ---
for lasso_label, lasso_c in (("lasso_0.01", 0.01), ("lasso_0.05", 0.05), ("lasso_0.1", 0.1)):
    l1_logit = LogisticRegression(penalty='l1', solver='liblinear', C=lasso_c)
    selector = SelectFromModel(estimator=l1_logit).fit(x_train, y_train)
    df_select_3[lasso_label] = selector.get_support()

# --- Wrapper selection: forward then backward sequential search, 5-fold CV ---
# Columns are written in the order logit_fwd, logit_bwd, lda_fwd, lda_bwd.
stepwise_runs = (
    ('logit', LogisticRegression(penalty='l1', solver='liblinear', C=0.05)),
    ('lda', LinearDiscriminantAnalysis(solver='svd')),
)
for sfs_prefix, estimator in stepwise_runs:
    for sfs_direction, sfs_suffix in (('forward', 'fwd'), ('backward', 'bwd')):
        selector = SequentialFeatureSelector(estimator, direction=sfs_direction, cv=5, n_jobs=-1)
        selector.fit(x_train, y_train)
        df_select_3[f'{sfs_prefix}_{sfs_suffix}'] = selector.get_support().tolist()

# Disabled variant, kept for reference only: the same forward/backward search with
# RandomForestClassifier() as estimator and cv=2, writing 'RFC_fwd' / 'RFC_bwd'.

In [114]:
# Turn the boolean selection masks into 0/1 votes and count the votes per feature.
# 'total' is excluded from the vote columns so that re-running this cell cannot
# rewrite a previous total and fold it back into the sum.
vote_cols = [name for name in df_select_3.columns if name != 'total']
df_select_3[vote_cols] = df_select_3[vote_cols].astype('int64')
df_select_3['total'] = df_select_3[vote_cols].sum(axis=1)
# Features chosen by at least 5 of the selection methods.
df_select_3[df_select_3['total']>=5].index

Index(['EBIT/총자산', '현금흐름 대 자산', '순운전자본비율', '누적수익성비율', 'abs(영업현금흐름-당기순이익)/매출액',
       '총자본회전률', '금융비용부담률', '당기전기영업손익', '총자본증가율', 'TMD'],
      dtype='object')

In [110]:
# Final feature list for target_3: every feature with at least 5 votes.
enough_votes = df_select_3['total'] >= 5
최종_col = df_select_3.loc[enough_votes].index.to_list()

# Restrict both splits to the selected features.
x_train_3, x_test_3 = x_train[최종_col], x_test[최종_col]

In [111]:
# In-sample check: the training data is passed both as the fit set and as the
# evaluation set, so these scores describe fit to the training data only.
model_basic(x_train_3,y_train,x_train_3,y_train)

[[5077   18]
 [  95   70]]
[[5062   33]
 [  93   72]]
[[4909  186]
 [  67   98]]
[[5095    0]
 [   0  165]]
[[5095    0]
 [   0  165]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9785,0.7955,0.4242,0.5534,0.9389
1,LinearDiscriminantAnalysis(),0.976,0.6857,0.4364,0.5333,0.9295
2,GaussianNB(),0.9519,0.3451,0.5939,0.4365,0.9239
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [112]:
# Hold-out evaluation: fit on the selected training features, score on the test set.
model_basic(x_train_3,y_train,x_test_3,y_test)

[[3408  112]
 [  30   42]]
[[3339  181]
 [  26   46]]
[[3095  425]
 [  14   58]]
[[3433   87]
 [  30   42]]
[[3458   62]
 [  34   38]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9605,0.2727,0.5833,0.3717,0.9395
1,LinearDiscriminantAnalysis(),0.9424,0.2026,0.6389,0.3077,0.9357
2,GaussianNB(),0.8778,0.1201,0.8056,0.209,0.9293
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9674,0.3256,0.5833,0.4179,0.9452
4,"XGBClassifier(base_score=None, booster=None, c...",0.9733,0.38,0.5278,0.4419,0.9299
