In [55]:
import pandas as pd 
import numpy as np


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel,SequentialFeatureSelector

from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,confusion_matrix,roc_auc_score

import warnings
warnings.simplefilter('ignore')

In [56]:
def model_basic(x_train, y_train, x_test, y_test):
    models = [
        LogisticRegression(),
        LinearDiscriminantAnalysis(),
        GaussianNB(),
        RandomForestClassifier(),
        XGBClassifier(),
        
    ]

    rdict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}

    for clf in models:
        clf = clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        pred_prob_rf = clf.predict_proba(x_test)[:, 1].reshape(-1, 1)
        auc_score = roc_auc_score(y_test, pred_prob_rf)
        results = (
            round(accuracy_score(y_test, pred), 4),
            round(precision_score(y_test, pred), 4),
            round(recall_score(y_test, pred), 4),
            round(f1_score(y_test, pred), 4),
            round(auc_score, 4)
        )

        rdict['model'].append(clf)
        rdict['accuracy'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['auc_score'].append(results[4])

        confusion = confusion_matrix(y_test, pred)
        print(confusion)

    rdf = pd.DataFrame(data=rdict)
    return rdf

In [57]:
train = pd.read_csv('./datasets/통계검증완료/코스피_standar_train.csv',index_col=0)
test =  pd.read_csv('./datasets/통계검증완료/코스피_standar_test.csv',index_col=0)
np.random.RandomState(1)

RandomState(MT19937) at 0x1BC4E70DA40

In [58]:
# train = pd.read_csv('./datasets/통계검증완료/코스피_robu_train.csv',index_col=0)
# test =  pd.read_csv('./datasets/통계검증완료/코스피_robu_test.csv',index_col=0)
# np.random.RandomState(1)

---
# 산업별 더미변수 - 제조업,건설업,정보통신업,그외
---

In [59]:
train.loc[~(train['산업군'].str.contains('제조업') | train['산업군'].str.contains('건설업') | train['산업군'].str.contains('정보통신업')), '산업군'] = '그 외'
dummy_df = pd.get_dummies(train['산업군'])
train = pd.concat([train,dummy_df],axis=1)

In [60]:
test.loc[~(test['산업군'].str.contains('제조업') | test['산업군'].str.contains('건설업') | test['산업군'].str.contains('정보통신업')), '산업군'] = '그 외'
dummy_df = pd.get_dummies(test['산업군'])
test = pd.concat([test,dummy_df],axis=1)

----

In [61]:
train.columns

Index(['회사명', '거래소코드', '회계년도', '산업군', 'target_1', 'target_2', 'target_3',
       '현금흐름/총부채비율', '현금흐름 대 자산', '총자본정상영업이익률', 'EBIT/총자산', '타인자본회전률',
       '총자본회전률', '차입금의존도', '순운전자본비율', 'EBITDA마진율', 'TMD', '누적수익성비율', '부채비율',
       '금융비용부담률', '자기자본순이익률', '외국인지분율', '총자본증가율', 'ROA변화율', '매출액총이익률',
       '당기전기영업손익', '매출액증가율', '자본금회전률', '대주주지분율', '정상영업이익증가율', '건설업', '그 외',
       '정보통신업', '제조업'],
      dtype='object')

In [62]:
col_int = ['현금흐름/총부채비율', '현금흐름 대 자산', '총자본정상영업이익률', 'EBIT/총자산', '타인자본회전률',
       '총자본회전률', '차입금의존도', '순운전자본비율', 'EBITDA마진율', 'TMD', '누적수익성비율', '부채비율',
       '금융비용부담률', '자기자본순이익률', '외국인지분율', '총자본증가율', 'ROA변화율', '매출액총이익률',
       '당기전기영업손익', '매출액증가율', '자본금회전률', '대주주지분율', '정상영업이익증가율','건설업',
       '정보통신업', '제조업']

---
#target1
---

In [63]:
x_train = train[col_int]
y_train = train[['target_1']]

x_test = test[col_int]
y_test = test[['target_1']]

In [64]:
#  선정된 피쳐를 저장하기 위한 데이터프레임 생성 
df_select = pd.DataFrame()
# 생성된 데이터프레임의 인덱스 지정
df_select.index = x_train.columns

---
# lasso - 임베디드기법
---

In [65]:
#  lasso
selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.01)).fit(x_train, y_train)
df_select["lasso_0.01"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.05)).fit(x_train, y_train)
df_select["lasso_0.05"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.1)).fit(x_train, y_train)
df_select["lasso_0.1"] = selector.get_support()


---
# stepwise - 래퍼기법
---

In [66]:
# logit foward
estimator = LogisticRegression(penalty='l1',solver='liblinear',C=0.1)
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select['logit_fwd'] = selector.get_support().tolist()
# logit backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select['logit_bwd'] = selector.get_support().tolist()

In [67]:
# lda foward
estimator = LinearDiscriminantAnalysis(solver='svd')
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select['lda_fwd'] = selector.get_support().tolist()
# lda backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1,)
selector.fit(x_train, y_train)
df_select['lda_bwd'] = selector.get_support().tolist()

---
## 비모수적 방법
---

In [68]:
# # RandomForest_foward
# estimator = RandomForestClassifier()
# selector = SequentialFeatureSelector(estimator,direction='forward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select['RFC_fwd'] = selector.get_support().tolist()
# # RandomForest _ backward
# selector = SequentialFeatureSelector(estimator,direction='backward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select['RFC_bwd'] = selector.get_support().tolist()

---
## 최종 select
---

In [85]:
for col in df_select.columns.to_list():
    df_select[col] = df_select[col].apply(lambda x : 1 if x==True else 0)
df_select['total'] = df_select.sum(axis=1)
df_select[df_select['total']>=4].index

Index(['총자본정상영업이익률', 'EBIT/총자산', 'TMD', '누적수익성비율', '금융비용부담률', '자기자본순이익률',
       '정상영업이익증가율'],
      dtype='object')

In [86]:
최종_col =df_select[df_select['total']>=4].index.to_list()

x_train_1 = x_train[최종_col]
x_test_1 = x_test[최종_col]

In [87]:
model_basic(x_train_1,y_train,x_train_1,y_train)

[[2754    0]
 [   5   35]]
[[2754    0]
 [   9   31]]
[[2712   42]
 [   5   35]]
[[2754    0]
 [   0   40]]
[[2754    0]
 [   0   40]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9982,1.0,0.875,0.9333,0.9977
1,LinearDiscriminantAnalysis(),0.9968,1.0,0.775,0.8732,0.9909
2,GaussianNB(),0.9832,0.4545,0.875,0.5983,0.9945
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [88]:
model_basic(x_train_1,y_train,x_test_1,y_test)

[[1927   80]
 [   1    5]]
[[1954   53]
 [   4    2]]
[[1751  256]
 [   0    6]]
[[1899  108]
 [   1    5]]
[[1924   83]
 [   2    4]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9598,0.0588,0.8333,0.1099,0.9759
1,LinearDiscriminantAnalysis(),0.9717,0.0364,0.3333,0.0656,0.9611
2,GaussianNB(),0.8728,0.0229,1.0,0.0448,0.9378
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9459,0.0442,0.8333,0.084,0.9694
4,"XGBClassifier(base_score=None, booster=None, c...",0.9578,0.046,0.6667,0.086,0.9705


-----
# target2

In [89]:
x_train = train[col_int]
y_train = train[['target_2']]

x_test = test[col_int]
y_test = test[['target_2']]

#  선정된 피쳐를 저장하기 위한 데이터프레임 생성 
df_select_2 = pd.DataFrame()
# 생성된 데이터프레임의 인덱스 지정
df_select_2.index = x_train.columns

In [90]:
#  lasso
selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.01)).fit(x_train, y_train)
df_select_2["lasso_0.01"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.05)).fit(x_train, y_train)
df_select_2["lasso_0.05"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.1)).fit(x_train, y_train)
df_select_2["lasso_0.1"] = selector.get_support()

# logit foward
estimator = LogisticRegression(penalty='l1',solver='liblinear',C=0.05)
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_2['logit_fwd'] = selector.get_support().tolist()
# logit backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_2['logit_bwd'] = selector.get_support().tolist()


# lda foward
estimator = LinearDiscriminantAnalysis(solver='svd')
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_2['lda_fwd'] = selector.get_support().tolist()
# lda backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1,)
selector.fit(x_train, y_train)
df_select_2['lda_bwd'] = selector.get_support().tolist()

# # RandomForest_foward
# estimator = RandomForestClassifier()
# selector = SequentialFeatureSelector(estimator,direction='forward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select_2['RFC_fwd'] = selector.get_support().tolist()
# # RandomForest _ backward
# selector = SequentialFeatureSelector(estimator,direction='backward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select_2['RFC_bwd'] = selector.get_support().tolist()

In [75]:
for col in df_select_2.columns.to_list():
    df_select_2[col] = df_select_2[col].apply(lambda x : 1 if x==True else 0)
df_select_2['total'] = df_select_2.sum(axis=1)
df_select_2[df_select_2['total']>=4].index

Index(['총자본정상영업이익률', 'TMD', '누적수익성비율', '부채비율', '금융비용부담률', '자기자본순이익률'], dtype='object')

In [76]:
최종_col =df_select_2[df_select_2['total']>=4].index.to_list()

x_train_2 = x_train[최종_col]
x_test_2 = x_test[최종_col]

In [77]:
model_basic(x_train_2,y_train,x_train_2,y_train)

[[2754    0]
 [   5   35]]
[[2754    0]
 [  10   30]]
[[2712   42]
 [   3   37]]
[[2754    0]
 [   0   40]]
[[2754    0]
 [   0   40]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9982,1.0,0.875,0.9333,0.9977
1,LinearDiscriminantAnalysis(),0.9964,1.0,0.75,0.8571,0.992
2,GaussianNB(),0.9839,0.4684,0.925,0.6218,0.9954
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [91]:
model_basic(x_train_2,y_train,x_test_2,y_test)

[[1923   79]
 [   5    6]]
[[1953   49]
 [   9    2]]
[[1803  199]
 [   4    7]]
[[1917   85]
 [   5    6]]
[[1926   76]
 [   6    5]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9583,0.0706,0.5455,0.125,0.8151
1,LinearDiscriminantAnalysis(),0.9712,0.0392,0.1818,0.0645,0.7613
2,GaussianNB(),0.8992,0.034,0.6364,0.0645,0.773
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9553,0.0659,0.5455,0.1176,0.7728
4,"XGBClassifier(base_score=None, booster=None, c...",0.9593,0.0617,0.4545,0.1087,0.7545


----
# target3

In [79]:
x_train = train[col_int]
y_train = train[['target_3']]

x_test = test[col_int]
y_test = test[['target_3']]

#  선정된 피쳐를 저장하기 위한 데이터프레임 생성 
df_select_3 = pd.DataFrame()
# 생성된 데이터프레임의 인덱스 지정
df_select_3.index = x_train.columns

In [80]:
#  lasso
selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.01)).fit(x_train, y_train)
df_select_3["lasso_0.01"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.05)).fit(x_train, y_train)
df_select_3["lasso_0.05"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.1)).fit(x_train, y_train)
df_select_3["lasso_0.1"] = selector.get_support()

# logit foward
estimator = LogisticRegression(penalty='l1',solver='liblinear',C=0.05)
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_3['logit_fwd'] = selector.get_support().tolist()
# logit backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_3['logit_bwd'] = selector.get_support().tolist()


# lda foward
estimator = LinearDiscriminantAnalysis(solver='svd')
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_3['lda_fwd'] = selector.get_support().tolist()
# lda backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1,)
selector.fit(x_train, y_train)
df_select_3['lda_bwd'] = selector.get_support().tolist()

# # RandomForest_foward
# estimator = RandomForestClassifier()
# selector = SequentialFeatureSelector(estimator,direction='forward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select_3['RFC_fwd'] = selector.get_support().tolist()
# # RandomForest _ backward
# selector = SequentialFeatureSelector(estimator,direction='backward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select_3['RFC_bwd'] = selector.get_support().tolist()

In [81]:
for col in df_select_3.columns.to_list():
    df_select_3[col] = df_select_3[col].apply(lambda x : 1 if x==True else 0)
df_select_3['total'] = df_select_3.sum(axis=1)
df_select_3[df_select_3['total']>=4].index

Index(['총자본정상영업이익률', 'TMD', '누적수익성비율', '부채비율', '금융비용부담률', '자기자본순이익률'], dtype='object')

In [82]:
최종_col =df_select_3[df_select_3['total']>=4].index.to_list()

x_train_3 = x_train[최종_col]
x_test_3 = x_test[최종_col]

In [83]:
model_basic(x_train_3,y_train,x_train_3,y_train)

[[2754    0]
 [   5   35]]
[[2754    0]
 [  10   30]]
[[2712   42]
 [   3   37]]
[[2754    0]
 [   0   40]]
[[2754    0]
 [   0   40]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9982,1.0,0.875,0.9333,0.9977
1,LinearDiscriminantAnalysis(),0.9964,1.0,0.75,0.8571,0.992
2,GaussianNB(),0.9839,0.4684,0.925,0.6218,0.9954
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [84]:
model_basic(x_train_3,y_train,x_test_3,y_test)

[[1927   80]
 [   1    5]]
[[1958   49]
 [   4    2]]
[[1807  200]
 [   0    6]]
[[1910   97]
 [   1    5]]
[[1930   77]
 [   2    4]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9598,0.0588,0.8333,0.1099,0.976
1,LinearDiscriminantAnalysis(),0.9737,0.0392,0.3333,0.0702,0.963
2,GaussianNB(),0.9006,0.0291,1.0,0.0566,0.9501
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9513,0.049,0.8333,0.0926,0.9681
4,"XGBClassifier(base_score=None, booster=None, c...",0.9608,0.0494,0.6667,0.092,0.9718
