In [3]:
import pandas as pd 
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, accuracy_score,f1_score
from sklearn.feature_selection import SelectFromModel,SelectFpr,SelectFwe,SelectFdr
from sklearn.feature_selection import SequentialFeatureSelector,f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier




import matplotlib as mpl
import matplotlib.pyplot as plt
# Use the Malgun Gothic font for all figure text (presumably so Korean labels render; confirm on non-Windows hosts).
plt.rc('font', family='Malgun Gothic') # For Windows
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False
from xgboost import XGBClassifier
%matplotlib inline

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any data-conversion or convergence warnings raised while fitting.
warnings.simplefilter('ignore')

In [4]:
# Load the input table; the first CSV column becomes the DataFrame index.
df = pd.read_csv(
    "./datasets/스케일링완료/윈저_minmax.csv",
    index_col=0,
)


In [5]:
# Split by fiscal year: 2019 and earlier is the training set, 2020 and later the test set.
fiscal_year = df["회계년도"]
train = df.loc[fiscal_year <= 2019].reset_index(drop=True)
test = df.loc[fiscal_year >= 2020].reset_index(drop=True)

# Columns used as independent variables (model inputs).
col_int = [
    '대주주지분변화분', '외국인지분분변화', '자산', '총자본증가율',
    '비유동자산증가율', '유동자산증가율', '자기자본증가율', '정상영업이익증가율',
    '순이익증가율', '총포괄이익증가율', '매출액순이익률', '자기자본순이익률',
    '매출원가대매출액비율', '광고선전비대매출액비율', '유보율', '당좌비율',
    '현금비율', '부채비율', '유동부채비율', '차입금비율',
    '이자보상배율(이자비용)', '유보액대납입자본배율', '투자집중도', '1인평균지급액(임원)',
    '1인년간평균급여(직원)', '토빈Q', 'WW지수', 'RDS',
    '도입기', '성숙기', '성장기', '쇠퇴기',
]

# Dependent variable (kept as a one-column DataFrame) and independent variables, per split.
y_train, x_train = train[['target']], train[col_int]
y_test, x_test = test[['target']], test[col_int]

## SelectFromModel을 통한 피쳐 선정

---

In [8]:
# Empty results table with one row per candidate feature; each selection
# method later adds its boolean mask as a column.
df_select = pd.DataFrame(index=x_train.columns)

In [51]:

# Model-based (SelectFromModel) and univariate-test (SelectFpr/Fwe/Fdr) feature
# selection. Each method stores a boolean mask (True = feature kept) as one
# column of df_select.

# Fixed seed so the stochastic estimators pick the same features on every run.
RANDOM_STATE = 42
# scikit-learn expects y with shape (n_samples,); y_train is a one-column frame.
y_train_1d = np.ravel(y_train)

# L1-penalised logistic regression
selector = SelectFromModel(
    estimator=LogisticRegression(
        solver='liblinear', penalty='l1', C=0.05, random_state=RANDOM_STATE
    )
).fit(x_train, y_train_1d)
df_select["lasso"] = selector.get_support()

# XGBClassifier
selector_1 = SelectFromModel(
    estimator=XGBClassifier(random_state=RANDOM_STATE)
).fit(x_train, y_train_1d)
df_select["XGB"] = selector_1.get_support()

# RandomForestClassifier
selector_2 = SelectFromModel(
    estimator=RandomForestClassifier(random_state=RANDOM_STATE)
).fit(x_train, y_train_1d)
df_select["RFC"] = selector_2.get_support()

# DecisionTreeClassifier
selector_3 = SelectFromModel(
    estimator=DecisionTreeClassifier(random_state=RANDOM_STATE)
).fit(x_train, y_train_1d)
df_select["DecisionTreeClassifier"] = selector_3.get_support()

# AdaBoostClassifier
# The column key carries no leading space, so it can be looked up by its plain name.
selector_4 = SelectFromModel(
    estimator=AdaBoostClassifier(random_state=RANDOM_STATE)
).fit(x_train, y_train_1d)
df_select["AdaBoostClassifier"] = selector_4.get_support()

# GradientBoostingClassifier
selector_5 = SelectFromModel(
    estimator=GradientBoostingClassifier(random_state=RANDOM_STATE)
).fit(x_train, y_train_1d)
df_select["GradientBoostingClassifier"] = selector_5.get_support()


# Univariate ANOVA F-test filters, all at alpha = 0.05.

# SelectFpr: false positive rate control
selecfpr = SelectFpr(f_classif, alpha=0.05)
selecfpr.fit(x_train, y_train_1d)
df_select['fpr_fcl'] = selecfpr.get_support()

# SelectFwe: family-wise error rate control
filter_fwe = SelectFwe(f_classif, alpha=0.05)
filter_fwe.fit(x_train, y_train_1d)
df_select['fwe_fcl'] = filter_fwe.get_support()

# SelectFdr: false discovery rate control
filter_fdr = SelectFdr(f_classif, alpha=0.05)
filter_fdr.fit(x_train, y_train_1d)
df_select['fdr_fcl'] = filter_fdr.get_support()


df_select

Unnamed: 0,lasso,XGB,RFC,DecisionTreeClassifier,AdaBoostClassifier,GradientBoostingClassifier,fpr_fcl,fwe_fcl,fdr_fcl,logit_fwd,logit_bwd,RFC_fwd,RFC_bwd
대주주지분변화분,True,False,False,False,False,False,False,False,False,True,False,False,False
외국인지분분변화,False,False,False,False,False,False,False,False,False,True,False,False,True
자산,True,True,True,True,True,True,True,True,True,True,False,True,True
총자본증가율,False,False,True,False,False,False,False,False,False,False,True,False,True
비유동자산증가율,False,False,False,True,False,False,True,False,False,True,False,False,False
유동자산증가율,True,False,False,False,True,False,False,False,False,False,False,True,True
자기자본증가율,True,True,True,True,True,True,True,True,True,True,True,True,True
정상영업이익증가율,True,False,True,True,True,False,False,False,False,True,False,True,True
순이익증가율,True,True,True,True,True,True,False,False,False,False,True,True,True
총포괄이익증가율,False,False,True,True,True,True,True,True,True,False,True,False,True


## StepWise

In [57]:
# Sequential (stepwise) selection around an L1-penalised logistic regression,
# scored with 2-fold cross-validation. Forward runs first, then backward; each
# mask is stored as its own df_select column.
estimator = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
for direction, column in (('forward', 'logit_fwd'), ('backward', 'logit_bwd')):
    selector = SequentialFeatureSelector(estimator, direction=direction, cv=2)
    selector.fit(x_train, y_train)
    df_select[column] = selector.get_support().tolist()


In [58]:
# Fit the current `estimator` on every training feature and tabulate the
# first row of its coefficient array, one row per feature name.
a = estimator.fit(x_train, y_train)
coef = a.coef_
feature_names = x_train.columns.tolist()
result = pd.DataFrame(data=coef[0], index=feature_names)
result

Unnamed: 0,0
대주주지분변화분,-0.088695
외국인지분분변화,0.0
자산,0.160779
총자본증가율,0.248371
비유동자산증가율,0.0
유동자산증가율,0.17369
자기자본증가율,-1.962115
정상영업이익증가율,-0.141193
순이익증가율,-0.370437
총포괄이익증가율,0.342346


In [36]:
# Display the first row of `coef` (the fitted estimator's coefficient array).
# NOTE(review): this cell's recorded execution count (36) predates the cell
# that assigns `coef` (58), so the stored output may come from a different fit.
coef[0]

array([-0.16283735, -0.02857401,  0.21174235,  0.80833984, -0.23345062,
        0.1263458 , -2.53003583, -0.24572204, -0.80322251,  0.89191074,
        0.51822882,  2.18294792, -0.24675823,  0.35799991, -0.22520786,
        0.4785654 , -0.77399212, -1.53560464, -0.12895124,  0.63185907,
        0.19094514,  1.40050096, -0.61371355,  0.35826914,  0.83836728,
       -0.96368714, -0.29526503, -0.61762151, -0.26873584,  0.38916309,
        0.        , -0.16070551])

In [13]:
# Sequential (stepwise) selection around a random forest, scored with 2-fold
# cross-validation. Forward runs first, then backward; each mask is stored as
# its own df_select column.
# The forest is seeded: an unseeded forest scores candidate feature sets
# differently on every run, which makes the selected features unstable.
estimator = RandomForestClassifier(random_state=42)
# scikit-learn expects y with shape (n_samples,); y_train is a one-column frame.
y_train_1d = np.ravel(y_train)

for direction, column in (('forward', 'RFC_fwd'), ('backward', 'RFC_bwd')):
    selector = SequentialFeatureSelector(estimator, direction=direction, cv=2)
    selector.fit(x_train, y_train_1d)
    df_select[column] = selector.get_support().tolist()

In [14]:
# Display the selection table: one row per feature, one boolean column per method.
df_select

Unnamed: 0,lasso,XGB,RFC,DecisionTreeClassifier,AdaBoostClassifier,GradientBoostingClassifier,fpr_fcl,fwe_fcl,fdr_fcl,logit_fwd,logit_bwd,RFC_fwd,RFC_bwd
대주주지분변화분,True,False,False,False,False,False,False,False,False,True,False,False,False
외국인지분분변화,True,False,False,False,False,False,False,False,False,True,False,False,True
자산,True,True,True,True,True,True,True,True,True,True,False,True,True
총자본증가율,True,False,True,False,False,False,False,False,False,False,True,False,True
비유동자산증가율,True,False,False,False,False,False,True,False,False,True,False,False,False
유동자산증가율,True,False,False,False,True,False,False,False,False,False,False,True,True
자기자본증가율,True,True,True,True,True,True,True,True,True,True,True,True,True
정상영업이익증가율,True,False,True,False,True,False,False,False,False,True,False,True,True
순이익증가율,True,True,True,True,True,True,False,False,False,False,True,True,True
총포괄이익증가율,True,False,True,False,True,True,True,True,True,False,True,False,True


In [16]:
# Rows of df_select whose last method column is True.
# NOTE(review): this cell used to loop over every column, but nothing was
# accumulated between iterations, so only the final column's filter reached
# `b`. That same result is computed directly here. If the intent was to
# combine all methods (e.g. keep features chosen by every method), replace the
# mask with something like `df_select.all(axis=1)` after confirming.
last_method = df_select.columns[-1]
b = df_select[df_select[last_method].eq(True)]
b

Unnamed: 0,lasso,ElasticNet,XGB,DecisionTreeClassifier,AdaBoostClassifier,GradientBoostingClassifier
자산,True,True,True,True,True,True
자기자본증가율,True,True,True,True,True,True
순이익증가율,True,True,True,False,True,True
총포괄이익증가율,True,True,True,True,True,True
자기자본순이익률,True,True,True,True,True,True
유보율,True,True,True,True,True,True
1인년간평균급여(직원),True,True,True,True,True,True


## SelectKBest 모듈 활용하여 변수 선택


- scikit-learn의 SelectKBest 모듈은 target 변수와 각 독립변수 사이의 상관관계(점수)를 계산하여, 점수가 가장 높은 변수 k개를 선택하는 모듈입니다.
- 점수를 계산하는 방법(`score_func`)으로는 `f_regression`, `chi2`(카이제곱), `f_classif` 등이 있습니다.
---

In [52]:
# SelectKBest with f_regression: keep the 13 highest-scoring features.

from sklearn.feature_selection import f_regression, SelectKBest

# Fit the selector on the training split, then reduce both splits to the kept columns.
selector = SelectKBest(score_func=f_regression, k=13)
X_train_selected = selector.fit_transform(x_train, y_train)
X_test_selected = selector.transform(x_test)
X_train_selected.shape, X_test_selected.shape  # value is discarded: not the cell's last expression

# Boolean mask over the input columns: True where the feature was kept.
selected_mask = selector.get_support()
all_names = x_train.columns
selected_names = all_names[selected_mask]
unselected_names = all_names[np.logical_not(selected_mask)]

for label, names in (('Selected names: ', selected_names),
                     ('Unselected names: ', unselected_names)):
    print(label, names)

# Keep the chosen column names and the matching slice of the training features.
col_f_reg_selec = selected_names
x_train_f_reg = x_train.loc[:, col_f_reg_selec]


Selected names:  Index(['자산', '매출액순이익률', '자기자본순이익률', '유보율', '부채비율', '유동부채비율', '차입금비율',
       '이자보상배율(이자비용)', '유보액대납입자본배율', '1인년간평균급여(직원)', '도입기', '성숙기', '쇠퇴기'],
      dtype='object')
Unselected names:  Index(['대주주지분변화분', '외국인지분분변화', '총자본증가율', '비유동자산증가율', '유동자산증가율', '자기자본증가율',
       '정상영업이익증가율', '순이익증가율', '총포괄이익증가율', '매출원가대매출액비율', '광고선전비대매출액비율', '당좌비율',
       '현금비율', '투자집중도', '1인평균지급액(임원)', '토빈Q', 'WW지수', 'RDS', '성장기'],
      dtype='object')


In [29]:
# SelectKBest with f_classif: keep the 15 highest-scoring features.

from sklearn.feature_selection import f_classif, SelectKBest

# Fit the selector on the training split, then reduce both splits to the kept columns.
selector = SelectKBest(score_func=f_classif, k=15)
X_train_selected = selector.fit_transform(x_train, y_train)
X_test_selected = selector.transform(x_test)
X_train_selected.shape, X_test_selected.shape  # value is discarded: not the cell's last expression

# Boolean mask over the input columns: True where the feature was kept.
selected_mask = selector.get_support()
all_names = x_train.columns
selected_names = all_names[selected_mask]
unselected_names = all_names[np.logical_not(selected_mask)]

for label, names in (('Selected names: ', selected_names),
                     ('Unselected names: ', unselected_names)):
    print(label, names)

# Keep the chosen column names and the matching slice of the training features.
col_f_classif_selec = selected_names
x_train_f_clf = x_train.loc[:, col_f_classif_selec]

Selected names:  Index(['자산', '매출액순이익률', '자기자본순이익률', '유보율', '부채비율', '유동부채비율', '차입금비율',
       '이자보상배율(이자비용)', '유보액대납입자본배율', '투자집중도', '1인평균지급액(임원)', '1인년간평균급여(직원)',
       '도입기', '성숙기', '쇠퇴기'],
      dtype='object')
Unselected names:  Index(['대주주지분변화분', '외국인지분분변화', '총자본증가율', '비유동자산증가율', '유동자산증가율', '자기자본증가율',
       '정상영업이익증가율', '순이익증가율', '총포괄이익증가율', '매출원가대매출액비율', '광고선전비대매출액비율', '당좌비율',
       '현금비율', '토빈Q', 'WW지수', 'RDS', '성장기'],
      dtype='object')


## SelectPercentile

---

In [11]:
from sklearn.feature_selection import SelectPercentile

In [17]:
# SelectPercentile: keep the top 65% of features ranked by the ANOVA F-score.
# f_classif is SelectPercentile's default score function; it is spelled out
# here so the scoring rule is visible.
select = SelectPercentile(score_func=f_classif, percentile=65)
select.fit(x_train, y_train)

x_train_selected = select.transform(x_train)

# Report the feature count before and after selection.
print("x_train.shape: {} ".format(x_train.shape))
print("x_train_selected.shape: {} ".format(x_train_selected.shape))

# Boolean mask over the input columns: True where the feature was kept.
all_names = x_train.columns
selected_mask = select.get_support()
# Selected features
selected_names = all_names[selected_mask]
# Unselected features
unselected_names = all_names[~selected_mask]
print('Selected names: ', selected_names)
print('Unselected names: ', unselected_names)

# Stored under percentile-specific names so the SelectKBest(f_classif) results
# in `col_f_classif_selec` / `x_train_f_clf` are not overwritten by this cell.
col_percentile_selec = selected_names
x_train_percentile = x_train[col_percentile_selec]

x_train.shape: (8336, 32) 
x_train_selected.shape: (8336, 21) 
Selected names:  Index(['자산', '총자본증가율', '유동자산증가율', '자기자본증가율', '매출액순이익률', '자기자본순이익률',
       '매출원가대매출액비율', '유보율', '현금비율', '부채비율', '유동부채비율', '차입금비율', '유보액대납입자본배율',
       '투자집중도', '1인평균지급액(임원)', '1인년간평균급여(직원)', 'WW지수', '도입기', '성숙기', '성장기',
       '쇠퇴기'],
      dtype='object')
Unselected names:  Index(['대주주지분변화분', '외국인지분분변화', '비유동자산증가율', '정상영업이익증가율', '순이익증가율', '총포괄이익증가율',
       '광고선전비대매출액비율', '당좌비율', '이자보상배율(이자비용)', '토빈Q', 'RDS'],
      dtype='object')
