In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [40]:
df = pd.read_csv('../csv/preprocessed/7_label_blood_sugar.csv')

In [None]:
target = df.pop('식전혈당(공복혈당)')

x = df.values
y = target.values.reshape(-1, 1)

In [41]:
columns = df.columns
columns

Index(['성별코드', '연령대코드(5세단위)', '시도코드', '신장(5Cm단위)', '체중(5Kg단위)', '허리둘레',
       '시력(좌)', '시력(우)', '수축기혈압', '이완기혈압', '총콜레스테롤', '트리글리세라이드', 'HDL콜레스테롤',
       'LDL콜레스테롤', '혈색소', '요단백', '혈청크레아티닌', '(혈청지오티)AST', '(혈청지오티)ALT',
       '감마지티피', '흡연상태', '음주여부', 'BMI'],
      dtype='object')

In [None]:
def get_feature_selection_result(selector, importances):
    result = []
    is_selected = selector.get_support()

    for idx in range(len(columns)):
        selected = True if is_selected[idx] == 1 else False
        result.append([columns[idx], importances[idx], selected])
    result.sort(key=lambda x: abs(x[1]), reverse=True)

    df_to_print = pd.DataFrame(result, columns=['column', 'importance', 'selected'])
    return df_to_print

In [None]:
def result_to_plot(model, result):
    base_line = 1.0 / 23
    colors = []
    for value in result['importance']:
        colors.append('blue' if value < base_line else 'grey')

    plt.bar(result['feature'], result['importance'], color=colors)
    plt.axhline(y=base_line, color='r', linestyle='-')

    plt.title('Feature Selection with %s' %(model))
    plt.ylabel('Importance')
    plt.xlabel('Feature')
    plt.show()

# RandomForestClassifier

In [43]:
selector_RFC = SelectFromModel(estimator=RandomForestClassifier()).fit(x, y)


  self.estimator_.fit(X, y, **fit_params)


In [44]:
selected_RFC = columns[selector_RFC.get_support()]
selected_RFC

Index(['연령대코드(5세단위)', '허리둘레', '수축기혈압', '이완기혈압', '총콜레스테롤', '트리글리세라이드',
       'HDL콜레스테롤', 'LDL콜레스테롤', '혈색소', '(혈청지오티)AST', '(혈청지오티)ALT', '감마지티피',
       'BMI'],
      dtype='object')

In [None]:
result_RFC = get_feature_selection_result(selector_RFC, selector_RFC.estimator_.feature_importances_)
result_RFC

In [None]:
result_to_plot('Random Forest Classifier', result_RFC)

# GradientBoostingClassifier

In [45]:
selector_GBC = SelectFromModel(estimator=GradientBoostingClassifier()).fit(x, y)

  return f(*args, **kwargs)


In [46]:
selected_GBC = columns[selector_GBC.get_support()]
selected_GBC

Index(['연령대코드(5세단위)', '허리둘레', '총콜레스테롤', '트리글리세라이드', 'LDL콜레스테롤', '요단백',
       '(혈청지오티)AST', '감마지티피'],
      dtype='object')

In [None]:
result_GBC = get_feature_selection_result(selector_GBC, selector_GBC.estimator_.feature_importances_)
result_GBC

In [None]:
result_to_plot('Gradient Boosting Classifier', result_GBC)

# LogisticRegression

In [47]:
selector_LR = SelectFromModel(estimator=LogisticRegression()).fit(x, y)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
selected_LR = columns[selector_LR.get_support()]
selected_LR

Index(['성별코드', '연령대코드(5세단위)', '신장(5Cm단위)', '체중(5Kg단위)', '혈색소', '요단백', '흡연상태',
       'BMI'],
      dtype='object')

In [None]:
result_LR = get_feature_selection_result(selector_LR, selector_FR.estimator_.coef_[0])
result_LR

In [None]:
result_to_plot('Logistic Regression', result_LR)

# DecisionTreeClassifier

In [49]:
selector_DTC = SelectFromModel(estimator=DecisionTreeClassifier()).fit(x, y)

In [50]:
selected_DTC = columns[selector_DTC.get_support()]
selected_DTC

Index(['연령대코드(5세단위)', '허리둘레', '수축기혈압', '이완기혈압', '총콜레스테롤', '트리글리세라이드',
       'HDL콜레스테롤', 'LDL콜레스테롤', '혈색소', '(혈청지오티)AST', '(혈청지오티)ALT', '감마지티피',
       'BMI'],
      dtype='object')

In [88]:
result_DTC = get_feature_selection_result(selector_DTC, selector_DTC.estimator_.feature_importances_)
result_DTC

Unnamed: 0,column,importance,selected
0,트리글리세라이드,0.073994,True
1,감마지티피,0.07273,True
2,LDL콜레스테롤,0.072346,True
3,혈색소,0.062942,True
4,허리둘레,0.061283,True
5,총콜레스테롤,0.05797,True
6,(혈청지오티)ALT,0.057186,True
7,HDL콜레스테롤,0.056546,True
8,(혈청지오티)AST,0.055988,True
9,수축기혈압,0.054277,True


In [None]:
result_to_plot('Decision Tree Classifier', result_DTC)