# <span style="background-color:#fff5b1">컬럼선택법_클래스불균형_모델성능향상</span>

## 중요 컬럼 선택
- 1) EDA 통해서 종속변수(target)과 중요 관계가 있는 변수들만 선택
- 2) 수치형 변수는 상관분석 결과를 통해, 범주형 변수는 카이제곱 통계량(통계학적 방법)
- 3) 머신러닝 알고리즘을 통한 1차 분석 후 중요 변수만 선택

## Tree 계열 모델의 feature_importance 로 선택

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns

In [3]:
data = pd.read_csv('./data/salary2.csv')
# data

In [4]:
data2 = data.copy()

In [5]:
data = data.drop_duplicates()

In [6]:
data = data.dropna()

In [7]:
data = data.reset_index(drop=True)
data

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39235,53,Private,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
39236,22,Private,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
39237,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
39238,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [8]:
data['class'] = data['class'].apply(lambda x: 1 if x == ' >50K' else 0)

In [9]:
data

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39235,53,Private,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
39236,22,Private,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,0
39237,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
39238,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0


In [10]:
data = pd.get_dummies(data, drop_first=True)

### 모델 적용

In [11]:
X = data.drop('class', axis=1)
y = data['class']

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# X와 y의 비율이 동일하게 들어가게 하기 위해 stratify 사용
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y, random_state=10)

In [14]:
print(f"X_train 데이터의 수 = {len(X_train)}")
print(f"X_test 데이터의 수 = {len(X_test)}")
print()

print(f"y_train 데이터의 수 = {len(y_train)}")
print(f"y_test 데이터의 수 = {len(y_test)}")

X_train 데이터의 수 = 23544
X_test 데이터의 수 = 15696

y_train 데이터의 수 = 23544
y_test 데이터의 수 = 15696


In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [16]:
dtc = DecisionTreeClassifier(max_depth=9, random_state=10)
dtc.fit(X_train, y_train)
pred = dtc.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90     11706
           1       0.77      0.58      0.66      3990

    accuracy                           0.85     15696
   macro avg       0.82      0.76      0.78     15696
weighted avg       0.84      0.85      0.84     15696



## feature_importance 출력


In [17]:
importance = pd.DataFrame(dtc.feature_importances_)

In [18]:
cols = pd.DataFrame(dtc.feature_names_in_)

In [19]:
df_importance = pd.concat([cols, importance], axis=1)
df_importance.columns = ['column', 'importance']
important_cols = df_importance. sort_values(by='importance', ascending=False)

In [21]:
# DecisionTreeClassifier 모델이 중요하게 본 컬럼 내림차순 정렬
# 우리가 생각한 것과 다른 결과 도출 : 가장 중요하게 생각한 컬럼이 다름
# 상위 3개의 데이터가 전체 중요도의 80%를 차지 -> 상위 5개의 데이터가 91% 차지
important_cols[important_cols['importance'] > 0].head(10)

Unnamed: 0,column,importance
27,marital-status_ Married-civ-spouse,0.400573
2,capital-gain,0.216183
1,education-num,0.194695
3,capital-loss,0.078845
0,age,0.03873
4,hours-per-week,0.03111
34,occupation_ Exec-managerial,0.01152
8,workclass_ Self-emp-not-inc,0.006948
35,occupation_ Farming-fishing,0.004866
54,sex_ Male,0.003542


## Random_Forest

In [32]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
rfc = RandomForestClassifier(random_state=10)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.87      0.90      0.89     11706
           1       0.68      0.60      0.64      3990

    accuracy                           0.83     15696
   macro avg       0.77      0.75      0.76     15696
weighted avg       0.82      0.83      0.82     15696



In [34]:
importance = pd.DataFrame(rfc.feature_importances_, columns=['importance'])
columns = pd.DataFrame(rfc.feature_names_in_, columns=['column'])
rfc_importance = pd.concat([columns, importance], axis=1)
rfc_importance

Unnamed: 0,column,importance
0,age,0.229726
1,education-num,0.060181
2,capital-gain,0.122142
3,capital-loss,0.041190
4,hours-per-week,0.117826
...,...,...
90,native-country_ Thailand,0.000140
91,native-country_ Trinadad&Tobago,0.000107
92,native-country_ United-States,0.006653
93,native-country_ Vietnam,0.000283


In [35]:
# 의사결정나무 모델의 컬럼 중요도와는 다르다는 것을 확인할 수 있음
rfc_importance = rfc_importance[rfc_importance['importance'] > 0.01]. sort_values(by='importance', ascending=False)
rfc_importance

Unnamed: 0,column,importance
0,age,0.229726
2,capital-gain,0.122142
4,hours-per-week,0.117826
27,marital-status_ Married-civ-spouse,0.083482
1,education-num,0.060181
3,capital-loss,0.04119
29,marital-status_ Never-married,0.034063
34,occupation_ Exec-managerial,0.019778
54,sex_ Male,0.019321
45,relationship_ Not-in-family,0.017576


## XGBoost

In [36]:
from xgboost import XGBClassifier

In [37]:
xgb = XGBClassifier(random_state=10)
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91     11706
           1       0.77      0.67      0.71      3990

    accuracy                           0.86     15696
   macro avg       0.83      0.80      0.81     15696
weighted avg       0.86      0.86      0.86     15696



In [38]:
importance3 = pd.DataFrame(xgb.feature_importances_, columns=['importance'])
col3 = pd.DataFrame(xgb.feature_names_in_, columns=['column'])
xgb_importance = pd.concat([col3, importance3], axis=1)
xgb_importance

Unnamed: 0,column,importance
0,age,0.008742
1,education-num,0.046971
2,capital-gain,0.058502
3,capital-loss,0.024621
4,hours-per-week,0.008763
...,...,...
90,native-country_ Thailand,0.000000
91,native-country_ Trinadad&Tobago,0.004804
92,native-country_ United-States,0.003897
93,native-country_ Vietnam,0.002186


In [39]:
xgb_importance = xgb_importance[xgb_importance['importance'] > 0.01]. sort_values(by='importance', ascending=False)
xgb_importance

Unnamed: 0,column,importance
27,marital-status_ Married-civ-spouse,0.413689
2,capital-gain,0.058502
1,education-num,0.046971
38,occupation_ Other-service,0.033853
3,capital-loss,0.024621
35,occupation_ Farming-fishing,0.023159
34,occupation_ Exec-managerial,0.021343
47,relationship_ Own-child,0.015753
40,occupation_ Prof-specialty,0.01367
79,native-country_ Mexico,0.013073


In [40]:
data2.columns

Index(['age', 'workclass', 'education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'class'],
      dtype='object')

In [41]:
data2 = data2.drop_duplicates()
data2 = data2.dropna()
data3 = data2[['age','marital-status', 'capital-gain', 'capital-loss', 'sex', 'education-num', 'occupation','class']]

In [42]:
data3 = data3.reset_index(drop=True)
print(data3.columns)
data3

Index(['age', 'marital-status', 'capital-gain', 'capital-loss', 'sex',
       'education-num', 'occupation', 'class'],
      dtype='object')


Unnamed: 0,age,marital-status,capital-gain,capital-loss,sex,education-num,occupation,class
0,25,Never-married,0,0,Male,7,Machine-op-inspct,<=50K
1,38,Married-civ-spouse,0,0,Male,9,Farming-fishing,<=50K
2,28,Married-civ-spouse,0,0,Male,12,Protective-serv,>50K
3,44,Married-civ-spouse,7688,0,Male,10,Machine-op-inspct,>50K
4,34,Never-married,0,0,Male,6,Other-service,<=50K
...,...,...,...,...,...,...,...,...
39235,53,Married-civ-spouse,0,0,Male,14,Exec-managerial,>50K
39236,22,Never-married,0,0,Male,10,Protective-serv,<=50K
39237,27,Married-civ-spouse,0,0,Female,12,Tech-support,<=50K
39238,58,Widowed,0,0,Female,9,Adm-clerical,<=50K


In [43]:
data3 = pd.get_dummies(data3, drop_first=True)

In [44]:
X = data3.drop('class_ >50K', axis=1)
y = data3['class_ >50K']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y, random_state=10)

print(f"X_train 데이터의 수 = {len(X_train)}")
print(f"X_test 데이터의 수 = {len(X_test)}")
print()

print(f"y_train 데이터의 수 = {len(y_train)}")
print(f"y_test 데이터의 수 = {len(y_test)}")

X_train 데이터의 수 = 23544
X_test 데이터의 수 = 15696

y_train 데이터의 수 = 23544
y_test 데이터의 수 = 15696


In [45]:
xgb = XGBClassifier(random_state=10)
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

       False       0.89      0.93      0.91     11706
        True       0.77      0.65      0.70      3990

    accuracy                           0.86     15696
   macro avg       0.83      0.79      0.81     15696
weighted avg       0.86      0.86      0.86     15696



In [46]:
importance3 = pd.DataFrame(xgb.feature_importances_, columns=['importance'])
col3 = pd.DataFrame(xgb.feature_names_in_, columns=['column'])
xgb_importance = pd.concat([col3, importance3], axis=1)
xgb_importance = xgb_importance[xgb_importance['importance'] > 0.01]. sort_values(by='importance', ascending=False)
xgb_importance

Unnamed: 0,column,importance
5,marital-status_ Married-civ-spouse,0.473459
1,capital-gain,0.090947
3,education-num,0.057546
17,occupation_ Other-service,0.056346
14,occupation_ Farming-fishing,0.047047
13,occupation_ Exec-managerial,0.039511
2,capital-loss,0.035579
19,occupation_ Prof-specialty,0.028145
22,occupation_ Tech-support,0.021765
15,occupation_ Handlers-cleaners,0.017977


In [47]:
y.value_counts()

class_ >50K
False    29265
True      9975
Name: count, dtype: int64

## 클래스 불균형 처리
- 클래스 불균형 : 종속변수(Tagrget)의 클래스(값, 요소)의 비율이 일치하지 않는 경우
- 연봉 데이터의 경우 >50K가 1, <=50K가 3의 비율을 가지고 있음
- train_test_split하는 경우 stratify= 옵션을 반드시 포함
- 머신러닝 알고리즘에 있는 소수 클래스에 가중치 주는 옵션 on

### DecisionTree 모델의 경우 : class_weight='balanced'

In [48]:
dtc = DecisionTreeClassifier(class_weight='balanced', random_state=10)
dtc.fit(X_train, y_train)
pred = dtc.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

       False       0.90      0.81      0.85     11706
        True       0.57      0.75      0.65      3990

    accuracy                           0.79     15696
   macro avg       0.74      0.78      0.75     15696
weighted avg       0.82      0.79      0.80     15696



In [25]:
# for i in range(1,20):   
#     dtc = DecisionTreeClassifier(max_depth=i, class_weight='balanced', random_state=10)
#     dtc.fit(X_train, y_train)
#     pred = dtc.predict(X_test)
    
#     print(i)
#     print(classification_report(y_test, pred))
#     print()

### Random_Forest 모델의 경우 : class_weight='balanced'

In [49]:
rfc = RandomForestClassifier(n_estimators=100, n_jobs = -1, class_weight='balanced', random_state=10)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

       False       0.90      0.85      0.87     11706
        True       0.62      0.71      0.66      3990

    accuracy                           0.82     15696
   macro avg       0.76      0.78      0.77     15696
weighted avg       0.83      0.82      0.82     15696



In [50]:
# for i in range(1,20):
#     rfc = RandomForestClassifier(max_depth=i, n_estimators=100, n_jobs = -1, class_weight='balanced', random_state=10)
#     rfc.fit(X_train, y_train)
#     pred = rfc.predict(X_test)
    
#     print(i)
#     print(classification_report(y_test, pred))
#     print()

### Xgboost 모델의 경우 : scale_pos_weight=3

In [51]:
# for i in range(1,21):
#     xgb = XGBClassifier(max_depth=i, n_estimators=500, scale_pos_weight=3, n_jobs=-1, random_state=10)
#     xgb.fit(X_train, y_train)
#     pred = xgb.predict(X_test)
    
#     print(i)
#     print(classification_report(y_test, pred))
#     print()

### LightGBM 모델의 경우 : is_unbalance=True

In [52]:
from lightgbm import LGBMClassifier

In [24]:
# for i in range(1,21):
#     lgbm = LGBMClassifier(max_depth=i, n_estimators=500, is_unbalance=True, n_jobs=-1, random_state=10)
#     lgbm.fit(X_train, y_train)
#     pred = lgbm.predict(X_test)
    
#     print(i)
#     print(classification_report(y_test, pred))
#     print()

## 데이터를 증폭 / 축소하여 class 불균형 해소
- https://imbalanced-learn.org/stable/install.html#getting-started
- imbalanced-learn,smote/cluster-centroids
- 머신러닝 알고리즘을 통해 비슷한 데이터를 생성/축소
- **반드시 train 데이터에만 사용**

In [56]:
X_train

Unnamed: 0,age,capital-gain,capital-loss,education-num,marital-status_ Married-AF-spouse,marital-status_ Married-civ-spouse,marital-status_ Married-spouse-absent,marital-status_ Never-married,marital-status_ Separated,marital-status_ Widowed,...,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
21251,32,0,0,4,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3994,38,0,0,10,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1871,58,0,0,13,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
16563,27,0,0,9,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
16892,46,0,0,13,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15385,25,0,1340,13,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
37453,34,0,0,5,False,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,False
14106,25,0,0,9,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
20368,70,0,0,9,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [54]:
y_train.value_counts()

class_ >50K
False    17559
True      5985
Name: count, dtype: int64

In [57]:
from imblearn.over_sampling import SMOTENC

In [63]:
# list(range(4, X_train.shape[1]))

In [58]:
smt = SMOTENC(categorical_features=list(range(4, X_train.shape[1])), k_neighbors=5, n_jobs=-1,random_state=10)
# fit_resample 적용
smt_X, smt_y = smt.fit_resample(X_train, y_train)



In [59]:
len(smt_X)

35118

In [60]:
smt_y.value_counts()

class_ >50K
False    17559
True     17559
Name: count, dtype: int64

### SMOTENC로 적용한 X와 y 적용한 DecisionTree

In [64]:
dtc2 = DecisionTreeClassifier(class_weight='balanced', random_state=10)
dtc2.fit(smt_X, smt_y)
pred2 = dtc2.predict(X_test)
print(classification_report(y_test, pred2))

              precision    recall  f1-score   support

       False       0.90      0.81      0.85     11706
        True       0.57      0.73      0.64      3990

    accuracy                           0.79     15696
   macro avg       0.73      0.77      0.74     15696
weighted avg       0.81      0.79      0.80     15696



### SMOTENC로 적용한 X와 y 적용한 RandomForest

In [61]:
# for i in range(1,20):
#     rfc2 = RandomForestClassifier(max_depth=i, n_estimators=100, n_jobs = -1, class_weight='balanced', random_state=10)
#     rfc2.fit(smt_X, smt_y)
#     pred = rfc2.predict(X_test)
    
#     print(i)
#     print(classification_report(y_test, pred))
#     print()

### SMOTENC로 적용한 X와 y 적용한 XGBoost

In [66]:
# for i in range(1,21):
#     xgb = XGBClassifier(max_depth=i, n_estimators=500, scale_pos_weight=3, n_jobs=-1, random_state=10)
#     xgb.fit(smt_X, smt_y)
#     pred = xgb.predict(X_test)
    
#     print(i)
#     print(classification_report(y_test, pred))
#     print()

### SMOTENC로 적용한 X와 y 적용한 LightGBM

# K-Fold, 하이퍼파라미터 튜닝
- K-Fold 교차검증 : 훈련에 사용할 데이터를 여러 개로 나누어 편향을 줄이고 모델을 균형잡히게 하는 방법
- 하이퍼파라미터 튜닝 : 머신러닝 알고리즘에 있는 여러 파라미터를 조정하여 최고의 성능을 내는 모델 생성하는 것
    - max_depth, n_estimators, learning_reate 등
- gridsearch : 지정해 놓은 모든 경우의 수의 하이퍼파라미터를 탐색
- randomsearch : 지정해 놓은 범위 내에서 랜덤하게 탐색

In [67]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [69]:
model = DecisionTreeClassifier(random_state=10)

In [70]:
params = dict(criterion=['gini', 'entropy', 'log_loss'],
             max_depth = [1, 5, 10, 15],
             random_state=[7,10, 777],
             class_weight = [None, 'balanced'])

In [71]:
grid_cv = GridSearchCV(model, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train)
pred = grid_cv.predict(X_test)

print('bset_params: ', grid_cv.best_params_)
print('best_scores: ', grid_cv.best_score_)

bset_params:  {'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'random_state': 777}
best_scores:  0.8497706422018348


In [112]:
model2 = DecisionTreeClassifier(criterion= 'gini', max_depth=10, random_state=777)
model2.fit(X_train, y_train)
pred2 = model2.predict(X_test)
print(classification_report(y_test, pred2))

              precision    recall  f1-score   support

       False       0.87      0.94      0.90     11706
        True       0.76      0.57      0.66      3990

    accuracy                           0.85     15696
   macro avg       0.81      0.76      0.78     15696
weighted avg       0.84      0.85      0.84     15696



In [72]:
params = dict(criterion=['gini', 'entropy', 'log_loss'],
             max_depth = [8,9,10,11,12],
             random_state=[777],
             class_weight = [None, 'balanced'])

In [73]:
grid_cv = GridSearchCV(model, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train)
pred = grid_cv.predict(X_test)

print('bset_params: ', grid_cv.best_params_)
print('best_scores: ', grid_cv.best_score_)

bset_params:  {'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'random_state': 777}
best_scores:  0.8497706422018348


### gridSearch에서 score 기준 변경
- scoring=[accuracy, precision, recall, f1, roc_auc]

### class 0에 대한 score를 계산하고 싶은 경우

In [75]:
from sklearn.metrics import make_scorer, recall_score, precision_score, accuracy_score

In [76]:
recall_0_scorer = make_scorer(recall_score, pos_label=0)

In [78]:
grid_cv = GridSearchCV(model, param_grid=params, scoring=recall_0_scorer,cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train)
pred = grid_cv.predict(X_test)

print('bset_params: ', grid_cv.best_params_)
print('best_scores: ', grid_cv.best_score_)

bset_params:  {'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'random_state': 777}
best_scores:  0.9423657383677887


In [79]:
model3 = DecisionTreeClassifier(criterion= 'gini', max_depth=10, random_state=777)
model3.fit(X_train, y_train)
pred3 = model3.predict(X_test)
print(classification_report(y_test, pred3))

              precision    recall  f1-score   support

       False       0.87      0.94      0.90     11706
        True       0.76      0.57      0.66      3990

    accuracy                           0.85     15696
   macro avg       0.81      0.76      0.78     15696
weighted avg       0.84      0.85      0.84     15696



# RandomSearch
- 지정한 파라미터 값에서 랜덤성을 더해 최적 값을 찾아냄

In [80]:
from sklearn.model_selection import RandomizedSearchCV

In [81]:
model = DecisionTreeClassifier(random_state=10)

In [82]:
params = dict(criterion=['gini', 'entropy', 'log_loss'],
             max_depth = [1, 5, 10, 15],
             random_state=[7,10, 777],
             class_weight = [None, 'balanced'])

In [84]:
rand_cv = RandomizedSearchCV(model, param_distributions=params, scoring='f1',
                            cv=3, n_jobs=-1, random_state=10)
rand_cv.fit(X_train, y_train)
pred = rand_cv.predict(X_test)

print('bset_params: ', rand_cv.best_params_)
print('best_scores: ', rand_cv.best_score_)

bset_params:  {'random_state': 10, 'max_depth': 15, 'criterion': 'entropy', 'class_weight': 'balanced'}
best_scores:  0.6847937213314063


In [85]:
model4 = DecisionTreeClassifier(random_state=10, max_depth=15, criterion='entropy', class_weight='balanced')
model4.fit(X_train, y_train)
pred4 = model4.predict(X_test)
print(classification_report(y_test, pred4))

              precision    recall  f1-score   support

       False       0.94      0.78      0.85     11706
        True       0.56      0.85      0.68      3990

    accuracy                           0.79     15696
   macro avg       0.75      0.81      0.76     15696
weighted avg       0.84      0.79      0.80     15696



### XGBoost를 랜덤서치로 하이퍼파라마터 튜닝

In [86]:
xgb = XGBClassifier(n_jobs=-1, random_state=10)

In [88]:
xgb_params = dict(n_estimators=[100, 500, 1000],
             max_depth = [1, 5, 10, 15, 20],
             learning_rate=[0.1,0.5,1],
             booster = ['gbtree', 'gblinear', 'dart'],
            scale_pos_weight=[0,3])

In [89]:
rand_cv = RandomizedSearchCV(xgb, param_distributions=xgb_params,
                            cv=3, scoring='roc_auc', random_state=10)
rand_cv.fit(X_train, y_train)
pred = rand_cv.predict(X_test)

print('bset_params: ', rand_cv.best_params_)
print('best_scores: ', rand_cv.best_score_)

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.



bset_params:  {'scale_pos_weight': 3, 'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.5, 'booster': 'gbtree'}
best_scores:  0.9191755229522637


In [90]:
from sklearn.metrics import roc_auc_score

In [91]:
xgb2 = XGBClassifier(n_jobs=-1, random_state=10,
                    scale_pos_weight=3, n_estimators= 1000, max_depth= 1, learning_rate= 0.5, booster= 'gbtree')
xgb2.fit(X_train, y_train)
pred2 = xgb2.predict(X_test)
print(roc_auc_score(y_test, pred2))

0.8331782171985577


## 데이터 분석 과정 재정리
- 1) 데이터 선택 및 로딩
- 2) 결측값, 이상값 탐지
- 3) EDA(탐색적 데이터 분석)
- 4) 변수 데이터타입, 서열변수, 카테고리변수 처리
- 5) 변수 선택, 파생변수 생성
- 6) 홀드아웃(훈련데이터, 테스트 데이터로 분리)
- 7) 분석에 맞는 알고리즘 불러와서 모델 생성
- 8) 분석 목적에 맞는 지표를 이용해 모델 성능 검증
- 9) 하이퍼파라미터 튜닝(클래스 불균형 해소(모델에서 소수 클래스 가중치 조절/데이터 증폭 및 축소)
- 10) 최종 모델 선택