In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Load both splits; the 'no' column of each CSV becomes the row index.
train, test = (
    pd.read_csv(csv_path, index_col='no')
    for csv_path in ('./data/train.csv', './data/test.csv')
)

def strip_title(row):
    """Return ``row`` with leading and trailing whitespace removed.

    Args:
        row: A single cell value.

    Returns:
        The stripped text when ``row`` is a ``str`` (or ``bytes``); any other
        value (numbers, NaN, None, ...) is returned unchanged, so the helper
        can be applied to every column without raising ``AttributeError``.
    """
    if isinstance(row, (str, bytes)):
        return row.strip()
    return row

# Strip surrounding whitespace from every text cell of train and test.
# Only text (object/string dtype) columns are visited and only cells that are
# actually strings are stripped, so nothing has to be swallowed with a bare
# `except`: numeric columns are skipped up front, and a missing value inside a
# text column no longer causes the whole column to be left unstripped.
for frame in (train, test):
    for column in frame.select_dtypes(include=['object', 'string']).columns:
        frame[column] = frame[column].map(
            lambda cell: strip_title(cell) if isinstance(cell, str) else cell
        )


# Treat the '?' placeholder as its own 'other' category, in both splits.
for frame in (train, test):
    for column in ('occupation', 'workclass', 'native-country'):
        frame[column] = frame[column].replace('?', 'other')

# Remove 'fnlwgt' and 'education' from both splits.
# (An earlier variant, now disabled, also dropped 'capital-gain' and
# 'capital-loss'.)
for frame in (train, test):
    frame.drop(['fnlwgt', 'education'], axis=1, inplace=True)

# Collapse native-country into two levels: 'United-States' (which also absorbs
# the unknown 'other' rows) versus 'other-country' for every remaining value,
# including missing ones (unmapped values become NaN and are then filled).
#
# The filled Series is assigned back to the column. Calling
# frame['native-country'].fillna(..., inplace=True) is a chained in-place
# update on an intermediate object: pandas 2.x warns about it and under
# Copy-on-Write it leaves the DataFrame unchanged.
us_labels = {'United-States': 'United-States', 'other': 'United-States'}
for frame in (train, test):
    frame['native-country'] = (
        frame['native-country'].map(us_labels).fillna('other-country')
    )

# Columns to one-hot encode.
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# get_dummies(columns=...) replaces each listed column with indicator columns
# named '<column>_<category>', appended after the untouched columns -- the same
# layout the previous concat-and-drop loop produced, without repeated concats.
train = pd.get_dummies(train, columns=categorical_features)
test = pd.get_dummies(test, columns=categorical_features)

# The two frames are encoded independently, so a category that occurs in only
# one of them would give train and test different columns. Align test to the
# training feature columns: indicators missing from test are added as all-zero
# columns, and indicators unseen during training are dropped.
# NOTE(review): assumes test carries no 'income' target column -- confirm.
feature_columns = train.columns.drop('income', errors='ignore')
test = test.reindex(columns=feature_columns, fill_value=0)


# Put the columns of both frames in alphabetical order so they share a layout.
train.sort_index(axis=1, inplace=True)
test.sort_index(axis=1, inplace=True)

# Separate the 'income' target from the feature columns. The features inherit
# train's sorted column order, so no further sorting is needed.
X_train = train.drop(['income'], axis=1)
y_train = train.income

# Hold out part of the labelled data for evaluation (scikit-learn's default
# test size). X_test / y_test therefore come from train.csv; the previous
# `X_test = test` assignment was overwritten by this line and has been removed.
# The unlabelled competition data stays available as `test`.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=0)

In [48]:
# Carve 10% of the training portion off as a validation set.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=0,
)

In [39]:
from hyperopt import hp

# Hyperopt search space for the XGBoost hyper-parameters:
#   max_depth        - 6 to 12 in steps of 1 (quantised uniform)
#   min_child_weight - 1 to 2 in steps of 1 (quantised uniform)
#   learning_rate    - uniform between 0.01 and 0.2
#   colsample_bytree - uniform between 0.4 and 1
xgb_search_space = {
    'max_depth': hp.quniform('max_depth', 6, 12, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 2, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 1),
}

In [40]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from hyperopt import STATUS_OK

# Every value fmin() samples from the search space arrives as a float, so the
# integer-valued XGBClassifier hyper-parameters are cast with int().
# Higher accuracy is better while fmin() minimises, hence the score is negated.
def objective_func(search_space):
    """Hyperopt objective: negated mean 5-fold cross-validated accuracy.

    Args:
        search_space: Mapping with the sampled 'max_depth', 'min_child_weight',
            'learning_rate' and 'colsample_bytree' values.

    Returns:
        Dict with 'loss' (-1 * mean accuracy over the 5 folds, evaluated on the
        module-level X_train / y_train) and 'status' (STATUS_OK).
    """
    # n_estimators is kept at 100 to keep each evaluation cheap.
    params = dict(
        n_estimators=100,
        max_depth=int(search_space['max_depth']),
        min_child_weight=int(search_space['min_child_weight']),
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        eval_metric='logloss',
    )
    fold_scores = cross_val_score(
        XGBClassifier(**params), X_train, y_train, scoring='accuracy', cv=5
    )

    # One accuracy per fold; average them and flip the sign for minimisation.
    return {'loss': -1 * np.mean(fold_scores), 'status': STATUS_OK}


In [41]:
from hyperopt import fmin, tpe, Trials

# Run the TPE search over xgb_search_space. trial_val records every
# evaluation; the seeded generator passed as rstate fixes the sampling sequence.
trial_val = Trials()
best = fmin(
    fn=objective_func,
    space=xgb_search_space,
    algo=tpe.suggest,
    max_evals=100,  # upper bound on the number of evaluations
    trials=trial_val,
    rstate=np.random.default_rng(seed=9),
)
print('best:', best)


100%|█████████████████████████████████████████████| 100/100 [03:55<00:00,  2.36s/trial, best loss: -0.8717353038909093]
best: {'colsample_bytree': 0.49078457271407105, 'learning_rate': 0.09313005317333836, 'max_depth': 9.0, 'min_child_weight': 2.0}


In [42]:
# Report the tuned values: floats rounded to 5 decimals, tree settings as ints.
tuned_values = (
    round(best['colsample_bytree'], 5),
    round(best['learning_rate'], 5),
    int(best['max_depth']),
    int(best['min_child_weight']),
)
print('colsample_bytree:{0}, learning_rate:{1}, max_depth:{2}, min_child_weight:{3}'.format(*tuned_values))

colsample_bytree:0.49078, learning_rate:0.09313, max_depth:9, min_child_weight:2


In [45]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a set of predictions.

    Accuracy, precision, recall and F1 are computed from ``pred`` with the
    scikit-learn defaults. ROC-AUC is computed from ``pred_proba`` and is
    skipped when ``pred_proba`` is None, instead of failing inside
    ``roc_auc_score``.

    Args:
        y_test: True labels.
        pred: Predicted class labels.
        pred_proba: Scores/probabilities used for ROC-AUC, or None to omit AUC.

    Returns:
        None. The results are printed.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC-AUC needs scores; it is only available when pred_proba was supplied.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else None

    print('오차 행렬')
    print(confusion)
    # The spacing before 'F1' reproduces the output of the original
    # backslash-continued format string exactly.
    summary = '정확도: {0:.5f}, 정밀도: {1:.4f}, 재현율: {2:.4f},    F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        summary += ', AUC:{0:.4f}'.format(roc_auc)
    print(summary)

In [46]:
import inspect

# Early-stopping configuration. XGBoost 1.6 deprecated passing
# `early_stopping_rounds` / `eval_metric` to fit() in favour of the estimator
# constructor, and newer releases reject them in fit(); releases before 1.6
# only honour them in fit(). Route them to whichever place the installed
# version accepts, so the cell runs on both old and current XGBoost.
early_stop_kwargs = {'early_stopping_rounds': 50, 'eval_metric': 'logloss'}
fit_takes_early_stop = (
    'early_stopping_rounds' in inspect.signature(XGBClassifier.fit).parameters
)
if fit_takes_early_stop:
    ctor_kwargs, fit_kwargs = {}, early_stop_kwargs
else:
    ctor_kwargs, fit_kwargs = early_stop_kwargs, {}

# Final model: the tuned hyper-parameters with a larger tree budget (400);
# early stopping decides how many rounds are actually used.
xgb_wrapper = XGBClassifier(n_estimators=400,
                            learning_rate=round(best['learning_rate'], 5),
                            max_depth=int(best['max_depth']),
                            min_child_weight=int(best['min_child_weight']),
                            colsample_bytree=round(best['colsample_bytree'], 5),
                            **ctor_kwargs
                           )

# Both sets are logged each round (validation_0 = X_tr, validation_1 = X_val).
evals = [(X_tr, y_tr), (X_val, y_val)]
xgb_wrapper.fit(X_tr, y_tr, eval_set=evals, verbose=True, **fit_kwargs)

# Evaluate on the held-out split.
preds = xgb_wrapper.predict(X_test)
pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]

get_clf_eval(y_test, preds, pred_proba)


[0]	validation_0-logloss:0.64455	validation_1-logloss:0.64427
[1]	validation_0-logloss:0.60497	validation_1-logloss:0.60440
[2]	validation_0-logloss:0.56985	validation_1-logloss:0.56910
[3]	validation_0-logloss:0.54150	validation_1-logloss:0.54046
[4]	validation_0-logloss:0.52040	validation_1-logloss:0.51900
[5]	validation_0-logloss:0.49672	validation_1-logloss:0.49616
[6]	validation_0-logloss:0.48043	validation_1-logloss:0.47977
[7]	validation_0-logloss:0.46482	validation_1-logloss:0.46434
[8]	validation_0-logloss:0.45190	validation_1-logloss:0.45140
[9]	validation_0-logloss:0.43842	validation_1-logloss:0.43858
[10]	validation_0-logloss:0.42202	validation_1-logloss:0.42178
[11]	validation_0-logloss:0.41113	validation_1-logloss:0.41056
[12]	validation_0-logloss:0.39629	validation_1-logloss:0.39560
[13]	validation_0-logloss:0.38854	validation_1-logloss:0.38827
[14]	validation_0-logloss:0.37720	validation_1-logloss:0.37684
[15]	validation_0-logloss:0.36871	validation_1-logloss:0.36901
[1



[25]	validation_0-logloss:0.31528	validation_1-logloss:0.32104
[26]	validation_0-logloss:0.31022	validation_1-logloss:0.31630
[27]	validation_0-logloss:0.30607	validation_1-logloss:0.31216
[28]	validation_0-logloss:0.30331	validation_1-logloss:0.31009
[29]	validation_0-logloss:0.29922	validation_1-logloss:0.30626
[30]	validation_0-logloss:0.29610	validation_1-logloss:0.30355
[31]	validation_0-logloss:0.29378	validation_1-logloss:0.30231
[32]	validation_0-logloss:0.29207	validation_1-logloss:0.30105
[33]	validation_0-logloss:0.28961	validation_1-logloss:0.29900
[34]	validation_0-logloss:0.28701	validation_1-logloss:0.29691
[35]	validation_0-logloss:0.28535	validation_1-logloss:0.29518
[36]	validation_0-logloss:0.28423	validation_1-logloss:0.29420
[37]	validation_0-logloss:0.28225	validation_1-logloss:0.29221
[38]	validation_0-logloss:0.28134	validation_1-logloss:0.29165
[39]	validation_0-logloss:0.27958	validation_1-logloss:0.29044
[40]	validation_0-logloss:0.27771	validation_1-logloss:

[155]	validation_0-logloss:0.23285	validation_1-logloss:0.27407
[156]	validation_0-logloss:0.23280	validation_1-logloss:0.27405
[157]	validation_0-logloss:0.23256	validation_1-logloss:0.27411
[158]	validation_0-logloss:0.23251	validation_1-logloss:0.27408
[159]	validation_0-logloss:0.23247	validation_1-logloss:0.27414
[160]	validation_0-logloss:0.23244	validation_1-logloss:0.27412
[161]	validation_0-logloss:0.23240	validation_1-logloss:0.27409
[162]	validation_0-logloss:0.23237	validation_1-logloss:0.27413
[163]	validation_0-logloss:0.23225	validation_1-logloss:0.27421
[164]	validation_0-logloss:0.23219	validation_1-logloss:0.27421
[165]	validation_0-logloss:0.23211	validation_1-logloss:0.27424
[166]	validation_0-logloss:0.23208	validation_1-logloss:0.27421
[167]	validation_0-logloss:0.23206	validation_1-logloss:0.27424
[168]	validation_0-logloss:0.23202	validation_1-logloss:0.27423
[169]	validation_0-logloss:0.23171	validation_1-logloss:0.27422
[170]	validation_0-logloss:0.23167	valid