In [1]:
# Variant without extra data preprocessing; for use with 3 and 4

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Read the labelled and unlabelled CSV files, using the 'no' column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='no')
    for csv_path in ('./data/train.csv', './data/test.csv')
)

def strip_title(row):
    """Return a single cell value with surrounding whitespace removed.

    Despite the parameter name, ``row`` is one cell: the function is applied
    element-wise through ``Series.apply``.

    Text values (``str`` / ``bytes``) are stripped. Any other value
    (numbers, ``None``, NaN) is returned unchanged instead of raising, so one
    missing value in a text column no longer aborts the clean-up of the
    whole column.

    Parameters
    ----------
    row : object
        The cell value to clean.

    Returns
    -------
    object
        The stripped text, or ``row`` itself when it is not text.
    """
    if isinstance(row, (str, bytes)):
        return row.strip()
    return row

# Columns that are one-hot encoded below.
categorical_features = ['workclass', 'marital-status', 'occupation',
                        'education', 'relationship', 'race', 'sex', 'native-country']


def _strip_text_cells(frame):
    """Return a copy of ``frame`` with whitespace stripped from every string cell.

    Numeric columns are skipped explicitly (instead of relying on a bare
    ``except: pass``), and non-string cells such as NaN inside a text column
    are passed through untouched.
    """
    cleaned = frame.copy()
    for column in cleaned.columns:
        if pd.api.types.is_numeric_dtype(cleaned[column]):
            continue
        cleaned[column] = cleaned[column].map(
            lambda cell: cell.strip() if isinstance(cell, str) else cell)
    return cleaned


def _prepare_features(frame):
    """Apply the shared clean-up to one frame and return the encoded result.

    Steps: strip whitespace, relabel the '?' placeholder as 'other', one-hot
    encode ``categorical_features``, drop ``fnlwgt`` and sort the columns by
    name.
    """
    prepared = _strip_text_cells(frame)
    placeholder_columns = ['occupation', 'workclass', 'native-country']
    prepared[placeholder_columns] = prepared[placeholder_columns].replace('?', 'other')
    # The default prefix is the source column name, e.g. 'workclass_other'.
    prepared = pd.get_dummies(prepared, columns=categorical_features)
    return prepared.drop(columns=['fnlwgt']).sort_index(axis=1)


# The same pipeline runs on both frames; 'workclass_other' is dropped from the features.
train = _prepare_features(train).drop(columns=['workclass_other'])
test = _prepare_features(test)

X_train = train.drop(columns=['income'])
y_train = train['income']

# Encoding train and test separately can produce different dummy columns when
# a category occurs in only one file. Align the unlabelled frame to the
# training features: missing dummies become 0, unseen ones (and
# 'workclass_other') are dropped. `test` is treated as a pure feature matrix.
test = test.reindex(columns=X_train.columns, fill_value=0)

# NOTE: X_test / y_test are a hold-out carved from the labelled data, not the
# rows of test.csv (those stay available as `test`).
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=0)
# A further 10% of the remaining training rows is set aside as a validation split.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0)

In [3]:
from hyperopt import hp

# Hyperopt search space for the XGBoost tuning run:
#   max_depth        - 5 to 10 in steps of 1
#   min_child_weight - 1 to 2 in steps of 1
#   learning_rate    - sampled uniformly between 0.01 and 0.2
#   colsample_bytree - sampled uniformly between 0.4 and 1
xgb_search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 10, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 2, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    colsample_bytree=hp.uniform('colsample_bytree', 0.4, 1),
)

In [4]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from hyperopt import STATUS_OK

def objective_func(search_space):
    """Hyperopt objective: negative mean 5-fold CV accuracy of an XGBClassifier.

    Every value sampled from the search space arrives as a float, so the
    integer-valued hyperparameters (``max_depth``, ``min_child_weight``) are
    cast to ``int`` before being handed to XGBClassifier.

    Higher accuracy is better while ``fmin`` minimises, so the mean accuracy
    is multiplied by -1 to form the loss.

    Parameters
    ----------
    search_space : dict
        One sampled point with the keys 'max_depth', 'min_child_weight',
        'learning_rate' and 'colsample_bytree'.

    Returns
    -------
    dict
        ``{'loss': <negative mean accuracy>, 'status': STATUS_OK}``.
    """
    sampled_params = dict(
        max_depth=int(search_space['max_depth']),
        min_child_weight=int(search_space['min_child_weight']),
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
    )
    # n_estimators is kept at 100 to shorten each trial.
    candidate = XGBClassifier(n_estimators=100, eval_metric='logloss', **sampled_params)

    # One accuracy score per fold, evaluated on the module-level X_train / y_train.
    fold_scores = cross_val_score(candidate, X_train, y_train, scoring='accuracy', cv=5)

    return {'loss': -1 * np.mean(fold_scores), 'status': STATUS_OK}


In [5]:
from hyperopt import fmin, tpe, Trials


# Run the TPE search; `trial_val` records every evaluated trial.
trial_val = Trials()
best = fmin(
    objective_func,
    xgb_search_space,
    algo=tpe.suggest,
    max_evals=50,  # upper bound on the number of evaluated trials
    trials=trial_val,
    rstate=np.random.default_rng(seed=9),  # fixed seed for the sampler
)
print('best:', best)


100%|████████████████████████████████████████████████| 50/50 [02:22<00:00,  2.85s/trial, best loss: -0.872053826987198]
best: {'colsample_bytree': 0.48089834801608844, 'learning_rate': 0.1816741392195572, 'max_depth': 5.0, 'min_child_weight': 1.0}


In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels. Required despite the ``None`` default, which is
        kept only for signature compatibility.
    pred_proba : array-like, optional
        Scores or probabilities for the positive class. When omitted, the
        ROC-AUC is skipped instead of failing inside ``roc_auc_score``.

    Raises
    ------
    ValueError
        If ``pred`` is not supplied.
    """
    if pred is None:
        raise ValueError('pred is required to compute the classification metrics')

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC-AUC needs scores, so it is computed only when they were provided.
    roc_auc = None if pred_proba is None else roc_auc_score(y_test, pred_proba)

    print('오차 행렬')
    print(confusion)
    # With pred_proba supplied, this line is identical to the original output.
    summary = '정확도: {0:.5f}, 정밀도: {1:.4f}, 재현율: {2:.4f},    F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        summary += ', AUC:{0:.4f}'.format(roc_auc)
    print(summary)

In [7]:
# Refit with the best hyperparameters found by hyperopt, using more trees.
# `best` holds floats, so the integer-valued parameters are cast back to int.
# eval_metric and early_stopping_rounds are set on the estimator rather than
# passed to fit(): the fit()-level arguments are deprecated and rejected by
# newer xgboost releases, and objective_func already configures eval_metric
# on the constructor.
# NOTE(review): constructor-level early_stopping_rounds needs xgboost >= 1.6;
# confirm the installed version.
xgb_wrapper = XGBClassifier(n_estimators=400,
                            learning_rate=round(best['learning_rate'], 5),
                            max_depth=int(best['max_depth']),
                            min_child_weight=int(best['min_child_weight']),
                            colsample_bytree=round(best['colsample_bytree'], 5),
                            eval_metric='logloss',
                            early_stopping_rounds=100
                           )

# Log loss is reported per boosting round for both sets (validation_0 is the
# training split, validation_1 the validation split). Early stopping is
# presumably driven by the last entry - confirm against the xgboost docs.
evals = [(X_tr, y_tr), (X_val, y_val)]
xgb_wrapper.fit(X_tr, y_tr, eval_set=evals, verbose=True)

# Evaluate on the hold-out split produced by train_test_split.
preds = xgb_wrapper.predict(X_test)
pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]

get_clf_eval(y_test, preds, pred_proba)


[0]	validation_0-logloss:0.60271	validation_1-logloss:0.60180
[1]	validation_0-logloss:0.54522	validation_1-logloss:0.54406
[2]	validation_0-logloss:0.50179	validation_1-logloss:0.50034
[3]	validation_0-logloss:0.46423	validation_1-logloss:0.46229
[4]	validation_0-logloss:0.43406	validation_1-logloss:0.43192
[5]	validation_0-logloss:0.41298	validation_1-logloss:0.41061
[6]	validation_0-logloss:0.39784	validation_1-logloss:0.39567
[7]	validation_0-logloss:0.38163	validation_1-logloss:0.37933
[8]	validation_0-logloss:0.36664	validation_1-logloss:0.36397
[9]	validation_0-logloss:0.35718	validation_1-logloss:0.35487
[10]	validation_0-logloss:0.34894	validation_1-logloss:0.34662
[11]	validation_0-logloss:0.34276	validation_1-logloss:0.34059
[12]	validation_0-logloss:0.33658	validation_1-logloss:0.33463
[13]	validation_0-logloss:0.33112	validation_1-logloss:0.32992
[14]	validation_0-logloss:0.32422	validation_1-logloss:0.32337
[15]	validation_0-logloss:0.32039	validation_1-logloss:0.31944
[1



[23]	validation_0-logloss:0.29907	validation_1-logloss:0.30107
[24]	validation_0-logloss:0.29677	validation_1-logloss:0.29900
[25]	validation_0-logloss:0.29454	validation_1-logloss:0.29718
[26]	validation_0-logloss:0.29273	validation_1-logloss:0.29554
[27]	validation_0-logloss:0.29134	validation_1-logloss:0.29435
[28]	validation_0-logloss:0.29008	validation_1-logloss:0.29386
[29]	validation_0-logloss:0.28857	validation_1-logloss:0.29258
[30]	validation_0-logloss:0.28760	validation_1-logloss:0.29175
[31]	validation_0-logloss:0.28661	validation_1-logloss:0.29129
[32]	validation_0-logloss:0.28575	validation_1-logloss:0.29063
[33]	validation_0-logloss:0.28423	validation_1-logloss:0.28938
[34]	validation_0-logloss:0.28348	validation_1-logloss:0.28891
[35]	validation_0-logloss:0.28273	validation_1-logloss:0.28843
[36]	validation_0-logloss:0.28212	validation_1-logloss:0.28801
[37]	validation_0-logloss:0.28137	validation_1-logloss:0.28769
[38]	validation_0-logloss:0.28020	validation_1-logloss:

[153]	validation_0-logloss:0.24895	validation_1-logloss:0.27443
[154]	validation_0-logloss:0.24878	validation_1-logloss:0.27433
[155]	validation_0-logloss:0.24859	validation_1-logloss:0.27421
[156]	validation_0-logloss:0.24842	validation_1-logloss:0.27422
[157]	validation_0-logloss:0.24831	validation_1-logloss:0.27412
[158]	validation_0-logloss:0.24824	validation_1-logloss:0.27423
[159]	validation_0-logloss:0.24811	validation_1-logloss:0.27430
[160]	validation_0-logloss:0.24800	validation_1-logloss:0.27425
[161]	validation_0-logloss:0.24796	validation_1-logloss:0.27429
[162]	validation_0-logloss:0.24763	validation_1-logloss:0.27429
[163]	validation_0-logloss:0.24748	validation_1-logloss:0.27434
[164]	validation_0-logloss:0.24735	validation_1-logloss:0.27443
[165]	validation_0-logloss:0.24722	validation_1-logloss:0.27450
[166]	validation_0-logloss:0.24715	validation_1-logloss:0.27453
[167]	validation_0-logloss:0.24714	validation_1-logloss:0.27454
[168]	validation_0-logloss:0.24702	valid