In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
import decimal
# Handle on the active decimal context so its rounding mode can be changed.
context = decimal.getcontext()

# Round to nearest with ties going away from zero; this applies to the
# Decimal.quantize() calls used when printing AUC values to three decimals.
context.rounding = decimal.ROUND_HALF_UP

In [4]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000):
    """Bootstrap the training rows and return a percentile interval of the test ROC AUC.

    Every replicate draws ``X_train.shape[0]`` row positions with replacement from
    the notebook-level random state ``rs``, refits ``clf`` on those rows, scores the
    class-1 probabilities on the fixed test set and appends the AUC to the
    notebook-level list ``auc_bootstrap``.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict_proba``. It is refit in place on
        every replicate, so after the call it holds the fit from the last resample.
    X_train, y_train : objects that accept an integer array as a positional row
        index (the notebook passes NumPy arrays).
    X_test : passed unchanged to ``clf.predict_proba``.
    y_test : true test labels; must provide ``.ravel()``.
    nsamples : number of bootstrap replicates.

    Returns
    -------
    The 2.5th and 97.5th percentiles of everything stored in ``auc_bootstrap``,
    including values that were already in the list before the call.
    """
    for _ in range(nsamples):
        n_rows = X_train.shape[0]
        sample_rows = rs.randint(n_rows, size=n_rows)
        clf.fit(X_train[sample_rows], y_train[sample_rows])
        positive_proba = clf.predict_proba(X_test)[:, 1]
        auc_bootstrap.append(roc_auc_score(y_test.ravel(), positive_proba.ravel()))
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [5]:
class EarlyStoppingCallback(object):
    """Optuna study callback that stops the study when the best value stalls.

    After each trial the callback compares ``study.best_value`` with the best
    value it has seen so far. An improvement resets the patience counter; anything
    else increments it. Once the counter reaches ``early_stopping_rounds`` the
    callback calls ``study.stop()``.

    Parameters
    ----------
    early_stopping_rounds : int
        Number of consecutive non-improving callback invocations tolerated.
    direction : {"minimize", "maximize"}
        Optimization direction of the study; decides whether "better" means
        smaller or larger.

    Raises
    ------
    ValueError
        If ``direction`` is neither ``"minimize"`` nor ``"maximize"``.
    """

    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        # Consecutive invocations without a new best value.
        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            # Must be raised: otherwise the instance is left without _operator /
            # _score and only fails later, inside the study, with AttributeError.
            raise ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        """Update the patience counter from ``study.best_value`` and stop if exhausted.

        ``trial`` is accepted to match the callback call signature but is not used.
        NOTE(review): ``study.best_value`` is read unconditionally; presumably at
        least one trial has completed when this runs - confirm for studies whose
        first trial can fail or be pruned.
        """
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [6]:
# Raise Optuna's log threshold to WARNING so lower-level messages are not printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [7]:
# Load the newlywed-household survey export (CP949-encoded CSV) and give the
# survey question on willingness to move into public rental housing the short
# name 'target'.
신혼가구 = pd.read_csv('신혼가구_변수추가.csv', encoding='cp949').rename(
    columns={'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?': 'target'}
)

In [8]:
# Relabel every column positionally with a short name; the list must therefore
# follow the CSV column order exactly and end with the label column 'target'.
# Names starting with 'Cat_' are the ones later cells treat as one-hot encoded
# (they test the prefix with startswith('Cat_')); presumably these are the
# categorical survey items - TODO confirm against the dtypes.
신혼가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'Cat_주택 마련 예상 소요연수','Cat_남편/아내의 부모님과 동거 의향','Cat_가족계획 시 중요 고려 사항 1순위',
    'target'    
]

In [9]:
# Split the frame by dtype: object columns go to `cat`, all other columns to `num`.
target = 신혼가구['target']
num = 신혼가구.select_dtypes(exclude='object')
cat = 신혼가구.select_dtypes(include='object')
# `num` still contains the label column; remove it to keep only numeric predictors.
num_신혼 = num.drop(columns='target')

In [10]:
# Scale the numeric predictors with RobustScaler and wrap the result back into a
# DataFrame that keeps the original column names (the index is reset to 0..n-1).
# NOTE(review): the scaler is fit on all rows, before the train/validation/test
# split, so held-out rows influence the scaling statistics - confirm this is intended.
scaler = RobustScaler()
num_scaled_신혼 = scaler.fit_transform(num_신혼)
num_df_scaled_신혼 = pd.DataFrame(num_scaled_신혼, columns=num_신혼.columns)

In [11]:
# One-hot encode every object-dtype column. The encoder output is densified with
# .toarray() and labelled with the encoder's generated feature names, which are
# built from the original column names.
enc = OneHotEncoder()
X_cat = enc.fit_transform(cat).toarray()

new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(data=X_cat, columns=new_feature_names)

In [12]:
# Reassemble the modelling table column-wise: scaled numerics, the label, then the
# one-hot block. concat aligns on the index; all three pieces are expected to share
# the default 0..n-1 index - TODO confirm the CSV is read without a custom index.
comp =pd.concat([num_df_scaled_신혼, target,cat2],axis=1)

In [13]:
# Separate the label from the predictors and display the predictor matrix shape.
y = comp['target']
X = comp.drop(columns='target')
X.shape

(6119, 221)

In [14]:
# Hold out 20% of the rows as the test set, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)

In [15]:
# Carve a validation set (20% of the remaining training rows) for hyperparameter search.
# X_train / y_train are overwritten here, so re-running this cell on its own
# would shrink the training split a second time.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [16]:
def objective(trial):
    """Score one sampled XGBoost configuration by its validation ROC AUC.

    The classifier is fit on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_val`` / ``y_val`` using the predicted probability of class 1.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # top to bottom in the order listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still appear in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximize validation AUC with a seeded TPE sampler. The search ends after 200
# trials, or earlier once 10 consecutive trials bring no new best value.
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to act on - confirm whether it is needed.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [17]:
# Hyperparameters of the best trial found by the study.
print(study.best_trial.params)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}


In [18]:
# Validation AUC of the best trial (the value objective() returned for it).
optuna_auc = study.best_trial.value
print(optuna_auc)

0.7934665564821075


In [19]:
# Re-create the classifier from the best trial's hyperparameters, with the same seed as in objective().
xgb_optuna_0 = XGBClassifier(**study.best_trial.params, random_state = 0)

In [20]:
# Fit on the training split only; the validation rows are not added back.
xgb_optuna_0.fit(X_train, y_train)

In [21]:
# Test-set ROC AUC from the class-1 probabilities, printed to three decimals via
# Decimal.quantize (rounding follows the active decimal context).
xgb_optuna_0_proba = xgb_optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, xgb_optuna_0_proba)
print(decimal.Decimal(auc_0).quantize(decimal.Decimal('1.000')))

0.778


In [22]:
# bootstrap_auc() indexes rows positionally with an integer array, so the training
# split is converted from pandas objects to NumPy arrays. np.asarray() also accepts
# arrays, which keeps this cell safe to re-run (``.values`` would raise
# AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [23]:
# Print NumPy arrays in full: no summarization threshold and no line wrapping.
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

In [24]:
# Fresh accumulator: bootstrap_auc() appends one AUC per replicate to this global list.
auc_bootstrap = []

In [25]:
# Seed the global RandomState that bootstrap_auc() draws from, then run 2000 replicates.
# The displayed pair is the 2.5th / 97.5th percentile of the bootstrap AUCs.
# bootstrap_auc() refits xgb_optuna_0 in place, so afterwards the estimator holds
# the fit from the last resample, not the fit evaluated above.
rs = RandomState(seed = 2024)
bootstrap_auc(xgb_optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75485932, 0.77674339])

In [26]:
# Mean test AUC over the bootstrap replicates.
np.mean(auc_bootstrap)

0.7659957172761449

In [27]:
# Keep this round's bootstrap AUCs under their own name. t_0 is the same list
# object as auc_bootstrap; it is preserved because later cells rebind
# auc_bootstrap to a new list rather than clearing it.
t_0 = auc_bootstrap
print(t_0)

[0.7667410714285714, 0.7585921479835953, 0.7654621283321941, 0.7725590610047847, 0.7613289260082022, 0.7668398624401913, 0.7666235902255639, 0.7747244531784006, 0.7630537636705399, 0.7657371411483254, 0.7641831852358167, 0.7686982014695831, 0.772334778708134, 0.7681668660287081, 0.773974175495557, 0.7700812756322624, 0.765053614149009, 0.7549929511278196, 0.7722573479152426, 0.7615104878673957, 0.7688770933014354, 0.772564401059467, 0.7658145719412167, 0.7660522043745727, 0.7667384014012304, 0.7605973385167464, 0.76018615430622, 0.7683457578605606, 0.7742865686944633, 0.77296757518797, 0.7682496368762817, 0.762186004784689, 0.7734455100820232, 0.770399008885851, 0.7540477614490773, 0.7642098855092276, 0.7613796565276829, 0.7680066643882433, 0.7690453050239234, 0.7641751751537935, 0.7724415798017771, 0.7600286226930963, 0.7597455997949418, 0.7710825358851674, 0.764989533492823, 0.7552893241626795, 0.7685887303485988, 0.7629189272898155, 0.7705538704716336, 0.7681481758373206, 0.76199109

In [28]:
# Remove this round's splits and study so the next round cannot silently reuse them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [29]:
# 1. Feature removed in elimination round 1
#    (basic-livelihood-security recipient household, yes/no).
column_to_drop = 'Cat_기초생활보장 수급가구 여부'

In [30]:
# Remove one feature from the full table `comp`.
# A numeric feature is a single column. A 'Cat_' feature was one-hot encoded into
# '<feature>_<category>' columns, so all of those are removed together.
# The match is a literal prefix test that includes the '_' separator: a regex built
# from the raw name would misread characters such as '(' or '.', and a bare prefix
# could also catch a different feature whose name starts with the same text.
if column_to_drop.startswith('Cat_'):
    cols_to_drop = [c for c in comp.columns if c.startswith(column_to_drop + '_')]
    if not cols_to_drop:
        # Mirror DataFrame.drop's behavior for a missing label instead of silently dropping nothing.
        raise KeyError(f"no one-hot columns found for {column_to_drop!r}")
else:
    cols_to_drop = [column_to_drop]

comp_1 = comp.drop(columns=cols_to_drop)
X_1 = comp_1.drop(columns='target')
y_1 = comp_1['target']

print(X_1.shape)

(6119, 219)


In [31]:
# Test split (20%) followed by validation split (20% of the rest) on the reduced
# table, using the same seed and stratification settings as the baseline round.
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [32]:
def objective(trial):
    """Score one sampled XGBoost configuration by its validation ROC AUC.

    The classifier is fit on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_val`` / ``y_val`` using the predicted probability of class 1.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # top to bottom in the order listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still appear in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximize validation AUC with a seeded TPE sampler. The search ends after 200
# trials, or earlier once 10 consecutive trials bring no new best value.
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to act on - confirm whether it is needed.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [33]:
# Hyperparameters of the best trial for the round-1 feature set.
print(study.best_trial.params)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}


In [34]:
# Validation AUC of the best trial (the value objective() returned for it).
optuna_auc = study.best_trial.value
print(optuna_auc)

0.7934665564821075


In [35]:
# Rebuild the classifier from the best trial's hyperparameters and fit it on the
# training split of the reduced feature set (validation rows are not added back).
xgb_optuna_1 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_1.fit(X_train, y_train)

In [36]:
# Test-set ROC AUC from the class-1 probabilities, printed to three decimals via
# Decimal.quantize (rounding follows the active decimal context).
xgb_optuna_1_proba = xgb_optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, xgb_optuna_1_proba)
print(decimal.Decimal(auc_1).quantize(decimal.Decimal('1.000')))

0.778


In [37]:
# bootstrap_auc() indexes rows positionally with an integer array, so the training
# split is converted from pandas objects to NumPy arrays. np.asarray() also accepts
# arrays, which keeps this cell safe to re-run (``.values`` would raise
# AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [38]:
# Rebind the accumulator to a new empty list (the previous round's list stays
# reachable through its t_N name) and display it to confirm it is empty.
auc_bootstrap = []
auc_bootstrap

[]

In [39]:
# Seed the global RandomState that bootstrap_auc() draws from, then run 2000 replicates.
# The displayed pair is the 2.5th / 97.5th percentile of the bootstrap AUCs.
# bootstrap_auc() refits xgb_optuna_1 in place.
rs = RandomState(seed = 1)
bootstrap_auc(xgb_optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75445267, 0.77591328])

In [40]:
# Mean test AUC over the bootstrap replicates of round 1.
np.mean(auc_bootstrap)

0.7658134518647471

In [41]:
# Keep round 1's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_1 = auc_bootstrap
print(t_1)

[0.7609310919343815, 0.7544162252221464, 0.7649841934381407, 0.7730610261449078, 0.761820211038961, 0.7576576384142173, 0.7671122052289815, 0.7656196599453178, 0.7599485218728641, 0.7691334159261791, 0.772567071086808, 0.7580795027341081, 0.7658813226247436, 0.7719422846889953, 0.7737338730348599, 0.7680921052631579, 0.7667864618933699, 0.7662177460697198, 0.751302973342447, 0.7625357783663705, 0.7615371881408066, 0.7700118549213943, 0.7638414217361585, 0.7641618250170882, 0.7645756792549556, 0.7603890763841422, 0.7679078733766234, 0.7733200187969924, 0.7601968344155844, 0.7701106459330145, 0.7609230818523581, 0.757011491797676, 0.7685593600478469, 0.7667784518113465, 0.7708876238892687, 0.7710905459671906, 0.7727432928913192, 0.7668879229323308, 0.7649975435748463, 0.7665942199248121, 0.7673311474709502, 0.7736057117224879, 0.7737886085953519, 0.7701640464798359, 0.7515699760765551, 0.7717714029391661, 0.7663459073820915, 0.7587656997607656, 0.7627120001708818, 0.7631205143540669, 0.7

In [42]:
# Remove this round's splits and study so the next round cannot silently reuse them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [43]:
# 2. Feature removed in elimination round 2 (location of the current home).
column_to_drop_1 = 'Cat_현재 주택의 위치'

In [44]:
# Remove one more feature, starting from the round-1 table `comp_1`.
# A numeric feature is a single column. A 'Cat_' feature was one-hot encoded into
# '<feature>_<category>' columns, so all of those are removed together.
# The match is a literal prefix test that includes the '_' separator: a regex built
# from the raw name would misread characters such as '(' or '.', and a bare prefix
# could also catch a different feature whose name starts with the same text.
if column_to_drop_1.startswith('Cat_'):
    cols_to_drop = [c for c in comp_1.columns if c.startswith(column_to_drop_1 + '_')]
    if not cols_to_drop:
        # Mirror DataFrame.drop's behavior for a missing label instead of silently dropping nothing.
        raise KeyError(f"no one-hot columns found for {column_to_drop_1!r}")
else:
    cols_to_drop = [column_to_drop_1]

comp_2 = comp_1.drop(columns=cols_to_drop)
X_2 = comp_2.drop(columns='target')
y_2 = comp_2['target']

print(X_2.shape)

(6119, 216)


In [45]:
# Test split (20%) followed by validation split (20% of the rest) on the reduced
# table, using the same seed and stratification settings as the baseline round.
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [46]:
def objective(trial):
    """Score one sampled XGBoost configuration by its validation ROC AUC.

    The classifier is fit on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_val`` / ``y_val`` using the predicted probability of class 1.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # top to bottom in the order listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still appear in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximize validation AUC with a seeded TPE sampler. The search ends after 200
# trials, or earlier once 10 consecutive trials bring no new best value.
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to act on - confirm whether it is needed.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [47]:
# Best trial of this round: its hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7934665564821075


In [48]:
# Rebuild the classifier from the best trial's hyperparameters and fit it on the
# training split of the reduced feature set (validation rows are not added back).
xgb_optuna_2 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_2.fit(X_train, y_train)

In [49]:
# Test-set ROC AUC from the class-1 probabilities, printed to three decimals via
# Decimal.quantize (rounding follows the active decimal context).
xgb_optuna_2_proba = xgb_optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, xgb_optuna_2_proba)
print(decimal.Decimal(auc_2).quantize(decimal.Decimal('1.000')))

0.778


In [50]:
# bootstrap_auc() indexes rows positionally with an integer array, so the training
# split is converted from pandas objects to NumPy arrays. np.asarray() also accepts
# arrays, which keeps this cell safe to re-run (``.values`` would raise
# AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [51]:
# Rebind the accumulator to a new empty list; bootstrap_auc() appends to it.
auc_bootstrap = []

In [52]:
# Seed the global RandomState that bootstrap_auc() draws from, then run 2000 replicates.
# The displayed pair is the 2.5th / 97.5th percentile of the bootstrap AUCs.
# bootstrap_auc() refits xgb_optuna_2 in place.
rs = RandomState(seed = 2)
bootstrap_auc(xgb_optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75524854, 0.77664417])

In [53]:
# Mean test AUC over the bootstrap replicates of round 2.
np.mean(auc_bootstrap)

0.7660912675420796

In [54]:
# Keep round 2's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_2 = auc_bootstrap
print(t_2)

[0.767966613978127, 0.7594892771701983, 0.7608643412508544, 0.7620044429254955, 0.761083283492823, 0.7627573906356799, 0.7724709501025291, 0.7718248034859878, 0.7686741712235134, 0.765657040328093, 0.7681268156185919, 0.7691360859535202, 0.772735282809296, 0.767194976076555, 0.7686421308954203, 0.7691734663362954, 0.7640283236500341, 0.7669680237525633, 0.767360517771702, 0.7603009654818865, 0.7745669215652767, 0.7722466678058783, 0.7644314977785373, 0.7657104408749145, 0.7770633971291866, 0.7609497821257689, 0.7765187115516063, 0.7591234834244702, 0.7635503887559808, 0.766057544429255, 0.763886812200957, 0.7753812799043063, 0.7660789046479837, 0.765293916609706, 0.7648773923444976, 0.764025653622693, 0.7741557373547505, 0.7638601119275461, 0.7660522043745728, 0.7636037893028025, 0.7617120749316473, 0.7674005681818182, 0.7590914430963773, 0.7666850008544087, 0.7598524008885852, 0.7578739106288448, 0.7565682672590567, 0.7689171437115516, 0.7790899478810662, 0.7664874188311688, 0.7643353

In [55]:
# Remove this round's splits and study so the next round cannot silently reuse them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [56]:
# 3. Feature removed in elimination round 3 (household head disability status).
column_to_drop_2 = 'Cat_가구주 장애 여부'

In [57]:
# Remove one more feature, starting from the round-2 table `comp_2`.
# A numeric feature is a single column. A 'Cat_' feature was one-hot encoded into
# '<feature>_<category>' columns, so all of those are removed together.
# The match is a literal prefix test that includes the '_' separator: a regex built
# from the raw name would misread characters such as '(' or '.', and a bare prefix
# could also catch a different feature whose name starts with the same text.
if column_to_drop_2.startswith('Cat_'):
    cols_to_drop = [c for c in comp_2.columns if c.startswith(column_to_drop_2 + '_')]
    if not cols_to_drop:
        # Mirror DataFrame.drop's behavior for a missing label instead of silently dropping nothing.
        raise KeyError(f"no one-hot columns found for {column_to_drop_2!r}")
else:
    cols_to_drop = [column_to_drop_2]

comp_3 = comp_2.drop(columns=cols_to_drop)
X_3 = comp_3.drop(columns='target')
y_3 = comp_3['target']

print(X_3.shape)

(6119, 214)


In [58]:
# Test split (20%) followed by validation split (20% of the rest) on the reduced
# table, using the same seed and stratification settings as the baseline round.
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [59]:
def objective(trial):
    """Score one sampled XGBoost configuration by its validation ROC AUC.

    The classifier is fit on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_val`` / ``y_val`` using the predicted probability of class 1.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # top to bottom in the order listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still appear in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximize validation AUC with a seeded TPE sampler. The search ends after 200
# trials, or earlier once 10 consecutive trials bring no new best value.
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to act on - confirm whether it is needed.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [60]:
# Best trial of this round: its hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7934665564821075


In [61]:
# Rebuild the classifier from the best trial's hyperparameters and fit it on the
# training split of the reduced feature set (validation rows are not added back).
xgb_optuna_3 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_3.fit(X_train, y_train)

In [62]:
# Test-set ROC AUC from the class-1 probabilities, printed to three decimals via
# Decimal.quantize (rounding follows the active decimal context).
xgb_optuna_3_proba = xgb_optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, xgb_optuna_3_proba)
print(decimal.Decimal(auc_3).quantize(decimal.Decimal('1.000')))

0.778


In [63]:
# bootstrap_auc() indexes rows positionally with an integer array, so the training
# split is converted from pandas objects to NumPy arrays. np.asarray() also accepts
# arrays, which keeps this cell safe to re-run (``.values`` would raise
# AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [64]:
# Rebind the accumulator to a new empty list; bootstrap_auc() appends to it.
auc_bootstrap = []

In [65]:
# Seed the global RandomState that bootstrap_auc() draws from, then run 2000 replicates.
# The displayed pair is the 2.5th / 97.5th percentile of the bootstrap AUCs.
# bootstrap_auc() refits xgb_optuna_3 in place.
rs = RandomState(seed = 3)
bootstrap_auc(xgb_optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75468803, 0.77629823])

In [66]:
# Keep round 3's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_3 = auc_bootstrap
print(t_3)

[0.76809744531784, 0.772067775974026, 0.7649708433014354, 0.7770687371838687, 0.7639241925837321, 0.7688130126452495, 0.7642152255639099, 0.7748285842447027, 0.7713441985645932, 0.7643567370129871, 0.7718888841421736, 0.7658866626794258, 0.7628962320574162, 0.7690399649692412, 0.7711332664046479, 0.7639562329118249, 0.7678197624743677, 0.7708983039986329, 0.765392707621326, 0.770364298530417, 0.7671148752563225, 0.7613876666097059, 0.7596441387559809, 0.7742491883116882, 0.7623729066985646, 0.7701373462064252, 0.763825401572112, 0.7551584928229664, 0.7583518455228981, 0.762618549213944, 0.7653793574846207, 0.7689011235475051, 0.7750048060492138, 0.7656623803827751, 0.7680840951811347, 0.7677877221462748, 0.767093515037594, 0.7567231288448393, 0.7801579588174984, 0.7613956766917294, 0.7630057031784006, 0.774226493079289, 0.7707861628503075, 0.7599138115174299, 0.7718942241968558, 0.7639455528024608, 0.7641564849624061, 0.7706660116199591, 0.7640603639781272, 0.7715391105604921, 0.763852

In [67]:
# Remove this round's splits and study so the next round cannot silently reuse them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [68]:
# 4. Feature removed in elimination round 4
#    (whether the household head lives with the household).
column_to_drop_3 = 'Cat_가구주 동거 여부'

In [69]:
# Remove one more feature, starting from the round-3 table `comp_3`.
# A numeric feature is a single column. A 'Cat_' feature was one-hot encoded into
# '<feature>_<category>' columns, so all of those are removed together.
# The match is a literal prefix test that includes the '_' separator: a regex built
# from the raw name would misread characters such as '(' or '.', and a bare prefix
# could also catch a different feature whose name starts with the same text.
if column_to_drop_3.startswith('Cat_'):
    cols_to_drop = [c for c in comp_3.columns if c.startswith(column_to_drop_3 + '_')]
    if not cols_to_drop:
        # Mirror DataFrame.drop's behavior for a missing label instead of silently dropping nothing.
        raise KeyError(f"no one-hot columns found for {column_to_drop_3!r}")
else:
    cols_to_drop = [column_to_drop_3]

comp_4 = comp_3.drop(columns=cols_to_drop)
X_4 = comp_4.drop(columns='target')
y_4 = comp_4['target']

print(X_4.shape)

(6119, 212)


In [70]:
# Test split (20%) followed by validation split (20% of the rest) on the reduced
# table, using the same seed and stratification settings as the baseline round.
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [71]:
def objective(trial):
    """Score one sampled XGBoost configuration by its validation ROC AUC.

    The classifier is fit on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_val`` / ``y_val`` using the predicted probability of class 1.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # top to bottom in the order listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still appear in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximize validation AUC with a seeded TPE sampler. The search ends after 200
# trials, or earlier once 10 consecutive trials bring no new best value.
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to act on - confirm whether it is needed.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [72]:
# Best trial of this round: its hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7934665564821075


In [73]:
# Rebuild the classifier from the best trial's hyperparameters and fit it on the
# training split of the reduced feature set (validation rows are not added back).
xgb_optuna_4 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_4.fit(X_train, y_train)

In [74]:
# Test-set ROC AUC from the class-1 probabilities, printed to three decimals via
# Decimal.quantize (rounding follows the active decimal context).
xgb_optuna_4_proba = xgb_optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, xgb_optuna_4_proba)
print(decimal.Decimal(auc_4).quantize(decimal.Decimal('1.000')))

0.778


In [75]:
# bootstrap_auc() indexes rows positionally with an integer array, so the training
# split is converted from pandas objects to NumPy arrays. np.asarray() also accepts
# arrays, which keeps this cell safe to re-run (``.values`` would raise
# AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [76]:
# Rebind the accumulator to a new empty list; bootstrap_auc() appends to it.
auc_bootstrap = []

In [77]:
# Seed the global RandomState that bootstrap_auc() draws from, then run 2000 replicates.
# The displayed pair is the 2.5th / 97.5th percentile of the bootstrap AUCs.
# bootstrap_auc() refits xgb_optuna_4 in place.
rs = RandomState(seed = 4)
bootstrap_auc(xgb_optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75459218, 0.77629189])

In [78]:
# Mean test AUC over the bootstrap replicates of round 4.
np.mean(auc_bootstrap)

0.7659803752990431

In [79]:
# Keep round 4's bootstrap AUCs under their own name (same list object as auc_bootstrap).
# Unlike the other rounds, no cleanup `del` cell follows, so this round's splits
# and study stay defined until the next round overwrites them.
t_4 = auc_bootstrap
print(t_4)

[0.767392558099795, 0.7647492310321258, 0.7634275674982912, 0.7595880681818181, 0.7562024735133288, 0.7727966934381408, 0.7686154306220097, 0.7631739149008886, 0.7604691772043746, 0.7636438397129187, 0.765758501367054, 0.774206467874231, 0.7665221291866029, 0.7599325017088174, 0.7666155801435407, 0.7684151785714284, 0.7600900333219412, 0.7629736628503077, 0.7652618762816132, 0.7599431818181819, 0.765659710355434, 0.765555579289132, 0.7672617267600821, 0.7700145249487355, 0.7624930579289132, 0.7695926606288448, 0.7673818779904307, 0.77190490430622, 0.7676355305878332, 0.7642392558099795, 0.7651310449419002, 0.7726845522898154, 0.760549278024607, 0.7617561303827752, 0.7678704929938482, 0.7566990985987696, 0.7674005681818182, 0.7535671565276828, 0.7645970394736842, 0.7656143198906357, 0.7664740686944633, 0.7598817711893369, 0.7762890892002734, 0.7621245941558441, 0.7665862098427888, 0.7608803614149009, 0.7651096847231715, 0.7746443523581682, 0.7668719027682842, 0.7753011790840738, 0.76478

In [80]:
# 5. Feature removed in elimination round 5
#    (whether the household head is listed in the resident registration).
column_to_drop_4 = 'Cat_가구주 주민등록상 등재 여부'

In [81]:
# Remove one more feature, starting from the round-4 table `comp_4`.
# A numeric feature is a single column. A 'Cat_' feature was one-hot encoded into
# '<feature>_<category>' columns, so all of those are removed together.
# The match is a literal prefix test that includes the '_' separator: a regex built
# from the raw name would misread characters such as '(' or '.', and a bare prefix
# could also catch a different feature whose name starts with the same text.
if column_to_drop_4.startswith('Cat_'):
    cols_to_drop = [c for c in comp_4.columns if c.startswith(column_to_drop_4 + '_')]
    if not cols_to_drop:
        # Mirror DataFrame.drop's behavior for a missing label instead of silently dropping nothing.
        raise KeyError(f"no one-hot columns found for {column_to_drop_4!r}")
else:
    cols_to_drop = [column_to_drop_4]

comp_5 = comp_4.drop(columns=cols_to_drop)
X_5 = comp_5.drop(columns='target')
y_5 = comp_5['target']

print(X_5.shape)

(6119, 210)


In [82]:
# Test split (20%) followed by validation split (20% of the rest) on the reduced
# table, using the same seed and stratification settings as the baseline round.
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [83]:
def objective(trial):
    """Score one sampled XGBoost configuration by its validation ROC AUC.

    The classifier is fit on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_val`` / ``y_val`` using the predicted probability of class 1.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # top to bottom in the order listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still appear in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximize validation AUC with a seeded TPE sampler. The search ends after 200
# trials, or earlier once 10 consecutive trials bring no new best value.
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to act on - confirm whether it is needed.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [84]:
# Best trial of this round: its hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7934665564821075


In [85]:
# Rebuild the classifier from the best trial's hyperparameters and fit it on the
# training split of the reduced feature set (validation rows are not added back).
xgb_optuna_5 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_5.fit(X_train, y_train)

In [86]:
# Test-set ROC AUC from the class-1 probabilities, printed to three decimals via
# Decimal.quantize (rounding follows the active decimal context).
xgb_optuna_5_proba = xgb_optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, xgb_optuna_5_proba)
print(decimal.Decimal(auc_5).quantize(decimal.Decimal('1.000')))

0.778


In [87]:
# bootstrap_auc() indexes rows positionally with an integer array, so the training
# split is converted from pandas objects to NumPy arrays. np.asarray() also accepts
# arrays, which keeps this cell safe to re-run (``.values`` would raise
# AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [88]:
# Rebind the accumulator to a new empty list; bootstrap_auc() appends to it.
auc_bootstrap = []

In [89]:
# Seed the global RandomState that bootstrap_auc() draws from, then run 2000 replicates.
# The displayed pair is the 2.5th / 97.5th percentile of the bootstrap AUCs.
# bootstrap_auc() refits xgb_optuna_5 in place.
rs = RandomState(seed = 5)
bootstrap_auc(xgb_optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75465546, 0.77739188])

In [90]:
# Mean test AUC over the bootstrap replicates of round 5.
np.mean(auc_bootstrap)

0.765932402917806

In [91]:
# Keep round 5's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_5 = auc_bootstrap
print(t_5)

[0.7639455528024608, 0.7643433868762817, 0.7676916011619959, 0.7636465097402596, 0.7729141746411483, 0.7748205741626795, 0.7689171437115516, 0.7614944677033494, 0.7751329673615858, 0.7679799641148326, 0.7740195659603555, 0.7720464157552973, 0.7713161632775121, 0.7688130126452495, 0.7693750534005468, 0.7548300794600137, 0.7552652939166098, 0.756910030758715, 0.7630030331510596, 0.7687355818523581, 0.7641538149350651, 0.7517782382091592, 0.7614597573479153, 0.7664660586124402, 0.7638173914900889, 0.7615879186602872, 0.7696514012303486, 0.7571983937115516, 0.7637906912166781, 0.7618549213943951, 0.7670694847915243, 0.7638267365857827, 0.7669066131237184, 0.7626078691045797, 0.7570034817156528, 0.7742598684210527, 0.7624022769993166, 0.7669039430963773, 0.7638467617908408, 0.7671656057758032, 0.7644261577238551, 0.7781233979835953, 0.7725350307587149, 0.7679238935406699, 0.768431198735475, 0.7622233851674641, 0.7797654647983596, 0.7668986030416951, 0.7737285329801777, 0.7691601161995898, 0

In [92]:
# Remove this round's splits and study so the next round cannot silently reuse them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [93]:
# 6. Feature removed in elimination round 6
#    (share of rental deposits in total debt; a numeric column, no 'Cat_' prefix).
column_to_drop_5 = '부채 중 임대 보증금의 비중'

In [94]:
# Remove one more feature, starting from the round-5 table `comp_5`.
# A numeric feature is a single column. A 'Cat_' feature was one-hot encoded into
# '<feature>_<category>' columns, so all of those are removed together.
# The match is a literal prefix test that includes the '_' separator: a regex built
# from the raw name would misread characters such as '(' or '.', and a bare prefix
# could also catch a different feature whose name starts with the same text.
if column_to_drop_5.startswith('Cat_'):
    cols_to_drop = [c for c in comp_5.columns if c.startswith(column_to_drop_5 + '_')]
    if not cols_to_drop:
        # Mirror DataFrame.drop's behavior for a missing label instead of silently dropping nothing.
        raise KeyError(f"no one-hot columns found for {column_to_drop_5!r}")
else:
    cols_to_drop = [column_to_drop_5]

comp_6 = comp_5.drop(columns=cols_to_drop)
X_6 = comp_6.drop(columns='target')
y_6 = comp_6['target']

print(X_6.shape)

(6119, 209)


In [95]:
# Test split (20%) followed by validation split (20% of the rest) on the reduced
# table, using the same seed and stratification settings as the baseline round.
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [96]:
def objective(trial):
    """Score one sampled XGBoost configuration by its validation ROC AUC.

    The classifier is fit on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_val`` / ``y_val`` using the predicted probability of class 1.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # top to bottom in the order listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still appear in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximize validation AUC with a seeded TPE sampler. The search ends after 200
# trials, or earlier once 10 consecutive trials bring no new best value.
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to act on - confirm whether it is needed.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [97]:
# Best trial of this round: its hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7934665564821075


In [98]:
# Rebuild the classifier from the best trial's hyperparameters and fit it on the
# training split of the reduced feature set (validation rows are not added back).
xgb_optuna_6 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_6.fit(X_train, y_train)

In [99]:
# Test-set ROC AUC from the class-1 probabilities, printed to three decimals via
# Decimal.quantize (rounding follows the active decimal context).
# The probability variable is named xgb_optuna_proba_6 here, whereas earlier
# rounds use the pattern xgb_optuna_<N>_proba.
xgb_optuna_proba_6 = xgb_optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, xgb_optuna_proba_6)
print(decimal.Decimal(auc_6).quantize(decimal.Decimal('1.000')))

0.778


In [100]:
# bootstrap_auc() indexes rows positionally with an integer array, so the training
# split is converted from pandas objects to NumPy arrays. np.asarray() also accepts
# arrays, which keeps this cell safe to re-run (``.values`` would raise
# AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [101]:
# Rebind the accumulator to a new empty list; bootstrap_auc() appends to it.
auc_bootstrap = []

In [102]:
# Seed the global RandomState that bootstrap_auc() draws from, then run 2000 replicates.
# The displayed pair is the 2.5th / 97.5th percentile of the bootstrap AUCs.
# bootstrap_auc() refits xgb_optuna_6 in place.
rs = RandomState(seed = 6)
bootstrap_auc(xgb_optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75493575, 0.77663092])

In [103]:
# Mean test AUC over the bootstrap replicates of round 6.
np.mean(auc_bootstrap)

0.7658492235560493

In [104]:
# Keep round 6's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_6 = auc_bootstrap
print(t_6)

[0.755137132604238, 0.7698569933356119, 0.7667544215652768, 0.7654861585782637, 0.7604024265208476, 0.7703749786397813, 0.7691627862269308, 0.7661977208646618, 0.7682790071770335, 0.7674352785372522, 0.7654487781954886, 0.7685940704032809, 0.7652084757347914, 0.7611820745044429, 0.7649040926179084, 0.7744494403622693, 0.7632887260765551, 0.7652405160628845, 0.7626105391319207, 0.7711492865686944, 0.7639936132946001, 0.7680413747436774, 0.766893262987013, 0.7608002605946684, 0.7673071172248804, 0.7601060534859877, 0.7733840994531784, 0.7661376452494872, 0.7713255083732058, 0.7687275717703349, 0.7637052503417634, 0.7805451127819548, 0.7651283749145591, 0.7640710440874914, 0.7677102913533834, 0.7546138072453862, 0.781482292378674, 0.7694057587149692, 0.7718835440874915, 0.7650696343130553, 0.7627039900888586, 0.7615612183868763, 0.7646157296650717, 0.7560129015721122, 0.7586695787764868, 0.766356587491456, 0.7735656613123718, 0.7636999102870813, 0.7723000683526999, 0.7590887730690363, 0.7

In [105]:
# Remove this round's splits and study so the next round cannot silently reuse them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [106]:
# 7. Feature removed in elimination round 7
#    (share of social-insurance benefits in monthly average income; a numeric column).
column_to_drop_6 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [107]:
# Remove one more feature, starting from the round-6 table `comp_6`.
# A numeric feature is a single column. A 'Cat_' feature was one-hot encoded into
# '<feature>_<category>' columns, so all of those are removed together.
# The match is a literal prefix test that includes the '_' separator: a regex built
# from the raw name would misread characters such as '(' or '.', and a bare prefix
# could also catch a different feature whose name starts with the same text.
if column_to_drop_6.startswith('Cat_'):
    cols_to_drop = [c for c in comp_6.columns if c.startswith(column_to_drop_6 + '_')]
    if not cols_to_drop:
        # Mirror DataFrame.drop's behavior for a missing label instead of silently dropping nothing.
        raise KeyError(f"no one-hot columns found for {column_to_drop_6!r}")
else:
    cols_to_drop = [column_to_drop_6]

comp_7 = comp_6.drop(columns=cols_to_drop)
X_7 = comp_7.drop(columns='target')
y_7 = comp_7['target']

print(X_7.shape)

(6119, 208)


In [108]:
# Test split (20%) followed by validation split (20% of the rest) on the reduced
# table, using the same seed and stratification settings as the baseline round.
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [109]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object that supplies the suggested hyperparameter values.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Each value is requested from the trial under its XGBoost parameter name.
    search_space = {}
    search_space['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    search_space['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    search_space['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    search_space['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    search_space['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1)
    search_space['gamma'] = trial.suggest_int('gamma', 1, 10)
    search_space['reg_alpha'] = trial.suggest_int('reg_alpha', 1, 10)
    search_space['reg_lambda'] = trial.suggest_int('reg_lambda', 1, 10)
    # Single-choice categoricals: the value is fixed but still goes through the trial.
    search_space['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    search_space['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)

    # Column 1 of predict_proba is used as the ranking score.
    return roc_auc_score(y_val, val_proba[:, 1])

# Tune the XGBoost hyperparameters for this step: TPE sampling with a fixed
# seed, up to 200 trials, maximizing the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so no trial can be
# pruned mid-run; confirm the pruner is still wanted.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is
# the number of non-improving trials tolerated before stopping (confirm there).
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [110]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7934665564821075


In [111]:
xgb_optuna_7 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_7.fit(X_train, y_train)

In [112]:
xgb_optuna_proba_7 = xgb_optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, xgb_optuna_proba_7)
print(decimal.Decimal(auc_7).quantize(decimal.Decimal('1.000')))

0.778


In [113]:
X_train = X_train.values
y_train = y_train.values

In [114]:
auc_bootstrap = []

In [115]:
rs = RandomState(seed = 7)
# bootstrap_auc() calls fit() on the estimator it receives for every resample.
# Passing an unfitted clone (same hyperparameters) keeps xgb_optuna_7 as the
# model fitted on X_train/y_train instead of the last bootstrap resample.
bootstrap_auc(sklearn.base.clone(xgb_optuna_7), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75489943, 0.77710809])

In [116]:
np.mean(auc_bootstrap)

0.7661861623163022

In [117]:
t_7 = auc_bootstrap
print(t_7)

[0.7672830869788108, 0.771600521189337, 0.759777640123035, 0.7599298316814764, 0.7678704929938482, 0.7744868207450444, 0.7694564892344498, 0.7647465610047847, 0.7680787551264525, 0.7678758330485305, 0.7771515080314422, 0.7705725606630213, 0.7791193181818182, 0.7621352742652086, 0.7700732655502391, 0.7709410244360901, 0.7670935150375939, 0.7719235944976077, 0.7625277682843472, 0.7685433398838004, 0.769330997949419, 0.7663806177375256, 0.7711145762132604, 0.7681455058099795, 0.760282275290499, 0.7617347701640464, 0.7704710996240602, 0.7605893284347232, 0.7576229280587833, 0.7625651486671223, 0.7705618805536569, 0.7690079246411483, 0.7674032382091593, 0.7535110859535201, 0.7656784005468216, 0.7657424812030076, 0.7691414260082023, 0.7621459543745728, 0.7628855519480519, 0.7676862611073137, 0.7649655032467533, 0.7667010210184553, 0.7651604152426521, 0.7600126025290499, 0.7677409966678058, 0.7652672163362952, 0.7697501922419685, 0.755006301264525, 0.7665061090225563, 0.7661376452494874, 0.75

In [118]:
np.mean(t_7)

0.7661861623163022

In [119]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [120]:
## 8 
column_to_drop_7 = '소득 중 재산소득의 비중(월평균)'

In [121]:
# Elimination step 8: remove the feature named in column_to_drop_7 from comp_7,
# then split the result into predictors (X_8) and label (y_8).
if column_to_drop_7.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns, so every
    # column starting with it is removed. The prefix is compared literally (not
    # as a regular expression), so characters such as '(' or '.' in a column
    # name cannot change which columns match.
    cols_to_drop_7 = [col for col in comp_7.columns if str(col).startswith(column_to_drop_7)]
else:
    cols_to_drop_7 = [column_to_drop_7]

comp_8 = comp_7.drop(columns=cols_to_drop_7)
X_8 = comp_8.drop(columns='target')
y_8 = comp_8['target']

print(X_8.shape)

(6119, 207)


In [122]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [123]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object that supplies the suggested hyperparameter values.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Each value is requested from the trial under its XGBoost parameter name.
    search_space = {}
    search_space['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    search_space['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    search_space['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    search_space['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    search_space['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1)
    search_space['gamma'] = trial.suggest_int('gamma', 1, 10)
    search_space['reg_alpha'] = trial.suggest_int('reg_alpha', 1, 10)
    search_space['reg_lambda'] = trial.suggest_int('reg_lambda', 1, 10)
    # Single-choice categoricals: the value is fixed but still goes through the trial.
    search_space['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    search_space['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)

    # Column 1 of predict_proba is used as the ranking score.
    return roc_auc_score(y_val, val_proba[:, 1])

# Tune the XGBoost hyperparameters for this step: TPE sampling with a fixed
# seed, up to 200 trials, maximizing the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so no trial can be
# pruned mid-run; confirm the pruner is still wanted.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is
# the number of non-improving trials tolerated before stopping (confirm there).
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [124]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7934665564821075


In [125]:
xgb_optuna_8 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_8.fit(X_train, y_train)

In [126]:
xgb_optuna_proba_8 = xgb_optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, xgb_optuna_proba_8)
print(decimal.Decimal(auc_8).quantize(decimal.Decimal('1.000')))

0.778


In [127]:
X_train = X_train.values
y_train = y_train.values

In [128]:
auc_bootstrap = []

In [129]:
rs = RandomState(seed = 8)
# bootstrap_auc() calls fit() on the estimator it receives for every resample.
# Passing an unfitted clone (same hyperparameters) keeps xgb_optuna_8 as the
# model fitted on X_train/y_train instead of the last bootstrap resample.
bootstrap_auc(sklearn.base.clone(xgb_optuna_8), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75536662, 0.77721058])

In [130]:
np.mean(auc_bootstrap)

0.7660004345469497

In [131]:
t_8 = auc_bootstrap
print(t_8)

[0.7761529178058784, 0.7694404690704033, 0.7698409731715653, 0.7634836380724538, 0.7605973385167464, 0.7591341635338346, 0.7661830357142857, 0.7571116178229664, 0.7695873205741627, 0.7697021317498292, 0.7688236927546138, 0.7701213260423787, 0.7692535671565276, 0.7667036910457963, 0.7731811773752564, 0.7656063098086124, 0.7714643497949419, 0.7637933612440191, 0.7541251922419685, 0.7621699846206424, 0.7650162337662338, 0.7665034389952152, 0.7649628332194123, 0.7644368378332194, 0.7694324589883799, 0.7522908834586467, 0.7661883757689679, 0.7579433313397128, 0.774534881237184, 0.774601631920711, 0.7658065618591934, 0.7584666566985646, 0.7690453050239234, 0.7640977443609023, 0.7625117481203008, 0.7656810705741626, 0.7676115003417636, 0.7624343173274095, 0.7599431818181818, 0.7628842169343815, 0.762784090909091, 0.7572971847231715, 0.7590487226589201, 0.7573318950786057, 0.7674699888926861, 0.761115323820916, 0.7537914388243336, 0.7655689294258373, 0.7625304383116883, 0.7701854066985646, 0.7

In [132]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [133]:
#9.
column_to_drop_8 = '자산 중 부동산 자산의 비중'

In [134]:
# Elimination step 9: remove the feature named in column_to_drop_8 from comp_8,
# then split the result into predictors (X_9) and label (y_9).
if column_to_drop_8.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns, so every
    # column starting with it is removed. The prefix is compared literally (not
    # as a regular expression), so characters such as '(' or '.' in a column
    # name cannot change which columns match.
    cols_to_drop_8 = [col for col in comp_8.columns if str(col).startswith(column_to_drop_8)]
else:
    cols_to_drop_8 = [column_to_drop_8]

comp_9 = comp_8.drop(columns=cols_to_drop_8)
X_9 = comp_9.drop(columns='target')
y_9 = comp_9['target']

print(X_9.shape)

(6119, 206)


In [135]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [136]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object that supplies the suggested hyperparameter values.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Each value is requested from the trial under its XGBoost parameter name.
    search_space = {}
    search_space['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    search_space['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    search_space['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    search_space['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    search_space['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1)
    search_space['gamma'] = trial.suggest_int('gamma', 1, 10)
    search_space['reg_alpha'] = trial.suggest_int('reg_alpha', 1, 10)
    search_space['reg_lambda'] = trial.suggest_int('reg_lambda', 1, 10)
    # Single-choice categoricals: the value is fixed but still goes through the trial.
    search_space['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    search_space['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)

    # Column 1 of predict_proba is used as the ranking score.
    return roc_auc_score(y_val, val_proba[:, 1])

# Tune the XGBoost hyperparameters for this step: TPE sampling with a fixed
# seed, up to 200 trials, maximizing the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so no trial can be
# pruned mid-run; confirm the pruner is still wanted.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is
# the number of non-improving trials tolerated before stopping (confirm there).
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [137]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7934665564821075


In [138]:
xgb_optuna_9 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_9.fit(X_train, y_train)

In [139]:
xgb_optuna_proba_9 = xgb_optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, xgb_optuna_proba_9)
print(decimal.Decimal(auc_9).quantize(decimal.Decimal('1.000')))

0.778


In [140]:
X_train = X_train.values
y_train = y_train.values

In [141]:
auc_bootstrap = []

In [142]:
rs = RandomState(seed = 9)
# bootstrap_auc() calls fit() on the estimator it receives for every resample.
# Passing an unfitted clone (same hyperparameters) keeps xgb_optuna_9 as the
# model fitted on X_train/y_train instead of the last bootstrap resample.
bootstrap_auc(sklearn.base.clone(xgb_optuna_9), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75543244, 0.77636699])

In [143]:
np.mean(auc_bootstrap)

0.7662247488839286

In [144]:
t_9 = auc_bootstrap
print(t_9)

[0.7708796138072453, 0.757948671394395, 0.7604050965481887, 0.7708742737525631, 0.7687008714969241, 0.7702788576555024, 0.773936795112782, 0.7610726033834587, 0.7674085782638413, 0.7740943267259056, 0.773640422077922, 0.7691200657894737, 0.7634275674982911, 0.7726445018796992, 0.7627734107997266, 0.7728661141490089, 0.7592970352016405, 0.7683911483253589, 0.7729061645591251, 0.7544696257689679, 0.7661189550580998, 0.7677903921736159, 0.7723134184894054, 0.7712640977443609, 0.7617374401913874, 0.7673792079630897, 0.7607308398838004, 0.7665167891319207, 0.7676729109706083, 0.7804222915242651, 0.767395228127136, 0.7693149777853725, 0.762418297163363, 0.7627627306903623, 0.7669653537252221, 0.7640149735133287, 0.7697101418318523, 0.7815330228981545, 0.7631632347915243, 0.7687489319890636, 0.7705618805536567, 0.7662337662337663, 0.770799512987013, 0.7705632155673274, 0.7646344198564593, 0.7667784518113465, 0.7635637388926861, 0.7655368890977443, 0.7570408620984279, 0.756158418062201, 0.7596

In [145]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [146]:
# 10. Drop the share of private transfer income in income (monthly average).
# NOTE(review): this heading previously named 'Cat_이사 예상 기간' (expected moving
# period), which does not match the column assigned below — confirm which
# feature was meant to be removed at this step.
column_to_drop_9 = '소득 중 사적이전소득의 비중(월평균)'

In [147]:
# Elimination step 10: remove the feature named in column_to_drop_9 from comp_9,
# then split the result into predictors (X_10) and label (y_10).
if column_to_drop_9.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns, so every
    # column starting with it is removed. The prefix is compared literally (not
    # as a regular expression), so characters such as '(' or '.' in a column
    # name cannot change which columns match.
    cols_to_drop_9 = [col for col in comp_9.columns if str(col).startswith(column_to_drop_9)]
else:
    cols_to_drop_9 = [column_to_drop_9]

comp_10 = comp_9.drop(columns=cols_to_drop_9)
X_10 = comp_10.drop(columns='target')
y_10 = comp_10['target']

print(X_10.shape)

(6119, 205)


In [148]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [149]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object that supplies the suggested hyperparameter values.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Each value is requested from the trial under its XGBoost parameter name.
    search_space = {}
    search_space['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    search_space['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    search_space['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    search_space['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    search_space['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1)
    search_space['gamma'] = trial.suggest_int('gamma', 1, 10)
    search_space['reg_alpha'] = trial.suggest_int('reg_alpha', 1, 10)
    search_space['reg_lambda'] = trial.suggest_int('reg_lambda', 1, 10)
    # Single-choice categoricals: the value is fixed but still goes through the trial.
    search_space['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    search_space['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)

    # Column 1 of predict_proba is used as the ranking score.
    return roc_auc_score(y_val, val_proba[:, 1])

# Tune the XGBoost hyperparameters for this step: TPE sampling with a fixed
# seed, up to 200 trials, maximizing the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so no trial can be
# pruned mid-run; confirm the pruner is still wanted.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is
# the number of non-improving trials tolerated before stopping (confirm there).
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [150]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7934665564821075


In [151]:
xgb_optuna_10 = XGBClassifier(**study.best_trial.params, random_state=0)
xgb_optuna_10.fit(X_train, y_train)

In [152]:
xgb_optuna_proba_10 = xgb_optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, xgb_optuna_proba_10)
print(decimal.Decimal(auc_10).quantize(decimal.Decimal('1.000')))

0.778


In [153]:
X_train = X_train.values
y_train = y_train.values

In [154]:
auc_bootstrap = []

In [155]:
rs = RandomState(seed = 10)
# bootstrap_auc() calls fit() on the estimator it receives for every resample.
# Passing an unfitted clone (same hyperparameters) keeps xgb_optuna_10 as the
# model fitted on X_train/y_train instead of the last bootstrap resample.
bootstrap_auc(sklearn.base.clone(xgb_optuna_10), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75485751, 0.77623839])

In [156]:
np.mean(auc_bootstrap)

0.765806859567242

In [157]:
t_10 = auc_bootstrap
print(t_10)

[0.7759606758373206, 0.7666850008544087, 0.7662097359876965, 0.7604584970950102, 0.7713895890293917, 0.7660415242652083, 0.7566590481886534, 0.7660201640464799, 0.7581649436090225, 0.775610902255639, 0.7651443950786057, 0.768164196001367, 0.7589552717019821, 0.7708742737525631, 0.7697341720779223, 0.7708208732057416, 0.7663298872180452, 0.7618068609022557, 0.7737178528708134, 0.7674673188653451, 0.7643700871496926, 0.7712080271701982, 0.7709730647641831, 0.7716592617908408, 0.7492764225905674, 0.7653606672932332, 0.7626772898154477, 0.7655876196172248, 0.7579646915584416, 0.7575428272385509, 0.7624396573820915, 0.7626185492139439, 0.7717260124743678, 0.7628561816473002, 0.764292656356801, 0.7581649436090226, 0.761921672077922, 0.7690586551606289, 0.7728954844497609, 0.7678117523923444, 0.758848470608339, 0.7665701896787422, 0.7773223897812713, 0.7606106886534518, 0.7704951298701298, 0.7721104964114832, 0.7556377627306904, 0.7715925111073137, 0.763988273239918, 0.7590553977272727, 0.765

In [158]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [159]:
# 11.
column_to_drop_10 = 'Cat_현재 주택의 구조'

In [160]:
# Elimination step 11: remove the feature named in column_to_drop_10 from comp_10,
# then split the result into predictors (X_11) and label (y_11).
if column_to_drop_10.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns, so every
    # column starting with it is removed. The prefix is compared literally (not
    # as a regular expression), so characters such as '(' or '.' in a column
    # name cannot change which columns match.
    cols_to_drop_10 = [col for col in comp_10.columns if str(col).startswith(column_to_drop_10)]
else:
    cols_to_drop_10 = [column_to_drop_10]

comp_11 = comp_10.drop(columns=cols_to_drop_10)
X_11 = comp_11.drop(columns='target')
y_11 = comp_11['target']

print(X_11.shape)

(6119, 203)


In [161]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [162]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object that supplies the suggested hyperparameter values.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Each value is requested from the trial under its XGBoost parameter name.
    search_space = {}
    search_space['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    search_space['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    search_space['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    search_space['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    search_space['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1)
    search_space['gamma'] = trial.suggest_int('gamma', 1, 10)
    search_space['reg_alpha'] = trial.suggest_int('reg_alpha', 1, 10)
    search_space['reg_lambda'] = trial.suggest_int('reg_lambda', 1, 10)
    # Single-choice categoricals: the value is fixed but still goes through the trial.
    search_space['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    search_space['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)

    # Column 1 of predict_proba is used as the ranking score.
    return roc_auc_score(y_val, val_proba[:, 1])

# Tune the XGBoost hyperparameters for this step: TPE sampling with a fixed
# seed, up to 200 trials, maximizing the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so no trial can be
# pruned mid-run; confirm the pruner is still wanted.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is
# the number of non-improving trials tolerated before stopping (confirm there).
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [163]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7934665564821075


In [164]:
xgb_optuna_11 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_11.fit(X_train, y_train)

In [165]:
xgb_optuna_proba_11 = xgb_optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, xgb_optuna_proba_11)
print(decimal.Decimal(auc_11).quantize(decimal.Decimal('1.000')))

0.778


In [166]:
X_train = X_train.values
y_train = y_train.values

In [167]:
auc_bootstrap = []

In [168]:
rs = RandomState(seed = 11)
# bootstrap_auc() calls fit() on the estimator it receives for every resample.
# Passing an unfitted clone (same hyperparameters) keeps xgb_optuna_11 as the
# model fitted on X_train/y_train instead of the last bootstrap resample.
bootstrap_auc(sklearn.base.clone(xgb_optuna_11), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75463957, 0.77652672])

In [169]:
np.mean(auc_bootstrap)

0.7660407512922932

In [170]:
t_11 = auc_bootstrap
print(t_11)

[0.7674779989747095, 0.771637901572112, 0.7733066686602872, 0.7606827793916611, 0.7659026828434724, 0.7599298316814764, 0.7708395633971291, 0.7685914003759398, 0.7735256109022557, 0.7610993036568694, 0.7576336081681476, 0.7622954759056734, 0.7729462149692412, 0.7652885765550239, 0.7652672163362952, 0.7700892857142857, 0.7740809765892003, 0.7675714499316473, 0.7640496838687628, 0.7687249017429939, 0.7728047035201641, 0.7682202665755298, 0.7682175965481888, 0.7618148709842788, 0.7665114490772386, 0.7675474196855776, 0.7615078178400546, 0.7701854066985645, 0.763320766404648, 0.7573265550239233, 0.7627120001708818, 0.7743639994873547, 0.7583091250854408, 0.7593557758031442, 0.7692535671565277, 0.7596948692754614, 0.7700358851674641, 0.7615985987696514, 0.7678838431305537, 0.7637319506151743, 0.7672910970608339, 0.769232206937799, 0.7624903879015721, 0.7656276700273411, 0.7670694847915243, 0.7485368250170883, 0.7654434381408066, 0.7682095864661653, 0.7641831852358169, 0.7668772428229665, 0.

In [171]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [172]:
# 12
column_to_drop_11 = '부채 중 비금융기관 대출금의 비중'

In [173]:
# Elimination step 12: remove the feature named in column_to_drop_11 from comp_11,
# then split the result into predictors (X_12) and label (y_12).
if column_to_drop_11.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns, so every
    # column starting with it is removed. The prefix is compared literally (not
    # as a regular expression), so characters such as '(' or '.' in a column
    # name cannot change which columns match.
    cols_to_drop_11 = [col for col in comp_11.columns if str(col).startswith(column_to_drop_11)]
else:
    cols_to_drop_11 = [column_to_drop_11]

comp_12 = comp_11.drop(columns=cols_to_drop_11)
X_12 = comp_12.drop(columns='target')
y_12 = comp_12['target']

print(X_12.shape)

(6119, 202)


In [174]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [175]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object that supplies the suggested hyperparameter values.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Each value is requested from the trial under its XGBoost parameter name.
    search_space = {}
    search_space['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    search_space['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    search_space['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    search_space['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    search_space['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1)
    search_space['gamma'] = trial.suggest_int('gamma', 1, 10)
    search_space['reg_alpha'] = trial.suggest_int('reg_alpha', 1, 10)
    search_space['reg_lambda'] = trial.suggest_int('reg_lambda', 1, 10)
    # Single-choice categoricals: the value is fixed but still goes through the trial.
    search_space['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    search_space['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)

    # Column 1 of predict_proba is used as the ranking score.
    return roc_auc_score(y_val, val_proba[:, 1])

# Tune the XGBoost hyperparameters for this step: TPE sampling with a fixed
# seed, up to 200 trials, maximizing the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so no trial can be
# pruned mid-run; confirm the pruner is still wanted.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is
# the number of non-improving trials tolerated before stopping (confirm there).
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [176]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7940842577984791


In [177]:
xgb_optuna_12 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_12.fit(X_train, y_train)

In [178]:
xgb_optuna_proba_12 = xgb_optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, xgb_optuna_proba_12)
print(decimal.Decimal(auc_12).quantize(decimal.Decimal('1.000')))

0.778


In [179]:
X_train = X_train.values
y_train = y_train.values

In [180]:
auc_bootstrap = []

In [181]:
rs = RandomState(seed = 12)
# bootstrap_auc() calls fit() on the estimator it receives for every resample.
# Passing an unfitted clone (same hyperparameters) keeps xgb_optuna_12 as the
# model fitted on X_train/y_train instead of the last bootstrap resample.
bootstrap_auc(sklearn.base.clone(xgb_optuna_12), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75479744, 0.77660455])

In [182]:
np.mean(auc_bootstrap)

0.7661083056540499

In [183]:
t_12 = auc_bootstrap
print(t_12)

[0.7699504442925496, 0.7673845480177717, 0.765357997265892, 0.7664794087491457, 0.7673418275803143, 0.7721665669856459, 0.7657131109022557, 0.7680093344155844, 0.7584906869446344, 0.7642259056732741, 0.7675100393028025, 0.7723774991455912, 0.7602262047163363, 0.7642499359193438, 0.7746870727956254, 0.7612381450786055, 0.7626399094326727, 0.7734642002734108, 0.7735977016404648, 0.7659854536910459, 0.7617614704374573, 0.7684392088174983, 0.7641805152084757, 0.7726738721804511, 0.765021573820916, 0.7661109449760766, 0.7702521573820915, 0.7745215311004785, 0.7756189123376623, 0.7592676649008885, 0.7692909475393028, 0.7574146659261791, 0.7665114490772386, 0.7602876153451812, 0.7679692840054683, 0.7693470181134654, 0.7635690789473684, 0.7660068139097744, 0.7599031314080656, 0.7695419301093642, 0.7620738636363636, 0.766124295112782, 0.7664206681476418, 0.77353629101162, 0.7731811773752564, 0.7634289025119617, 0.7597562799043062, 0.7686554810321258, 0.7687382518796992, 0.7660682245386192, 0.78

In [184]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [185]:
# 13.
column_to_drop_12 = 'Cat_소득 계층'

In [186]:
# Elimination step 13: remove the feature named in column_to_drop_12 from comp_12,
# then split the result into predictors (X_13) and label (y_13).
if column_to_drop_12.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns, so every
    # column starting with it is removed. The prefix is compared literally (not
    # as a regular expression), so characters such as '(' or '.' in a column
    # name cannot change which columns match.
    cols_to_drop_12 = [col for col in comp_12.columns if str(col).startswith(column_to_drop_12)]
else:
    cols_to_drop_12 = [column_to_drop_12]

comp_13 = comp_12.drop(columns=cols_to_drop_12)
X_13 = comp_13.drop(columns='target')
y_13 = comp_13['target']

print(X_13.shape)

(6119, 200)


In [187]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [188]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object that supplies the suggested hyperparameter values.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Each value is requested from the trial under its XGBoost parameter name.
    search_space = {}
    search_space['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    search_space['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    search_space['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    search_space['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    search_space['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1)
    search_space['gamma'] = trial.suggest_int('gamma', 1, 10)
    search_space['reg_alpha'] = trial.suggest_int('reg_alpha', 1, 10)
    search_space['reg_lambda'] = trial.suggest_int('reg_lambda', 1, 10)
    # Single-choice categoricals: the value is fixed but still goes through the trial.
    search_space['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    search_space['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)

    # Column 1 of predict_proba is used as the ranking score.
    return roc_auc_score(y_val, val_proba[:, 1])

# Tune the XGBoost hyperparameters for this step: TPE sampling with a fixed
# seed, up to 200 trials, maximizing the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so no trial can be
# pruned mid-run; confirm the pruner is still wanted.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is
# the number of non-improving trials tolerated before stopping (confirm there).
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [189]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.04, 'max_depth': 5, 'max_leaves': 242, 'subsample': 0.9, 'colsample_bytree': 0.8, 'gamma': 1, 'reg_alpha': 2, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7973772735999465


In [190]:
xgb_optuna_13 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_13.fit(X_train, y_train)

In [191]:
xgb_optuna_proba_13 = xgb_optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, xgb_optuna_proba_13)
print(decimal.Decimal(auc_13).quantize(decimal.Decimal('1.000')))

0.778


In [192]:
X_train = X_train.values
y_train = y_train.values

In [193]:
auc_bootstrap = []

In [194]:
rs = RandomState(seed = 13)
# bootstrap_auc() calls fit() on the estimator it receives for every resample.
# Passing an unfitted clone (same hyperparameters) keeps xgb_optuna_13 as the
# model fitted on X_train/y_train instead of the last bootstrap resample.
bootstrap_auc(sklearn.base.clone(xgb_optuna_13), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75721174, 0.77883957])

In [195]:
np.mean(auc_bootstrap)

0.76819068067007

In [196]:
t_13 = auc_bootstrap
print(t_13)

[0.7713201683185236, 0.768866413192071, 0.7570355220437457, 0.755209223342447, 0.7665862098427887, 0.7779872265892003, 0.7705645505809979, 0.7738486842105264, 0.7672029861585784, 0.7749033450102529, 0.7688770933014354, 0.7668665627136022, 0.7724122095010253, 0.773472210355434, 0.7709623846548188, 0.7662684765892003, 0.763788021189337, 0.7729435449419002, 0.7715391105604921, 0.7711626367053999, 0.7717086572966508, 0.7725190105946685, 0.763155224709501, 0.7689812243677374, 0.7710745258031442, 0.7585013670539986, 0.7544402554682159, 0.7716966421736158, 0.7621112440191388, 0.772035735645933, 0.7760461167122351, 0.784424662508544, 0.763886812200957, 0.7753385594668488, 0.773306668660287, 0.7628668617566644, 0.7636758800410117, 0.7753198692754614, 0.7747458133971292, 0.7654834885509227, 0.7627734107997266, 0.7699958347573479, 0.7666422804169514, 0.7624102870813397, 0.7643967874231032, 0.7690453050239234, 0.764057693950786, 0.7590914430963773, 0.7741477272727273, 0.7644074675324676, 0.7690266

In [197]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [198]:
#14.
column_to_drop_13 = 'Cat_가구주 성별'

In [199]:
# Elimination step 14: remove the feature named in column_to_drop_13 from comp_13,
# then split the result into predictors (X_14) and label (y_14).
if column_to_drop_13.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns, so every
    # column starting with it is removed. The prefix is compared literally (not
    # as a regular expression), so characters such as '(' or '.' in a column
    # name cannot change which columns match.
    cols_to_drop_13 = [col for col in comp_13.columns if str(col).startswith(column_to_drop_13)]
else:
    cols_to_drop_13 = [column_to_drop_13]

comp_14 = comp_13.drop(columns=cols_to_drop_13)
X_14 = comp_14.drop(columns='target')
y_14 = comp_14['target']

print(X_14.shape)

(6119, 198)


In [200]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [201]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object that supplies the suggested hyperparameter values.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Each value is requested from the trial under its XGBoost parameter name.
    search_space = {}
    search_space['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    search_space['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    search_space['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    search_space['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    search_space['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1)
    search_space['gamma'] = trial.suggest_int('gamma', 1, 10)
    search_space['reg_alpha'] = trial.suggest_int('reg_alpha', 1, 10)
    search_space['reg_lambda'] = trial.suggest_int('reg_lambda', 1, 10)
    # Single-choice categoricals: the value is fixed but still goes through the trial.
    search_space['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    search_space['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)

    # Column 1 of predict_proba is used as the ranking score.
    return roc_auc_score(y_val, val_proba[:, 1])

# Tune the XGBoost hyperparameters for this step: TPE sampling with a fixed
# seed, up to 200 trials, maximizing the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so no trial can be
# pruned mid-run; confirm the pruner is still wanted.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is
# the number of non-improving trials tolerated before stopping (confirm there).
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [202]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 167, 'learning_rate': 0.05, 'max_depth': 4, 'max_leaves': 494, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 1, 'reg_alpha': 3, 'reg_lambda': 10, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7963004699538392


In [203]:
xgb_optuna_14 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_14.fit(X_train, y_train)

In [204]:
xgb_optuna_proba_14 = xgb_optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, xgb_optuna_proba_14)
print(decimal.Decimal(auc_14).quantize(decimal.Decimal('1.000')))

0.778


In [205]:
X_train = X_train.values
y_train = y_train.values

In [206]:
auc_bootstrap = []

In [207]:
rs = RandomState(seed = 14)
# bootstrap_auc() calls fit() on the estimator it receives for every resample.
# Passing an unfitted clone (same hyperparameters) keeps xgb_optuna_14 as the
# model fitted on X_train/y_train instead of the last bootstrap resample.
bootstrap_auc(sklearn.base.clone(xgb_optuna_14), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76028001, 0.77976059])

In [208]:
np.mean(auc_bootstrap)

0.7702774705762988

In [209]:
t_14 = auc_bootstrap
print(t_14)

[0.7726124615516063, 0.7659187030075189, 0.779746774606972, 0.7716485816814764, 0.7751302973342447, 0.7655208689336979, 0.759684189166097, 0.7588404605263157, 0.7607468600478468, 0.772705912508544, 0.7691788063909775, 0.7609818224538618, 0.7717607228298018, 0.7690306198735475, 0.7640683740601504, 0.7772983595352017, 0.768582055280246, 0.766426008202324, 0.7645836893369788, 0.7749300452836637, 0.7749807758031442, 0.7725377007860561, 0.773904754784689, 0.7732092126623377, 0.7770714072112098, 0.7739261150034178, 0.7700625854408749, 0.7650375939849623, 0.7684739191729323, 0.7714803699589883, 0.767096185064935, 0.7669600136705399, 0.7723961893369788, 0.7676061602870814, 0.7587389994873548, 0.7705939208817498, 0.7651443950786057, 0.7771248077580314, 0.7723334436944636, 0.772935534859877, 0.7679052033492823, 0.7709303443267259, 0.7797975051264525, 0.7659721035543403, 0.7751116071428571, 0.7700732655502392, 0.7633367865686943, 0.7609631322624744, 0.7762089883800409, 0.7723214285714286, 0.76874

In [210]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [211]:
## 15.
column_to_drop_14 = 'Cat_현재 대기오염 정도'

In [212]:
# Elimination step 15: remove the feature named in column_to_drop_14 from comp_14,
# then split the result into predictors (X_15) and label (y_15).
if column_to_drop_14.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns, so every
    # column starting with it is removed. The prefix is compared literally (not
    # as a regular expression), so characters such as '(' or '.' in a column
    # name cannot change which columns match.
    cols_to_drop_14 = [col for col in comp_14.columns if str(col).startswith(column_to_drop_14)]
else:
    cols_to_drop_14 = [column_to_drop_14]

comp_15 = comp_14.drop(columns=cols_to_drop_14)
X_15 = comp_15.drop(columns='target')
y_15 = comp_15['target']

print(X_15.shape)

(6119, 194)


In [213]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [214]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyperparameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    ROC AUC of the class-1 probabilities predicted for the validation split.
    """
    # Keyword arguments are evaluated top to bottom, so the suggest_* calls
    # run in the order the keys are listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant, but recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the class-1 probability.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search for the current feature set: maximise the validation AUC returned by `objective`.
direction = "maximize"
# TPE sampler with a fixed seed (10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial here - confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [215]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 90, 'learning_rate': 0.05, 'max_depth': 7, 'max_leaves': 774, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.8, 'gamma': 4, 'reg_alpha': 4, 'reg_lambda': 1, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7957578944732426


In [216]:
xgb_optuna_15 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_15.fit(X_train, y_train)

In [217]:
xgb_optuna_proba_15 = xgb_optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, xgb_optuna_proba_15)
print(decimal.Decimal(auc_15).quantize(decimal.Decimal('1.000')))

0.780


In [218]:
X_train = X_train.values
y_train = y_train.values

In [219]:
auc_bootstrap = []

In [220]:
# Start from an empty accumulator so that re-running this cell cannot append a
# second batch of scores to the list that bootstrap_auc() fills.
auc_bootstrap = []
rs = RandomState(seed = 15)
# bootstrap_auc() refits the estimator it receives on every resample. Pass an
# unfitted copy with the same hyperparameters so that xgb_optuna_15 keeps the
# fit on the training split that produced auc_15.
bootstrap_auc(sklearn.base.clone(xgb_optuna_15), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76048466, 0.77900257])

In [221]:
np.mean(auc_bootstrap)

0.7698459274072966

In [222]:
t_15 = auc_bootstrap
print(t_15)

[0.7752184082365003, 0.7617988508202325, 0.7669119531784004, 0.7578285201640464, 0.7663752776828434, 0.7702574974367737, 0.7738460141831853, 0.7736003716678058, 0.7639802631578948, 0.765958753417635, 0.7676248504784688, 0.7675340695488723, 0.7691227358168147, 0.7644448479152426, 0.7630110432330828, 0.7749327153110046, 0.7725163405673275, 0.7751623376623377, 0.77029754784689, 0.76953125, 0.77704737696514, 0.7648880724538619, 0.7666769907723856, 0.7700946257689679, 0.7777923145933014, 0.7701747265892003, 0.7682469668489406, 0.7610966336295283, 0.771635231544771, 0.775274478810663, 0.7522388179254955, 0.7714242993848257, 0.7678197624743677, 0.765953413362953, 0.7735816814764183, 0.7653099367737526, 0.7676141703691046, 0.7626585996240601, 0.7710932159945317, 0.7714723598769652, 0.7772316088516746, 0.7729595651059468, 0.7672430365686944, 0.7727139225905675, 0.7751877029220778, 0.7756109022556391, 0.775071556732741, 0.7727940234107997, 0.7633608168147642, 0.7745642515379357, 0.76902661483253

In [223]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [224]:
# 16.
column_to_drop_15 = 'Cat_주택 보유 의식'

In [225]:
# Remove the feature named by `column_to_drop_15` from `comp_15`.
# A name beginning with 'Cat_' is treated as a prefix: every column whose name
# starts with it is removed. Any other name is dropped as a single column.
# The prefix is compared literally (str.startswith) instead of being spliced
# into a regular expression, so punctuation in a feature name (parentheses,
# '+', '.', ...) can neither change which columns match nor raise a regex error.
if column_to_drop_15.startswith('Cat_'):
    comp_16 = comp_15.drop(
        columns=[col for col in comp_15.columns if str(col).startswith(column_to_drop_15)]
    )
else:
    comp_16 = comp_15.drop(column_to_drop_15, axis=1)

# Predictors and label for this elimination step.
X_16 = comp_16.drop('target', axis=1)
y_16 = comp_16['target']

print(X_16.shape)

(6119, 192)


In [226]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [227]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyperparameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    ROC AUC of the class-1 probabilities predicted for the validation split.
    """
    # Keyword arguments are evaluated top to bottom, so the suggest_* calls
    # run in the order the keys are listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant, but recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the class-1 probability.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search for the current feature set: maximise the validation AUC returned by `objective`.
direction = "maximize"
# TPE sampler with a fixed seed (10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial here - confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [228]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 152, 'learning_rate': 0.04, 'max_depth': 7, 'max_leaves': 768, 'subsample': 0.7000000000000001, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 3, 'reg_lambda': 10, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.794088431456022


In [229]:
xgb_optuna_16 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_16.fit(X_train, y_train)

In [230]:
xgb_optuna_proba_16 = xgb_optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, xgb_optuna_proba_16)
print(decimal.Decimal(auc_16).quantize(decimal.Decimal('1.000')))

0.779


In [231]:
X_train = X_train.values
y_train = y_train.values

In [232]:
auc_bootstrap = []

In [233]:
# Start from an empty accumulator so that re-running this cell cannot append a
# second batch of scores to the list that bootstrap_auc() fills.
auc_bootstrap = []
rs = RandomState(seed = 16)
# bootstrap_auc() refits the estimator it receives on every resample. Pass an
# unfitted copy with the same hyperparameters so that xgb_optuna_16 keeps the
# fit on the training split that produced auc_16.
bootstrap_auc(sklearn.base.clone(xgb_optuna_16), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75945216, 0.77819969])

In [234]:
np.mean(auc_bootstrap)

0.768924330757647

In [235]:
t_16 = auc_bootstrap
print(t_16)

[0.7670534646274778, 0.7648720522898155, 0.759985902255639, 0.7651070146958305, 0.7724068694463433, 0.7735309509569378, 0.7620605134996582, 0.768631450786056, 0.7692375469924813, 0.7699531143198907, 0.7600393028024607, 0.7577083689336979, 0.759980562200957, 0.7751943779904307, 0.7722466678058784, 0.7694377990430622, 0.7746416823308271, 0.7648907424812031, 0.7671522556390977, 0.7748179041353384, 0.7720597658920028, 0.7590193523581681, 0.7736751324333561, 0.772708582535885, 0.7598016703691046, 0.776112867395762, 0.7740249060150376, 0.7730770463089542, 0.7721612269309638, 0.770628631237184, 0.7796426435406697, 0.7660762346206424, 0.7689732142857142, 0.769528579972659, 0.7677396616541353, 0.7589712918660287, 0.7624022769993165, 0.7743613294600138, 0.7671949760765551, 0.7730156356801093, 0.7668665627136022, 0.772735282809296, 0.7690853554340397, 0.7745562414559124, 0.7714696898496242, 0.7681161355092276, 0.773301328605605, 0.7752878289473684, 0.769464499316473, 0.7693416780587833, 0.7747564

In [236]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [237]:
# 17.
column_to_drop_16 = 'Cat_현재 주변도로의 보행 안전'

In [238]:
# Remove the feature named by `column_to_drop_16` from `comp_16`.
# A name beginning with 'Cat_' is treated as a prefix: every column whose name
# starts with it is removed. Any other name is dropped as a single column.
# The prefix is compared literally (str.startswith) instead of being spliced
# into a regular expression, so punctuation in a feature name (parentheses,
# '+', '.', ...) can neither change which columns match nor raise a regex error.
if column_to_drop_16.startswith('Cat_'):
    comp_17 = comp_16.drop(
        columns=[col for col in comp_16.columns if str(col).startswith(column_to_drop_16)]
    )
else:
    comp_17 = comp_16.drop(column_to_drop_16, axis=1)

# Predictors and label for this elimination step.
X_17 = comp_17.drop('target', axis=1)
y_17 = comp_17['target']

print(X_17.shape)

(6119, 188)


In [239]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [240]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyperparameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    ROC AUC of the class-1 probabilities predicted for the validation split.
    """
    # Keyword arguments are evaluated top to bottom, so the suggest_* calls
    # run in the order the keys are listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant, but recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the class-1 probability.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search for the current feature set: maximise the validation AUC returned by `objective`.
direction = "maximize"
# TPE sampler with a fixed seed (10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial here - confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [241]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7956243374318651


In [242]:
xgb_optuna_17 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_17.fit(X_train, y_train)

In [243]:
xgb_optuna_proba_17 = xgb_optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, xgb_optuna_proba_17)
print(decimal.Decimal(auc_17).quantize(decimal.Decimal('1.000')))

0.776


In [244]:
X_train = X_train.values
y_train = y_train.values

In [245]:
auc_bootstrap = []

In [246]:
# Start from an empty accumulator so that re-running this cell cannot append a
# second batch of scores to the list that bootstrap_auc() fills.
auc_bootstrap = []
rs = RandomState(seed = 17)
# bootstrap_auc() refits the estimator it receives on every resample. Pass an
# unfitted copy with the same hyperparameters so that xgb_optuna_17 keeps the
# fit on the training split that produced auc_17.
bootstrap_auc(sklearn.base.clone(xgb_optuna_17), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75465966, 0.77610259])

In [247]:
np.mean(auc_bootstrap)

0.7659206080720267

In [248]:
t_17 = auc_bootstrap
print(t_17)

[0.7617187500000001, 0.7797921650717702, 0.7635717489747096, 0.7719369446343131, 0.768190896274778, 0.765924043062201, 0.762987012987013, 0.7708716037252221, 0.7566003075871497, 0.7567845394736842, 0.7671148752563226, 0.7726017814422419, 0.7614517472658919, 0.7654647983595352, 0.7606106886534518, 0.7691761363636364, 0.7741103468899522, 0.7746203221120984, 0.7706847018113465, 0.7712747778537252, 0.7570889225905673, 0.7725857612781956, 0.7619670625427205, 0.7690025845864662, 0.7773544301093643, 0.7611767344497606, 0.7607708902939165, 0.7630751238892686, 0.7649761833561176, 0.7635984492481203, 0.7724282296650717, 0.7647599111414901, 0.7688851033834587, 0.7664446983937115, 0.7674486286739577, 0.7710932159945318, 0.7658172419685577, 0.7760728169856459, 0.7675100393028025, 0.767587470095694, 0.7662043959330143, 0.7622073650034176, 0.774404049897471, 0.7675073692754615, 0.7723961893369788, 0.7713468685919345, 0.765926713089542, 0.7665434894053315, 0.7598230305878332, 0.7710531655844157, 0.762

In [249]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [250]:
## 18.
column_to_drop_17 ='부채 중 금융기관 대출금의 비중'

In [251]:
# Remove the feature named by `column_to_drop_17` from `comp_17`.
# A name beginning with 'Cat_' is treated as a prefix: every column whose name
# starts with it is removed. Any other name is dropped as a single column.
# The prefix is compared literally (str.startswith) instead of being spliced
# into a regular expression, so punctuation in a feature name (parentheses,
# '+', '.', ...) can neither change which columns match nor raise a regex error.
if column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(
        columns=[col for col in comp_17.columns if str(col).startswith(column_to_drop_17)]
    )
else:
    comp_18 = comp_17.drop(column_to_drop_17, axis=1)

# Predictors and label for this elimination step.
X_18 = comp_18.drop('target', axis=1)
y_18 = comp_18['target']

print(X_18.shape)

(6119, 187)


In [252]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [253]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyperparameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    ROC AUC of the class-1 probabilities predicted for the validation split.
    """
    # Keyword arguments are evaluated top to bottom, so the suggest_* calls
    # run in the order the keys are listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant, but recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the class-1 probability.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search for the current feature set: maximise the validation AUC returned by `objective`.
direction = "maximize"
# TPE sampler with a fixed seed (10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial here - confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [254]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 119, 'learning_rate': 0.04, 'max_depth': 7, 'max_leaves': 830, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 2, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7957996310486734


In [255]:
xgb_optuna_18 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_18.fit(X_train, y_train)

In [256]:
xgb_optuna_proba_18 = xgb_optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, xgb_optuna_proba_18)
print(decimal.Decimal(auc_18).quantize(decimal.Decimal('1.000')))

0.779


In [257]:
X_train = X_train.values
y_train = y_train.values

In [258]:
auc_bootstrap = []

In [259]:
# Start from an empty accumulator so that re-running this cell cannot append a
# second batch of scores to the list that bootstrap_auc() fills.
auc_bootstrap = []
rs = RandomState(seed = 18)
# bootstrap_auc() refits the estimator it receives on every resample. Pass an
# unfitted copy with the same hyperparameters so that xgb_optuna_18 keeps the
# fit on the training split that produced auc_18.
bootstrap_auc(sklearn.base.clone(xgb_optuna_18), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7597751 , 0.77758145])

In [260]:
np.mean(auc_bootstrap)

0.7688180316291439

In [261]:
t_18 = auc_bootstrap
print(t_18)

[0.7678758330485305, 0.7698142728981545, 0.7690880254613808, 0.7697929126794258, 0.7737125128161313, 0.7711639717190704, 0.7654327580314422, 0.7667010210184553, 0.7734214798359534, 0.7739261150034177, 0.7689358339029392, 0.7674726589200275, 0.7673845480177717, 0.7688450529733424, 0.7593103853383458, 0.7667784518113465, 0.7757390635680109, 0.7630644437799043, 0.7734455100820232, 0.7688076725905674, 0.7762223385167464, 0.7650829844497608, 0.763483638072454, 0.7684285287081339, 0.7717954331852357, 0.7665968899521531, 0.7618896317498292, 0.7704337192412851, 0.7722413277511961, 0.7632460056390977, 0.7605279178058784, 0.7698142728981545, 0.7673738679084074, 0.7780860176008202, 0.7696620813397129, 0.7619563824333561, 0.7684712491455912, 0.7683804682159945, 0.7650829844497608, 0.7745001708817499, 0.7745802717019822, 0.7651390550239233, 0.7661910457963089, 0.767795732228298, 0.7735549812030075, 0.7574146659261791, 0.7668078221120984, 0.7683243976418319, 0.7629442925495558, 0.7747938738892686, 0

In [262]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [263]:
# 19
column_to_drop_18 = '총 가구원 수'

In [264]:
# Remove the feature named by `column_to_drop_18` from `comp_18`.
# A name beginning with 'Cat_' is treated as a prefix: every column whose name
# starts with it is removed. Any other name is dropped as a single column.
# The prefix is compared literally (str.startswith) instead of being spliced
# into a regular expression, so punctuation in a feature name (parentheses,
# '+', '.', ...) can neither change which columns match nor raise a regex error.
if column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(
        columns=[col for col in comp_18.columns if str(col).startswith(column_to_drop_18)]
    )
else:
    comp_19 = comp_18.drop(column_to_drop_18, axis=1)

# Predictors and label for this elimination step.
X_19 = comp_19.drop('target', axis=1)
y_19 = comp_19['target']

print(X_19.shape)

(6119, 186)


In [265]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [266]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyperparameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    ROC AUC of the class-1 probabilities predicted for the validation split.
    """
    # Keyword arguments are evaluated top to bottom, so the suggest_* calls
    # run in the order the keys are listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant, but recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the class-1 probability.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search for the current feature set: maximise the validation AUC returned by `objective`.
direction = "maximize"
# TPE sampler with a fixed seed (10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial here - confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [267]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 6, 'max_leaves': 736, 'subsample': 0.7000000000000001, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 4, 'reg_lambda': 9, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7943597191963204


In [268]:
xgb_optuna_19 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_19.fit(X_train, y_train)

In [269]:
xgb_optuna_proba_19 = xgb_optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, xgb_optuna_proba_19)
print(decimal.Decimal(auc_19).quantize(decimal.Decimal('1.000')))

0.779


In [270]:
X_train = X_train.values
y_train = y_train.values

In [271]:
auc_bootstrap = []

In [272]:
# Start from an empty accumulator so that re-running this cell cannot append a
# second batch of scores to the list that bootstrap_auc() fills.
auc_bootstrap = []
rs = RandomState(seed = 19)
# bootstrap_auc() refits the estimator it receives on every resample. Pass an
# unfitted copy with the same hyperparameters so that xgb_optuna_19 keeps the
# fit on the training split that produced auc_19.
bootstrap_auc(sklearn.base.clone(xgb_optuna_19), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76046484, 0.77829164])

In [273]:
np.mean(auc_bootstrap)

0.7693510992502564

In [274]:
t_19 = auc_bootstrap
print(t_19)

[0.7718595138414218, 0.7615879186602871, 0.7696113508202324, 0.7703242481203008, 0.7652698863636364, 0.7740996667805877, 0.7775840524606972, 0.7777362440191388, 0.7659373931989064, 0.7758271744702666, 0.7682336167122351, 0.770431049213944, 0.7776054126794257, 0.7660869147300068, 0.7663459073820915, 0.7725670710868079, 0.7713762388926863, 0.7629496326042378, 0.766423338174983, 0.7779605263157895, 0.7629603127136022, 0.7716298914900888, 0.7769859663362952, 0.7770313568010936, 0.7681455058099795, 0.7686314507860561, 0.7753278793574847, 0.769264247265892, 0.7673738679084074, 0.7768925153793574, 0.7753625897129186, 0.776337149692413, 0.7743506493506493, 0.7742331681476418, 0.775245108509911, 0.7773330698906357, 0.7738513542378676, 0.7716379015721121, 0.7709463644907724, 0.7732719583048531, 0.7718541737867395, 0.7774452110389611, 0.7734241498632946, 0.7739928656869447, 0.7679212235133287, 0.7665781997607655, 0.7704977998974709, 0.7709597146274778, 0.7798348855092275, 0.7693763884142173, 0.77

In [275]:
# 20.
column_to_drop_19 = 'Cat_현재 공공기관 접근용이성'

In [276]:
# Remove the feature named by `column_to_drop_19` from `comp_19`.
# A name beginning with 'Cat_' is treated as a prefix: every column whose name
# starts with it is removed. Any other name is dropped as a single column.
# The prefix is compared literally (str.startswith) instead of being spliced
# into a regular expression, so punctuation in a feature name (parentheses,
# '+', '.', ...) can neither change which columns match nor raise a regex error.
if column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(
        columns=[col for col in comp_19.columns if str(col).startswith(column_to_drop_19)]
    )
else:
    comp_20 = comp_19.drop(column_to_drop_19, axis=1)

# Predictors and label for this elimination step.
X_20 = comp_20.drop('target', axis=1)
y_20 = comp_20['target']

print(X_20.shape)

(6119, 182)


In [277]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [278]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyperparameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    ROC AUC of the class-1 probabilities predicted for the validation split.
    """
    # Keyword arguments are evaluated top to bottom, so the suggest_* calls
    # run in the order the keys are listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant, but recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the class-1 probability.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search for the current feature set: maximise the validation AUC returned by `objective`.
direction = "maximize"
# TPE sampler with a fixed seed (10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial here - confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [279]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7942887670180887


In [280]:
xgb_optuna_20 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_20.fit(X_train, y_train)

In [281]:
xgb_optuna_proba_20 = xgb_optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, xgb_optuna_proba_20)
print(decimal.Decimal(auc_20).quantize(decimal.Decimal('1.000')))

0.775


In [282]:
X_train = X_train.values
y_train = y_train.values

In [283]:
auc_bootstrap = []

In [284]:
# Start from an empty accumulator so that re-running this cell cannot append a
# second batch of scores to the list that bootstrap_auc() fills.
auc_bootstrap = []
rs = RandomState(seed = 20)
# bootstrap_auc() refits the estimator it receives on every resample. Pass an
# unfitted copy with the same hyperparameters so that xgb_optuna_20 keeps the
# fit on the training split that produced auc_20.
bootstrap_auc(sklearn.base.clone(xgb_optuna_20), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7554044 , 0.77603597])

In [285]:
np.mean(auc_bootstrap)

0.7659673615857827

In [286]:
t_20 = auc_bootstrap
print(t_20)

[0.7681134654818866, 0.7694004186602871, 0.7734481801093643, 0.7583278152768285, 0.7561357228298017, 0.7734294899179768, 0.7703442733253588, 0.7693683783321942, 0.7559247906698565, 0.775309189166097, 0.7629763328776487, 0.7743773496240601, 0.7602048444976077, 0.771437649521531, 0.7689571941216677, 0.7735763414217361, 0.7591929041353384, 0.7601541139781272, 0.7700171949760766, 0.7565709372863978, 0.7665488294600136, 0.7569474111414902, 0.7628481715652768, 0.7658386021872864, 0.7674005681818181, 0.7652565362269309, 0.771002435064935, 0.7605919984620643, 0.7600713431305537, 0.7676862611073137, 0.7632486756664387, 0.7583785457963089, 0.7647118506493505, 0.7665968899521531, 0.7567284688995215, 0.7697048017771702, 0.7629149222488039, 0.7655769395078605, 0.7661750256322625, 0.7713575487012987, 0.7619563824333562, 0.7575348171565277, 0.7639081724196857, 0.7647759313055366, 0.7545096761790842, 0.7643647470950103, 0.7592035842447027, 0.7616733595352015, 0.7638521018455229, 0.7633768369788108, 0.

In [287]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [288]:
# 21
column_to_drop_20 = 'Cat_이사 계획 중인 주택의 유형'

In [289]:
# Remove the feature named by `column_to_drop_20` from `comp_20`.
# A name beginning with 'Cat_' is treated as a prefix: every column whose name
# starts with it is removed. Any other name is dropped as a single column.
# The prefix is compared literally (str.startswith) instead of being spliced
# into a regular expression, so punctuation in a feature name (parentheses,
# '+', '.', ...) can neither change which columns match nor raise a regex error.
if column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(
        columns=[col for col in comp_20.columns if str(col).startswith(column_to_drop_20)]
    )
else:
    comp_21 = comp_20.drop(column_to_drop_20, axis=1)

# Predictors and label for this elimination step.
X_21 = comp_21.drop('target', axis=1)
y_21 = comp_21['target']

print(X_21.shape)

(6119, 168)


In [290]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [291]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyperparameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    ROC AUC of the class-1 probabilities predicted for the validation split.
    """
    # Keyword arguments are evaluated top to bottom, so the suggest_* calls
    # run in the order the keys are listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant, but recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the class-1 probability.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search for the current feature set: maximise the validation AUC returned by `objective`.
direction = "maximize"
# TPE sampler with a fixed seed (10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial here - confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [292]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7954699121027723


In [293]:
xgb_optuna_21 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_21.fit(X_train, y_train)

In [294]:
xgb_optuna_proba_21 = xgb_optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, xgb_optuna_proba_21)
print(decimal.Decimal(auc_21).quantize(decimal.Decimal('1.000')))

0.774


In [295]:
X_train = X_train.values
y_train = y_train.values

In [296]:
auc_bootstrap = []

In [297]:
# Start from an empty accumulator so that re-running this cell cannot append a
# second batch of scores to the list that bootstrap_auc() fills.
auc_bootstrap = []
rs = RandomState(seed = 21)
# bootstrap_auc() refits the estimator it receives on every resample. Pass an
# unfitted copy with the same hyperparameters so that xgb_optuna_21 keeps the
# fit on the training split that produced auc_21.
bootstrap_auc(sklearn.base.clone(xgb_optuna_21), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75498941, 0.77631445])

In [298]:
np.mean(auc_bootstrap)

0.7658588002766148

In [299]:
t_21 = auc_bootstrap
print(t_21)

[0.7650162337662337, 0.7607148197197539, 0.7563920454545454, 0.7698409731715653, 0.7578178400546822, 0.7708822838345865, 0.7644848983253589, 0.7712507476076554, 0.7749861158578264, 0.7648747223171566, 0.7643620770676691, 0.7696567412850308, 0.7668011470437459, 0.7748018839712919, 0.7622714456596036, 0.7666422804169515, 0.7624957279562543, 0.7732746283321941, 0.7646130596377307, 0.7716565917634997, 0.7656089798359536, 0.7637292805878331, 0.7627093301435407, 0.7721371966848941, 0.7713789089200274, 0.7658466122693096, 0.7670427845181134, 0.7677183014354066, 0.7587924000341764, 0.7666850008544088, 0.7679105434039646, 0.7702494873547505, 0.7731010765550239, 0.7595773880724539, 0.7654674683868763, 0.7623488764524948, 0.7662043959330143, 0.7679265635680109, 0.7653766874572796, 0.7702895377648667, 0.772270698051948, 0.7650402640123035, 0.7642018754272044, 0.7646344198564593, 0.7660255041011619, 0.7628989020847574, 0.7591314935064936, 0.7755308014354068, 0.7636358296308954, 0.7608883714969241, 

In [300]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [301]:
# 22
column_to_drop_21 = '총 이사 횟수'

In [302]:
# Remove the feature named by `column_to_drop_21` from `comp_21`.
# A name beginning with 'Cat_' is treated as a prefix: every column whose name
# starts with it is removed. Any other name is dropped as a single column.
# The prefix is compared literally (str.startswith) instead of being spliced
# into a regular expression, so punctuation in a feature name (parentheses,
# '+', '.', ...) can neither change which columns match nor raise a regex error.
if column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(
        columns=[col for col in comp_21.columns if str(col).startswith(column_to_drop_21)]
    )
else:
    comp_22 = comp_21.drop(column_to_drop_21, axis=1)

# Predictors and label for this elimination step.
X_22 = comp_22.drop('target', axis=1)
y_22 = comp_22['target']

print(X_22.shape)

(6119, 167)


In [303]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [304]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyperparameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    ROC AUC of the class-1 probabilities predicted for the validation split.
    """
    # Keyword arguments are evaluated top to bottom, so the suggest_* calls
    # run in the order the keys are listed.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant, but recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the class-1 probability.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search for the current feature set: maximise the validation AUC returned by `objective`.
direction = "maximize"
# TPE sampler with a fixed seed (10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial here - confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [305]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7954052204108548


In [306]:
xgb_optuna_22 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_22.fit(X_train, y_train)

In [307]:
xgb_optuna_proba_22 = xgb_optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, xgb_optuna_proba_22)
print(decimal.Decimal(auc_22).quantize(decimal.Decimal('1.000')))

0.775


In [308]:
X_train = X_train.values
y_train = y_train.values

In [309]:
auc_bootstrap = []

In [310]:
# Start from an empty accumulator so that re-running this cell cannot append a
# second batch of scores to the list that bootstrap_auc() fills.
auc_bootstrap = []
rs = RandomState(seed = 22)
# bootstrap_auc() refits the estimator it receives on every resample. Pass an
# unfitted copy with the same hyperparameters so that xgb_optuna_22 keeps the
# fit on the training split that produced auc_22.
bootstrap_auc(sklearn.base.clone(xgb_optuna_22), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75471013, 0.77707715])

In [311]:
np.mean(auc_bootstrap)

0.7661984884975223

In [312]:
t_22 = auc_bootstrap
print(t_22)

[0.7598951213260423, 0.7651710953520163, 0.7577884697539303, 0.7643086765208477, 0.7581916438824334, 0.7662391062884484, 0.7631659048188653, 0.7711706467874231, 0.7643407168489406, 0.7738673744019138, 0.7685780502392344, 0.7664019779562543, 0.7657318010936431, 0.7736778024606972, 0.7640737141148325, 0.7604264567669173, 0.7592649948735475, 0.7715791609706084, 0.7618389012303486, 0.7713975991114149, 0.7616413192071088, 0.7770767472658919, 0.7683537679425838, 0.7662577964798359, 0.7750074760765551, 0.7702721825871497, 0.7600579929938482, 0.7521920924470267, 0.7659187030075189, 0.7531239319890637, 0.7676488807245387, 0.7665061090225564, 0.77143497949419, 0.7615238380041012, 0.7664019779562543, 0.768396488380041, 0.7638387517088175, 0.7656970907382091, 0.7688637431647299, 0.7579273111756665, 0.7748339242993848, 0.762957642686261, 0.7613396061175666, 0.757446706254272, 0.7687249017429939, 0.781751965140123, 0.7687355818523581, 0.7666209201982228, 0.767664900888585, 0.7575187969924813, 0.7700

In [313]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [314]:
# 23.
column_to_drop_22 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [315]:
# Remove the feature named by `column_to_drop_22` from `comp_22`.
# A name beginning with 'Cat_' is treated as a prefix: every column whose name
# starts with it is removed. Any other name is dropped as a single column.
# The prefix is compared literally (str.startswith) instead of being spliced
# into a regular expression, so punctuation in a feature name (parentheses,
# '+', '.', ...) can neither change which columns match nor raise a regex error.
if column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(
        columns=[col for col in comp_22.columns if str(col).startswith(column_to_drop_22)]
    )
else:
    comp_23 = comp_22.drop(column_to_drop_22, axis=1)

# Predictors and label for this elimination step.
X_23 = comp_23.drop('target', axis=1)
y_23 = comp_23['target']

print(X_23.shape)

(6119, 163)


In [316]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [317]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBClassifier.

    Draws a hyperparameter set from `trial`, fits an XGBClassifier on the
    notebook-level `X_train`/`y_train`, and scores column 1 of
    `predict_proba` on the notebook-level `X_val`/`y_val`.

    Parameters
    ----------
    trial : object exposing `suggest_int`, `suggest_float`, `suggest_categorical`.

    Returns
    -------
    The ROC AUC computed by `roc_auc_score` on the validation split.
    """
    # The suggest_* calls are issued in the same order as before.
    hp = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
    }
    for frac_name in ('subsample', 'colsample_bytree'):
        hp[frac_name] = trial.suggest_float(frac_name, 0.1, 1.0, step=0.1)
    for int_name in ('gamma', 'reg_alpha', 'reg_lambda'):
        hp[int_name] = trial.suggest_int(int_name, 1, 10)
    # Single-choice categoricals: recorded in the trial params, nothing to search.
    hp['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    hp['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **hp)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: maximize the value returned by `objective`
# for up to 200 trials, using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably has nothing to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it ends the
# study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [318]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7964340269952169


In [319]:
xgb_optuna_23 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_23.fit(X_train, y_train)

In [320]:
xgb_optuna_proba_23 = xgb_optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, xgb_optuna_proba_23)
print(decimal.Decimal(auc_23).quantize(decimal.Decimal('1.000')))

0.775


In [321]:
# bootstrap_auc resamples rows by integer position (X_train[idx], y_train[idx]),
# so the training split is converted to NumPy arrays here.
# np.asarray (instead of `.values`) makes re-running this cell a no-op rather than
# an AttributeError, because an ndarray has no `.values` attribute.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [322]:
auc_bootstrap = []

In [323]:
rs = RandomState(seed = 23)
bootstrap_auc(xgb_optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75552142, 0.7767484 ])

In [324]:
np.mean(auc_bootstrap)

0.766688079395933

In [325]:
t_23 = auc_bootstrap
print(t_23)

[0.7611954246411483, 0.7654968386876281, 0.7799203263841421, 0.7700465652768285, 0.7719102443609023, 0.7748419343814081, 0.7689758843130554, 0.7635463837149693, 0.7594759270334928, 0.7710077751196174, 0.7765534219070405, 0.7604211167122351, 0.7654487781954887, 0.7689144736842105, 0.7697768925153793, 0.7670935150375939, 0.7744814806903623, 0.7553320446001367, 0.7634809680451128, 0.7676782510252904, 0.7656330100820232, 0.7660628844839372, 0.774339969241285, 0.7660842447026658, 0.7626826298701298, 0.765053614149009, 0.7678678229665072, 0.7571583433014353, 0.7664366883116882, 0.7669840439166097, 0.770695381920711, 0.7650162337662337, 0.7722867182159945, 0.7676488807245385, 0.7695178998632947, 0.7616493292891319, 0.7687783022898154, 0.76742993848257, 0.7694511491797676, 0.7647065105946684, 0.764324696684894, 0.753537786226931, 0.7690773453520163, 0.7722867182159945, 0.7676061602870814, 0.7619590524606972, 0.7647866114149009, 0.7670481245727956, 0.7657504912850308, 0.7731544771018455, 0.7675

In [326]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [327]:
# 24
column_to_drop_23 = '소득 중 근로/사업소득의 비중(월평균)'

In [328]:
# Build comp_24 by removing the feature named by column_to_drop_23 from comp_23.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably one-hot dummy columns — verify); any other name is one column.
if column_to_drop_23.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex='^' + name), regex metacharacters
    # in the name ('(', ')', '.', '+', ...) cannot change which columns are selected.
    dropped_cols_24 = [col for col in comp_23.columns if str(col).startswith(column_to_drop_23)]
else:
    dropped_cols_24 = [column_to_drop_23]

comp_24 = comp_23.drop(columns=dropped_cols_24)
X_24 = comp_24.drop('target', axis=1)
y_24 = comp_24['target']

print(X_24.shape)

(6119, 162)


In [329]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [330]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBClassifier.

    Draws a hyperparameter set from `trial`, fits an XGBClassifier on the
    notebook-level `X_train`/`y_train`, and scores column 1 of
    `predict_proba` on the notebook-level `X_val`/`y_val`.

    Parameters
    ----------
    trial : object exposing `suggest_int`, `suggest_float`, `suggest_categorical`.

    Returns
    -------
    The ROC AUC computed by `roc_auc_score` on the validation split.
    """
    # The suggest_* calls are issued in the same order as before.
    hp = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
    }
    for frac_name in ('subsample', 'colsample_bytree'):
        hp[frac_name] = trial.suggest_float(frac_name, 0.1, 1.0, step=0.1)
    for int_name in ('gamma', 'reg_alpha', 'reg_lambda'):
        hp[int_name] = trial.suggest_int(int_name, 1, 10)
    # Single-choice categoricals: recorded in the trial params, nothing to search.
    hp['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    hp['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **hp)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: maximize the value returned by `objective`
# for up to 200 trials, using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably has nothing to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it ends the
# study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [331]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7952570555680766


In [332]:
xgb_optuna_24 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_24.fit(X_train, y_train)

In [333]:
xgb_optuna_proba_24 = xgb_optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, xgb_optuna_proba_24)
print(decimal.Decimal(auc_24).quantize(decimal.Decimal('1.000')))

0.775


In [334]:
# bootstrap_auc resamples rows by integer position (X_train[idx], y_train[idx]),
# so the training split is converted to NumPy arrays here.
# np.asarray (instead of `.values`) makes re-running this cell a no-op rather than
# an AttributeError, because an ndarray has no `.values` attribute.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [335]:
auc_bootstrap = []

In [336]:
rs = RandomState(seed = 24)
bootstrap_auc(xgb_optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75623338, 0.77697909])

In [337]:
np.mean(auc_bootstrap)

0.766875485944976

In [338]:
t_24 = auc_bootstrap
print(t_24)

[0.7758538747436774, 0.7768631450786057, 0.7756002221462747, 0.7658332621326043, 0.7607295048701299, 0.7590327024948735, 0.7695793104921395, 0.7623595565618592, 0.7639775931305537, 0.7561223726930963, 0.7660842447026658, 0.7700572453861928, 0.7636491797676008, 0.7683163875598086, 0.7687889823991799, 0.763886812200957, 0.7669119531784004, 0.7590594027682843, 0.7661376452494875, 0.7692268668831169, 0.7685032894736843, 0.7643861073137388, 0.7595052973342447, 0.7775413320232398, 0.7651844454887218, 0.7645115985987696, 0.7706686816473001, 0.7737258629528366, 0.7641030844155844, 0.7644368378332194, 0.7726685321257689, 0.7725243506493507, 0.7718515037593985, 0.7532120428913193, 0.7727699931647299, 0.7619617224880384, 0.7642953263841421, 0.776339819719754, 0.7714963901230347, 0.7568512901572112, 0.7659801136363635, 0.7734107997265891, 0.7711092361585783, 0.7669012730690363, 0.764960163192071, 0.7640390037593985, 0.762447667464115, 0.7666529605263159, 0.7764306006493507, 0.7665408193779903, 0.7

In [339]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [340]:
# 25: column-name prefix removed from comp_24 to build comp_25
column_to_drop_24 = 'Cat_현재 문화시설 접근용이성'

In [341]:
# Build comp_25 by removing the feature named by column_to_drop_24 from comp_24.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably one-hot dummy columns — verify); any other name is one column.
if column_to_drop_24.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex='^' + name), regex metacharacters
    # in the name ('(', ')', '.', '+', ...) cannot change which columns are selected.
    dropped_cols_25 = [col for col in comp_24.columns if str(col).startswith(column_to_drop_24)]
else:
    dropped_cols_25 = [column_to_drop_24]

comp_25 = comp_24.drop(columns=dropped_cols_25)
X_25 = comp_25.drop('target', axis=1)
y_25 = comp_25['target']

print(X_25.shape)

(6119, 158)


In [342]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [343]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBClassifier.

    Draws a hyperparameter set from `trial`, fits an XGBClassifier on the
    notebook-level `X_train`/`y_train`, and scores column 1 of
    `predict_proba` on the notebook-level `X_val`/`y_val`.

    Parameters
    ----------
    trial : object exposing `suggest_int`, `suggest_float`, `suggest_categorical`.

    Returns
    -------
    The ROC AUC computed by `roc_auc_score` on the validation split.
    """
    # The suggest_* calls are issued in the same order as before.
    hp = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
    }
    for frac_name in ('subsample', 'colsample_bytree'):
        hp[frac_name] = trial.suggest_float(frac_name, 0.1, 1.0, step=0.1)
    for int_name in ('gamma', 'reg_alpha', 'reg_lambda'):
        hp[int_name] = trial.suggest_int(int_name, 1, 10)
    # Single-choice categoricals: recorded in the trial params, nothing to search.
    hp['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    hp['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **hp)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: maximize the value returned by `objective`
# for up to 200 trials, using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably has nothing to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it ends the
# study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [344]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7956702476648385


In [345]:
xgb_optuna_25 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_25.fit(X_train, y_train)

In [346]:
xgb_optuna_proba_25 = xgb_optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, xgb_optuna_proba_25)
print(decimal.Decimal(auc_25).quantize(decimal.Decimal('1.000')))

0.775


In [347]:
# bootstrap_auc resamples rows by integer position (X_train[idx], y_train[idx]),
# so the training split is converted to NumPy arrays here.
# np.asarray (instead of `.values`) makes re-running this cell a no-op rather than
# an AttributeError, because an ndarray has no `.values` attribute.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [348]:
auc_bootstrap = []

In [349]:
rs = RandomState(seed = 25)
bootstrap_auc(xgb_optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75552475, 0.77727596])

In [350]:
np.mean(auc_bootstrap)

0.7665307240046139

In [351]:
t_25 = auc_bootstrap
print(t_25)

[0.7635049982911825, 0.7642339157552975, 0.7629469625768968, 0.767728981544771, 0.7702361372180451, 0.7590193523581683, 0.7649334629186603, 0.7719462897300069, 0.767189636021873, 0.7719262645249487, 0.7662257561517429, 0.7657024307928914, 0.7707194121667806, 0.7697501922419686, 0.7742198180109364, 0.7637292805878333, 0.7676862611073136, 0.7633795070061518, 0.7643513969583048, 0.7597482698222832, 0.7678678229665072, 0.7618949718045113, 0.7739434701811346, 0.7734161397812713, 0.7653686773752564, 0.7705538704716336, 0.7623328562884484, 0.7728741242310322, 0.7659347231715653, 0.7681668660287082, 0.7704844497607655, 0.7651710953520164, 0.7649521531100478, 0.7770593920881749, 0.7599725521189336, 0.7681161355092276, 0.7812473299726588, 0.7678357826384143, 0.7760541267942583, 0.761924342105263, 0.7744814806903623, 0.7629496326042379, 0.7704150290498974, 0.7618629314764183, 0.7707648026315789, 0.7627173402255639, 0.7619617224880383, 0.7637733360389611, 0.7683804682159946, 0.7644047975051265, 0.

In [352]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [353]:
# 26
column_to_drop_25 = 'Cat_가족계획 시 중요 고려 사항 1순위'

In [354]:
# Build comp_26 by removing the feature named by column_to_drop_25 from comp_25.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably one-hot dummy columns — verify); any other name is one column.
if column_to_drop_25.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex='^' + name), regex metacharacters
    # in the name ('(', ')', '.', '+', ...) cannot change which columns are selected.
    dropped_cols_26 = [col for col in comp_25.columns if str(col).startswith(column_to_drop_25)]
else:
    dropped_cols_26 = [column_to_drop_25]

comp_26 = comp_25.drop(columns=dropped_cols_26)
X_26 = comp_26.drop('target', axis=1)
y_26 = comp_26['target']

print(X_26.shape)

(6119, 151)


In [355]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [356]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBClassifier.

    Draws a hyperparameter set from `trial`, fits an XGBClassifier on the
    notebook-level `X_train`/`y_train`, and scores column 1 of
    `predict_proba` on the notebook-level `X_val`/`y_val`.

    Parameters
    ----------
    trial : object exposing `suggest_int`, `suggest_float`, `suggest_categorical`.

    Returns
    -------
    The ROC AUC computed by `roc_auc_score` on the validation split.
    """
    # The suggest_* calls are issued in the same order as before.
    hp = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
    }
    for frac_name in ('subsample', 'colsample_bytree'):
        hp[frac_name] = trial.suggest_float(frac_name, 0.1, 1.0, step=0.1)
    for int_name in ('gamma', 'reg_alpha', 'reg_lambda'):
        hp[int_name] = trial.suggest_int(int_name, 1, 10)
    # Single-choice categoricals: recorded in the trial params, nothing to search.
    hp['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    hp['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **hp)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: maximize the value returned by `objective`
# for up to 200 trials, using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably has nothing to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it ends the
# study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [357]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 167, 'learning_rate': 0.05, 'max_depth': 4, 'max_leaves': 494, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 1, 'reg_alpha': 3, 'reg_lambda': 10, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7959290144325079


In [358]:
xgb_optuna_26 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_26.fit(X_train, y_train)

In [359]:
xgb_optuna_proba_26 = xgb_optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, xgb_optuna_proba_26)
print(decimal.Decimal(auc_26).quantize(decimal.Decimal('1.000')))

0.782


In [360]:
# bootstrap_auc resamples rows by integer position (X_train[idx], y_train[idx]),
# so the training split is converted to NumPy arrays here.
# np.asarray (instead of `.values`) makes re-running this cell a no-op rather than
# an AttributeError, because an ndarray has no `.values` attribute.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [361]:
auc_bootstrap = []

In [362]:
rs = RandomState(seed = 26)
bootstrap_auc(xgb_optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76294409, 0.7809617 ])

In [363]:
np.mean(auc_bootstrap)

0.772005336049641

In [364]:
t_26 = auc_bootstrap
print(t_26)

[0.7683564379699248, 0.7663779477101845, 0.7716726119275461, 0.7715551307245386, 0.7648987525632263, 0.7728153836295284, 0.760546607997266, 0.7771621881408066, 0.7706366413192072, 0.7767269736842105, 0.7762383586807928, 0.7649614982057417, 0.7655956296992482, 0.7653846975393028, 0.7700438952494872, 0.7689491840396446, 0.7853805322966507, 0.7615772385509227, 0.768959864149009, 0.7623408663704716, 0.7736804724880383, 0.774537551264525, 0.7706606715652767, 0.7671442455570745, 0.7782969497607656, 0.7651937905844157, 0.778112717874231, 0.7793008800410117, 0.774639012303486, 0.7733814294258372, 0.7803261705399864, 0.7776534731715653, 0.7686474709501026, 0.7710691857484622, 0.7728367438482571, 0.7669386534518112, 0.7786440533151059, 0.77400621582365, 0.7725911013328777, 0.7726765422077921, 0.775509441216678, 0.7712774478810663, 0.7685112995557075, 0.7720837961380724, 0.7792207792207793, 0.7720784560833903, 0.7729595651059468, 0.7689011235475051, 0.7688877734107996, 0.7682923573137389, 0.78095

In [365]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [366]:
# 27
column_to_drop_26 = 'Cat_현재 의료시설 접근용이성'

In [367]:
# Build comp_27 by removing the feature named by column_to_drop_26 from comp_26.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably one-hot dummy columns — verify); any other name is one column.
if column_to_drop_26.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex='^' + name), regex metacharacters
    # in the name ('(', ')', '.', '+', ...) cannot change which columns are selected.
    dropped_cols_27 = [col for col in comp_26.columns if str(col).startswith(column_to_drop_26)]
else:
    dropped_cols_27 = [column_to_drop_26]

comp_27 = comp_26.drop(columns=dropped_cols_27)
X_27 = comp_27.drop('target', axis=1)
y_27 = comp_27['target']

print(X_27.shape)

(6119, 147)


In [368]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [369]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBClassifier.

    Draws a hyperparameter set from `trial`, fits an XGBClassifier on the
    notebook-level `X_train`/`y_train`, and scores column 1 of
    `predict_proba` on the notebook-level `X_val`/`y_val`.

    Parameters
    ----------
    trial : object exposing `suggest_int`, `suggest_float`, `suggest_categorical`.

    Returns
    -------
    The ROC AUC computed by `roc_auc_score` on the validation split.
    """
    # The suggest_* calls are issued in the same order as before.
    hp = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
    }
    for frac_name in ('subsample', 'colsample_bytree'):
        hp[frac_name] = trial.suggest_float(frac_name, 0.1, 1.0, step=0.1)
    for int_name in ('gamma', 'reg_alpha', 'reg_lambda'):
        hp[int_name] = trial.suggest_int(int_name, 1, 10)
    # Single-choice categoricals: recorded in the trial params, nothing to search.
    hp['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    hp['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **hp)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: maximize the value returned by `objective`
# for up to 200 trials, using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably has nothing to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it ends the
# study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [370]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 178, 'learning_rate': 0.05, 'max_depth': 4, 'max_leaves': 472, 'subsample': 0.9, 'colsample_bytree': 0.6, 'gamma': 1, 'reg_alpha': 3, 'reg_lambda': 10, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7979782802861459


In [371]:
xgb_optuna_27 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_27.fit(X_train, y_train)

In [372]:
xgb_optuna_proba_27 = xgb_optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, xgb_optuna_proba_27)
print(decimal.Decimal(auc_27).quantize(decimal.Decimal('1.000')))

0.778


In [373]:
# bootstrap_auc resamples rows by integer position (X_train[idx], y_train[idx]),
# so the training split is converted to NumPy arrays here.
# np.asarray (instead of `.values`) makes re-running this cell a no-op rather than
# an AttributeError, because an ndarray has no `.values` attribute.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [374]:
auc_bootstrap = []

In [375]:
rs = RandomState(seed = 27)
bootstrap_auc(xgb_optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76161996, 0.78088708])

In [376]:
np.mean(auc_bootstrap)

0.7712838118912337

In [377]:
t_27 = auc_bootstrap
print(t_27)

[0.7697795625427204, 0.7752290883458648, 0.7668852529049898, 0.7693176478127135, 0.7664740686944634, 0.773234577922078, 0.7719155844155845, 0.7783076298701299, 0.7743479793233082, 0.7718007732399179, 0.7634943181818181, 0.7722973983253588, 0.7754186602870813, 0.765488828605605, 0.7648693822624744, 0.77153377050581, 0.7669386534518113, 0.7736163918318524, 0.7733547291524265, 0.7678865131578948, 0.7753919600136705, 0.7741130169172932, 0.7735923615857826, 0.7753492395762132, 0.7649494830827068, 0.7730770463089542, 0.7684765892002734, 0.7678838431305537, 0.7796506536226931, 0.771902234278879, 0.7675127093301435, 0.7766255126452495, 0.7755227913533835, 0.7713228383458647, 0.7685540199931647, 0.7799016361927545, 0.7713575487012988, 0.7782275290498974, 0.7834634526657552, 0.7645596590909091, 0.7604077665755298, 0.7678544728298018, 0.7753732698222829, 0.7720197154818865, 0.7703936688311688, 0.7710531655844156, 0.7640576939507859, 0.77276999316473, 0.7653312969924813, 0.7672216763499657, 0.7663

In [378]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [379]:
# 28
column_to_drop_27  = 'Cat_이사 예상 기간'

In [380]:
# Build comp_28 by removing the feature named by column_to_drop_27 from comp_27.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably one-hot dummy columns — verify); any other name is one column.
if column_to_drop_27.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex='^' + name), regex metacharacters
    # in the name ('(', ')', '.', '+', ...) cannot change which columns are selected.
    dropped_cols_28 = [col for col in comp_27.columns if str(col).startswith(column_to_drop_27)]
else:
    dropped_cols_28 = [column_to_drop_27]

comp_28 = comp_27.drop(columns=dropped_cols_28)
X_28 = comp_28.drop('target', axis=1)
y_28 = comp_28['target']

print(X_28.shape)

(6119, 143)


In [381]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [382]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBClassifier.

    Draws a hyperparameter set from `trial`, fits an XGBClassifier on the
    notebook-level `X_train`/`y_train`, and scores column 1 of
    `predict_proba` on the notebook-level `X_val`/`y_val`.

    Parameters
    ----------
    trial : object exposing `suggest_int`, `suggest_float`, `suggest_categorical`.

    Returns
    -------
    The ROC AUC computed by `roc_auc_score` on the validation split.
    """
    # The suggest_* calls are issued in the same order as before.
    hp = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
    }
    for frac_name in ('subsample', 'colsample_bytree'):
        hp[frac_name] = trial.suggest_float(frac_name, 0.1, 1.0, step=0.1)
    for int_name in ('gamma', 'reg_alpha', 'reg_lambda'):
        hp[int_name] = trial.suggest_int(int_name, 1, 10)
    # Single-choice categoricals: recorded in the trial params, nothing to search.
    hp['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    hp['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **hp)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: maximize the value returned by `objective`
# for up to 200 trials, using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably has nothing to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it ends the
# study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)  

In [383]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 119, 'learning_rate': 0.04, 'max_depth': 7, 'max_leaves': 830, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 2, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7958288466514746


In [384]:
xgb_optuna_28 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_28.fit(X_train, y_train)

In [385]:
xgb_optuna_proba_28 = xgb_optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, xgb_optuna_proba_28)
print(decimal.Decimal(auc_28).quantize(decimal.Decimal('1.000')))

0.779


In [386]:
# bootstrap_auc resamples rows by integer position (X_train[idx], y_train[idx]),
# so the training split is converted to NumPy arrays here.
# np.asarray (instead of `.values`) makes re-running this cell a no-op rather than
# an AttributeError, because an ndarray has no `.values` attribute.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [387]:
auc_bootstrap = []

In [388]:
rs = RandomState(seed = 28)
bootstrap_auc(xgb_optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76064727, 0.77893802])

In [389]:
np.mean(auc_bootstrap)

0.7701022306743421

In [390]:
t_28 = auc_bootstrap
print(t_28)

[0.7673338174982913, 0.7679052033492824, 0.773904754784689, 0.7621993549213943, 0.7626158791866029, 0.7635824290840738, 0.7726818822624744, 0.7729061645591251, 0.769197496582365, 0.7690159347231715, 0.761854921394395, 0.7638067113807245, 0.7663378973000683, 0.7752824888926864, 0.7695446001367054, 0.776441280758715, 0.7658920027341081, 0.7719529647983595, 0.7669760338345865, 0.773207877648667, 0.7744948308270676, 0.754274713773069, 0.7732692882775121, 0.774537551264525, 0.7746790627136022, 0.7693149777853726, 0.7708422334244703, 0.7646344198564593, 0.7725029904306219, 0.7714082792207791, 0.7606026785714286, 0.7715791609706083, 0.7734268198906357, 0.7689438439849624, 0.7671522556390977, 0.7712080271701982, 0.7738219839371154, 0.777746924128503, 0.7725403708133972, 0.7679799641148326, 0.7629149222488039, 0.7698276230348597, 0.7693096377306905, 0.7749433954203692, 0.7666529605263159, 0.7638761320915926, 0.7755468215994532, 0.7763798701298701, 0.7678491327751196, 0.7672590567327409, 0.76606

In [391]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [392]:
# 29
column_to_drop_28 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [393]:
# Build comp_29 by removing the feature named by column_to_drop_28 from comp_28.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably one-hot dummy columns — verify); any other name is one column.
if column_to_drop_28.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex='^' + name), regex metacharacters
    # in the name ('(', ')', '.', '+', ...) cannot change which columns are selected.
    dropped_cols_29 = [col for col in comp_28.columns if str(col).startswith(column_to_drop_28)]
else:
    dropped_cols_29 = [column_to_drop_28]

comp_29 = comp_28.drop(columns=dropped_cols_29)
X_29 = comp_29.drop('target', axis=1)
y_29 = comp_29['target']

print(X_29.shape)

(6119, 139)


In [394]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [395]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBClassifier.

    Draws a hyperparameter set from `trial`, fits an XGBClassifier on the
    notebook-level `X_train`/`y_train`, and scores column 1 of
    `predict_proba` on the notebook-level `X_val`/`y_val`.

    Parameters
    ----------
    trial : object exposing `suggest_int`, `suggest_float`, `suggest_categorical`.

    Returns
    -------
    The ROC AUC computed by `roc_auc_score` on the validation split.
    """
    # The suggest_* calls are issued in the same order as before.
    hp = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
    }
    for frac_name in ('subsample', 'colsample_bytree'):
        hp[frac_name] = trial.suggest_float(frac_name, 0.1, 1.0, step=0.1)
    for int_name in ('gamma', 'reg_alpha', 'reg_lambda'):
        hp[int_name] = trial.suggest_int(int_name, 1, 10)
    # Single-choice categoricals: recorded in the trial params, nothing to search.
    hp['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    hp['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **hp)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: maximize the value returned by `objective`
# for up to 200 trials, using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably has nothing to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it ends the
# study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [396]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.79342899356422


In [397]:
xgb_optuna_29 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_29.fit(X_train, y_train)

In [398]:
xgb_optuna_proba_29 = xgb_optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, xgb_optuna_proba_29)
print(decimal.Decimal(auc_29).quantize(decimal.Decimal('1.000')))

0.776


In [399]:
# bootstrap_auc resamples rows by integer position (X_train[idx], y_train[idx]),
# so the training split is converted to NumPy arrays here.
# np.asarray (instead of `.values`) makes re-running this cell a no-op rather than
# an AttributeError, because an ndarray has no `.values` attribute.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [400]:
auc_bootstrap = []

In [401]:
rs = RandomState(seed = 29)
bootstrap_auc(xgb_optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75608132, 0.77756089])

In [402]:
np.mean(auc_bootstrap)

0.767090370412893

In [403]:
t_29 = auc_bootstrap
print(t_29)

[0.7682736671223512, 0.7695232399179768, 0.7671629357484622, 0.7630858039986328, 0.767990644224197, 0.7629763328776487, 0.7621032339371154, 0.7727219326725906, 0.7617214200273411, 0.7777042036910459, 0.772836743848257, 0.7606560791182502, 0.7594278665413534, 0.7734642002734108, 0.7670214242993848, 0.7710197902426521, 0.7718114533492824, 0.7751383074162679, 0.7628001110731375, 0.7640042934039646, 0.7649841934381407, 0.7674512987012986, 0.7697154818865345, 0.7713869190020506, 0.7643140165755298, 0.7674245984278879, 0.7649161077409432, 0.7595827281271361, 0.7613689764183185, 0.7723748291182503, 0.7621646445659603, 0.7642979964114833, 0.7603276657552973, 0.7733760893711552, 0.7635984492481203, 0.7784464712918661, 0.7607388499658236, 0.7710718557758032, 0.7609604622351333, 0.7667704417293233, 0.7637399606971975, 0.7660815746753247, 0.7602101845522898, 0.7560102315447711, 0.758349175495557, 0.7652485261449078, 0.7688103426179085, 0.7621139140464798, 0.7665061090225566, 0.7794690917634997, 0.

In [404]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [405]:
# 30: column-name prefix removed from comp_29 to build comp_30
column_to_drop_29 = 'Cat_현재 상업시설 접근용이성'

In [406]:
# Build comp_30 by removing the feature named by column_to_drop_29 from comp_29.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably one-hot dummy columns — verify); any other name is one column.
if column_to_drop_29.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex='^' + name), regex metacharacters
    # in the name ('(', ')', '.', '+', ...) cannot change which columns are selected.
    dropped_cols_30 = [col for col in comp_29.columns if str(col).startswith(column_to_drop_29)]
else:
    dropped_cols_30 = [column_to_drop_29]

comp_30 = comp_29.drop(columns=dropped_cols_30)
X_30 = comp_30.drop('target', axis=1)
y_30 = comp_30['target']

print(X_30.shape)

(6119, 135)


In [407]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [408]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBClassifier.

    Draws a hyperparameter set from `trial`, fits an XGBClassifier on the
    notebook-level `X_train`/`y_train`, and scores column 1 of
    `predict_proba` on the notebook-level `X_val`/`y_val`.

    Parameters
    ----------
    trial : object exposing `suggest_int`, `suggest_float`, `suggest_categorical`.

    Returns
    -------
    The ROC AUC computed by `roc_auc_score` on the validation split.
    """
    # The suggest_* calls are issued in the same order as before.
    hp = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
    }
    for frac_name in ('subsample', 'colsample_bytree'):
        hp[frac_name] = trial.suggest_float(frac_name, 0.1, 1.0, step=0.1)
    for int_name in ('gamma', 'reg_alpha', 'reg_lambda'):
        hp[int_name] = trial.suggest_int(int_name, 1, 10)
    # Single-choice categoricals: recorded in the trial params, nothing to search.
    hp['booster'] = trial.suggest_categorical('booster', ['gbtree'])
    hp['objective'] = trial.suggest_categorical('objective', ['binary:logistic'])

    model = XGBClassifier(random_state=0, **hp)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: maximize the value returned by `objective`
# for up to 200 trials, using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably has nothing to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it ends the
# study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [409]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7965884523243099


In [410]:
xgb_optuna_30 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_30.fit(X_train, y_train)

In [411]:
xgb_optuna_proba_30 = xgb_optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, xgb_optuna_proba_30)
print(decimal.Decimal(auc_30).quantize(decimal.Decimal('1.000')))

0.776


In [412]:
# bootstrap_auc resamples rows by integer position (X_train[idx], y_train[idx]),
# so the training split is converted to NumPy arrays here.
# np.asarray (instead of `.values`) makes re-running this cell a no-op rather than
# an AttributeError, because an ndarray has no `.values` attribute.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [413]:
auc_bootstrap = []

In [414]:
rs = RandomState(seed = 30)
bootstrap_auc(xgb_optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75686651, 0.77795812])

In [415]:
np.mean(auc_bootstrap)

0.7679090481886535

In [416]:
t_30 = auc_bootstrap
print(t_30)

[0.7701373462064252, 0.7685807202665754, 0.7622340652768285, 0.7697688824333561, 0.7610939636021872, 0.7713415285372522, 0.7705618805536568, 0.7600713431305536, 0.7657157809295968, 0.7696887816131237, 0.7661670155502392, 0.7693897385509227, 0.7661910457963089, 0.7612274649692414, 0.7651737653793576, 0.7703856587491456, 0.7590994531784006, 0.7703189080656185, 0.7614357271018455, 0.7660655545112782, 0.771266767771702, 0.7683777981886535, 0.7626746197881067, 0.7700732655502392, 0.7681668660287082, 0.7710932159945317, 0.7825716635338344, 0.7636545198222828, 0.7701854066985645, 0.7651604152426521, 0.7730850563909776, 0.7673845480177717, 0.7689011235475052, 0.7711519565960356, 0.7633394565960356, 0.7692589072112098, 0.7662711466165413, 0.7677823820915927, 0.7677209714627478, 0.7605412679425838, 0.7670988550922762, 0.7765187115516063, 0.7760648069036228, 0.7690720052973343, 0.7756349325017087, 0.7566243378332194, 0.7669706937799042, 0.771832813568011, 0.7669413234791524, 0.7626318993506493, 0

In [417]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [418]:
# 31
column_to_drop_30 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [419]:
# Build comp_31 by removing the feature named by column_to_drop_30 from comp_30.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably one-hot dummy columns — verify); any other name is one column.
if column_to_drop_30.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex='^' + name), regex metacharacters
    # in the name ('(', ')', '.', '+', ...) cannot change which columns are selected.
    dropped_cols_31 = [col for col in comp_30.columns if str(col).startswith(column_to_drop_30)]
else:
    dropped_cols_31 = [column_to_drop_30]

comp_31 = comp_30.drop(columns=dropped_cols_31)
X_31 = comp_31.drop('target', axis=1)
y_31 = comp_31['target']

print(X_31.shape)

(6119, 131)


In [420]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [421]:
def objective(trial):
    """Return the validation ROC AUC for one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # Keyword order matches the order in which values are requested from the trial.
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still go through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() reports no intermediate values (no trial.report call),
# so this pruner cannot stop a trial early — confirm it is still wanted before
# changing it, since the recorded results were produced with it in place.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is the
# number of non-improving trials tolerated before the search stops — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run up to 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [422]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7960500505012563


In [423]:
xgb_optuna_31= XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_31.fit(X_train, y_train)

In [424]:
# Score the tuned model on the held-out test split using the positive-class probability.
xgb_optuna_proba_31 = xgb_optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, xgb_optuna_proba_31)
# Show the AUC to three decimals; quantize() follows the decimal context's
# rounding mode, which the notebook sets to ROUND_HALF_UP near the top.
print(decimal.Decimal(auc_31).quantize(decimal.Decimal('1.000')))

0.777


In [425]:
# bootstrap_auc() selects resampled rows with an integer array (X_train[idx]),
# so the training split is converted to NumPy arrays first.
# np.asarray leaves an existing ndarray untouched, so re-running this cell is
# harmless (the previous `.values` raised AttributeError on a second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [426]:
auc_bootstrap = []

In [427]:
# bootstrap_auc() reads this generator through the global name `rs`.
rs = RandomState(seed = 31)
# Each of the 2000 resamples refits the estimator passed in and appends its test
# AUC to the global `auc_bootstrap`; the displayed value is the (2.5, 97.5)
# percentile pair of those AUCs.
# NOTE(review): after this call xgb_optuna_31 holds the fit from the last
# resample, not the fit on the full training split — confirm nothing later
# relies on the original fit.
bootstrap_auc(xgb_optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75718131, 0.77789851])

In [428]:
np.mean(auc_bootstrap)

0.767812811725692

In [429]:
t_31 = auc_bootstrap
print(t_31)

[0.7584719967532467, 0.7663645975734792, 0.7561036825017088, 0.7674486286739577, 0.7623435363978127, 0.772802033492823, 0.7695846505468216, 0.7652164858168147, 0.7619991028708133, 0.7645997095010252, 0.7752557886192755, 0.7743773496240602, 0.7693630382775118, 0.7721131664388243, 0.7659267130895421, 0.7705485304169515, 0.7711813268967874, 0.7697528622693096, 0.7751222872522214, 0.7720971462747779, 0.7645382988721804, 0.7654300880041011, 0.7637453007518797, 0.7723935193096378, 0.7714429895762133, 0.7740115558783323, 0.7604398069036227, 0.7653473171565277, 0.7679799641148326, 0.7715524606971975, 0.7763852101845523, 0.7745589114832536, 0.7723187585440875, 0.7711145762132603, 0.7753625897129187, 0.7685860603212576, 0.7704791097060834, 0.7694164388243334, 0.777511961722488, 0.763056433697881, 0.7637159304511278, 0.7714590097402598, 0.7714776999316474, 0.7669386534518112, 0.7653019266917295, 0.7580714926520848, 0.7656650504101161, 0.7702107719583049, 0.7669039430963773, 0.76048252734108, 0.76

In [430]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [431]:
# 32
column_to_drop_31 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [432]:
# Drop this round's feature. A 'Cat_' name stands for a group of columns that
# share it as a prefix (presumably the dummy columns of one categorical
# variable — TODO confirm); any other name is a single column.
if column_to_drop_31.startswith('Cat_'):
    # Literal prefix test instead of a regex built from the name, so characters
    # such as '(' or '.' in a column name cannot alter which columns match.
    comp_32 = comp_31.drop(
        columns=[col for col in comp_31.columns if str(col).startswith(column_to_drop_31)]
    )
else:
    comp_32 = comp_31.drop(columns=column_to_drop_31)

# Features and label for this round.
X_32 = comp_32.drop(columns='target')
y_32 = comp_32['target']

print(X_32.shape)

(6119, 127)


In [433]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [434]:
def objective(trial):
    """Return the validation ROC AUC for one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # Keyword order matches the order in which values are requested from the trial.
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still go through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() reports no intermediate values (no trial.report call),
# so this pruner cannot stop a trial early — confirm it is still wanted before
# changing it, since the recorded results were produced with it in place.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is the
# number of non-improving trials tolerated before the search stops — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run up to 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [435]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7966176679271113


In [436]:
xgb_optuna_32 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_32.fit(X_train, y_train)

In [437]:
# Score the tuned model on the held-out test split using the positive-class probability.
xgb_optuna_proba_32 = xgb_optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, xgb_optuna_proba_32)
# Show the AUC to three decimals; quantize() follows the decimal context's
# rounding mode, which the notebook sets to ROUND_HALF_UP near the top.
print(decimal.Decimal(auc_32).quantize(decimal.Decimal('1.000')))

0.777


In [438]:
# bootstrap_auc() selects resampled rows with an integer array (X_train[idx]),
# so the training split is converted to NumPy arrays first.
# np.asarray leaves an existing ndarray untouched, so re-running this cell is
# harmless (the previous `.values` raised AttributeError on a second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [439]:
auc_bootstrap = []

In [440]:
# bootstrap_auc() reads this generator through the global name `rs`.
rs = RandomState(seed = 32)
# Each of the 2000 resamples refits the estimator passed in and appends its test
# AUC to the global `auc_bootstrap`; the displayed value is the (2.5, 97.5)
# percentile pair of those AUCs.
# NOTE(review): after this call xgb_optuna_32 holds the fit from the last
# resample, not the fit on the full training split — confirm nothing later
# relies on the original fit.
bootstrap_auc(xgb_optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75723824, 0.77809543])

In [441]:
np.mean(auc_bootstrap)

0.7676911092094583

In [442]:
t_32 = auc_bootstrap
print(t_32)

[0.7671789559125086, 0.7698810235816815, 0.7697234919685577, 0.761053913192071, 0.7694431390977443, 0.7654461081681476, 0.763755980861244, 0.7715845010252905, 0.7704123590225564, 0.7705405203349283, 0.7698730134996583, 0.764458198051948, 0.7600660030758715, 0.7672670668147642, 0.7616333091250853, 0.7690746753246753, 0.7618789516404648, 0.7681401657552974, 0.7722893882433356, 0.7634302375256323, 0.766426008202324, 0.7782916097060834, 0.7728954844497608, 0.7816238038277512, 0.7646824803485986, 0.771701982228298, 0.7700252050580998, 0.7625197582023241, 0.7617401102187287, 0.7589953221120984, 0.7679666139781272, 0.770500469924812, 0.7709784048188653, 0.7653499871838687, 0.7551531527682843, 0.7551584928229665, 0.7643700871496925, 0.7627573906356802, 0.764856032125769, 0.769363038277512, 0.7679799641148325, 0.7716699419002051, 0.7663432373547505, 0.765256536226931, 0.7687168916609705, 0.772137196684894, 0.7731891874572795, 0.7743693395420368, 0.7716379015721121, 0.7754640507518796, 0.7759339

In [443]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [444]:
# 33
column_to_drop_32 = '자산 중 기타자산의 비중'

In [445]:
# Drop this round's feature. A 'Cat_' name stands for a group of columns that
# share it as a prefix (presumably the dummy columns of one categorical
# variable — TODO confirm); any other name is a single column.
if column_to_drop_32.startswith('Cat_'):
    # Literal prefix test instead of a regex built from the name, so characters
    # such as '(' or '.' in a column name cannot alter which columns match.
    comp_33 = comp_32.drop(
        columns=[col for col in comp_32.columns if str(col).startswith(column_to_drop_32)]
    )
else:
    comp_33 = comp_32.drop(columns=column_to_drop_32)

# Features and label for this round.
X_33 = comp_33.drop(columns='target')
y_33 = comp_33['target']

print(X_33.shape)

(6119, 126)


In [446]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [447]:
def objective(trial):
    """Return the validation ROC AUC for one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # Keyword order matches the order in which values are requested from the trial.
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still go through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() reports no intermediate values (no trial.report call),
# so this pruner cannot stop a trial early — confirm it is still wanted before
# changing it, since the recorded results were produced with it in place.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is the
# number of non-improving trials tolerated before the search stops — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run up to 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [448]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 152, 'learning_rate': 0.04, 'max_depth': 7, 'max_leaves': 768, 'subsample': 0.7000000000000001, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 3, 'reg_lambda': 10, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7947082196011653


In [449]:
xgb_optuna_33 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_33.fit(X_train, y_train)

In [450]:
# Score the tuned model on the held-out test split using the positive-class probability.
xgb_optuna_proba_33 = xgb_optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, xgb_optuna_proba_33)
# Show the AUC to three decimals; quantize() follows the decimal context's
# rounding mode, which the notebook sets to ROUND_HALF_UP near the top.
print(decimal.Decimal(auc_33).quantize(decimal.Decimal('1.000')))

0.780


In [451]:
# bootstrap_auc() selects resampled rows with an integer array (X_train[idx]),
# so the training split is converted to NumPy arrays first.
# np.asarray leaves an existing ndarray untouched, so re-running this cell is
# harmless (the previous `.values` raised AttributeError on a second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [452]:
auc_bootstrap = []

In [453]:
# bootstrap_auc() reads this generator through the global name `rs`.
rs = RandomState(seed = 33)
# Each of the 2000 resamples refits the estimator passed in and appends its test
# AUC to the global `auc_bootstrap`; the displayed value is the (2.5, 97.5)
# percentile pair of those AUCs.
# NOTE(review): after this call xgb_optuna_33 holds the fit from the last
# resample, not the fit on the full training split — confirm nothing later
# relies on the original fit.
bootstrap_auc(xgb_optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76148259, 0.77922932])

In [454]:
np.mean(auc_bootstrap)

0.7708624522065106

In [455]:
t_33 = auc_bootstrap
print(t_33)

[0.7688210227272727, 0.7703215780929598, 0.768527319719754, 0.7648480220437457, 0.7690720052973342, 0.7735229408749145, 0.7711492865686944, 0.7714216293574846, 0.7740809765892003, 0.7713228383458647, 0.7697128118591934, 0.7707915029049899, 0.7701346761790842, 0.7789617865686945, 0.7692962875939848, 0.7725590610047848, 0.7717393626110732, 0.7720250555365687, 0.7704310492139439, 0.7692829374572795, 0.7667223812371838, 0.7609497821257689, 0.7710504955570744, 0.7682095864661654, 0.7752023880724538, 0.7751035970608339, 0.770030545112782, 0.7715364405331511, 0.7774024906015038, 0.7703883287764866, 0.7678037423103212, 0.7719422846889953, 0.7726631920710867, 0.7697128118591935, 0.773939465140123, 0.7721612269309637, 0.77133618848257, 0.7722787081339713, 0.7721318566302118, 0.7714723598769653, 0.7755454865857827, 0.7671629357484621, 0.7781821385850991, 0.7664286782296651, 0.7691894865003417, 0.7749781057758031, 0.774873974709501, 0.7715951811346549, 0.7714403195488723, 0.7726258116883116, 0.768

In [456]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [457]:
# 34
column_to_drop_33 = 'Cat_현재 주차시설 이용편의성'

In [458]:
# Drop this round's feature. A 'Cat_' name stands for a group of columns that
# share it as a prefix (presumably the dummy columns of one categorical
# variable — TODO confirm); any other name is a single column.
if column_to_drop_33.startswith('Cat_'):
    # Literal prefix test instead of a regex built from the name, so characters
    # such as '(' or '.' in a column name cannot alter which columns match.
    comp_34 = comp_33.drop(
        columns=[col for col in comp_33.columns if str(col).startswith(column_to_drop_33)]
    )
else:
    comp_34 = comp_33.drop(columns=column_to_drop_33)

# Features and label for this round.
X_34 = comp_34.drop(columns='target')
y_34 = comp_34['target']

print(X_34.shape)

(6119, 122)


In [459]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [460]:
def objective(trial):
    """Return the validation ROC AUC for one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # Keyword order matches the order in which values are requested from the trial.
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still go through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() reports no intermediate values (no trial.report call),
# so this pruner cannot stop a trial early — confirm it is still wanted before
# changing it, since the recorded results were produced with it in place.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is the
# number of non-improving trials tolerated before the search stops — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run up to 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [461]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7968096561740916


In [462]:
xgb_optuna_34 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_34.fit(X_train, y_train)

In [463]:
# Score the tuned model on the held-out test split using the positive-class probability.
xgb_optuna_proba_34 = xgb_optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, xgb_optuna_proba_34)
# Show the AUC to three decimals; quantize() follows the decimal context's
# rounding mode, which the notebook sets to ROUND_HALF_UP near the top.
print(decimal.Decimal(auc_34).quantize(decimal.Decimal('1.000')))

0.778


In [464]:
# bootstrap_auc() selects resampled rows with an integer array (X_train[idx]),
# so the training split is converted to NumPy arrays first.
# np.asarray leaves an existing ndarray untouched, so re-running this cell is
# harmless (the previous `.values` raised AttributeError on a second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [465]:
auc_bootstrap = []

In [466]:
# bootstrap_auc() reads this generator through the global name `rs`.
rs = RandomState(seed = 34)
# Each of the 2000 resamples refits the estimator passed in and appends its test
# AUC to the global `auc_bootstrap`; the displayed value is the (2.5, 97.5)
# percentile pair of those AUCs.
# NOTE(review): after this call xgb_optuna_34 holds the fit from the last
# resample, not the fit on the full training split — confirm nothing later
# relies on the original fit.
bootstrap_auc(xgb_optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75575424, 0.77804386])

In [467]:
np.mean(auc_bootstrap)

0.7672483479205826

In [468]:
t_34 = auc_bootstrap
print(t_34)

[0.7681374957279563, 0.7574360261449078, 0.7668745727956254, 0.7730797163362954, 0.7681748761107314, 0.7673044471975394, 0.7758538747436774, 0.7733333689336979, 0.765795881749829, 0.7641644950444293, 0.7587256493506493, 0.7705752306903624, 0.7554628759398496, 0.7691040456254272, 0.7703055579289132, 0.7716913021189337, 0.7670374444634314, 0.7690212747778536, 0.7661002648667123, 0.7735309509569379, 0.770535180280246, 0.7644101375598087, 0.7667063610731373, 0.7656890806561858, 0.7570542122351331, 0.7669439935064934, 0.7587176392686261, 0.7570996026999317, 0.7648800623718387, 0.7653766874572796, 0.7688690832194122, 0.7727486329460014, 0.764322026657553, 0.7653232869104579, 0.768399158407382, 0.7701773966165414, 0.7635583988380041, 0.7730823863636364, 0.7706046009911142, 0.7745348812371838, 0.7550356715652767, 0.765958753417635, 0.761254165242652, 0.7677770420369104, 0.7697128118591934, 0.7648613721804511, 0.7668104921394395, 0.7682069164388244, 0.7798856160287081, 0.7712801179084074, 0.769

In [469]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [470]:
# 35
column_to_drop_34 = 'Cat_현재 교육환경'

In [471]:
# Drop this round's feature. A 'Cat_' name stands for a group of columns that
# share it as a prefix (presumably the dummy columns of one categorical
# variable — TODO confirm); any other name is a single column.
if column_to_drop_34.startswith('Cat_'):
    # Literal prefix test instead of a regex built from the name, so characters
    # such as '(' or '.' in a column name cannot alter which columns match.
    comp_35 = comp_34.drop(
        columns=[col for col in comp_34.columns if str(col).startswith(column_to_drop_34)]
    )
else:
    comp_35 = comp_34.drop(columns=column_to_drop_34)

# Features and label for this round.
X_35 = comp_35.drop(columns='target')
y_35 = comp_35['target']

print(X_35.shape)

(6119, 118)


In [472]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [473]:
def objective(trial):
    """Return the validation ROC AUC for one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # Keyword order matches the order in which values are requested from the trial.
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still go through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() reports no intermediate values (no trial.report call),
# so this pruner cannot stop a trial early — confirm it is still wanted before
# changing it, since the recorded results were produced with it in place.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is the
# number of non-improving trials tolerated before the search stops — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run up to 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [474]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 147, 'learning_rate': 0.03, 'max_depth': 8, 'max_leaves': 382, 'subsample': 0.6, 'colsample_bytree': 0.9, 'gamma': 2, 'reg_alpha': 3, 'reg_lambda': 5, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7972854531339993


In [475]:
xgb_optuna_35 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_35.fit(X_train, y_train)

In [476]:
# Score the tuned model on the held-out test split using the positive-class probability.
xgb_optuna_proba_35 = xgb_optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, xgb_optuna_proba_35)
# Show the AUC to three decimals; quantize() follows the decimal context's
# rounding mode, which the notebook sets to ROUND_HALF_UP near the top.
print(decimal.Decimal(auc_35).quantize(decimal.Decimal('1.000')))

0.778


In [477]:
# bootstrap_auc() selects resampled rows with an integer array (X_train[idx]),
# so the training split is converted to NumPy arrays first.
# np.asarray leaves an existing ndarray untouched, so re-running this cell is
# harmless (the previous `.values` raised AttributeError on a second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [478]:
auc_bootstrap = []

In [479]:
# bootstrap_auc() reads this generator through the global name `rs`.
rs = RandomState(seed = 35)
# Each of the 2000 resamples refits the estimator passed in and appends its test
# AUC to the global `auc_bootstrap`; the displayed value is the (2.5, 97.5)
# percentile pair of those AUCs.
# NOTE(review): after this call xgb_optuna_35 holds the fit from the last
# resample, not the fit on the full training split — confirm nothing later
# relies on the original fit.
bootstrap_auc(xgb_optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76040723, 0.77880205])

In [480]:
np.mean(auc_bootstrap)

0.7697188547985732

In [481]:
t_35 = auc_bootstrap
print(t_35)

[0.7709970950102529, 0.7754480305878333, 0.7720223855092276, 0.7702922077922076, 0.7700225350307588, 0.7734882305194807, 0.7741664174641149, 0.767929233595352, 0.7672697368421052, 0.769098705570745, 0.761147364149009, 0.7704937948564593, 0.7649414730006836, 0.7709089841079972, 0.7719289345522898, 0.7633421266233766, 0.7680146744702665, 0.769330997949419, 0.768866413192071, 0.7685006194463431, 0.7754453605604918, 0.7705271701982228, 0.7733840994531785, 0.7741103468899522, 0.7657291310663021, 0.7718274735133288, 0.7691681262816131, 0.7687142216336296, 0.7753038491114148, 0.7708328883287765, 0.7739768455228981, 0.775544151572112, 0.7768070745044429, 0.7687435919343814, 0.7773677802460697, 0.7644021274777854, 0.7786413832877649, 0.768895783492823, 0.7640229835953521, 0.7779925666438825, 0.7723774991455913, 0.7677183014354066, 0.7695766404647983, 0.767029434381408, 0.7699904947026658, 0.7730102956254272, 0.7639802631578948, 0.7721772470950102, 0.7734615302460697, 0.7571850435748462, 0.76232

In [482]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [483]:
# 36
column_to_drop_35 = '현재 주택 거주 기간(총 개월)'

In [484]:
# Drop this round's feature. A 'Cat_' name stands for a group of columns that
# share it as a prefix (presumably the dummy columns of one categorical
# variable — TODO confirm); any other name is a single column.
if column_to_drop_35.startswith('Cat_'):
    # Literal prefix test instead of a regex built from the name, so characters
    # such as '(' or '.' in a column name cannot alter which columns match.
    comp_36 = comp_35.drop(
        columns=[col for col in comp_35.columns if str(col).startswith(column_to_drop_35)]
    )
else:
    comp_36 = comp_35.drop(columns=column_to_drop_35)

# Features and label for this round.
X_36 = comp_36.drop(columns='target')
y_36 = comp_36['target']

print(X_36.shape)

(6119, 117)


In [485]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [486]:
def objective(trial):
    """Return the validation ROC AUC for one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # Keyword order matches the order in which values are requested from the trial.
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still go through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() reports no intermediate values (no trial.report call),
# so this pruner cannot stop a trial early — confirm it is still wanted before
# changing it, since the recorded results were produced with it in place.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is the
# number of non-improving trials tolerated before the search stops — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run up to 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [487]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7947729112930826


In [488]:
xgb_optuna_36 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_36.fit(X_train, y_train)

In [489]:
# Score the tuned model on the held-out test split using the positive-class probability.
xgb_optuna_proba_36 = xgb_optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, xgb_optuna_proba_36)
# Show the AUC to three decimals; quantize() follows the decimal context's
# rounding mode, which the notebook sets to ROUND_HALF_UP near the top.
print(decimal.Decimal(auc_36).quantize(decimal.Decimal('1.000')))

0.779


In [490]:
# bootstrap_auc() selects resampled rows with an integer array (X_train[idx]),
# so the training split is converted to NumPy arrays first.
# np.asarray leaves an existing ndarray untouched, so re-running this cell is
# harmless (the previous `.values` raised AttributeError on a second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [491]:
auc_bootstrap = []

In [492]:
# bootstrap_auc() reads this generator through the global name `rs`.
rs = RandomState(seed = 36)
# Each of the 2000 resamples refits the estimator passed in and appends its test
# AUC to the global `auc_bootstrap`; the displayed value is the (2.5, 97.5)
# percentile pair of those AUCs.
# NOTE(review): after this call xgb_optuna_36 holds the fit from the last
# resample, not the fit on the full training split — confirm nothing later
# relies on the original fit.
bootstrap_auc(xgb_optuna_36, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75786757, 0.7787303 ])

In [493]:
np.mean(auc_bootstrap)

0.7683793361244019

In [494]:
t_36 = auc_bootstrap
print(t_36)

[0.7702521573820916, 0.7715551307245387, 0.7758485346889952, 0.7715871710526315, 0.7716405715994532, 0.7730556860902256, 0.7692562371838687, 0.7670721548188653, 0.7765053614149009, 0.7660895847573479, 0.7732532681134654, 0.7551157723855091, 0.7694244489063566, 0.7705912508544087, 0.7633795070061518, 0.7677423316814764, 0.7678758330485305, 0.7615638884142173, 0.7605973385167464, 0.7598630809979494, 0.77133618848257, 0.7601007134313056, 0.765454118250171, 0.7733600692071086, 0.7614010167464115, 0.7711733168147642, 0.7669680237525631, 0.7720197154818865, 0.7776908535543404, 0.7674085782638413, 0.7677797120642513, 0.7654728084415584, 0.7678331126110732, 0.7643674171223515, 0.7664820787764867, 0.7618335611756664, 0.7713174982911823, 0.7691334159261791, 0.7735523111756664, 0.7731811773752564, 0.7664847488038278, 0.7649147727272727, 0.7740275760423786, 0.7715204203691046, 0.767227016404648, 0.7746416823308271, 0.7650669642857142, 0.7723187585440875, 0.7637079203691046, 0.7811965994531783, 0.7

In [495]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [496]:
# 37
column_to_drop_36 = '현재 무주택 기간(총 개월)'

In [497]:
# Drop this round's feature. A 'Cat_' name stands for a group of columns that
# share it as a prefix (presumably the dummy columns of one categorical
# variable — TODO confirm); any other name is a single column.
if column_to_drop_36.startswith('Cat_'):
    # Literal prefix test instead of a regex built from the name, so characters
    # such as '(' or '.' in a column name cannot alter which columns match.
    comp_37 = comp_36.drop(
        columns=[col for col in comp_36.columns if str(col).startswith(column_to_drop_36)]
    )
else:
    comp_37 = comp_36.drop(columns=column_to_drop_36)

# Features and label for this round.
X_37 = comp_37.drop(columns='target')
y_37 = comp_37['target']

print(X_37.shape)

(6119, 116)


In [498]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [499]:
def objective(trial):
    """Return the validation ROC AUC for one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # Keyword order matches the order in which values are requested from the trial.
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still go through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() reports no intermediate values (no trial.report call),
# so this pruner cannot stop a trial early — confirm it is still wanted before
# changing it, since the recorded results were produced with it in place.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is the
# number of non-improving trials tolerated before the search stops — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run up to 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [500]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7956076428016928


In [501]:
xgb_optuna_37 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_37.fit(X_train, y_train)

In [502]:
# Score the tuned model on the held-out test split using the positive-class probability.
xgb_optuna_proba_37 = xgb_optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, xgb_optuna_proba_37)
# Show the AUC to three decimals; quantize() follows the decimal context's
# rounding mode, which the notebook sets to ROUND_HALF_UP near the top.
print(decimal.Decimal(auc_37).quantize(decimal.Decimal('1.000')))

0.778


In [503]:
# bootstrap_auc() selects resampled rows with an integer array (X_train[idx]),
# so the training split is converted to NumPy arrays first.
# np.asarray leaves an existing ndarray untouched, so re-running this cell is
# harmless (the previous `.values` raised AttributeError on a second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [504]:
auc_bootstrap = []

In [505]:
# bootstrap_auc() reads this generator through the global name `rs`.
rs = RandomState(seed = 37)
# Each of the 2000 resamples refits the estimator passed in and appends its test
# AUC to the global `auc_bootstrap`; the displayed value is the (2.5, 97.5)
# percentile pair of those AUCs.
# NOTE(review): after this call xgb_optuna_37 holds the fit from the last
# resample, not the fit on the full training split — confirm nothing later
# relies on the original fit.
bootstrap_auc(xgb_optuna_37, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75946244, 0.77897894])

In [506]:
np.mean(auc_bootstrap)

0.7694781338110902

In [507]:
t_37 = auc_bootstrap
print(t_37)

[0.7743613294600136, 0.7664019779562543, 0.7692722573479152, 0.7730156356801093, 0.7674406185919344, 0.7706286312371838, 0.7786066729323309, 0.7646317498291182, 0.7727326127819549, 0.7684338687628163, 0.771269437799043, 0.7744788106630212, 0.7656143198906357, 0.7674699888926863, 0.7738006237183869, 0.7707888328776488, 0.7697648773923444, 0.7625945189678742, 0.7666556305536568, 0.7657157809295968, 0.7628722018113466, 0.7674192583732057, 0.7710718557758032, 0.7771862183868764, 0.7734802204374572, 0.7758031442241968, 0.7582583945659604, 0.7690880254613808, 0.7710958860218728, 0.765488828605605, 0.7618015208475735, 0.7663352272727273, 0.7783503503075871, 0.7702014268626111, 0.7685994104579631, 0.7835408834586465, 0.767363187799043, 0.7645756792549556, 0.7723507988721805, 0.7658038918318524, 0.7692802674299384, 0.7664179981203008, 0.770065255468216, 0.7720517558099795, 0.76305376367054, 0.7754213303144224, 0.7598710910799726, 0.7751249572795627, 0.7751596676349966, 0.7727486329460014, 0.767

In [508]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [509]:
# 38
column_to_drop_37 = 'Cat_현재 청소/쓰레기 처리상태'

In [510]:
# Drop this round's feature. A 'Cat_' name stands for a group of columns that
# share it as a prefix (presumably the dummy columns of one categorical
# variable — TODO confirm); any other name is a single column.
if column_to_drop_37.startswith('Cat_'):
    # Literal prefix test instead of a regex built from the name, so characters
    # such as '(' or '.' in a column name cannot alter which columns match.
    comp_38 = comp_37.drop(
        columns=[col for col in comp_37.columns if str(col).startswith(column_to_drop_37)]
    )
else:
    comp_38 = comp_37.drop(columns=column_to_drop_37)

# Features and label for this round.
X_38 = comp_38.drop(columns='target')
y_38 = comp_38['target']

print(X_38.shape)

(6119, 112)


In [511]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [512]:
def objective(trial):
    """Return the validation ROC AUC for one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # Keyword order matches the order in which values are requested from the trial.
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still go through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() reports no intermediate values (no trial.report call),
# so this pruner cannot stop a trial early — confirm it is still wanted before
# changing it, since the recorded results were produced with it in place.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably 10 is the
# number of non-improving trials tolerated before the search stops — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run up to 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [513]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 183, 'learning_rate': 0.060000000000000005, 'max_depth': 3, 'max_leaves': 992, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 2, 'reg_alpha': 2, 'reg_lambda': 9, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7988964849456173


In [514]:
xgb_optuna_38 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_38.fit(X_train, y_train)

In [515]:
# Score the tuned model on the held-out test split using the positive-class probability.
xgb_optuna_proba_38 = xgb_optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, xgb_optuna_proba_38)
# Show the AUC to three decimals; quantize() follows the decimal context's
# rounding mode, which the notebook sets to ROUND_HALF_UP near the top.
print(decimal.Decimal(auc_38).quantize(decimal.Decimal('1.000')))

0.782


In [516]:
# bootstrap_auc() selects resampled rows with an integer array (X_train[idx]),
# so the training split is converted to NumPy arrays first.
# np.asarray leaves an existing ndarray untouched, so re-running this cell is
# harmless (the previous `.values` raised AttributeError on a second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [517]:
auc_bootstrap = []

In [518]:
# bootstrap_auc() reads this generator through the global name `rs`.
rs = RandomState(seed = 38)
# Each of the 2000 resamples refits the estimator passed in and appends its test
# AUC to the global `auc_bootstrap`; the displayed value is the (2.5, 97.5)
# percentile pair of those AUCs.
# NOTE(review): after this call xgb_optuna_38 holds the fit from the last
# resample, not the fit on the full training split — confirm nothing later
# relies on the original fit.
bootstrap_auc(xgb_optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76514079, 0.78233764])

In [519]:
np.mean(auc_bootstrap)

0.7739755819324589

In [520]:
t_38 = auc_bootstrap
print(t_38)

[0.7772823393711551, 0.7790205271701982, 0.781351461038961, 0.7690266148325359, 0.7746203221120984, 0.7722413277511961, 0.7651417250512644, 0.7817466250854408, 0.7708929639439508, 0.7692081766917294, 0.7745856117566645, 0.7736430921052632, 0.7676008202323992, 0.7678678229665072, 0.7718995642515379, 0.7723000683526999, 0.7696540712576897, 0.7671789559125086, 0.7733306989063568, 0.7677717019822283, 0.7719903451811347, 0.7744761406356802, 0.7689251537935748, 0.7741931177375256, 0.7705832407723855, 0.7648453520164047, 0.7718942241968558, 0.769696791695147, 0.7791994190020506, 0.7715204203691046, 0.7701720565618593, 0.7772743292891319, 0.7718942241968558, 0.7737472231715652, 0.7723294386534518, 0.7680547248803828, 0.775674982911825, 0.7751650076896788, 0.7812206296992481, 0.7736911525974026, 0.767365857826384, 0.7731758373205742, 0.7773303998632946, 0.7714242993848257, 0.7752744788106629, 0.7799016361927547, 0.7749487354750513, 0.773036995898838, 0.7753866199589885, 0.7788149350649349, 0.78

In [521]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [522]:
# 39
column_to_drop_38 = 'Cat_가구주 종사상 지위'

In [523]:
# Drop this round's feature. A 'Cat_' name stands for a group of columns that
# share it as a prefix (presumably the dummy columns of one categorical
# variable — TODO confirm); any other name is a single column.
if column_to_drop_38.startswith('Cat_'):
    # Literal prefix test instead of a regex built from the name, so characters
    # such as '(' or '.' in a column name cannot alter which columns match.
    comp_39 = comp_38.drop(
        columns=[col for col in comp_38.columns if str(col).startswith(column_to_drop_38)]
    )
else:
    comp_39 = comp_38.drop(columns=column_to_drop_38)

# Features and label for this round.
X_39 = comp_39.drop(columns='target')
y_39 = comp_39['target']

print(X_39.shape)

(6119, 107)


In [524]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [525]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an XGBoost classifier.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and evaluated on ``X_val``/``y_val``
    (globals, not arguments), so the current split must exist before the
    study runs.

    Returns:
        ROC AUC computed from probability column 1 on the validation split.
    """
    # The suggest_* calls are kept in their original order on purpose.
    # 'booster' and 'objective' are single-choice categoricals: they never
    # vary, but this way they are part of the trial's recorded params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: up to 200 trials of `objective`
# with a seeded TPE sampler, maximising the returned validation AUC.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the
# Hyperband pruner presumably has nothing to prune - confirm it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this cell; the 10 is presumably
# its patience in trials - TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [526]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 165, 'learning_rate': 0.060000000000000005, 'max_depth': 5, 'max_leaves': 970, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'gamma': 3, 'reg_alpha': 2, 'reg_lambda': 10, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.802281321213032


In [527]:
xgb_optuna_39 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_39.fit(X_train, y_train)

In [528]:
xgb_optuna_proba_39 = xgb_optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, xgb_optuna_proba_39)
print(decimal.Decimal(auc_39).quantize(decimal.Decimal('1.000')))

0.782


In [529]:
# bootstrap_auc row-indexes its training inputs with an integer position array
# (X_train[idx]), so both are converted to plain NumPy arrays first.
# np.asarray keeps this cell re-runnable: unlike `.values`, it also accepts
# inputs that are already ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [530]:
auc_bootstrap = []

In [531]:
rs = RandomState(seed = 39)
bootstrap_auc(xgb_optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76301098, 0.78186998])

In [532]:
np.mean(auc_bootstrap)

0.7725691343504356

In [533]:
t_39 = auc_bootstrap
print(t_39)

[0.7711439465140123, 0.7820136278195489, 0.7780593173274094, 0.7722653579972659, 0.7672643967874231, 0.7744654605263158, 0.7756803229665072, 0.778649393369788, 0.7789404263499657, 0.771699312200957, 0.770698051948052, 0.7672056561859193, 0.7825129229323309, 0.7719636449077238, 0.7791833988380041, 0.7719262645249487, 0.767793062200957, 0.769528579972659, 0.7715951811346548, 0.7756883330485305, 0.7699397641831852, 0.7708902939166097, 0.7742598684210527, 0.7762330186261106, 0.7680226845522898, 0.7735149307928914, 0.777479921394395, 0.7832391703691044, 0.7705965909090909, 0.782483552631579, 0.7755681818181818, 0.7654461081681476, 0.7631311944634314, 0.7783156399521531, 0.767427268455229, 0.7757951341421736, 0.7755521616541353, 0.769464499316473, 0.7776000726247437, 0.773805963773069, 0.7697261619958988, 0.7701747265892003, 0.7715124102870814, 0.7738753844839372, 0.7678117523923446, 0.774804553998633, 0.7794557416267942, 0.7665354793233083, 0.7737258629528367, 0.7759232954545454, 0.77380863

In [534]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [535]:
# 40
column_to_drop_39 = 'Cat_현재 대중교통 접근용이성'

In [536]:
# Elimination step: remove one more feature from comp_39 to obtain comp_40.
if column_to_drop_39.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by a group of columns
    # (presumably the dummies of one categorical variable - confirm).
    # The prefix is compared literally, so characters such as '(' or '.' in a
    # column name are not interpreted as regular-expression syntax.
    cols_to_drop_39 = [col for col in comp_39.columns
                       if str(col).startswith(column_to_drop_39)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_39 = [column_to_drop_39]

comp_40 = comp_39.drop(columns=cols_to_drop_39)
X_40 = comp_40.drop(columns='target')
y_40 = comp_40['target']

print(X_40.shape)

(6119, 103)


In [537]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [538]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an XGBoost classifier.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and evaluated on ``X_val``/``y_val``
    (globals, not arguments), so the current split must exist before the
    study runs.

    Returns:
        ROC AUC computed from probability column 1 on the validation split.
    """
    # The suggest_* calls are kept in their original order on purpose.
    # 'booster' and 'objective' are single-choice categoricals: they never
    # vary, but this way they are part of the trial's recorded params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: up to 200 trials of `objective`
# with a seeded TPE sampler, maximising the returned validation AUC.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the
# Hyperband pruner presumably has nothing to prune - confirm it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this cell; the 10 is presumably
# its patience in trials - TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [539]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7972854531339993


In [540]:
xgb_optuna_40 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_40.fit(X_train, y_train)

In [541]:
xgb_optuna_proba_40 = xgb_optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, xgb_optuna_proba_40)
print(decimal.Decimal(auc_40).quantize(decimal.Decimal('1.000')))

0.776


In [542]:
# bootstrap_auc row-indexes its training inputs with an integer position array
# (X_train[idx]), so both are converted to plain NumPy arrays first.
# np.asarray keeps this cell re-runnable: unlike `.values`, it also accepts
# inputs that are already ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [543]:
auc_bootstrap = []

In [544]:
rs = RandomState(seed = 40)
bootstrap_auc(xgb_optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75921567, 0.77993408])

In [545]:
np.mean(auc_bootstrap)

0.7697318771894225

In [546]:
t_40 = auc_bootstrap
print(t_40)

[0.7729809253246753, 0.7683057074504444, 0.7704977998974709, 0.7630003631237184, 0.7791246582365003, 0.7753599196855776, 0.7709303443267259, 0.7735923615857827, 0.7693229878673958, 0.7746056369617225, 0.7527714883800412, 0.7731891874572796, 0.776777704203691, 0.7670935150375939, 0.7741210269993165, 0.7804623419343815, 0.7654514482228298, 0.7695926606288448, 0.7647412209501026, 0.7714750299043062, 0.769397748632946, 0.7725430408407382, 0.772305408407382, 0.7732612781954886, 0.7595613679084073, 0.7601407638414217, 0.7586028280929595, 0.7695259099453179, 0.7699718045112782, 0.7735443010936431, 0.7648720522898155, 0.7648720522898155, 0.7759980562200955, 0.7694377990430623, 0.7663699376281614, 0.7709677247095011, 0.7696567412850308, 0.773170497265892, 0.7755174512987014, 0.7706259612098427, 0.7739982057416267, 0.776310449419002, 0.7698463132262474, 0.7694271189336979, 0.7565655972317157, 0.7719903451811346, 0.7669520035885168, 0.7635477187286398, 0.7686741712235133, 0.7692081766917294, 0.77

In [547]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [548]:
# 41.
column_to_drop_40 = '자산 중 금융자산의 비중'

In [549]:
# Elimination step: remove one more feature from comp_40 to obtain comp_41.
if column_to_drop_40.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by a group of columns
    # (presumably the dummies of one categorical variable - confirm).
    # The prefix is compared literally, so characters such as '(' or '.' in a
    # column name are not interpreted as regular-expression syntax.
    cols_to_drop_40 = [col for col in comp_40.columns
                       if str(col).startswith(column_to_drop_40)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_40 = [column_to_drop_40]

comp_41 = comp_40.drop(columns=cols_to_drop_40)
X_41 = comp_41.drop(columns='target')
y_41 = comp_41['target']

print(X_41.shape)

(6119, 102)


In [550]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [551]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an XGBoost classifier.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and evaluated on ``X_val``/``y_val``
    (globals, not arguments), so the current split must exist before the
    study runs.

    Returns:
        ROC AUC computed from probability column 1 on the validation split.
    """
    # The suggest_* calls are kept in their original order on purpose.
    # 'booster' and 'objective' are single-choice categoricals: they never
    # vary, but this way they are part of the trial's recorded params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: up to 200 trials of `objective`
# with a seeded TPE sampler, maximising the returned validation AUC.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the
# Hyperband pruner presumably has nothing to prune - confirm it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this cell; the 10 is presumably
# its patience in trials - TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [552]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 119, 'learning_rate': 0.04, 'max_depth': 7, 'max_leaves': 830, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 2, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7959707510079383


In [553]:
xgb_optuna_41 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_41.fit(X_train, y_train)

In [554]:
xgb_optuna_proba_41 = xgb_optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, xgb_optuna_proba_41)
print(decimal.Decimal(auc_41).quantize(decimal.Decimal('1.000')))

0.777


In [555]:
# bootstrap_auc row-indexes its training inputs with an integer position array
# (X_train[idx]), so both are converted to plain NumPy arrays first.
# np.asarray keeps this cell re-runnable: unlike `.values`, it also accepts
# inputs that are already ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [556]:
auc_bootstrap = []

In [557]:
rs = RandomState(seed = 41)
bootstrap_auc(xgb_optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76263814, 0.78018486])

In [558]:
np.mean(auc_bootstrap)

0.7714232313738892

In [559]:
t_41 = auc_bootstrap
print(t_41)

[0.7773757903280929, 0.7715578007518796, 0.7796960440874916, 0.7685353298017771, 0.7736778024606973, 0.7716432416267942, 0.7790231971975393, 0.7632566857484621, 0.7723935193096377, 0.7686875213602187, 0.766992053998633, 0.7724228896103895, 0.767494019138756, 0.7740836466165413, 0.7655128588516746, 0.7720757860560493, 0.7766976033834586, 0.7646077195830485, 0.7745135210184553, 0.7701213260423788, 0.7659934637730691, 0.7807507048872181, 0.7746603725222148, 0.7678785030758715, 0.7655128588516746, 0.7670828349282297, 0.7708529135338346, 0.7765454118250171, 0.7685833902939165, 0.7620338132262475, 0.7738273239917977, 0.769467169343814, 0.7825903537252221, 0.7686314507860561, 0.7780593173274095, 0.7732212277853725, 0.7687355818523582, 0.7665087790498974, 0.7680387047163363, 0.7682576469583049, 0.7756509526657553, 0.7704390592959671, 0.770097295796309, 0.7664553785030759, 0.7755308014354068, 0.780176649008886, 0.7779097957963089, 0.7754587106971976, 0.7677343215994532, 0.7722520078605604, 0.77

In [560]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [561]:
# 42.
column_to_drop_41 = '소득 대비 생활비의 비율'

In [562]:
# Elimination step: remove one more feature from comp_41 to obtain comp_42.
if column_to_drop_41.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by a group of columns
    # (presumably the dummies of one categorical variable - confirm).
    # The prefix is compared literally, so characters such as '(' or '.' in a
    # column name are not interpreted as regular-expression syntax.
    cols_to_drop_41 = [col for col in comp_41.columns
                       if str(col).startswith(column_to_drop_41)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_41 = [column_to_drop_41]

comp_42 = comp_41.drop(columns=cols_to_drop_41)
X_42 = comp_42.drop(columns='target')
y_42 = comp_42['target']

print(X_42.shape)

(6119, 101)


In [563]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [564]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an XGBoost classifier.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and evaluated on ``X_val``/``y_val``
    (globals, not arguments), so the current split must exist before the
    study runs.

    Returns:
        ROC AUC computed from probability column 1 on the validation split.
    """
    # The suggest_* calls are kept in their original order on purpose.
    # 'booster' and 'objective' are single-choice categoricals: they never
    # vary, but this way they are part of the trial's recorded params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: up to 200 trials of `objective`
# with a seeded TPE sampler, maximising the returned validation AUC.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the
# Hyperband pruner presumably has nothing to prune - confirm it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this cell; the 10 is presumably
# its patience in trials - TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [565]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7980909690398083


In [566]:
xgb_optuna_42 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_42.fit(X_train, y_train)

In [567]:
xgb_optuna_proba_42 = xgb_optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, xgb_optuna_proba_42)
print(decimal.Decimal(auc_42).quantize(decimal.Decimal('1.000')))

0.777


In [568]:
# bootstrap_auc row-indexes its training inputs with an integer position array
# (X_train[idx]), so both are converted to plain NumPy arrays first.
# np.asarray keeps this cell re-runnable: unlike `.values`, it also accepts
# inputs that are already ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [569]:
auc_bootstrap = []

In [570]:
rs = RandomState(seed = 42)
bootstrap_auc(xgb_optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75902663, 0.77938946])

In [571]:
np.mean(auc_bootstrap)

0.769047523149137

In [572]:
t_42 = auc_bootstrap
print(t_42)

[0.7690773453520164, 0.7731491370471634, 0.7712827879357484, 0.7642259056732741, 0.7614998077580315, 0.7713655587833219, 0.7731064166097061, 0.769266917293233, 0.7662471163704716, 0.7734348299726589, 0.7672376965140122, 0.7668051520847573, 0.7677049512987013, 0.7666796607997266, 0.7729782552973342, 0.769897043745728, 0.7651070146958305, 0.7739020847573478, 0.7654514482228297, 0.7590380425495558, 0.7676061602870814, 0.7623809167805877, 0.7758031442241967, 0.7746043019480519, 0.767227016404648, 0.7661483253588516, 0.7710264653110048, 0.7623088260423787, 0.7714082792207793, 0.7631712448735475, 0.7705218301435407, 0.768329737696514, 0.7673658578263842, 0.7688984535201641, 0.7689438439849625, 0.7616413192071086, 0.7805477828092959, 0.7693309979494191, 0.7637025803144224, 0.7694030886876282, 0.7694725093984962, 0.767291097060834, 0.77133618848257, 0.7734241498632948, 0.7689892344497606, 0.7696407211209841, 0.7657691814764183, 0.769029284859877, 0.7599618720095693, 0.7645943694463431, 0.76753

In [573]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [574]:
# 43.
column_to_drop_42 = 'Cat_남편/아내의 부모님과 동거 의향'

In [575]:
# Elimination step: remove one more feature from comp_42 to obtain comp_43.
if column_to_drop_42.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by a group of columns
    # (presumably the dummies of one categorical variable - confirm).
    # The prefix is compared literally, so characters such as '(' or '.' in a
    # column name are not interpreted as regular-expression syntax.
    cols_to_drop_42 = [col for col in comp_42.columns
                       if str(col).startswith(column_to_drop_42)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_42 = [column_to_drop_42]

comp_43 = comp_42.drop(columns=cols_to_drop_42)
X_43 = comp_43.drop(columns='target')
y_43 = comp_43['target']

print(X_43.shape)

(6119, 96)


In [576]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [577]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an XGBoost classifier.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and evaluated on ``X_val``/``y_val``
    (globals, not arguments), so the current split must exist before the
    study runs.

    Returns:
        ROC AUC computed from probability column 1 on the validation split.
    """
    # The suggest_* calls are kept in their original order on purpose.
    # 'booster' and 'objective' are single-choice categoricals: they never
    # vary, but this way they are part of the trial's recorded params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: up to 200 trials of `objective`
# with a seeded TPE sampler, maximising the returned validation AUC.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the
# Hyperband pruner presumably has nothing to prune - confirm it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this cell; the 10 is presumably
# its patience in trials - TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])


In [578]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7973981418876619


In [579]:
xgb_optuna_43 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_43.fit(X_train, y_train)

In [580]:
xgb_optuna_proba_43 = xgb_optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, xgb_optuna_proba_43)
print(decimal.Decimal(auc_43).quantize(decimal.Decimal('1.000')))

0.781


In [581]:
# bootstrap_auc row-indexes its training inputs with an integer position array
# (X_train[idx]), so both are converted to plain NumPy arrays first.
# np.asarray keeps this cell re-runnable: unlike `.values`, it also accepts
# inputs that are already ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [582]:
auc_bootstrap = []

In [583]:
rs = RandomState(seed = 43)
bootstrap_auc(xgb_optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75973465, 0.77917566])

In [584]:
np.mean(auc_bootstrap)

0.7695654356950615

In [585]:
t_43 = auc_bootstrap
print(t_43)

[0.764359407040328, 0.769130745898838, 0.7664794087491456, 0.7686875213602187, 0.7768818352699931, 0.7653873675666438, 0.7713068181818181, 0.7693657083048531, 0.7779258159603555, 0.77000117481203, 0.7700812756322625, 0.7633768369788106, 0.7694831895078607, 0.7748419343814081, 0.7651684253246753, 0.7721051563568012, 0.7696273709842789, 0.7632246454203693, 0.7664446983937115, 0.7676515507518797, 0.7704524094326726, 0.7779391660970609, 0.7716352315447711, 0.768861073137389, 0.7715604707792207, 0.7786226930963772, 0.7690212747778538, 0.7696353810663021, 0.7719609748803827, 0.7695526102187287, 0.7685887303485988, 0.774270548530417, 0.7725110005126452, 0.7748526144907724, 0.7691734663362952, 0.7697875726247437, 0.7690239448051948, 0.7742972488038278, 0.773704502734108, 0.7619056519138756, 0.7691440960355435, 0.7651203648325358, 0.762258095522898, 0.7687222317156528, 0.7707114020847573, 0.7729462149692413, 0.7693123077580314, 0.773971505468216, 0.7690159347231715, 0.7697154818865346, 0.766631

In [586]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [587]:
## 44
column_to_drop_43 = '가구주 나이'

In [588]:
# Elimination step: remove one more feature from comp_43 to obtain comp_44.
if column_to_drop_43.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by a group of columns
    # (presumably the dummies of one categorical variable - confirm).
    # The prefix is compared literally, so characters such as '(' or '.' in a
    # column name are not interpreted as regular-expression syntax.
    cols_to_drop_43 = [col for col in comp_43.columns
                       if str(col).startswith(column_to_drop_43)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_43 = [column_to_drop_43]

comp_44 = comp_43.drop(columns=cols_to_drop_43)
X_44 = comp_44.drop(columns='target')
y_44 = comp_44['target']

print(X_44.shape)

(6119, 95)


In [589]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [590]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an XGBoost classifier.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and evaluated on ``X_val``/``y_val``
    (globals, not arguments), so the current split must exist before the
    study runs.

    Returns:
        ROC AUC computed from probability column 1 on the validation split.
    """
    # The suggest_* calls are kept in their original order on purpose.
    # 'booster' and 'objective' are single-choice categoricals: they never
    # vary, but this way they are part of the trial's recorded params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: up to 200 trials of `objective`
# with a seeded TPE sampler, maximising the returned validation AUC.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the
# Hyperband pruner presumably has nothing to prune - confirm it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this cell; the 10 is presumably
# its patience in trials - TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [591]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7968493059207507


In [592]:
xgb_optuna_44 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_44.fit(X_train, y_train)

In [593]:
xgb_optuna_proba_44 = xgb_optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, xgb_optuna_proba_44)
print(decimal.Decimal(auc_44).quantize(decimal.Decimal('1.000')))

0.779


In [594]:
# bootstrap_auc row-indexes its training inputs with an integer position array
# (X_train[idx]), so both are converted to plain NumPy arrays first.
# np.asarray keeps this cell re-runnable: unlike `.values`, it also accepts
# inputs that are already ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [595]:
auc_bootstrap = []

In [596]:
rs = RandomState(seed = 44)
bootstrap_auc(xgb_optuna_44, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75986121, 0.77961835])

In [597]:
np.mean(auc_bootstrap)

0.7703874630201214

In [598]:
t_44 = auc_bootstrap
print(t_44)

[0.7757711038961039, 0.7657745215311006, 0.7759927161654135, 0.7637639909432673, 0.7607014695830485, 0.770166716507177, 0.7745135210184553, 0.7631765849282297, 0.7683137175324675, 0.7787214841079972, 0.7737498931989063, 0.7608376409774437, 0.7805824931647299, 0.7717714029391661, 0.7713228383458646, 0.7690933655160628, 0.7713842489747095, 0.7669039430963773, 0.7639348726930963, 0.7697875726247437, 0.7751062670881749, 0.7661630105092276, 0.7732132177033493, 0.7659827836637048, 0.7752210782638415, 0.7664126580656185, 0.7736297419685577, 0.7752958390293917, 0.7773998205741628, 0.7785826426862611, 0.7745802717019822, 0.7742251580656185, 0.7691093856801093, 0.7734375, 0.772935534859877, 0.7682576469583049, 0.7737151828434723, 0.771368228810663, 0.7687222317156528, 0.77395682031784, 0.7809135765550239, 0.7698863636363636, 0.7772636491797676, 0.7575241370471634, 0.7680573949077238, 0.7749166951469584, 0.7683777981886535, 0.7647011705399864, 0.7737952836637046, 0.7678918532125769, 0.76986233339

In [599]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [600]:
# 45
column_to_drop_44 = '소득 중 정부 보조금의 비중(월평균)'

In [601]:
# Elimination step: remove one more feature from comp_44 to obtain comp_45.
if column_to_drop_44.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by a group of columns
    # (presumably the dummies of one categorical variable - confirm).
    # The prefix is compared literally, so characters such as '(' or '.' in a
    # column name are not interpreted as regular-expression syntax.
    cols_to_drop_44 = [col for col in comp_44.columns
                       if str(col).startswith(column_to_drop_44)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_44 = [column_to_drop_44]

comp_45 = comp_44.drop(columns=cols_to_drop_44)
X_45 = comp_45.drop(columns='target')
y_45 = comp_45['target']

print(X_45.shape)

(6119, 94)


In [602]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [603]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an XGBoost classifier.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and evaluated on ``X_val``/``y_val``
    (globals, not arguments), so the current split must exist before the
    study runs.

    Returns:
        ROC AUC computed from probability column 1 on the validation split.
    """
    # The suggest_* calls are kept in their original order on purpose.
    # 'booster' and 'objective' are single-choice categoricals: they never
    # vary, but this way they are part of the trial's recorded params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: up to 200 trials of `objective`
# with a seeded TPE sampler, maximising the returned validation AUC.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the
# Hyperband pruner presumably has nothing to prune - confirm it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this cell; the 10 is presumably
# its patience in trials - TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [604]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7956431188908087


In [605]:
xgb_optuna_45 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_45.fit(X_train, y_train)

In [606]:
xgb_optuna_proba_45 = xgb_optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, xgb_optuna_proba_45)
print(decimal.Decimal(auc_45).quantize(decimal.Decimal('1.000')))

0.778


In [607]:
# bootstrap_auc row-indexes its training inputs with an integer position array
# (X_train[idx]), so both are converted to plain NumPy arrays first.
# np.asarray keeps this cell re-runnable: unlike `.values`, it also accepts
# inputs that are already ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [608]:
auc_bootstrap = []

In [609]:
rs = RandomState(seed = 45)
bootstrap_auc(xgb_optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76035156, 0.77940711])

In [610]:
np.mean(auc_bootstrap)

0.7700478829353212

In [611]:
t_45 = auc_bootstrap
print(t_45)

[0.7831991199589884, 0.7711466165413534, 0.7777976546479837, 0.7674886790840738, 0.7698089328434723, 0.7578311901913874, 0.7761182074504442, 0.7689812243677375, 0.773121101760082, 0.7709036440533152, 0.7632593557758031, 0.7713415285372522, 0.7709356843814081, 0.7684125085440875, 0.7697381771189337, 0.7757711038961038, 0.7686581510594668, 0.7703162380382775, 0.7674326085099111, 0.7648960825358853, 0.770596590909091, 0.771565810833903, 0.7609631322624743, 0.7643433868762817, 0.761582578605605, 0.7653513221975392, 0.7726765422077922, 0.7782969497607656, 0.7770954374572796, 0.765825252050581, 0.7704443993506493, 0.7742037978468899, 0.7688584031100478, 0.7723988593643198, 0.7653526572112098, 0.7619296821599453, 0.7709116541353385, 0.7719262645249488, 0.7660842447026657, 0.7745749316473001, 0.7690212747778538, 0.7801392686261108, 0.771037145420369, 0.7639669130211894, 0.7661323051948051, 0.7687195616883118, 0.7754026401230348, 0.7771648581681476, 0.7664687286397812, 0.7711412764866712, 0.766

In [612]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [613]:
# 46.
column_to_drop_45 = 'Cat_이사 계획 첫 번째 이유'

In [614]:
# Elimination step: remove one more feature from comp_45 to obtain comp_46.
if column_to_drop_45.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by a group of columns
    # (presumably the dummies of one categorical variable - confirm).
    # The prefix is compared literally, so characters such as '(' or '.' in a
    # column name are not interpreted as regular-expression syntax.
    cols_to_drop_45 = [col for col in comp_45.columns
                       if str(col).startswith(column_to_drop_45)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_45 = [column_to_drop_45]

comp_46 = comp_45.drop(columns=cols_to_drop_45)
X_46 = comp_46.drop(columns='target')
y_46 = comp_46['target']

print(X_46.shape)

(6119, 82)


In [615]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [616]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an XGBoost classifier.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and evaluated on ``X_val``/``y_val``
    (globals, not arguments), so the current split must exist before the
    study runs.

    Returns:
        ROC AUC computed from probability column 1 on the validation split.
    """
    # The suggest_* calls are kept in their original order on purpose.
    # 'booster' and 'objective' are single-choice categoricals: they never
    # vary, but this way they are part of the trial's recorded params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: up to 200 trials of `objective`
# with a seeded TPE sampler, maximising the returned validation AUC.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the
# Hyperband pruner presumably has nothing to prune - confirm it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this cell; the 10 is presumably
# its patience in trials - TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [617]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7935855057220845


In [618]:
xgb_optuna_46 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_46.fit(X_train, y_train)

In [619]:
xgb_optuna_proba_46 = xgb_optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, xgb_optuna_proba_46)
print(decimal.Decimal(auc_46).quantize(decimal.Decimal('1.000')))

0.779


In [620]:
# bootstrap_auc row-indexes its training inputs with an integer position array
# (X_train[idx]), so both are converted to plain NumPy arrays first.
# np.asarray keeps this cell re-runnable: unlike `.values`, it also accepts
# inputs that are already ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [621]:
auc_bootstrap = []

In [622]:
rs = RandomState(seed = 46)
bootstrap_auc(xgb_optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75905643, 0.77962165])

In [623]:
np.mean(auc_bootstrap)

0.7693514223235646

In [624]:
t_46 = auc_bootstrap
print(t_46)

[0.7648680472488039, 0.7734535201640464, 0.7689838943950786, 0.7700412252221464, 0.7659640934723172, 0.7785612824675325, 0.7698262880211895, 0.7605252477785374, 0.7665167891319206, 0.7648320018796994, 0.7665461594326726, 0.7641004143882433, 0.7660201640464799, 0.7726658620984279, 0.7669800388755981, 0.7681882262474368, 0.7678144224196854, 0.765758501367054, 0.7672056561859194, 0.7635103383458647, 0.7695178998632946, 0.7636117993848257, 0.7680120044429255, 0.7592703349282297, 0.77390742481203, 0.7751463174982911, 0.7703536184210527, 0.7724549299384825, 0.7762490387901573, 0.7787375042720437, 0.7705618805536569, 0.7710878759398496, 0.7624877178742311, 0.7743666695146958, 0.7648159817156528, 0.7710932159945318, 0.7774906015037594, 0.7654007177033493, 0.7754320104237867, 0.7714242993848258, 0.7738620343472318, 0.7659387282125769, 0.7625891789131921, 0.7778377050580998, 0.7637079203691047, 0.7666716507177034, 0.770831553315106, 0.774374679596719, 0.775526796394395, 0.7776855134996583, 0.770

In [625]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [626]:
# 47.
column_to_drop_46 = 'Cat_가구주 최종 학력'

In [627]:
# Elimination step: remove one more feature from comp_46 to obtain comp_47.
if column_to_drop_46.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by a group of columns
    # (presumably the dummies of one categorical variable - confirm).
    # The prefix is compared literally, so characters such as '(' or '.' in a
    # column name are not interpreted as regular-expression syntax.
    cols_to_drop_46 = [col for col in comp_46.columns
                       if str(col).startswith(column_to_drop_46)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_46 = [column_to_drop_46]

comp_47 = comp_46.drop(columns=cols_to_drop_46)
X_47 = comp_47.drop(columns='target')
y_47 = comp_47['target']

print(X_47.shape)

(6119, 79)


In [628]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [629]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one XGBoost configuration.

    Samples hyper-parameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores its positive-class
    probabilities against ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        ROC AUC of the fitted model on the validation split.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # in the listed order, which matters for reproducibility with a seeded sampler.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded
        # in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: maximize the validation AUC
# returned by objective() with a seeded TPE sampler, for at most 200 trials.
# objective() never calls trial.report(), so the Hyperband pruner has no
# intermediate values to prune on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# stops the study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [630]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7926610405762986


In [631]:
# Refit the best configuration found by the study on the training split.
xgb_optuna_47 = XGBClassifier(random_state=0, **study.best_params)
xgb_optuna_47.fit(X_train, y_train)

In [632]:
# Score the refit model on the test split (positive-class probability).
xgb_optuna_proba_47 = xgb_optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_true=y_test, y_score=xgb_optuna_proba_47)
# Show the AUC to three decimals; the rounding mode is taken from the decimal
# context configured at the top of the notebook.
print(
    decimal.Decimal(auc_47).quantize(decimal.Decimal('1.000'))
)

0.778


In [633]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so
# hand it plain NumPy arrays. np.asarray leaves an ndarray unchanged, which
# keeps this cell safe to re-run (a second .values call would raise
# AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [634]:
auc_bootstrap = []

In [635]:
# Seeded generator that bootstrap_auc reads as the global `rs`.
rs = RandomState(seed=47)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone; otherwise xgb_optuna_47 would be left trained on the last
# bootstrap sample instead of the real training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_47), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75883499, 0.77842244])

In [636]:
np.mean(auc_bootstrap)

0.7688400446695575

In [637]:
t_47 = auc_bootstrap
print(t_47)

[0.7744814806903623, 0.7646477699931647, 0.7662524564251538, 0.7728180536568694, 0.7673498376623377, 0.7714216293574847, 0.7720597658920026, 0.772203947368421, 0.7712667677717019, 0.7738486842105263, 0.7705405203349283, 0.7622741156869447, 0.7626105391319207, 0.7654167378673958, 0.7459015080314422, 0.7641965353725223, 0.7735496411483256, 0.7732853084415583, 0.7594305365686945, 0.7669092831510594, 0.7663085269993164, 0.7680814251537936, 0.7583999060150376, 0.7596067583732057, 0.7736831425153794, 0.7780352870813397, 0.7729408749145592, 0.7666209201982228, 0.7617534603554341, 0.7672830869788106, 0.7675901401230347, 0.7681374957279563, 0.7670801649008886, 0.7751383074162681, 0.7650936645591251, 0.768594070403281, 0.771902234278879, 0.7722079524094327, 0.7697261619958988, 0.7694591592617909, 0.7628935620300752, 0.7722653579972658, 0.7593450956937798, 0.7683430878332194, 0.7637159304511278, 0.7666476204716336, 0.7765881322624744, 0.7548274094326726, 0.7665514994873547, 0.7689491840396445, 0.

In [638]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [639]:
# 48
column_to_drop_47 = '소득 대비 주거관리비의 비율'

In [640]:
# Remove the selected feature from the previous frame (comp_47 -> comp_48).
# A 'Cat_' name stands for every column sharing that prefix; the prefix is
# matched literally (not as a regex), so metacharacters such as '(' or '+' in
# a column name cannot break or widen the match.
if column_to_drop_47.startswith('Cat_'):
    cols_to_drop_47 = [c for c in comp_47.columns if str(c).startswith(column_to_drop_47)]
else:
    cols_to_drop_47 = [column_to_drop_47]
comp_48 = comp_47.drop(columns=cols_to_drop_47)

# Feature matrix and label for this elimination step.
X_48 = comp_48.drop(columns='target')
y_48 = comp_48['target']

print(X_48.shape)

(6119, 78)


In [641]:
# Stage 1: stratified 80/20 split into (train + validation) and test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_48, y_48,
    test_size=0.2, stratify=y_48, shuffle=True, random_state=0,
)
# Stage 2: take a stratified 20% validation set out of the remaining rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, stratify=y_train, shuffle=True, random_state=0,
)

In [642]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one XGBoost configuration.

    Samples hyper-parameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores its positive-class
    probabilities against ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        ROC AUC of the fitted model on the validation split.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # in the listed order, which matters for reproducibility with a seeded sampler.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded
        # in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: maximize the validation AUC
# returned by objective() with a seeded TPE sampler, for at most 200 trials.
# objective() never calls trial.report(), so the Hyperband pruner has no
# intermediate values to prune on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# stops the study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [643]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7928634629671367


In [644]:
# Refit the best configuration found by the study on the training split.
xgb_optuna_48 = XGBClassifier(random_state=0, **study.best_params)
xgb_optuna_48.fit(X_train, y_train)

In [645]:
# Score the refit model on the test split (positive-class probability).
xgb_optuna_proba_48 = xgb_optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_true=y_test, y_score=xgb_optuna_proba_48)
# Show the AUC to three decimals; the rounding mode is taken from the decimal
# context configured at the top of the notebook.
print(
    decimal.Decimal(auc_48).quantize(decimal.Decimal('1.000'))
)

0.781


In [646]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so
# hand it plain NumPy arrays. np.asarray leaves an ndarray unchanged, which
# keeps this cell safe to re-run (a second .values call would raise
# AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [647]:
auc_bootstrap = []

In [648]:
# Seeded generator that bootstrap_auc reads as the global `rs`.
rs = RandomState(seed=48)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone; otherwise xgb_optuna_48 would be left trained on the last
# bootstrap sample instead of the real training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_48), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76086805, 0.78053997])

In [649]:
np.mean(auc_bootstrap)

0.7706986693918746

In [650]:
t_48 = auc_bootstrap
print(t_48)

[0.7717046522556391, 0.7626933099794941, 0.7823340311004785, 0.7663165370813396, 0.7736964926520847, 0.7680867652084756, 0.7743800196514012, 0.7753025140977444, 0.7719943502221462, 0.7768604750512647, 0.7703749786397813, 0.7687596120984278, 0.7658345971462748, 0.7737579032809296, 0.7762890892002734, 0.7685139695830485, 0.7753038491114148, 0.7618122009569378, 0.7725163405673274, 0.7679319036226931, 0.763419557416268, 0.7703776486671223, 0.7751543275803146, 0.7677369916267943, 0.7716459116541353, 0.7716939721462749, 0.7662444463431305, 0.7725924363465482, 0.7672964371155161, 0.7674860090567328, 0.7740249060150376, 0.7667183761961722, 0.7681855562200957, 0.7730236457621324, 0.7682069164388243, 0.7692028366370472, 0.7699944997436774, 0.7689438439849624, 0.7727099175495556, 0.770846238465482, 0.7738460141831852, 0.7767269736842104, 0.7710958860218728, 0.7759313055365686, 0.7722306476418319, 0.7674860090567328, 0.7753225393028025, 0.7678851781442242, 0.7655342190704032, 0.7819308569719754, 0

In [651]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [652]:
# 49
column_to_drop_48 = '중기부채부담지표'

In [653]:
# Remove the selected feature from the previous frame (comp_48 -> comp_49).
# A 'Cat_' name stands for every column sharing that prefix; the prefix is
# matched literally (not as a regex), so metacharacters such as '(' or '+' in
# a column name cannot break or widen the match.
if column_to_drop_48.startswith('Cat_'):
    cols_to_drop_48 = [c for c in comp_48.columns if str(c).startswith(column_to_drop_48)]
else:
    cols_to_drop_48 = [column_to_drop_48]
comp_49 = comp_48.drop(columns=cols_to_drop_48)

# Feature matrix and label for this elimination step.
X_49 = comp_49.drop(columns='target')
y_49 = comp_49['target']

print(X_49.shape)

(6119, 77)


In [654]:
# Stage 1: stratified 80/20 split into (train + validation) and test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_49, y_49,
    test_size=0.2, stratify=y_49, shuffle=True, random_state=0,
)
# Stage 2: take a stratified 20% validation set out of the remaining rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, stratify=y_train, shuffle=True, random_state=0,
)

In [655]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one XGBoost configuration.

    Samples hyper-parameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores its positive-class
    probabilities against ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        ROC AUC of the fitted model on the validation split.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # in the listed order, which matters for reproducibility with a seeded sampler.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded
        # in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: maximize the validation AUC
# returned by objective() with a seeded TPE sampler, for at most 200 trials.
# objective() never calls trial.report(), so the Hyperband pruner has no
# intermediate values to prune on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# stops the study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [656]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7940007846476181


In [657]:
# Refit the best configuration found by the study on the training split.
xgb_optuna_49 = XGBClassifier(random_state=0, **study.best_params)
xgb_optuna_49.fit(X_train, y_train)

In [658]:
# Score the refit model on the test split (positive-class probability).
xgb_optuna_proba_49 = xgb_optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_true=y_test, y_score=xgb_optuna_proba_49)
# Show the AUC to three decimals; the rounding mode is taken from the decimal
# context configured at the top of the notebook.
print(
    decimal.Decimal(auc_49).quantize(decimal.Decimal('1.000'))
)

0.780


In [659]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so
# hand it plain NumPy arrays. np.asarray leaves an ndarray unchanged, which
# keeps this cell safe to re-run (a second .values call would raise
# AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [660]:
auc_bootstrap = []

In [661]:
# Seeded generator that bootstrap_auc reads as the global `rs`.
rs = RandomState(seed=49)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone; otherwise xgb_optuna_49 would be left trained on the last
# bootstrap sample instead of the real training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_49), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7619141 , 0.78092713])

In [662]:
np.mean(auc_bootstrap)

0.7713953396007776

In [663]:
t_49 = auc_bootstrap
print(t_49)

[0.7787294941900205, 0.7747818587662338, 0.7704524094326726, 0.770366968557758, 0.7692228618421052, 0.7752597936602871, 0.7702014268626111, 0.7755227913533835, 0.7766348577409433, 0.7734321599453178, 0.7713121582365003, 0.7670668147641831, 0.7679185534859878, 0.766939988465482, 0.7684619040498974, 0.7740302460697198, 0.7766655630553656, 0.7760354366028708, 0.7798562457279562, 0.7645943694463431, 0.775106267088175, 0.7756990131578947, 0.7769165456254272, 0.7736030416951469, 0.7757924641148326, 0.762818801264525, 0.7638067113807245, 0.7720170454545454, 0.7691414260082023, 0.7708729387388928, 0.7718715289644567, 0.7713335184552289, 0.7713975991114149, 0.7683137175324676, 0.7786360432330828, 0.7725991114149009, 0.7758912551264525, 0.7694858595352015, 0.769931754101162, 0.7773223897812713, 0.7738647043745728, 0.7621913448393711, 0.7717794130211894, 0.7684125085440875, 0.7737165178571428, 0.772203947368421, 0.7748753097231715, 0.7677409966678058, 0.7690025845864661, 0.771417624316473, 0.7739

In [664]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [665]:
# 50
column_to_drop_49 = 'Cat_현재 주택의 유형'

In [666]:
# Remove the selected feature from the previous frame (comp_49 -> comp_50).
# A 'Cat_' name stands for every column sharing that prefix; the prefix is
# matched literally (not as a regex), so metacharacters such as '(' or '+' in
# a column name cannot break or widen the match.
if column_to_drop_49.startswith('Cat_'):
    cols_to_drop_49 = [c for c in comp_49.columns if str(c).startswith(column_to_drop_49)]
else:
    cols_to_drop_49 = [column_to_drop_49]
comp_50 = comp_49.drop(columns=cols_to_drop_49)

# Feature matrix and label for this elimination step.
X_50 = comp_50.drop(columns='target')
y_50 = comp_50['target']

print(X_50.shape)

(6119, 66)


In [667]:
# Stage 1: stratified 80/20 split into (train + validation) and test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_50, y_50,
    test_size=0.2, stratify=y_50, shuffle=True, random_state=0,
)
# Stage 2: take a stratified 20% validation set out of the remaining rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, stratify=y_train, shuffle=True, random_state=0,
)

In [668]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one XGBoost configuration.

    Samples hyper-parameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores its positive-class
    probabilities against ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        ROC AUC of the fitted model on the validation split.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # in the listed order, which matters for reproducibility with a seeded sampler.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded
        # in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: maximize the validation AUC
# returned by objective() with a seeded TPE sampler, for at most 200 trials.
# objective() never calls trial.report(), so the Hyperband pruner has no
# intermediate values to prune on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# stops the study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [669]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7922937587125101


In [670]:
# Refit the best configuration found by the study on the training split.
xgb_optuna_50 = XGBClassifier(random_state=0, **study.best_params)
xgb_optuna_50.fit(X_train, y_train)

In [671]:
# Score the refit model on the test split (positive-class probability).
xgb_optuna_proba_50 = xgb_optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_true=y_test, y_score=xgb_optuna_proba_50)
# Show the AUC to three decimals; the rounding mode is taken from the decimal
# context configured at the top of the notebook.
print(
    decimal.Decimal(auc_50).quantize(decimal.Decimal('1.000'))
)

0.777


In [672]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so
# hand it plain NumPy arrays. np.asarray leaves an ndarray unchanged, which
# keeps this cell safe to re-run (a second .values call would raise
# AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [673]:
auc_bootstrap = []

In [674]:
# Seeded generator that bootstrap_auc reads as the global `rs`.
rs = RandomState(seed=50)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone; otherwise xgb_optuna_50 would be left trained on the last
# bootstrap sample instead of the real training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_50), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76210173, 0.78012732])

In [675]:
np.mean(auc_bootstrap)

0.7713015575604493

In [676]:
t_50 = auc_bootstrap
print(t_50)

[0.7769392408578263, 0.7862696514012304, 0.7662297611927547, 0.7758645548530416, 0.7655489042207793, 0.7673204673615859, 0.7608416460184553, 0.7602408898667122, 0.7721932672590568, 0.7738887346206426, 0.7756149072966507, 0.7741450572453861, 0.7837705058099795, 0.768678176264525, 0.7698329630895422, 0.7718555088004101, 0.7762890892002734, 0.7694110987696514, 0.7693363380041012, 0.7687182266746411, 0.7648827323991797, 0.7735176008202324, 0.7770807523069034, 0.7741317071086808, 0.7702922077922078, 0.7710878759398496, 0.7664633885850991, 0.7775333219412166, 0.7650602892173615, 0.7640336637047163, 0.770912989149009, 0.7722626879699248, 0.78154637303486, 0.7705512004442926, 0.7722546778879016, 0.7690186047505128, 0.7686701661825017, 0.7675394096035543, 0.7684765892002734, 0.7653553272385509, 0.7715177503417635, 0.7688730882604238, 0.7776748333902939, 0.7654033877306904, 0.7778003246753247, 0.7754760658749146, 0.7689171437115515, 0.7690399649692412, 0.7699918297163364, 0.767793062200957, 0.77

In [677]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [678]:
# 51
column_to_drop_50 = 'Cat_이사 계획 중인 주택의 점유형태'

In [679]:
# Remove the selected feature from the previous frame (comp_50 -> comp_51).
# A 'Cat_' name stands for every column sharing that prefix; the prefix is
# matched literally (not as a regex), so metacharacters such as '(' or '+' in
# a column name cannot break or widen the match.
if column_to_drop_50.startswith('Cat_'):
    cols_to_drop_50 = [c for c in comp_50.columns if str(c).startswith(column_to_drop_50)]
else:
    cols_to_drop_50 = [column_to_drop_50]
comp_51 = comp_50.drop(columns=cols_to_drop_50)

# Feature matrix and label for this elimination step.
X_51 = comp_51.drop(columns='target')
y_51 = comp_51['target']

print(X_51.shape)

(6119, 46)


In [680]:
# Stage 1: stratified 80/20 split into (train + validation) and test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_51, y_51,
    test_size=0.2, stratify=y_51, shuffle=True, random_state=0,
)
# Stage 2: take a stratified 20% validation set out of the remaining rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, stratify=y_train, shuffle=True, random_state=0,
)

In [681]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one XGBoost configuration.

    Samples hyper-parameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores its positive-class
    probabilities against ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        ROC AUC of the fitted model on the validation split.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # in the listed order, which matters for reproducibility with a seeded sampler.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded
        # in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: maximize the validation AUC
# returned by objective() with a seeded TPE sampler, for at most 200 trials.
# objective() never calls trial.report(), so the Hyperband pruner has no
# intermediate values to prune on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# stops the study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [682]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7918012671224302


In [683]:
# Refit the best configuration found by the study on the training split.
xgb_optuna_51 = XGBClassifier(random_state=0, **study.best_params)
xgb_optuna_51.fit(X_train, y_train)

In [684]:
# Score the refit model on the test split (positive-class probability).
xgb_optuna_proba_51 = xgb_optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_true=y_test, y_score=xgb_optuna_proba_51)
# Show the AUC to three decimals; the rounding mode is taken from the decimal
# context configured at the top of the notebook.
print(
    decimal.Decimal(auc_51).quantize(decimal.Decimal('1.000'))
)

0.775


In [685]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so
# hand it plain NumPy arrays. np.asarray leaves an ndarray unchanged, which
# keeps this cell safe to re-run (a second .values call would raise
# AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [686]:
auc_bootstrap = []

In [687]:
# Seeded generator that bootstrap_auc reads as the global `rs`.
rs = RandomState(seed=51)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone; otherwise xgb_optuna_51 would be left trained on the last
# bootstrap sample instead of the real training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_51), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75886172, 0.77798232])

In [688]:
np.mean(auc_bootstrap)

0.7685401258383886

In [689]:
t_51 = auc_bootstrap
print(t_51)

[0.768248301862611, 0.7762984342959671, 0.7729115046138073, 0.7694911995898839, 0.7681134654818865, 0.7607348449248121, 0.7769846313226247, 0.7743332941729322, 0.7766482078776487, 0.7707888328776487, 0.7691374209671908, 0.7671228853383458, 0.7652231608851674, 0.7704123590225564, 0.7700385551948051, 0.7679132134313056, 0.7681775461380724, 0.7690706702836638, 0.7676608958475735, 0.7634409176349966, 0.7646037145420369, 0.7712187072795625, 0.7685847253075873, 0.7687769672761449, 0.7695138948222829, 0.7738500192241968, 0.7704884548017771, 0.7613382711038961, 0.7594038362952837, 0.7669813738892686, 0.7604131066302118, 0.7642325807416269, 0.7629069121667806, 0.770880948820916, 0.7609391020164047, 0.7664980989405332, 0.7717727379528366, 0.7727299427546138, 0.7702294621496923, 0.7757363935406699, 0.7670828349282297, 0.777999241712235, 0.7679025333219412, 0.7719529647983595, 0.770517825102529, 0.7724122095010253, 0.7657771915584415, 0.7665501644736842, 0.7678317775974026, 0.7671762858851675, 0.7

In [690]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [691]:
# 52
column_to_drop_51 = '소득 대비 주택 임대료의 비율'

In [692]:
# Remove the selected feature from the previous frame (comp_51 -> comp_52).
# A 'Cat_' name stands for every column sharing that prefix; the prefix is
# matched literally (not as a regex), so metacharacters such as '(' or '+' in
# a column name cannot break or widen the match.
if column_to_drop_51.startswith('Cat_'):
    cols_to_drop_51 = [c for c in comp_51.columns if str(c).startswith(column_to_drop_51)]
else:
    cols_to_drop_51 = [column_to_drop_51]
comp_52 = comp_51.drop(columns=cols_to_drop_51)

# Feature matrix and label for this elimination step.
X_52 = comp_52.drop(columns='target')
y_52 = comp_52['target']

print(X_52.shape)

(6119, 45)


In [693]:
# Stage 1: stratified 80/20 split into (train + validation) and test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_52, y_52,
    test_size=0.2, stratify=y_52, shuffle=True, random_state=0,
)
# Stage 2: take a stratified 20% validation set out of the remaining rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, stratify=y_train, shuffle=True, random_state=0,
)

In [694]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one XGBoost configuration.

    Samples hyper-parameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores its positive-class
    probabilities against ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        ROC AUC of the fitted model on the validation split.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # in the listed order, which matters for reproducibility with a seeded sampler.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded
        # in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: maximize the validation AUC
# returned by objective() with a seeded TPE sampler, for at most 200 trials.
# objective() never calls trial.report(), so the Hyperband pruner has no
# intermediate values to prune on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# stops the study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [695]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7918179617526022


In [696]:
# Refit the best configuration found by the study on the training split.
xgb_optuna_52 = XGBClassifier(random_state=0, **study.best_params)
xgb_optuna_52.fit(X_train, y_train)

In [697]:
# Score the refit model on the test split (positive-class probability).
xgb_optuna_proba_52 = xgb_optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_true=y_test, y_score=xgb_optuna_proba_52)
# Show the AUC to three decimals; the rounding mode is taken from the decimal
# context configured at the top of the notebook.
print(
    decimal.Decimal(auc_52).quantize(decimal.Decimal('1.000'))
)

0.778


In [698]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so
# hand it plain NumPy arrays. np.asarray leaves an ndarray unchanged, which
# keeps this cell safe to re-run (a second .values call would raise
# AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [699]:
auc_bootstrap = []

In [700]:
# Seeded generator that bootstrap_auc reads as the global `rs`.
rs = RandomState(seed=52)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone; otherwise xgb_optuna_52 would be left trained on the last
# bootstrap sample instead of the real training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_52), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76211174, 0.77888879])

In [701]:
np.mean(auc_bootstrap)

0.7707858297643968

In [702]:
t_52 = auc_bootstrap
print(t_52)

[0.7678544728298018, 0.7695299149863295, 0.7717927631578947, 0.7675447496582366, 0.7752424384825701, 0.772584426264525, 0.7745869467703348, 0.7757924641148325, 0.766408653024607, 0.7734601952323992, 0.7769966464456596, 0.7715270954374573, 0.7629603127136023, 0.7695886555878332, 0.7675767899863295, 0.7676088303144224, 0.7733266938653452, 0.769345683099795, 0.7741864426691729, 0.7740649564251538, 0.77568699803486, 0.7699918297163362, 0.7693790584415583, 0.7765761171394396, 0.7678865131578947, 0.7703229131066301, 0.7757857890464799, 0.7723534688995215, 0.7722720330656186, 0.7669052781100478, 0.7685179746240602, 0.7788883608168148, 0.7698329630895421, 0.7770593920881751, 0.7684632390635682, 0.7745629165242652, 0.7778083347573479, 0.7756335974880384, 0.7708675986842105, 0.7734642002734108, 0.7758939251537935, 0.7784130959501026, 0.7727366178229665, 0.7638841421736158, 0.7752237482911826, 0.7818253908920026, 0.7719890101674641, 0.7659814486500341, 0.7717740729665071, 0.7693350029904306, 0.77

In [703]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [704]:
# 53
column_to_drop_52 = 'Cat_이사 계획 중인 거주 지역'

In [705]:
# Remove the selected feature from the previous frame (comp_52 -> comp_53).
# A 'Cat_' name stands for every column sharing that prefix; the prefix is
# matched literally (not as a regex), so metacharacters such as '(' or '+' in
# a column name cannot break or widen the match.
if column_to_drop_52.startswith('Cat_'):
    cols_to_drop_52 = [c for c in comp_52.columns if str(c).startswith(column_to_drop_52)]
else:
    cols_to_drop_52 = [column_to_drop_52]
comp_53 = comp_52.drop(columns=cols_to_drop_52)

# Feature matrix and label for this elimination step.
X_53 = comp_53.drop(columns='target')
y_53 = comp_53['target']

print(X_53.shape)

(6119, 38)


In [706]:
# Stage 1: stratified 80/20 split into (train + validation) and test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_53, y_53,
    test_size=0.2, stratify=y_53, shuffle=True, random_state=0,
)
# Stage 2: take a stratified 20% validation set out of the remaining rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, stratify=y_train, shuffle=True, random_state=0,
)

In [707]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one XGBoost configuration.

    Samples hyper-parameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores its positive-class
    probabilities against ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        ROC AUC of the fitted model on the validation split.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # in the listed order, which matters for reproducibility with a seeded sampler.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded
        # in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: maximize the validation AUC
# returned by objective() with a seeded TPE sampler, for at most 200 trials.
# objective() never calls trial.report(), so the Hyperband pruner has no
# intermediate values to prune on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# stops the study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [708]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.78918855750048


In [709]:
# Refit the best configuration found by the study on the training split.
xgb_optuna_53 = XGBClassifier(random_state=0, **study.best_params)
xgb_optuna_53.fit(X_train, y_train)

In [710]:
# Score the refit model on the test split (positive-class probability).
xgb_optuna_proba_53 = xgb_optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_true=y_test, y_score=xgb_optuna_proba_53)
# Show the AUC to three decimals; the rounding mode is taken from the decimal
# context configured at the top of the notebook.
print(
    decimal.Decimal(auc_53).quantize(decimal.Decimal('1.000'))
)

0.778


In [711]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so
# hand it plain NumPy arrays. np.asarray leaves an ndarray unchanged, which
# keeps this cell safe to re-run (a second .values call would raise
# AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [712]:
auc_bootstrap = []

In [713]:
# Seeded generator that bootstrap_auc reads as the global `rs`.
rs = RandomState(seed=53)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone; otherwise xgb_optuna_53 would be left trained on the last
# bootstrap sample instead of the real training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_53), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76059343, 0.7773063 ])

In [714]:
np.mean(auc_bootstrap)

0.7690367308986245

In [715]:
t_53 = auc_bootstrap
print(t_53)

[0.7635009932501708, 0.7701066408920028, 0.7680734150717703, 0.7634182224025974, 0.7729729152426521, 0.7665381493506493, 0.7718448286910459, 0.7690039196001368, 0.7699971697710184, 0.774166417464115, 0.7670774948735475, 0.7717206724196857, 0.763805376367054, 0.7718074483082707, 0.7725991114149009, 0.763654519822283, 0.7756643028024608, 0.7708996390123035, 0.7671789559125086, 0.7666529605263158, 0.7677249765037594, 0.7657932117224882, 0.7663712726418319, 0.7664874188311688, 0.7669359834244702, 0.7712373974709501, 0.7710758608168147, 0.7692989576213259, 0.7747084330143541, 0.7649641682330827, 0.7660001388414217, 0.7604771872863979, 0.7703429383116883, 0.7688877734107997, 0.7798709308783323, 0.76963004101162, 0.7677436666951469, 0.7746817327409432, 0.7707287572624744, 0.7699290840738209, 0.7613048957621327, 0.7626412444463431, 0.7626519245557075, 0.7693990836466166, 0.7611767344497608, 0.7745535714285714, 0.7730717062542719, 0.7779631963431305, 0.7632887260765551, 0.7722813781613125, 0.76

In [716]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [717]:
# 54
column_to_drop_53 = 'Cat_현재 거주 지역'

In [718]:
# Remove the selected feature from the previous frame (comp_53 -> comp_54).
# A 'Cat_' name stands for every column sharing that prefix; the prefix is
# matched literally (not as a regex), so metacharacters such as '(' or '+' in
# a column name cannot break or widen the match.
if column_to_drop_53.startswith('Cat_'):
    cols_to_drop_53 = [c for c in comp_53.columns if str(c).startswith(column_to_drop_53)]
else:
    cols_to_drop_53 = [column_to_drop_53]
comp_54 = comp_53.drop(columns=cols_to_drop_53)

# Feature matrix and label for this elimination step.
X_54 = comp_54.drop(columns='target')
y_54 = comp_54['target']

print(X_54.shape)

(6119, 21)


In [719]:
# Stage 1: stratified 80/20 split into (train + validation) and test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_54, y_54,
    test_size=0.2, stratify=y_54, shuffle=True, random_state=0,
)
# Stage 2: take a stratified 20% validation set out of the remaining rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, stratify=y_train, shuffle=True, random_state=0,
)

In [720]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one XGBoost configuration.

    Samples hyper-parameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores its positive-class
    probabilities against ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        ROC AUC of the fitted model on the validation split.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls run
    # in the listed order, which matters for reproducibility with a seeded sampler.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded
        # in the trial's params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyper-parameter search for this feature set: maximize the validation AUC
# returned by objective() with a seeded TPE sampler, for at most 200 trials.
# objective() never calls trial.report(), so the Hyperband pruner has no
# intermediate values to prune on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# stops the study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [721]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7790152672392925


In [722]:
# Refit the best configuration found by the study on the training split.
xgb_optuna_54 = XGBClassifier(random_state=0, **study.best_params)
xgb_optuna_54.fit(X_train, y_train)

In [723]:
# Score the refit model on the test split (positive-class probability).
xgb_optuna_proba_54 = xgb_optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_true=y_test, y_score=xgb_optuna_proba_54)
# Show the AUC to three decimals; the rounding mode is taken from the decimal
# context configured at the top of the notebook.
print(
    decimal.Decimal(auc_54).quantize(decimal.Decimal('1.000'))
)

0.763


In [724]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so
# hand it plain NumPy arrays. np.asarray leaves an ndarray unchanged, which
# keeps this cell safe to re-run (a second .values call would raise
# AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [725]:
auc_bootstrap = []

In [726]:
# Seeded generator that bootstrap_auc reads as the global `rs`.
rs = RandomState(seed=54)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone; otherwise xgb_optuna_54 would be left trained on the last
# bootstrap sample instead of the real training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_54), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74977385, 0.76564292])

In [727]:
np.mean(auc_bootstrap)

0.7580786062724283

In [728]:
t_54 = auc_bootstrap
print(t_54)

[0.7516781121838688, 0.7547940340909091, 0.7625170881749829, 0.7561864533492823, 0.76028494531784, 0.762000437884484, 0.7580020719412167, 0.7506034261790842, 0.7604077665755298, 0.763323436431989, 0.7596347936602871, 0.7594131813909774, 0.7598163555194806, 0.7615905886876282, 0.755757913961039, 0.760264920112782, 0.7588751708817498, 0.7582770847573479, 0.7607708902939165, 0.7554628759398496, 0.7592476396958305, 0.7578538854237868, 0.7609884975222145, 0.7578365302460697, 0.7616960547676007, 0.7608296308954203, 0.7530051157723854, 0.7544656207279563, 0.7571476631920709, 0.7586468835440876, 0.7600953733766234, 0.7586001580656186, 0.7603223257006152, 0.7540023709842789, 0.751651411910458, 0.7533815896274778, 0.7587523496240601, 0.7540504314764185, 0.7620271381578947, 0.7551518177546138, 0.7506167763157896, 0.7552706339712918, 0.7490214349794941, 0.760166129101162, 0.7556804831681476, 0.759780310150376, 0.7505179853041695, 0.7516874572795627, 0.7541198521872864, 0.7623795817669172, 0.753731

In [729]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [730]:
# 55
column_to_drop_54 = 'Cat_주택 마련 예상 소요연수'

In [731]:
# Remove the selected feature from the previous frame (comp_54 -> comp_55).
# A 'Cat_' name stands for every column sharing that prefix; the prefix is
# matched literally (not as a regex), so metacharacters such as '(' or '+' in
# a column name cannot break or widen the match.
if column_to_drop_54.startswith('Cat_'):
    cols_to_drop_54 = [c for c in comp_54.columns if str(c).startswith(column_to_drop_54)]
else:
    cols_to_drop_54 = [column_to_drop_54]
comp_55 = comp_54.drop(columns=cols_to_drop_54)

# Feature matrix and label for this elimination step.
X_55 = comp_55.drop(columns='target')
y_55 = comp_55['target']

print(X_55.shape)

(6119, 15)


In [732]:
# Stage 1: stratified 80/20 split into (train + validation) and test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_55, y_55,
    test_size=0.2, stratify=y_55, shuffle=True, random_state=0,
)
# Stage 2: take a stratified 20% validation set out of the remaining rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, stratify=y_train, shuffle=True, random_state=0,
)

In [733]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBoost configuration.

    Samples hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: object providing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical``.

    Returns:
        ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # The suggest_* calls are evaluated top to bottom, in the order listed.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant values that still end up in trial.params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximise the objective with a TPE sampler seeded at 10, for at most 200 trials.
# EarlyStoppingCallback is defined elsewhere in this notebook; the 10 is
# presumably its patience in trials (confirm against its definition).
# NOTE(review): objective() never calls trial.report, so verify that the
# Hyperband pruner has any effect here.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [734]:
# Keep the best trial's validation AUC, then show its hyperparameters and the
# AUC on separate lines.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7721516874097447


In [735]:
# Fit a classifier with the best trial's parameters on the training split only
# (the validation split is not added back).
xgb_optuna_55 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_55.fit(X_train, y_train)

In [736]:
# Test-set ROC AUC of the tuned model, printed quantized to three decimals
# (the decimal context is set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_55 = xgb_optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, xgb_optuna_proba_55)
print(decimal.Decimal(auc_55).quantize(decimal.Decimal('1.000')))

0.753


In [737]:
# bootstrap_auc selects resampled rows positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray leaves an existing
# array as it is, so this cell can be re-run safely (.values would raise
# AttributeError on an ndarray).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [738]:
# Start from an empty global list; bootstrap_auc appends one AUC per resample to it.
auc_bootstrap = []

In [739]:
# 2.5th/97.5th percentiles of the test AUC over 2000 refits on resampled
# training rows. bootstrap_auc draws indices from the global `rs` and appends
# to the global `auc_bootstrap`, so both are initialised here to keep the cell
# re-runnable on its own.
rs = RandomState(seed = 55)
auc_bootstrap = []
# bootstrap_auc refits the estimator it receives in place; pass an unfitted
# clone so xgb_optuna_55 stays the model that was fitted on the training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_55), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74205263, 0.75673785])

In [740]:
# Mean of the bootstrap AUCs accumulated in the global auc_bootstrap list.
np.mean(auc_bootstrap)

0.7496376372394054

In [741]:
# Keep step 55's bootstrap AUCs under their own name. Copy the list rather
# than alias it: bootstrap_auc appends to the global auc_bootstrap in place,
# so an alias would silently grow if the bootstrap cell were run again.
t_55 = list(auc_bootstrap)
print(t_55)

[0.7452219860731374, 0.7494566494360904, 0.7424211273923444, 0.7592743399692413, 0.7480976055194806, 0.7573505852699932, 0.7522668532125769, 0.7479333988380041, 0.7528355690362268, 0.7536686175666439, 0.751938439849624, 0.7536152170198223, 0.7473313076725906, 0.7529009847060834, 0.7567538341592619, 0.7523910094839372, 0.7508744339542037, 0.7517181625939849, 0.753691312799043, 0.7557645890293916, 0.7482110816814764, 0.7550516917293233, 0.7403224859022557, 0.7499799747949418, 0.752588591507177, 0.7482230968045113, 0.746128460355434, 0.7535751666097059, 0.7498624935919342, 0.751352368848257, 0.7547339584757349, 0.7432368207450444, 0.7487330720266575, 0.7496635765550238, 0.7501108061346549, 0.7434477529049898, 0.7501201512303487, 0.7420486585782639, 0.7496555664730007, 0.7519023944805195, 0.7500774307928915, 0.7543454694976076, 0.7514725200786057, 0.7529103298017772, 0.7462139012303486, 0.7479213837149693, 0.7512161974538619, 0.7512268775632264, 0.7526139567669172, 0.7534616904477102, 0.75

In [742]:
# Remove this step's splits, Optuna study and best validation AUC from the
# notebook namespace before the next step rebinds the same names.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [743]:
# Step 56: name used by the next cell to derive comp_56 from comp_55.
# It has no 'Cat_' prefix, so that cell drops this single column.
column_to_drop_55 = '장기부채부담지표'

In [744]:
# Step 56: build comp_56 by removing `column_to_drop_55` from comp_55.
# A name starting with 'Cat_' is treated as a prefix: every column beginning
# with it is dropped (presumably the dummy columns of one categorical
# variable; verify against the encoding step). Any other name is dropped as a
# single column.
# NOTE(review): a prefix match also removes any other variable whose name
# merely starts with the same text; confirm no such column exists.
if column_to_drop_55.startswith('Cat_'):
    # Literal prefix comparison instead of a regex, so characters such as
    # '(' or '.' inside a column name are not interpreted as a pattern.
    dropped_cols_56 = [c for c in comp_55.columns if str(c).startswith(column_to_drop_55)]
else:
    dropped_cols_56 = [column_to_drop_55]

comp_56 = comp_55.drop(columns=dropped_cols_56)
X_56 = comp_56.drop(columns='target')
y_56 = comp_56['target']

print(X_56.shape)

(6119, 14)


In [745]:
# Stratified 80/20 train/test split, followed by a stratified 80/20 split of
# the training part into train/validation. Both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(
    X_56,
    y_56,
    test_size=0.2,
    shuffle=True,
    stratify=y_56,
    random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    shuffle=True,
    stratify=y_train,
    random_state=0,
)

In [746]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBoost configuration.

    Samples hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: object providing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical``.

    Returns:
        ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # The suggest_* calls are evaluated top to bottom, in the order listed.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant values that still end up in trial.params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximise the objective with a TPE sampler seeded at 10, for at most 200 trials.
# EarlyStoppingCallback is defined elsewhere in this notebook; the 10 is
# presumably its patience in trials (confirm against its definition).
# NOTE(review): objective() never calls trial.report, so verify that the
# Hyperband pruner has any effect here.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [747]:
# Keep the best trial's validation AUC, then show its hyperparameters and the
# AUC on separate lines.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7667238457750064


In [748]:
# Fit a classifier with the best trial's parameters on the training split only
# (the validation split is not added back).
xgb_optuna_56 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_56.fit(X_train, y_train)

In [749]:
# Test-set ROC AUC of the tuned model, printed quantized to three decimals
# (the decimal context is set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_56 = xgb_optuna_56.predict_proba(X_test)[:, 1]
auc_56 = roc_auc_score(y_test, xgb_optuna_proba_56)
print(decimal.Decimal(auc_56).quantize(decimal.Decimal('1.000')))

0.747


In [750]:
# bootstrap_auc selects resampled rows positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray leaves an existing
# array as it is, so this cell can be re-run safely (.values would raise
# AttributeError on an ndarray).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [751]:
# Start from an empty global list; bootstrap_auc appends one AUC per resample to it.
auc_bootstrap = []

In [752]:
# 2.5th/97.5th percentiles of the test AUC over 2000 refits on resampled
# training rows. bootstrap_auc draws indices from the global `rs` and appends
# to the global `auc_bootstrap`, so both are initialised here to keep the cell
# re-runnable on its own.
rs = RandomState(seed = 56)
auc_bootstrap = []
# bootstrap_auc refits the estimator it receives in place; pass an unfitted
# clone so xgb_optuna_56 stays the model that was fitted on the training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_56), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74047822, 0.75177841])

In [753]:
# Mean of the bootstrap AUCs accumulated in the global auc_bootstrap list.
np.mean(auc_bootstrap)

0.7464051753139952

In [754]:
# Keep step 56's bootstrap AUCs under their own name. Copy the list rather
# than alias it: bootstrap_auc appends to the global auc_bootstrap in place,
# so an alias would silently grow if the bootstrap cell were run again.
t_56 = list(auc_bootstrap)
print(t_56)

[0.7513336786568695, 0.7510666759227613, 0.7438135466507176, 0.7494219390806561, 0.7498972039473685, 0.7507596227785373, 0.7471283855946684, 0.7519531249999999, 0.7423824119958988, 0.7506327964798358, 0.7474300986842105, 0.7463193673103211, 0.7446065447710184, 0.7493044578776487, 0.7490614853896104, 0.7456465204203692, 0.7400968685919345, 0.749146926264525, 0.7492523923444976, 0.7471484107997265, 0.748128310833903, 0.7437241007347917, 0.7506074312200958, 0.7427949312200958, 0.7488358680792891, 0.7456211551606289, 0.7504966250854409, 0.7451872757177033, 0.7383706959159262, 0.7474474538619275, 0.7495954908578263, 0.7492750875768968, 0.7449296180792893, 0.7493765486158579, 0.7475742801606288, 0.740620193950786, 0.7448067968215996, 0.7440965695488722, 0.7503057181305537, 0.7498705036739576, 0.7537820937286397, 0.7453875277682843, 0.7452046308954203, 0.7491375811688311, 0.7444089627477785, 0.7461311303827749, 0.747591635338346, 0.7472512068523581, 0.7477705271701983, 0.7490681604579631, 0.7

In [755]:
# Remove this step's splits, Optuna study and best validation AUC from the
# notebook namespace before the next step rebinds the same names.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [756]:
# Step 57: name used by the next cell to derive comp_57 from comp_56.
# It starts with 'Cat_', so that cell removes every column whose name begins with it.
column_to_drop_56 = 'Cat_현재 주택의 점유형태'

In [757]:
# Step 57: build comp_57 by removing `column_to_drop_56` from comp_56.
# A name starting with 'Cat_' is treated as a prefix: every column beginning
# with it is dropped (presumably the dummy columns of one categorical
# variable; verify against the encoding step). Any other name is dropped as a
# single column.
# NOTE(review): a prefix match also removes any other variable whose name
# merely starts with the same text; confirm no such column exists.
if column_to_drop_56.startswith('Cat_'):
    # Literal prefix comparison instead of a regex, so characters such as
    # '(' or '.' inside a column name are not interpreted as a pattern.
    dropped_cols_57 = [c for c in comp_56.columns if str(c).startswith(column_to_drop_56)]
else:
    dropped_cols_57 = [column_to_drop_56]

comp_57 = comp_56.drop(columns=dropped_cols_57)
X_57 = comp_57.drop(columns='target')
y_57 = comp_57['target']

print(X_57.shape)

(6119, 10)


In [758]:
# Stratified 80/20 train/test split, followed by a stratified 80/20 split of
# the training part into train/validation. Both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(
    X_57,
    y_57,
    test_size=0.2,
    shuffle=True,
    stratify=y_57,
    random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    shuffle=True,
    stratify=y_train,
    random_state=0,
)

In [759]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBoost configuration.

    Samples hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: object providing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical``.

    Returns:
        ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # The suggest_* calls are evaluated top to bottom, in the order listed.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant values that still end up in trial.params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximise the objective with a TPE sampler seeded at 10, for at most 200 trials.
# EarlyStoppingCallback is defined elsewhere in this notebook; the 10 is
# presumably its patience in trials (confirm against its definition).
# NOTE(review): objective() never calls trial.report, so verify that the
# Hyperband pruner has any effect here.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [760]:
# Keep the best trial's validation AUC, then show its hyperparameters and the
# AUC on separate lines.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7547892720306514


In [761]:
# Fit a classifier with the best trial's parameters on the training split only
# (the validation split is not added back).
xgb_optuna_57 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_57.fit(X_train, y_train)

In [762]:
# Test-set ROC AUC of the tuned model, printed quantized to three decimals
# (the decimal context is set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_57 = xgb_optuna_57.predict_proba(X_test)[:, 1]
auc_57 = roc_auc_score(y_test, xgb_optuna_proba_57)
print(decimal.Decimal(auc_57).quantize(decimal.Decimal('1.000')))

0.734


In [763]:
# bootstrap_auc selects resampled rows positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray leaves an existing
# array as it is, so this cell can be re-run safely (.values would raise
# AttributeError on an ndarray).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [764]:
# Start from an empty global list; bootstrap_auc appends one AUC per resample to it.
auc_bootstrap = []

In [765]:
# 2.5th/97.5th percentiles of the test AUC over 2000 refits on resampled
# training rows. bootstrap_auc draws indices from the global `rs` and appends
# to the global `auc_bootstrap`, so both are initialised here to keep the cell
# re-runnable on its own.
rs = RandomState(seed = 57)
auc_bootstrap = []
# bootstrap_auc refits the estimator it receives in place; pass an unfitted
# clone so xgb_optuna_57 stays the model that was fitted on the training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_57), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72641762, 0.73767816])

In [766]:
# Mean of the bootstrap AUCs accumulated in the global auc_bootstrap list.
np.mean(auc_bootstrap)

0.7323691072496582

In [767]:
# Keep step 57's bootstrap AUCs under their own name. Copy the list rather
# than alias it: bootstrap_auc appends to the global auc_bootstrap in place,
# so an alias would silently grow if the bootstrap cell were run again.
t_57 = list(auc_bootstrap)
print(t_57)

[0.7307344177204375, 0.734441750683527, 0.7317717233424469, 0.7350558569719754, 0.7267307117224879, 0.7331013969583048, 0.7334725307587151, 0.7339798359535201, 0.736760669429255, 0.7348956553315107, 0.7345445467361587, 0.7359623312542722, 0.7293500085440875, 0.731084191302119, 0.7298786739576212, 0.731682277426521, 0.7330226311517429, 0.7272713922590568, 0.7318157787935748, 0.7289107890464799, 0.73741616114149, 0.7301456766917294, 0.7335179212235132, 0.7355084266062885, 0.7260618698735476, 0.731289783407382, 0.729180461807929, 0.7287412423103211, 0.7290109150717703, 0.731000085440875, 0.72896018455229, 0.7275116947197539, 0.7280777405160628, 0.7346500128161312, 0.7323844946172248, 0.7296891020164047, 0.7383893861073136, 0.7301082963089541, 0.734107997265892, 0.7377512495727957, 0.7350184765892004, 0.7310641660970609, 0.734604622351333, 0.7372212491455913, 0.7305288256151742, 0.736528377050581, 0.7355631621667805, 0.7315647962235133, 0.7344350756151742, 0.7349717511107313, 0.73502782168

In [768]:
# Remove this step's splits, Optuna study and best validation AUC from the
# notebook namespace before the next step rebinds the same names.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [769]:
# Step 58: name used by the next cell to derive comp_58 from comp_57.
# It has no 'Cat_' prefix, so that cell drops this single column.
column_to_drop_57 = '현재 주택의 면적(㎡)'

In [770]:
# Step 58: build comp_58 by removing `column_to_drop_57` from comp_57.
# A name starting with 'Cat_' is treated as a prefix: every column beginning
# with it is dropped (presumably the dummy columns of one categorical
# variable; verify against the encoding step). Any other name is dropped as a
# single column.
# NOTE(review): a prefix match also removes any other variable whose name
# merely starts with the same text; confirm no such column exists.
if column_to_drop_57.startswith('Cat_'):
    # Literal prefix comparison instead of a regex, so characters such as
    # '(' or '.' inside a column name are not interpreted as a pattern.
    dropped_cols_58 = [c for c in comp_57.columns if str(c).startswith(column_to_drop_57)]
else:
    dropped_cols_58 = [column_to_drop_57]

comp_58 = comp_57.drop(columns=dropped_cols_58)
X_58 = comp_58.drop(columns='target')
y_58 = comp_58['target']

print(X_58.shape)

(6119, 9)


In [771]:
# Stratified 80/20 train/test split, followed by a stratified 80/20 split of
# the training part into train/validation. Both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(
    X_58,
    y_58,
    test_size=0.2,
    shuffle=True,
    stratify=y_58,
    random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    shuffle=True,
    stratify=y_train,
    random_state=0,
)

In [772]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled XGBoost configuration.

    Samples hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: object providing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical``.

    Returns:
        ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # The suggest_* calls are evaluated top to bottom, in the order listed.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: constant values that still end up in trial.params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximise the objective with a TPE sampler seeded at 10, for at most 200 trials.
# EarlyStoppingCallback is defined elsewhere in this notebook; the 10 is
# presumably its patience in trials (confirm against its definition).
# NOTE(review): objective() never calls trial.report, so verify that the
# Hyperband pruner has any effect here.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [773]:
# Keep the best trial's validation AUC, then show its hyperparameters and the
# AUC on separate lines.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7172597434035342


In [774]:
# Fit a classifier with the best trial's parameters on the training split only
# (the validation split is not added back).
xgb_optuna_58 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_58.fit(X_train, y_train)

In [775]:
# Test-set ROC AUC of the tuned model, printed quantized to three decimals
# (the decimal context is set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_58 = xgb_optuna_58.predict_proba(X_test)[:, 1]
auc_58 = roc_auc_score(y_test, xgb_optuna_proba_58)
print(decimal.Decimal(auc_58).quantize(decimal.Decimal('1.000')))

0.706


In [776]:
# bootstrap_auc selects resampled rows positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray leaves an existing
# array as it is, so this cell can be re-run safely (.values would raise
# AttributeError on an ndarray).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [777]:
# Start from an empty global list; bootstrap_auc appends one AUC per resample to it.
auc_bootstrap = []

In [778]:
# 2.5th/97.5th percentiles of the test AUC over 2000 refits on resampled
# training rows. bootstrap_auc draws indices from the global `rs` and appends
# to the global `auc_bootstrap`, so both are initialised here to keep the cell
# re-runnable on its own.
rs = RandomState(seed = 58)
auc_bootstrap = []
# bootstrap_auc refits the estimator it receives in place; pass an unfitted
# clone so xgb_optuna_58 stays the model that was fitted on the training split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_58), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.70108103, 0.70645319])

In [779]:
# Mean of the bootstrap AUCs accumulated in the global auc_bootstrap list.
np.mean(auc_bootstrap)

0.7042479641041522

In [780]:
# Keep step 58's bootstrap AUCs under their own name. Copy the list rather
# than alias it: bootstrap_auc appends to the global auc_bootstrap in place,
# so an alias would silently grow if the bootstrap cell were run again.
t_58 = list(auc_bootstrap)
print(t_58)

[0.706453189080656, 0.7057963623547504, 0.7012786760936431, 0.706453189080656, 0.7059245236671222, 0.706453189080656, 0.706453189080656, 0.7019355028195488, 0.706453189080656, 0.7029848235645932, 0.7036416502904989, 0.7018073415071769, 0.706453189080656, 0.7033693075017087, 0.7057963623547504, 0.706453189080656, 0.7012786760936431, 0.7058964883800409, 0.706453189080656, 0.706453189080656, 0.7029848235645932, 0.706453189080656, 0.7009702879357483, 0.7014989533492821, 0.7019355028195488, 0.7037017259056731, 0.705652180878332, 0.7059245236671222, 0.705652180878332, 0.706453189080656, 0.706453189080656, 0.706453189080656, 0.706453189080656, 0.7001572646103895, 0.706453189080656, 0.706453189080656, 0.706453189080656, 0.7013067113807244, 0.7019355028195488, 0.7057963623547504, 0.7057963623547504, 0.7018073415071769, 0.7013067113807244, 0.7057963623547504, 0.7042904669343812, 0.7059245236671222, 0.7064251537935747, 0.7059245236671222, 0.7059245236671222, 0.7003014460868079, 0.7018073415071769

In [781]:
# Remove this step's splits, Optuna study and best validation AUC from the
# notebook namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc