In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
import decimal
context = decimal.getcontext()

context.rounding = decimal.ROUND_HALF_UP

In [3]:
class EarlyStoppingCallback(object):
    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [4]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [5]:
고령가구 = pd.read_csv('고령가구_변수추가.csv', encoding='cp949')
고령가구.rename(columns = {'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?':'target'}, inplace=True)

In [6]:
고령가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [7]:
cat = 고령가구.select_dtypes(include = 'object')
num = 고령가구.select_dtypes(exclude = 'object')
num_고령 = num.drop('target',axis=1)
target = 고령가구.target

In [8]:
scaler=RobustScaler()
scaler.fit(num_고령)
num_scaled_고령=scaler.transform(num_고령)
num_df_scaled_고령=pd.DataFrame(data=num_scaled_고령, columns=num_고령.columns)

In [9]:
enc = OneHotEncoder()
enc.fit(cat)

X_cat = enc.transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(X_cat, columns= new_feature_names)

In [10]:
comp =pd.concat([num_df_scaled_고령, target,cat2],axis=1)

In [11]:
X =comp.drop('target', axis = 1)
y=comp.target
X.shape

(10564, 210)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [14]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])

        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [15]:
print(study.best_trial.params)

{'n_estimators': 117, 'learning_rate': 0.06999999999999999, 'max_depth': 4, 'max_leaves': 572, 'subsample': 0.8, 'colsample_bytree': 0.4, 'gamma': 3, 'reg_alpha': 3, 'reg_lambda': 1, 'booster': 'gbtree', 'objective': 'binary:logistic'}


In [16]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.8515285615119352


In [17]:
xgb_optuna_0 = XGBClassifier(**study.best_trial.params, random_state = 0)

In [18]:
xgb_optuna_0.fit(X_train, y_train)

In [19]:
xgb_optuna_0_proba = xgb_optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, xgb_optuna_0_proba)
print(decimal.Decimal(auc_0).quantize(decimal.Decimal('1.000')))

0.855


In [20]:
X_train = X_train.values
y_train = y_train.values

In [21]:
from sklearn.utils import resample
from numpy.random import RandomState

In [22]:
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

In [23]:
auc_bootstrap = []
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000):
    for b in range(nsamples):
        idx = rs.randint(X_train.shape[0], size=X_train.shape[0])
        clf.fit(X_train[idx], y_train[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test.ravel(), pred.ravel())
        auc_bootstrap.append(roc_auc)
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [24]:
rs = RandomState(seed = 2024)
bootstrap_auc(xgb_optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84952286, 0.85637334])

In [25]:
np.mean(auc_bootstrap)

0.8530372277347532

In [26]:
t_0 = auc_bootstrap
print(t_0)

[0.8492802337671651, 0.8510648596321393, 0.8534679645763867, 0.8535943494317164, 0.8495343479975619, 0.8538901437739772, 0.8521207557993619, 0.8524658491986663, 0.8541707002258793, 0.8513723064787925, 0.8562421569681977, 0.8555286651608046, 0.85342225090531, 0.8502859345308523, 0.8515748807859166, 0.8526612527338568, 0.8509922555663118, 0.8546359040550716, 0.8509644688250689, 0.8551100713491807, 0.8539340647520706, 0.8517532537377649, 0.8528252841418379, 0.8517299487289807, 0.8537547954537307, 0.8534840988132373, 0.8516842350579039, 0.856133699042702, 0.8548788139543222, 0.8542450969846903, 0.8523824889749382, 0.8544180918575884, 0.8556371230863001, 0.8534249399447851, 0.8502285683553835, 0.8535522211466066, 0.8496405650568284, 0.8533182747122728, 0.8503997705352982, 0.8551593704062243, 0.8533469578000072, 0.8555259761213294, 0.8539555770678714, 0.8515820515578503, 0.8540774801907424, 0.851702161987738, 0.8524022085977555, 0.8495760281094258, 0.8588039152414758, 0.8540496934494999, 0.8

auc_bootstrap = []
for _ in range(2000):

    X_test_re, y_test_re = resample(X_test, y_test, replace = True, random_state=0)

    proba_0 = xgb_optuna_0.predict_proba(X_test_re)[:,1]

    auc_bootstrap.append(roc_auc_score(y_test_re,proba_0))

In [27]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [28]:
# 1. 
column_to_drop = '부채 중 임대 보증금의 비중'

In [29]:
if not column_to_drop.startswith('Cat_'):
    comp_1 = comp.drop(column_to_drop, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']
else:
    comp_1 = comp.drop(comp.filter(regex='^' + column_to_drop).columns, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']

print(X_1.shape)

(10564, 209)


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [31]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])

        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [32]:
print(study.best_trial.params)

{'n_estimators': 108, 'learning_rate': 0.09999999999999999, 'max_depth': 10, 'max_leaves': 468, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 6, 'reg_alpha': 10, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}


In [33]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.849657396211205


In [34]:
xgb_optuna_1 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_1.fit(X_train, y_train)

In [35]:
xgb_optuna_1_proba = xgb_optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, xgb_optuna_1_proba)
print(decimal.Decimal(auc_1).quantize(decimal.Decimal('1.000')))

0.852


In [36]:
X_train = X_train.values
y_train = y_train.values

In [37]:
auc_bootstrap = []
auc_bootstrap

[]

In [38]:
rs = RandomState(seed = 1)
bootstrap_auc(xgb_optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84792219, 0.85346202])

In [39]:
decimal.Decimal('0.76368714').quantize(decimal.Decimal('1.000')), decimal.Decimal('0.77841037').quantize(decimal.Decimal('1.000'))

(Decimal('0.764'), Decimal('0.778'))

In [40]:
np.mean(auc_bootstrap)

0.8506254542235847

In [41]:
decimal.Decimal('0.7709262407256606').quantize(decimal.Decimal('1.000'))

Decimal('0.771')

In [42]:
t_1 = auc_bootstrap
print(t_1)

[0.8513570685884335, 0.8490884156179413, 0.8479164425800438, 0.8486371051593703, 0.8491816356530781, 0.8509187551539923, 0.8507834068337456, 0.8497140654691477, 0.8519343157290882, 0.8482664658850525, 0.8492394500017926, 0.8491852210390449, 0.8487285325015238, 0.8500071707719337, 0.8498507583091318, 0.8511957262199277, 0.8487711089598795, 0.8510316948119466, 0.8501541715965725, 0.850352264171238, 0.8523896597468718, 0.8512620558603134, 0.8485546412821341, 0.8527428202646014, 0.8482458499157433, 0.8518769495536195, 0.8487244989423111, 0.8519307303431214, 0.8484237746943459, 0.8504239718905741, 0.8504396579541789, 0.8495657201247715, 0.8518903947509949, 0.8526464630167437, 0.8519535871786598, 0.8512683302857553, 0.850683464199921, 0.8496141228353232, 0.8482037216306335, 0.8511401527374423, 0.8470070990642141, 0.8489145243985516, 0.8512293392133662, 0.8525985084794379, 0.8519863038256068, 0.8517268115162598, 0.852869653293177, 0.850710802767918, 0.8490543544512567, 0.849722580760819, 0.84

In [43]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [44]:
# 2.
column_to_drop_1 = 'Cat_가구주 동거 여부'

In [45]:
if not column_to_drop_1.startswith('Cat_'):
    comp_2 = comp_1.drop(column_to_drop_1, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']
else:
    comp_2 = comp_1.drop(comp_1.filter(regex='^' + column_to_drop_1).columns, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']

print(X_2.shape)

(10564, 207)


In [46]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [47]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [48]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 108, 'learning_rate': 0.09999999999999999, 'max_depth': 10, 'max_leaves': 468, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 6, 'reg_alpha': 10, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8500170742084285


In [49]:
xgb_optuna_2 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_2.fit(X_train, y_train)

In [50]:
xgb_optuna_2_proba = xgb_optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, xgb_optuna_2_proba)
print(decimal.Decimal(auc_2).quantize(decimal.Decimal('1.000')))

0.853


In [51]:
X_train = X_train.values
y_train = y_train.values

In [52]:
auc_bootstrap = []

In [53]:
rs = RandomState(seed = 2)
bootstrap_auc(xgb_optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84900005, 0.85454035])

In [54]:
decimal.Decimal('0.763322').quantize(decimal.Decimal('1.000')), decimal.Decimal('0.77834399').quantize(decimal.Decimal('1.000'))

(Decimal('0.763'), Decimal('0.778'))

In [55]:
np.mean(auc_bootstrap)

0.8517533431483274

In [56]:
decimal.Decimal('0.7709589705907302').quantize(decimal.Decimal('1.000'))

Decimal('0.771')

In [57]:
t_2 = auc_bootstrap
print(t_2)

[0.8517129181456383, 0.8526361550320893, 0.8533415797210568, 0.8497266143200315, 0.852087590979169, 0.8506041375354056, 0.850838980316231, 0.8510357283711593, 0.8523246746262235, 0.8520042307554409, 0.8492484134667098, 0.8524255136065397, 0.8527912229751533, 0.8501219031228711, 0.8498243160876269, 0.8526953139005414, 0.8509904628733285, 0.8503531605177298, 0.8523842816679216, 0.8504038040945108, 0.8544844214979743, 0.8516295579219103, 0.8484071922842494, 0.8499802803771827, 0.8505118138467606, 0.8508537700333442, 0.8510195941343086, 0.8525460722096734, 0.8535002330500879, 0.8507645835574199, 0.8501528270768348, 0.8526729052382489, 0.8507090100749346, 0.8516927503495751, 0.8509541608404146, 0.8508201570399053, 0.8509913592198199, 0.8541635294539457, 0.8524371661109318, 0.8497207880678357, 0.8506516438994658, 0.8513956114875766, 0.8502975870352445, 0.8503235810835037, 0.852200978810369, 0.8505024022085976, 0.8515883259832921, 0.8528925101287155, 0.8539609551468216, 0.8493873471729232, 0.

In [58]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [59]:
# 3.
column_to_drop_2 = '부채 중 비금융기관 대출금의 비중'

In [60]:
if not column_to_drop_2.startswith('Cat_'):
    comp_3 = comp_2.drop(column_to_drop_2, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']
else:
    comp_3 = comp_2.drop(comp_2.filter(regex='^' + column_to_drop_2).columns, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']

print(X_3.shape)

(10564, 206)


In [61]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [62]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [63]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 108, 'learning_rate': 0.09999999999999999, 'max_depth': 10, 'max_leaves': 468, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 6, 'reg_alpha': 10, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8493509001746609


In [64]:
xgb_optuna_3 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_3.fit(X_train, y_train)

In [65]:
xgb_optuna_3_proba = xgb_optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, xgb_optuna_3_proba)
print(decimal.Decimal(auc_3).quantize(decimal.Decimal('1.000')))

0.852


In [66]:
X_train = X_train.values
y_train = y_train.values

In [67]:
auc_bootstrap = []

In [68]:
rs = RandomState(seed = 3)
bootstrap_auc(xgb_optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.8480277 , 0.85356535])

In [69]:
decimal.Decimal('0.76336265').quantize(decimal.Decimal('1.000')), decimal.Decimal('0.7781291').quantize(decimal.Decimal('1.000'))

(Decimal('0.763'), Decimal('0.778'))

In [70]:
decimal.Decimal('0.7709458006045286').quantize(decimal.Decimal('1.000'))

Decimal('0.771')

In [71]:
t_3 = auc_bootstrap
print(t_3)

[0.8480840593739916, 0.852914470617762, 0.8515892223297838, 0.8515000358538597, 0.8515901186762755, 0.8503069986734073, 0.8513369007923702, 0.8510679968448602, 0.8496921049801011, 0.850889623893012, 0.8533886379118714, 0.8518424402136889, 0.8499224660284681, 0.8502581477896095, 0.8537019110107202, 0.8550338818973863, 0.850095909074612, 0.8496705926643002, 0.8515565056828368, 0.8492264529776632, 0.8501514825570973, 0.8510756157900397, 0.8517622172026819, 0.8514368434261949, 0.8527916711483992, 0.852601197518913, 0.8507726506758453, 0.8510276612527339, 0.8498767523573912, 0.8499197769889929, 0.8506050338818973, 0.8496880714208883, 0.8540936144275931, 0.8515125847047434, 0.8496629737191208, 0.8512687784590012, 0.8512696748054928, 0.8500820157039906, 0.8517474274855689, 0.8494792226883223, 0.8507076655551971, 0.8491928399842241, 0.85118900362124, 0.8510890609874153, 0.8535513248001148, 0.8541375354056865, 0.8490498727187982, 0.851648381198236, 0.8507592054784696, 0.8476125811193576, 0.8509

In [72]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [73]:
### 4. 
column_to_drop_3 = 'Cat_이사 계획 첫 번째 이유'

In [74]:
if not column_to_drop_3.startswith('Cat_'):
    comp_4 = comp_3.drop(column_to_drop_3, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']
else:
    comp_4 = comp_3.drop(comp_3.filter(regex='^' + column_to_drop_3).columns, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']

print(X_4.shape)

(10564, 194)


In [75]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [76]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [77]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 108, 'learning_rate': 0.09999999999999999, 'max_depth': 10, 'max_leaves': 468, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 6, 'reg_alpha': 10, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8492949191634198


In [78]:
xgb_optuna_4 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_4.fit(X_train, y_train)

In [79]:
xgb_optuna_4_proba = xgb_optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, xgb_optuna_4_proba)
print(decimal.Decimal(auc_4).quantize(decimal.Decimal('1.000')))

0.852


In [80]:
X_train = X_train.values
y_train = y_train.values

In [81]:
auc_bootstrap = []

In [82]:
rs = RandomState(seed = 4)
bootstrap_auc(xgb_optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84827272, 0.8538117 ])

In [83]:
np.mean(auc_bootstrap)

0.8510601905632641

In [84]:
t_4 = auc_bootstrap
print(t_4)

[0.8508013337635797, 0.8522699974902298, 0.8515125847047436, 0.8547676669893514, 0.8505933813775053, 0.8501980925746657, 0.8495572048331004, 0.8500699150263527, 0.8502599404825929, 0.8502187085439749, 0.8513705137858092, 0.8513005987594564, 0.8510738230970564, 0.8522592413323296, 0.8510572406869599, 0.8513176293427989, 0.8503787063927432, 0.8525348678785271, 0.85081208992148, 0.8521570578322757, 0.8526106091570758, 0.852146749847621, 0.8517994155820874, 0.8517720770140905, 0.8531685848481589, 0.8499905883618372, 0.8551813308952709, 0.849706894697214, 0.8521700548564053, 0.8518070345272668, 0.8526132981965509, 0.8523968305188052, 0.8487361514467033, 0.8498673407192284, 0.8520373955756337, 0.850035853859668, 0.8502796601054103, 0.8504293499695242, 0.8502447025922341, 0.851015560575096, 0.851991681904557, 0.8506471621670073, 0.852612850023305, 0.8501219031228713, 0.8494621921049801, 0.8505709727152128, 0.8504916460506974, 0.8506789824674627, 0.8507986447241046, 0.8507695134631242, 0.85205

In [85]:
## 5.
column_to_drop_4 = '소득 중 재산소득의 비중(월평균)'

In [86]:
if not column_to_drop_4.startswith('Cat_'):
    comp_5 = comp_4.drop(column_to_drop_4, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']
else:
    comp_5 = comp_4.drop(comp_4.filter(regex='^' + column_to_drop_4).columns, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']

print(X_5.shape)

(10564, 193)


In [87]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [88]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [89]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'learning_rate': 0.09999999999999999, 'max_depth': 9, 'max_leaves': 692, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.6, 'gamma': 7, 'reg_alpha': 3, 'reg_lambda': 9, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8496839871915446


In [90]:
xgb_optuna_5 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_5.fit(X_train, y_train)

In [91]:
xgb_optuna_5_proba = xgb_optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, xgb_optuna_5_proba)
print(decimal.Decimal(auc_5).quantize(decimal.Decimal('1.000')))

0.852


In [92]:
X_train = X_train.values
y_train = y_train.values

In [93]:
auc_bootstrap = []

In [94]:
rs = RandomState(seed = 5)
bootstrap_auc(xgb_optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.8478362 , 0.85387807])

In [95]:
np.mean(auc_bootstrap)

0.8509639977949877

In [96]:
t_5 = auc_bootstrap
print(t_5)

[0.8507390376824067, 0.8521790183213224, 0.852792119321645, 0.8535002330500878, 0.8508981391846832, 0.8532250546771359, 0.8495392779032662, 0.8495733390699509, 0.8531219748305906, 0.8507860958732206, 0.8504705819081424, 0.850737693162669, 0.8507264888315227, 0.8491108242802338, 0.850719766232835, 0.8538291922125417, 0.855014162274569, 0.8515461976981822, 0.85207235308881, 0.8507081137284428, 0.8509680542110358, 0.8509985299917536, 0.8544485676383062, 0.852404001290739, 0.8515807070381127, 0.8518115162597254, 0.8496226381269942, 0.8541662184934208, 0.8506722598687748, 0.849313398587358, 0.8472553870424151, 0.8521458535011294, 0.8492658922232978, 0.8492587214513643, 0.8532232619841527, 0.8518894984045033, 0.8515314079810692, 0.8505575275178372, 0.8505978631099638, 0.8512163421892367, 0.8515659173209996, 0.8494837044207809, 0.8518531963715894, 0.8501689613136854, 0.8516985766017713, 0.8509801548886737, 0.8502222939299415, 0.8497176508551146, 0.8476937004768564, 0.8515058621060557, 0.85145

In [97]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [98]:
## 6.
column_to_drop_5 = '자산 중 부동산 자산의 비중'

In [99]:
if not column_to_drop_5.startswith('Cat_'):
    comp_6 = comp_5.drop(column_to_drop_5, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']
else:
    comp_6 = comp_5.drop(comp_5.filter(regex='^' + column_to_drop_5).columns, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']

print(X_6.shape)

(10564, 192)


In [100]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [101]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [102]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8476490774329348


In [103]:
xgb_optuna_6 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_6.fit(X_train, y_train)

In [104]:
xgb_optuna_proba_6 = xgb_optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, xgb_optuna_proba_6)
print(decimal.Decimal(auc_6).quantize(decimal.Decimal('1.000')))

0.853


In [105]:
X_train = X_train.values
y_train = y_train.values

In [106]:
auc_bootstrap = []

In [107]:
rs = RandomState(seed = 6)
bootstrap_auc(xgb_optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84739767, 0.85356657])

In [108]:
np.mean(auc_bootstrap)

0.850584813873651

In [109]:
t_6 = auc_bootstrap
print(t_6)

[0.8482808074289196, 0.8499014018859129, 0.8506449213007781, 0.8518509555053602, 0.8512055860313363, 0.8516761679394788, 0.8500591588684522, 0.8532026460148434, 0.8492739593417231, 0.8501658241009644, 0.8517182962245887, 0.8523426015560575, 0.8502859345308522, 0.8508497364741314, 0.8507305223907353, 0.8479563299989243, 0.8501738912193898, 0.84932505109175, 0.8508398766627226, 0.8508891757197663, 0.8509994263382452, 0.850053780789502, 0.8506552292854326, 0.85031103223262, 0.8511464271628841, 0.8505888996450468, 0.8504257645835576, 0.851594600408734, 0.850823742425872, 0.8511643540927181, 0.8476533648847299, 0.8477187981786241, 0.8526603563873651, 0.8511150550356748, 0.8508658707109822, 0.8531578286902586, 0.8512701229787388, 0.8519495536194471, 0.8499412893047936, 0.850739934028898, 0.8494406797891793, 0.8512351654655623, 0.8498852676490625, 0.8497086873901976, 0.8518330285755262, 0.8500631924276648, 0.8518240651106093, 0.8506704671757914, 0.8500439209780932, 0.8527015883259833, 0.85021

In [110]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [111]:
## 7 . 
column_to_drop_6 = 'Cat_가구주 주민등록상 등재 여부'

In [112]:
if not column_to_drop_6.startswith('Cat_'):
    comp_7 = comp_6.drop(column_to_drop_6, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']
else:
    comp_7 = comp_6.drop(comp_6.filter(regex='^' + column_to_drop_6).columns, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']

print(X_7.shape)

(10564, 190)


In [113]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [114]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [115]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.848116518876797


In [116]:
xgb_optuna_7 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_7.fit(X_train, y_train)

In [117]:
xgb_optuna_proba_7 = xgb_optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, xgb_optuna_proba_7)
print(decimal.Decimal(auc_7).quantize(decimal.Decimal('1.000')))

0.853


In [118]:
X_train = X_train.values
y_train = y_train.values

In [119]:
auc_bootstrap = []

In [120]:
rs = RandomState(seed = 7)
bootstrap_auc(xgb_optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84728759, 0.85328258])

In [121]:
np.mean(auc_bootstrap)

0.8503357431608763

In [122]:
t_7 = auc_bootstrap
print(t_7)

[0.8507457602810942, 0.8513364526191245, 0.8508327058907891, 0.8495652719515256, 0.8508120899214802, 0.8506785342942167, 0.8495114911620236, 0.848533577139579, 0.8504176974651321, 0.8517344304614393, 0.8512746047111971, 0.8543553476031696, 0.8508712487899323, 0.8513292818471908, 0.8488948047757341, 0.8519522426589221, 0.8501174213904127, 0.8506579183249077, 0.8497122727761646, 0.8516134236850595, 0.8556684952135098, 0.8537054963966872, 0.8500322684737012, 0.8497839804955003, 0.8475717973539851, 0.8510451400093221, 0.8486859560431681, 0.8496369796708616, 0.8498538955218529, 0.8485264063676455, 0.8477806460865513, 0.8502572514431179, 0.8497409558638989, 0.8541464988706033, 0.8494209601663619, 0.8517254669965222, 0.8519092180273207, 0.8517451866193395, 0.8498772005306371, 0.8482691549245277, 0.8516663081280701, 0.8508873830267828, 0.8501936108422071, 0.8519970599835072, 0.8467157864544118, 0.8516815460184288, 0.8520436700010756, 0.8484564913412929, 0.8510119751891292, 0.8500188232763257, 

In [123]:
np.mean(t_7)

0.8503357431608763

In [124]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [125]:
## 8 
column_to_drop_7 = '부채 중 금융기관 대출금의 비중'

In [126]:
if not column_to_drop_7.startswith('Cat_'):
    comp_8 = comp_7.drop(column_to_drop_7, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']
else:
    comp_8 = comp_7.drop(comp_7.filter(regex='^' + column_to_drop_7).columns, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']

print(X_8.shape)

(10564, 189)


In [127]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [128]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [129]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 182, 'learning_rate': 0.03, 'max_depth': 5, 'max_leaves': 776, 'subsample': 0.5, 'colsample_bytree': 0.5, 'gamma': 3, 'reg_alpha': 2, 'reg_lambda': 9, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8503347664472212


In [130]:
xgb_optuna_8 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_8.fit(X_train, y_train)

In [131]:
xgb_optuna_proba_8 = xgb_optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, xgb_optuna_proba_8)
print(decimal.Decimal(auc_8).quantize(decimal.Decimal('1.000')))

0.856


In [132]:
X_train = X_train.values
y_train = y_train.values

In [133]:
auc_bootstrap = []

In [134]:
rs = RandomState(seed = 8)
bootstrap_auc(xgb_optuna_8, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84937603, 0.85530225])

In [135]:
np.mean(auc_bootstrap)

0.8524791769746513

In [136]:
t_8 = auc_bootstrap
print(t_8)

[0.8491556416048188, 0.8504105266931986, 0.8542952923882257, 0.8477492739593417, 0.8514951059481553, 0.8532886952780467, 0.8526944175540496, 0.8508972428381916, 0.8523430497293034, 0.8534500376465528, 0.8540532788354666, 0.8506803269872001, 0.8534814097737622, 0.8557536481302213, 0.8538677351116848, 0.8529265712954, 0.8520167796063247, 0.854147395217095, 0.8547165752393244, 0.8504132157326736, 0.8538238141335914, 0.8529498763041842, 0.8519522426589223, 0.8554641282134021, 0.8533971532035423, 0.8516958875622962, 0.8509429565092683, 0.8551046932702306, 0.8500609515614356, 0.8514951059481555, 0.8525967157864545, 0.8521745365888639, 0.8532411889139865, 0.8534719981355994, 0.8548465454806209, 0.8521449571546378, 0.8527795704707613, 0.8514574593955039, 0.8526213653149762, 0.8502913126098025, 0.8533586103043993, 0.8518428883869349, 0.8530448890323044, 0.851424294575311, 0.8525115628697431, 0.8542540604496074, 0.8514897278692051, 0.852108206948478, 0.8533308235631565, 0.8503155139650783, 0.852

In [137]:
np.mean(t_8)

0.8524791769746513

In [138]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [139]:
#9.
column_to_drop_8 = 'Cat_이사 예상 기간'

In [140]:
if not column_to_drop_8.startswith('Cat_'):
    comp_9 = comp_8.drop(column_to_drop_8, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']
else:
    comp_9 = comp_8.drop(comp_8.filter(regex='^' + column_to_drop_8).columns, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']

print(X_9.shape)

(10564, 185)


In [141]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [142]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [143]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 108, 'learning_rate': 0.09999999999999999, 'max_depth': 10, 'max_leaves': 468, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 6, 'reg_alpha': 10, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8487868914864078


In [144]:
xgb_optuna_9 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_9.fit(X_train, y_train)

In [145]:
xgb_optuna_proba_9 = xgb_optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, xgb_optuna_proba_9)
print(decimal.Decimal(auc_9).quantize(decimal.Decimal('1.000')))

0.852


In [146]:
X_train = X_train.values
y_train = y_train.values

In [147]:
auc_bootstrap = []

In [148]:
rs = RandomState(seed = 9)
bootstrap_auc(xgb_optuna_9, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84776106, 0.85309185])

In [149]:
np.mean(auc_bootstrap)

0.8504468789215159

In [150]:
t_9 = auc_bootstrap
print(t_9)

[0.8504176974651321, 0.849522247319924, 0.850600103976193, 0.8524855688214836, 0.8501487935176222, 0.8530628159621383, 0.8512589186475925, 0.8490561471442402, 0.8492721666487397, 0.8498225233946434, 0.8503773618730055, 0.8482337492381056, 0.8505678355024919, 0.8509918073930658, 0.8499368075723351, 0.8517326377684558, 0.8496974830590512, 0.8501402782259512, 0.8476511240185007, 0.8493514933132552, 0.8499896920153456, 0.8511845218887812, 0.8508618371517694, 0.8493900362123982, 0.8494043777562656, 0.849701516618264, 0.8523278118389443, 0.8494474023878671, 0.8499816248969202, 0.8509250295794341, 0.8481840020078161, 0.850468789215159, 0.8488540210103618, 0.8478281524506113, 0.8495361406905454, 0.8487509411638163, 0.8508300168513141, 0.8488437130257073, 0.8491054462012837, 0.85004212828511, 0.8537583808396974, 0.851173765730881, 0.851971065935248, 0.8502635258685598, 0.8495244881861531, 0.8492587214513643, 0.8495052167365817, 0.8515937040622423, 0.8507390376824064, 0.8517855222114661, 0.85064

In [151]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [152]:
# 10.Cat_이사 예상 기간
column_to_drop_9 = 'Cat_현재 주택의 위치'

In [153]:
if not column_to_drop_9.startswith('Cat_'):
    comp_10 = comp_9.drop(column_to_drop_9, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']
else:
    comp_10 = comp_9.drop(comp_9.filter(regex='^' + column_to_drop_9).columns, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']

print(X_10.shape)

(10564, 181)


In [154]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [155]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [156]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8492221438488065


In [157]:
xgb_optuna_10 = XGBClassifier(**study.best_trial.params, random_state=0)
xgb_optuna_10.fit(X_train, y_train)

In [158]:
xgb_optuna_proba_10 = xgb_optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, xgb_optuna_proba_10)
print(decimal.Decimal(auc_10).quantize(decimal.Decimal('1.000')))

0.854


In [159]:
X_train = X_train.values
y_train = y_train.values

In [160]:
auc_bootstrap = []

In [161]:
rs = RandomState(seed = 10)
bootstrap_auc(xgb_optuna_10, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84745507, 0.85364788])

In [162]:
np.mean(auc_bootstrap)

0.8506877771951525

In [163]:
t_10 = auc_bootstrap
print(t_10)

[0.8501747875658815, 0.8511993116058943, 0.8499498045964647, 0.8497337850919651, 0.8489862321178876, 0.8493142949338496, 0.8494361980567209, 0.850017926929834, 0.8510720304040731, 0.847297515327525, 0.8516358323473523, 0.8464155103796923, 0.8498745114911621, 0.8502904162633107, 0.8506650890968414, 0.8505718690617047, 0.8484555949948012, 0.8488517801441324, 0.8489611344161199, 0.8495894733068016, 0.8508371876232477, 0.8520714567423183, 0.850253666057151, 0.8498915420745042, 0.8508896238930121, 0.851648381198236, 0.852337671650353, 0.8504266609300493, 0.8508604926320318, 0.8513373489656161, 0.852929260334875, 0.8480818185077623, 0.849656699293679, 0.8494388870961959, 0.8499659388333154, 0.8523143666415689, 0.8502438062457424, 0.8497570901007494, 0.8519549316983972, 0.8510693413645979, 0.850663296403858, 0.8513041841454231, 0.854661001756839, 0.8502948979957692, 0.8500071707719336, 0.8506256498512065, 0.8505844179125883, 0.8502348427808253, 0.8496136746620773, 0.8498395539779857, 0.849857

In [164]:
from scipy.stats import bartlett

bartlett(t_9, t_10)

BartlettResult(statistic=31.79181555992297, pvalue=1.7161438288739004e-08)

In [165]:
import pingouin as pg
ca = pg.ttest(t_9, t_10, paired=False)
print(ca)

               T   dof alternative         p-val         CI95%   cohen-d  \
T-test -5.263081  3998   two-sided  1.490979e-07  [-0.0, -0.0]  0.166433   

             BF10    power  
T-test  3.343e+04  0.99952  


In [166]:
import pingouin as pg
ca = pg.ttest(t_7, t_10, paired=False)
print(ca)

               T   dof alternative         p-val         CI95%   cohen-d  \
T-test -7.246421  3998   two-sided  5.109926e-13  [-0.0, -0.0]  0.229152   

             BF10  power  
T-test  6.915e+09    1.0  


In [167]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [168]:
# 11.
column_to_drop_10 = 'Cat_현재 주택의 구조'

In [169]:
if not column_to_drop_10.startswith('Cat_'):
    comp_11 = comp_10.drop(column_to_drop_10, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']
else:
    comp_11 = comp_10.drop(comp_10.filter(regex='^' + column_to_drop_10).columns, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']

print(X_11.shape)

(10564, 179)


In [170]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [171]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [172]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 176, 'learning_rate': 0.03, 'max_depth': 5, 'max_leaves': 992, 'subsample': 0.4, 'colsample_bytree': 0.5, 'gamma': 1, 'reg_alpha': 3, 'reg_lambda': 1, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8502437973039544


In [173]:
xgb_optuna_11 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_11.fit(X_train, y_train)

In [174]:
xgb_optuna_proba_11 = xgb_optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, xgb_optuna_proba_11)
print(decimal.Decimal(auc_11).quantize(decimal.Decimal('1.000')))

0.856


In [175]:
X_train = X_train.values
y_train = y_train.values

In [176]:
auc_bootstrap = []

In [177]:
rs = RandomState(seed = 11)
bootstrap_auc(xgb_optuna_11, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84924091, 0.85579928])

In [178]:
np.mean(auc_bootstrap)

0.8527479738087554

In [179]:
t_11 = auc_bootstrap
print(t_11)

[0.8484968269334194, 0.8530565415366964, 0.8546457638664803, 0.8497732243375999, 0.8553234018142054, 0.8516492775447277, 0.8532815245061132, 0.8482646731920691, 0.8560781255602166, 0.8548770212613388, 0.8547721487218098, 0.8549935463052598, 0.8545310315155425, 0.8523726291635294, 0.8505028503818435, 0.8549612778315585, 0.8551719192571081, 0.8509044136101251, 0.8506758452547418, 0.853437040622423, 0.8563049012226167, 0.8527132408303755, 0.8524613674662078, 0.8510756157900399, 0.851699472948263, 0.8501962998816823, 0.852559069233803, 0.8522973360582268, 0.8527069664049335, 0.8542513714101323, 0.8542262737083647, 0.8540335592126492, 0.8540192176687821, 0.8544351224409306, 0.8498673407192284, 0.8535513248001148, 0.8543105302785845, 0.8539735039977053, 0.855918575884694, 0.8510460363558137, 0.8515282707683482, 0.8514807644042881, 0.8511769029436019, 0.852523215374135, 0.8554484421497974, 0.8542092431250226, 0.8516905094833458, 0.8528243877953461, 0.853744935642322, 0.8536248252124341, 0.850

In [180]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [181]:
# 12
column_to_drop_11 = 'Cat_소득 계층'

In [182]:
if not column_to_drop_11.startswith('Cat_'):
    comp_12 = comp_11.drop(column_to_drop_11, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']
else:
    comp_12 = comp_11.drop(comp_11.filter(regex='^' + column_to_drop_11).columns, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']

print(X_12.shape)

(10564, 177)


In [183]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [184]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [185]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.09999999999999999, 'max_depth': 8, 'max_leaves': 978, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 6, 'reg_alpha': 3, 'reg_lambda': 9, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8503305678713781


In [186]:
xgb_optuna_12 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_12.fit(X_train, y_train)

In [187]:
xgb_optuna_proba_12 = xgb_optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, xgb_optuna_proba_12)
print(decimal.Decimal(auc_12).quantize(decimal.Decimal('1.000')))

0.854


In [188]:
X_train = X_train.values
y_train = y_train.values

In [189]:
auc_bootstrap = []

In [190]:
rs = RandomState(seed = 12)
bootstrap_auc(xgb_optuna_12, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84831386, 0.85509354])

In [191]:
np.mean(auc_bootstrap)

0.8518173711950092

In [192]:
t_12 = auc_bootstrap
print(t_12)

[0.8507000466100176, 0.8537108744756373, 0.8547022336954573, 0.8501792692983401, 0.8497257179735398, 0.8549514180201498, 0.8515524721236241, 0.8544539457172564, 0.8499614571008569, 0.8548752285683554, 0.8514130902441649, 0.8537350758309132, 0.8556398121257753, 0.852063389623893, 0.8544933849628912, 0.8504427951668997, 0.8492883008855904, 0.8485604675343301, 0.8514009895665269, 0.8477286579900327, 0.8513409343515829, 0.8487755906923381, 0.850636406009107, 0.8535289161378221, 0.8522534150801333, 0.8500017926929834, 0.8539510953354128, 0.8503146176185866, 0.8514951059481553, 0.8537879602739235, 0.8530211358502744, 0.8495464486751998, 0.8538273995195582, 0.8529552543831344, 0.8513853035029221, 0.8530574378831882, 0.8529202968699581, 0.8520338101896668, 0.8521835000537809, 0.8506771897744793, 0.8537610698791726, 0.8507986447241045, 0.8505687318489834, 0.8512315800795955, 0.8558110143056898, 0.8548447527876376, 0.8536113800150587, 0.8534455559140942, 0.8510549998207307, 0.8477707862751425, 0

In [193]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [194]:
# 13.
column_to_drop_12 = '총 가구원 수'

In [195]:
if not column_to_drop_12.startswith('Cat_'):
    comp_13 = comp_12.drop(column_to_drop_12, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']
else:
    comp_13 = comp_12.drop(comp_12.filter(regex='^' + column_to_drop_12).columns, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']

print(X_13.shape)

(10564, 176)


In [196]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [197]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [198]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 155, 'learning_rate': 0.04, 'max_depth': 8, 'max_leaves': 726, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.5, 'gamma': 4, 'reg_alpha': 3, 'reg_lambda': 8, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.850213007747772


In [199]:
xgb_optuna_13 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_13.fit(X_train, y_train)

In [200]:
xgb_optuna_proba_13 = xgb_optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, xgb_optuna_proba_13)
print(decimal.Decimal(auc_13).quantize(decimal.Decimal('1.000')))

0.855


In [201]:
X_train = X_train.values
y_train = y_train.values

In [202]:
auc_bootstrap = []

In [203]:
rs = RandomState(seed = 13)
bootstrap_auc(xgb_optuna_13, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84958835, 0.8559393 ])

In [204]:
np.mean(auc_bootstrap)

0.8528769038399484

In [205]:
t_13 = auc_bootstrap
print(t_13)

[0.8537727223835646, 0.8526764906242157, 0.853496647664121, 0.8510603778996809, 0.8554775734107776, 0.8522641712380337, 0.8522731347029507, 0.8540308701731743, 0.8542217919759063, 0.8553350543185974, 0.8533675737693162, 0.8539824674626223, 0.8515793625183752, 0.8550867663403965, 0.8553000968054211, 0.8563860205801154, 0.8545713671076691, 0.8530000717077193, 0.8512997024129649, 0.8532743537341795, 0.8504454842063749, 0.8545175863181671, 0.8526343623391057, 0.8526567710013983, 0.8524004159047721, 0.8525671363522284, 0.8531183894446237, 0.8537664479581226, 0.8547085081208992, 0.8513328672331576, 0.8535674590369653, 0.8545184826646589, 0.8512880499085727, 0.852392348786347, 0.8562538094725898, 0.8501532752500807, 0.8551566813667493, 0.8511966225664194, 0.8531372127209493, 0.8539080707038114, 0.8525492094223943, 0.8535719407694238, 0.8518240651106092, 0.8559436735864616, 0.8521377863827042, 0.8526182281022552, 0.8546054282743538, 0.8542791581513749, 0.8533859488723962, 0.8526325696461223, 0

In [206]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [207]:
#14.
column_to_drop_13 = 'Cat_가구주 최종 학력'

In [208]:
if not column_to_drop_13.startswith('Cat_'):
    comp_14 = comp_13.drop(column_to_drop_13, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']
else:
    comp_14 = comp_13.drop(comp_13.filter(regex='^' + column_to_drop_13).columns, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']

print(X_14.shape)

(10564, 173)


In [209]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [210]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [211]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 160, 'learning_rate': 0.05, 'max_depth': 3, 'max_leaves': 782, 'subsample': 0.6, 'colsample_bytree': 0.30000000000000004, 'gamma': 1, 'reg_alpha': 1, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8512318621523579


In [212]:
xgb_optuna_14 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_14.fit(X_train, y_train)

In [213]:
xgb_optuna_proba_14 = xgb_optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, xgb_optuna_proba_14)
print(decimal.Decimal(auc_14).quantize(decimal.Decimal('1.000')))

0.856


In [214]:
X_train = X_train.values
y_train = y_train.values

In [215]:
auc_bootstrap = []

In [216]:
rs = RandomState(seed = 14)
bootstrap_auc(xgb_optuna_14, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.85049194, 0.85593137])

In [217]:
np.mean(auc_bootstrap)

0.8532372310960524

In [218]:
t_14 = auc_bootstrap
print(t_14)

[0.8551288946255065, 0.8514108493779355, 0.8539761930371805, 0.8559087160732854, 0.8521037252160195, 0.853424939944785, 0.8543580366426446, 0.8516940948693126, 0.8547604962174179, 0.8543670001075616, 0.8526352586855975, 0.8528261804883296, 0.8532331217955613, 0.8563335843103512, 0.8527231006417841, 0.8515058621060557, 0.852833351260263, 0.8546287332831379, 0.8526504965759565, 0.852863827040981, 0.8520114015273744, 0.8540389372915994, 0.853043096339321, 0.8534365924491771, 0.8541823527302714, 0.8542881216162919, 0.8539277903266287, 0.8535123337277257, 0.8539349610985624, 0.8530404072998459, 0.8542316517873149, 0.8546574163708723, 0.8540407299845829, 0.8526442221505145, 0.8528871320497652, 0.8533102075938475, 0.8542988777741924, 0.8537377648703882, 0.8524533003477824, 0.8562475350471478, 0.8548322039367537, 0.8523305008784196, 0.8505844179125882, 0.8553735972177405, 0.8511383600444586, 0.854661001756839, 0.8552176329281848, 0.8533801226202, 0.8546323186691047, 0.8515264780753649, 0.85331

In [219]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [220]:
## 15.
column_to_drop_14 = 'Cat_가구주 성별'

In [221]:
if not column_to_drop_14.startswith('Cat_'):
    comp_15 = comp_14.drop(column_to_drop_14, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']
else:
    comp_15 = comp_14.drop(comp_14.filter(regex='^' + column_to_drop_14).columns, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']

print(X_15.shape)

(10564, 171)


In [222]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [223]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [224]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 196, 'learning_rate': 0.09, 'max_depth': 7, 'max_leaves': 978, 'subsample': 0.9, 'colsample_bytree': 0.7000000000000001, 'gamma': 6, 'reg_alpha': 4, 'reg_lambda': 8, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8495475334766447


In [225]:
xgb_optuna_15 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_15.fit(X_train, y_train)

In [226]:
xgb_optuna_proba_15 = xgb_optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, xgb_optuna_proba_15)
print(decimal.Decimal(auc_15).quantize(decimal.Decimal('1.000')))

0.854


In [227]:
X_train = X_train.values
y_train = y_train.values

In [228]:
auc_bootstrap = []

In [229]:
rs = RandomState(seed = 15)
bootstrap_auc(xgb_optuna_15, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84804782, 0.85456466])

In [230]:
np.mean(auc_bootstrap)

0.8512935893298913

In [231]:
t_15 = auc_bootstrap
print(t_15)

[0.8518859130185364, 0.852400864078018, 0.849554963966871, 0.8523824889749382, 0.8476533648847298, 0.8536624717650856, 0.8512602631673298, 0.8514637338209458, 0.8515668136674914, 0.8564393531963715, 0.8517577354702234, 0.8538507045283424, 0.8541971424473845, 0.8519208705317128, 0.8481409773762146, 0.849226452977663, 0.8527876375891865, 0.8509151697680255, 0.8497337850919652, 0.8528172170234126, 0.8484780036570938, 0.8539582661073464, 0.8511553906278011, 0.850928614965401, 0.8498915420745042, 0.8528208024093794, 0.8523215374135027, 0.8500188232763257, 0.8509044136101251, 0.8510178014413252, 0.8509465418952351, 0.8528943028216988, 0.8501183177369044, 0.8517424975798644, 0.8494765336488475, 0.8529122297515328, 0.8504705819081423, 0.8511652504392098, 0.8536799505216736, 0.8508811086013409, 0.8527087590979169, 0.8506704671757914, 0.8530888100103977, 0.8478900003585386, 0.8537570363199599, 0.8528521745365888, 0.8480011473235094, 0.8544539457172565, 0.8531945788964181, 0.8511580796672763, 0.8

In [232]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [233]:
# 16.
column_to_drop_15 = 'Cat_현재 청소/쓰레기 처리상태'

In [234]:
if not column_to_drop_15.startswith('Cat_'):
    comp_16 = comp_15.drop(column_to_drop_15, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']
else:
    comp_16 = comp_15.drop(comp_15.filter(regex='^' + column_to_drop_15).columns, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']

print(X_16.shape)

(10564, 167)


In [235]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [236]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [237]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.09999999999999999, 'max_depth': 8, 'max_leaves': 978, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 6, 'reg_alpha': 3, 'reg_lambda': 9, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8496496988221595


In [238]:
xgb_optuna_16 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_16.fit(X_train, y_train)

In [239]:
xgb_optuna_proba_16 = xgb_optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, xgb_optuna_proba_16)
print(decimal.Decimal(auc_16).quantize(decimal.Decimal('1.000')))

0.854


In [240]:
X_train = X_train.values
y_train = y_train.values

In [241]:
auc_bootstrap = []

In [242]:
rs = RandomState(seed = 16)
bootstrap_auc(xgb_optuna_16, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84817145, 0.85502266])

In [243]:
np.mean(auc_bootstrap)

0.8516628771377864

In [244]:
t_16 = auc_bootstrap
print(t_16)

[0.8546260442436628, 0.8527728478720734, 0.853541464988706, 0.8504508622853251, 0.8527661252733858, 0.8521906708257143, 0.8535333978702807, 0.8540523824889749, 0.850331648201929, 0.8533245491377148, 0.8506704671757914, 0.8504696855616506, 0.8507134918073931, 0.8519163887992541, 0.8524299953389982, 0.8537619662256641, 0.8520768348212685, 0.8514951059481554, 0.8499802803771828, 0.8503657093686136, 0.8542415115987237, 0.8528521745365889, 0.8522677566240007, 0.8517182962245886, 0.8522372808432829, 0.8521951525581729, 0.8481373919902477, 0.8527437166110933, 0.8536651608045606, 0.850665985443333, 0.8507511383600446, 0.8512002079523862, 0.8515408196192319, 0.8513902334086264, 0.8527580581549604, 0.8506839123731669, 0.8514144347639023, 0.8516430031192858, 0.8527876375891864, 0.8502931053027858, 0.8512477143164461, 0.8527132408303755, 0.8503715356208097, 0.8516456921587608, 0.8553977985730163, 0.8526836613961493, 0.8520302248037002, 0.8498341758990355, 0.849385106306694, 0.8514117457244275, 0.8

In [245]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [246]:
# 17.
column_to_drop_16 = 'Cat_현재 공공기관 접근용이성'

In [247]:
if not column_to_drop_16.startswith('Cat_'):
    comp_17 = comp_16.drop(column_to_drop_16, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']
else:
    comp_17 = comp_16.drop(comp_16.filter(regex='^' + column_to_drop_16).columns, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']

print(X_17.shape)

(10564, 163)


In [248]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [249]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [250]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 108, 'learning_rate': 0.09999999999999999, 'max_depth': 10, 'max_leaves': 468, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 6, 'reg_alpha': 10, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8488645651395046


In [251]:
xgb_optuna_17 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_17.fit(X_train, y_train)

In [252]:
xgb_optuna_proba_17 = xgb_optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, xgb_optuna_proba_17)
print(decimal.Decimal(auc_17).quantize(decimal.Decimal('1.000')))

0.852


In [253]:
X_train = X_train.values
y_train = y_train.values

In [254]:
auc_bootstrap = []

In [255]:
rs = RandomState(seed = 17)
bootstrap_auc(xgb_optuna_17, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84850008, 0.85372283])

In [256]:
np.mean(auc_bootstrap)

0.8512199190599118

In [257]:
t_17 = auc_bootstrap
print(t_17)

[0.8526137463697967, 0.8513234555949948, 0.8495554121401168, 0.8501039761930372, 0.8484627657667347, 0.8502348427808255, 0.8509958409522786, 0.8508201570399054, 0.8507000466100176, 0.8501299702412964, 0.8502823491448854, 0.8511894517944857, 0.8528091499049874, 0.8500385428991432, 0.8510146642286041, 0.851009286149654, 0.8513812699437096, 0.8509743286364778, 0.8519849593058695, 0.8526885913018536, 0.8516528629306945, 0.8517725251873365, 0.852329156358682, 0.8529888673765731, 0.8522395217095121, 0.8508336022372809, 0.8515470940446739, 0.851433258040228, 0.8514798680577964, 0.8548738840486177, 0.8516017711806676, 0.8530861209709224, 0.8519849593058694, 0.8508313613710516, 0.8527715033523358, 0.8518760532071278, 0.851158527840522, 0.848396436126349, 0.8519746513212146, 0.853365781076333, 0.8529050589795991, 0.8494653293177012, 0.8526980029400166, 0.851403678606002, 0.8537897529669068, 0.8496280162059446, 0.8518899465777492, 0.8514574593955039, 0.8521960489046646, 0.8523143666415689, 0.8513

In [258]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [259]:
## 18.
column_to_drop_17 ='Cat_가구주 장애 여부'

In [260]:
if not column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(column_to_drop_17, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']
else:
    comp_18 = comp_17.drop(comp_17.filter(regex='^' + column_to_drop_17).columns, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']

print(X_18.shape)

(10564, 161)


In [261]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [262]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [263]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 179, 'learning_rate': 0.060000000000000005, 'max_depth': 5, 'max_leaves': 790, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.6, 'gamma': 3, 'reg_alpha': 4, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8515509539164315


In [264]:
xgb_optuna_18 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_18.fit(X_train, y_train)

In [265]:
xgb_optuna_proba_18 = xgb_optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, xgb_optuna_proba_18)
print(decimal.Decimal(auc_18).quantize(decimal.Decimal('1.000')))

0.856


In [266]:
X_train = X_train.values
y_train = y_train.values

In [267]:
auc_bootstrap = []

In [268]:
rs = RandomState(seed = 18)
bootstrap_auc(xgb_optuna_18, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84829681, 0.85558556])

In [269]:
np.mean(auc_bootstrap)

0.8520512329245994

In [270]:
t_18 = auc_bootstrap
print(t_18)

[0.8499946219210498, 0.8534760316948119, 0.8521736402423722, 0.8508595962855401, 0.8524801907425335, 0.8495706500304758, 0.8517344304614392, 0.851704851027213, 0.8505019540353518, 0.8501792692983399, 0.8512943243340145, 0.8505781434871464, 0.8518016564483166, 0.8510756157900399, 0.851740704886881, 0.8521969452511564, 0.8519262486106629, 0.855276791796637, 0.8505369115485282, 0.8514951059481554, 0.8510621705926644, 0.8505933813775055, 0.8529561507296259, 0.8535988311641747, 0.8502240866229251, 0.8497033093112474, 0.8523215374135025, 0.856614140762253, 0.8504141120791653, 0.8500242013552759, 0.8548573016385214, 0.8509555053601521, 0.8489942992363126, 0.8542477860241655, 0.8501407263991969, 0.8544467749453228, 0.8531999569753683, 0.8524658491986663, 0.8504938869169265, 0.8490140188591303, 0.8514726972858628, 0.8532869025850631, 0.8519154924527625, 0.8518832239790614, 0.8504499659388334, 0.8520409809616005, 0.8504634111362088, 0.8533980495500342, 0.8542872252698002, 0.8542208956294145, 0.8

In [271]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [272]:
# 19
column_to_drop_18 = 'Cat_현재 상업시설 접근용이성'

In [273]:
if not column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(column_to_drop_18, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']
else:
    comp_19 = comp_18.drop(comp_18.filter(regex='^' + column_to_drop_18).columns, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']

print(X_19.shape)

(10564, 157)


In [274]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [275]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [276]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8487239128487617


In [277]:
xgb_optuna_19 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_19.fit(X_train, y_train)

In [278]:
xgb_optuna_proba_19 = xgb_optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, xgb_optuna_proba_19)
print(decimal.Decimal(auc_19).quantize(decimal.Decimal('1.000')))

0.853


In [279]:
X_train = X_train.values
y_train = y_train.values

In [280]:
auc_bootstrap = []

In [281]:
rs = RandomState(seed = 19)
bootstrap_auc(xgb_optuna_19, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84743286, 0.8534112 ])

In [282]:
np.mean(auc_bootstrap)

0.8505081388261446

In [283]:
t_19 = auc_bootstrap
print(t_19)

[0.8500349575131763, 0.8526352586855974, 0.8519011509088952, 0.8516313506148938, 0.8487236025958195, 0.8494792226883224, 0.8518608153167688, 0.8497570901007494, 0.8508918647592413, 0.8487567674160124, 0.8497409558638989, 0.8500439209780933, 0.8502411172062674, 0.8509662615180524, 0.8499542863289233, 0.847900756516439, 0.8535737334624073, 0.8524013122512639, 0.8494370944032125, 0.8514744899788462, 0.8525357642250189, 0.8469882757878886, 0.8510299021189631, 0.8488930120827508, 0.8522292137248575, 0.8504508622853251, 0.8519898892115736, 0.8490606288766988, 0.8482252339464343, 0.8520149869133412, 0.8498933347674877, 0.849937255745581, 0.8498261087806103, 0.8479581226919077, 0.8515125847047436, 0.849477429995339, 0.8506552292854326, 0.8507215589258184, 0.8501487935176222, 0.8499273959341723, 0.848892115736259, 0.8477053529812485, 0.8514144347639023, 0.8513131476103403, 0.8520302248037, 0.8521655731239468, 0.8492748556882148, 0.8513059768384067, 0.849322362052275, 0.8482180631745007, 0.84771

In [284]:
# 20.
column_to_drop_19 = 'Cat_현재 대기오염 정도'

In [285]:
if not column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(column_to_drop_19, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']
else:
    comp_20 = comp_19.drop(comp_19.filter(regex='^' + column_to_drop_19).columns, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']

print(X_20.shape)

(10564, 153)


In [286]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [287]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [288]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 148, 'learning_rate': 0.09, 'max_depth': 7, 'max_leaves': 892, 'subsample': 0.5, 'colsample_bytree': 0.5, 'gamma': 4, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8512654507591026


In [289]:
xgb_optuna_20 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_20.fit(X_train, y_train)

In [290]:
xgb_optuna_proba_20 = xgb_optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, xgb_optuna_proba_20)
print(decimal.Decimal(auc_20).quantize(decimal.Decimal('1.000')))

0.854


In [291]:
X_train = X_train.values
y_train = y_train.values

In [292]:
auc_bootstrap = []

In [293]:
rs = RandomState(seed = 20)
bootstrap_auc(xgb_optuna_20, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84769675, 0.85419761])

In [294]:
np.mean(auc_bootstrap)

0.8510890163941773

In [295]:
t_20 = auc_bootstrap
print(t_20)

[0.8515838442508337, 0.8474113513319709, 0.84869223046861, 0.8502070560395827, 0.8512082750708114, 0.8510083898031623, 0.8522534150801333, 0.8505539421318705, 0.8504078376537234, 0.8508129862679717, 0.8515999784876842, 0.8514623893012083, 0.8503235810835037, 0.8519280413036463, 0.8539484062959379, 0.850024201355276, 0.8536777096554443, 0.850110250618479, 0.850325373776487, 0.8522319027643325, 0.8504360725682119, 0.851478075364813, 0.8542065540855472, 0.8525895450145208, 0.8505243626976444, 0.8483507224552724, 0.8518133089527088, 0.8526729052382489, 0.8489234878634684, 0.8512037933383529, 0.8521395790756876, 0.8507233516188017, 0.8506767416012334, 0.8521718475493887, 0.8510316948119465, 0.8551844681079919, 0.8518384066544763, 0.8530807428919722, 0.846216521458535, 0.8524290989925065, 0.849665662758596, 0.8506561256319243, 0.8501322111075257, 0.8501057688860205, 0.8527302714137176, 0.8534025312824925, 0.8441504427951669, 0.8523448424222868, 0.8495840952278513, 0.8503934961098563, 0.85335

In [296]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [297]:
# 21
column_to_drop_20 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [298]:
if not column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(column_to_drop_20, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']
else:
    comp_21 = comp_20.drop(comp_20.filter(regex='^' + column_to_drop_20).columns, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']

print(X_21.shape)

(10564, 149)


In [299]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [300]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [301]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 178, 'learning_rate': 0.060000000000000005, 'max_depth': 5, 'max_leaves': 616, 'subsample': 0.8, 'colsample_bytree': 0.2, 'gamma': 3, 'reg_alpha': 7, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8501752205651842


In [302]:
xgb_optuna_21 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_21.fit(X_train, y_train)

In [303]:
xgb_optuna_proba_21 = xgb_optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, xgb_optuna_proba_21)
print(decimal.Decimal(auc_21).quantize(decimal.Decimal('1.000')))

0.855


In [304]:
X_train = X_train.values
y_train = y_train.values

In [305]:
auc_bootstrap = []

In [306]:
rs = RandomState(seed = 21)
bootstrap_auc(xgb_optuna_21, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.850318  , 0.85585861])

In [307]:
np.mean(auc_bootstrap)

0.8531366948567638

In [308]:
t_21 = auc_bootstrap
print(t_21)

[0.8501989889211574, 0.8513212147287657, 0.8515022767200888, 0.8514511849700621, 0.8527213079488007, 0.8555681044064394, 0.8525622064465239, 0.8543401097128105, 0.8541294682872611, 0.854855508945538, 0.8529265712954, 0.8536570936861353, 0.8532635975762791, 0.8523959341723135, 0.8518814312860779, 0.8543611738553656, 0.8503908070703812, 0.8528297658742963, 0.8541931088881718, 0.8517783514395325, 0.8512486106629379, 0.8520490480800258, 0.8552310781255602, 0.8524075866767056, 0.8518195833781507, 0.8538076798967409, 0.8548626797174717, 0.8517299487289807, 0.8542361335197733, 0.8532304327560862, 0.8532331217955612, 0.851782833171991, 0.8522525187336417, 0.8530359255673874, 0.8517864185579578, 0.8541222975153275, 0.8513938187945933, 0.8497723279911081, 0.8518375103079845, 0.8548071062349862, 0.8523663547380875, 0.854706715427916, 0.8533366498153525, 0.8543903051163458, 0.8512763974041806, 0.8528154243304292, 0.8522077014090568, 0.8537323867914381, 0.8520705603958265, 0.8523385679968449, 0.854

In [309]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [310]:
# 22
column_to_drop_21 = '자산 중 기타자산의 비중'

In [311]:
if not column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(column_to_drop_21, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']
else:
    comp_22 = comp_21.drop(comp_21.filter(regex='^' + column_to_drop_21).columns, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']

print(X_22.shape)

(10564, 148)


In [312]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [313]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [314]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 163, 'learning_rate': 0.04, 'max_depth': 5, 'max_leaves': 848, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.6, 'gamma': 4, 'reg_alpha': 3, 'reg_lambda': 9, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.850042265663487


In [315]:
xgb_optuna_22 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_22.fit(X_train, y_train)

In [316]:
xgb_optuna_proba_22 = xgb_optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, xgb_optuna_proba_22)
print(decimal.Decimal(auc_22).quantize(decimal.Decimal('1.000')))

0.855


In [317]:
X_train = X_train.values
y_train = y_train.values

In [318]:
auc_bootstrap = []

In [319]:
rs = RandomState(seed = 22)
bootstrap_auc(xgb_optuna_22, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84924324, 0.85541859])

In [320]:
np.mean(auc_bootstrap)

0.8523950712147287

In [321]:
t_22 = auc_bootstrap
print(t_22)

[0.8525590692338029, 0.8532716646947044, 0.8489064572801261, 0.8514686637266502, 0.8512826718296225, 0.8510738230970565, 0.8534509339930444, 0.8522937506722598, 0.8517953820228747, 0.8551055896167222, 0.8522301100713492, 0.8534755835215662, 0.8544557384102398, 0.8521843964002725, 0.85288444301029, 0.8546412821340217, 0.8542164138969561, 0.8535835932738158, 0.8531165967516403, 0.8504141120791655, 0.854043419024058, 0.8524264099530315, 0.8527006919794915, 0.8504069413072318, 0.8512602631673301, 0.8509904628733282, 0.8548832956867807, 0.8545220680506256, 0.8529194005234664, 0.8527616435409272, 0.8534446595676026, 0.850737244989423, 0.8544906959234162, 0.848874188806425, 0.8535826969273241, 0.8526236061812055, 0.8513032877989316, 0.8501712021799146, 0.8534321107167187, 0.8511724212111433, 0.8516438994657775, 0.8515202036499229, 0.855449786669535, 0.8550096805421104, 0.8537570363199598, 0.8534204582123265, 0.8506874977591339, 0.8469425621168118, 0.855333261625614, 0.8501819583378151, 0.8515

In [322]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [323]:
# 23.
column_to_drop_22 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [324]:
if not column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(column_to_drop_22, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']
else:
    comp_23 = comp_22.drop(comp_22.filter(regex='^' + column_to_drop_22).columns, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']

print(X_23.shape)

(10564, 144)


In [325]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [326]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [327]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 108, 'learning_rate': 0.09999999999999999, 'max_depth': 10, 'max_leaves': 468, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 6, 'reg_alpha': 10, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8496133111648528


In [328]:
xgb_optuna_23 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_23.fit(X_train, y_train)

In [329]:
xgb_optuna_proba_23 = xgb_optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, xgb_optuna_proba_23)
print(decimal.Decimal(auc_23).quantize(decimal.Decimal('1.000')))

0.851


In [330]:
X_train = X_train.values
y_train = y_train.values

In [331]:
auc_bootstrap = []

In [332]:
rs = RandomState(seed = 23)
bootstrap_auc(xgb_optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.8473053 , 0.85242928])

In [333]:
np.mean(auc_bootstrap)

0.8499725388566204

In [334]:
t_23 = auc_bootstrap
print(t_23)

[0.8502890717435734, 0.850475063640601, 0.8489629271091033, 0.8507744433688287, 0.8487576637625042, 0.850116525043921, 0.8495486895414291, 0.8485631565738052, 0.8506588146713994, 0.8507551719192572, 0.8497243734538023, 0.8486456204510415, 0.8492313828833674, 0.849650424868237, 0.8495733390699509, 0.8476166146785702, 0.8480213151195725, 0.8490736259008282, 0.8482409200100391, 0.8500515399232726, 0.8495231436664157, 0.8504454842063748, 0.8494895306729768, 0.8516456921587608, 0.8512230647879244, 0.8522000824638774, 0.8500492990570435, 0.8497243734538024, 0.8490476318525688, 0.8474965042486824, 0.8494783263418306, 0.849342978021584, 0.8499341185328599, 0.8513001505862106, 0.8503231329102577, 0.8515076547990391, 0.852003334408949, 0.8480674769638952, 0.8502635258685598, 0.8515363378867735, 0.849790703094188, 0.8478147072532358, 0.8494532286400631, 0.8477022157685276, 0.8485506077229215, 0.8509537126671686, 0.8494128930479365, 0.8505727654081962, 0.8516981284285252, 0.8483740274640565, 0.850

In [335]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [336]:
# 24
column_to_drop_23 = 'Cat_현재 주차시설 이용편의성'

In [337]:
if not column_to_drop_23.startswith('Cat_'):
    comp_24 = comp_23.drop(column_to_drop_23, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']
else:
    comp_24 = comp_23.drop(comp_23.filter(regex='^' + column_to_drop_23).columns, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']

print(X_24.shape)

(10564, 140)


In [338]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [339]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [340]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 168, 'learning_rate': 0.04, 'max_depth': 8, 'max_leaves': 1014, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.6, 'gamma': 4, 'reg_alpha': 3, 'reg_lambda': 9, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8502214048994582


In [341]:
xgb_optuna_24 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_24.fit(X_train, y_train)

In [342]:
xgb_optuna_proba_24 = xgb_optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, xgb_optuna_proba_24)
print(decimal.Decimal(auc_24).quantize(decimal.Decimal('1.000')))

0.855


In [343]:
X_train = X_train.values
y_train = y_train.values

In [344]:
auc_bootstrap = []

In [345]:
rs = RandomState(seed = 24)
bootstrap_auc(xgb_optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84911154, 0.85572808])

In [346]:
np.mean(auc_bootstrap)

0.8524762510756159

In [347]:
t_24 = auc_bootstrap
print(t_24)

[0.8534751353483202, 0.8535818005808326, 0.8533128966333227, 0.8515901186762754, 0.8527159298698506, 0.8521440608081461, 0.8500717077193359, 0.8539035889713527, 0.8547739414147933, 0.8529149187910079, 0.8513983005270518, 0.8517962783693664, 0.8525205263346599, 0.8533899824316089, 0.8535289161378221, 0.8520239503782584, 0.8547174715858163, 0.8504634111362088, 0.8546188734717293, 0.8526451184970061, 0.8539501989889211, 0.8515910150227672, 0.8498861639955541, 0.8530664013481052, 0.8525931304004877, 0.8557267577354701, 0.8498727187981787, 0.8534840988132373, 0.8514189164963609, 0.8549415582087414, 0.8543293535549102, 0.8534625864974364, 0.8520284321107168, 0.8560790219067081, 0.8551306873184898, 0.8519172851457459, 0.8539403391775126, 0.853496647664121, 0.8504078376537234, 0.8541214011688358, 0.8522982324047184, 0.8534392814886522, 0.8539672295722635, 0.8547291240902082, 0.8509250295794342, 0.8558809293320426, 0.8561121867269011, 0.8537610698791724, 0.8517550464307482, 0.8516860277508875, 

In [348]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [349]:
column_to_drop_24 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [350]:
if not column_to_drop_24.startswith('Cat_'):
    comp_25 = comp_24.drop(column_to_drop_24, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']
else:
    comp_25 = comp_24.drop(comp_24.filter(regex='^' + column_to_drop_24).columns, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']

print(X_25.shape)

(10564, 139)


In [351]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [352]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [353]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 108, 'learning_rate': 0.09999999999999999, 'max_depth': 10, 'max_leaves': 468, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 6, 'reg_alpha': 10, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8503571588517176


In [354]:
xgb_optuna_25 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_25.fit(X_train, y_train)

In [355]:
xgb_optuna_proba_25 = xgb_optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, xgb_optuna_proba_25)
print(decimal.Decimal(auc_25).quantize(decimal.Decimal('1.000')))

0.853


In [356]:
X_train = X_train.values
y_train = y_train.values

In [357]:
auc_bootstrap = []

In [358]:
rs = RandomState(seed = 25)
bootstrap_auc(xgb_optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84896981, 0.85408671])

In [359]:
np.mean(auc_bootstrap)

0.8514843412749633

In [360]:
t_25 = auc_bootstrap
print(t_25)

[0.8513194220357823, 0.8507892330859418, 0.8512450252769711, 0.8514565630490122, 0.8506695708292998, 0.8549173568534653, 0.8504172492918862, 0.8518805349395862, 0.8503943924563478, 0.8521225484923451, 0.8520804202072353, 0.8498861639955542, 0.8506256498512064, 0.8519966118102613, 0.8508734896561615, 0.8516291097486646, 0.850322684737012, 0.8513055286651607, 0.8529525653436593, 0.8523206410670108, 0.8541482915635868, 0.8487509411638163, 0.8532138503459898, 0.8511056433975117, 0.8509434046825142, 0.8504696855616508, 0.852547416729411, 0.8511827291957978, 0.8515309598078232, 0.8501254885088381, 0.852320192893765, 0.8515869814635546, 0.8510316948119465, 0.8532492560324119, 0.8499063317916173, 0.8509039654368793, 0.8538408447169338, 0.8517725251873363, 0.8521973934244023, 0.8502460471119717, 0.8522184575669571, 0.8519141479330251, 0.8549469362876914, 0.851346312430533, 0.8521467498476212, 0.8518142052992004, 0.8532210211179234, 0.8524102757161809, 0.8516416585995482, 0.8517483238320606, 0.8

In [361]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [362]:
# 26
column_to_drop_25 = 'Cat_현재 문화시설 접근용이성'

In [363]:
if not column_to_drop_25.startswith('Cat_'):
    comp_26 = comp_25.drop(column_to_drop_25, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']
else:
    comp_26 = comp_25.drop(comp_25.filter(regex='^' + column_to_drop_25).columns, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']

print(X_26.shape)

(10564, 135)


In [364]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [365]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [366]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 167, 'learning_rate': 0.09, 'max_depth': 8, 'max_leaves': 1024, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.6, 'gamma': 5, 'reg_alpha': 3, 'reg_lambda': 8, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8495216422589456


In [367]:
xgb_optuna_26 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_26.fit(X_train, y_train)

In [368]:
xgb_optuna_proba_26 = xgb_optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, xgb_optuna_proba_26)
print(decimal.Decimal(auc_26).quantize(decimal.Decimal('1.000')))

0.854


In [369]:
X_train = X_train.values
y_train = y_train.values

In [370]:
auc_bootstrap = []

In [371]:
rs = RandomState(seed = 26)
bootstrap_auc(xgb_optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.8478273 , 0.85482387])

In [372]:
np.mean(auc_bootstrap)

0.8513613399035531

In [373]:
t_26 = auc_bootstrap
print(t_26)

[0.8495867842673263, 0.8524685382381414, 0.8501783729518483, 0.8493725574558101, 0.8545198271843963, 0.850964468825069, 0.8519688250690187, 0.8528808576243232, 0.8527042773654584, 0.8510352801979133, 0.8514538740095371, 0.8534634828439283, 0.8495504822344125, 0.8495195582804489, 0.8482879782008533, 0.8526639417733319, 0.8507099064214263, 0.8495356925172994, 0.8556147144240078, 0.848297838012262, 0.8510935427198738, 0.851794485676383, 0.8530224803700118, 0.8514816607507798, 0.84966521458535, 0.8501111469649708, 0.8538838693485353, 0.8504185938116239, 0.8523797999354631, 0.8494478505611128, 0.8532438779534617, 0.8546986483094905, 0.8504678928686673, 0.852690832168083, 0.8517254669965222, 0.8528270768348214, 0.8512499551826754, 0.8532107131332688, 0.8507609981714532, 0.8529059553260909, 0.8519396938080384, 0.8530565415366964, 0.8514888315227135, 0.8481589043060485, 0.8476461941127964, 0.850800437417088, 0.853900003585386, 0.8513866480226597, 0.8511177440751496, 0.8516761679394786, 0.85237

In [374]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [375]:
# 27
column_to_drop_26 = 'Cat_현재 대중교통 접근용이성'

In [376]:
if not column_to_drop_26.startswith('Cat_'):
    comp_27 = comp_26.drop(column_to_drop_26, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']
else:
    comp_27 = comp_26.drop(comp_26.filter(regex='^' + column_to_drop_26).columns, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']

print(X_27.shape)

(10564, 131)


In [377]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [378]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [379]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 88, 'learning_rate': 0.09999999999999999, 'max_depth': 8, 'max_leaves': 62, 'subsample': 0.9, 'colsample_bytree': 0.2, 'gamma': 3, 'reg_alpha': 2, 'reg_lambda': 2, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8517035021720631


In [380]:
xgb_optuna_27 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_27.fit(X_train, y_train)

In [381]:
xgb_optuna_proba_27 = xgb_optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, xgb_optuna_proba_27)
print(decimal.Decimal(auc_27).quantize(decimal.Decimal('1.000')))

0.857


In [382]:
X_train = X_train.values
y_train = y_train.values

In [383]:
auc_bootstrap = []

In [384]:
rs = RandomState(seed = 27)
bootstrap_auc(xgb_optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84857389, 0.85663787])

In [385]:
np.mean(auc_bootstrap)

0.8528516602577892

In [386]:
t_27 = auc_bootstrap
print(t_27)

[0.8498592736008033, 0.8498072855042844, 0.8547811121867269, 0.8551441325158654, 0.8532331217955613, 0.8503011724212111, 0.852263274891542, 0.8546592090638556, 0.85272354881503, 0.8529185041769747, 0.8552651392922448, 0.8556371230863002, 0.8499901401885912, 0.848503997705353, 0.8532680793087377, 0.8589562941450647, 0.851738015847406, 0.8489306586354021, 0.8512557814348716, 0.8538247104800831, 0.8559078197267936, 0.8525743071241619, 0.852834247606755, 0.8484376680649672, 0.8554444085905848, 0.8561014305690008, 0.8511930371804526, 0.8523896597468719, 0.8514789717113047, 0.8534383851421606, 0.8529812484313937, 0.8541886271557134, 0.8564510057007637, 0.8502312573948585, 0.8560870890251336, 0.8516815460184289, 0.8507332114302104, 0.8551728156035998, 0.8518554372378186, 0.8544987630418414, 0.8539089670503028, 0.8538659424187014, 0.8502393245132838, 0.8545247570901008, 0.8514592520884874, 0.8495105948155319, 0.8514431178516368, 0.8562712882291779, 0.8532797318131298, 0.8538766985766019, 0.853

In [387]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [388]:
# 28
column_to_drop_27  = '자산 중 금융자산의 비중'

In [389]:
if not column_to_drop_27.startswith('Cat_'):
    comp_28 = comp_27.drop(column_to_drop_27, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']
else:
    comp_28 = comp_27.drop(comp_27.filter(regex='^' + column_to_drop_27).columns, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']

print(X_28.shape)

(10564, 130)


In [390]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [391]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)  

In [392]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 176, 'learning_rate': 0.04, 'max_depth': 3, 'max_leaves': 228, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.5, 'gamma': 3, 'reg_alpha': 3, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8502354001522683


In [393]:
xgb_optuna_28 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_28.fit(X_train, y_train)

In [394]:
xgb_optuna_proba_28 = xgb_optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, xgb_optuna_proba_28)
print(decimal.Decimal(auc_28).quantize(decimal.Decimal('1.000')))

0.855


In [395]:
X_train = X_train.values
y_train = y_train.values

In [396]:
auc_bootstrap = []

In [397]:
rs = RandomState(seed = 28)
bootstrap_auc(xgb_optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84998994, 0.85517891])

In [398]:
np.mean(auc_bootstrap)

0.8526053204886881

In [399]:
t_28 = auc_bootstrap
print(t_28)

[0.8515677100139831, 0.8497266143200315, 0.8535244344053639, 0.853613172708042, 0.8528405220321968, 0.851597737621455, 0.851657344663153, 0.8530879136639059, 0.8511777992900935, 0.8512253056541537, 0.8542070022587931, 0.8519791330536732, 0.8499991036535084, 0.8512486106629378, 0.8525061847907929, 0.850071707719336, 0.8550518088272203, 0.8521091032949698, 0.8538982108924026, 0.8516474848517442, 0.8550580832526622, 0.8507789251012872, 0.8513929224481016, 0.8539233085941702, 0.854613495392779, 0.8530135169050947, 0.8500250977017676, 0.8519235595711878, 0.851971065935248, 0.8528405220321968, 0.851110573303216, 0.8526137463697967, 0.8515798106916209, 0.8545122082392169, 0.8512181348822201, 0.8530744684665306, 0.8510343838514216, 0.8525644473127533, 0.852511562869743, 0.8544835251514826, 0.8517344304614392, 0.8522820981678679, 0.856241260621706, 0.8533281345236814, 0.8529131260980245, 0.8537234233265211, 0.8531031515542649, 0.8514565630490122, 0.8520266394177333, 0.8520795238607436, 0.851560

In [400]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [401]:
# 29
column_to_drop_28 = 'Cat_현재 교육환경'

In [402]:
if not column_to_drop_28.startswith('Cat_'):
    comp_29 = comp_28.drop(column_to_drop_28, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']
else:
    comp_29 = comp_28.drop(comp_28.filter(regex='^' + column_to_drop_28).columns, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']

print(X_29.shape)

(10564, 126)


In [403]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [404]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [405]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 83, 'learning_rate': 0.08, 'max_depth': 8, 'max_leaves': 774, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.30000000000000004, 'gamma': 4, 'reg_alpha': 9, 'reg_lambda': 1, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8502542937435622


In [406]:
xgb_optuna_29 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_29.fit(X_train, y_train)

In [407]:
xgb_optuna_proba_29 = xgb_optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, xgb_optuna_proba_29)
print(decimal.Decimal(auc_29).quantize(decimal.Decimal('1.000')))

0.852


In [408]:
X_train = X_train.values
y_train = y_train.values

In [409]:
auc_bootstrap = []

In [410]:
rs = RandomState(seed = 29)
bootstrap_auc(xgb_optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84838376, 0.85392869])

In [411]:
np.mean(auc_bootstrap)

0.8511826922215051

In [412]:
t_29 = auc_bootstrap
print(t_29)

[0.8481741421964075, 0.8514493922770786, 0.8504580330572586, 0.8495930586927682, 0.8525406941307232, 0.8511509088953427, 0.8512755010576889, 0.852296439711735, 0.8535495321071314, 0.8470425047506365, 0.8515475422179197, 0.8515139292244811, 0.8501425190921804, 0.8508833494675703, 0.8514955541214011, 0.8508909684127497, 0.8498915420745042, 0.8507520347065362, 0.8502133304650246, 0.8530601269226633, 0.849755297407766, 0.849934566706106, 0.8509088953425836, 0.8497440930766198, 0.8505924850310136, 0.8515273744218566, 0.8524192391810979, 0.8534088057079344, 0.8532447742999534, 0.8529382237997922, 0.8525451758631817, 0.8510334875049299, 0.8512235129611703, 0.8486877487361514, 0.8506381987020903, 0.852956150729626, 0.8499977591337708, 0.85328510989208, 0.8508416693557062, 0.8497665017389122, 0.8500775339715321, 0.8534473486070776, 0.8495119393352695, 0.8536705388835109, 0.8523815926284464, 0.8514525294897994, 0.8510845792549568, 0.8508353949302642, 0.8494577103725216, 0.848945896525761, 0.8524

In [413]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [414]:
column_to_drop_29 = 'Cat_현재 주변도로의 보행 안전'

In [415]:
if not column_to_drop_29.startswith('Cat_'):
    comp_30 = comp_29.drop(column_to_drop_29, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']
else:
    comp_30 = comp_29.drop(comp_29.filter(regex='^' + column_to_drop_29).columns, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']

print(X_30.shape)

(10564, 122)


In [416]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [417]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [418]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 108, 'learning_rate': 0.09999999999999999, 'max_depth': 10, 'max_leaves': 468, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 6, 'reg_alpha': 10, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8500744547449506


In [419]:
xgb_optuna_30 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_30.fit(X_train, y_train)

In [420]:
xgb_optuna_proba_30 = xgb_optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, xgb_optuna_proba_30)
print(decimal.Decimal(auc_30).quantize(decimal.Decimal('1.000')))

0.853


In [421]:
X_train = X_train.values
y_train = y_train.values

In [422]:
auc_bootstrap = []

In [423]:
rs = RandomState(seed = 30)
bootstrap_auc(xgb_optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84836989, 0.853746  ])

In [424]:
np.mean(auc_bootstrap)

0.8511277564895485

In [425]:
t_30 = auc_bootstrap
print(t_30)

[0.850868559750457, 0.8509330966978595, 0.8521843964002724, 0.850800437417088, 0.8526141945430425, 0.8507013911297552, 0.8502379799935463, 0.850946990068481, 0.8521615395647343, 0.8506175827327813, 0.8537288014054714, 0.8504661001756839, 0.8532214692911693, 0.8507928184719087, 0.8512902907748019, 0.8527118963106379, 0.8500900828224159, 0.8522991287512102, 0.8515578502025742, 0.8520284321107168, 0.850158205155785, 0.8515134810512351, 0.8481347029507729, 0.8519356602488257, 0.850779373274533, 0.8512302355598581, 0.8517779032662867, 0.8491498153526227, 0.8490346348284393, 0.8523932451328387, 0.8515273744218566, 0.8490301530959807, 0.8511500125488508, 0.8505328779893154, 0.8480289340647522, 0.8522319027643326, 0.8511401527374421, 0.8487132946111648, 0.8501142841776917, 0.8522717901832132, 0.8510406582768635, 0.8516313506148937, 0.8520109533541286, 0.8504100785199528, 0.8518509555053603, 0.8513651357068587, 0.8499775913377074, 0.8520929690581192, 0.8476121329461117, 0.8517389121938977, 0.84

In [426]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [427]:
# 31
column_to_drop_30 = '현재 무주택 기간(총 개월)'

In [428]:
if not column_to_drop_30.startswith('Cat_'):
    comp_31 = comp_30.drop(column_to_drop_30, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']
else:
    comp_31 = comp_30.drop(comp_30.filter(regex='^' + column_to_drop_30).columns, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']

print(X_31.shape)

(10564, 121)


In [429]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [430]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [431]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 9, 'max_leaves': 254, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 2, 'reg_alpha': 4, 'reg_lambda': 5, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8517804760625196


In [432]:
xgb_optuna_31= XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_31.fit(X_train, y_train)

In [433]:
xgb_optuna_proba_31 = xgb_optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, xgb_optuna_proba_31)
print(decimal.Decimal(auc_31).quantize(decimal.Decimal('1.000')))

0.855


In [434]:
X_train = X_train.values
y_train = y_train.values

In [435]:
auc_bootstrap = []

In [436]:
rs = RandomState(seed = 31)
bootstrap_auc(xgb_optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.8490704 , 0.85573142])

In [437]:
np.mean(auc_bootstrap)

0.8525057919669428

In [438]:
t_31 = auc_bootstrap
print(t_31)

[0.8518500591588685, 0.8521180667598867, 0.8538569789537843, 0.8532976587429637, 0.8513360044458786, 0.8524775017030584, 0.8531174930981319, 0.8493026424294576, 0.8506919794915923, 0.8547022336954574, 0.8525214226811515, 0.8533487504929907, 0.8562780108278657, 0.8517424975798644, 0.8544987630418415, 0.8536068982826002, 0.8526540819619233, 0.8521969452511563, 0.8512584704743464, 0.8523367753038615, 0.8542800544978667, 0.8510514144347638, 0.8512799827901473, 0.8534356961026854, 0.8517330859417016, 0.8519629988168227, 0.8519029436018789, 0.8518375103079846, 0.8545615072962603, 0.8533048295148972, 0.8533209637517479, 0.8511374636979671, 0.8525133555627263, 0.8512342691190706, 0.8531560359972752, 0.8521772256283389, 0.8523367753038614, 0.8521888781327309, 0.8526352586855974, 0.8546036355813702, 0.8511804883295688, 0.8499444265175147, 0.8503500233050088, 0.8523349826108781, 0.8516564483166612, 0.8542800544978668, 0.8537951310458571, 0.8540048761249148, 0.8550804919149547, 0.8542890179627838,

In [439]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [440]:
# 32
column_to_drop_31 = '소득 중 정부 보조금의 비중(월평균)'

In [441]:
if not column_to_drop_31.startswith('Cat_'):
    comp_32 = comp_31.drop(column_to_drop_31, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']
else:
    comp_32 = comp_31.drop(comp_31.filter(regex='^' + column_to_drop_31).columns, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']

print(X_32.shape)

(10564, 120)


In [442]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [443]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [444]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8472194231716601


In [445]:
xgb_optuna_32 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_32.fit(X_train, y_train)

In [446]:
xgb_optuna_proba_32 = xgb_optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, xgb_optuna_proba_32)
print(decimal.Decimal(auc_32).quantize(decimal.Decimal('1.000')))

0.852


In [447]:
X_train = X_train.values
y_train = y_train.values

In [448]:
auc_bootstrap = []

In [449]:
rs = RandomState(seed = 32)
bootstrap_auc(xgb_optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84736304, 0.85319711])

In [450]:
np.mean(auc_bootstrap)

0.8503073870155247

In [451]:
t_32 = auc_bootstrap
print(t_32)

[0.8497301997059983, 0.8497257179735398, 0.8494675701839304, 0.8481606969990321, 0.8515488867376573, 0.8503092395396364, 0.8513185256892905, 0.8512656412462801, 0.8536042092431251, 0.8499919328815747, 0.8478156035997275, 0.8509564017066438, 0.8506234089849773, 0.8483417589903552, 0.8505992076297013, 0.8514565630490123, 0.8479142017138145, 0.8482834964683947, 0.8501649277544728, 0.8474906779964864, 0.8522354881502994, 0.8499202251622388, 0.8494487469076046, 0.847939747588828, 0.8481051235165464, 0.8481992398981751, 0.8496782116094798, 0.8532627012297873, 0.8505198809651858, 0.8511670431321932, 0.8498772005306372, 0.8520194686457997, 0.8493734538023018, 0.851435947079703, 0.8484986196264028, 0.8509689505575275, 0.8507690652898785, 0.8489109390125846, 0.8498099745437596, 0.849259617797856, 0.8499515972894482, 0.8514749381520922, 0.8521736402423721, 0.8526092646373382, 0.8513516905094833, 0.8501066652325124, 0.8514350507332114, 0.8515865332903088, 0.8502975870352443, 0.8502868308773439, 0.

In [452]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [453]:
# 33.
column_to_drop_32 = '소득 중 근로/사업소득의 비중(월평균)'

In [454]:
if not column_to_drop_32.startswith('Cat_'):
    comp_33 = comp_32.drop(column_to_drop_32, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']
else:
    comp_33 = comp_32.drop(comp_32.filter(regex='^' + column_to_drop_32).columns, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']

print(X_33.shape)

(10564, 119)


In [455]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [456]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [457]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8492725267589234


In [458]:
xgb_optuna_33 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_33.fit(X_train, y_train)

In [459]:
xgb_optuna_proba_33 = xgb_optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, xgb_optuna_proba_33)
print(decimal.Decimal(auc_33).quantize(decimal.Decimal('1.000')))

0.853


In [460]:
X_train = X_train.values
y_train = y_train.values

In [461]:
auc_bootstrap = []

In [462]:
rs = RandomState(seed = 33)
bootstrap_auc(xgb_optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84711464, 0.852785  ])

In [463]:
np.mean(auc_bootstrap)

0.8501020031103224

In [464]:
t_33 = auc_bootstrap
print(t_33)

[0.8531569323437669, 0.8499291886271557, 0.8502993797282277, 0.8490023663547382, 0.850394392456348, 0.8499551826754149, 0.8508327058907892, 0.8468116955290237, 0.8480925746656628, 0.8511867627550105, 0.8500250977017676, 0.8493107095478829, 0.8513005987594564, 0.849295471657524, 0.8511365673514755, 0.8521718475493888, 0.8510245240400129, 0.8514269836147862, 0.8514458068911118, 0.8494209601663619, 0.850253666057151, 0.849573339069951, 0.8501720985264063, 0.8500466100175683, 0.8506866014126422, 0.8508873830267827, 0.8472276003011723, 0.8527437166110933, 0.8494317163242622, 0.8498287978200854, 0.8504759599870927, 0.850244702592234, 0.8508963464916999, 0.8525321788390521, 0.8477627191567173, 0.8497481266358323, 0.8471146606432183, 0.852054426158976, 0.8486312789071742, 0.8508604926320318, 0.8483847836219569, 0.8512710193252303, 0.8505382560682659, 0.8501407263991968, 0.8507928184719085, 0.8510881646409236, 0.8507004947832635, 0.8510021153777204, 0.8465607185113477, 0.8494612957584884, 0.852

In [465]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [466]:
# 34
column_to_drop_33 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [467]:
if not column_to_drop_33.startswith('Cat_'):
    comp_34 = comp_33.drop(column_to_drop_33, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']
else:
    comp_34 = comp_33.drop(comp_33.filter(regex='^' + column_to_drop_33).columns, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']

print(X_34.shape)

(10564, 115)


In [468]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [469]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [470]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 138, 'learning_rate': 0.04, 'max_depth': 7, 'max_leaves': 672, 'subsample': 0.6, 'colsample_bytree': 0.9, 'gamma': 5, 'reg_alpha': 1, 'reg_lambda': 2, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8506202696045502


In [471]:
xgb_optuna_34 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_34.fit(X_train, y_train)

In [472]:
xgb_optuna_proba_34 = xgb_optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, xgb_optuna_proba_34)
print(decimal.Decimal(auc_34).quantize(decimal.Decimal('1.000')))

0.856


In [473]:
X_train = X_train.values
y_train = y_train.values

In [474]:
auc_bootstrap = []

In [475]:
rs = RandomState(seed = 34)
bootstrap_auc(xgb_optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84830833, 0.85562869])

In [476]:
np.mean(auc_bootstrap)

0.8521610826521101

In [477]:
t_34 = auc_bootstrap
print(t_34)

[0.8544602201426983, 0.8533111039403392, 0.8504750636406009, 0.8541975906206304, 0.853642752142268, 0.8491780502671112, 0.8522883725933096, 0.8517953820228747, 0.8517460829658313, 0.8499390484385643, 0.8546251478971711, 0.8517716288408448, 0.8526926248610662, 0.8508963464916999, 0.8517900039439247, 0.8525330751855438, 0.8538856620415187, 0.854927216664874, 0.8498332795525438, 0.8517514610447815, 0.8502043670001075, 0.8522417625757414, 0.8542917070022588, 0.8534616901509448, 0.8525178372951848, 0.8512046896848446, 0.8526191244487469, 0.8507323150837188, 0.8538614606862429, 0.8544010612742461, 0.8550791473952171, 0.855279929009358, 0.8546574163708723, 0.8498592736008032, 0.8514009895665269, 0.8519405901545302, 0.8479025492094223, 0.848676992578251, 0.847918683446273, 0.8535567028790649, 0.8524559893872575, 0.8551692302176329, 0.8501685131404396, 0.8556227815424331, 0.8561695529023698, 0.8533326162561399, 0.8522668602775089, 0.8538363629844754, 0.8518204797246423, 0.853792442006382, 0.852

In [478]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [479]:
# 35
column_to_drop_34 = '소득 대비 주거관리비의 비율'

In [480]:
if not column_to_drop_34.startswith('Cat_'):
    comp_35 = comp_34.drop(column_to_drop_34, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']
else:
    comp_35 = comp_34.drop(comp_34.filter(regex='^' + column_to_drop_34).columns, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']

print(X_35.shape)

(10564, 114)


In [481]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [482]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [483]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 153, 'learning_rate': 0.04, 'max_depth': 8, 'max_leaves': 1024, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 5, 'reg_alpha': 3, 'reg_lambda': 9, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8492949191634198


In [484]:
xgb_optuna_35 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_35.fit(X_train, y_train)

In [485]:
xgb_optuna_proba_35 = xgb_optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, xgb_optuna_proba_35)
print(decimal.Decimal(auc_35).quantize(decimal.Decimal('1.000')))

0.853


In [486]:
X_train = X_train.values
y_train = y_train.values

In [487]:
auc_bootstrap = []

In [488]:
rs = RandomState(seed = 35)
bootstrap_auc(xgb_optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84906029, 0.85497942])

In [489]:
np.mean(auc_bootstrap)

0.8521157555304578

In [490]:
t_35 = auc_bootstrap
print(t_35)

[0.8523520131942204, 0.8524685382381414, 0.8526182281022553, 0.8498108708902514, 0.8540228030547488, 0.8521189631063785, 0.8517711806675989, 0.8516026675271593, 0.8517980710623498, 0.8518276504965759, 0.851773869707074, 0.8482575024201355, 0.8536077946290919, 0.8536499229142017, 0.8490785558065326, 0.8521548169660463, 0.8523403606898283, 0.8517523573912732, 0.8527401312251265, 0.8540232512279946, 0.8508291205048224, 0.8510406582768635, 0.8511258111935751, 0.8536812950414114, 0.8507305223907354, 0.8507063210354595, 0.8531434871463912, 0.8523878670538884, 0.8540927180811015, 0.8538766985766018, 0.8509295113118928, 0.8513848553296763, 0.8531094259797068, 0.852135097343229, 0.8513965078340684, 0.852854863576064, 0.854541787673443, 0.850937578430318, 0.8520391882686171, 0.8509680542110358, 0.8525456240364275, 0.8513947151410849, 0.8535414649887061, 0.8523672510845793, 0.852278512781901, 0.8507914739521709, 0.8499184324692552, 0.8509079989960919, 0.8540846509626762, 0.8498664443727368, 0.853

In [491]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [492]:
# 36
column_to_drop_35 = '중기부채부담지표'

In [493]:
if not column_to_drop_35.startswith('Cat_'):
    comp_36 = comp_35.drop(column_to_drop_35, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']
else:
    comp_36 = comp_35.drop(comp_35.filter(regex='^' + column_to_drop_35).columns, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']

print(X_36.shape)

(10564, 113)


In [494]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [495]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [496]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8483880267813158


In [497]:
xgb_optuna_36 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_36.fit(X_train, y_train)

In [498]:
xgb_optuna_proba_36 = xgb_optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, xgb_optuna_proba_36)
print(decimal.Decimal(auc_36).quantize(decimal.Decimal('1.000')))

0.852


In [499]:
X_train = X_train.values
y_train = y_train.values

In [500]:
auc_bootstrap = []

In [501]:
rs = RandomState(seed = 36)
bootstrap_auc(xgb_optuna_36, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84715381, 0.85279035])

In [502]:
np.mean(auc_bootstrap)

0.849994874914847

In [503]:
t_36 = auc_bootstrap
print(t_36)

[0.8494630884514718, 0.8503719837940555, 0.8501604460220142, 0.8493985515040694, 0.8507636872109283, 0.8515856369438171, 0.8496943458463304, 0.8504571367107671, 0.8489521709512029, 0.8511383600444588, 0.8497768097235666, 0.850322684737012, 0.8507601018249614, 0.8475153275250081, 0.847142447384461, 0.8507475529740777, 0.8475673156215267, 0.8503388189738625, 0.8483740274640564, 0.8505360152020366, 0.8520813165537271, 0.8507385895091606, 0.8495545157936252, 0.8512916352945394, 0.8533370979885984, 0.8501326592807716, 0.8506265461976982, 0.8498045964648095, 0.8483677530386146, 0.8498601699472947, 0.8489503782582195, 0.8494810153813057, 0.8496800043024633, 0.850752931053028, 0.852433580724965, 0.8507385895091606, 0.8522731347029507, 0.8506408877415654, 0.8499076763113549, 0.8488589509160661, 0.8501962998816823, 0.8505234663511526, 0.849844932056936, 0.8496333942848948, 0.8516833387114122, 0.8516734789000036, 0.8509752249829694, 0.8522139758344985, 0.8497158581621311, 0.8516806496719371, 0.85

In [504]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [505]:
# 37
column_to_drop_36 = 'Cat_현재 거주 지역'

In [506]:
if not column_to_drop_36.startswith('Cat_'):
    comp_37 = comp_36.drop(column_to_drop_36, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']
else:
    comp_37 = comp_36.drop(comp_36.filter(regex='^' + column_to_drop_36).columns, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']

print(X_37.shape)

(10564, 96)


In [507]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [508]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [509]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8481962918178154


In [510]:
xgb_optuna_37 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_37.fit(X_train, y_train)

In [511]:
xgb_optuna_proba_37 = xgb_optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, xgb_optuna_proba_37)
print(decimal.Decimal(auc_37).quantize(decimal.Decimal('1.000')))

0.851


In [512]:
X_train = X_train.values
y_train = y_train.values

In [513]:
auc_bootstrap = []

In [514]:
rs = RandomState(seed = 37)
bootstrap_auc(xgb_optuna_37, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84742827, 0.85267364])

In [515]:
np.mean(auc_bootstrap)

0.850107697599584

In [516]:
t_37 = auc_bootstrap
print(t_37)

[0.8491153060126923, 0.8504611702699796, 0.8492372091355634, 0.8497051020042308, 0.8499157434297803, 0.8498664443727366, 0.8500000000000001, 0.8491368183284931, 0.8508649743644903, 0.8522229392994155, 0.84918791007852, 0.8470277150335235, 0.8493564232189594, 0.8508963464916999, 0.8528521745365888, 0.8500636406009108, 0.8503155139650782, 0.8484296009465419, 0.8493886916926608, 0.8492856118461153, 0.8471370693055108, 0.8489987809687712, 0.849465329317701, 0.8497042056577391, 0.8487173281703775, 0.8509142734215338, 0.8541733892653545, 0.8474301746082964, 0.8488401276397404, 0.849835968592019, 0.8485869097558352, 0.8486536875694668, 0.8483076978236708, 0.8487531820300456, 0.8506597110178912, 0.8510236276935212, 0.8515578502025744, 0.8506256498512066, 0.8522578968125918, 0.8491807393065864, 0.8509958409522786, 0.8479643971173497, 0.8517541500842566, 0.8499713169122656, 0.8486617546878921, 0.8501774766053565, 0.8508981391846833, 0.8499569753683984, 0.8513812699437094, 0.852233695457316, 0.85

In [517]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [518]:
# 38
column_to_drop_37 = 'Cat_가구주 종사상 지위'

In [519]:
if not column_to_drop_37.startswith('Cat_'):
    comp_38 = comp_37.drop(column_to_drop_37, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']
else:
    comp_38 = comp_37.drop(comp_37.filter(regex='^' + column_to_drop_37).columns, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']

print(X_38.shape)

(10564, 91)


In [520]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [521]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [522]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 108, 'learning_rate': 0.09999999999999999, 'max_depth': 10, 'max_leaves': 468, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 6, 'reg_alpha': 10, 'reg_lambda': 6, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8475371154104527


In [523]:
xgb_optuna_38 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_38.fit(X_train, y_train)

In [524]:
xgb_optuna_proba_38 = xgb_optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, xgb_optuna_proba_38)
print(decimal.Decimal(auc_38).quantize(decimal.Decimal('1.000')))

0.852


In [525]:
X_train = X_train.values
y_train = y_train.values

In [526]:
auc_bootstrap = []

In [527]:
rs = RandomState(seed = 38)
bootstrap_auc(xgb_optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84810102, 0.85314694])

In [528]:
np.mean(auc_bootstrap)

0.8506816027123446

In [529]:
t_38 = auc_bootstrap
print(t_38)

[0.8510975762790866, 0.8472553870424151, 0.8512925316410311, 0.8508797640816033, 0.8498785450503745, 0.850071707719336, 0.8525232153741351, 0.851692750349575, 0.848503997705353, 0.8513360044458786, 0.8476296117026998, 0.851603563873651, 0.8495513785809042, 0.8494859452870102, 0.8490422537736186, 0.8496526657344664, 0.8492578251048727, 0.8516905094833459, 0.8501644795812269, 0.8495643756050338, 0.8485443332974794, 0.8532344663152991, 0.851740704886881, 0.848324728407013, 0.8504392097809329, 0.8495935068660141, 0.8508497364741315, 0.8520656304901222, 0.8520736976085476, 0.8505422896274784, 0.8493734538023018, 0.8500721558925819, 0.8507775805815496, 0.8499408411315478, 0.851702161987738, 0.8529718367932309, 0.8507520347065362, 0.8492658922232977, 0.8517752142268116, 0.850026890394751, 0.8518258578035925, 0.85, 0.8509922555663116, 0.8510886128141695, 0.8507430712416192, 0.850462962962963, 0.8498256606073644, 0.8499148470832885, 0.8494783263418307, 0.8489485855652361, 0.8500739485855653, 0.

In [530]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [531]:
# 38
column_to_drop_38 = '소득 대비 생활비의 비율'

In [532]:
if not column_to_drop_38.startswith('Cat_'):
    comp_39 = comp_38.drop(column_to_drop_38, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']
else:
    comp_39 = comp_38.drop(comp_38.filter(regex='^' + column_to_drop_38).columns, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']

print(X_39.shape)

(10564, 90)


In [533]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [534]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [535]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 117, 'learning_rate': 0.09999999999999999, 'max_depth': 7, 'max_leaves': 722, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'gamma': 5, 'reg_alpha': 4, 'reg_lambda': 10, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8502549935062027


In [536]:
xgb_optuna_39 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_39.fit(X_train, y_train)

In [537]:
xgb_optuna_proba_39 = xgb_optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, xgb_optuna_proba_39)
print(decimal.Decimal(auc_39).quantize(decimal.Decimal('1.000')))

0.852


In [538]:
X_train = X_train.values
y_train = y_train.values

In [539]:
auc_bootstrap = []

In [540]:
rs = RandomState(seed = 39)
bootstrap_auc(xgb_optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84791129, 0.85352678])

In [541]:
np.mean(auc_bootstrap)

0.8508327984385644

In [542]:
t_39 = auc_bootstrap
print(t_39)

[0.8509667096912983, 0.8498942311139794, 0.8492784410741816, 0.8509093435158296, 0.8512665375927719, 0.8512405435445125, 0.8514861424832384, 0.8511715248646516, 0.8487101573984439, 0.8519392456347925, 0.8513906815818723, 0.8510370728908967, 0.8516201462837474, 0.8508891757197663, 0.8509918073930659, 0.8499036427521423, 0.8526437739772685, 0.8538027499910366, 0.8506337169696319, 0.8516196981105018, 0.8523027141371768, 0.8522704456634758, 0.8524228245670648, 0.8512266501738912, 0.8520042307554407, 0.8494702592234054, 0.8512553332616256, 0.8497091355634434, 0.8525523466351153, 0.8524640565056828, 0.8525456240364274, 0.8511643540927182, 0.8494075149689864, 0.8510594815531892, 0.8528212505826251, 0.8509026209171419, 0.8521458535011294, 0.8505530457853788, 0.8504293499695241, 0.8495634792585423, 0.8525805815496038, 0.8517868667312036, 0.8513781327309885, 0.8501945071886988, 0.8519145961062706, 0.8492932307912946, 0.849549137714675, 0.8516013230074218, 0.8512674339392634, 0.8501133878311999, 

In [543]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [544]:
# 40
column_to_drop_39 = 'Cat_주택 보유 의식'

In [545]:
if not column_to_drop_39.startswith('Cat_'):
    comp_40 = comp_39.drop(column_to_drop_39, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']
else:
    comp_40 = comp_39.drop(comp_39.filter(regex='^' + column_to_drop_39).columns, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']

print(X_40.shape)

(10564, 88)


In [546]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [547]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [548]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 106, 'learning_rate': 0.09, 'max_depth': 5, 'max_leaves': 590, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'gamma': 4, 'reg_alpha': 3, 'reg_lambda': 9, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8491353732813831


In [549]:
xgb_optuna_40 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_40.fit(X_train, y_train)

In [550]:
xgb_optuna_proba_40 = xgb_optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, xgb_optuna_proba_40)
print(decimal.Decimal(auc_40).quantize(decimal.Decimal('1.000')))

0.853


In [551]:
X_train = X_train.values
y_train = y_train.values

In [552]:
auc_bootstrap = []

In [553]:
rs = RandomState(seed = 40)
bootstrap_auc(xgb_optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84775844, 0.85397431])

In [554]:
np.mean(auc_bootstrap)

0.8509958203363092

In [555]:
t_40 = auc_bootstrap
print(t_40)

[0.8531645512889463, 0.8488706034204583, 0.85092547775268, 0.8525743071241619, 0.8521682621634218, 0.8501021835000538, 0.8518034491413001, 0.8549711376429673, 0.8531112186726901, 0.8502267756624, 0.8495675128177549, 0.8522193539134488, 0.8535871786597828, 0.8471527553691156, 0.8515426123122154, 0.8516080456061096, 0.8487316697142449, 0.8529149187910078, 0.8520840055932022, 0.85184557742641, 0.8513373489656162, 0.8543033595066508, 0.8486729590190384, 0.849973109605249, 0.8506292352371732, 0.8503957369760855, 0.8495912659997849, 0.8507466566275859, 0.8525281452798394, 0.8515600910688036, 0.8512073787243197, 0.8523914524398551, 0.8552185292746763, 0.848011455308164, 0.8502007816141408, 0.8543804453049371, 0.8529427055322506, 0.8495213509734323, 0.8508448065684271, 0.8522157685274822, 0.8511365673514754, 0.8493600086049263, 0.8509523681474311, 0.8513785809042342, 0.8535764225018825, 0.8489194543042557, 0.8488069628195475, 0.8495894733068016, 0.8501685131404396, 0.8488105482055144, 0.853267

In [556]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [557]:
# 41.
column_to_drop_40 = 'Cat_이사 계획 중인 주택의 유형'

In [558]:
if not column_to_drop_40.startswith('Cat_'):
    comp_41 = comp_40.drop(column_to_drop_40, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']
else:
    comp_41 = comp_40.drop(comp_40.filter(regex='^' + column_to_drop_40).columns, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']

print(X_41.shape)

(10564, 72)


In [559]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [560]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [561]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.09999999999999999, 'max_depth': 6, 'max_leaves': 324, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.6, 'gamma': 5, 'reg_alpha': 3, 'reg_lambda': 9, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8486098515383582


In [562]:
xgb_optuna_41 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_41.fit(X_train, y_train)

In [563]:
xgb_optuna_proba_41 = xgb_optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, xgb_optuna_proba_41)
print(decimal.Decimal(auc_41).quantize(decimal.Decimal('1.000')))

0.851


In [564]:
X_train = X_train.values
y_train = y_train.values

In [565]:
auc_bootstrap = []

In [566]:
rs = RandomState(seed = 41)
bootstrap_auc(xgb_optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84546982, 0.85121277])

In [567]:
np.mean(auc_bootstrap)

0.8485021913430856

In [568]:
t_41 = auc_bootstrap
print(t_41)

[0.8493725574558102, 0.8484233265211, 0.8485524004159049, 0.8491206840916424, 0.845302696210247, 0.8478935857445054, 0.8470263705137858, 0.8494433688286543, 0.8471854720160624, 0.8482790147359364, 0.8485685346527554, 0.8472329783801227, 0.8474454124986553, 0.8478962747839806, 0.8492246602846797, 0.8488150299379729, 0.8474149367179378, 0.8513830626366929, 0.8470483310028325, 0.8479652934638414, 0.8472240149152056, 0.8483247284070131, 0.8451104298877775, 0.8470447456168657, 0.847957226345416, 0.8478971711304722, 0.8469058119106522, 0.8497364741314403, 0.8478747624681797, 0.8491888064250117, 0.8483673048653688, 0.849265444050052, 0.8495545157936253, 0.8505413932809867, 0.8469385285575992, 0.8475525259044138, 0.8522587931590836, 0.8529794557384103, 0.8494012405435446, 0.850773547022337, 0.8478657990032626, 0.8467677745509304, 0.848578394464164, 0.8493501487935178, 0.846822451686924, 0.8495186619339574, 0.8457145674231832, 0.8474427234591804, 0.8477618228102255, 0.8517541500842565, 0.844949

In [569]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [570]:
# 42.
column_to_drop_41 = 'Cat_현재 의료시설 접근용이성'

In [571]:
if not column_to_drop_41.startswith('Cat_'):
    comp_42 = comp_41.drop(column_to_drop_41, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']
else:
    comp_42 = comp_41.drop(comp_41.filter(regex='^' + column_to_drop_41).columns, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']

print(X_42.shape)

(10564, 68)


In [572]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [573]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [574]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 160, 'learning_rate': 0.09999999999999999, 'max_depth': 9, 'max_leaves': 400, 'subsample': 0.9, 'colsample_bytree': 0.1, 'gamma': 5, 'reg_alpha': 2, 'reg_lambda': 10, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8510499238658247


In [575]:
xgb_optuna_42 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_42.fit(X_train, y_train)

In [576]:
xgb_optuna_proba_42 = xgb_optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, xgb_optuna_proba_42)
print(decimal.Decimal(auc_42).quantize(decimal.Decimal('1.000')))

0.850


In [577]:
X_train = X_train.values
y_train = y_train.values

In [578]:
auc_bootstrap = []

In [579]:
rs = RandomState(seed = 42)
bootstrap_auc(xgb_optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84799212, 0.8524417 ])

In [580]:
np.mean(auc_bootstrap)

0.850238479930802

In [581]:
t_42 = auc_bootstrap
print(t_42)

[0.8504584812305044, 0.8506104119608476, 0.8496965867125594, 0.8505499085726579, 0.8490525617582733, 0.8497951848266465, 0.8500546771359937, 0.8498758560108997, 0.8483632713061562, 0.8513839589831844, 0.8500878419561866, 0.8506386468753361, 0.8493891398659065, 0.8510097343228998, 0.8524075866767058, 0.8493568713922054, 0.8472975153275251, 0.8513005987594565, 0.8505857624323259, 0.8488141335914811, 0.8504911978774516, 0.8474454124986556, 0.8524654010254205, 0.8472804847441828, 0.8469353913448783, 0.8523322935714029, 0.8494379907497043, 0.8508197088666596, 0.8481768312358826, 0.8499623534473486, 0.852084005593202, 0.8512418880642503, 0.8496970348858055, 0.8498933347674877, 0.8496432540963035, 0.8536754687892152, 0.851526029902119, 0.8493591122584345, 0.8504732709476175, 0.8503056541536695, 0.8499605607543652, 0.85108099386899, 0.8517967265426123, 0.8490122261661468, 0.8509133770750421, 0.8515730880929333, 0.8502236384496791, 0.8487832096375174, 0.8478640063102793, 0.8526415331110394, 0.8

In [582]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [583]:
# 43.
column_to_drop_42 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [584]:
if not column_to_drop_42.startswith('Cat_'):
    comp_43 = comp_42.drop(column_to_drop_42, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']
else:
    comp_43 = comp_42.drop(comp_42.filter(regex='^' + column_to_drop_42).columns, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']

print(X_43.shape)

(10564, 64)


In [585]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [586]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 


In [587]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 196, 'learning_rate': 0.060000000000000005, 'max_depth': 8, 'max_leaves': 732, 'subsample': 0.5, 'colsample_bytree': 0.30000000000000004, 'gamma': 3, 'reg_alpha': 4, 'reg_lambda': 5, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8483712324779435


In [588]:
xgb_optuna_43 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_43.fit(X_train, y_train)

In [589]:
xgb_optuna_proba_43 = xgb_optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, xgb_optuna_proba_43)
print(decimal.Decimal(auc_43).quantize(decimal.Decimal('1.000')))

0.852


In [590]:
X_train = X_train.values
y_train = y_train.values

In [591]:
auc_bootstrap = []

In [592]:
rs = RandomState(seed = 43)
bootstrap_auc(xgb_optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84788285, 0.85301547])

In [593]:
np.mean(auc_bootstrap)

0.85067346321394

In [594]:
t_43 = auc_bootstrap
print(t_43)

[0.8478487684199204, 0.8495231436664157, 0.8516618263956115, 0.8522812018213761, 0.8495831988813596, 0.8490050553942132, 0.8514305690007529, 0.8497517120217991, 0.8512737083647054, 0.8487890358897134, 0.8475099494460578, 0.8516528629306945, 0.8508909684127496, 0.8503683984080885, 0.8517326377684558, 0.848796206661647, 0.8500914273421534, 0.8484475278763758, 0.8497051020042308, 0.8474454124986555, 0.8524308916854899, 0.8527006919794916, 0.852003334408949, 0.8516438994657775, 0.8496217417805026, 0.8501586533290308, 0.8505046430748271, 0.8523349826108781, 0.8508434620486895, 0.8513696174393175, 0.8474256928758381, 0.8513749955182675, 0.8502464952852176, 0.8504401061274247, 0.850242013552759, 0.8517720770140906, 0.8508667670574739, 0.8510433473163387, 0.851582947904342, 0.850134451973755, 0.8512145494962532, 0.851928937650138, 0.8512427844107419, 0.853206679574056, 0.850331648201929, 0.848725395288803, 0.8511383600444589, 0.8515363378867735, 0.8514718009393711, 0.8496065038901437, 0.849870

In [595]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [596]:
## 44
column_to_drop_43 = 'Cat_기초생활보장 수급가구 여부'

In [597]:
if not column_to_drop_43.startswith('Cat_'):
    comp_44 = comp_43.drop(column_to_drop_43, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']
else:
    comp_44 = comp_43.drop(comp_43.filter(regex='^' + column_to_drop_43).columns, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']

print(X_44.shape)

(10564, 62)


In [598]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [599]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [600]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8459024698822158


In [601]:
xgb_optuna_44 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_44.fit(X_train, y_train)

In [602]:
xgb_optuna_proba_44 = xgb_optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, xgb_optuna_proba_44)
print(decimal.Decimal(auc_44).quantize(decimal.Decimal('1.000')))

0.849


In [603]:
X_train = X_train.values
y_train = y_train.values

In [604]:
auc_bootstrap = []

In [605]:
rs = RandomState(seed = 44)
bootstrap_auc(xgb_optuna_44, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84591408, 0.85112024])

In [606]:
np.mean(auc_bootstrap)

0.8486099281578288

In [607]:
t_44 = auc_bootstrap
print(t_44)

[0.8470420565773905, 0.8498821304363415, 0.8479608117313828, 0.8502007816141408, 0.847178301244129, 0.8466176365135707, 0.8465159011867629, 0.849519110107203, 0.8479652934638414, 0.8478985156502099, 0.8511809365028146, 0.8504082858269693, 0.8484363235452297, 0.8482539170341689, 0.8482476426087268, 0.8481140869814635, 0.8477725789681259, 0.8509277186189093, 0.8468533756408878, 0.847867591696246, 0.8467623964719801, 0.8480172815603599, 0.8494142375676742, 0.8478729697751963, 0.8495796134953928, 0.8483852317952029, 0.8468296224588576, 0.8494725000896346, 0.8488759814994083, 0.8481871392205371, 0.8485039977053529, 0.8483193503280628, 0.8477089383672154, 0.8486057330321609, 0.8464334373095264, 0.8486828188304472, 0.8496392205370908, 0.84720519163888, 0.8500869456096949, 0.8488141335914812, 0.8490364275214227, 0.849802803771826, 0.8486819224839554, 0.8489579972033989, 0.8495899214800473, 0.8489857839446417, 0.8501227994693628, 0.8493169839733248, 0.847665017389122, 0.8488907712165215, 0.8464

In [608]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [609]:
# 45
column_to_drop_44 = '소득 중 사적이전소득의 비중(월평균)'

In [610]:
if not column_to_drop_44.startswith('Cat_'):
    comp_45 = comp_44.drop(column_to_drop_44, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']
else:
    comp_45 = comp_44.drop(comp_44.filter(regex='^' + column_to_drop_44).columns, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']

print(X_45.shape)

(10564, 61)


In [611]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [612]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [613]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 151, 'learning_rate': 0.06999999999999999, 'max_depth': 3, 'max_leaves': 594, 'subsample': 1.0, 'colsample_bytree': 0.4, 'gamma': 7, 'reg_alpha': 7, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8484510054189618


In [614]:
xgb_optuna_45 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_45.fit(X_train, y_train)

In [615]:
xgb_optuna_proba_45 = xgb_optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, xgb_optuna_proba_45)
print(decimal.Decimal(auc_45).quantize(decimal.Decimal('1.000')))

0.850


In [616]:
X_train = X_train.values
y_train = y_train.values

In [617]:
auc_bootstrap = []

In [618]:
rs = RandomState(seed = 45)
bootstrap_auc(xgb_optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84706261, 0.85146972])

In [619]:
np.mean(auc_bootstrap)

0.8493209888494496

In [620]:
t_45 = auc_bootstrap
print(t_45)

[0.8501591015022768, 0.8492632031838228, 0.8504952314366643, 0.8488127890717436, 0.850961779785594, 0.8494671220106844, 0.8495840952278513, 0.8454537305940985, 0.8479487110537449, 0.8487908285826969, 0.8485116166505324, 0.8501295220680507, 0.8509075508228462, 0.8516640672618407, 0.8495836470546054, 0.850224086622925, 0.8507843031802373, 0.8488320605213152, 0.8496970348858055, 0.8512109641102865, 0.8494330608439997, 0.8488714997669499, 0.8494796708615683, 0.8485290954071205, 0.847405525079775, 0.8487392886594242, 0.8481822093148329, 0.8506673299630707, 0.848949033738482, 0.849984762109641, 0.8504696855616508, 0.8508645261912445, 0.8477107310601986, 0.8498198343551684, 0.8488992865081926, 0.849632946111649, 0.8504338317019827, 0.848023107812556, 0.8494052741027572, 0.8508246387723639, 0.8501532752500806, 0.8502101932523036, 0.848285737334624, 0.8501268330285755, 0.8492851636728694, 0.8491480226596394, 0.8503908070703812, 0.8460592126492419, 0.8497893585744506, 0.8511612168799971, 0.84652

In [621]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [622]:
# 46.
column_to_drop_45 = '현재 주택 거주 기간(총 개월)'

In [623]:
if not column_to_drop_45.startswith('Cat_'):
    comp_46 = comp_45.drop(column_to_drop_45, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']
else:
    comp_46 = comp_45.drop(comp_45.filter(regex='^' + column_to_drop_45).columns, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']

print(X_46.shape)

(10564, 60)


In [624]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [625]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [626]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8472879999104304


In [627]:
xgb_optuna_46 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_46.fit(X_train, y_train)

In [628]:
xgb_optuna_proba_46 = xgb_optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, xgb_optuna_proba_46)
print(decimal.Decimal(auc_46).quantize(decimal.Decimal('1.000')))

0.849


In [629]:
X_train = X_train.values
y_train = y_train.values

In [630]:
auc_bootstrap = []

In [631]:
rs = RandomState(seed = 46)
bootstrap_auc(xgb_optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84530748, 0.85018607])

In [632]:
np.mean(auc_bootstrap)

0.8478590109712811

In [633]:
t_46 = auc_bootstrap
print(t_46)

[0.8470227851278191, 0.8479012046896848, 0.8498283496468396, 0.8474588576960309, 0.846840826790004, 0.8475377361873007, 0.8487383923129325, 0.8489526191244487, 0.8498144562762181, 0.8439505575275178, 0.8478084328277941, 0.8475847943781147, 0.8465768527481984, 0.8492900935785738, 0.8480244523322936, 0.8465728191889857, 0.8495509304076583, 0.8487271879817864, 0.8491516080456061, 0.8456150729626044, 0.8478631099637876, 0.8479038937291599, 0.8463783120002868, 0.8471402065182317, 0.8478792442006382, 0.8482064106701086, 0.8495697536839841, 0.8478819332401133, 0.8486864042164138, 0.845661234806927, 0.846294055430067, 0.8471249686278728, 0.848399125165824, 0.8478218780251694, 0.8483932989136282, 0.8479908393388549, 0.8496342906313863, 0.845330034778244, 0.8463796565200243, 0.8471626151805242, 0.8463500770857982, 0.8467256462658205, 0.8482440572227601, 0.8476892187443978, 0.8471872647090459, 0.8502048151733534, 0.8494532286400631, 0.8478281524506113, 0.8485111684772866, 0.8473073751389336, 0.84

In [634]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [635]:
# 47.
column_to_drop_46 = '장기부채부담지표'

In [636]:
if not column_to_drop_46.startswith('Cat_'):
    comp_47 = comp_46.drop(column_to_drop_46, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']
else:
    comp_47 = comp_46.drop(comp_46.filter(regex='^' + column_to_drop_46).columns, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']

print(X_47.shape)

(10564, 59)


In [637]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [638]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [639]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'learning_rate': 0.08, 'max_depth': 6, 'max_leaves': 146, 'subsample': 0.4, 'colsample_bytree': 0.7000000000000001, 'gamma': 5, 'reg_alpha': 5, 'reg_lambda': 7, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8456309619776972


In [640]:
xgb_optuna_47 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_47.fit(X_train, y_train)

In [641]:
xgb_optuna_proba_47 = xgb_optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, xgb_optuna_proba_47)
print(decimal.Decimal(auc_47).quantize(decimal.Decimal('1.000')))

0.848


In [642]:
X_train = X_train.values
y_train = y_train.values

In [643]:
auc_bootstrap = []

In [644]:
rs = RandomState(seed = 47)
bootstrap_auc(xgb_optuna_47, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84480162, 0.84935963])

In [645]:
np.mean(auc_bootstrap)

0.8472084626313149

In [646]:
t_47 = auc_bootstrap
print(t_47)

[0.847414040371446, 0.8453851600874834, 0.8465548922591517, 0.8477322433759994, 0.8479715678892833, 0.8465248646516796, 0.8478133627334982, 0.8471119716037432, 0.8473221648560467, 0.8475919651500485, 0.8471850238428167, 0.8476452977663045, 0.8451301495105948, 0.8466866551934316, 0.8466803807679897, 0.8477143164461655, 0.8476452977663046, 0.8449880785916605, 0.8475789681259187, 0.8473288874547344, 0.8462160732852891, 0.8468860922878348, 0.8438022121831414, 0.8460793804453048, 0.8450055573482484, 0.8496661109318417, 0.8470460901366033, 0.8475301172421211, 0.8461748413466711, 0.8448128428525332, 0.8483507224552724, 0.8459368613531247, 0.8480997454375965, 0.8487742461726004, 0.8467225090530994, 0.8476699472948261, 0.8473181312968341, 0.8452296439711735, 0.8468780251694095, 0.8456688537521064, 0.8478492165931663, 0.8446936287691371, 0.8465696819762648, 0.8467260944390663, 0.8481105015954966, 0.8470474346563408, 0.847951848266466, 0.8474041805600372, 0.845864257287297, 0.8479029973826682, 0.

In [647]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [648]:
# 48
column_to_drop_47 = 'Cat_이사 계획 중인 주택의 점유형태'

In [649]:
if not column_to_drop_47.startswith('Cat_'):
    comp_48 = comp_47.drop(column_to_drop_47, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']
else:
    comp_48 = comp_47.drop(comp_47.filter(regex='^' + column_to_drop_47).columns, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']

print(X_48.shape)

(10564, 35)


In [650]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [651]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [652]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 177, 'learning_rate': 0.03, 'max_depth': 6, 'max_leaves': 618, 'subsample': 1.0, 'colsample_bytree': 0.5, 'gamma': 8, 'reg_alpha': 4, 'reg_lambda': 1, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8466694097362175


In [653]:
xgb_optuna_48 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_48.fit(X_train, y_train)

In [654]:
xgb_optuna_proba_48 = xgb_optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, xgb_optuna_proba_48)
print(decimal.Decimal(auc_48).quantize(decimal.Decimal('1.000')))

0.849


In [655]:
X_train = X_train.values
y_train = y_train.values

In [656]:
auc_bootstrap = []

In [657]:
rs = RandomState(seed = 48)
bootstrap_auc(xgb_optuna_48, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84637727, 0.85115911])

In [658]:
np.mean(auc_bootstrap)

0.8488592325929512

In [659]:
t_48 = auc_bootstrap
print(t_48)

[0.8500156860636047, 0.8488898748700298, 0.8486509985299917, 0.8484538023018178, 0.8501931626689614, 0.8482261302929259, 0.8483538596679933, 0.8482485389552185, 0.8502886235703274, 0.8486509985299918, 0.8505024022085976, 0.847873866121688, 0.8486496540102543, 0.8492995052167365, 0.8470774622638129, 0.8496109856226024, 0.8497364741314402, 0.8509227887132049, 0.8484681438456849, 0.8469412175970744, 0.84848920798824, 0.847778405220322, 0.8486635473808757, 0.8486357606396329, 0.8488809114051126, 0.8494276827650497, 0.8499139507367968, 0.8491713276684234, 0.8481069162095298, 0.8504293499695242, 0.8494415761356711, 0.8510608260729267, 0.8491767057473738, 0.8500165824100966, 0.8492721666487397, 0.8497472302893407, 0.8467310243447707, 0.8486084220716361, 0.8500909791689075, 0.8493949661181025, 0.8492887490588362, 0.8485340253128251, 0.8503934961098563, 0.8503970814958232, 0.8487491484708329, 0.848187587393783, 0.8465992614104909, 0.8497642608726831, 0.8475198092574666, 0.8484363235452297, 0.85

In [660]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [661]:
# 49
column_to_drop_48 = '총 이사 횟수'

In [662]:
if not column_to_drop_48.startswith('Cat_'):
    comp_49 = comp_48.drop(column_to_drop_48, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']
else:
    comp_49 = comp_48.drop(comp_48.filter(regex='^' + column_to_drop_48).columns, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']

print(X_49.shape)

(10564, 34)


In [663]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [664]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [665]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 149, 'learning_rate': 0.03, 'max_depth': 9, 'max_leaves': 426, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.5, 'gamma': 4, 'reg_alpha': 2, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8450858468807382


In [666]:
xgb_optuna_49 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_49.fit(X_train, y_train)

In [667]:
xgb_optuna_proba_49 = xgb_optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, xgb_optuna_proba_49)
print(decimal.Decimal(auc_49).quantize(decimal.Decimal('1.000')))

0.850


In [668]:
X_train = X_train.values
y_train = y_train.values

In [669]:
auc_bootstrap = []

In [670]:
rs = RandomState(seed = 49)
bootstrap_auc(xgb_optuna_49, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84566884, 0.85014258])

In [671]:
np.mean(auc_bootstrap)

0.847979619321645

In [672]:
t_49 = auc_bootstrap
print(t_49)

[0.8489369330608441, 0.8492793374206734, 0.848037897529669, 0.849143092753935, 0.8487769352120755, 0.8458969739342439, 0.8446367107669142, 0.8478994119967015, 0.8473862536302033, 0.8480352084901941, 0.8465934351582948, 0.847004858197985, 0.847700423075544, 0.8460148434979025, 0.8479594672116453, 0.8468865404610806, 0.8483206948478004, 0.8478281524506114, 0.848796654834893, 0.8462362410813524, 0.8478079846545481, 0.8475404252267756, 0.847661880176401, 0.844544387078269, 0.8477810942597972, 0.8462671650353161, 0.8483587895736977, 0.8485550894553799, 0.8491628123767524, 0.8486545839159585, 0.8474938152092073, 0.847369223046861, 0.8458149582302534, 0.8480733032160912, 0.8477703381018967, 0.8477851278190098, 0.8463446990068482, 0.8480513427270446, 0.8493456670610592, 0.8481400810297228, 0.8493268437847334, 0.8489324513283856, 0.8471043526585638, 0.8474987451149116, 0.8497620200064537, 0.8476072030404073, 0.8475108457925495, 0.8478505611129038, 0.8469403212505827, 0.8479043419024057, 0.84759

In [673]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [674]:
# 50
column_to_drop_49 = '현재 주택의 면적(㎡)'

In [675]:
if not column_to_drop_49.startswith('Cat_'):
    comp_50 = comp_49.drop(column_to_drop_49, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']
else:
    comp_50 = comp_49.drop(comp_49.filter(regex='^' + column_to_drop_49).columns, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']

print(X_50.shape)

(10564, 33)


In [676]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [677]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [678]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 80, 'learning_rate': 0.04, 'max_depth': 7, 'max_leaves': 578, 'subsample': 0.9, 'colsample_bytree': 0.5, 'gamma': 6, 'reg_alpha': 2, 'reg_lambda': 3, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8444406657261857


In [679]:
xgb_optuna_50 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_50.fit(X_train, y_train)

In [680]:
xgb_optuna_proba_50 = xgb_optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, xgb_optuna_proba_50)
print(decimal.Decimal(auc_50).quantize(decimal.Decimal('1.000')))

0.849


In [681]:
X_train = X_train.values
y_train = y_train.values

In [682]:
auc_bootstrap = []

In [683]:
rs = RandomState(seed = 50)
bootstrap_auc(xgb_optuna_50, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84526368, 0.849754  ])

In [684]:
np.mean(auc_bootstrap)

0.8476343816105554

In [685]:
t_50 = auc_bootstrap
print(t_50)

[0.8479280950844358, 0.8442230468609946, 0.849238553655301, 0.8475260836829084, 0.8457365279122298, 0.8484237746943459, 0.8475619375425765, 0.8464177512459217, 0.8497337850919652, 0.8460538345702915, 0.8460238069628195, 0.8484452870101467, 0.847097630059876, 0.8502500806711843, 0.8472607651213654, 0.8476430569000752, 0.8462631314761034, 0.8453936753791547, 0.8466812771144813, 0.8490306012692266, 0.8481692122907031, 0.8484399089311963, 0.8474790254920943, 0.84844707970313, 0.8462174178050267, 0.8486424832383206, 0.8472715212792659, 0.8482642250188234, 0.847894482090997, 0.8472029507726507, 0.8490955863898749, 0.8479549854791868, 0.8456383779713886, 0.8476094439066366, 0.8455778745831991, 0.8487769352120755, 0.8473033415797211, 0.8494680183571761, 0.8473902871894159, 0.84905614714424, 0.8480396902226525, 0.8486895414291348, 0.849277096554444, 0.8481759348893908, 0.8497920476139257, 0.8480101107884265, 0.8461170269979563, 0.8483682012118604, 0.8490171560718511, 0.8471312430533148, 0.84803

In [686]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [687]:
# 51
column_to_drop_50 = 'Cat_현재 주택의 유형'

In [688]:
if not column_to_drop_50.startswith('Cat_'):
    comp_51 = comp_50.drop(column_to_drop_50, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']
else:
    comp_51 = comp_50.drop(comp_50.filter(regex='^' + column_to_drop_50).columns, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']

print(X_51.shape)

(10564, 22)


In [689]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [690]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [691]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8408130961977698


In [692]:
xgb_optuna_51 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_51.fit(X_train, y_train)

In [693]:
xgb_optuna_proba_51 = xgb_optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, xgb_optuna_proba_51)
print(decimal.Decimal(auc_51).quantize(decimal.Decimal('1.000')))

0.847


In [694]:
X_train = X_train.values
y_train = y_train.values

In [695]:
auc_bootstrap = []

In [696]:
rs = RandomState(seed = 51)
bootstrap_auc(xgb_optuna_51, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84219586, 0.84772112])

In [697]:
np.mean(auc_bootstrap)

0.845005138754437

In [698]:
t_51 = auc_bootstrap
print(t_51)

[0.8453681295041411, 0.8447819188985695, 0.8460229106163277, 0.8449096482736366, 0.8440903875802231, 0.848276325696461, 0.8457723817718978, 0.8451825857803592, 0.8462259330966978, 0.842528503818436, 0.8430743788318812, 0.8458203363092037, 0.8443238858413107, 0.8456168656555878, 0.8440115090889534, 0.8462962962962963, 0.8452161987737979, 0.8439769997490228, 0.8422129898533577, 0.8451261159513822, 0.8473118568713921, 0.8459758524255135, 0.8440939729661898, 0.8454609013660321, 0.844926678856979, 0.846234448388369, 0.845708741170987, 0.8452323330106486, 0.8470568462945035, 0.843024631601592, 0.8452430891685491, 0.8450109354271989, 0.8459978129145602, 0.8458992148004734, 0.8410091965150049, 0.8444762647448998, 0.8465262091714173, 0.8462891255243626, 0.843970725323581, 0.8449809078197268, 0.8420996020221576, 0.8465320354236133, 0.8439227707862752, 0.8440357104442292, 0.8442046717579146, 0.8452533971532036, 0.844021817073608, 0.8485483668566922, 0.8453067297694596, 0.8458306442938581, 0.84687

In [699]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [700]:
# 52
column_to_drop_51 = '가구주 나이'

In [701]:
if not column_to_drop_51.startswith('Cat_'):
    comp_52 = comp_51.drop(column_to_drop_51, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']
else:
    comp_52 = comp_51.drop(comp_51.filter(regex='^' + column_to_drop_51).columns, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']

print(X_52.shape)

(10564, 21)


In [702]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [703]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [704]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8401385250123158


In [705]:
xgb_optuna_52 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_52.fit(X_train, y_train)

In [706]:
xgb_optuna_proba_52 = xgb_optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, xgb_optuna_proba_52)
print(decimal.Decimal(auc_52).quantize(decimal.Decimal('1.000')))

0.845


In [707]:
X_train = X_train.values
y_train = y_train.values

In [708]:
auc_bootstrap = []

In [709]:
rs = RandomState(seed = 52)
bootstrap_auc(xgb_optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84189591, 0.84610719])

In [710]:
np.mean(auc_bootstrap)

0.8440728487684198

In [711]:
t_52 = auc_bootstrap
print(t_52)

[0.8468202108206948, 0.8431940410885231, 0.8441809185758846, 0.8428673227922986, 0.843663278476928, 0.8441217597074324, 0.8411933957190492, 0.8449818041662184, 0.8434441217597073, 0.8433473163386038, 0.8448473521924634, 0.8441490982754293, 0.8442535226417124, 0.8459108673048653, 0.8435023842816679, 0.8458777024846724, 0.8448218063174501, 0.8450243806245742, 0.8435651285360869, 0.8455254383134343, 0.8459126599978488, 0.8437327453300348, 0.8440639453587176, 0.8430560037288014, 0.8429269298339966, 0.8446268509555053, 0.8467498476210965, 0.8430434548779177, 0.8431841812771145, 0.8437506722598687, 0.8435247929439604, 0.8422515327525009, 0.8444332401132981, 0.8456979850130866, 0.8446537413502563, 0.8449665662758596, 0.8433612097092251, 0.8427844107418163, 0.8440401921766878, 0.8436915133914165, 0.8441759886701803, 0.8438555447993977, 0.8445000179269297, 0.8438080384353376, 0.8462998816822631, 0.8440581191065217, 0.8447613029292603, 0.844443099924707, 0.8438313434441216, 0.8445739665124951, 0

In [712]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [713]:
# 53
column_to_drop_52 = 'Cat_현재 주택의 점유형태'

In [714]:
if not column_to_drop_52.startswith('Cat_'):
    comp_53 = comp_52.drop(column_to_drop_52, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']
else:
    comp_53 = comp_52.drop(comp_52.filter(regex='^' + column_to_drop_52).columns, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']

print(X_53.shape)

(10564, 17)


In [715]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [716]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [717]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8373443727887501


In [718]:
xgb_optuna_53 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_53.fit(X_train, y_train)

In [719]:
xgb_optuna_proba_53 = xgb_optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, xgb_optuna_proba_53)
print(decimal.Decimal(auc_53).quantize(decimal.Decimal('1.000')))

0.840


In [720]:
X_train = X_train.values
y_train = y_train.values

In [721]:
auc_bootstrap = []

In [722]:
rs = RandomState(seed = 53)
bootstrap_auc(xgb_optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.83762247, 0.8421717 ])

In [723]:
np.mean(auc_bootstrap)

0.8399178648578394

In [724]:
t_53 = auc_bootstrap
print(t_53)

[0.8393065863540209, 0.8379638593094546, 0.8378934961098562, 0.8419768025527948, 0.8395705603958266, 0.8384613316123479, 0.8368156394535872, 0.8416904198486966, 0.839211125452655, 0.8400397977842315, 0.840339625685705, 0.8380301889498404, 0.8408169301925351, 0.8413829730020437, 0.8391026675271593, 0.8393307877092969, 0.8389682155534043, 0.8405789501989889, 0.8404001290738948, 0.8394629988168225, 0.8401316732996306, 0.841611989530673, 0.8399654010254205, 0.8400716180846868, 0.8398753182030045, 0.839853357713958, 0.8398869707073966, 0.8367726148219856, 0.8395714567423183, 0.8382896812591876, 0.8398529095407121, 0.8405672976945967, 0.8410723889426697, 0.8376573088092933, 0.8404781112186728, 0.8415241475744863, 0.8388481051235165, 0.8410979348176829, 0.8389126420709189, 0.8393056900075292, 0.8415703094188087, 0.8392147108386218, 0.8384129289017962, 0.8400492094223942, 0.8378397153203541, 0.8399698827578788, 0.8410508766268687, 0.8401285360869098, 0.8399223763938187, 0.8401509447492023, 0.8

In [725]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [726]:
# 54
column_to_drop_53 = 'Cat_이사 계획 중인 거주 지역'

In [727]:
if not column_to_drop_53.startswith('Cat_'):
    comp_54 = comp_53.drop(column_to_drop_53, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']
else:
    comp_54 = comp_53.drop(comp_53.filter(regex='^' + column_to_drop_53).columns, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']

print(X_54.shape)

(10564, 10)


In [728]:
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [729]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [730]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 117, 'learning_rate': 0.04, 'max_depth': 6, 'max_leaves': 440, 'subsample': 0.30000000000000004, 'colsample_bytree': 1.0, 'gamma': 5, 'reg_alpha': 1, 'reg_lambda': 10, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.8332269694119755


In [731]:
xgb_optuna_54 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_54.fit(X_train, y_train)

In [732]:
xgb_optuna_proba_54 = xgb_optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, xgb_optuna_proba_54)
print(decimal.Decimal(auc_54).quantize(decimal.Decimal('1.000')))

0.826


In [733]:
X_train = X_train.values
y_train = y_train.values

In [734]:
auc_bootstrap = []

In [735]:
rs = RandomState(seed = 54)
bootstrap_auc(xgb_optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.82245773, 0.82994605])

In [736]:
np.mean(auc_bootstrap)

0.8260141593614428

In [737]:
t_54 = auc_bootstrap
print(t_54)

[0.8244725000896347, 0.8250273385679967, 0.8248964719802087, 0.8251008389803163, 0.8259698469040192, 0.8267940375031372, 0.8261540461080634, 0.8233439998565846, 0.8250304757807176, 0.826122673980854, 0.8259447492022516, 0.8297076117744075, 0.8261141586891829, 0.8240807966727617, 0.82829048797103, 0.8281753074468466, 0.8282810763328672, 0.8262763974041805, 0.8250663296403857, 0.8245518267541501, 0.8281986124556308, 0.8297954537305942, 0.8282909361442758, 0.825665985443333, 0.826570847226704, 0.825713491807393, 0.8255888996450468, 0.825334337241404, 0.8258250869456097, 0.8253778100462514, 0.8212133842458139, 0.8246903122871176, 0.8262297873866121, 0.827418791007852, 0.8267855222114661, 0.8245482413681833, 0.8258667670574736, 0.8276227098347138, 0.8249341185328601, 0.8263790290774803, 0.8263490014700082, 0.8303323652791222, 0.8257636872109283, 0.8281313864687534, 0.8232758775232153, 0.8290998888530351, 0.8262907389480476, 0.8234237746943458, 0.825549460399412, 0.8252568032698719, 0.825811

In [738]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [739]:
# 55
column_to_drop_54 = '소득 대비 주택 임대료의 비율'

In [740]:
if not column_to_drop_54.startswith('Cat_'):
    comp_55 = comp_54.drop(column_to_drop_54, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']
else:
    comp_55 = comp_54.drop(comp_54.filter(regex='^' + column_to_drop_54).columns, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']

print(X_55.shape)

(10564, 9)


In [741]:
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [742]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'gamma' :trial.suggest_int('gamma',1, 10),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'booster' : trial.suggest_categorical('booster', ['gbtree']),
        'objective' : trial.suggest_categorical('objective', ['binary:logistic'])
        }
        
    clf = XGBClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [743]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 50, 'learning_rate': 0.01, 'max_depth': 4, 'max_leaves': 20, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.6, 'gamma': 7, 'reg_alpha': 7, 'reg_lambda': 1, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7768708853956738


In [744]:
xgb_optuna_55 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_55.fit(X_train, y_train)

In [745]:
xgb_optuna_proba_55 = xgb_optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, xgb_optuna_proba_55)
print(decimal.Decimal(auc_55).quantize(decimal.Decimal('1.000')))

0.778


In [746]:
X_train = X_train.values
y_train = y_train.values

In [747]:
auc_bootstrap = []

In [748]:
rs = RandomState(seed = 55)
bootstrap_auc(xgb_optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77659012, 0.77869653])

In [749]:
np.mean(auc_bootstrap)

0.7780238607436091

In [750]:
t_55 = auc_bootstrap
print(t_55)

[0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7786015202036499, 0.7766851314043957, 0.7765901186762755, 0.7781040479007565, 0.7781040479007565, 0.7780090351726363, 0.7781040479007565, 0.7786015202036499, 0.7781040479007565, 0.7780090351726363, 0.7781040479007565, 0.7780090351726363, 0.7781040479007565, 0.7781040479007565, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7781040479007565, 0.7780090351726363, 0.7781040479007565, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7765901186762755, 0.7786965329317701, 0.7781246638700656, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7781040479007565, 0.7781040479007565, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363,

In [751]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc