In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
# Configure the decimal context so that later `Decimal.quantize` calls, which
# do not pass an explicit rounding mode, round halves up (0.7975 -> 0.798).
import decimal
context = decimal.getcontext()

context.rounding = decimal.ROUND_HALF_UP

In [4]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000,
                  random_state=None, scores=None):
    """Bootstrap the training set and return a 95% percentile interval of test ROC-AUC.

    For each replicate the training rows are resampled with replacement, an
    unfitted copy of ``clf`` is trained on the resample, and ROC-AUC is
    measured on the fixed test set.

    Parameters
    ----------
    clf : scikit-learn compatible classifier exposing ``predict_proba``.
        Only its hyperparameters are used; the object itself is not refitted.
    X_train, y_train : array-like training features / labels.
    X_test, y_test : test features / labels, scored as-is in every replicate.
    nsamples : int, number of bootstrap replicates.
    random_state : object with a ``randint(high, size=...)`` method, optional.
        Defaults to the notebook-level ``rs`` so existing calls keep working.
    scores : list, optional. Every replicate's AUC is appended to it.
        Defaults to the notebook-level ``auc_bootstrap`` list.

    Returns
    -------
    numpy.ndarray holding the 2.5th and 97.5th percentiles of ``scores``
    (including any values the list already contained).
    """
    # Local import keeps the cell self-contained.
    from sklearn.base import clone

    # Fall back to the notebook globals that the original implementation used.
    if random_state is None:
        random_state = rs
    if scores is None:
        scores = auc_bootstrap

    # Positional fancy indexing below needs ndarrays; ndarrays pass through unchanged.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    # np.asarray first: calling .ravel() directly on a pandas Series is deprecated.
    y_true = np.asarray(y_test).ravel()
    n_rows = X_train.shape[0]

    for _ in range(nsamples):
        idx = random_state.randint(n_rows, size=n_rows)
        # Clone so the caller's fitted estimator is not overwritten by a
        # model trained on the last bootstrap resample.
        model = clone(clf)
        model.fit(X_train[idx], y_train[idx])
        pred = model.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_true, pred.ravel()))
    return np.percentile(scores, (2.5, 97.5))

In [5]:
class EarlyStoppingCallback(object):
    """Optuna callback that stops a study when its best value stops improving.

    After every trial the study's ``best_value`` is compared with the best
    value seen by this callback. Each trial without improvement increments a
    counter; once the counter reaches ``early_stopping_rounds`` the callback
    calls ``study.stop()``.

    Parameters
    ----------
    early_stopping_rounds : int
        Number of consecutive non-improving trials tolerated before stopping.
    direction : str
        ``"minimize"`` or ``"maximize"``; must match the study's direction.

    Raises
    ------
    ValueError
        If ``direction`` is neither ``"minimize"`` nor ``"maximize"``.
    """

    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        # Count of consecutive trials without an improvement.
        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            # Previously the exception was constructed but never raised, which
            # left `_operator`/`_score` undefined until the first callback call.
            raise ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        """Update the patience counter and stop ``study`` when it is exhausted."""
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [6]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [7]:
# Read the cp949-encoded survey extract and label the public-rental-housing
# move-in intention question as the modelling target.
중장년가구 = pd.read_csv('중장년가구_변수추가.csv', encoding='cp949').rename(
    columns={'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?': 'target'}
)

In [8]:
# Relabel every column positionally with a short descriptive name.
# The list must contain exactly one entry per column of the loaded frame
# (pandas raises on a length mismatch) and must follow the CSV column order.
# Names prefixed with 'Cat_' are later handled as groups of one-hot columns
# when features are dropped — presumably these are the categorical survey
# items; confirm against the codebook.
중장년가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [9]:
# Partition the frame by dtype: object columns are one-hot encoded later,
# everything else (minus the label) is scaled as a numeric feature.
target = 중장년가구['target']
cat = 중장년가구.select_dtypes(include=['object'])
num = 중장년가구.select_dtypes(exclude=['object'])
num_중장년 = num.drop(columns=['target'])

In [10]:
# Robust-scale the numeric features and wrap the result back into a DataFrame
# that keeps the original column names.
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/validation/test split performed further down.
scaler = RobustScaler()
num_scaled_중장년 = scaler.fit_transform(num_중장년)
num_df_scaled_중장년 = pd.DataFrame(num_scaled_중장년, columns=num_중장년.columns)

In [11]:
# One-hot encode the object-typed columns into a dense indicator frame whose
# column names are generated by the encoder from the source column names.
enc = OneHotEncoder()
X_cat = enc.fit_transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(X_cat, columns=new_feature_names)

In [12]:
comp =pd.concat([num_df_scaled_중장년, target,cat2],axis=1)

In [13]:
# Separate the full design matrix from the label; the trailing expression
# displays the matrix dimensions.
X = comp.drop(columns=['target'])
y = comp['target']
X.shape

(19949, 214)

In [12]:
# Hold out a stratified 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=0,
)

In [13]:
# Carve a stratified 20% validation set out of the remaining training rows.
# This overwrites X_train/y_train, so re-running the cell on its own shrinks
# the training set again; re-run the previous split cell first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    shuffle=True,
    stratify=y_train,
    random_state=0,
)

In [14]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an XGBoost model for one trial.

    Draws one hyperparameter configuration from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and returns
    the ROC-AUC of its positive-class probabilities on ``X_val``/``y_val``.
    ``booster`` and ``objective`` are single-choice categoricals, so they are
    fixed but still recorded in the trial's parameters.
    """
    # The suggestion order is kept as-is so a seeded sampler draws the same values.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search maximising validation ROC-AUC: at most 200 trials, stopped
# early once 10 consecutive trials fail to improve the best value.
# NOTE(review): `objective` never reports intermediate values, so the
# Hyperband pruner presumably has no effect here — confirm.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [15]:
print(study.best_trial.params)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}


In [16]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.7974859314779492


In [17]:
xgb_optuna_0 = XGBClassifier(**study.best_trial.params, random_state = 0)

In [18]:
xgb_optuna_0.fit(X_train, y_train)

In [19]:
# Score the baseline model on the held-out test split and print the ROC-AUC
# quantized to three decimals (rounding mode comes from the decimal context).
xgb_optuna_0_proba = xgb_optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, xgb_optuna_0_proba)
print(decimal.Decimal(auc_0).quantize(decimal.Decimal('1.000')))

0.797


In [20]:
X_train = X_train.values
y_train = y_train.values

In [22]:
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

In [23]:
auc_bootstrap = []

In [24]:
rs = RandomState(seed = 2024)
bootstrap_auc(xgb_optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78535447, 0.79450324])

In [25]:
np.mean(auc_bootstrap)

0.7899500149119915

In [26]:
t_0 = auc_bootstrap
print(t_0)

[0.7874247400109469, 0.7907740604784939, 0.7887569131411496, 0.7894997390071282, 0.7859586363280944, 0.7920032101066583, 0.79095844613086, 0.7880735239109624, 0.7901305598349933, 0.7880928077972413, 0.7890274158747066, 0.7853123619749236, 0.7904697449032425, 0.7855816759757646, 0.7905003877910283, 0.7881202807311182, 0.7904636691582503, 0.7857514005913021, 0.789897832385517, 0.7887341951381359, 0.79262650229399, 0.7890168493616769, 0.7853117015678592, 0.7886924574116692, 0.7872447130452056, 0.7914055417134234, 0.7928876272472332, 0.7927798488143316, 0.7923109597986446, 0.79261131293151, 0.7895401559194664, 0.7934162170615372, 0.7893153533547621, 0.7909795791569191, 0.7917961064512788, 0.794782731359086, 0.7924417203973855, 0.7940183762228097, 0.789841301540809, 0.7932344730374288, 0.7839377907111896, 0.790309133905193, 0.7928481349047852, 0.7875442736895939, 0.7869573038908014, 0.787973670362833, 0.789415867309956, 0.7923776609121436, 0.7863675603823388, 0.789050265959133, 0.7885928680

# Bootstrap the *test* set: resample test rows with replacement and re-score
# the already-fitted model on each resample.
# The seed now varies per replicate. With the previous fixed `random_state=0`
# every iteration drew the identical resample, so all 2000 AUC values were
# the same number. Using the loop index keeps the run reproducible.
auc_bootstrap = []
for seed in range(2000):

    X_test_re, y_test_re = resample(X_test, y_test, replace=True, random_state=seed)

    proba_0 = xgb_optuna_0.predict_proba(X_test_re)[:, 1]

    auc_bootstrap.append(roc_auc_score(y_test_re, proba_0))

In [27]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [14]:
# 1.
column_to_drop = 'Cat_가구주 동거 여부'

In [15]:
# Remove the selected feature from `comp` and rebuild the X/y pair.
# A 'Cat_' feature exists in `comp` as one-hot columns named
# '<feature>_<category>', so match on that literal prefix (or the bare name).
# The previous unescaped regex '^<feature>' could mis-match when the name
# contains regex metacharacters such as '(' or when another column merely
# shares the same leading text.
if column_to_drop.startswith('Cat_'):
    cols_to_drop = [
        c for c in comp.columns
        if c == column_to_drop or c.startswith(column_to_drop + '_')
    ]
else:
    cols_to_drop = [column_to_drop]

comp_1 = comp.drop(columns=cols_to_drop)
X_1 = comp_1.drop('target', axis=1)
y_1 = comp_1['target']

print(X_1.shape)

(19949, 212)


In [30]:
# Stratified 80/20 train/test split, then a stratified 80/20 train/validation
# split of the training part (same seeds as the baseline run).
X_train, X_test, y_train, y_test = train_test_split(
    X_1, y_1,
    test_size=0.2, shuffle=True, stratify=y_1, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [31]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an XGBoost model for one trial.

    Draws one hyperparameter configuration from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and returns
    the ROC-AUC of its positive-class probabilities on ``X_val``/``y_val``.
    ``booster`` and ``objective`` are single-choice categoricals, so they are
    fixed but still recorded in the trial's parameters.
    """
    # The suggestion order is kept as-is so a seeded sampler draws the same values.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search maximising validation ROC-AUC: at most 200 trials, stopped
# early once 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [32]:
print(study.best_trial.params)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}


In [33]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.7974859314779492


In [34]:
# Train a fresh classifier with the best trial's hyperparameters on the
# training split only (the validation rows are not added back).
xgb_optuna_1 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_1.fit(X_train, y_train)

In [35]:
# Score on the held-out test split and print the ROC-AUC quantized to three
# decimals (rounding mode comes from the decimal context).
xgb_optuna_1_proba = xgb_optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, xgb_optuna_1_proba)
print(decimal.Decimal(auc_1).quantize(decimal.Decimal('1.000')))

0.797


In [36]:
X_train = X_train.values
y_train = y_train.values

In [37]:
auc_bootstrap = []
auc_bootstrap

[]

In [38]:
rs = RandomState(seed = 1)
bootstrap_auc(xgb_optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78537749, 0.79441982])

In [39]:
np.mean(auc_bootstrap)

0.7898445451961461

In [40]:
t_1 = auc_bootstrap
print(t_1)

[0.7880278237421096, 0.7872202779838248, 0.7898259800969161, 0.7939759780892787, 0.7852657372361806, 0.7881715283193116, 0.7906755277444932, 0.787067063544896, 0.7901310881606449, 0.7888448793621207, 0.7888773713896866, 0.7882791746708003, 0.7849960269911009, 0.7895361934770802, 0.7904990669768994, 0.7884335778424448, 0.7911597382040731, 0.787045930518837, 0.7889167316507218, 0.7920602692770181, 0.7897269190372639, 0.791151813319301, 0.7841625932758937, 0.7882468147246473, 0.785383818019286, 0.7871077446200598, 0.7873428495349677, 0.7959871817630438, 0.7873262072769462, 0.788138640047507, 0.791327085354179, 0.7890390390390392, 0.7863646545912556, 0.7857259088786183, 0.7891904043381877, 0.7887750082947127, 0.7883013643481623, 0.7899062855959408, 0.786096000997479, 0.7888160856141152, 0.7899511932763164, 0.7900523676385746, 0.7916484394316908, 0.7882403427354165, 0.7900016483760326, 0.7905769950104926, 0.7876973560471097, 0.7891291185626161, 0.7910773194024426, 0.7902692453185064, 0.7862

In [41]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [16]:
column_to_drop_1 = '부채 중 임대 보증금의 비중'

In [17]:
# Remove the selected feature from `comp_1` and rebuild the X/y pair.
# A 'Cat_' feature exists as one-hot columns named '<feature>_<category>', so
# match on that literal prefix (or the bare name) rather than an unescaped
# regex, which could mis-match on metacharacters or shared leading text.
if column_to_drop_1.startswith('Cat_'):
    cols_to_drop = [
        c for c in comp_1.columns
        if c == column_to_drop_1 or c.startswith(column_to_drop_1 + '_')
    ]
else:
    cols_to_drop = [column_to_drop_1]

comp_2 = comp_1.drop(columns=cols_to_drop)
X_2 = comp_2.drop('target', axis=1)
y_2 = comp_2['target']

print(X_2.shape)

(19949, 211)


In [44]:
# Stratified 80/20 train/test split, then a stratified 80/20 train/validation
# split of the training part (same seeds as the baseline run).
X_train, X_test, y_train, y_test = train_test_split(
    X_2, y_2,
    test_size=0.2, shuffle=True, stratify=y_2, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [45]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an XGBoost model for one trial.

    Draws one hyperparameter configuration from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and returns
    the ROC-AUC of its positive-class probabilities on ``X_val``/``y_val``.
    ``booster`` and ``objective`` are single-choice categoricals, so they are
    fixed but still recorded in the trial's parameters.
    """
    # The suggestion order is kept as-is so a seeded sampler draws the same values.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search maximising validation ROC-AUC: at most 200 trials, stopped
# early once 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [46]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7974859314779492


In [47]:
# Train a fresh classifier with the best trial's hyperparameters on the
# training split only (the validation rows are not added back).
xgb_optuna_2 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_2.fit(X_train, y_train)

In [48]:
# Score on the held-out test split and print the ROC-AUC quantized to three
# decimals (rounding mode comes from the decimal context).
xgb_optuna_2_proba = xgb_optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, xgb_optuna_2_proba)
print(decimal.Decimal(auc_2).quantize(decimal.Decimal('1.000')))

0.797


In [49]:
X_train = X_train.values
y_train = y_train.values

In [50]:
auc_bootstrap = []

In [51]:
rs = RandomState(seed = 2)
bootstrap_auc(xgb_optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78519029, 0.79427767])

In [52]:
np.mean(auc_bootstrap)

0.7898372658592794

In [53]:
t_2 = auc_bootstrap
print(t_2)

[0.7855161635949814, 0.7921709535010028, 0.7912507422975403, 0.789626537163483, 0.7926906938606446, 0.7918579205525018, 0.7891988575486113, 0.7913980130728899, 0.7906503001946351, 0.7943834492479812, 0.7896524251204053, 0.7954053631393533, 0.7932861168698607, 0.7891270052600103, 0.7909811641338734, 0.787040911425148, 0.7893964513422641, 0.7880550325131607, 0.7917006115897742, 0.7930952592282641, 0.7915834553765587, 0.7898510755653613, 0.7882734951700469, 0.7922705428863064, 0.790372004657719, 0.7908604417225107, 0.7925228183848874, 0.7935049757709856, 0.7913437276122005, 0.7871471048810951, 0.7889370721883038, 0.7900790480839741, 0.7861266438852645, 0.7917942573114988, 0.7891137971187232, 0.7936238490425681, 0.7841562533680759, 0.7910959428816571, 0.7883136479195593, 0.7904948403716876, 0.7901839207257926, 0.7908607058853363, 0.792950497999759, 0.7891597614504019, 0.7882660986109262, 0.7904319696191617, 0.7871920125614706, 0.7925378756659545, 0.7919952852218862, 0.7897275794443283, 0.7

In [54]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [20]:
# 2.
column_to_drop_2 = 'Cat_가구주 주민등록상 등재 여부'

In [21]:
# Remove the selected feature from `comp_2` and rebuild the X/y pair.
# A 'Cat_' feature exists as one-hot columns named '<feature>_<category>', so
# match on that literal prefix (or the bare name) rather than an unescaped
# regex, which could mis-match on metacharacters or shared leading text.
if column_to_drop_2.startswith('Cat_'):
    cols_to_drop = [
        c for c in comp_2.columns
        if c == column_to_drop_2 or c.startswith(column_to_drop_2 + '_')
    ]
else:
    cols_to_drop = [column_to_drop_2]

comp_3 = comp_2.drop(columns=cols_to_drop)
X_3 = comp_3.drop('target', axis=1)
y_3 = comp_3['target']

print(X_3.shape)

(19949, 209)


In [57]:
# Stratified 80/20 train/test split, then a stratified 80/20 train/validation
# split of the training part (same seeds as the baseline run).
X_train, X_test, y_train, y_test = train_test_split(
    X_3, y_3,
    test_size=0.2, shuffle=True, stratify=y_3, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [58]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an XGBoost model for one trial.

    Draws one hyperparameter configuration from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and returns
    the ROC-AUC of its positive-class probabilities on ``X_val``/``y_val``.
    ``booster`` and ``objective`` are single-choice categoricals, so they are
    fixed but still recorded in the trial's parameters.
    """
    # The suggestion order is kept as-is so a seeded sampler draws the same values.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search maximising validation ROC-AUC: at most 200 trials, stopped
# early once 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [59]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7980307990524257


In [60]:
# Train a fresh classifier with the best trial's hyperparameters on the
# training split only (the validation rows are not added back).
xgb_optuna_3 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_3.fit(X_train, y_train)

In [61]:
# Score on the held-out test split and print the ROC-AUC quantized to three
# decimals (rounding mode comes from the decimal context).
xgb_optuna_3_proba = xgb_optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, xgb_optuna_3_proba)
print(decimal.Decimal(auc_3).quantize(decimal.Decimal('1.000')))

0.797


In [62]:
X_train = X_train.values
y_train = y_train.values

In [63]:
auc_bootstrap = []

In [64]:
rs = RandomState(seed = 3)
bootstrap_auc(xgb_optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78503863, 0.79441537])

In [65]:
t_3 = auc_bootstrap
print(t_3)

[0.7952539978402048, 0.790879461445964, 0.7910744136113594, 0.7882919865678486, 0.7868113539295806, 0.7866866690758316, 0.7898933416174795, 0.7897797516024117, 0.7890091886397305, 0.790985654901911, 0.7894646053513049, 0.7908506676979584, 0.7913044994325783, 0.7922052946683489, 0.7896813509498238, 0.7876286737124175, 0.7898263763411547, 0.7879992941569297, 0.7850311606469242, 0.790088293782875, 0.7935965081901043, 0.7901094268089341, 0.7875272351873337, 0.7914537514291209, 0.784431775195322, 0.7891856494073244, 0.7896315562571721, 0.7899559482071797, 0.789729824828347, 0.7863311059123868, 0.7882491921900789, 0.7944562261064724, 0.7904634049954247, 0.7915480575579098, 0.7900716515248535, 0.7890451147840309, 0.7905719759168035, 0.7909079910311437, 0.7897049935227276, 0.790999919694501, 0.789703540627186, 0.7923141297525533, 0.7874194567544321, 0.7905192754330685, 0.791774709262394, 0.7951036891923592, 0.7929115339829625, 0.7885071471894132, 0.7895287969179596, 0.78804856052393, 0.7895657

In [66]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [22]:
### 4. 
column_to_drop_3 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [23]:
# Remove the selected feature from `comp_3` and rebuild the X/y pair.
# A 'Cat_' feature exists as one-hot columns named '<feature>_<category>', so
# match on that literal prefix (or the bare name) rather than an unescaped
# regex, which could mis-match on metacharacters or shared leading text.
if column_to_drop_3.startswith('Cat_'):
    cols_to_drop = [
        c for c in comp_3.columns
        if c == column_to_drop_3 or c.startswith(column_to_drop_3 + '_')
    ]
else:
    cols_to_drop = [column_to_drop_3]

comp_4 = comp_3.drop(columns=cols_to_drop)
X_4 = comp_4.drop('target', axis=1)
y_4 = comp_4['target']

print(X_4.shape)

(19949, 208)


In [69]:
# Stratified 80/20 train/test split, then a stratified 80/20 train/validation
# split of the training part (same seeds as the baseline run).
X_train, X_test, y_train, y_test = train_test_split(
    X_4, y_4,
    test_size=0.2, shuffle=True, stratify=y_4, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [70]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an XGBoost model for one trial.

    Draws one hyperparameter configuration from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and returns
    the ROC-AUC of its positive-class probabilities on ``X_val``/``y_val``.
    ``booster`` and ``objective`` are single-choice categoricals, so they are
    fixed but still recorded in the trial's parameters.
    """
    # The suggestion order is kept as-is so a seeded sampler draws the same values.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search maximising validation ROC-AUC: at most 200 trials, stopped
# early once 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [71]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7987890730935723


In [72]:
# Train a fresh classifier with the best trial's hyperparameters on the
# training split only (the validation rows are not added back).
xgb_optuna_4 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_4.fit(X_train, y_train)

In [73]:
# Score on the held-out test split and print the ROC-AUC quantized to three
# decimals (rounding mode comes from the decimal context).
xgb_optuna_4_proba = xgb_optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, xgb_optuna_4_proba)
print(decimal.Decimal(auc_4).quantize(decimal.Decimal('1.000')))

0.798


In [74]:
X_train = X_train.values
y_train = y_train.values

In [75]:
auc_bootstrap = []

In [76]:
rs = RandomState(seed = 4)
bootstrap_auc(xgb_optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78517112, 0.79437512])

In [77]:
np.mean(auc_bootstrap)

0.7898255711728618

In [78]:
t_4 = auc_bootstrap
print(t_4)

[0.7939560337959353, 0.7911116605697888, 0.7909414076285997, 0.7930654088289555, 0.7860561124107921, 0.7919178855139446, 0.7913245758073344, 0.7903665893197913, 0.7885849431415933, 0.7892099523872923, 0.7864935660502164, 0.7868528274932216, 0.7878035495030569, 0.7902685849114421, 0.7904717261244355, 0.7937598928978239, 0.794591345391838, 0.7947145773500454, 0.7921149509819462, 0.7935019378984897, 0.794130513342336, 0.7921510092076595, 0.7875425566312266, 0.7875479719691543, 0.7895367218027316, 0.7877633967535445, 0.7914438453231557, 0.7894421515111171, 0.7935790734436055, 0.7924621930163802, 0.7894565483851198, 0.7954675734848149, 0.7899842136295339, 0.7907661355937219, 0.7883614613910179, 0.7894101878092026, 0.7867479548514031, 0.7873178861479354, 0.7905732967309322, 0.7863843347217732, 0.7896891437531832, 0.790804439203454, 0.7921543112429811, 0.7896216501512068, 0.7927972835608305, 0.7906306200641176, 0.787130990948725, 0.7929568379075769, 0.7905305023531624, 0.7880655990261901, 0.7

In [24]:
## 5.
column_to_drop_4 = '소득 중 사적이전소득의 비중(월평균)'

In [25]:
# Remove the selected feature from `comp_4` and rebuild the X/y pair.
# A 'Cat_' feature exists as one-hot columns named '<feature>_<category>', so
# match on that literal prefix (or the bare name) rather than an unescaped
# regex, which could mis-match on metacharacters or shared leading text.
if column_to_drop_4.startswith('Cat_'):
    cols_to_drop = [
        c for c in comp_4.columns
        if c == column_to_drop_4 or c.startswith(column_to_drop_4 + '_')
    ]
else:
    cols_to_drop = [column_to_drop_4]

comp_5 = comp_4.drop(columns=cols_to_drop)
X_5 = comp_5.drop('target', axis=1)
y_5 = comp_5['target']

print(X_5.shape)

(19949, 207)


In [81]:
# Stratified 80/20 train/test split, then a stratified 80/20 train/validation
# split of the training part (same seeds as the baseline run).
X_train, X_test, y_train, y_test = train_test_split(
    X_5, y_5,
    test_size=0.2, shuffle=True, stratify=y_5, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [82]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an XGBoost model for one trial.

    Draws one hyperparameter configuration from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and returns
    the ROC-AUC of its positive-class probabilities on ``X_val``/``y_val``.
    ``booster`` and ``objective`` are single-choice categoricals, so they are
    fixed but still recorded in the trial's parameters.
    """
    # The suggestion order is kept as-is so a seeded sampler draws the same values.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search maximising validation ROC-AUC: at most 200 trials, stopped
# early once 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [83]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7989199238671397


In [84]:
# Train a fresh classifier with the best trial's hyperparameters on the
# training split only (the validation rows are not added back).
xgb_optuna_5 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_5.fit(X_train, y_train)

In [85]:
# Score on the held-out test split and print the ROC-AUC quantized to three
# decimals (rounding mode comes from the decimal context).
xgb_optuna_5_proba = xgb_optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, xgb_optuna_5_proba)
print(decimal.Decimal(auc_5).quantize(decimal.Decimal('1.000')))

0.798


In [86]:
X_train = X_train.values
y_train = y_train.values

In [87]:
auc_bootstrap = []

In [88]:
rs = RandomState(seed = 5)
bootstrap_auc(xgb_optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78503264, 0.79445441])

In [89]:
np.mean(auc_bootstrap)

0.7899328089304998

In [90]:
t_5 = auc_bootstrap
print(t_5)

[0.7914624688023701, 0.7880801279816059, 0.7936116975525842, 0.7930035947277326, 0.7925162143142439, 0.7869163586528118, 0.7910614696328983, 0.788338215062353, 0.7881407533501129, 0.7901663538978811, 0.7880706181198791, 0.793068050457213, 0.7877417354018339, 0.7885294689481882, 0.786354088078226, 0.7941770059996662, 0.7900441785909766, 0.7909322940111118, 0.7925951989991399, 0.7876841479058228, 0.7881781323899552, 0.7908279496949449, 0.7888837112975043, 0.7918127487093005, 0.788755328164195, 0.7904140065470114, 0.7938627843184494, 0.7887106846466453, 0.7898973040598656, 0.7918406178874159, 0.7942678780117203, 0.7915208487868586, 0.7906417149027988, 0.7899902893745258, 0.7962677226839787, 0.7904808397419234, 0.7874623832136147, 0.7906664141270052, 0.786913320780316, 0.7862061568958122, 0.7845585733516768, 0.7929401956495552, 0.7921928790155391, 0.7895884977165766, 0.7874728176452314, 0.794576156029358, 0.7899297960874315, 0.7896434435843303, 0.7872099756336209, 0.787977764886632, 0.7871

In [91]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [26]:
column_to_drop_5 = 'Cat_현재 주택의 위치'

In [27]:
# Remove the selected feature from `comp_5` and rebuild the X/y pair.
# A 'Cat_' feature exists as one-hot columns named '<feature>_<category>', so
# match on that literal prefix (or the bare name) rather than an unescaped
# regex, which could mis-match on metacharacters or shared leading text.
if column_to_drop_5.startswith('Cat_'):
    cols_to_drop = [
        c for c in comp_5.columns
        if c == column_to_drop_5 or c.startswith(column_to_drop_5 + '_')
    ]
else:
    cols_to_drop = [column_to_drop_5]

comp_6 = comp_5.drop(columns=cols_to_drop)
X_6 = comp_6.drop('target', axis=1)
y_6 = comp_6['target']

print(X_6.shape)

(19949, 203)


In [94]:
# Stratified 80/20 train/test split, then a stratified 80/20 train/validation
# split of the training part (same seeds as the baseline run).
X_train, X_test, y_train, y_test = train_test_split(
    X_6, y_6,
    test_size=0.2, shuffle=True, stratify=y_6, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [95]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    module-level ``X_train``/``y_train`` and scores its predicted probabilities
    on the module-level ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Source of the hyperparameter suggestions.

    Returns
    -------
    float
        ROC-AUC on the validation split.
    """
    # The suggest_* calls run in declaration order. 'booster' and 'objective'
    # have a single choice each, so they are fixed values that still get
    # recorded in the trial's params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 holds the probability of the second class, i.e. the positive
    # label for a 0/1 target.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximizes the validation AUC returned by `objective`,
# capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the pruner
# presumably never prunes a trial here -- confirm before relying on it.
study = create_study(direction=direction, sampler=sampler, pruner=HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement -- verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [96]:
# Report the best trial's hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7986557456492118


In [97]:
# Refit a model with the best trial's hyperparameters on the training split
# (the validation rows are not added back).
xgb_optuna_6 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_6.fit(X_train, y_train)

In [98]:
# Test-set ROC-AUC of the tuned model, printed to three decimals. The decimal
# context set near the top of the notebook makes quantize() round half up.
xgb_optuna_proba_6 = xgb_optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, xgb_optuna_proba_6)
print(decimal.Decimal(auc_6).quantize(decimal.Decimal('1.000')))

0.797


In [99]:
# bootstrap_auc indexes X_train[idx] / y_train[idx] with integer positions, so
# convert the pandas objects to NumPy arrays first.
# NOTE: not idempotent -- re-running this cell fails once these are already arrays.
X_train = X_train.values
y_train = y_train.values

In [100]:
# Reset the shared accumulator: bootstrap_auc appends each resample's AUC to it.
auc_bootstrap = []

In [101]:
# 95% percentile bootstrap interval of the test AUC (2000 refits on resampled
# training rows). bootstrap_auc draws its indices from the module-level `rs`
# and appends every AUC to the module-level `auc_bootstrap`.
rs = RandomState(seed=6)
# bootstrap_auc calls clf.fit() on the estimator it receives, so hand it an
# unfitted clone: xgb_optuna_6 then stays the model fitted on the training
# split instead of ending up as the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_6), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7854932 , 0.79420175])

In [102]:
# Mean test AUC over the bootstrap refits collected in `auc_bootstrap`.
np.mean(auc_bootstrap)

0.7898354973552019

In [103]:
# Keep this round's bootstrap AUCs. Store a copy rather than an alias:
# bootstrap_auc appends to `auc_bootstrap` in place, so an alias would silently
# grow if the bootstrap cell were re-run before the accumulator is reset.
t_6 = list(auc_bootstrap)
print(t_6)

[0.791689648832506, 0.7923595657585805, 0.7917642748307774, 0.788285646660031, 0.7877129416538283, 0.7917358773270102, 0.7901506362097495, 0.7892976544454378, 0.7859841280407782, 0.7905645793576828, 0.7939009558467687, 0.787137066693717, 0.7910487898172627, 0.7915554541170304, 0.7885194307608101, 0.7888314070580081, 0.789690464567312, 0.7858076672731844, 0.7871180469702638, 0.7878380227518158, 0.7901704484216799, 0.7928215865407983, 0.7889811873802022, 0.790981692459525, 0.7889677150760894, 0.7870007586756355, 0.788684532526897, 0.7891267410971844, 0.794972136105141, 0.7887112129722967, 0.7917305940704955, 0.7930146895664137, 0.7881086575667857, 0.7912821776738033, 0.7914091079115709, 0.7887294402072728, 0.7879617830356747, 0.7889677150760894, 0.787991897597809, 0.7917113101842166, 0.7911174721519549, 0.787812531039132, 0.7847109953021283, 0.7922326355208129, 0.7911578890642931, 0.7862785375100645, 0.7912244580963792, 0.7924365692222836, 0.7937231742650461, 0.7929271195896811, 0.792470

In [104]:
# Release this round's splits, study and best score before the next elimination round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [28]:
# Round 7: name of the feature removed next. It has no 'Cat_' prefix, so the
# drop cell removes exactly this one column.
column_to_drop_6 = '소득 중 재산소득의 비중(월평균)'

In [29]:
# Round 7 of feature elimination: drop `column_to_drop_6` from comp_6.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable -- confirm against
# the encoding step); any other name is dropped as a single column.
if column_to_drop_6.startswith('Cat_'):
    # Match the prefix literally. Building a regex from the raw name would let
    # characters such as '(' or '.' act as pattern syntax and drop the wrong
    # columns (or raise on an invalid pattern).
    comp_7 = comp_6.drop(
        columns=[col for col in comp_6.columns if str(col).startswith(column_to_drop_6)]
    )
else:
    comp_7 = comp_6.drop(columns=column_to_drop_6)

# Separate the predictors from the label column.
X_7 = comp_7.drop(columns='target')
y_7 = comp_7['target']

print(X_7.shape)

(19949, 202)


In [107]:
# Stratified 64/16/20 train/validation/test split with a fixed seed: hold out
# 20% as the test set, then 20% of the remainder as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_7, y_7,
    test_size=0.2, shuffle=True, stratify=y_7, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [108]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    module-level ``X_train``/``y_train`` and scores its predicted probabilities
    on the module-level ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Source of the hyperparameter suggestions.

    Returns
    -------
    float
        ROC-AUC on the validation split.
    """
    # The suggest_* calls run in declaration order. 'booster' and 'objective'
    # have a single choice each, so they are fixed values that still get
    # recorded in the trial's params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 holds the probability of the second class, i.e. the positive
    # label for a 0/1 target.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximizes the validation AUC returned by `objective`,
# capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the pruner
# presumably never prunes a trial here -- confirm before relying on it.
study = create_study(direction=direction, sampler=sampler, pruner=HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement -- verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [109]:
# Report the best trial's hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.798073728012839


In [110]:
# Refit a model with the best trial's hyperparameters on the training split
# (the validation rows are not added back).
xgb_optuna_7 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_7.fit(X_train, y_train)

In [111]:
# Test-set ROC-AUC of the tuned model, printed to three decimals. The decimal
# context set near the top of the notebook makes quantize() round half up.
xgb_optuna_proba_7 = xgb_optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, xgb_optuna_proba_7)
print(decimal.Decimal(auc_7).quantize(decimal.Decimal('1.000')))

0.798


In [112]:
# bootstrap_auc indexes X_train[idx] / y_train[idx] with integer positions, so
# convert the pandas objects to NumPy arrays first.
# NOTE: not idempotent -- re-running this cell fails once these are already arrays.
X_train = X_train.values
y_train = y_train.values

In [113]:
# Reset the shared accumulator: bootstrap_auc appends each resample's AUC to it.
auc_bootstrap = []

In [114]:
# 95% percentile bootstrap interval of the test AUC (2000 refits on resampled
# training rows). bootstrap_auc draws its indices from the module-level `rs`
# and appends every AUC to the module-level `auc_bootstrap`.
rs = RandomState(seed=7)
# bootstrap_auc calls clf.fit() on the estimator it receives, so hand it an
# unfitted clone: xgb_optuna_7 then stays the model fitted on the training
# split instead of ending up as the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_7), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78549987, 0.7944994 ])

In [115]:
# Keep this round's bootstrap AUCs. Store a copy rather than an alias:
# bootstrap_auc appends to `auc_bootstrap` in place, so an alias would silently
# grow if the bootstrap cell were re-run before the accumulator is reset.
t_7 = list(auc_bootstrap)
print(t_7)

[0.7907299452865956, 0.7882180209766416, 0.7866743855044348, 0.7956886777699586, 0.7897822611492562, 0.7905458237970553, 0.792904005342429, 0.7887304968585758, 0.7895079280547261, 0.7837808779927007, 0.7874578924455772, 0.7935936023990211, 0.7915566428497463, 0.789951589520555, 0.7880048415762702, 0.7905588998569295, 0.7877105641883967, 0.7878437022525693, 0.7882964773358863, 0.7865305488458197, 0.7897271832000896, 0.791645005314956, 0.7915411893244406, 0.7834788078014678, 0.7905266719921892, 0.7894775493297661, 0.7886044911906981, 0.7895438541990267, 0.7910205243949087, 0.7873888138666465, 0.7893436187771163, 0.7937826109008376, 0.7874585528526414, 0.7931206188595351, 0.786172872379769, 0.7884238038178925, 0.7906520172530024, 0.7909959572521148, 0.7867455773859716, 0.7914056737948363, 0.7877940396413302, 0.7934146320845827, 0.789850283076884, 0.7907995521911778, 0.7965700570380374, 0.7951410682322013, 0.7872984701802436, 0.7907745888041455, 0.7882349273974889, 0.7932472849344769, 0.78

In [116]:
# Mean test AUC over this round's bootstrap refits.
np.mean(t_7)

0.7900669298124655

In [117]:
# Release this round's splits, study and best score before the next elimination round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [30]:
# Round 8: name of the feature removed next. Because it starts with 'Cat_', the
# drop cell removes every column whose name starts with this string.
column_to_drop_7 = 'Cat_소득 계층'

In [31]:
# Round 8 of feature elimination: drop `column_to_drop_7` from comp_7.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable -- confirm against
# the encoding step); any other name is dropped as a single column.
if column_to_drop_7.startswith('Cat_'):
    # Match the prefix literally. Building a regex from the raw name would let
    # characters such as '(' or '.' act as pattern syntax and drop the wrong
    # columns (or raise on an invalid pattern).
    comp_8 = comp_7.drop(
        columns=[col for col in comp_7.columns if str(col).startswith(column_to_drop_7)]
    )
else:
    comp_8 = comp_7.drop(columns=column_to_drop_7)

# Separate the predictors from the label column.
X_8 = comp_8.drop(columns='target')
y_8 = comp_8['target']

print(X_8.shape)

(19949, 200)


In [120]:
# Stratified 64/16/20 train/validation/test split with a fixed seed: hold out
# 20% as the test set, then 20% of the remainder as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_8, y_8,
    test_size=0.2, shuffle=True, stratify=y_8, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [121]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    module-level ``X_train``/``y_train`` and scores its predicted probabilities
    on the module-level ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Source of the hyperparameter suggestions.

    Returns
    -------
    float
        ROC-AUC on the validation split.
    """
    # The suggest_* calls run in declaration order. 'booster' and 'objective'
    # have a single choice each, so they are fixed values that still get
    # recorded in the trial's params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 holds the probability of the second class, i.e. the positive
    # label for a 0/1 target.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximizes the validation AUC returned by `objective`,
# capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the pruner
# presumably never prunes a trial here -- confirm before relying on it.
study = create_study(direction=direction, sampler=sampler, pruner=HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement -- verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [122]:
# Report the best trial's hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7981521559212863


In [123]:
# Refit a model with the best trial's hyperparameters on the training split
# (the validation rows are not added back).
xgb_optuna_8 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_8.fit(X_train, y_train)

In [124]:
# Test-set ROC-AUC of the tuned model, printed to three decimals. The decimal
# context set near the top of the notebook makes quantize() round half up.
xgb_optuna_proba_8 = xgb_optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, xgb_optuna_proba_8)
print(decimal.Decimal(auc_8).quantize(decimal.Decimal('1.000')))

0.798


In [125]:
# bootstrap_auc indexes X_train[idx] / y_train[idx] with integer positions, so
# convert the pandas objects to NumPy arrays first.
# NOTE: not idempotent -- re-running this cell fails once these are already arrays.
X_train = X_train.values
y_train = y_train.values

In [126]:
# Reset the shared accumulator: bootstrap_auc appends each resample's AUC to it.
auc_bootstrap = []

In [127]:
# 95% percentile bootstrap interval of the test AUC (2000 refits on resampled
# training rows). bootstrap_auc draws its indices from the module-level `rs`
# and appends every AUC to the module-level `auc_bootstrap`.
rs = RandomState(seed=8)
# bootstrap_auc calls clf.fit() on the estimator it receives, so hand it an
# unfitted clone: xgb_optuna_8 then stays the model fitted on the training
# split instead of ending up as the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_8), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78536176, 0.79448899])

In [128]:
# Keep this round's bootstrap AUCs. Store a copy rather than an alias:
# bootstrap_auc appends to `auc_bootstrap` in place, so an alias would silently
# grow if the bootstrap cell were re-run before the accumulator is reset.
t_8 = list(auc_bootstrap)
print(t_8)

[0.7903473054335123, 0.7917630860980615, 0.7899136821550614, 0.7883733487181763, 0.7899575331841341, 0.7929726876771213, 0.7922433341152553, 0.7916745915514388, 0.790047876870537, 0.7880603157696754, 0.7898608495899136, 0.7899662505573835, 0.786752181456615, 0.7908180435889796, 0.7869011692903318, 0.7872572607794281, 0.7885852073044192, 0.7946231770123395, 0.7923437159890361, 0.7916933471120664, 0.7898805297204312, 0.7919886811512427, 0.7920202486089185, 0.7916177965439049, 0.7913182358995168, 0.7887077788555621, 0.790056990488025, 0.794454641129518, 0.7882174926509902, 0.787337566278453, 0.7888829188090272, 0.7925428947596436, 0.7872923944352516, 0.7917020644853158, 0.7912957820593288, 0.7899408909261125, 0.7912841588949964, 0.7888867491700005, 0.7921877278404372, 0.7940086021982573, 0.7900954261791701, 0.7853463068980311, 0.7927980760493075, 0.7954053631393533, 0.7876442593191363, 0.7870810641746604, 0.7899387776235066, 0.7879454049404788, 0.7911124530582658, 0.7887711779337395, 0.78

In [129]:
# Mean test AUC over this round's bootstrap refits.
np.mean(t_8)

0.7899559789821489

In [130]:
# Release this round's splits, study and best score before the next elimination round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [32]:
# Round 9: name of the feature removed next. Because it starts with 'Cat_', the
# drop cell removes every column whose name starts with this string.
column_to_drop_8 = 'Cat_가구주 장애 여부'

In [33]:
# Round 9 of feature elimination: drop `column_to_drop_8` from comp_8.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable -- confirm against
# the encoding step); any other name is dropped as a single column.
if column_to_drop_8.startswith('Cat_'):
    # Match the prefix literally. Building a regex from the raw name would let
    # characters such as '(' or '.' act as pattern syntax and drop the wrong
    # columns (or raise on an invalid pattern).
    comp_9 = comp_8.drop(
        columns=[col for col in comp_8.columns if str(col).startswith(column_to_drop_8)]
    )
else:
    comp_9 = comp_8.drop(columns=column_to_drop_8)

# Separate the predictors from the label column.
X_9 = comp_9.drop(columns='target')
y_9 = comp_9['target']

print(X_9.shape)

(19949, 198)


In [133]:
# Stratified 64/16/20 train/validation/test split with a fixed seed: hold out
# 20% as the test set, then 20% of the remainder as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_9, y_9,
    test_size=0.2, shuffle=True, stratify=y_9, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [134]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    module-level ``X_train``/``y_train`` and scores its predicted probabilities
    on the module-level ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Source of the hyperparameter suggestions.

    Returns
    -------
    float
        ROC-AUC on the validation split.
    """
    # The suggest_* calls run in declaration order. 'booster' and 'objective'
    # have a single choice each, so they are fixed values that still get
    # recorded in the trial's params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 holds the probability of the second class, i.e. the positive
    # label for a 0/1 target.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximizes the validation AUC returned by `objective`,
# capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the pruner
# presumably never prunes a trial here -- confirm before relying on it.
study = create_study(direction=direction, sampler=sampler, pruner=HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement -- verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [135]:
# Report the best trial's hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7985913522085918


In [136]:
# Refit a model with the best trial's hyperparameters on the training split
# (the validation rows are not added back).
xgb_optuna_9 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_9.fit(X_train, y_train)

In [137]:
# Test-set ROC-AUC of the tuned model, printed to three decimals. The decimal
# context set near the top of the notebook makes quantize() round half up.
xgb_optuna_proba_9 = xgb_optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, xgb_optuna_proba_9)
print(decimal.Decimal(auc_9).quantize(decimal.Decimal('1.000')))

0.798


In [138]:
# bootstrap_auc indexes X_train[idx] / y_train[idx] with integer positions, so
# convert the pandas objects to NumPy arrays first.
# NOTE: not idempotent -- re-running this cell fails once these are already arrays.
X_train = X_train.values
y_train = y_train.values

In [139]:
# Reset the shared accumulator: bootstrap_auc appends each resample's AUC to it.
auc_bootstrap = []

In [140]:
# 95% percentile bootstrap interval of the test AUC (2000 refits on resampled
# training rows). bootstrap_auc draws its indices from the module-level `rs`
# and appends every AUC to the module-level `auc_bootstrap`.
rs = RandomState(seed=9)
# bootstrap_auc calls clf.fit() on the estimator it receives, so hand it an
# unfitted clone: xgb_optuna_9 then stays the model fitted on the training
# split instead of ending up as the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_9), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78516718, 0.79452997])

In [141]:
# Keep this round's bootstrap AUCs. Store a copy rather than an alias:
# bootstrap_auc appends to `auc_bootstrap` in place, so an alias would silently
# grow if the bootstrap cell were re-run before the accumulator is reset.
t_9 = list(auc_bootstrap)
print(t_9)

[0.7910289776053323, 0.7900484051961885, 0.7940610385191667, 0.7921179888544421, 0.7877978700023035, 0.7875747844959667, 0.7921389897990884, 0.7887122696235996, 0.7907756454554484, 0.7866566865951103, 0.7879990299941039, 0.7956874890372427, 0.7931116373234599, 0.7909872398788655, 0.7879004972601031, 0.7928424554040319, 0.7845377044884434, 0.7887010427035057, 0.787057025357518, 0.7908669137117412, 0.7905902031517795, 0.7844254352875042, 0.7875103287664864, 0.7906327333667235, 0.7937297783356897, 0.7901136534141461, 0.7922606367803412, 0.7872155230529614, 0.7904079308020194, 0.7920223619115244, 0.7903632872844695, 0.7902145636135782, 0.7861136999068034, 0.7921595944994959, 0.7912549689027522, 0.7870847624542205, 0.7843807917699543, 0.7945752314594678, 0.7892221038772763, 0.7912281563759396, 0.7880941286113701, 0.7900497260103171, 0.7873486611171341, 0.792359037432929, 0.7901298994279289, 0.7912492894019987, 0.7926704854044756, 0.7930833719011059, 0.7902643583062302, 0.7879763119910903, 0

In [142]:
# Release this round's splits, study and best score before the next elimination round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [34]:
# Round 10: name of the feature removed next. It has no 'Cat_' prefix, so the
# drop cell removes exactly this one column.
column_to_drop_9 = '부채 중 비금융기관 대출금의 비중'

In [35]:
# Round 10 of feature elimination: drop `column_to_drop_9` from comp_9.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable -- confirm against
# the encoding step); any other name is dropped as a single column.
if column_to_drop_9.startswith('Cat_'):
    # Match the prefix literally. Building a regex from the raw name would let
    # characters such as '(' or '.' act as pattern syntax and drop the wrong
    # columns (or raise on an invalid pattern).
    comp_10 = comp_9.drop(
        columns=[col for col in comp_9.columns if str(col).startswith(column_to_drop_9)]
    )
else:
    comp_10 = comp_9.drop(columns=column_to_drop_9)

# Separate the predictors from the label column.
X_10 = comp_10.drop(columns='target')
y_10 = comp_10['target']

print(X_10.shape)

(19949, 197)


In [145]:
# Stratified 64/16/20 train/validation/test split with a fixed seed: hold out
# 20% as the test set, then 20% of the remainder as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_10, y_10,
    test_size=0.2, shuffle=True, stratify=y_10, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [146]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    module-level ``X_train``/``y_train`` and scores its predicted probabilities
    on the module-level ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Source of the hyperparameter suggestions.

    Returns
    -------
    float
        ROC-AUC on the validation split.
    """
    # The suggest_* calls run in declaration order. 'booster' and 'objective'
    # have a single choice each, so they are fixed values that still get
    # recorded in the trial's params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 holds the probability of the second class, i.e. the positive
    # label for a 0/1 target.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximizes the validation AUC returned by `objective`,
# capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the pruner
# presumably never prunes a trial here -- confirm before relying on it.
study = create_study(direction=direction, sampler=sampler, pruner=HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement -- verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [147]:
# Report the best trial's hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7982891983718367


In [148]:
# Refit a model with the best trial's hyperparameters on the training split
# (the validation rows are not added back).
xgb_optuna_10 = XGBClassifier(**study.best_trial.params, random_state=0)
xgb_optuna_10.fit(X_train, y_train)

In [149]:
# Test-set ROC-AUC of the tuned model, printed to three decimals. The decimal
# context set near the top of the notebook makes quantize() round half up.
xgb_optuna_proba_10 = xgb_optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, xgb_optuna_proba_10)
print(decimal.Decimal(auc_10).quantize(decimal.Decimal('1.000')))

0.797


In [150]:
# bootstrap_auc indexes X_train[idx] / y_train[idx] with integer positions, so
# convert the pandas objects to NumPy arrays first.
# NOTE: not idempotent -- re-running this cell fails once these are already arrays.
X_train = X_train.values
y_train = y_train.values

In [151]:
# Reset the shared accumulator: bootstrap_auc appends each resample's AUC to it.
auc_bootstrap = []

In [152]:
# 95% percentile bootstrap interval of the test AUC (2000 refits on resampled
# training rows). bootstrap_auc draws its indices from the module-level `rs`
# and appends every AUC to the module-level `auc_bootstrap`.
rs = RandomState(seed=10)
# bootstrap_auc calls clf.fit() on the estimator it receives, so hand it an
# unfitted clone: xgb_optuna_10 then stays the model fitted on the training
# split instead of ending up as the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_10), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78502738, 0.79441549])

In [153]:
# Keep this round's bootstrap AUCs. Store a copy rather than an alias:
# bootstrap_auc appends to `auc_bootstrap` in place, so an alias would silently
# grow if the bootstrap cell were re-run before the accumulator is reset.
t_10 = list(auc_bootstrap)
print(t_10)

[0.7889349588856978, 0.7898349616329912, 0.7913792575122625, 0.7888781638781639, 0.789893869943131, 0.7916109283104357, 0.7893560344299261, 0.7944129034030512, 0.7909661068528064, 0.7896944270096979, 0.7922441266037324, 0.789912097178107, 0.7871185752959152, 0.7926681079390439, 0.7887814802839433, 0.7900711231992021, 0.7930068967630544, 0.7911351710612795, 0.7915464725809553, 0.789661934982132, 0.7860648297840416, 0.7905426538431464, 0.790063990802907, 0.7934677288125564, 0.794558985445685, 0.787240354358581, 0.7877602267996358, 0.7895610247826996, 0.7934904468155701, 0.7930685787828645, 0.7909219916609078, 0.7911384730966011, 0.7906847734433942, 0.7903897035670434, 0.7939485051554017, 0.7862348185624047, 0.7911700405542771, 0.7836497211497211, 0.7919258103987168, 0.7879151582969317, 0.7899831569782308, 0.7923455651288163, 0.7931861312403183, 0.7931664511098009, 0.7920973841540344, 0.7897662792982989, 0.7916143624271703, 0.7872564682909511, 0.7892860312811052, 0.7908880467378004, 0.786

In [154]:
# Release this round's splits, study and best score before the next elimination round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [36]:
# Round 11: name of the feature removed next. Because it starts with 'Cat_', the
# drop cell removes every column whose name starts with this string.
column_to_drop_10 = 'Cat_현재 주택의 구조'

In [37]:
# Round 11 of feature elimination: drop `column_to_drop_10` from comp_10.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable -- confirm against
# the encoding step); any other name is dropped as a single column.
if column_to_drop_10.startswith('Cat_'):
    # Match the prefix literally. Building a regex from the raw name would let
    # characters such as '(' or '.' act as pattern syntax and drop the wrong
    # columns (or raise on an invalid pattern).
    comp_11 = comp_10.drop(
        columns=[col for col in comp_10.columns if str(col).startswith(column_to_drop_10)]
    )
else:
    comp_11 = comp_10.drop(columns=column_to_drop_10)

# Separate the predictors from the label column.
X_11 = comp_11.drop(columns='target')
y_11 = comp_11['target']

print(X_11.shape)

(19949, 195)


In [157]:
# Stratified 64/16/20 train/validation/test split with a fixed seed: hold out
# 20% as the test set, then 20% of the remainder as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_11, y_11,
    test_size=0.2, shuffle=True, stratify=y_11, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [158]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    module-level ``X_train``/``y_train`` and scores its predicted probabilities
    on the module-level ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Source of the hyperparameter suggestions.

    Returns
    -------
    float
        ROC-AUC on the validation split.
    """
    # The suggest_* calls run in declaration order. 'booster' and 'objective'
    # have a single choice each, so they are fixed values that still get
    # recorded in the trial's params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 holds the probability of the second class, i.e. the positive
    # label for a 0/1 target.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximizes the validation AUC returned by `objective`,
# capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the pruner
# presumably never prunes a trial here -- confirm before relying on it.
study = create_study(direction=direction, sampler=sampler, pruner=HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement -- verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [159]:
# Report the best trial's hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7998036825618021


In [160]:
# Refit a model with the best trial's hyperparameters on the training split
# (the validation rows are not added back).
xgb_optuna_11 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_11.fit(X_train, y_train)

In [161]:
# Test-set ROC-AUC of the tuned model, printed to three decimals. The decimal
# context set near the top of the notebook makes quantize() round half up.
xgb_optuna_proba_11 = xgb_optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, xgb_optuna_proba_11)
print(decimal.Decimal(auc_11).quantize(decimal.Decimal('1.000')))

0.797


In [162]:
# bootstrap_auc indexes X_train[idx] / y_train[idx] with integer positions, so
# convert the pandas objects to NumPy arrays first.
# NOTE: not idempotent -- re-running this cell fails once these are already arrays.
X_train = X_train.values
y_train = y_train.values

In [163]:
# Reset the shared accumulator: bootstrap_auc appends each resample's AUC to it.
auc_bootstrap = []

In [164]:
# 95% percentile bootstrap interval of the test AUC (2000 refits on resampled
# training rows). bootstrap_auc draws its indices from the module-level `rs`
# and appends every AUC to the module-level `auc_bootstrap`.
rs = RandomState(seed=11)
# bootstrap_auc calls clf.fit() on the estimator it receives, so hand it an
# unfitted clone: xgb_optuna_11 then stays the model fitted on the training
# split instead of ending up as the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_11), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78550123, 0.79466369])

In [165]:
# Keep this round's bootstrap AUCs. Store a copy rather than an alias:
# bootstrap_auc appends to `auc_bootstrap` in place, so an alias would silently
# grow if the bootstrap cell were re-run before the accumulator is reset.
t_11 = list(auc_bootstrap)
print(t_11)

[0.795780474351903, 0.7879065730050951, 0.7860825286933661, 0.7882151151855586, 0.7903130963475792, 0.792345829291642, 0.7876725247414902, 0.7863583146834379, 0.7948236765970756, 0.7895007956584311, 0.7913179717366908, 0.7880568816529407, 0.7870896494664967, 0.7867569363874783, 0.788508335922129, 0.7928929105037479, 0.7895507224324958, 0.7893312031243066, 0.7884665981956622, 0.7891605539388791, 0.7884098031881284, 0.7911354352241051, 0.7868977351735973, 0.7929874807953624, 0.7891597614504018, 0.7922240502289764, 0.7904182331522234, 0.7929639703038717, 0.782570219762338, 0.7898904358263964, 0.7919192063280733, 0.7913445201006777, 0.7917488213054715, 0.7877330180285845, 0.7906310163083562, 0.7856337160524353, 0.787966537966538, 0.7905711834283262, 0.7879575564304628, 0.7963389145655156, 0.7914561288945525, 0.7902050537518519, 0.7906833205478525, 0.7849352695411809, 0.7930741262022052, 0.7909668993412835, 0.7924773823788602, 0.7904459702489259, 0.7873156407639167, 0.7872289953570741, 0.79

In [166]:
# Release this round's splits, study and best score before the next elimination round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [38]:
# Round 12: name of the feature removed next. Because it starts with 'Cat_', the
# drop cell removes every column whose name starts with this string.
column_to_drop_11 = 'Cat_기초생활보장 수급가구 여부'

In [39]:
# Round 12 of feature elimination: drop `column_to_drop_11` from comp_11.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable -- confirm against
# the encoding step); any other name is dropped as a single column.
if column_to_drop_11.startswith('Cat_'):
    # Match the prefix literally. Building a regex from the raw name would let
    # characters such as '(' or '.' act as pattern syntax and drop the wrong
    # columns (or raise on an invalid pattern).
    comp_12 = comp_11.drop(
        columns=[col for col in comp_11.columns if str(col).startswith(column_to_drop_11)]
    )
else:
    comp_12 = comp_11.drop(columns=column_to_drop_11)

# Separate the predictors from the label column.
X_12 = comp_12.drop(columns='target')
y_12 = comp_12['target']

print(X_12.shape)

(19949, 193)


In [169]:
# Stratified 64/16/20 train/validation/test split with a fixed seed: hold out
# 20% as the test set, then 20% of the remainder as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_12, y_12,
    test_size=0.2, shuffle=True, stratify=y_12, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [170]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    module-level ``X_train``/``y_train`` and scores its predicted probabilities
    on the module-level ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Source of the hyperparameter suggestions.

    Returns
    -------
    float
        ROC-AUC on the validation split.
    """
    # The suggest_* calls run in declaration order. 'booster' and 'objective'
    # have a single choice each, so they are fixed values that still get
    # recorded in the trial's params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 holds the probability of the second class, i.e. the positive
    # label for a 0/1 target.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximizes the validation AUC returned by `objective`,
# capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the pruner
# presumably never prunes a trial here -- confirm before relying on it.
study = create_study(direction=direction, sampler=sampler, pruner=HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement -- verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [171]:
# Report the best trial's hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7972857339221755


In [172]:
# Refit a model with the best trial's hyperparameters on the training split
# (the validation rows are not added back).
xgb_optuna_12 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_12.fit(X_train, y_train)

In [173]:
# Test-set ROC-AUC of the tuned model, printed to three decimals. The decimal
# context set near the top of the notebook makes quantize() round half up.
xgb_optuna_proba_12 = xgb_optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, xgb_optuna_proba_12)
print(decimal.Decimal(auc_12).quantize(decimal.Decimal('1.000')))

0.798


In [174]:
# bootstrap_auc indexes X_train[idx] / y_train[idx] with integer positions, so
# convert the pandas objects to NumPy arrays first.
# NOTE: not idempotent -- re-running this cell fails once these are already arrays.
X_train = X_train.values
y_train = y_train.values

In [175]:
# Reset the shared accumulator: bootstrap_auc appends each resample's AUC to it.
auc_bootstrap = []

In [176]:
# 95% percentile bootstrap interval of the test AUC (2000 refits on resampled
# training rows). bootstrap_auc draws its indices from the module-level `rs`
# and appends every AUC to the module-level `auc_bootstrap`.
rs = RandomState(seed=12)
# bootstrap_auc calls clf.fit() on the estimator it receives, so hand it an
# unfitted clone: xgb_optuna_12 then stays the model fitted on the training
# split instead of ending up as the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_12), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78545559, 0.7946048 ])

In [177]:
# Keep this round's bootstrap AUCs. Store a copy rather than an alias:
# bootstrap_auc appends to `auc_bootstrap` in place, so an alias would silently
# grow if the bootstrap cell were re-run before the accumulator is reset.
t_12 = list(auc_bootstrap)
print(t_12)

[0.7905734288123452, 0.787585615171822, 0.7934880693501383, 0.7925656127626571, 0.7902261867779108, 0.7879213661233365, 0.7895853277626677, 0.7912425532499424, 0.7940882472902178, 0.7879808027591279, 0.7899898931302872, 0.7891728375102759, 0.7919818129177736, 0.7912564217982937, 0.7943773735029893, 0.7854572552848414, 0.7898061678849856, 0.7891791774180937, 0.7901049360408965, 0.7881316397326248, 0.7902325266857286, 0.7866834991219227, 0.7924316822100075, 0.7894099236463769, 0.7915183392400142, 0.7914178252848204, 0.7913435955307877, 0.7928353230077368, 0.788955035260454, 0.792549498830287, 0.7936556806630699, 0.7868525633303959, 0.7926990149896553, 0.7881028459846193, 0.7889090709287754, 0.7881958312992796, 0.7883638388564496, 0.7904639333210762, 0.794079794079794, 0.7925915007195796, 0.790839837022103, 0.7934109338050224, 0.7889801307288992, 0.7900098374236305, 0.7897686567637308, 0.7903432109097134, 0.7919470755061888, 0.7841690652651244, 0.7902515464091817, 0.7904369887128508, 0.78

In [178]:
# Release this round's splits, study and best score before the next elimination round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [40]:
# Round 13: name of the feature removed next. It has no 'Cat_' prefix, so the
# drop cell removes exactly this one column.
column_to_drop_12 = '자산 중 부동산 자산의 비중'

In [41]:
# Round 13 of feature elimination: drop `column_to_drop_12` from comp_12.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable -- confirm against
# the encoding step); any other name is dropped as a single column.
if column_to_drop_12.startswith('Cat_'):
    # Match the prefix literally. Building a regex from the raw name would let
    # characters such as '(' or '.' act as pattern syntax and drop the wrong
    # columns (or raise on an invalid pattern).
    comp_13 = comp_12.drop(
        columns=[col for col in comp_12.columns if str(col).startswith(column_to_drop_12)]
    )
else:
    comp_13 = comp_12.drop(columns=column_to_drop_12)

# Separate the predictors from the label column.
X_13 = comp_13.drop(columns='target')
y_13 = comp_13['target']

print(X_13.shape)

(19949, 192)


In [181]:
# Stratified 64/16/20 train/validation/test split with a fixed seed: hold out
# 20% as the test set, then 20% of the remainder as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_13, y_13,
    test_size=0.2, shuffle=True, stratify=y_13, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [182]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one sampled XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    module-level ``X_train``/``y_train`` and scores its predicted probabilities
    on the module-level ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Source of the hyperparameter suggestions.

    Returns
    -------
    float
        ROC-AUC on the validation split.
    """
    # The suggest_* calls run in declaration order. 'booster' and 'objective'
    # have a single choice each, so they are fixed values that still get
    # recorded in the trial's params.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)

    # Column 1 holds the probability of the second class, i.e. the positive
    # label for a 0/1 target.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximizes the validation AUC returned by `objective`,
# capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the pruner
# presumably never prunes a trial here -- confirm before relying on it.
study = create_study(direction=direction, sampler=sampler, pruner=HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement -- verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [183]:
# Report the best trial's hyperparameters and its validation AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.796629003383545


In [184]:
# Refit a model with the best trial's hyperparameters on the training split
# (the validation rows are not added back).
xgb_optuna_13 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_13.fit(X_train, y_train)

In [185]:
# Test-set ROC-AUC of the tuned model, printed to three decimals. The decimal
# context set near the top of the notebook makes quantize() round half up.
xgb_optuna_proba_13 = xgb_optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, xgb_optuna_proba_13)
print(decimal.Decimal(auc_13).quantize(decimal.Decimal('1.000')))

0.798


In [186]:
# bootstrap_auc indexes X_train[idx] / y_train[idx] with integer positions, so
# convert the pandas objects to NumPy arrays first.
# NOTE: not idempotent -- re-running this cell fails once these are already arrays.
X_train = X_train.values
y_train = y_train.values

In [187]:
# Reset the shared accumulator: bootstrap_auc appends each resample's AUC to it.
auc_bootstrap = []

In [188]:
# 95% percentile bootstrap interval of the test AUC (2000 refits on resampled
# training rows). bootstrap_auc draws its indices from the module-level `rs`
# and appends every AUC to the module-level `auc_bootstrap`.
rs = RandomState(seed=13)
# bootstrap_auc calls clf.fit() on the estimator it receives, so hand it an
# unfitted clone: xgb_optuna_13 then stays the model fitted on the training
# split instead of ending up as the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_13), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78534479, 0.79426324])

In [189]:
# Keep this round's bootstrap AUCs. Store a copy rather than an alias:
# bootstrap_auc appends to `auc_bootstrap` in place, so an alias would silently
# grow if the bootstrap cell were re-run before the accumulator is reset.
t_13 = list(auc_bootstrap)
print(t_13)

[0.7896562554813787, 0.7893433546142906, 0.7930693712713417, 0.7915282453459793, 0.7917881815665067, 0.7888180668353082, 0.7909608235962916, 0.7917448588630855, 0.7879022143184704, 0.7891853852444985, 0.787383134365893, 0.791274384870444, 0.7937748180974782, 0.7897876764871838, 0.7902592071311283, 0.7914190140175361, 0.7917334998615787, 0.7932804373691072, 0.790006403306896, 0.7929203834376248, 0.7864494508583177, 0.7856709630108644, 0.7911549832732099, 0.7902259226150851, 0.7868441101199721, 0.7880069548788762, 0.7902745285750212, 0.7877430562159626, 0.7879257248099613, 0.7920581559744122, 0.7873415287208391, 0.7873417928836648, 0.7875716145420579, 0.7913417463910075, 0.7882510413298591, 0.7901128609256688, 0.7927473567867656, 0.7919684726950735, 0.7888517475955901, 0.790162259374082, 0.7927301862030927, 0.7920993653752274, 0.7845810271918647, 0.7922211444378932, 0.7916323254993205, 0.7911924943944648, 0.7900230455649174, 0.7891816869649381, 0.791309782689093, 0.7916933471120664, 0.78

In [190]:
# Release this round's splits, study and best score before the next elimination round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [42]:
# Step 14: feature (a single column, or a 'Cat_'-prefixed column group) removed
# from comp_13 in the following cell.
column_to_drop_13 = 'Cat_이사 예상 기간'

In [43]:
# Step 14: build comp_14 by removing one feature from comp_13. A name that
# starts with 'Cat_' is treated as a prefix shared by a group of columns; any
# other name is a single column.
if column_to_drop_13.startswith('Cat_'):
    # Literal prefix match. The former filter(regex='^' + name) interpreted
    # characters such as '(' or '.' inside the name as regex syntax, which could
    # silently select the wrong columns (or none at all).
    cols_dropped_14 = [c for c in comp_13.columns if str(c).startswith(column_to_drop_13)]
else:
    cols_dropped_14 = [column_to_drop_13]

comp_14 = comp_13.drop(columns=cols_dropped_14)
X_14 = comp_14.drop('target', axis=1)
y_14 = comp_14['target']

print(X_14.shape)

(19949, 188)


In [193]:
# Stratified 80/20 split into (train + validation) and test, followed by a second
# stratified 80/20 split of the remainder into train and validation; both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [194]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are sampled from `trial`; the model is trained on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    # dict(...) evaluates its keyword arguments left to right, so the suggest_*
    # calls run in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for roc_auc_score.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective's return value (validation ROC AUC) with a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [195]:
# Show the best trial's hyper-parameters and its objective value (validation ROC AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7977236918740844


In [196]:
# Train an XGBoost model with the best trial's hyper-parameters on the training split.
xgb_optuna_14 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_14.fit(X_train, y_train)

In [197]:
# Test-set ROC AUC of the tuned model, printed to three decimals. quantize() uses
# the decimal context's rounding mode (set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_14 = xgb_optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, xgb_optuna_proba_14)
print(decimal.Decimal(auc_14).quantize(decimal.Decimal('1.000')))

0.798


In [198]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so the training split
# is converted to NumPy arrays. getattr keeps this cell re-runnable: objects that
# are already ndarrays (which have no .values attribute) are left unchanged.
X_train = getattr(X_train, 'values', X_train)
y_train = getattr(y_train, 'values', y_train)

In [199]:
# Start from an empty list: bootstrap_auc appends every resample's AUC to this global.
auc_bootstrap = []

In [200]:
# rs is read as a global by bootstrap_auc to draw the resample indices.
rs = RandomState(seed = 14)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyper-parameters) keeps xgb_optuna_14 as the model that
# was trained on the training split instead of on the last bootstrap sample.
bootstrap_auc(sklearn.clone(xgb_optuna_14), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78492802, 0.79443643])

In [201]:
# Store a copy rather than an alias: auc_bootstrap is the list bootstrap_auc
# appends to, so an alias would grow if the bootstrap cell were run again.
t_14 = list(auc_bootstrap)
print(t_14)

[0.7894009421103018, 0.7926593905657945, 0.7887886126802384, 0.784928137144886, 0.7886665694547468, 0.7926836935457625, 0.7905870331978706, 0.7920576276487608, 0.7905372385052188, 0.7898653403579512, 0.7876825629288683, 0.7890083961512534, 0.7917846153683592, 0.7917609727954555, 0.7935205613777043, 0.7935913570150024, 0.7902449423385384, 0.7948022794081908, 0.7891137971187233, 0.7910906596251425, 0.7843327141356697, 0.7901083701576312, 0.7887783103300344, 0.7889642809593549, 0.7896688032156012, 0.7895988000667804, 0.7930170670318453, 0.7878746093031808, 0.7866078164723485, 0.7905039539891757, 0.7881028459846194, 0.7895925922403755, 0.7909468229665275, 0.7882201342792476, 0.7883324034801866, 0.7850995788187907, 0.7887194020198947, 0.789798771325865, 0.7965148470074579, 0.7923516408738083, 0.7921659344073138, 0.7873494536056113, 0.7844877777143786, 0.7917406322578736, 0.7907201712620431, 0.7872732426303856, 0.7881050913686383, 0.786397674944473, 0.7913121601545248, 0.7934011597804701, 0.

In [202]:
# Remove this step's splits, study and best score from the namespace before the next step.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [44]:
# Step 15: feature (a single column, or a 'Cat_'-prefixed column group) removed
# from comp_14 in the following cell.
column_to_drop_14 = 'Cat_현재 주변도로의 보행 안전'

In [45]:
# Step 15: build comp_15 by removing one feature from comp_14. A name that
# starts with 'Cat_' is treated as a prefix shared by a group of columns; any
# other name is a single column.
if column_to_drop_14.startswith('Cat_'):
    # Literal prefix match. The former filter(regex='^' + name) interpreted
    # characters such as '(' or '.' inside the name as regex syntax, which could
    # silently select the wrong columns (or none at all).
    cols_dropped_15 = [c for c in comp_14.columns if str(c).startswith(column_to_drop_14)]
else:
    cols_dropped_15 = [column_to_drop_14]

comp_15 = comp_14.drop(columns=cols_dropped_15)
X_15 = comp_15.drop('target', axis=1)
y_15 = comp_15['target']

print(X_15.shape)

(19949, 184)


In [205]:
# Stratified 80/20 split into (train + validation) and test, followed by a second
# stratified 80/20 split of the remainder into train and validation; both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [206]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are sampled from `trial`; the model is trained on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    # dict(...) evaluates its keyword arguments left to right, so the suggest_*
    # calls run in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for roc_auc_score.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective's return value (validation ROC AUC) with a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [207]:
# Show the best trial's hyper-parameters and its objective value (validation ROC AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7978145031364972


In [208]:
# Train an XGBoost model with the best trial's hyper-parameters on the training split.
xgb_optuna_15 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_15.fit(X_train, y_train)

In [209]:
# Test-set ROC AUC of the tuned model, printed to three decimals. quantize() uses
# the decimal context's rounding mode (set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_15 = xgb_optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, xgb_optuna_proba_15)
print(decimal.Decimal(auc_15).quantize(decimal.Decimal('1.000')))

0.799


In [210]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so the training split
# is converted to NumPy arrays. getattr keeps this cell re-runnable: objects that
# are already ndarrays (which have no .values attribute) are left unchanged.
X_train = getattr(X_train, 'values', X_train)
y_train = getattr(y_train, 'values', y_train)

In [211]:
# Start from an empty list: bootstrap_auc appends every resample's AUC to this global.
auc_bootstrap = []

In [212]:
# rs is read as a global by bootstrap_auc to draw the resample indices.
rs = RandomState(seed = 15)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyper-parameters) keeps xgb_optuna_15 as the model that
# was trained on the training split instead of on the last bootstrap sample.
bootstrap_auc(sklearn.clone(xgb_optuna_15), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78530269, 0.79432276])

In [213]:
# Store a copy rather than an alias: auc_bootstrap is the list bootstrap_auc
# appends to, so an alias would grow if the bootstrap cell were run again.
t_15 = list(auc_bootstrap)
print(t_15)

[0.7896414623631373, 0.7913433313679619, 0.7875930117309428, 0.7871032538520224, 0.7929396673239038, 0.7896487268408451, 0.7914928475273303, 0.7890625495305299, 0.7906234876678226, 0.7899177766788603, 0.7913034427812753, 0.7877343388427132, 0.7944984921585907, 0.7910604129815952, 0.7924081717185165, 0.7903078130910643, 0.7874091544042283, 0.7860539991081863, 0.7881268848017616, 0.784980705547208, 0.7863583146834379, 0.7951604841998929, 0.7888948061361855, 0.7908233268454943, 0.7902317341972513, 0.793045860779851, 0.7881583201780247, 0.7886208692858939, 0.7873726999342763, 0.788078014679, 0.7897057860112047, 0.788162943027475, 0.7887347234637875, 0.7899564765328313, 0.7889584693771885, 0.7944556977808209, 0.7936922672144346, 0.7951528234779467, 0.7888480493160296, 0.79003374415936, 0.7897842423704493, 0.7870689126846763, 0.7905077843501489, 0.7913052919210555, 0.7883812736029485, 0.7882357198859662, 0.7907489650100488, 0.7880795996559544, 0.7936659830132737, 0.791058563841815, 0.7919133

In [214]:
# Remove this step's splits, study and best score from the namespace before the next step.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [46]:
# Step 16: feature (a single column, or a 'Cat_'-prefixed column group) removed
# from comp_15 in the following cell.
column_to_drop_15 = 'Cat_현재 공공기관 접근용이성'

In [47]:
# Step 16: build comp_16 by removing one feature from comp_15. A name that
# starts with 'Cat_' is treated as a prefix shared by a group of columns; any
# other name is a single column.
if column_to_drop_15.startswith('Cat_'):
    # Literal prefix match. The former filter(regex='^' + name) interpreted
    # characters such as '(' or '.' inside the name as regex syntax, which could
    # silently select the wrong columns (or none at all).
    cols_dropped_16 = [c for c in comp_15.columns if str(c).startswith(column_to_drop_15)]
else:
    cols_dropped_16 = [column_to_drop_15]

comp_16 = comp_15.drop(columns=cols_dropped_16)
X_16 = comp_16.drop('target', axis=1)
y_16 = comp_16['target']

print(X_16.shape)

(19949, 180)


In [217]:
# Stratified 80/20 split into (train + validation) and test, followed by a second
# stratified 80/20 split of the remainder into train and validation; both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [218]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are sampled from `trial`; the model is trained on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    # dict(...) evaluates its keyword arguments left to right, so the suggest_*
    # calls run in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for roc_auc_score.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective's return value (validation ROC AUC) with a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [219]:
# Show the best trial's hyper-parameters and its objective value (validation ROC AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7980274968247016


In [220]:
# Train an XGBoost model with the best trial's hyper-parameters on the training split.
xgb_optuna_16 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_16.fit(X_train, y_train)

In [221]:
# Test-set ROC AUC of the tuned model, printed to three decimals. quantize() uses
# the decimal context's rounding mode (set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_16 = xgb_optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, xgb_optuna_proba_16)
print(decimal.Decimal(auc_16).quantize(decimal.Decimal('1.000')))

0.797


In [222]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so the training split
# is converted to NumPy arrays. getattr keeps this cell re-runnable: objects that
# are already ndarrays (which have no .values attribute) are left unchanged.
X_train = getattr(X_train, 'values', X_train)
y_train = getattr(y_train, 'values', y_train)

In [223]:
# Start from an empty list: bootstrap_auc appends every resample's AUC to this global.
auc_bootstrap = []

In [224]:
# rs is read as a global by bootstrap_auc to draw the resample indices.
rs = RandomState(seed = 16)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyper-parameters) keeps xgb_optuna_16 as the model that
# was trained on the training split instead of on the last bootstrap sample.
bootstrap_auc(sklearn.clone(xgb_optuna_16), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78540031, 0.79446492])

In [225]:
# Mean ROC AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7900094307449603

In [226]:
# Store a copy rather than an alias: auc_bootstrap is the list bootstrap_auc
# appends to, so an alias would grow if the bootstrap cell were run again.
t_16 = list(auc_bootstrap)
print(t_16)

[0.7884494276119893, 0.7873190748806512, 0.7888438227108178, 0.7896513684691024, 0.7920026817810069, 0.7910598846559438, 0.7899393059491581, 0.7876874499411445, 0.7904536309708723, 0.7942972000853774, 0.7875544439583849, 0.7885931321891912, 0.7900198756110086, 0.7918600338551078, 0.7920227581557632, 0.7916830447618625, 0.7922454474178612, 0.7898962474085627, 0.7899577973469599, 0.793895672590254, 0.7913705401390131, 0.7922937892149715, 0.7927943777697473, 0.7867902209035214, 0.7908385162079743, 0.7886969481797068, 0.7928495878003267, 0.7899684959414024, 0.7913504637642569, 0.7884968448392093, 0.790300152369118, 0.7865868155277022, 0.7863390307971588, 0.7898941341059568, 0.7908533093262158, 0.7881276772902388, 0.7959038383915231, 0.7864848486769669, 0.7927970193980046, 0.7896278579776116, 0.7905146525836181, 0.7899461741826275, 0.7917784075419543, 0.7882534187952908, 0.7894680394680395, 0.7874878749262986, 0.7921593303366703, 0.7890398315275163, 0.7889896405906258, 0.7910104862075306, 0

In [227]:
# Remove this step's splits, study and best score from the namespace before the next step.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [48]:
# Step 17: feature (a single column, or a 'Cat_'-prefixed column group) removed
# from comp_16 in the following cell.
column_to_drop_16 = 'Cat_현재 의료시설 접근용이성'

In [49]:
# Step 17: build comp_17 by removing one feature from comp_16. A name that
# starts with 'Cat_' is treated as a prefix shared by a group of columns; any
# other name is a single column.
if column_to_drop_16.startswith('Cat_'):
    # Literal prefix match. The former filter(regex='^' + name) interpreted
    # characters such as '(' or '.' inside the name as regex syntax, which could
    # silently select the wrong columns (or none at all).
    cols_dropped_17 = [c for c in comp_16.columns if str(c).startswith(column_to_drop_16)]
else:
    cols_dropped_17 = [column_to_drop_16]

comp_17 = comp_16.drop(columns=cols_dropped_17)
X_17 = comp_17.drop('target', axis=1)
y_17 = comp_17['target']

print(X_17.shape)

(19949, 176)


In [230]:
# Stratified 80/20 split into (train + validation) and test, followed by a second
# stratified 80/20 split of the remainder into train and validation; both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [231]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are sampled from `trial`; the model is trained on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    # dict(...) evaluates its keyword arguments left to right, so the suggest_*
    # calls run in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for roc_auc_score.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective's return value (validation ROC AUC) with a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [232]:
# Show the best trial's hyper-parameters and its objective value (validation ROC AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7968407587363531


In [233]:
# Train an XGBoost model with the best trial's hyper-parameters on the training split.
xgb_optuna_17 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_17.fit(X_train, y_train)

In [234]:
# Test-set ROC AUC of the tuned model, printed to three decimals. quantize() uses
# the decimal context's rounding mode (set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_17 = xgb_optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, xgb_optuna_proba_17)
print(decimal.Decimal(auc_17).quantize(decimal.Decimal('1.000')))

0.798


In [235]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so the training split
# is converted to NumPy arrays. getattr keeps this cell re-runnable: objects that
# are already ndarrays (which have no .values attribute) are left unchanged.
X_train = getattr(X_train, 'values', X_train)
y_train = getattr(y_train, 'values', y_train)

In [236]:
# Start from an empty list: bootstrap_auc appends every resample's AUC to this global.
auc_bootstrap = []

In [237]:
# rs is read as a global by bootstrap_auc to draw the resample indices.
rs = RandomState(seed = 17)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyper-parameters) keeps xgb_optuna_17 as the model that
# was trained on the training split instead of on the last bootstrap sample.
bootstrap_auc(sklearn.clone(xgb_optuna_17), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78509529, 0.79431427])

In [238]:
# Mean ROC AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7898486734667461

In [239]:
# Store a copy rather than an alias: auc_bootstrap is the list bootstrap_auc
# appends to, so an alias would grow if the bootstrap cell were run again.
t_17 = list(auc_bootstrap)
print(t_17)

[0.7908435353016634, 0.7916917621351118, 0.7862333656668632, 0.7903429467468877, 0.7891391567499942, 0.7902000346581628, 0.790067028675403, 0.7900808972237544, 0.7912341000395188, 0.786345899030628, 0.791237798319079, 0.7879673304550152, 0.7884203697011578, 0.788774612050474, 0.7891186841309994, 0.7889748474723844, 0.7915794929341728, 0.7916619117358035, 0.7869695874621983, 0.7921651419188365, 0.7889080142774725, 0.7854960872202251, 0.7897226924320522, 0.7901376922312884, 0.7908445919529663, 0.7892596149985313, 0.7896923137070919, 0.7923434518262105, 0.7902539238746135, 0.7893224857510571, 0.791646061966259, 0.7941460989490547, 0.7905209924914358, 0.7891956875947024, 0.7898475093672137, 0.7892435010661611, 0.7911293594791132, 0.7929563095819253, 0.7955501243678583, 0.788596566305926, 0.7868694697512432, 0.7923083181703872, 0.7910115428588335, 0.7898146210954092, 0.791487035945164, 0.7883202519902027, 0.7883392717136559, 0.7880375977666618, 0.7919931719192803, 0.7904845380214838, 0.7920

In [240]:
# Remove this step's splits, study and best score from the namespace before the next step.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [50]:
# Step 18: feature (a single column, or a 'Cat_'-prefixed column group) removed
# from comp_17 in the following cell.
column_to_drop_17 ='부채 중 금융기관 대출금의 비중'

In [51]:
# Step 18: build comp_18 by removing one feature from comp_17. A name that
# starts with 'Cat_' is treated as a prefix shared by a group of columns; any
# other name is a single column.
if column_to_drop_17.startswith('Cat_'):
    # Literal prefix match. The former filter(regex='^' + name) interpreted
    # characters such as '(' or '.' inside the name as regex syntax, which could
    # silently select the wrong columns (or none at all).
    cols_dropped_18 = [c for c in comp_17.columns if str(c).startswith(column_to_drop_17)]
else:
    cols_dropped_18 = [column_to_drop_17]

comp_18 = comp_17.drop(columns=cols_dropped_18)
X_18 = comp_18.drop('target', axis=1)
y_18 = comp_18['target']

print(X_18.shape)

(19949, 175)


In [243]:
# Stratified 80/20 split into (train + validation) and test, followed by a second
# stratified 80/20 split of the remainder into train and validation; both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [244]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are sampled from `trial`; the model is trained on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    # dict(...) evaluates its keyword arguments left to right, so the suggest_*
    # calls run in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for roc_auc_score.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective's return value (validation ROC AUC) with a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [245]:
# Show the best trial's hyper-parameters and its objective value (validation ROC AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7974974892749835


In [246]:
# Train an XGBoost model with the best trial's hyper-parameters on the training split.
xgb_optuna_18 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_18.fit(X_train, y_train)

In [247]:
# Test-set ROC AUC of the tuned model, printed to three decimals. quantize() uses
# the decimal context's rounding mode (set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_18 = xgb_optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, xgb_optuna_proba_18)
print(decimal.Decimal(auc_18).quantize(decimal.Decimal('1.000')))

0.798


In [248]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so the training split
# is converted to NumPy arrays. getattr keeps this cell re-runnable: objects that
# are already ndarrays (which have no .values attribute) are left unchanged.
X_train = getattr(X_train, 'values', X_train)
y_train = getattr(y_train, 'values', y_train)

In [249]:
# Start from an empty list: bootstrap_auc appends every resample's AUC to this global.
auc_bootstrap = []

In [250]:
# rs is read as a global by bootstrap_auc to draw the resample indices.
rs = RandomState(seed = 18)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyper-parameters) keeps xgb_optuna_18 as the model that
# was trained on the training split instead of on the last bootstrap sample.
bootstrap_auc(sklearn.clone(xgb_optuna_18), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.784881  , 0.79450343])

In [251]:
# Mean ROC AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7897990759716437

In [252]:
# Store a copy rather than an alias: auc_bootstrap is the list bootstrap_auc
# appends to, so an alias would grow if the bootstrap cell were run again.
t_18 = list(auc_bootstrap)
print(t_18)

[0.7933195334673167, 0.7901870906797015, 0.7905215208170873, 0.7886904761904763, 0.7881783965527807, 0.7905060672917815, 0.7933359115625125, 0.7906340541808521, 0.7920021534553554, 0.7907209637505204, 0.7909476154550046, 0.7946902743700773, 0.790951313734565, 0.7915608694549583, 0.784670974634029, 0.7914862434566868, 0.789199121711437, 0.7904997273839638, 0.7867961645671004, 0.7911510208308238, 0.789302409376301, 0.7897786949511088, 0.7925732734846035, 0.7931103165093312, 0.7879596697330686, 0.7939717514840667, 0.7885181099466814, 0.7921211588083509, 0.7897730154503553, 0.7916830447618624, 0.789145496657812, 0.7881531690029228, 0.7877805673372176, 0.7899620239521719, 0.7908678382816314, 0.7877076583973135, 0.7911758521364433, 0.7900531601270517, 0.7887780461672087, 0.7869217739907395, 0.7864196004590093, 0.788188302658746, 0.7901685992818999, 0.7937939699023443, 0.7888282371040991, 0.7861319271417795, 0.792328658707969, 0.7888203122193271, 0.7944543769666923, 0.7901770524923235, 0.7899

In [253]:
# Remove this step's splits, study and best score from the namespace before the next step.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [52]:
# Step 19: feature (a single column, or a 'Cat_'-prefixed column group) removed
# from comp_18 in the following cell.
column_to_drop_18 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [53]:
# Step 19: build comp_19 by removing one feature from comp_18. A name that
# starts with 'Cat_' is treated as a prefix shared by a group of columns; any
# other name is a single column.
if column_to_drop_18.startswith('Cat_'):
    # Literal prefix match. The former filter(regex='^' + name) interpreted
    # characters such as '(' or '.' inside the name as regex syntax, which could
    # silently select the wrong columns (or none at all).
    cols_dropped_19 = [c for c in comp_18.columns if str(c).startswith(column_to_drop_18)]
else:
    cols_dropped_19 = [column_to_drop_18]

comp_19 = comp_18.drop(columns=cols_dropped_19)
X_19 = comp_19.drop('target', axis=1)
y_19 = comp_19['target']

print(X_19.shape)

(19949, 171)


In [256]:
# Stratified 80/20 split into (train + validation) and test, followed by a second
# stratified 80/20 split of the remainder into train and validation; both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [257]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are sampled from `trial`; the model is trained on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    # dict(...) evaluates its keyword arguments left to right, so the suggest_*
    # calls run in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for roc_auc_score.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective's return value (validation ROC AUC) with a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [258]:
# Show the best trial's hyper-parameters and its objective value (validation ROC AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7974132824680189


In [259]:
# Train an XGBoost model with the best trial's hyper-parameters on the training split.
xgb_optuna_19 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_19.fit(X_train, y_train)

In [260]:
# Test-set ROC AUC of the tuned model, printed to three decimals. quantize() uses
# the decimal context's rounding mode (set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_19 = xgb_optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, xgb_optuna_proba_19)
print(decimal.Decimal(auc_19).quantize(decimal.Decimal('1.000')))

0.798


In [261]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so the training split
# is converted to NumPy arrays. getattr keeps this cell re-runnable: objects that
# are already ndarrays (which have no .values attribute) are left unchanged.
X_train = getattr(X_train, 'values', X_train)
y_train = getattr(y_train, 'values', y_train)

In [262]:
# Start from an empty list: bootstrap_auc appends every resample's AUC to this global.
auc_bootstrap = []

In [263]:
# rs is read as a global by bootstrap_auc to draw the resample indices.
rs = RandomState(seed = 19)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyper-parameters) keeps xgb_optuna_19 as the model that
# was trained on the training split instead of on the last bootstrap sample.
bootstrap_auc(sklearn.clone(xgb_optuna_19), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7851244 , 0.79434028])

In [264]:
# Mean ROC AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7898254405443444

In [265]:
# Store a copy rather than an alias: auc_bootstrap is the list bootstrap_auc
# appends to, so an alias would grow if the bootstrap cell were run again.
t_19 = list(auc_bootstrap)
print(t_19)

[0.786187005090946, 0.7828272501917822, 0.7910654320752843, 0.7902076953801093, 0.7863868442686177, 0.7901062568550254, 0.7926979583383524, 0.7865619842220828, 0.7903996096730087, 0.7907111897259681, 0.7906966607705525, 0.7878434380897434, 0.7890839467194146, 0.7884029349546591, 0.7908696874214116, 0.7920902517577395, 0.7912572142867709, 0.7899432683915443, 0.787364907130917, 0.7902086199499992, 0.7869577001350401, 0.7855831288713062, 0.7894215468107093, 0.7888842396231559, 0.7851244101244101, 0.7919686047764866, 0.790402383382679, 0.7925561029009304, 0.7867029150896147, 0.7904554801106526, 0.789027944200358, 0.7845974052870605, 0.7927177705502828, 0.7905186150260043, 0.7846016318922723, 0.792374226795409, 0.7896030266719921, 0.7880442018373053, 0.7893227499138828, 0.7882857787414437, 0.7909610877591173, 0.7910709794946248, 0.7925027420101312, 0.7929339878231503, 0.7890414165044708, 0.7981963754747006, 0.7916103999847842, 0.7916217589862911, 0.7912952537336775, 0.7883672729731843, 0.78

In [54]:
# Step 20: feature (a single column, or a 'Cat_'-prefixed column group) removed
# from comp_19 in the following cell.
column_to_drop_19 = '소득 중 정부 보조금의 비중(월평균)'

In [55]:
# Step 20: build comp_20 by removing one feature from comp_19. A name that
# starts with 'Cat_' is treated as a prefix shared by a group of columns; any
# other name is a single column.
if column_to_drop_19.startswith('Cat_'):
    # Literal prefix match. The former filter(regex='^' + name) interpreted
    # characters such as '(' or '.' inside the name as regex syntax, which could
    # silently select the wrong columns (or none at all).
    cols_dropped_20 = [c for c in comp_19.columns if str(c).startswith(column_to_drop_19)]
else:
    cols_dropped_20 = [column_to_drop_19]

comp_20 = comp_19.drop(columns=cols_dropped_20)
X_20 = comp_20.drop('target', axis=1)
y_20 = comp_20['target']

print(X_20.shape)

(19949, 170)


In [268]:
# Stratified 80/20 split into (train + validation) and test, followed by a second
# stratified 80/20 split of the remainder into train and validation; both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [269]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are sampled from `trial`; the model is trained on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    # dict(...) evaluates its keyword arguments left to right, so the suggest_*
    # calls run in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for roc_auc_score.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective's return value (validation ROC AUC) with a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [270]:
# Show the best trial's hyper-parameters and its objective value (validation ROC AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7973567318182437


In [271]:
# Train an XGBoost model with the best trial's hyper-parameters on the training split.
xgb_optuna_20 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_20.fit(X_train, y_train)

In [272]:
# Test-set ROC AUC of the tuned model, printed to three decimals. quantize() uses
# the decimal context's rounding mode (set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_20 = xgb_optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, xgb_optuna_proba_20)
print(decimal.Decimal(auc_20).quantize(decimal.Decimal('1.000')))

0.796


In [273]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so the training split
# is converted to NumPy arrays. getattr keeps this cell re-runnable: objects that
# are already ndarrays (which have no .values attribute) are left unchanged.
X_train = getattr(X_train, 'values', X_train)
y_train = getattr(y_train, 'values', y_train)

In [274]:
# Start from an empty list: bootstrap_auc appends every resample's AUC to this global.
auc_bootstrap = []

In [275]:
# rs is read as a global by bootstrap_auc to draw the resample indices.
rs = RandomState(seed = 20)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyper-parameters) keeps xgb_optuna_20 as the model that
# was trained on the training split instead of on the last bootstrap sample.
bootstrap_auc(sklearn.clone(xgb_optuna_20), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7850871 , 0.79435078])

In [276]:
# Mean ROC AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7898769977974104

In [277]:
# Store a copy rather than an alias: auc_bootstrap is the list bootstrap_auc
# appends to, so an alias would grow if the bootstrap cell were run again.
t_20 = list(auc_bootstrap)
print(t_20)

[0.7909551440955381, 0.7889710171114113, 0.7904413473994756, 0.7868327511184654, 0.7909822207851764, 0.7925974443831587, 0.7869698516250241, 0.7908873863307361, 0.7857615708600929, 0.7912648750087173, 0.7917266316281095, 0.7900526318014003, 0.790436724550025, 0.7894742472944445, 0.7882320216064059, 0.7882740234956984, 0.7920792890004713, 0.7931710739592513, 0.7871785402573581, 0.7905223133055646, 0.7947954111747215, 0.792451230259112, 0.7929629136525689, 0.7879385367070095, 0.7892696531859092, 0.7880677123287962, 0.7890155285475483, 0.7886413419048888, 0.7926691645903468, 0.7859039546231665, 0.7898164702351894, 0.7908708761541274, 0.7900258192745878, 0.791472771152574, 0.7856688497082586, 0.7935554308707018, 0.7898988890368199, 0.7882697968904865, 0.7906026188045892, 0.789848301855691, 0.7910273926283778, 0.789952646171858, 0.7885146758299468, 0.7903886469157406, 0.7877998512234965, 0.7864800937461036, 0.7917308582333213, 0.7874284382905072, 0.7943333903925037, 0.7904471589816418, 0.78

In [278]:
# Remove this step's splits, study and best score from the namespace before the next step.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [56]:
# Step 21: feature (a single column, or a 'Cat_'-prefixed column group) removed
# from comp_20 in the following cell.
column_to_drop_20 = 'Cat_현재 상업시설 접근용이성'

In [57]:
# Step 21: build comp_21 by removing one feature from comp_20. A name that
# starts with 'Cat_' is treated as a prefix shared by a group of columns; any
# other name is a single column.
if column_to_drop_20.startswith('Cat_'):
    # Literal prefix match. The former filter(regex='^' + name) interpreted
    # characters such as '(' or '.' inside the name as regex syntax, which could
    # silently select the wrong columns (or none at all).
    cols_dropped_21 = [c for c in comp_20.columns if str(c).startswith(column_to_drop_20)]
else:
    cols_dropped_21 = [column_to_drop_20]

comp_21 = comp_20.drop(columns=cols_dropped_21)
X_21 = comp_21.drop('target', axis=1)
y_21 = comp_21['target']

print(X_21.shape)

(19949, 166)


In [281]:
# Stratified 80/20 split into (train + validation) and test, followed by a second
# stratified 80/20 split of the remainder into train and validation; both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [282]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are sampled from `trial`; the model is trained on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    # dict(...) evaluates its keyword arguments left to right, so the suggest_*
    # calls run in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that are still recorded in the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for roc_auc_score.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective's return value (validation ROC AUC) with a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner presumably never prunes a trial — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [283]:
# Show the best trial's hyper-parameters and its objective value (validation ROC AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7970975069419017


In [284]:
# Train an XGBoost model with the best trial's hyper-parameters on the training split.
xgb_optuna_21 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_21.fit(X_train, y_train)

In [285]:
# Test-set ROC AUC of the tuned model, printed to three decimals. quantize() uses
# the decimal context's rounding mode (set to ROUND_HALF_UP near the top of the notebook).
xgb_optuna_proba_21 = xgb_optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, xgb_optuna_proba_21)
print(decimal.Decimal(auc_21).quantize(decimal.Decimal('1.000')))

0.796


In [286]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so the training split
# is converted to NumPy arrays. getattr keeps this cell re-runnable: objects that
# are already ndarrays (which have no .values attribute) are left unchanged.
X_train = getattr(X_train, 'values', X_train)
y_train = getattr(y_train, 'values', y_train)

In [287]:
# Start from an empty list: bootstrap_auc appends every resample's AUC to this global.
auc_bootstrap = []

In [288]:
# rs is read as a global by bootstrap_auc to draw the resample indices.
rs = RandomState(seed = 21)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyper-parameters) keeps xgb_optuna_21 as the model that
# was trained on the training split instead of on the last bootstrap sample.
bootstrap_auc(sklearn.clone(xgb_optuna_21), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78532055, 0.79439896])

In [289]:
# Mean ROC AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7899762495165821

In [290]:
# Store a copy rather than an alias: auc_bootstrap is the list bootstrap_auc
# appends to, so an alias would grow if the bootstrap cell were run again.
t_21 = list(auc_bootstrap)
print(t_21)

[0.7863620129629982, 0.7892110090385952, 0.7924557210271497, 0.7892666153134134, 0.7934825219307979, 0.7854305748394419, 0.7898103944901975, 0.7911968530810896, 0.7908613662924007, 0.7925550462496276, 0.7852472458383789, 0.7900243663790462, 0.7870369489827618, 0.7900048183299415, 0.7856953980722454, 0.7862164592460159, 0.7910501106313914, 0.7903494187361183, 0.7892193301676059, 0.7889254490239712, 0.7930891834832722, 0.7921183850986806, 0.7925183276168497, 0.7945146060909607, 0.7904475552258804, 0.7908408936734059, 0.7889711491928242, 0.7924702499825653, 0.7940055643257613, 0.7935229388431358, 0.7899142104807129, 0.7887143829262055, 0.7867918058804758, 0.7869759273700161, 0.7865845701436834, 0.7902684528300292, 0.7876611657399836, 0.7927359977852588, 0.7894640770256534, 0.7890009995921325, 0.7951441061046972, 0.7885455828805583, 0.7898753785453292, 0.7853795914140742, 0.7923545466648915, 0.7907804003863117, 0.7886832117127685, 0.7890431335628381, 0.7893261840306176, 0.7908871221679104,

In [291]:
# Remove this step's splits, study and best score from the namespace before the next step.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [58]:
# Step 22: feature (a single column, or a 'Cat_'-prefixed column group) removed
# from comp_21 in the following cell.
column_to_drop_21 = 'Cat_현재 주차시설 이용편의성'

In [59]:
# Step 22: build comp_22 by removing one feature from comp_21. A name that
# starts with 'Cat_' is treated as a prefix shared by a group of columns; any
# other name is a single column.
if column_to_drop_21.startswith('Cat_'):
    # Literal prefix match. The former filter(regex='^' + name) interpreted
    # characters such as '(' or '.' inside the name as regex syntax, which could
    # silently select the wrong columns (or none at all).
    cols_dropped_22 = [c for c in comp_21.columns if str(c).startswith(column_to_drop_21)]
else:
    cols_dropped_22 = [column_to_drop_21]

comp_22 = comp_21.drop(columns=cols_dropped_22)
X_22 = comp_22.drop('target', axis=1)
y_22 = comp_22['target']

print(X_22.shape)

(19949, 162)


In [294]:
# Stratified 80/20 split into (train + validation) and test, followed by a second
# stratified 80/20 split of the remainder into train and validation; both use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [295]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are drawn from ``trial``; the model is trained on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object providing ``suggest_int`` / ``suggest_float`` /
        ``suggest_categorical`` (an Optuna trial).

    Returns
    -------
    ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # Keep the suggestion order fixed: it is the order in which the parameters
    # are requested from the trial.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values still requested through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search for this stage: maximise the validation AUC returned by
# objective with a fixed-seed TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): a HyperbandPruner is attached, but objective as written above never
# reports intermediate values, so the pruner appears to have nothing to act on -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [296]:
# Show the best trial's hyper-parameters and keep its objective value
# (the validation ROC AUC returned by objective) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7982301710512684


In [297]:
# Fit a classifier with the best trial's hyper-parameters on the training split
# (the validation rows are not added back).
xgb_optuna_22 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_22.fit(X_train, y_train)

In [298]:
# Test-set ROC AUC of the stage-22 model, printed to three decimals. quantize() uses the
# decimal context, whose rounding was set to ROUND_HALF_UP at the top of the notebook.
xgb_optuna_proba_22 = xgb_optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, xgb_optuna_proba_22)
print(decimal.Decimal(auc_22).quantize(decimal.Decimal('1.000')))

0.797


In [299]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell re-runnable: an array that is already converted passes
# through unchanged, whereas a second `.values` access on an ndarray would raise.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [300]:
# Reset the global list that bootstrap_auc appends each replicate's ROC AUC to.
auc_bootstrap = []

In [301]:
# Bootstrap the test ROC AUC for stage 22 (bootstrap_auc returns the 2.5/97.5 percentiles).
rs = RandomState(seed = 22)  # global generator that bootstrap_auc draws resample indices from
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone with the same hyper-parameters so xgb_optuna_22 keeps its original fit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_22), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7858127 , 0.79471413])

In [302]:
# Mean ROC AUC over the bootstrap replicates currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7903337526257785

In [303]:
# Keep this stage's bootstrap AUCs under a stage-specific name; t_22 is bound to
# the same list object as auc_bootstrap (no copy is made).
t_22 = auc_bootstrap
print(t_22)

[0.7897771099741544, 0.7905040860705885, 0.7938679354935512, 0.7898236026314844, 0.794119418503655, 0.789669067378427, 0.789687294613403, 0.7886293224963176, 0.7922409566498236, 0.7893749220719664, 0.7925666694139601, 0.7926570131003629, 0.7906292992499888, 0.7877200740501232, 0.7868487329694226, 0.7929275158339198, 0.7895361934770802, 0.7888329920349625, 0.7900838030148375, 0.7920771756978654, 0.7904412153180627, 0.7912783473128302, 0.7930086138214218, 0.7893578835697062, 0.7921955206437965, 0.7892646340922203, 0.789377431618811, 0.7889679792389153, 0.79240051099657, 0.7898683782304473, 0.7944551694551695, 0.795033950206364, 0.7893162779246523, 0.7898922849661765, 0.7913879748855118, 0.7887217794853263, 0.7898949265944339, 0.7904647258095534, 0.7921289516117103, 0.7920047950836128, 0.7944995488098936, 0.7932904755564854, 0.7940449245867964, 0.7839768868093991, 0.7934892580828541, 0.7909987309617852, 0.7885593193474967, 0.7930005568552366, 0.7916122491245643, 0.7893553740228616, 0.7890

In [304]:
# Remove this stage's splits and Optuna study from the namespace; the next stage recreates them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [60]:
# Stage 23: feature removed from comp_22 in this step
# (name is roughly "share of earned/business income in income, monthly average").
column_to_drop_22 = '소득 중 근로/사업소득의 비중(월평균)'

In [61]:
# Stage 23: derive comp_23 / X_23 / y_23 by removing column_to_drop_22 from comp_22.
# A 'Cat_' name acts as a prefix: every column whose name starts with it is removed
# (presumably that variable's dummy columns -- confirm); other names drop one column.
if column_to_drop_22.startswith('Cat_'):
    # Plain string prefix test, so the name is never parsed as a regular expression.
    dropped_cols_23 = [col for col in comp_22.columns if str(col).startswith(column_to_drop_22)]
else:
    dropped_cols_23 = [column_to_drop_22]

comp_23 = comp_22.drop(columns=dropped_cols_23)
X_23 = comp_23.drop(columns='target')
y_23 = comp_23['target']

print(X_23.shape)

(19949, 161)


In [307]:
# Stratified 80/20 split into train+validation and test, then a second stratified
# 80/20 split of the remainder into train and validation (about 64/16/20 overall).
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [308]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are drawn from ``trial``; the model is trained on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object providing ``suggest_int`` / ``suggest_float`` /
        ``suggest_categorical`` (an Optuna trial).

    Returns
    -------
    ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # Keep the suggestion order fixed: it is the order in which the parameters
    # are requested from the trial.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values still requested through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search for this stage: maximise the validation AUC returned by
# objective with a fixed-seed TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): a HyperbandPruner is attached, but objective as written above never
# reports intermediate values, so the pruner appears to have nothing to act on -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [309]:
# Show the best trial's hyper-parameters and keep its objective value
# (the validation ROC AUC returned by objective) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.798531912109558


In [310]:
# Fit a classifier with the best trial's hyper-parameters on the training split
# (the validation rows are not added back).
xgb_optuna_23 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_23.fit(X_train, y_train)

In [311]:
# Test-set ROC AUC of the stage-23 model, printed to three decimals. quantize() uses the
# decimal context, whose rounding was set to ROUND_HALF_UP at the top of the notebook.
xgb_optuna_proba_23 = xgb_optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, xgb_optuna_proba_23)
print(decimal.Decimal(auc_23).quantize(decimal.Decimal('1.000')))

0.798


In [312]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell re-runnable: an array that is already converted passes
# through unchanged, whereas a second `.values` access on an ndarray would raise.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [313]:
# Reset the global list that bootstrap_auc appends each replicate's ROC AUC to.
auc_bootstrap = []

In [314]:
# Bootstrap the test ROC AUC for stage 23 (bootstrap_auc returns the 2.5/97.5 percentiles).
rs = RandomState(seed = 23)  # global generator that bootstrap_auc draws resample indices from
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone with the same hyper-parameters so xgb_optuna_23 keeps its original fit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_23), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78561824, 0.79511347])

In [315]:
# Mean ROC AUC over the bootstrap replicates currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7902640466601365

In [316]:
# Keep this stage's bootstrap AUCs under a stage-specific name; t_23 is bound to
# the same list object as auc_bootstrap (no copy is made).
t_23 = auc_bootstrap
print(t_23)

[0.7913649927196725, 0.7887854427263294, 0.7874121922767243, 0.7904636691582505, 0.7877047526062304, 0.7936249056938712, 0.7909985988803723, 0.7894310566724361, 0.7879690475133825, 0.7929230250658822, 0.7929680648276707, 0.7846507661778598, 0.7925386681544315, 0.792638653783974, 0.7929666119321293, 0.7896859737992743, 0.7886142652152506, 0.7895330235231713, 0.79373347661525, 0.7919435093080414, 0.7893449395912451, 0.7843144869006937, 0.7901593535829989, 0.7896056683002496, 0.791196456836851, 0.7879348384274492, 0.7884068973970453, 0.7903133605104048, 0.7905764666848412, 0.7892881445837111, 0.7905763346034282, 0.791387842804099, 0.7897676001124277, 0.791541717650092, 0.791218646514213, 0.7919366410745722, 0.7878665523369957, 0.7879895201323773, 0.7946567256912085, 0.7896180839530594, 0.791567077281363, 0.788636983218264, 0.7895619493525897, 0.7884893161986759, 0.7908760273292293, 0.7900165735756868, 0.7904597067158643, 0.7886863816666773, 0.7913998622126701, 0.7890079999070146, 0.785388

In [317]:
# Remove this stage's splits and Optuna study from the namespace; the next stage recreates them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [62]:
# Stage 24: feature removed from comp_23 in this step
# (name is roughly "current level of car-horn / neighbourhood noise").
column_to_drop_23 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [63]:
# Stage 24: derive comp_24 / X_24 / y_24 by removing column_to_drop_23 from comp_23.
# A 'Cat_' name acts as a prefix: every column whose name starts with it is removed
# (presumably that variable's dummy columns -- confirm); other names drop one column.
if column_to_drop_23.startswith('Cat_'):
    # Plain string prefix test, so the name is never parsed as a regular expression.
    dropped_cols_24 = [col for col in comp_23.columns if str(col).startswith(column_to_drop_23)]
else:
    dropped_cols_24 = [column_to_drop_23]

comp_24 = comp_23.drop(columns=dropped_cols_24)
X_24 = comp_24.drop(columns='target')
y_24 = comp_24['target']

print(X_24.shape)

(19949, 157)


In [320]:
# Stratified 80/20 split into train+validation and test, then a second stratified
# 80/20 split of the remainder into train and validation (about 64/16/20 overall).
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [321]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are drawn from ``trial``; the model is trained on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object providing ``suggest_int`` / ``suggest_float`` /
        ``suggest_categorical`` (an Optuna trial).

    Returns
    -------
    ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # Keep the suggestion order fixed: it is the order in which the parameters
    # are requested from the trial.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values still requested through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search for this stage: maximise the validation AUC returned by
# objective with a fixed-seed TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): a HyperbandPruner is attached, but objective as written above never
# reports intermediate values, so the pruner appears to have nothing to act on -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [322]:
# Show the best trial's hyper-parameters and keep its objective value
# (the validation ROC AUC returned by objective) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7979148083036167


In [323]:
# Fit a classifier with the best trial's hyper-parameters on the training split
# (the validation rows are not added back).
xgb_optuna_24 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_24.fit(X_train, y_train)

In [324]:
# Test-set ROC AUC of the stage-24 model, printed to three decimals. quantize() uses the
# decimal context, whose rounding was set to ROUND_HALF_UP at the top of the notebook.
xgb_optuna_proba_24 = xgb_optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, xgb_optuna_proba_24)
print(decimal.Decimal(auc_24).quantize(decimal.Decimal('1.000')))

0.797


In [325]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell re-runnable: an array that is already converted passes
# through unchanged, whereas a second `.values` access on an ndarray would raise.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [326]:
# Reset the global list that bootstrap_auc appends each replicate's ROC AUC to.
auc_bootstrap = []

In [327]:
# Bootstrap the test ROC AUC for stage 24 (bootstrap_auc returns the 2.5/97.5 percentiles).
rs = RandomState(seed = 24)  # global generator that bootstrap_auc draws resample indices from
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone with the same hyper-parameters so xgb_optuna_24 keeps its original fit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_24), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78555862, 0.79448198])

In [328]:
# Mean ROC AUC over the bootstrap replicates currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7900947142603547

In [329]:
# Keep this stage's bootstrap AUCs under a stage-specific name; t_24 is bound to
# the same list object as auc_bootstrap (no copy is made).
t_24 = auc_bootstrap
print(t_24)

[0.7889048443235636, 0.7923797742147496, 0.7894141502515887, 0.7902203751957447, 0.7934697100337493, 0.7902663395274232, 0.788081977121386, 0.7887431766742112, 0.79300623635599, 0.7898015450355351, 0.7913216700162513, 0.7947232947232948, 0.7903220778836543, 0.7875161403486527, 0.7902327908485545, 0.7907180579594372, 0.7943537309300857, 0.7933755359863733, 0.789059379576621, 0.7934127829448027, 0.7918008613821421, 0.7906879433973031, 0.791432354240236, 0.7909073306240794, 0.7922940533777972, 0.787965481315235, 0.792421908185455, 0.7899747037678073, 0.7890414165044707, 0.789714503384454, 0.7909824849480023, 0.7895906110191824, 0.7903352860249413, 0.7895126829855894, 0.7881890951472232, 0.7894648695141306, 0.790661527114729, 0.7915121314136092, 0.7886879666436317, 0.7867211423245906, 0.7895628739224798, 0.7918583167967403, 0.7890511905290231, 0.7899138142364743, 0.7952186000215558, 0.7887336668124846, 0.787976576153916, 0.7911547191103842, 0.7876873178597316, 0.7910104862075306, 0.7934775

In [330]:
# Remove this stage's splits and Optuna study from the namespace; the next stage recreates them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [64]:
# Stage 25: feature removed from comp_24 in this step
# (name is roughly "current public-transport accessibility").
column_to_drop_24 = 'Cat_현재 대중교통 접근용이성'

In [65]:
# Stage 25: derive comp_25 / X_25 / y_25 by removing column_to_drop_24 from comp_24.
# A 'Cat_' name acts as a prefix: every column whose name starts with it is removed
# (presumably that variable's dummy columns -- confirm); other names drop one column.
if column_to_drop_24.startswith('Cat_'):
    # Plain string prefix test, so the name is never parsed as a regular expression.
    dropped_cols_25 = [col for col in comp_24.columns if str(col).startswith(column_to_drop_24)]
else:
    dropped_cols_25 = [column_to_drop_24]

comp_25 = comp_24.drop(columns=dropped_cols_25)
X_25 = comp_25.drop(columns='target')
y_25 = comp_25['target']

print(X_25.shape)

(19949, 153)


In [333]:
# Stratified 80/20 split into train+validation and test, then a second stratified
# 80/20 split of the remainder into train and validation (about 64/16/20 overall).
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [334]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are drawn from ``trial``; the model is trained on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object providing ``suggest_int`` / ``suggest_float`` /
        ``suggest_categorical`` (an Optuna trial).

    Returns
    -------
    ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # Keep the suggestion order fixed: it is the order in which the parameters
    # are requested from the trial.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values still requested through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search for this stage: maximise the validation AUC returned by
# objective with a fixed-seed TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): a HyperbandPruner is attached, but objective as written above never
# reports intermediate values, so the pruner appears to have nothing to act on -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [335]:
# Show the best trial's hyper-parameters and keep its objective value
# (the validation ROC AUC returned by objective) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7981040672300543


In [336]:
# Fit a classifier with the best trial's hyper-parameters on the training split
# (the validation rows are not added back).
xgb_optuna_25 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_25.fit(X_train, y_train)

In [337]:
# Test-set ROC AUC of the stage-25 model, printed to three decimals. quantize() uses the
# decimal context, whose rounding was set to ROUND_HALF_UP at the top of the notebook.
xgb_optuna_proba_25 = xgb_optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, xgb_optuna_proba_25)
print(decimal.Decimal(auc_25).quantize(decimal.Decimal('1.000')))

0.795


In [338]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell re-runnable: an array that is already converted passes
# through unchanged, whereas a second `.values` access on an ndarray would raise.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [339]:
# Reset the global list that bootstrap_auc appends each replicate's ROC AUC to.
auc_bootstrap = []

In [340]:
# Bootstrap the test ROC AUC for stage 25 (bootstrap_auc returns the 2.5/97.5 percentiles).
rs = RandomState(seed = 25)  # global generator that bootstrap_auc draws resample indices from
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone with the same hyper-parameters so xgb_optuna_25 keeps its original fit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_25), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78531985, 0.79443288])

In [341]:
# Mean ROC AUC over the bootstrap replicates currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7899318770300913

In [342]:
# Keep this stage's bootstrap AUCs under a stage-specific name; t_25 is bound to
# the same list object as auc_bootstrap (no copy is made).
t_25 = auc_bootstrap
print(t_25)

[0.7896880871018801, 0.7921442730556031, 0.7886404173349987, 0.7914713182570325, 0.7878347207164941, 0.791052488096823, 0.7900061391440703, 0.7921305365886646, 0.7902864159021795, 0.7893399204975559, 0.7934117262934998, 0.791705762764876, 0.7866031936228981, 0.7917274241165866, 0.7941194185036551, 0.7943838454922199, 0.7901548628149613, 0.7884380686104823, 0.7886532292320471, 0.7891214578406697, 0.7901559194662644, 0.7882608153544114, 0.7921113847837986, 0.7888879379027163, 0.7911098114300086, 0.7917842191241206, 0.7877903413617701, 0.7910642433425685, 0.7880014074595355, 0.7861694382630343, 0.7950212703907286, 0.7854490662372435, 0.7905989205250289, 0.7871896350960391, 0.7901777128993878, 0.7898637553809967, 0.7893290898217007, 0.7898024696054253, 0.7878978556318458, 0.7829544445923755, 0.7893050510045584, 0.7908840842954143, 0.79065650802104, 0.790279811831536, 0.7881223940337241, 0.7902967182523832, 0.7944650755611347, 0.7911853619981699, 0.7887458183024686, 0.7917419530720022, 0.79

In [343]:
# Remove this stage's splits and Optuna study from the namespace; the next stage recreates them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [66]:
# Stage 26: feature removed from comp_25 in this step
# (name is roughly "current accessibility of cultural facilities").
column_to_drop_25 = 'Cat_현재 문화시설 접근용이성'

In [67]:
# Stage 26: derive comp_26 / X_26 / y_26 by removing column_to_drop_25 from comp_25.
# A 'Cat_' name acts as a prefix: every column whose name starts with it is removed
# (presumably that variable's dummy columns -- confirm); other names drop one column.
if column_to_drop_25.startswith('Cat_'):
    # Plain string prefix test, so the name is never parsed as a regular expression.
    dropped_cols_26 = [col for col in comp_25.columns if str(col).startswith(column_to_drop_25)]
else:
    dropped_cols_26 = [column_to_drop_25]

comp_26 = comp_25.drop(columns=dropped_cols_26)
X_26 = comp_26.drop(columns='target')
y_26 = comp_26['target']

print(X_26.shape)

(19949, 149)


In [346]:
# Stratified 80/20 split into train+validation and test, then a second stratified
# 80/20 split of the remainder into train and validation (about 64/16/20 overall).
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [347]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are drawn from ``trial``; the model is trained on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object providing ``suggest_int`` / ``suggest_float`` /
        ``suggest_categorical`` (an Optuna trial).

    Returns
    -------
    ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # Keep the suggestion order fixed: it is the order in which the parameters
    # are requested from the trial.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values still requested through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search for this stage: maximise the validation AUC returned by
# objective with a fixed-seed TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): a HyperbandPruner is attached, but objective as written above never
# reports intermediate values, so the pruner appears to have nothing to act on -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [348]:
# Show the best trial's hyper-parameters and keep its objective value
# (the validation ROC AUC returned by objective) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7961633892744469


In [349]:
# Fit a classifier with the best trial's hyper-parameters on the training split
# (the validation rows are not added back).
xgb_optuna_26 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_26.fit(X_train, y_train)

In [350]:
# Test-set ROC AUC of the stage-26 model, printed to three decimals. quantize() uses the
# decimal context, whose rounding was set to ROUND_HALF_UP at the top of the notebook.
xgb_optuna_proba_26 = xgb_optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, xgb_optuna_proba_26)
print(decimal.Decimal(auc_26).quantize(decimal.Decimal('1.000')))

0.798


In [351]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell re-runnable: an array that is already converted passes
# through unchanged, whereas a second `.values` access on an ndarray would raise.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [352]:
# Reset the global list that bootstrap_auc appends each replicate's ROC AUC to.
auc_bootstrap = []

In [353]:
# Bootstrap the test ROC AUC for stage 26 (bootstrap_auc returns the 2.5/97.5 percentiles).
rs = RandomState(seed = 26)  # global generator that bootstrap_auc draws resample indices from
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone with the same hyper-parameters so xgb_optuna_26 keeps its original fit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_26), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78578926, 0.79481085])

In [354]:
# Mean ROC AUC over the bootstrap replicates currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7903390391843286

In [355]:
# Keep this stage's bootstrap AUCs under a stage-specific name; t_26 is bound to
# the same list object as auc_bootstrap (no copy is made).
t_26 = auc_bootstrap
print(t_26)

[0.7861240022570072, 0.7910532805853003, 0.7916822522733853, 0.7905890144190637, 0.7914015792710374, 0.7914492606610833, 0.7894516613728436, 0.7929695177232123, 0.7934635022073446, 0.7904364603871993, 0.7926337667716978, 0.7869193965253078, 0.7896990498591483, 0.793745628105234, 0.7876006724528892, 0.7887923109597988, 0.7867413507807596, 0.7899929310027833, 0.7907625693955744, 0.79214136726452, 0.7905182187817656, 0.7880175213919056, 0.7867117645442768, 0.7909478796178303, 0.7898994173624716, 0.7885606401616254, 0.7899276827848256, 0.7899290035989543, 0.7901271257182587, 0.787160313022382, 0.7903231345349572, 0.7885006752001825, 0.7907378701713677, 0.7892094240616407, 0.7915039423660113, 0.7861158132094093, 0.7895826861344104, 0.7921308007514904, 0.7896561233999658, 0.7885968304687516, 0.7902901141817398, 0.7909388980817553, 0.7921091393997798, 0.785742286973814, 0.7897475237376714, 0.7865803435384717, 0.7901120684371916, 0.7894479630932831, 0.791750406282426, 0.7953306050596692, 0.788

In [356]:
# Remove this stage's splits and Optuna study from the namespace; the next stage recreates them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [68]:
# Stage 27: feature removed from comp_26 in this step
# (name is roughly "current cleaning / waste-disposal condition").
column_to_drop_26 = 'Cat_현재 청소/쓰레기 처리상태'

In [69]:
# Stage 27: derive comp_27 / X_27 / y_27 by removing column_to_drop_26 from comp_26.
# A 'Cat_' name acts as a prefix: every column whose name starts with it is removed
# (presumably that variable's dummy columns -- confirm); other names drop one column.
if column_to_drop_26.startswith('Cat_'):
    # Plain string prefix test, so the name is never parsed as a regular expression.
    dropped_cols_27 = [col for col in comp_26.columns if str(col).startswith(column_to_drop_26)]
else:
    dropped_cols_27 = [column_to_drop_26]

comp_27 = comp_26.drop(columns=dropped_cols_27)
X_27 = comp_27.drop(columns='target')
y_27 = comp_27['target']

print(X_27.shape)

(19949, 145)


In [359]:
# Stratified 80/20 split into train+validation and test, then a second stratified
# 80/20 split of the remainder into train and validation (about 64/16/20 overall).
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [360]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are drawn from ``trial``; the model is trained on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object providing ``suggest_int`` / ``suggest_float`` /
        ``suggest_categorical`` (an Optuna trial).

    Returns
    -------
    ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # Keep the suggestion order fixed: it is the order in which the parameters
    # are requested from the trial.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values still requested through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search for this stage: maximise the validation AUC returned by
# objective with a fixed-seed TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): a HyperbandPruner is attached, but objective as written above never
# reports intermediate values, so the pruner appears to have nothing to act on -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [361]:
# Show the best trial's hyper-parameters and keep its objective value
# (the validation ROC AUC returned by objective) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7964519214218403


In [362]:
# Fit a classifier with the best trial's hyper-parameters on the training split
# (the validation rows are not added back).
xgb_optuna_27 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_27.fit(X_train, y_train)

In [363]:
# Test-set ROC AUC of the stage-27 model, printed to three decimals. quantize() uses the
# decimal context, whose rounding was set to ROUND_HALF_UP at the top of the notebook.
xgb_optuna_proba_27 = xgb_optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, xgb_optuna_proba_27)
print(decimal.Decimal(auc_27).quantize(decimal.Decimal('1.000')))

0.798


In [364]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell re-runnable: an array that is already converted passes
# through unchanged, whereas a second `.values` access on an ndarray would raise.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [365]:
# Reset the global list that bootstrap_auc appends each replicate's ROC AUC to.
auc_bootstrap = []

In [366]:
# Bootstrap the test ROC AUC for stage 27 (bootstrap_auc returns the 2.5/97.5 percentiles).
rs = RandomState(seed = 27)  # global generator that bootstrap_auc draws resample indices from
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone with the same hyper-parameters so xgb_optuna_27 keeps its original fit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_27), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78601675, 0.79498977])

In [367]:
# Mean ROC AUC over the bootstrap replicates currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7905199119994379

In [368]:
# Keep this stage's bootstrap AUCs under a stage-specific name; t_27 is bound to
# the same list object as auc_bootstrap (no copy is made).
t_27 = auc_bootstrap
print(t_27)

[0.7879147620526931, 0.7915448876040009, 0.7913821633033455, 0.7874648927604593, 0.7895887618794022, 0.7916487035945164, 0.7872992626687209, 0.7926165961880247, 0.7904655182980307, 0.7912492894019987, 0.7889756399608616, 0.7928353230077367, 0.7920874780480692, 0.7879541223137282, 0.7868744888449322, 0.7919833978947278, 0.7927966231537661, 0.7903249836747372, 0.792637465051258, 0.7893552419414489, 0.7914727711525741, 0.791597456006323, 0.7898607175085007, 0.7910601488187694, 0.790764946861006, 0.7912681770440391, 0.7920880063737206, 0.7919049415354834, 0.7846092926142187, 0.7917975593468204, 0.7892310854133513, 0.7883857643709861, 0.7882911940793715, 0.7945613629111166, 0.7929482526157403, 0.790837723719497, 0.7928089067251629, 0.7925897836612122, 0.7915220375195744, 0.7882172284881646, 0.7933326095271909, 0.7891589689619247, 0.7900505184987944, 0.7896260088378315, 0.7932022451726886, 0.786091774392267, 0.7891705921262571, 0.7887046089016533, 0.7934753895345028, 0.7922929967264942, 0.78

In [369]:
# Remove this stage's splits and Optuna study from the namespace; the next stage recreates them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [70]:
# Stage 28: feature removed from comp_27 in this step
# (name is roughly "current educational environment").
column_to_drop_27  = 'Cat_현재 교육환경'

In [71]:
# Stage 28: derive comp_28 / X_28 / y_28 by removing column_to_drop_27 from comp_27.
# A 'Cat_' name acts as a prefix: every column whose name starts with it is removed
# (presumably that variable's dummy columns -- confirm); other names drop one column.
if column_to_drop_27.startswith('Cat_'):
    # Plain string prefix test, so the name is never parsed as a regular expression.
    dropped_cols_28 = [col for col in comp_27.columns if str(col).startswith(column_to_drop_27)]
else:
    dropped_cols_28 = [column_to_drop_27]

comp_28 = comp_27.drop(columns=dropped_cols_28)
X_28 = comp_28.drop(columns='target')
y_28 = comp_28['target']

print(X_28.shape)

(19949, 141)


In [372]:
# Stratified 80/20 split into train+validation and test, then a second stratified
# 80/20 split of the remainder into train and validation (about 64/16/20 overall).
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [373]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are drawn from ``trial``; the model is trained on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object providing ``suggest_int`` / ``suggest_float`` /
        ``suggest_categorical`` (an Optuna trial).

    Returns
    -------
    ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # Keep the suggestion order fixed: it is the order in which the parameters
    # are requested from the trial.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values still requested through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search for this stage: maximise the validation AUC returned by
# objective with a fixed-seed TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): a HyperbandPruner is attached, but objective as written above never
# reports intermediate values, so the pruner appears to have nothing to act on -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)  

In [374]:
# Show the best trial's hyper-parameters and keep its objective value
# (the validation ROC AUC returned by objective) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7961931093239638


In [375]:
# Fit a classifier with the best trial's hyper-parameters on the training split
# (the validation rows are not added back).
xgb_optuna_28 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_28.fit(X_train, y_train)

In [376]:
# Test-set ROC AUC of the stage-28 model, printed to three decimals. quantize() uses the
# decimal context, whose rounding was set to ROUND_HALF_UP at the top of the notebook.
xgb_optuna_proba_28 = xgb_optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, xgb_optuna_proba_28)
print(decimal.Decimal(auc_28).quantize(decimal.Decimal('1.000')))

0.797


In [377]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell re-runnable: an array that is already converted passes
# through unchanged, whereas a second `.values` access on an ndarray would raise.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [378]:
# Reset the global list that bootstrap_auc appends each replicate's ROC AUC to.
auc_bootstrap = []

In [379]:
# Bootstrap the test ROC AUC for stage 28 (bootstrap_auc returns the 2.5/97.5 percentiles).
rs = RandomState(seed = 28)  # global generator that bootstrap_auc draws resample indices from
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone with the same hyper-parameters so xgb_optuna_28 keeps its original fit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_28), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78573744, 0.79470149])

In [380]:
# Mean ROC AUC over the bootstrap replicates currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7902922239181476

In [381]:
# Keep this stage's bootstrap AUCs under a stage-specific name; t_28 is bound to
# the same list object as auc_bootstrap (no copy is made).
t_28 = auc_bootstrap
print(t_28)

[0.7902423007102811, 0.7925484421789841, 0.7932912680449626, 0.7887569131411495, 0.7916553076651599, 0.7910004480201525, 0.7939109940341469, 0.7901662218164681, 0.7839301299892432, 0.7896408019560729, 0.7906014300718733, 0.791045487781941, 0.7925942744292498, 0.7945044358221698, 0.7898275650738704, 0.7899633447663005, 0.7907637581282903, 0.7943785622357051, 0.7917630860980616, 0.7856321310754808, 0.7908094582971431, 0.7901667501421196, 0.7916909696466347, 0.7933768568005021, 0.7916250610216127, 0.7869938904421663, 0.7901103513788243, 0.7913594453003321, 0.7898719444285947, 0.7922723920260866, 0.79415877876469, 0.7878967989805429, 0.7890456431096824, 0.7948265823881587, 0.793288890579531, 0.794127343388427, 0.7842438233448086, 0.7891793094995065, 0.7910981882656759, 0.7880996760307105, 0.7890905507900581, 0.7851459393947079, 0.7929277799967456, 0.7923352627786124, 0.7929560454190996, 0.7898735294055492, 0.7906162231901148, 0.7918069371271342, 0.7916103999847843, 0.7910590921674665, 0.78

In [382]:
# Remove this stage's splits and Optuna study from the namespace; the next stage recreates them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [72]:
# Stage 29: feature removed from comp_28 in this step
# (name is roughly "type of housing planned for the move").
column_to_drop_28 = 'Cat_이사 계획 중인 주택의 유형'

In [73]:
# Stage 29: derive comp_29 / X_29 / y_29 by removing column_to_drop_28 from comp_28.
# A 'Cat_' name acts as a prefix: every column whose name starts with it is removed
# (presumably that variable's dummy columns -- confirm); other names drop one column.
if column_to_drop_28.startswith('Cat_'):
    # Plain string prefix test, so the name is never parsed as a regular expression.
    dropped_cols_29 = [col for col in comp_28.columns if str(col).startswith(column_to_drop_28)]
else:
    dropped_cols_29 = [column_to_drop_28]

comp_29 = comp_28.drop(columns=dropped_cols_29)
X_29 = comp_29.drop(columns='target')
y_29 = comp_29['target']

print(X_29.shape)

(19949, 122)


In [385]:
# Stratified 80/20 split into train+validation and test, then a second stratified
# 80/20 split of the remainder into train and validation (about 64/16/20 overall).
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [386]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for one trial.

    Hyper-parameters are drawn from ``trial``; the model is trained on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object providing ``suggest_int`` / ``suggest_float`` /
        ``suggest_categorical`` (an Optuna trial).

    Returns
    -------
    ROC AUC computed from column 1 of ``predict_proba`` on the validation split.
    """
    # Keep the suggestion order fixed: it is the order in which the parameters
    # are requested from the trial.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values still requested through the trial.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search for this stage: maximise the validation AUC returned by
# objective with a fixed-seed TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): a HyperbandPruner is attached, but objective as written above never
# reports intermediate values, so the pruner appears to have nothing to act on -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement -- TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [387]:
# Show the best trial's hyper-parameters and keep its objective value
# (the validation ROC AUC returned by objective) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7963070361804453


In [388]:
# Fit a classifier with the best trial's hyper-parameters on the training split
# (the validation rows are not added back).
xgb_optuna_29 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_29.fit(X_train, y_train)

In [389]:
# Test-set ROC AUC of the stage-29 model, printed to three decimals. quantize() uses the
# decimal context, whose rounding was set to ROUND_HALF_UP at the top of the notebook.
xgb_optuna_proba_29 = xgb_optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, xgb_optuna_proba_29)
print(decimal.Decimal(auc_29).quantize(decimal.Decimal('1.000')))

0.797


In [390]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell re-runnable: an array that is already converted passes
# through unchanged, whereas a second `.values` access on an ndarray would raise.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [391]:
# Reset the global list that bootstrap_auc appends each replicate's ROC AUC to.
auc_bootstrap = []

In [392]:
# Bootstrap the test ROC AUC for stage 29 (bootstrap_auc returns the 2.5/97.5 percentiles).
rs = RandomState(seed = 29)  # global generator that bootstrap_auc draws resample indices from
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone with the same hyper-parameters so xgb_optuna_29 keeps its original fit.
bootstrap_auc(sklearn.base.clone(xgb_optuna_29), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78600138, 0.79482473])

In [393]:
# Mean ROC AUC over the bootstrap replicates currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7903595320117794

In [394]:
# Keep this stage's bootstrap AUCs under a stage-specific name; t_29 is bound to
# the same list object as auc_bootstrap (no copy is made).
t_29 = auc_bootstrap
print(t_29)

[0.7860487158516716, 0.7899929310027832, 0.7929742726540756, 0.789537250128383, 0.7944673209451535, 0.7896932382769821, 0.7923548108277172, 0.7921458580325575, 0.7875486323762186, 0.787792454664376, 0.7915728888635293, 0.7960586378074062, 0.7914672237332335, 0.7940644726359012, 0.7886013212367892, 0.7920367587855273, 0.7887798953069889, 0.791134642735628, 0.7907800041420732, 0.7955392936920029, 0.7933448930985876, 0.7901258049041301, 0.791202532581843, 0.7884927503154104, 0.7911132455467431, 0.7887709137709138, 0.794299577550809, 0.7904998594653766, 0.789594045135917, 0.7896419906887886, 0.7906874150716516, 0.7853679682497416, 0.7908015334123709, 0.7852973046938565, 0.7855190693860645, 0.7874378160708211, 0.7902935482984745, 0.788763517211793, 0.7912033250703201, 0.7837569712569713, 0.7931079390438995, 0.7928358513333884, 0.7905915239659081, 0.7961263955722083, 0.7864040148522907, 0.7931922069853103, 0.7839045061951466, 0.7899999313176653, 0.7893239386465988, 0.7912617050548085, 0.7917

In [395]:
# Remove this stage's splits and Optuna study from the namespace; the next stage recreates them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [74]:
# Stage 30: feature removed from comp_29 in this step
# (name is roughly "first reason for planning to move").
column_to_drop_29 = 'Cat_이사 계획 첫 번째 이유'

In [75]:
# Stage 30: derive comp_30 / X_30 / y_30 by removing column_to_drop_29 from comp_29.
# A 'Cat_' name acts as a prefix: every column whose name starts with it is removed
# (presumably that variable's dummy columns -- confirm); other names drop one column.
if column_to_drop_29.startswith('Cat_'):
    # Plain string prefix test, so the name is never parsed as a regular expression.
    dropped_cols_30 = [col for col in comp_29.columns if str(col).startswith(column_to_drop_29)]
else:
    dropped_cols_30 = [column_to_drop_29]

comp_30 = comp_29.drop(columns=dropped_cols_30)
X_30 = comp_30.drop(columns='target')
y_30 = comp_30['target']

print(X_30.shape)

(19949, 109)


In [398]:
# Stratified 80/20 split into train+validation and test, then a second stratified
# 80/20 split of the remainder into train and validation (about 64/16/20 overall).
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [399]:
def objective(trial):
    """Return the validation ROC AUC of an XGBoost model for one sampled configuration.

    Fits on the module-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # The suggest calls keep their original order so a seeded sampler draws the same values.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'gamma': trial.suggest_int('gamma', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still end up in the trial's params.
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
    }

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so this pruner
# probably has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it halts the study after
# 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [400]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7956804384697973


In [401]:
# Refit the study's best configuration on the training split only (validation rows are not
# added back in).
xgb_optuna_30 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_30.fit(X_train, y_train)

In [402]:
# AUC on the held-out test split, printed to three decimals; quantize() follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
xgb_optuna_proba_30 = xgb_optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, xgb_optuna_proba_30)
print(decimal.Decimal(auc_30).quantize(decimal.Decimal('1.000')))

0.797


In [403]:
# bootstrap_auc() selects rows positionally (X_train[idx]), which needs NumPy arrays rather
# than pandas objects. np.asarray leaves an existing array as is, so this cell can be re-run
# safely (.values would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [404]:
auc_bootstrap = []

In [405]:
# bootstrap_auc() draws its resamples from the module-level `rs` and appends every AUC to the
# module-level `auc_bootstrap` list, so both must be prepared before this call.
rs = RandomState(seed = 30)
# Pass an unfitted clone: bootstrap_auc() refits the estimator it receives, which would
# otherwise leave xgb_optuna_30 holding the fit from the last resample instead of the model
# scored on the test split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_30), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78538166, 0.79469149])

In [406]:
np.mean(auc_bootstrap)

0.7900606963622665

In [407]:
# Keep an independent copy of this model's bootstrap AUCs: bootstrap_auc() appends to the
# shared auc_bootstrap list, so a plain alias would change if that list were extended again.
t_30 = list(auc_bootstrap)
print(t_30)

[0.7910580355161636, 0.7866676493523785, 0.7868139955578379, 0.7913944468747425, 0.7924287764189244, 0.7905664284974628, 0.7908464410927465, 0.7919950210590605, 0.7922546931167621, 0.7878410606243118, 0.7930223502883602, 0.789355109860036, 0.7863182940153384, 0.7956194671096148, 0.7920040025951356, 0.7885249781801507, 0.7905093693271035, 0.7902671320159005, 0.7875342355022159, 0.7935769601409995, 0.7895422692220723, 0.7918560714127216, 0.7880616365838041, 0.7886031703765694, 0.7862814433011477, 0.793161299934699, 0.7851141077742062, 0.7949076803756607, 0.7890995323261334, 0.7886851929339614, 0.7917395756065706, 0.793660567675346, 0.7863696736849446, 0.7888591441547106, 0.7909798433197448, 0.7873400758252975, 0.7884232754922411, 0.7908706119913016, 0.7901836565629669, 0.7828066454913745, 0.791539604347486, 0.7907999484354163, 0.7915147730418667, 0.7939706948327638, 0.7920935537930612, 0.7927412810417735, 0.7925170068027212, 0.789166761765284, 0.7896061966259011, 0.7905392197264118, 0.79

In [408]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [76]:
# 31
column_to_drop_30 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [77]:
# Build the next reduced dataset (comp_31) by removing this round's feature from comp_30.
if column_to_drop_30.startswith('Cat_'):
    # A 'Cat_' name is a prefix that may cover several columns (presumably the dummy columns
    # of one encoded categorical — confirm upstream). Match it literally rather than as a
    # regex, so characters such as '(' or '.' in a name cannot change which columns are hit.
    prefixed_cols = [col for col in comp_30.columns if str(col).startswith(column_to_drop_30)]
    comp_31 = comp_30.drop(columns=prefixed_cols)
else:
    # Plain feature: drop exactly that column (KeyError if it is missing).
    comp_31 = comp_30.drop(columns=[column_to_drop_30])

# Features and label are derived the same way in both cases.
X_31 = comp_31.drop(columns='target')
y_31 = comp_31['target']

print(X_31.shape)

(19949, 105)


In [411]:
# Two-stage stratified split with a fixed seed: 20% of the rows become the test set, then 20%
# of the remainder becomes the validation set (64% / 16% / 20% overall). The second call
# rebinds X_train/y_train, so always run both lines together.
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [412]:
def objective(trial):
    """Return the validation ROC AUC of an XGBoost model for one sampled configuration.

    Fits on the module-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # The suggest calls keep their original order so a seeded sampler draws the same values.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'gamma': trial.suggest_int('gamma', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still end up in the trial's params.
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
    }

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so this pruner
# probably has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it halts the study after
# 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [413]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7956346200601253


In [414]:
# Refit the study's best configuration on the training split only (validation rows are not
# added back in).
xgb_optuna_31= XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_31.fit(X_train, y_train)

In [415]:
# AUC on the held-out test split, printed to three decimals; quantize() follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
xgb_optuna_proba_31 = xgb_optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, xgb_optuna_proba_31)
print(decimal.Decimal(auc_31).quantize(decimal.Decimal('1.000')))

0.795


In [416]:
# bootstrap_auc() selects rows positionally (X_train[idx]), which needs NumPy arrays rather
# than pandas objects. np.asarray leaves an existing array as is, so this cell can be re-run
# safely (.values would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [417]:
auc_bootstrap = []

In [418]:
# bootstrap_auc() draws its resamples from the module-level `rs` and appends every AUC to the
# module-level `auc_bootstrap` list, so both must be prepared before this call.
rs = RandomState(seed = 31)
# Pass an unfitted clone: bootstrap_auc() refits the estimator it receives, which would
# otherwise leave xgb_optuna_31 holding the fit from the last resample instead of the model
# scored on the test split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_31), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78501068, 0.79437923])

In [419]:
np.mean(auc_bootstrap)

0.7897324093974342

In [420]:
# Keep an independent copy of this model's bootstrap AUCs: bootstrap_auc() appends to the
# shared auc_bootstrap list, so a plain alias would change if that list were extended again.
t_31 = list(auc_bootstrap)
print(t_31)

[0.7925574237150592, 0.7879065730050954, 0.7889284868964671, 0.7894307925096102, 0.7928598901505306, 0.7857845530259324, 0.7871243868780814, 0.7880479001168657, 0.7911256611995529, 0.7894371324174281, 0.7886641919893151, 0.7901770524923235, 0.7938872193798303, 0.7911895886033818, 0.7897660151354733, 0.7950453092078709, 0.7891225144919726, 0.7850108201093422, 0.7882872316369853, 0.7894759643528115, 0.7880241254625491, 0.7884705606380483, 0.7884322570283161, 0.7930807302728485, 0.7911911735803362, 0.7890609645535754, 0.7898143569325835, 0.7860960009974789, 0.7873621334212467, 0.7882098319290438, 0.7877293197490242, 0.7855145786180269, 0.7853745723203852, 0.7954777437536058, 0.7898789447434768, 0.7902967182523832, 0.7901334656260766, 0.7860646977026287, 0.7908649324905483, 0.7922845435160706, 0.7881472253393436, 0.7870435530534052, 0.7893531286388429, 0.7872406185214067, 0.7894268300672242, 0.789209159898815, 0.7926160678623733, 0.7890733802063852, 0.7879088183891139, 0.7923299795220977, 

In [421]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [78]:
# 32
column_to_drop_31 = '현재 무주택 기간(총 개월)'

In [79]:
# Build the next reduced dataset (comp_32) by removing this round's feature from comp_31.
if column_to_drop_31.startswith('Cat_'):
    # A 'Cat_' name is a prefix that may cover several columns (presumably the dummy columns
    # of one encoded categorical — confirm upstream). Match it literally rather than as a
    # regex, so characters such as '(' or '.' in a name cannot change which columns are hit.
    prefixed_cols = [col for col in comp_31.columns if str(col).startswith(column_to_drop_31)]
    comp_32 = comp_31.drop(columns=prefixed_cols)
else:
    # Plain feature: drop exactly that column (KeyError if it is missing).
    comp_32 = comp_31.drop(columns=[column_to_drop_31])

# Features and label are derived the same way in both cases.
X_32 = comp_32.drop(columns='target')
y_32 = comp_32['target']

print(X_32.shape)

(19949, 104)


In [424]:
# Two-stage stratified split with a fixed seed: 20% of the rows become the test set, then 20%
# of the remainder becomes the validation set (64% / 16% / 20% overall). The second call
# rebinds X_train/y_train, so always run both lines together.
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [425]:
def objective(trial):
    """Return the validation ROC AUC of an XGBoost model for one sampled configuration.

    Fits on the module-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # The suggest calls keep their original order so a seeded sampler draws the same values.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'gamma': trial.suggest_int('gamma', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still end up in the trial's params.
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
    }

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so this pruner
# probably has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it halts the study after
# 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [426]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7961732959576192


In [427]:
# Refit the study's best configuration on the training split only (validation rows are not
# added back in).
xgb_optuna_32 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_32.fit(X_train, y_train)

In [428]:
# AUC on the held-out test split, printed to three decimals; quantize() follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
xgb_optuna_proba_32 = xgb_optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, xgb_optuna_proba_32)
print(decimal.Decimal(auc_32).quantize(decimal.Decimal('1.000')))

0.797


In [429]:
# bootstrap_auc() selects rows positionally (X_train[idx]), which needs NumPy arrays rather
# than pandas objects. np.asarray leaves an existing array as is, so this cell can be re-run
# safely (.values would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [430]:
auc_bootstrap = []

In [431]:
# bootstrap_auc() draws its resamples from the module-level `rs` and appends every AUC to the
# module-level `auc_bootstrap` list, so both must be prepared before this call.
rs = RandomState(seed = 32)
# Pass an unfitted clone: bootstrap_auc() refits the estimator it receives, which would
# otherwise leave xgb_optuna_32 holding the fit from the last resample instead of the model
# scored on the test split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_32), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78538125, 0.79425094])

In [432]:
np.mean(auc_bootstrap)

0.7898327169093795

In [433]:
# Keep an independent copy of this model's bootstrap AUCs: bootstrap_auc() appends to the
# shared auc_bootstrap list, so a plain alias would change if that list were extended again.
t_32 = list(auc_bootstrap)
print(t_32)

[0.7875129703947439, 0.7869271893286671, 0.7898119794671519, 0.7876305228521978, 0.7901708446659186, 0.7914487323354318, 0.7863815610121029, 0.7862980855591694, 0.7917216125344204, 0.7888990327413972, 0.7880695614685762, 0.7909082551939696, 0.7906517530901767, 0.7888216330334556, 0.7892582941844026, 0.7911481150397406, 0.7886182276576365, 0.7893504870105854, 0.7881879064145074, 0.7922874493071537, 0.789604347486121, 0.7905149167464438, 0.790687943397303, 0.7935151460397765, 0.7911621156695048, 0.7904272146882984, 0.7912914233727042, 0.792032003854664, 0.7931381856874469, 0.7892849746298022, 0.7917266316281095, 0.7862632160661718, 0.7880648065377129, 0.7879805385963022, 0.7865306809272326, 0.7940568119139548, 0.7912640825202402, 0.7902243376381307, 0.7896423869330274, 0.7862019302906003, 0.7873386229297559, 0.7888845037859817, 0.7879345742646235, 0.7857986857371094, 0.790926086184707, 0.7856471883565479, 0.7910462802704182, 0.7893420338001618, 0.790338588060263, 0.7945317766746338, 0.79

In [434]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [80]:
# 33.
column_to_drop_32 = 'Cat_현재 대기오염 정도'

In [81]:
# Build the next reduced dataset (comp_33) by removing this round's feature from comp_32.
if column_to_drop_32.startswith('Cat_'):
    # A 'Cat_' name is a prefix that may cover several columns (presumably the dummy columns
    # of one encoded categorical — confirm upstream). Match it literally rather than as a
    # regex, so characters such as '(' or '.' in a name cannot change which columns are hit.
    prefixed_cols = [col for col in comp_32.columns if str(col).startswith(column_to_drop_32)]
    comp_33 = comp_32.drop(columns=prefixed_cols)
else:
    # Plain feature: drop exactly that column (KeyError if it is missing).
    comp_33 = comp_32.drop(columns=[column_to_drop_32])

# Features and label are derived the same way in both cases.
X_33 = comp_33.drop(columns='target')
y_33 = comp_33['target']

print(X_33.shape)

(19949, 100)


In [437]:
# Two-stage stratified split with a fixed seed: 20% of the rows become the test set, then 20%
# of the remainder becomes the validation set (64% / 16% / 20% overall). The second call
# rebinds X_train/y_train, so always run both lines together.
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [438]:
def objective(trial):
    """Return the validation ROC AUC of an XGBoost model for one sampled configuration.

    Fits on the module-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # The suggest calls keep their original order so a seeded sampler draws the same values.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'gamma': trial.suggest_int('gamma', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still end up in the trial's params.
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
    }

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so this pruner
# probably has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it halts the study after
# 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [439]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.795199551557475


In [440]:
# Refit the study's best configuration on the training split only (validation rows are not
# added back in).
xgb_optuna_33 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_33.fit(X_train, y_train)

In [441]:
# AUC on the held-out test split, printed to three decimals; quantize() follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
xgb_optuna_proba_33 = xgb_optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, xgb_optuna_proba_33)
print(decimal.Decimal(auc_33).quantize(decimal.Decimal('1.000')))

0.797


In [442]:
# bootstrap_auc() selects rows positionally (X_train[idx]), which needs NumPy arrays rather
# than pandas objects. np.asarray leaves an existing array as is, so this cell can be re-run
# safely (.values would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [443]:
auc_bootstrap = []

In [444]:
# bootstrap_auc() draws its resamples from the module-level `rs` and appends every AUC to the
# module-level `auc_bootstrap` list, so both must be prepared before this call.
rs = RandomState(seed = 33)
# Pass an unfitted clone: bootstrap_auc() refits the estimator it receives, which would
# otherwise leave xgb_optuna_33 holding the fit from the last resample instead of the model
# scored on the test split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_33), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78487958, 0.7940618 ])

In [445]:
np.mean(auc_bootstrap)

0.7895427516494327

In [446]:
# Keep an independent copy of this model's bootstrap AUCs: bootstrap_auc() appends to the
# shared auc_bootstrap list, so a plain alias would change if that list were extended again.
t_33 = list(auc_bootstrap)
print(t_33)

[0.7886698714900686, 0.7911750596479661, 0.792561650320271, 0.7930191803344513, 0.7867225952201322, 0.7887106846466452, 0.7938816719604896, 0.7926718062186042, 0.7874376839894082, 0.7893639593146982, 0.7873550010249517, 0.7858815007829787, 0.7879104033660684, 0.7884098031881284, 0.7895412125707693, 0.7875996158015863, 0.7909917306469031, 0.790676848558622, 0.7900565942437864, 0.7906639045801607, 0.7925988972787001, 0.7941908745480174, 0.789814885258235, 0.7938454816533634, 0.7898814542903213, 0.7872746955259271, 0.7904626125069474, 0.7900211964251374, 0.7860196579408403, 0.7887875560289354, 0.7890168493616769, 0.7913792575122625, 0.79101920358078, 0.7877330180285844, 0.7905478050182484, 0.7863844668031861, 0.7889666584247865, 0.7901213141360923, 0.7891169670726321, 0.7879759157468518, 0.7895084563803776, 0.789438189068731, 0.7919416601682612, 0.786456319091787, 0.7886173030877464, 0.7902663395274233, 0.7887448937325785, 0.7900425936140223, 0.7911581532271188, 0.7918288626416705, 0.7899

In [447]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [82]:
# 34
column_to_drop_33 = '총 가구원 수'

In [83]:
# Build the next reduced dataset (comp_34) by removing this round's feature from comp_33.
if column_to_drop_33.startswith('Cat_'):
    # A 'Cat_' name is a prefix that may cover several columns (presumably the dummy columns
    # of one encoded categorical — confirm upstream). Match it literally rather than as a
    # regex, so characters such as '(' or '.' in a name cannot change which columns are hit.
    prefixed_cols = [col for col in comp_33.columns if str(col).startswith(column_to_drop_33)]
    comp_34 = comp_33.drop(columns=prefixed_cols)
else:
    # Plain feature: drop exactly that column (KeyError if it is missing).
    comp_34 = comp_33.drop(columns=[column_to_drop_33])

# Features and label are derived the same way in both cases.
X_34 = comp_34.drop(columns='target')
y_34 = comp_34['target']

print(X_34.shape)

(19949, 99)


In [450]:
# Two-stage stratified split with a fixed seed: 20% of the rows become the test set, then 20%
# of the remainder becomes the validation set (64% / 16% / 20% overall). The second call
# rebinds X_train/y_train, so always run both lines together.
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [452]:
def objective(trial):
    """Return the validation ROC AUC of an XGBoost model for one sampled configuration.

    Fits on the module-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # The suggest calls keep their original order so a seeded sampler draws the same values.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'gamma': trial.suggest_int('gamma', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still end up in the trial's params.
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
    }

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so this pruner
# probably has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it halts the study after
# 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [453]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7955303934975834


In [454]:
# Refit the study's best configuration on the training split only (validation rows are not
# added back in).
xgb_optuna_34 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_34.fit(X_train, y_train)

In [455]:
# AUC on the held-out test split, printed to three decimals; quantize() follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
xgb_optuna_proba_34 = xgb_optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, xgb_optuna_proba_34)
print(decimal.Decimal(auc_34).quantize(decimal.Decimal('1.000')))

0.797


In [456]:
# bootstrap_auc() selects rows positionally (X_train[idx]), which needs NumPy arrays rather
# than pandas objects. np.asarray leaves an existing array as is, so this cell can be re-run
# safely (.values would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [457]:
auc_bootstrap = []

In [458]:
# bootstrap_auc() draws its resamples from the module-level `rs` and appends every AUC to the
# module-level `auc_bootstrap` list, so both must be prepared before this call.
rs = RandomState(seed = 34)
# Pass an unfitted clone: bootstrap_auc() refits the estimator it receives, which would
# otherwise leave xgb_optuna_34 holding the fit from the last resample instead of the model
# scored on the test split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_34), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78509056, 0.79387385])

In [459]:
np.mean(auc_bootstrap)

0.7893878094535423

In [460]:
# Keep an independent copy of this model's bootstrap AUCs: bootstrap_auc() appends to the
# shared auc_bootstrap list, so a plain alias would change if that list were extended again.
t_34 = list(auc_bootstrap)
print(t_34)

[0.7920493065197499, 0.7920540614506132, 0.789122118247734, 0.7928535502427128, 0.785091125608367, 0.7852776245633388, 0.7880584666298951, 0.7887410633716052, 0.788837614884413, 0.7888477851532039, 0.7921215550525894, 0.7907222845646491, 0.7883320072359482, 0.7910762627511396, 0.7895686855046461, 0.7943112007151415, 0.7897719587990525, 0.7911285669906359, 0.7912952537336774, 0.7898671894977314, 0.7906650933128765, 0.7876651281823694, 0.7899707413254212, 0.7880185780432086, 0.7894146785772402, 0.7894329058122161, 0.7889996787780039, 0.7892287079479197, 0.7889621676567489, 0.7903342293736383, 0.7906569042652787, 0.7916991586942326, 0.7877898130361185, 0.787341660802252, 0.7877631325907188, 0.7859816184939339, 0.7934027447574247, 0.789607253277204, 0.7887141187633798, 0.7898500189140584, 0.7882573812376767, 0.7895134754740667, 0.7853581942251894, 0.7877139983051313, 0.7874210417313866, 0.7877715858011425, 0.7877940396413303, 0.7898455281460207, 0.7910651679124585, 0.7863725794760277, 0.79

In [461]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [84]:
# 35
column_to_drop_34 = '자산 중 기타자산의 비중'

In [85]:
# Build the next reduced dataset (comp_35) by removing this round's feature from comp_34.
if column_to_drop_34.startswith('Cat_'):
    # A 'Cat_' name is a prefix that may cover several columns (presumably the dummy columns
    # of one encoded categorical — confirm upstream). Match it literally rather than as a
    # regex, so characters such as '(' or '.' in a name cannot change which columns are hit.
    prefixed_cols = [col for col in comp_34.columns if str(col).startswith(column_to_drop_34)]
    comp_35 = comp_34.drop(columns=prefixed_cols)
else:
    # Plain feature: drop exactly that column (KeyError if it is missing).
    comp_35 = comp_34.drop(columns=[column_to_drop_34])

# Features and label are derived the same way in both cases.
X_35 = comp_35.drop(columns='target')
y_35 = comp_35['target']

print(X_35.shape)

(19949, 98)


In [464]:
# Two-stage stratified split with a fixed seed: 20% of the rows become the test set, then 20%
# of the remainder becomes the validation set (64% / 16% / 20% overall). The second call
# rebinds X_train/y_train, so always run both lines together.
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [465]:
def objective(trial):
    """Return the validation ROC AUC of an XGBoost model for one sampled configuration.

    Fits on the module-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # The suggest calls keep their original order so a seeded sampler draws the same values.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'gamma': trial.suggest_int('gamma', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still end up in the trial's params.
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
    }

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so this pruner
# probably has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it halts the study after
# 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [466]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7956135683583841


In [467]:
# Refit the study's best configuration on the training split only (validation rows are not
# added back in).
xgb_optuna_35 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_35.fit(X_train, y_train)

In [468]:
# AUC on the held-out test split, printed to three decimals; quantize() follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
xgb_optuna_proba_35 = xgb_optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, xgb_optuna_proba_35)
print(decimal.Decimal(auc_35).quantize(decimal.Decimal('1.000')))

0.796


In [469]:
# bootstrap_auc() selects rows positionally (X_train[idx]), which needs NumPy arrays rather
# than pandas objects. np.asarray leaves an existing array as is, so this cell can be re-run
# safely (.values would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [470]:
auc_bootstrap = []

In [471]:
# bootstrap_auc() draws its resamples from the module-level `rs` and appends every AUC to the
# module-level `auc_bootstrap` list, so both must be prepared before this call.
rs = RandomState(seed = 35)
# Pass an unfitted clone: bootstrap_auc() refits the estimator it receives, which would
# otherwise leave xgb_optuna_35 holding the fit from the last resample instead of the model
# scored on the test split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_35), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78515745, 0.79352818])

In [472]:
np.mean(auc_bootstrap)

0.7894295483027011

In [473]:
# Keep an independent copy of this model's bootstrap AUCs: bootstrap_auc() appends to the
# shared auc_bootstrap list, so a plain alias would change if that list were extended again.
t_35 = list(auc_bootstrap)
print(t_35)

[0.786458432394393, 0.7907471158702686, 0.7860381493386419, 0.7877303764003271, 0.7895694779931233, 0.7838247290217732, 0.7891675542537613, 0.7901337297889023, 0.7903178512784423, 0.788858879991885, 0.7915794929341727, 0.7873240939743402, 0.7906356391578067, 0.7877258856322895, 0.7876002762086507, 0.7878196634354271, 0.787637391085667, 0.7849470247869264, 0.7861382670495971, 0.7904890287895214, 0.7890210759668888, 0.7894931349364848, 0.7886783247004922, 0.7865923629470427, 0.7962981014089389, 0.7884997506302925, 0.7915283774273922, 0.7869159624085732, 0.790336474757657, 0.7889339022343947, 0.7898521322166643, 0.7900222530764403, 0.7901730900499373, 0.7935044474453341, 0.7915123955764349, 0.7921022711663107, 0.7894833609119323, 0.78563899930895, 0.7863308417495609, 0.7895517790837987, 0.7927843395823692, 0.7925679902280887, 0.7888538608981959, 0.7911940793714193, 0.7936283398106059, 0.785072898373391, 0.7872021828302618, 0.7867396337223923, 0.7922453153364484, 0.7869928337908634, 0.7859

In [474]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [86]:
# 36
column_to_drop_35 = '자산 중 금융자산의 비중'

In [87]:
# Build the next reduced dataset (comp_36) by removing this round's feature from comp_35.
if column_to_drop_35.startswith('Cat_'):
    # A 'Cat_' name is a prefix that may cover several columns (presumably the dummy columns
    # of one encoded categorical — confirm upstream). Match it literally rather than as a
    # regex, so characters such as '(' or '.' in a name cannot change which columns are hit.
    prefixed_cols = [col for col in comp_35.columns if str(col).startswith(column_to_drop_35)]
    comp_36 = comp_35.drop(columns=prefixed_cols)
else:
    # Plain feature: drop exactly that column (KeyError if it is missing).
    comp_36 = comp_35.drop(columns=[column_to_drop_35])

# Features and label are derived the same way in both cases.
X_36 = comp_36.drop(columns='target')
y_36 = comp_36['target']

print(X_36.shape)

(19949, 97)


In [477]:
# Two-stage stratified split with a fixed seed: 20% of the rows become the test set, then 20%
# of the remainder becomes the validation set (64% / 16% / 20% overall). The second call
# rebinds X_train/y_train, so always run both lines together.
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [478]:
def objective(trial):
    """Return the validation ROC AUC of an XGBoost model for one sampled configuration.

    Fits on the module-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # The suggest calls keep their original order so a seeded sampler draws the same values.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'gamma': trial.suggest_int('gamma', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still end up in the trial's params.
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
    }

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so this pruner
# probably has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it halts the study after
# 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [479]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7959417272384666


In [480]:
# Refit the study's best configuration on the training split only (validation rows are not
# added back in).
xgb_optuna_36 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_36.fit(X_train, y_train)

In [481]:
# AUC on the held-out test split, printed to three decimals; quantize() follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
xgb_optuna_proba_36 = xgb_optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, xgb_optuna_proba_36)
print(decimal.Decimal(auc_36).quantize(decimal.Decimal('1.000')))

0.796


In [482]:
# bootstrap_auc() selects rows positionally (X_train[idx]), which needs NumPy arrays rather
# than pandas objects. np.asarray leaves an existing array as is, so this cell can be re-run
# safely (.values would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [483]:
auc_bootstrap = []

In [484]:
# bootstrap_auc() draws its resamples from the module-level `rs` and appends every AUC to the
# module-level `auc_bootstrap` list, so both must be prepared before this call.
rs = RandomState(seed = 36)
# Pass an unfitted clone: bootstrap_auc() refits the estimator it receives, which would
# otherwise leave xgb_optuna_36 holding the fit from the last resample instead of the model
# scored on the test split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_36), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78516054, 0.79386904])

In [485]:
np.mean(auc_bootstrap)

0.7896933414325656

In [486]:
# Keep an independent copy of this model's bootstrap AUCs: bootstrap_auc() appends to the
# shared auc_bootstrap list, so a plain alias would change if that list were extended again.
t_36 = list(auc_bootstrap)
print(t_36)

[0.7924179457430689, 0.7881118275206945, 0.7871885784447361, 0.7889336380715691, 0.7906306200641177, 0.7929890657723171, 0.7916027392628376, 0.7907938726904244, 0.791895167510931, 0.7891885551984074, 0.7856596040093576, 0.7894844175632354, 0.7864567153360258, 0.788204548672529, 0.791698894531407, 0.79208153438449, 0.7866705551434615, 0.786952681041351, 0.7866391197671987, 0.7893375430321243, 0.7891670259281098, 0.7899540990673996, 0.7907140955170512, 0.7920189277947898, 0.7868369777236772, 0.7905627302179026, 0.7851159569139865, 0.7924324746984845, 0.7875518023301276, 0.7897414479926796, 0.7914386941480538, 0.7895356651514286, 0.7902359608024632, 0.7912831022436934, 0.7890828900681117, 0.785011084272168, 0.7894838892375837, 0.7852044514606091, 0.7857916854222273, 0.789614385673499, 0.7888883341469549, 0.7915837195393846, 0.7921401785318042, 0.7894943236692005, 0.7888174064282438, 0.7827815500229294, 0.7881699433423572, 0.7876173147109107, 0.7841388186215772, 0.790793344364773, 0.782591

In [487]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [88]:
# 37
column_to_drop_36 = '소득 대비 주거관리비의 비율'

In [89]:
# Build the next reduced dataset (comp_37) by removing this round's feature from comp_36.
if column_to_drop_36.startswith('Cat_'):
    # A 'Cat_' name is a prefix that may cover several columns (presumably the dummy columns
    # of one encoded categorical — confirm upstream). Match it literally rather than as a
    # regex, so characters such as '(' or '.' in a name cannot change which columns are hit.
    prefixed_cols = [col for col in comp_36.columns if str(col).startswith(column_to_drop_36)]
    comp_37 = comp_36.drop(columns=prefixed_cols)
else:
    # Plain feature: drop exactly that column (KeyError if it is missing).
    comp_37 = comp_36.drop(columns=[column_to_drop_36])

# Features and label are derived the same way in both cases.
X_37 = comp_37.drop(columns='target')
y_37 = comp_37['target']

print(X_37.shape)

(19949, 96)


In [490]:
# Two-stage stratified split with a fixed seed: 20% of the rows become the test set, then 20%
# of the remainder becomes the validation set (64% / 16% / 20% overall). The second call
# rebinds X_train/y_train, so always run both lines together.
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [491]:
def objective(trial):
    """Return the validation ROC AUC of an XGBoost model for one sampled configuration.

    Fits on the module-level ``X_train``/``y_train`` and scores the positive-class
    probabilities predicted for ``X_val`` against ``y_val``.
    """
    # The suggest calls keep their original order so a seeded sampler draws the same values.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'gamma': trial.suggest_int('gamma', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed settings that still end up in the trial's params.
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
    }

    model = XGBClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so this pruner
# probably has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it halts the study after
# 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [492]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7955648604994536


In [493]:
# Refit the study's best configuration on the training split only (validation rows are not
# added back in).
xgb_optuna_37 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_37.fit(X_train, y_train)

In [494]:
# AUC on the held-out test split, printed to three decimals; quantize() follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
xgb_optuna_proba_37 = xgb_optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, xgb_optuna_proba_37)
print(decimal.Decimal(auc_37).quantize(decimal.Decimal('1.000')))

0.796


In [495]:
# bootstrap_auc() selects rows positionally (X_train[idx]), which needs NumPy arrays rather
# than pandas objects. np.asarray leaves an existing array as is, so this cell can be re-run
# safely (.values would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [496]:
auc_bootstrap = []

In [497]:
# bootstrap_auc() draws its resamples from the module-level `rs` and appends every AUC to the
# module-level `auc_bootstrap` list, so both must be prepared before this call.
rs = RandomState(seed = 37)
# Pass an unfitted clone: bootstrap_auc() refits the estimator it receives, which would
# otherwise leave xgb_optuna_37 holding the fit from the last resample instead of the model
# scored on the test split.
bootstrap_auc(sklearn.base.clone(xgb_optuna_37), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78535193, 0.79418835])

In [498]:
np.mean(auc_bootstrap)

0.7899084607126479

In [499]:
# Keep an independent copy of this model's bootstrap AUCs: bootstrap_auc() appends to the
# shared auc_bootstrap list, so a plain alias would change if that list were extended again.
t_37 = list(auc_bootstrap)
print(t_37)

[0.7905088410014518, 0.7902886612861982, 0.7908451202786179, 0.7915205846240329, 0.7913578603233776, 0.7920082292003474, 0.7934901826527441, 0.790843799464489, 0.7877180928289302, 0.7885797919664915, 0.789673029820813, 0.7875948608707231, 0.7875521985743661, 0.7928575126850989, 0.7879987658312781, 0.7893935455511809, 0.7854169704539162, 0.7904541592965239, 0.7907354927059359, 0.7885177137024427, 0.7860368285245133, 0.7894355474404735, 0.7896373678393382, 0.7924884772175411, 0.7925236108733646, 0.7926969016870495, 0.7906361674834581, 0.7890017920806098, 0.7888839754603301, 0.788539903379805, 0.7893528644760173, 0.7903027939973754, 0.7926937317331405, 0.7876466367845678, 0.7853896296014522, 0.791658213456243, 0.7895655155507372, 0.7884668623584881, 0.7912381945633176, 0.7925388002358447, 0.790254452200265, 0.7882885524511141, 0.7876244471072058, 0.7884510125889437, 0.7906837167920912, 0.7899403626004611, 0.7887901976571927, 0.7893778278630496, 0.7913673701851042, 0.7912577426124224, 0.78

In [500]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [90]:
# 38
column_to_drop_37 = '소득 대비 생활비의 비율'

In [91]:
# Build the next reduced dataset (comp_38) by removing this round's feature from comp_37.
if column_to_drop_37.startswith('Cat_'):
    # A 'Cat_' name is a prefix that may cover several columns (presumably the dummy columns
    # of one encoded categorical — confirm upstream). Match it literally rather than as a
    # regex, so characters such as '(' or '.' in a name cannot change which columns are hit.
    prefixed_cols = [col for col in comp_37.columns if str(col).startswith(column_to_drop_37)]
    comp_38 = comp_37.drop(columns=prefixed_cols)
else:
    # Plain feature: drop exactly that column (KeyError if it is missing).
    comp_38 = comp_37.drop(columns=[column_to_drop_37])

# Features and label are derived the same way in both cases.
X_38 = comp_38.drop(columns='target')
y_38 = comp_38['target']

print(X_38.shape)

(19949, 95)


In [503]:
# Two-stage stratified split with a fixed seed: 20% of the rows become the test set, then 20%
# of the remainder becomes the validation set (64% / 16% / 20% overall). The second call
# rebinds X_train/y_train, so always run both lines together.
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [504]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of an XGBoost classifier.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    that are bound at the time the trial runs.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object from which the hyperparameters are suggested.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on ``X_val``.
    """
    # Suggestions are made in the same fixed order on every call.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: the value is fixed but still part of the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    # The second predict_proba column is used as the positive-class score.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search that maximises the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)  # fixed seed for the sampler
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on — confirm whether it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the search after 10 trials without improvement — confirm
# against the definition of EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [505]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7947071068481186


In [506]:
xgb_optuna_38 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_38.fit(X_train, y_train)

In [507]:
xgb_optuna_proba_38 = xgb_optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, xgb_optuna_proba_38)
print(decimal.Decimal(auc_38).quantize(decimal.Decimal('1.000')))

0.797


In [508]:
X_train = X_train.values
y_train = y_train.values

In [509]:
auc_bootstrap = []

In [510]:
rs = RandomState(seed = 38)
bootstrap_auc(xgb_optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78704087, 0.79538603])

In [511]:
np.mean(auc_bootstrap)

0.7912301619661533

In [512]:
t_38 = auc_bootstrap
print(t_38)

[0.7944276965212925, 0.786082528693366, 0.7985002948057135, 0.7926770894751191, 0.7936223961470267, 0.7917144801381254, 0.7916753840399161, 0.7916584776190687, 0.7906551872069114, 0.7934385388203122, 0.7890390390390392, 0.7905875615235222, 0.7907214920761718, 0.7882293799781483, 0.7907510783126546, 0.7898708877772915, 0.790224073475305, 0.7917076119046562, 0.7958187779616351, 0.7929800842362419, 0.7918502598305553, 0.7885672442322689, 0.7904659145422692, 0.7917708789014207, 0.7911306802932419, 0.7892495768111532, 0.7884901086871531, 0.7923677548061785, 0.7903643439357725, 0.7904760848110602, 0.7899863269321398, 0.7912245901777922, 0.792009814177302, 0.7941656469981594, 0.7926036522095635, 0.7934825219307977, 0.7873236977301017, 0.7926401066795156, 0.790528917376208, 0.793721193043853, 0.791027392628378, 0.7884547108685038, 0.7882563245863738, 0.7894112444605057, 0.7904743677526929, 0.7870428926463409, 0.7908472335812239, 0.790119729159138, 0.7894414911040526, 0.7929106094130725, 0.7909

In [513]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [92]:
# 39
column_to_drop_38 = '총 이사 횟수'

In [93]:
# Build the next feature set: a plain name drops that single column, while a
# 'Cat_' name is treated as a prefix and drops every column that starts with it.
if not column_to_drop_38.startswith('Cat_'):
    comp_39 = comp_38.drop(column_to_drop_38, axis=1)
else:
    # Literal prefix match instead of an unescaped regex, so characters such as
    # '(' or '.' in a column name cannot change which columns are selected.
    comp_39 = comp_38.drop(
        columns=[col for col in comp_38.columns if str(col).startswith(column_to_drop_38)]
    )

# The feature/target separation is the same for both branches.
X_39 = comp_39.drop('target', axis=1)
y_39 = comp_39['target']

print(X_39.shape)

(19949, 94)


In [93]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the
# remainder becomes the validation set (64% train / 16% validation / 20% test).
X_train, X_test, y_train, y_test = train_test_split(
    X_39, y_39,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_39,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [97]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of an XGBoost classifier.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    that are bound at the time the trial runs.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object from which the hyperparameters are suggested.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on ``X_val``.
    """
    # Suggestions are made in the same fixed order on every call.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: the value is fixed but still part of the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    # The second predict_proba column is used as the positive-class score.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search that maximises the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)  # fixed seed for the sampler
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on — confirm whether it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the search after 10 trials without improvement — confirm
# against the definition of EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [98]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7935901283204416


In [99]:
xgb_optuna_39 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_39.fit(X_train, y_train)

In [100]:
xgb_optuna_proba_39 = xgb_optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, xgb_optuna_proba_39)
print(decimal.Decimal(auc_39).quantize(decimal.Decimal('1.000')))

0.797


In [101]:
X_train = X_train.values
y_train = y_train.values

In [102]:
auc_bootstrap = []

In [103]:
rs = RandomState(seed = 39)
bootstrap_auc(xgb_optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78739506, 0.79542462])

In [104]:
np.mean(auc_bootstrap)

0.7914171703591346

In [105]:
t_39 = auc_bootstrap
print(t_39)

[0.7928863064331044, 0.7873999087053274, 0.7931334307565835, 0.7915298303229338, 0.7905933731056883, 0.7905252190966476, 0.7935366753100743, 0.7915578315824622, 0.7913542941252301, 0.7928823439907183, 0.791591380261331, 0.7918103712438687, 0.7908422144875347, 0.7903727971461961, 0.7940948513608612, 0.7909996555316752, 0.792427983930447, 0.7888063115895627, 0.7890447185397924, 0.7919332069578375, 0.7915793608527597, 0.7930477099196311, 0.7889217507444108, 0.7898032620939025, 0.7934011597804702, 0.7930035947277326, 0.7932003960329084, 0.7948252615740299, 0.7896106873939386, 0.7897687888451436, 0.791646061966259, 0.7885973587944031, 0.7919207913050277, 0.7912337037952801, 0.7887061938786077, 0.7870410435065607, 0.7894171881240846, 0.7882050769981805, 0.7902603958638441, 0.7921487638236406, 0.7926300684921375, 0.7927706031154307, 0.7908554226288216, 0.7893640913961111, 0.7906398657630185, 0.7904044966852849, 0.7932845318929064, 0.7908157982049608, 0.7932464924459999, 0.7907638902097031, 0.

In [106]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [94]:
# 40
column_to_drop_39 = '현재 주택 거주 기간(총 개월)'

In [95]:
# Build the next feature set: a plain name drops that single column, while a
# 'Cat_' name is treated as a prefix and drops every column that starts with it.
if not column_to_drop_39.startswith('Cat_'):
    comp_40 = comp_39.drop(column_to_drop_39, axis=1)
else:
    # Literal prefix match instead of an unescaped regex, so characters such as
    # '(' or '.' in a column name cannot change which columns are selected.
    comp_40 = comp_39.drop(
        columns=[col for col in comp_39.columns if str(col).startswith(column_to_drop_39)]
    )

# The feature/target separation is the same for both branches.
X_40 = comp_40.drop('target', axis=1)
y_40 = comp_40['target']

print(X_40.shape)

(19949, 93)


In [109]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the
# remainder becomes the validation set (64% train / 16% validation / 20% test).
X_train, X_test, y_train, y_test = train_test_split(
    X_40, y_40,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_40,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [110]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of an XGBoost classifier.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    that are bound at the time the trial runs.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object from which the hyperparameters are suggested.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on ``X_val``.
    """
    # Suggestions are made in the same fixed order on every call.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: the value is fixed but still part of the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    # The second predict_proba column is used as the positive-class score.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search that maximises the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)  # fixed seed for the sampler
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on — confirm whether it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the search after 10 trials without improvement — confirm
# against the definition of EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [111]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7941948487724174


In [112]:
xgb_optuna_40 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_40.fit(X_train, y_train)

In [113]:
xgb_optuna_proba_40 = xgb_optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, xgb_optuna_proba_40)
print(decimal.Decimal(auc_40).quantize(decimal.Decimal('1.000')))

0.796


In [114]:
X_train = X_train.values
y_train = y_train.values

In [115]:
auc_bootstrap = []

In [116]:
rs = RandomState(seed = 40)
bootstrap_auc(xgb_optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78742798, 0.79535301])

In [117]:
np.mean(auc_bootstrap)

0.7914209527085143

In [118]:
t_40 = auc_bootstrap
print(t_40)

[0.7924018318106989, 0.7897686567637308, 0.7920741378253694, 0.7933155710249307, 0.7917983518352977, 0.7945074736946658, 0.7926765611494675, 0.790077331025607, 0.7934122546191512, 0.7917253108139808, 0.7909828811922407, 0.7933784417774565, 0.7923841329013742, 0.7922190311352872, 0.7913034427812753, 0.7907571540576467, 0.7941908745480175, 0.7906537343113698, 0.7897999600585808, 0.7913758233955279, 0.7933088348728742, 0.7919860395229853, 0.7852712846555211, 0.7922696183164164, 0.7893363542994086, 0.7906122607477287, 0.7934726158248326, 0.7914215235643808, 0.7888842396231559, 0.789152364891281, 0.7887612718277742, 0.7896518967947539, 0.7927021849435643, 0.7969019776285786, 0.7908914808545351, 0.7894717377475998, 0.7890678327870446, 0.7907035290040216, 0.7922697503978292, 0.788566055499553, 0.7928394175315359, 0.7948248653297915, 0.7893758466418566, 0.7928050763641897, 0.7936902859932415, 0.7888077644851044, 0.7888179347538954, 0.7903346256178768, 0.7876358061087125, 0.7909698051323667, 0.

In [119]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [96]:
# 41.
column_to_drop_40 = '중기부채부담지표'

In [97]:
# Build the next feature set: a plain name drops that single column, while a
# 'Cat_' name is treated as a prefix and drops every column that starts with it.
if not column_to_drop_40.startswith('Cat_'):
    comp_41 = comp_40.drop(column_to_drop_40, axis=1)
else:
    # Literal prefix match instead of an unescaped regex, so characters such as
    # '(' or '.' in a column name cannot change which columns are selected.
    comp_41 = comp_40.drop(
        columns=[col for col in comp_40.columns if str(col).startswith(column_to_drop_40)]
    )

# The feature/target separation is the same for both branches.
X_41 = comp_41.drop('target', axis=1)
y_41 = comp_41['target']

print(X_41.shape)

(19949, 92)


In [122]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the
# remainder becomes the validation set (64% train / 16% validation / 20% test).
X_train, X_test, y_train, y_test = train_test_split(
    X_41, y_41,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_41,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [123]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of an XGBoost classifier.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    that are bound at the time the trial runs.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object from which the hyperparameters are suggested.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on ``X_val``.
    """
    # Suggestions are made in the same fixed order on every call.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: the value is fixed but still part of the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    # The second predict_proba column is used as the positive-class score.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search that maximises the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)  # fixed seed for the sampler
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on — confirm whether it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the search after 10 trials without improvement — confirm
# against the definition of EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [124]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7940693641189017


In [125]:
xgb_optuna_41 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_41.fit(X_train, y_train)

In [126]:
xgb_optuna_proba_41 = xgb_optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, xgb_optuna_proba_41)
print(decimal.Decimal(auc_41).quantize(decimal.Decimal('1.000')))

0.796


In [127]:
X_train = X_train.values
y_train = y_train.values

In [128]:
auc_bootstrap = []

In [129]:
rs = RandomState(seed = 41)
bootstrap_auc(xgb_optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7877073 , 0.79562319])

In [130]:
np.mean(auc_bootstrap)

0.7916891170727378

In [131]:
t_41 = auc_bootstrap
print(t_41)

[0.7933702527298585, 0.7936353401254879, 0.7920059838163287, 0.7924191344757847, 0.7899424759030671, 0.7944699625734108, 0.7907913631435799, 0.7936554165002443, 0.7930091421470731, 0.7947497110058686, 0.7897839782076236, 0.7907226808088876, 0.7924155682776374, 0.7902253942894337, 0.7921883882475015, 0.7926844860342397, 0.7885959058988615, 0.7883849718825088, 0.7924532114803051, 0.7921234041923697, 0.7896513684691024, 0.7884341061680964, 0.7930067646816416, 0.7935290145881279, 0.7926408991679927, 0.7924118699980769, 0.7903095301494317, 0.7940948513608612, 0.7922467682319899, 0.7904694807404167, 0.7890325670498084, 0.7931203546967094, 0.790163316025385, 0.7908699515842373, 0.7930385963021432, 0.7947018975344099, 0.7949214168425992, 0.7960253532913631, 0.791470922012794, 0.7940494153548341, 0.7936161883206216, 0.7914958853998262, 0.7890799842770286, 0.7937882904015909, 0.7913451805077423, 0.7927410168789479, 0.791108226453054, 0.7914174290405817, 0.7937878941573523, 0.7907270394955124, 0.

In [132]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [133]:
# 42.
column_to_drop_41 = 'Cat_현재 주택의 유형'

In [134]:
# Build the next feature set: a plain name drops that single column, while a
# 'Cat_' name is treated as a prefix and drops every column that starts with it.
if not column_to_drop_41.startswith('Cat_'):
    comp_42 = comp_41.drop(column_to_drop_41, axis=1)
else:
    # Literal prefix match instead of an unescaped regex, so characters such as
    # '(' or '.' in a column name cannot change which columns are selected.
    comp_42 = comp_41.drop(
        columns=[col for col in comp_41.columns if str(col).startswith(column_to_drop_41)]
    )

# The feature/target separation is the same for both branches.
X_42 = comp_42.drop('target', axis=1)
y_42 = comp_42['target']

print(X_42.shape)

(19949, 81)


In [135]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the
# remainder becomes the validation set (64% train / 16% validation / 20% test).
X_train, X_test, y_train, y_test = train_test_split(
    X_42, y_42,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [136]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of an XGBoost classifier.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    that are bound at the time the trial runs.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object from which the hyperparameters are suggested.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on ``X_val``.
    """
    # Suggestions are made in the same fixed order on every call.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: the value is fixed but still part of the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    # The second predict_proba column is used as the positive-class score.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search that maximises the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)  # fixed seed for the sampler
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on — confirm whether it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the search after 10 trials without improvement — confirm
# against the definition of EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [137]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7938237609319216


In [138]:
xgb_optuna_42 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_42.fit(X_train, y_train)

In [139]:
xgb_optuna_proba_42 = xgb_optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, xgb_optuna_proba_42)
print(decimal.Decimal(auc_42).quantize(decimal.Decimal('1.000')))

0.794


In [140]:
X_train = X_train.values
y_train = y_train.values

In [141]:
auc_bootstrap = []

In [142]:
rs = RandomState(seed = 42)
bootstrap_auc(xgb_optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78653106, 0.79446285])

In [143]:
np.mean(auc_bootstrap)

0.7905233819102354

In [144]:
t_42 = auc_bootstrap
print(t_42)

[0.7901860340283986, 0.7914556005689011, 0.7886985331566613, 0.7865310771714713, 0.7902825855412062, 0.7919763975798458, 0.7896146498363246, 0.7907415684509282, 0.7908975565995271, 0.7936793232359735, 0.7918301834557993, 0.7871854084908273, 0.789706578499682, 0.7899797228614962, 0.7950236478561601, 0.7946892177187743, 0.7903438713167777, 0.7896798980542823, 0.7948294881792419, 0.7918042954988768, 0.7956761300357359, 0.7910913200322067, 0.7903549661554587, 0.7936636055478421, 0.7892324062274803, 0.7842077651190952, 0.7884058407457422, 0.7873452270003993, 0.7936618884894747, 0.7910073162536215, 0.7919375656444623, 0.7906986419917454, 0.7918307117814507, 0.7900383670088104, 0.7903745142045634, 0.7902129786366239, 0.7878859683046875, 0.7910527522596488, 0.789783449881972, 0.7920154936780552, 0.7892236888542307, 0.7912302696785456, 0.7903424184212362, 0.7891204011893667, 0.7922050305055232, 0.7851957340873597, 0.7921746517805631, 0.788686645829503, 0.794409073042078, 0.7904906137664759, 0.7

In [145]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [146]:
# 43.
column_to_drop_42 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [147]:
# Build the next feature set: a plain name drops that single column, while a
# 'Cat_' name is treated as a prefix and drops every column that starts with it.
if not column_to_drop_42.startswith('Cat_'):
    comp_43 = comp_42.drop(column_to_drop_42, axis=1)
else:
    # Literal prefix match instead of an unescaped regex, so characters such as
    # '(' or '.' in a column name cannot change which columns are selected.
    comp_43 = comp_42.drop(
        columns=[col for col in comp_42.columns if str(col).startswith(column_to_drop_42)]
    )

# The feature/target separation is the same for both branches.
X_43 = comp_43.drop('target', axis=1)
y_43 = comp_43['target']

print(X_43.shape)

(19949, 77)


In [148]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the
# remainder becomes the validation set (64% train / 16% validation / 20% test).
X_train, X_test, y_train, y_test = train_test_split(
    X_43, y_43,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_43,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [149]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of an XGBoost classifier.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    that are bound at the time the trial runs.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object from which the hyperparameters are suggested.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on ``X_val``.
    """
    # Suggestions are made in the same fixed order on every call.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: the value is fixed but still part of the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    # The second predict_proba column is used as the positive-class score.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search that maximises the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)  # fixed seed for the sampler
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on — confirm whether it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the search after 10 trials without improvement — confirm
# against the definition of EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])


In [150]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7923685104517572


In [151]:
xgb_optuna_43 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_43.fit(X_train, y_train)

In [152]:
xgb_optuna_proba_43 = xgb_optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, xgb_optuna_proba_43)
print(decimal.Decimal(auc_43).quantize(decimal.Decimal('1.000')))

0.794


In [153]:
X_train = X_train.values
y_train = y_train.values

In [154]:
auc_bootstrap = []

In [155]:
rs = RandomState(seed = 43)
bootstrap_auc(xgb_optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78607473, 0.79388423])

In [156]:
np.mean(auc_bootstrap)

0.790198997753031

In [157]:
t_43 = auc_bootstrap
print(t_43)

[0.7909560686654282, 0.7897752608343741, 0.7899078705728951, 0.7911839091026283, 0.7950396297071174, 0.7870097402117107, 0.7894846817260611, 0.7918806385555154, 0.7880867320522493, 0.7922904871796498, 0.7923167713808108, 0.7915257357991348, 0.7884119164907342, 0.7954032498367474, 0.7922207481936546, 0.7880278237421096, 0.7859536172344054, 0.7910929050091612, 0.7906335258552007, 0.7869246797818226, 0.7903606456562121, 0.7929386106726009, 0.7919588307519343, 0.7895552132005335, 0.7880510700707746, 0.7886647203149666, 0.7864524887308137, 0.7900654436984487, 0.7906969249333782, 0.7892107448757695, 0.7893506190919986, 0.7932805694505202, 0.7924084358813421, 0.7906846413619812, 0.7901351826844438, 0.7891098346763371, 0.7922932608893198, 0.7902902462631525, 0.7902363570467019, 0.7897847706961006, 0.7897905822782669, 0.7910919804392711, 0.7854071964293639, 0.7861047183707282, 0.7857977611672192, 0.7905538807632403, 0.7907418326137537, 0.7886324924502266, 0.792794641932573, 0.7885434695779523, 

In [158]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [159]:
## 44
column_to_drop_43 = 'Cat_현재 주택의 점유형태'

In [160]:
# Build the next feature set: a plain name drops that single column, while a
# 'Cat_' name is treated as a prefix and drops every column that starts with it.
if not column_to_drop_43.startswith('Cat_'):
    comp_44 = comp_43.drop(column_to_drop_43, axis=1)
else:
    # Literal prefix match instead of an unescaped regex, so characters such as
    # '(' or '.' in a column name cannot change which columns are selected.
    comp_44 = comp_43.drop(
        columns=[col for col in comp_43.columns if str(col).startswith(column_to_drop_43)]
    )

# The feature/target separation is the same for both branches.
X_44 = comp_44.drop('target', axis=1)
y_44 = comp_44['target']

print(X_44.shape)

(19949, 73)


In [161]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the
# remainder becomes the validation set (64% train / 16% validation / 20% test).
X_train, X_test, y_train, y_test = train_test_split(
    X_44, y_44,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_44,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [162]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of an XGBoost classifier.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    that are bound at the time the trial runs.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object from which the hyperparameters are suggested.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on ``X_val``.
    """
    # Suggestions are made in the same fixed order on every call.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: the value is fixed but still part of the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    # The second predict_proba column is used as the positive-class score.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search that maximises the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)  # fixed seed for the sampler
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on — confirm whether it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the search after 10 trials without improvement — confirm
# against the definition of EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [163]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7921458164696131


In [164]:
xgb_optuna_44 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_44.fit(X_train, y_train)

In [165]:
xgb_optuna_proba_44 = xgb_optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, xgb_optuna_proba_44)
print(decimal.Decimal(auc_44).quantize(decimal.Decimal('1.000')))

0.795


In [166]:
X_train = X_train.values
y_train = y_train.values

In [167]:
auc_bootstrap = []

In [168]:
rs = RandomState(seed = 44)
bootstrap_auc(xgb_optuna_44, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7862324 , 0.79400846])

In [169]:
np.mean(auc_bootstrap)

0.7902299196628014

In [170]:
t_44 = auc_bootstrap
print(t_44)

[0.7919180175953575, 0.7866187792296166, 0.7899744396049815, 0.7909201425211277, 0.7926361442371295, 0.7903857411246574, 0.7926752403353389, 0.7889143541852901, 0.789025038409275, 0.7897215036993361, 0.7883189311760741, 0.7910194677436058, 0.793949165562466, 0.791653194362554, 0.7895980075783031, 0.7885152041555983, 0.7899646655804291, 0.7904615558556445, 0.7904766131367116, 0.7890813050911573, 0.7888287654297506, 0.7913932581420267, 0.7852290186034028, 0.7931178451498648, 0.790531030678814, 0.7896940307654594, 0.7868858478464389, 0.7895108338458091, 0.7889242602912554, 0.7925211013265201, 0.791113509709569, 0.7944290173354213, 0.7919556607980254, 0.7888712956446946, 0.7885001468745312, 0.7867883717637412, 0.7923503200596798, 0.7899797228614962, 0.7934814652794948, 0.7902845667623992, 0.7885046376425686, 0.7883382150623529, 0.79014006969672, 0.7911885319520787, 0.7902779626917558, 0.7933619316008478, 0.7910346571060858, 0.7922858643301993, 0.7930804661100227, 0.7922421453825395, 0.7859

In [171]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [172]:
# 45
column_to_drop_44 = 'Cat_주택 보유 의식'

In [173]:
# Build the next feature set: a plain name drops that single column, while a
# 'Cat_' name is treated as a prefix and drops every column that starts with it.
if not column_to_drop_44.startswith('Cat_'):
    comp_45 = comp_44.drop(column_to_drop_44, axis=1)
else:
    # Literal prefix match instead of an unescaped regex, so characters such as
    # '(' or '.' in a column name cannot change which columns are selected.
    comp_45 = comp_44.drop(
        columns=[col for col in comp_44.columns if str(col).startswith(column_to_drop_44)]
    )

# The feature/target separation is the same for both branches.
X_45 = comp_45.drop('target', axis=1)
y_45 = comp_45['target']

print(X_45.shape)

(19949, 71)


In [174]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the
# remainder becomes the validation set (64% train / 16% validation / 20% test).
X_train, X_test, y_train, y_test = train_test_split(
    X_45, y_45,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_45,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [175]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of an XGBoost classifier.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    that are bound at the time the trial runs.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object from which the hyperparameters are suggested.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on ``X_val``.
    """
    # Suggestions are made in the same fixed order on every call.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        max_leaves=trial.suggest_int('max_leaves', 2, 1024, step=2),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        gamma=trial.suggest_int('gamma', 1, 10),
        reg_alpha=trial.suggest_int('reg_alpha', 1, 10),
        reg_lambda=trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: the value is fixed but still part of the trial params.
        booster=trial.suggest_categorical('booster', ['gbtree']),
        objective=trial.suggest_categorical('objective', ['binary:logistic']),
    )

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)

    # The second predict_proba column is used as the positive-class score.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Hyperparameter search that maximises the value returned by objective().
direction = "maximize"
sampler = TPESampler(seed=10)  # fixed seed for the sampler
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on — confirm whether it is needed.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the search after 10 trials without improvement — confirm
# against the definition of EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [176]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7903999699497277


In [177]:
xgb_optuna_45 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_45.fit(X_train, y_train)

In [178]:
xgb_optuna_proba_45 = xgb_optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, xgb_optuna_proba_45)
print(decimal.Decimal(auc_45).quantize(decimal.Decimal('1.000')))

0.793


In [179]:
X_train = X_train.values
y_train = y_train.values

In [180]:
auc_bootstrap = []

In [181]:
rs = RandomState(seed = 45)
bootstrap_auc(xgb_optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78514365, 0.79276086])

In [182]:
np.mean(auc_bootstrap)

0.7889511385946115

In [183]:
t_45 = auc_bootstrap
print(t_45)

[0.7887735553991712, 0.7892983148525021, 0.789272030651341, 0.786508095005632, 0.7894257734159211, 0.7863223885391373, 0.7885525831954403, 0.7876414856094659, 0.7899373247279652, 0.7910668849708259, 0.7886257562981701, 0.7902967182523832, 0.791160002366899, 0.7855122011525952, 0.7906536022299568, 0.7912152123974785, 0.7859516360132124, 0.7879932184119376, 0.7900583113021536, 0.7888380111286516, 0.7905685418000687, 0.7878827983507787, 0.7900978036446017, 0.7880177855547313, 0.7870220237831076, 0.7891270052600101, 0.79106477166822, 0.7873902667621879, 0.7888712956446946, 0.7884393894246111, 0.7885076755150647, 0.7904190256407004, 0.7890139435705937, 0.7881361305006627, 0.7859326162897592, 0.7910147128127423, 0.7884956561064935, 0.7880245217067877, 0.7849531005319182, 0.7887592906065812, 0.7890662478100903, 0.7878909873983766, 0.7862547628557481, 0.7916214948234652, 0.7901884114938302, 0.7852234711840623, 0.7881486782348852, 0.7858236491241417, 0.7905483333438998, 0.7889662621805479, 0.78

In [184]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [185]:
# 46.
column_to_drop_45 = '가구주 나이'

In [186]:
# Build the next feature set: a plain name drops that single column, while a
# 'Cat_' name is treated as a prefix and drops every column that starts with it.
if not column_to_drop_45.startswith('Cat_'):
    comp_46 = comp_45.drop(column_to_drop_45, axis=1)
else:
    # Literal prefix match instead of an unescaped regex, so characters such as
    # '(' or '.' in a column name cannot change which columns are selected.
    comp_46 = comp_45.drop(
        columns=[col for col in comp_45.columns if str(col).startswith(column_to_drop_45)]
    )

# The feature/target separation is the same for both branches.
X_46 = comp_46.drop('target', axis=1)
y_46 = comp_46['target']

print(X_46.shape)

(19949, 70)


In [187]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the
# remainder becomes the validation set (64% train / 16% validation / 20% test).
X_train, X_test, y_train, y_test = train_test_split(
    X_46, y_46,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_46,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [188]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a set of XGBoost hyper-parameters from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and
    evaluates its positive-class probabilities on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Parameters are suggested in a fixed order: tree size and learning rate,
    # row/column sampling fractions, regularisation terms, then the two
    # single-choice settings.
    sampled = {}
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    sampled['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    sampled['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    for fraction in ('subsample', 'colsample_bytree'):
        sampled[fraction] = trial.suggest_float(fraction, 0.1, 1.0, step=0.1)
    for penalty in ('gamma', 'reg_alpha', 'reg_lambda'):
        sampled[penalty] = trial.suggest_int(penalty, 1, 10)
    for fixed_key, only_choice in (('booster', 'gbtree'), ('objective', 'binary:logistic')):
        sampled[fixed_key] = trial.suggest_categorical(fixed_key, [only_choice])

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# The study maximises the validation AUC returned by objective().
direction = "maximize"
# Seeded sampler so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() in this cell never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner has no intermediate values to act on — confirm pruning is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [189]:
# Best hyper-parameters found by the study and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.78962662949459


In [190]:
# Refit on the training split only, using the best parameters from the study.
xgb_optuna_46 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_46.fit(X_train, y_train)

In [191]:
# Test-set ROC AUC computed from the positive-class probabilities.
xgb_optuna_proba_46 = xgb_optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, xgb_optuna_proba_46)
# Shown to three decimals via Decimal.quantize; rounding follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
print(decimal.Decimal(auc_46).quantize(decimal.Decimal('1.000')))

0.794


In [192]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray yields the same
# arrays as .values but passes existing arrays through unchanged, so this cell
# can be re-run without raising AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [193]:
# bootstrap_auc appends to this module-level list, so each run starts from an empty one.
auc_bootstrap = []

In [194]:
# bootstrap_auc draws its resampling indices from the module-level `rs`, so it is seeded here.
rs = RandomState(seed = 46)
# Refits the estimator on 2000 bootstrap resamples of the training data and returns the
# 2.5th/97.5th percentiles of the test ROC AUC. The helper calls clf.fit on the object
# passed in, so xgb_optuna_46 is left fitted on the last resample.
bootstrap_auc(xgb_optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78645336, 0.79390148])

In [195]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7901381407797664

In [196]:
# Keep this step's bootstrap AUCs under their own name; t_46 is the same list
# object as auc_bootstrap (no copy is made).
t_46 = auc_bootstrap
print(t_46)

[0.7903593248420835, 0.7898052433150955, 0.7894226034620124, 0.7933810834057139, 0.7905520316234602, 0.7916171361368405, 0.7871773515246422, 0.7888691823420887, 0.7845140619155397, 0.7914272030651341, 0.7894153389843045, 0.7920285697379293, 0.790819628565934, 0.789953702823161, 0.7916131736944545, 0.7939152206393585, 0.7914018434338631, 0.7884174639100747, 0.790020536018073, 0.7902742644121954, 0.7885340917976386, 0.7906718294649329, 0.7879582168375272, 0.7929699139674509, 0.7875314617925456, 0.7900190831225314, 0.7904200822920036, 0.791351388334147, 0.7864124680627144, 0.7876912803021177, 0.7898558304962245, 0.7888678615279601, 0.7904023833826789, 0.7915777758758054, 0.7917957102070403, 0.7875465190736126, 0.788513487097231, 0.7902660753645976, 0.7910946220675285, 0.7897553165410308, 0.790082218037883, 0.7854699351004769, 0.7934236136206579, 0.7913663135338013, 0.7900276684143679, 0.7908414219990574, 0.7903020015088981, 0.7915289057530437, 0.7908087978900786, 0.7906496397875709, 0.791

In [197]:
# Remove this step's splits, study and best score so the next step cannot pick up stale objects.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [198]:
# Step 47: 'Cat_' column-name prefix removed from comp_46 to build comp_47.
column_to_drop_46 = 'Cat_가구주 성별'

In [199]:
# A 'Cat_' name is treated as the shared prefix of a group of columns, so every
# column starting with it is removed; any other name is a single column.
# The prefix is compared literally (str.startswith) instead of being compiled
# into a regex, so names containing characters such as '(' or '+' still match.
if column_to_drop_46.startswith('Cat_'):
    comp_47 = comp_46.drop(
        columns=[col for col in comp_46.columns if str(col).startswith(column_to_drop_46)]
    )
else:
    comp_47 = comp_46.drop(columns=column_to_drop_46)

# Separate features and label once, after either branch.
X_47 = comp_47.drop(columns='target')
y_47 = comp_47['target']

print(X_47.shape)

(19949, 68)


In [200]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
# (64/16/20 overall); both splits are stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [201]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a set of XGBoost hyper-parameters from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and
    evaluates its positive-class probabilities on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Parameters are suggested in a fixed order: tree size and learning rate,
    # row/column sampling fractions, regularisation terms, then the two
    # single-choice settings.
    sampled = {}
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    sampled['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    sampled['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    for fraction in ('subsample', 'colsample_bytree'):
        sampled[fraction] = trial.suggest_float(fraction, 0.1, 1.0, step=0.1)
    for penalty in ('gamma', 'reg_alpha', 'reg_lambda'):
        sampled[penalty] = trial.suggest_int(penalty, 1, 10)
    for fixed_key, only_choice in (('booster', 'gbtree'), ('objective', 'binary:logistic')):
        sampled[fixed_key] = trial.suggest_categorical(fixed_key, [only_choice])

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# The study maximises the validation AUC returned by objective().
direction = "maximize"
# Seeded sampler so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() in this cell never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner has no intermediate values to act on — confirm pruning is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [202]:
# Best hyper-parameters found by the study and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7912216054853305


In [203]:
# Refit on the training split only, using the best parameters from the study.
xgb_optuna_47 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_47.fit(X_train, y_train)

In [204]:
# Test-set ROC AUC computed from the positive-class probabilities.
xgb_optuna_proba_47 = xgb_optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, xgb_optuna_proba_47)
# Shown to three decimals via Decimal.quantize; rounding follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
print(decimal.Decimal(auc_47).quantize(decimal.Decimal('1.000')))

0.795


In [205]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray yields the same
# arrays as .values but passes existing arrays through unchanged, so this cell
# can be re-run without raising AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [206]:
# bootstrap_auc appends to this module-level list, so each run starts from an empty one.
auc_bootstrap = []

In [207]:
# bootstrap_auc draws its resampling indices from the module-level `rs`, so it is seeded here.
rs = RandomState(seed = 47)
# Refits the estimator on 2000 bootstrap resamples of the training data and returns the
# 2.5th/97.5th percentiles of the test ROC AUC. The helper calls clf.fit on the object
# passed in, so xgb_optuna_47 is left fitted on the last resample.
bootstrap_auc(xgb_optuna_47, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78684384, 0.79431543])

In [208]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7905682721558644

In [209]:
# Keep this step's bootstrap AUCs under their own name; t_47 is the same list
# object as auc_bootstrap (no copy is made).
t_47 = auc_bootstrap
print(t_47)

[0.7926105204430328, 0.7911854940795828, 0.7894937953435491, 0.79162585351009, 0.7876668452407369, 0.7920025496995939, 0.7931586583064416, 0.7905113505482964, 0.7899966292823436, 0.7901557873848515, 0.7902527351418978, 0.7902561692586324, 0.7886752868279963, 0.7903873261016118, 0.791112320976853, 0.7924468715724874, 0.7913261607842887, 0.7905573148799749, 0.7913527091482757, 0.7900247626232848, 0.7917317828032113, 0.7907386626598449, 0.7928874951658204, 0.792021965667286, 0.7910131278357879, 0.7899605710566302, 0.7909787866684418, 0.790373985878912, 0.7875109891735508, 0.791670364946227, 0.788937864676781, 0.7914434490789172, 0.7901012377613363, 0.7902359608024633, 0.7890058866044087, 0.7904649899723792, 0.7941752889412989, 0.7902902462631527, 0.7917220087786592, 0.7934764461858059, 0.7900456314865182, 0.7914767335949602, 0.7912926121054199, 0.791324972051573, 0.7908872542493233, 0.7909329544181761, 0.7916209664978138, 0.7901892039823075, 0.7889637526337033, 0.7914467511142389, 0.79073

In [210]:
# Remove this step's splits, study and best score so the next step cannot pick up stale objects.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [211]:
# Step 48: 'Cat_' column-name prefix removed from comp_47 to build comp_48.
column_to_drop_47 = 'Cat_가구주 종사상 지위'

In [212]:
# A 'Cat_' name is treated as the shared prefix of a group of columns, so every
# column starting with it is removed; any other name is a single column.
# The prefix is compared literally (str.startswith) instead of being compiled
# into a regex, so names containing characters such as '(' or '+' still match.
if column_to_drop_47.startswith('Cat_'):
    comp_48 = comp_47.drop(
        columns=[col for col in comp_47.columns if str(col).startswith(column_to_drop_47)]
    )
else:
    comp_48 = comp_47.drop(columns=column_to_drop_47)

# Separate features and label once, after either branch.
X_48 = comp_48.drop(columns='target')
y_48 = comp_48['target']

print(X_48.shape)

(19949, 63)


In [213]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
# (64/16/20 overall); both splits are stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [214]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a set of XGBoost hyper-parameters from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and
    evaluates its positive-class probabilities on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Parameters are suggested in a fixed order: tree size and learning rate,
    # row/column sampling fractions, regularisation terms, then the two
    # single-choice settings.
    sampled = {}
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    sampled['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    sampled['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    for fraction in ('subsample', 'colsample_bytree'):
        sampled[fraction] = trial.suggest_float(fraction, 0.1, 1.0, step=0.1)
    for penalty in ('gamma', 'reg_alpha', 'reg_lambda'):
        sampled[penalty] = trial.suggest_int(penalty, 1, 10)
    for fixed_key, only_choice in (('booster', 'gbtree'), ('objective', 'binary:logistic')):
        sampled[fixed_key] = trial.suggest_categorical(fixed_key, [only_choice])

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# The study maximises the validation AUC returned by objective().
direction = "maximize"
# Seeded sampler so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() in this cell never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner has no intermediate values to act on — confirm pruning is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [215]:
# Best hyper-parameters found by the study and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7901353789533342


In [216]:
# Refit on the training split only, using the best parameters from the study.
xgb_optuna_48 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_48.fit(X_train, y_train)

In [217]:
# Test-set ROC AUC computed from the positive-class probabilities.
xgb_optuna_proba_48 = xgb_optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, xgb_optuna_proba_48)
# Shown to three decimals via Decimal.quantize; rounding follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
print(decimal.Decimal(auc_48).quantize(decimal.Decimal('1.000')))

0.793


In [218]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray yields the same
# arrays as .values but passes existing arrays through unchanged, so this cell
# can be re-run without raising AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [219]:
# bootstrap_auc appends to this module-level list, so each run starts from an empty one.
auc_bootstrap = []

In [220]:
# bootstrap_auc draws its resampling indices from the module-level `rs`, so it is seeded here.
rs = RandomState(seed = 48)
# Refits the estimator on 2000 bootstrap resamples of the training data and returns the
# 2.5th/97.5th percentiles of the test ROC AUC. The helper calls clf.fit on the object
# passed in, so xgb_optuna_48 is left fitted on the last resample.
bootstrap_auc(xgb_optuna_48, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78578669, 0.79285054])

In [221]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7893551782782079

In [222]:
# Keep this step's bootstrap AUCs under their own name; t_48 is the same list
# object as auc_bootstrap (no copy is made).
t_48 = auc_bootstrap
print(t_48)

[0.7895223249287289, 0.7893203724484513, 0.7894393778014467, 0.7897174091755373, 0.7878497779975614, 0.7856232816208185, 0.7893172024945423, 0.7919190742466604, 0.7899029835606189, 0.7914937720972204, 0.7928170957727609, 0.7865213031469189, 0.7947581642162922, 0.7905707871840877, 0.7891906685010133, 0.7891502515886752, 0.7909023115303904, 0.7910840555544989, 0.78965585923714, 0.7909119534735298, 0.7880055019833345, 0.7906191289811979, 0.790617147760005, 0.7895220607659031, 0.7901065210178511, 0.7901917135291519, 0.7912105895480279, 0.7881916046940678, 0.7894350191148221, 0.7872095793893824, 0.7902341116626831, 0.7884692398239197, 0.7879703683275112, 0.790290774588804, 0.7882215871747891, 0.7903166625457266, 0.7895175699978656, 0.7874273816392043, 0.7886740980952802, 0.7883415170976747, 0.7864757350594789, 0.7915776437943925, 0.7869377558416968, 0.7894994748443025, 0.7877143945493699, 0.7926364083999553, 0.7909555403397767, 0.7906557155325629, 0.7855503726809145, 0.7907349643802846, 0.7

In [223]:
# Remove this step's splits, study and best score so the next step cannot pick up stale objects.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [224]:
# Step 49: 'Cat_' column-name prefix removed from comp_48 to build comp_49.
column_to_drop_48 = 'Cat_이사 계획 중인 주택의 점유형태'

In [225]:
# A 'Cat_' name is treated as the shared prefix of a group of columns, so every
# column starting with it is removed; any other name is a single column.
# The prefix is compared literally (str.startswith) instead of being compiled
# into a regex, so names containing characters such as '(' or '+' still match.
if column_to_drop_48.startswith('Cat_'):
    comp_49 = comp_48.drop(
        columns=[col for col in comp_48.columns if str(col).startswith(column_to_drop_48)]
    )
else:
    comp_49 = comp_48.drop(columns=column_to_drop_48)

# Separate features and label once, after either branch.
X_49 = comp_49.drop(columns='target')
y_49 = comp_49['target']

print(X_49.shape)

(19949, 39)


In [226]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
# (64/16/20 overall); both splits are stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [227]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a set of XGBoost hyper-parameters from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and
    evaluates its positive-class probabilities on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Parameters are suggested in a fixed order: tree size and learning rate,
    # row/column sampling fractions, regularisation terms, then the two
    # single-choice settings.
    sampled = {}
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    sampled['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    sampled['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    for fraction in ('subsample', 'colsample_bytree'):
        sampled[fraction] = trial.suggest_float(fraction, 0.1, 1.0, step=0.1)
    for penalty in ('gamma', 'reg_alpha', 'reg_lambda'):
        sampled[penalty] = trial.suggest_int(penalty, 1, 10)
    for fixed_key, only_choice in (('booster', 'gbtree'), ('objective', 'binary:logistic')):
        sampled[fixed_key] = trial.suggest_categorical(fixed_key, [only_choice])

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# The study maximises the validation AUC returned by objective().
direction = "maximize"
# Seeded sampler so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() in this cell never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner has no intermediate values to act on — confirm pruning is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [228]:
# Best hyper-parameters found by the study and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7868354215107938


In [229]:
# Refit on the training split only, using the best parameters from the study.
xgb_optuna_49 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_49.fit(X_train, y_train)

In [230]:
# Test-set ROC AUC computed from the positive-class probabilities.
xgb_optuna_proba_49 = xgb_optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, xgb_optuna_proba_49)
# Shown to three decimals via Decimal.quantize; rounding follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
print(decimal.Decimal(auc_49).quantize(decimal.Decimal('1.000')))

0.792


In [231]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray yields the same
# arrays as .values but passes existing arrays through unchanged, so this cell
# can be re-run without raising AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [232]:
# bootstrap_auc appends to this module-level list, so each run starts from an empty one.
auc_bootstrap = []

In [233]:
# bootstrap_auc draws its resampling indices from the module-level `rs`, so it is seeded here.
rs = RandomState(seed = 49)
# Refits the estimator on 2000 bootstrap resamples of the training data and returns the
# 2.5th/97.5th percentiles of the test ROC AUC. The helper calls clf.fit on the object
# passed in, so xgb_optuna_49 is left fitted on the last resample.
bootstrap_auc(xgb_optuna_49, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78409746, 0.79133305])

In [234]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7877387183982012

In [235]:
# Keep this step's bootstrap AUCs under their own name; t_49 is the same list
# object as auc_bootstrap (no copy is made).
t_49 = auc_bootstrap
print(t_49)

[0.7903099263936703, 0.7882989868827308, 0.7897757891600257, 0.7897195224781431, 0.7866092693678901, 0.7859652403987379, 0.7880877887035522, 0.789010641535272, 0.7896536138531213, 0.7869423786911471, 0.786607288146697, 0.7888209726263915, 0.7856246024349473, 0.7897711663105751, 0.7881633392717137, 0.7895766103894183, 0.7882696648090737, 0.7879019501556447, 0.7841077794895528, 0.7909962214149406, 0.7866208925322227, 0.7876034461625594, 0.7929005712256945, 0.7864662251977523, 0.7886458326729263, 0.785556976751558, 0.7881303189184962, 0.789044190214141, 0.7857117761674411, 0.789422075136361, 0.7881740378661561, 0.7906118645034901, 0.7894210184850579, 0.7921223475410668, 0.7903166625457265, 0.786568852455552, 0.7867855980540709, 0.7884741268361958, 0.7886499271967253, 0.7861831747299728, 0.7923932465188623, 0.7854147250698974, 0.7889192411975663, 0.7916825164362111, 0.7890687573569347, 0.7888529363283059, 0.7884924861525847, 0.7878493817533225, 0.7851221647403913, 0.7879895201323772, 0.786

In [236]:
# Remove this step's splits, study and best score so the next step cannot pick up stale objects.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [237]:
# Step 50: 'Cat_' column-name prefix removed from comp_49 to build comp_50.
column_to_drop_49 = 'Cat_현재 거주 지역'

In [238]:
# A 'Cat_' name is treated as the shared prefix of a group of columns, so every
# column starting with it is removed; any other name is a single column.
# The prefix is compared literally (str.startswith) instead of being compiled
# into a regex, so names containing characters such as '(' or '+' still match.
if column_to_drop_49.startswith('Cat_'):
    comp_50 = comp_49.drop(
        columns=[col for col in comp_49.columns if str(col).startswith(column_to_drop_49)]
    )
else:
    comp_50 = comp_49.drop(columns=column_to_drop_49)

# Separate features and label once, after either branch.
X_50 = comp_50.drop(columns='target')
y_50 = comp_50['target']

print(X_50.shape)

(19949, 22)


In [239]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
# (64/16/20 overall); both splits are stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [240]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a set of XGBoost hyper-parameters from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and
    evaluates its positive-class probabilities on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Parameters are suggested in a fixed order: tree size and learning rate,
    # row/column sampling fractions, regularisation terms, then the two
    # single-choice settings.
    sampled = {}
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    sampled['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    sampled['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    for fraction in ('subsample', 'colsample_bytree'):
        sampled[fraction] = trial.suggest_float(fraction, 0.1, 1.0, step=0.1)
    for penalty in ('gamma', 'reg_alpha', 'reg_lambda'):
        sampled[penalty] = trial.suggest_int(penalty, 1, 10)
    for fixed_key, only_choice in (('booster', 'gbtree'), ('objective', 'binary:logistic')):
        sampled[fixed_key] = trial.suggest_categorical(fixed_key, [only_choice])

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# The study maximises the validation AUC returned by objective().
direction = "maximize"
# Seeded sampler so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() in this cell never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner has no intermediate values to act on — confirm pruning is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [241]:
# Best hyper-parameters found by the study and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7823629668369654


In [242]:
# Refit on the training split only, using the best parameters from the study.
xgb_optuna_50 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_50.fit(X_train, y_train)

In [243]:
# Test-set ROC AUC computed from the positive-class probabilities.
xgb_optuna_proba_50 = xgb_optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, xgb_optuna_proba_50)
# Shown to three decimals via Decimal.quantize; rounding follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
print(decimal.Decimal(auc_50).quantize(decimal.Decimal('1.000')))

0.789


In [244]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray yields the same
# arrays as .values but passes existing arrays through unchanged, so this cell
# can be re-run without raising AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [245]:
# bootstrap_auc appends to this module-level list, so each run starts from an empty one.
auc_bootstrap = []

In [246]:
# bootstrap_auc draws its resampling indices from the module-level `rs`, so it is seeded here.
rs = RandomState(seed = 50)
# Refits the estimator on 2000 bootstrap resamples of the training data and returns the
# 2.5th/97.5th percentiles of the test ROC AUC. The helper calls clf.fit on the object
# passed in, so xgb_optuna_50 is left fitted on the last resample.
bootstrap_auc(xgb_optuna_50, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78045667, 0.7870551 ])

In [247]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7838712650678477

In [248]:
# Keep this step's bootstrap AUCs under their own name; t_50 is the same list
# object as auc_bootstrap (no copy is made).
t_50 = auc_bootstrap
print(t_50)

[0.7852506799551134, 0.7826305809680194, 0.7826023155456653, 0.7820827072674363, 0.7866787441910595, 0.7836999120866115, 0.7847879987658313, 0.780820008960403, 0.7855909216746655, 0.7849952345026237, 0.7804713140304274, 0.7843603191509594, 0.7857945912133104, 0.7838120492061378, 0.7852456608614244, 0.7866506108501182, 0.7849204764229394, 0.7850879556544581, 0.7860743396457682, 0.7830862618424195, 0.7825608419820242, 0.782378701713677, 0.7863700699291833, 0.7831632653061225, 0.7846173495804039, 0.7811288153036922, 0.7835527733926749, 0.7848813803247301, 0.7812206118856364, 0.7813776566855385, 0.7812433298886501, 0.7843843579681019, 0.7859769956444834, 0.7860361681174488, 0.781958418657926, 0.7837590845595772, 0.7834529198445455, 0.7860418476182023, 0.7839321112104363, 0.7848937959775397, 0.7818231672911476, 0.7865170765417071, 0.7816978220303343, 0.7833302162119896, 0.7850956163764047, 0.7845433839891969, 0.7846911830901979, 0.7846568419228517, 0.7841801601038054, 0.7841335353650625, 0.

In [249]:
# Remove this step's splits, study and best score so the next step cannot pick up stale objects.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [250]:
# Step 51: 'Cat_' column-name prefix removed from comp_50 to build comp_51.
column_to_drop_50 = 'Cat_가구주 최종 학력'

In [251]:
# A 'Cat_' name is treated as the shared prefix of a group of columns, so every
# column starting with it is removed; any other name is a single column.
# The prefix is compared literally (str.startswith) instead of being compiled
# into a regex, so names containing characters such as '(' or '+' still match.
if column_to_drop_50.startswith('Cat_'):
    comp_51 = comp_50.drop(
        columns=[col for col in comp_50.columns if str(col).startswith(column_to_drop_50)]
    )
else:
    comp_51 = comp_50.drop(columns=column_to_drop_50)

# Separate features and label once, after either branch.
X_51 = comp_51.drop(columns='target')
y_51 = comp_51['target']

print(X_51.shape)

(19949, 19)


In [252]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
# (64/16/20 overall); both splits are stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [253]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a set of XGBoost hyper-parameters from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and
    evaluates its positive-class probabilities on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Parameters are suggested in a fixed order: tree size and learning rate,
    # row/column sampling fractions, regularisation terms, then the two
    # single-choice settings.
    sampled = {}
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    sampled['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    sampled['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    for fraction in ('subsample', 'colsample_bytree'):
        sampled[fraction] = trial.suggest_float(fraction, 0.1, 1.0, step=0.1)
    for penalty in ('gamma', 'reg_alpha', 'reg_lambda'):
        sampled[penalty] = trial.suggest_int(penalty, 1, 10)
    for fixed_key, only_choice in (('booster', 'gbtree'), ('objective', 'binary:logistic')):
        sampled[fixed_key] = trial.suggest_categorical(fixed_key, [only_choice])

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# The study maximises the validation AUC returned by objective().
direction = "maximize"
# Seeded sampler so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() in this cell never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner has no intermediate values to act on — confirm pruning is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [254]:
# Best hyper-parameters found by the study and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7794353355703174


In [255]:
# Refit on the training split only, using the best parameters from the study.
xgb_optuna_51 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_51.fit(X_train, y_train)

In [256]:
# Test-set ROC AUC computed from the positive-class probabilities.
xgb_optuna_proba_51 = xgb_optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, xgb_optuna_proba_51)
# Shown to three decimals via Decimal.quantize; rounding follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
print(decimal.Decimal(auc_51).quantize(decimal.Decimal('1.000')))

0.784


In [257]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray yields the same
# arrays as .values but passes existing arrays through unchanged, so this cell
# can be re-run without raising AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [258]:
# bootstrap_auc appends to this module-level list, so each run starts from an empty one.
auc_bootstrap = []

In [259]:
# bootstrap_auc draws its resampling indices from the module-level `rs`, so it is seeded here.
rs = RandomState(seed = 51)
# Refits the estimator on 2000 bootstrap resamples of the training data and returns the
# 2.5th/97.5th percentiles of the test ROC AUC. The helper calls clf.fit on the object
# passed in, so xgb_optuna_51 is left fitted on the last resample.
bootstrap_auc(xgb_optuna_51, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77815552, 0.78448013])

In [260]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7814059627889677

In [261]:
# Keep this step's bootstrap AUCs under their own name; t_51 is the same list
# object as auc_bootstrap (no copy is made).
t_51 = auc_bootstrap
print(t_51)

[0.7833365561198072, 0.7827527562749237, 0.7789356034429926, 0.7815468529754244, 0.7809802237142138, 0.7807731200588344, 0.7803204770569302, 0.7822655079428478, 0.7816770852485138, 0.7818725657395609, 0.7820922171291629, 0.7805219012115565, 0.7807958380618478, 0.7816020630060039, 0.782397061030066, 0.7835149981085942, 0.7818493194108957, 0.777985145595983, 0.7816502727217012, 0.7812116303495613, 0.7813200691895272, 0.7822353933807136, 0.780207283286101, 0.7829000270502734, 0.7818600180053383, 0.7791478582734741, 0.7787904459702489, 0.7817560699334098, 0.7846774466232594, 0.782056687229101, 0.7810049229384204, 0.7802223405671682, 0.7839064874163396, 0.7780149959952916, 0.7800984482018964, 0.7813503158330745, 0.7825620307147401, 0.7833139701982066, 0.7772444330326104, 0.7799320256216808, 0.7812649912403606, 0.7797234690707597, 0.7841006470932579, 0.7825412939329195, 0.7804549359352315, 0.7829173297153593, 0.7796981094394886, 0.7808238393213762, 0.7787748603635303, 0.7817535603865653, 0.7

In [262]:
# Remove this step's splits, study and best score so the next step cannot pick up stale objects.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [263]:
# Step 52: single column removed from comp_51 to build comp_52.
column_to_drop_51 = '소득 대비 주택 임대료의 비율'

In [264]:
# A 'Cat_' name is treated as the shared prefix of a group of columns, so every
# column starting with it is removed; any other name is a single column.
# The prefix is compared literally (str.startswith) instead of being compiled
# into a regex, so names containing characters such as '(' or '+' still match.
if column_to_drop_51.startswith('Cat_'):
    comp_52 = comp_51.drop(
        columns=[col for col in comp_51.columns if str(col).startswith(column_to_drop_51)]
    )
else:
    comp_52 = comp_51.drop(columns=column_to_drop_51)

# Separate features and label once, after either branch.
X_52 = comp_52.drop(columns='target')
y_52 = comp_52['target']

print(X_52.shape)

(19949, 18)


In [265]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
# (64/16/20 overall); both splits are stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [266]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a set of XGBoost hyper-parameters from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and
    evaluates its positive-class probabilities on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Parameters are suggested in a fixed order: tree size and learning rate,
    # row/column sampling fractions, regularisation terms, then the two
    # single-choice settings.
    sampled = {}
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    sampled['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    sampled['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    for fraction in ('subsample', 'colsample_bytree'):
        sampled[fraction] = trial.suggest_float(fraction, 0.1, 1.0, step=0.1)
    for penalty in ('gamma', 'reg_alpha', 'reg_lambda'):
        sampled[penalty] = trial.suggest_int(penalty, 1, 10)
    for fixed_key, only_choice in (('booster', 'gbtree'), ('objective', 'binary:logistic')):
        sampled[fixed_key] = trial.suggest_categorical(fixed_key, [only_choice])

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# The study maximises the validation AUC returned by objective().
direction = "maximize"
# Seeded sampler so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() in this cell never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner has no intermediate values to act on — confirm pruning is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [267]:
# Best hyper-parameters found by the study and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7732628527862753


In [268]:
# Refit on the training split only, using the best parameters from the study.
xgb_optuna_52 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_52.fit(X_train, y_train)

In [269]:
# Test-set ROC AUC computed from the positive-class probabilities.
xgb_optuna_proba_52 = xgb_optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, xgb_optuna_proba_52)
# Shown to three decimals via Decimal.quantize; rounding follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
print(decimal.Decimal(auc_52).quantize(decimal.Decimal('1.000')))

0.779


In [270]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray yields the same
# arrays as .values but passes existing arrays through unchanged, so this cell
# can be re-run without raising AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [271]:
# bootstrap_auc appends to this module-level list, so each run starts from an empty one.
auc_bootstrap = []

In [272]:
# bootstrap_auc draws its resampling indices from the module-level `rs`, so it is seeded here.
rs = RandomState(seed = 52)
# Refits the estimator on 2000 bootstrap resamples of the training data and returns the
# 2.5th/97.5th percentiles of the test ROC AUC. The helper calls clf.fit on the object
# passed in, so xgb_optuna_52 is left fitted on the last resample.
bootstrap_auc(xgb_optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77298233, 0.77903112])

In [273]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.776109445300332

In [274]:
# Keep this step's bootstrap AUCs under their own name; t_52 is the same list
# object as auc_bootstrap (no copy is made).
t_52 = auc_bootstrap
print(t_52)

[0.7762794462301851, 0.7784196934443239, 0.7756714754867464, 0.7771555422417491, 0.7749419898434677, 0.7758472758472759, 0.7761358737343959, 0.7773811372949304, 0.7776920569408254, 0.7764021498627409, 0.7759150336120779, 0.7767946958217895, 0.7789241123600729, 0.7745297637539016, 0.7744473449522711, 0.7734786598702855, 0.771325864921924, 0.7771394283093791, 0.7757668382668382, 0.7762156509077691, 0.7788266362773753, 0.7756702867540306, 0.7756425496573279, 0.7730896801093845, 0.7773112662275224, 0.7730711887115826, 0.7769596655064637, 0.7770012711515174, 0.7749726327312534, 0.776137987037002, 0.7765689686871953, 0.7757886316999617, 0.7750347109953022, 0.7776138647444066, 0.7769850251377346, 0.7762363876895897, 0.7753425927687011, 0.7773367579402062, 0.7740304960132546, 0.7771836755826904, 0.7748338151663275, 0.7746755816337098, 0.7742506757285083, 0.7760341710464862, 0.7772102239466772, 0.7764762475353608, 0.7764767758610123, 0.775328592138937, 0.7762931826971237, 0.774181068823926, 0.7

In [275]:
# Remove this step's splits, study and best score so the next step cannot pick up stale objects.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [276]:
# Step 53: single column removed from comp_52 to build comp_53.
column_to_drop_52 = '장기부채부담지표'

In [277]:
# A 'Cat_' name is treated as the shared prefix of a group of columns, so every
# column starting with it is removed; any other name is a single column.
# The prefix is compared literally (str.startswith) instead of being compiled
# into a regex, so names containing characters such as '(' or '+' still match.
if column_to_drop_52.startswith('Cat_'):
    comp_53 = comp_52.drop(
        columns=[col for col in comp_52.columns if str(col).startswith(column_to_drop_52)]
    )
else:
    comp_53 = comp_52.drop(columns=column_to_drop_52)

# Separate features and label once, after either branch.
X_53 = comp_53.drop(columns='target')
y_53 = comp_53['target']

print(X_53.shape)

(19949, 17)


In [278]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
# (64/16/20 overall); both splits are stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [279]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a set of XGBoost hyper-parameters from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and
    evaluates its positive-class probabilities on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation split.
    """
    # Parameters are suggested in a fixed order: tree size and learning rate,
    # row/column sampling fractions, regularisation terms, then the two
    # single-choice settings.
    sampled = {}
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    sampled['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    sampled['max_leaves'] = trial.suggest_int('max_leaves', 2, 1024, step=2)
    for fraction in ('subsample', 'colsample_bytree'):
        sampled[fraction] = trial.suggest_float(fraction, 0.1, 1.0, step=0.1)
    for penalty in ('gamma', 'reg_alpha', 'reg_lambda'):
        sampled[penalty] = trial.suggest_int(penalty, 1, 10)
    for fixed_key, only_choice in (('booster', 'gbtree'), ('objective', 'binary:logistic')):
        sampled[fixed_key] = trial.suggest_categorical(fixed_key, [only_choice])

    model = XGBClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# The study maximises the validation AUC returned by objective().
direction = "maximize"
# Seeded sampler so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() in this cell never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner has no intermediate values to act on — confirm pruning is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the
# study after 10 trials without improvement — TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [280]:
# Best hyper-parameters found by the study and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7656726823624302


In [281]:
# Refit on the training split only, using the best parameters from the study.
xgb_optuna_53 = XGBClassifier(**study.best_trial.params, random_state = 0)
xgb_optuna_53.fit(X_train, y_train)

In [282]:
# Test-set ROC AUC computed from the positive-class probabilities.
xgb_optuna_proba_53 = xgb_optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, xgb_optuna_proba_53)
# Shown to three decimals via Decimal.quantize; rounding follows the decimal
# context, which the notebook sets to ROUND_HALF_UP.
print(decimal.Decimal(auc_53).quantize(decimal.Decimal('1.000')))

0.771


In [283]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), which
# needs NumPy arrays rather than pandas objects. np.asarray yields the same
# arrays as .values but passes existing arrays through unchanged, so this cell
# can be re-run without raising AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [284]:
# bootstrap_auc appends to this module-level list, so each run starts from an empty one.
auc_bootstrap = []

In [285]:
# bootstrap_auc draws its resampling indices from the module-level `rs`, so it is seeded here.
rs = RandomState(seed = 53)
# Refits the estimator on 2000 bootstrap resamples of the training data and returns the
# 2.5th/97.5th percentiles of the test ROC AUC. The helper calls clf.fit on the object
# passed in, so xgb_optuna_53 is left fitted on the last resample.
bootstrap_auc(xgb_optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76672906, 0.77171677])

In [286]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7693800413494071

In [287]:
# Keep this step's bootstrap AUCs under their own name; t_53 is the same list
# object as auc_bootstrap (no copy is made).
t_53 = auc_bootstrap
print(t_53)

[0.7723823841434678, 0.7675439778272289, 0.7681198527873404, 0.7704208430809416, 0.77038993603033, 0.7665980107482572, 0.77002446676092, 0.7698749506015516, 0.7685640425788208, 0.769863459518632, 0.7687904301204794, 0.7704063141255257, 0.7682148193231937, 0.7689437766408208, 0.7685805527554297, 0.770072016069553, 0.7699199903633402, 0.770777991221341, 0.7700309387501506, 0.7668208320917681, 0.7710620983404235, 0.7683626184241948, 0.7705233382573283, 0.7684368481782275, 0.7701720016990954, 0.7691070292671277, 0.7722740773849148, 0.7687658629776856, 0.7654128442305783, 0.7684863787080536, 0.771655143884208, 0.7708351824731134, 0.7695028772614979, 0.7672894569446295, 0.7693153216552232, 0.7652614789314298, 0.7676327365366774, 0.7691902405572356, 0.7685137195605177, 0.7706835530111393, 0.7683778077866747, 0.7693898155720815, 0.770967395967396, 0.7689955525546658, 0.767676719647163, 0.7718872109266197, 0.7690491776082911, 0.7680679447920827, 0.7693248315169496, 0.7697625493191995, 0.7689643

In [288]:
# Remove this step's splits, study and best score so the next step cannot pick up stale objects.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [289]:
# Step 54: 'Cat_' column-name prefix removed from comp_53 to build comp_54.
column_to_drop_53 = 'Cat_이사 계획 중인 거주 지역'

In [290]:
# A 'Cat_' name is treated as the shared prefix of a group of columns, so every
# column starting with it is removed; any other name is a single column.
# The prefix is compared literally (str.startswith) instead of being compiled
# into a regex, so names containing characters such as '(' or '+' still match.
if column_to_drop_53.startswith('Cat_'):
    comp_54 = comp_53.drop(
        columns=[col for col in comp_53.columns if str(col).startswith(column_to_drop_53)]
    )
else:
    comp_54 = comp_53.drop(columns=column_to_drop_53)

# Separate features and label once, after either branch.
X_54 = comp_54.drop(columns='target')
y_54 = comp_54['target']

print(X_54.shape)

(19949, 10)


In [291]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
# (64/16/20 overall); both splits are stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [292]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one XGBoost configuration.

    Samples a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on
    the module-level ``X_train``/``y_train`` and scores the second
    ``predict_proba`` column against the module-level ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the fitted model on the validation split.
    """
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'gamma': trial.suggest_int('gamma', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that still go through the trial.
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
    }

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search over `objective`, capped at 200 trials.
# NOTE(review): `objective` never reports intermediate values, so the
# HyperbandPruner probably has no effect here - confirm.
# NOTE(review): EarlyStoppingCallback is defined elsewhere; presumably it stops
# the study after 10 trials without improvement - confirm.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [293]:
# Record the best objective value of the study, then show the winning
# hyperparameters followed by that value.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'max_leaves': 826, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 4, 'reg_alpha': 1, 'reg_lambda': 4, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7648299951250863


In [294]:
# Refit the best configuration on the training split only (the validation
# rows carved off earlier are not added back).
xgb_optuna_54 = XGBClassifier(random_state=0, **study.best_trial.params)
xgb_optuna_54.fit(X_train, y_train)

In [295]:
# Test-set ROC AUC of the tuned model, printed with three decimals.
# quantize() rounds according to the active decimal context.
xgb_optuna_proba_54 = xgb_optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_true=y_test, y_score=xgb_optuna_proba_54)
print(
    decimal.Decimal(auc_54).quantize(decimal.Decimal('1.000'))
)

0.767


In [296]:
# Convert the training split to NumPy arrays: bootstrap_auc indexes them
# positionally with an integer index array (X_train[idx], y_train[idx]).
# np.asarray keeps this cell re-runnable: on already-converted arrays it is a
# no-op, whereas `.values` would raise AttributeError on the second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [297]:
# Fresh, empty accumulator: bootstrap_auc appends one AUC per resample to this
# module-level list.
auc_bootstrap = list()

In [298]:
# Seed the module-level generator read by bootstrap_auc, then compute the
# 2.5th/97.5th percentiles of the test AUC over 2000 bootstrap refits.
# Note: bootstrap_auc refits the estimator it receives, so xgb_optuna_54 ends
# up fitted on the last bootstrap resample.
rs = RandomState(54)
bootstrap_auc(
    xgb_optuna_54,
    X_train, y_train,
    X_test, y_test,
    nsamples=2000,
)

array([0.76243272, 0.76673342])

In [299]:
# Point estimate: mean AUC over the values currently stored in auc_bootstrap.
np.asarray(auc_bootstrap).mean()

0.7645850592411554

In [300]:
# Keep the bootstrap AUC values of model 54 reachable under their own name.
# Note: this binds the same list object as auc_bootstrap; no copy is made.
t_54 = auc_bootstrap
print(t_54)

[0.7654992254745949, 0.763469530403028, 0.7636816531520966, 0.7625639802363939, 0.7658487128930479, 0.7645194455539285, 0.7652605543615396, 0.7637858653868504, 0.7665507256024497, 0.7651819659208823, 0.7657825401052, 0.7661020450429317, 0.7656483453897248, 0.7658658834767209, 0.7654035985316774, 0.763895889203771, 0.7642733778817523, 0.7647192847316, 0.7662536745049061, 0.7638013189121562, 0.7637916769690171, 0.765852675335434, 0.7640354992571741, 0.7646270919054169, 0.7639130597874441, 0.7649875420811383, 0.7641798642414406, 0.7649599370658485, 0.7667234880904832, 0.7645870712373175, 0.7659148856808955, 0.7630060567252687, 0.7660221357881456, 0.7647645886562142, 0.7631954614713233, 0.7637656569306815, 0.7627893111267496, 0.7650994150378385, 0.7640354992571741, 0.7653261988237356, 0.76508290486123, 0.7647315683029969, 0.7626091520795956, 0.7647417385717878, 0.7652624035013196, 0.7648557248310942, 0.7649020854070114, 0.7646926042862001, 0.7659986252966547, 0.7672071702244115, 0.76405531

In [301]:
# Remove the current train/val/test split, the Optuna study and its best
# score from the namespace.
del (
    X_train, X_test,
    y_train, y_test,
    X_val, y_val,
    study, optuna_auc
)

In [302]:
# Round 55: name of the single column to remove from comp_54.
column_to_drop_54 = "현재 주택의 면적(㎡)"

In [303]:
# Build the round-55 frame by removing the selected feature from comp_54.
# A name starting with 'Cat_' is treated as a prefix: every column whose label
# starts with it is removed. Any other name is dropped as a single column.
if column_to_drop_54.startswith('Cat_'):
    # Literal prefix match. A regex assembled from the raw name would
    # misinterpret metacharacters such as '(' or ')' inside a column label.
    cols_to_drop_54 = [
        col for col in comp_54.columns
        if str(col).startswith(column_to_drop_54)
    ]
else:
    cols_to_drop_54 = [column_to_drop_54]

comp_55 = comp_54.drop(columns=cols_to_drop_54)
# Features / target are derived once, independent of which branch ran.
X_55 = comp_55.drop(columns='target')
y_55 = comp_55['target']

print(X_55.shape)

(19949, 9)


In [304]:
# Hold out 20% as the test set, then split 20% of the remainder off as the
# validation set. Both splits are stratified and use the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_55, y_55,
    test_size=0.2, shuffle=True, stratify=y_55, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [305]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one XGBoost configuration.

    Samples a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on
    the module-level ``X_train``/``y_train`` and scores the second
    ``predict_proba`` column against the module-level ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the fitted model on the validation split.
    """
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 2, 1024, step=2),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'gamma': trial.suggest_int('gamma', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 10),
        # Single-choice categoricals: fixed values that still go through the trial.
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
    }

    model = XGBClassifier(**search_space, random_state=0)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search over `objective`, capped at 200 trials.
# NOTE(review): `objective` never reports intermediate values, so the
# HyperbandPruner probably has no effect here - confirm.
# NOTE(review): EarlyStoppingCallback is defined elsewhere; presumably it stops
# the study after 10 trials without improvement - confirm.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [306]:
# Record the best objective value of the study, then show the winning
# hyperparameters followed by that value.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'n_estimators': 166, 'learning_rate': 0.01, 'max_depth': 7, 'max_leaves': 768, 'subsample': 0.5, 'colsample_bytree': 0.30000000000000004, 'gamma': 2, 'reg_alpha': 8, 'reg_lambda': 2, 'booster': 'gbtree', 'objective': 'binary:logistic'}
0.7352387324894216


In [307]:
# Refit the best configuration on the training split only (the validation
# rows carved off earlier are not added back).
xgb_optuna_55 = XGBClassifier(random_state=0, **study.best_trial.params)
xgb_optuna_55.fit(X_train, y_train)

In [308]:
# Test-set ROC AUC of the tuned model, printed with three decimals.
# quantize() rounds according to the active decimal context.
xgb_optuna_proba_55 = xgb_optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_true=y_test, y_score=xgb_optuna_proba_55)
print(
    decimal.Decimal(auc_55).quantize(decimal.Decimal('1.000'))
)

0.742


In [309]:
# Convert the training split to NumPy arrays: bootstrap_auc indexes them
# positionally with an integer index array (X_train[idx], y_train[idx]).
# np.asarray keeps this cell re-runnable: on already-converted arrays it is a
# no-op, whereas `.values` would raise AttributeError on the second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [310]:
# Fresh, empty accumulator: bootstrap_auc appends one AUC per resample to this
# module-level list.
auc_bootstrap = list()

In [311]:
# Seed the module-level generator read by bootstrap_auc, then compute the
# 2.5th/97.5th percentiles of the test AUC over 2000 bootstrap refits.
# Note: bootstrap_auc refits the estimator it receives, so xgb_optuna_55 ends
# up fitted on the last bootstrap resample.
# NOTE(review): the recorded interval for this model is extremely narrow and
# the refits score almost identically - worth checking.
rs = RandomState(55)
bootstrap_auc(
    xgb_optuna_55,
    X_train, y_train,
    X_test, y_test,
    nsamples=2000,
)

array([0.74220667, 0.74237203])

In [312]:
# Point estimate: mean AUC over the values currently stored in auc_bootstrap.
np.asarray(auc_bootstrap).mean()

0.7423247969116196

In [313]:
# Keep the bootstrap AUC values of model 55 reachable under their own name.
# Note: this binds the same list object as auc_bootstrap; no copy is made.
t_55 = auc_bootstrap
print(t_55)

[0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7422367828771771, 0.7422367828771771, 0.7422367828771771, 0.7422367828771771, 0.7423419196818213, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423569769628885, 0.7422066683150428, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423419196818213, 0.7423720342439556, 0.7423569769628885, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771,

In [314]:
# Remove the current train/val/test split, the Optuna study and its best
# score from the namespace.
del (
    X_train, X_test,
    y_train, y_test,
    X_val, y_val,
    study, optuna_auc
)