In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000):
    for b in range(nsamples):
        idx = rs.randint(X_train.shape[0], size=X_train.shape[0])
        clf.fit(X_train[idx], y_train[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test.ravel(), pred.ravel())
        auc_bootstrap.append(roc_auc)
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [4]:
class EarlyStoppingCallback(object):
    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [5]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [6]:
고령가구 = pd.read_csv('고령가구_변수추가.csv', encoding='cp949')
고령가구.rename(columns = {'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?':'target'}, inplace=True)

In [7]:
고령가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [8]:
cat = 고령가구.select_dtypes(include = 'object')
num = 고령가구.select_dtypes(exclude = 'object')
num_고령 = num.drop('target',axis=1)
target = 고령가구.target

In [9]:
scaler=RobustScaler()
scaler.fit(num_고령)
num_scaled_고령=scaler.transform(num_고령)
num_df_scaled_고령=pd.DataFrame(data=num_scaled_고령, columns=num_고령.columns)

In [10]:
enc = OneHotEncoder()
enc.fit(cat)

X_cat = enc.transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(X_cat, columns= new_feature_names)

In [11]:
comp =pd.concat([num_df_scaled_고령, target,cat2],axis=1)

In [12]:
X=comp.drop('target', axis = 1)
y=comp.target
X.shape

(10564, 210)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)

In [None]:
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_0 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)

In [None]:
optuna_0.fit(X_train, y_train)

In [None]:
optuna_0_proba = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_0_proba)
print(auc_0)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 2024)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
np.mean(auc_bootstrap)

In [None]:
t_0 = auc_bootstrap
print(t_0)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 1.
column_to_drop = 'Cat_기초생활보장 수급가구 여부'

In [None]:
if not column_to_drop.startswith('Cat_'):
    comp_1 = comp.drop(column_to_drop, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']
else:
    comp_1 = comp.drop(comp.filter(regex='^' + column_to_drop).columns, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']

print(X_1.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)

In [None]:
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_1 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [None]:
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(auc_1)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap)

In [None]:
np.mean(auc_bootstrap)

In [None]:
t_1 = auc_bootstrap
print(t_1)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
#### 2. 
column_to_drop_1 = 'Cat_가구주 장애 여부'

In [None]:
if not column_to_drop_1.startswith('Cat_'):
    comp_2 = comp_1.drop(column_to_drop_1, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']
else:
    comp_2 = comp_1.drop(comp_1.filter(regex='^' + column_to_drop_1).columns, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']

print(X_2.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_2 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [None]:
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(auc_2)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap)

In [None]:
np.mean(auc_bootstrap)

In [None]:
t_2 = auc_bootstrap
print(t_2)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
#### 3.
column_to_drop_2 = '부채 중 임대 보증금의 비중'

In [None]:
if not column_to_drop_2.startswith('Cat_'):
    comp_3 = comp_2.drop(column_to_drop_2, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']
else:
    comp_3 = comp_2.drop(comp_2.filter(regex='^' + column_to_drop_2).columns, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']

print(X_3.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_3 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [None]:
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(auc_3)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_3 = auc_bootstrap
print(t_3)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
### 4. 
column_to_drop_3 = 'Cat_가구주 동거 여부'

In [None]:
if not column_to_drop_3.startswith('Cat_'):
    comp_4 = comp_3.drop(column_to_drop_3, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']
else:
    comp_4 = comp_3.drop(comp_3.filter(regex='^' + column_to_drop_3).columns, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']

print(X_4.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 


In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_4 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [None]:
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(auc_4)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_4 = auc_bootstrap
print(t_4)

In [None]:
## 5.현재 주택의 위치
column_to_drop_4 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [None]:
if not column_to_drop_4.startswith('Cat_'):
    comp_5 = comp_4.drop(column_to_drop_4, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']
else:
    comp_5 = comp_4.drop(comp_4.filter(regex='^' + column_to_drop_4).columns, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']

print(X_5.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_5 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [None]:
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(auc_5)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

In [None]:
t_5 = auc_bootstrap
print(t_5)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
## 6
column_to_drop_5 = 'Cat_가구주 주민등록상 등재 여부'

In [None]:
if not column_to_drop_5.startswith('Cat_'):
    comp_6 = comp_5.drop(column_to_drop_5, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']
else:
    comp_6 = comp_5.drop(comp_5.filter(regex='^' + column_to_drop_5).columns, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']

print(X_6.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_6 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [None]:
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(auc_6)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_6 = auc_bootstrap
print(t_6)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
## 7 .
column_to_drop_6 = '자산 중 부동산 자산의 비중'

In [None]:
if not column_to_drop_6.startswith('Cat_'):
    comp_7 = comp_6.drop(column_to_drop_6, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']
else:
    comp_7 = comp_6.drop(comp_6.filter(regex='^' + column_to_drop_6).columns, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']

print(X_7.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_7 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [None]:
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(auc_7)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_7 = auc_bootstrap
print(t_7)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
## 8 .
column_to_drop_7 = 'Cat_현재 주택의 구조'

In [None]:
if not column_to_drop_7.startswith('Cat_'):
    comp_8 = comp_7.drop(column_to_drop_7, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']
else:
    comp_8 = comp_7.drop(comp_7.filter(regex='^' + column_to_drop_7).columns, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']

print(X_8.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_8 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [None]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(decimal.Decimal(auc_8).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 8)
bootstrap_auc(optuna_8, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_8 = auc_bootstrap
print(t_8)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
#9.
column_to_drop_8 = '소득 중 재산소득의 비중(월평균)'

In [None]:
if not column_to_drop_8.startswith('Cat_'):
    comp_9 = comp_8.drop(column_to_drop_8, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']
else:
    comp_9 = comp_8.drop(comp_8.filter(regex='^' + column_to_drop_8).columns, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']

print(X_9.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_9 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [None]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(decimal.Decimal(auc_9).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 9)
bootstrap_auc(optuna_9, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_9 = auc_bootstrap
print(t_9)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 여기서 다시 3월 25일 오전 3시

In [None]:
# 10.
column_to_drop_9 = 'Cat_현재 주택의 위치'

In [None]:
if not column_to_drop_9.startswith('Cat_'):
    comp_10 = comp_9.drop(column_to_drop_9, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']
else:
    comp_10 = comp_9.drop(comp_9.filter(regex='^' + column_to_drop_9).columns, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']

print(X_10.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_10 = GradientBoostingClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [None]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(decimal.Decimal(auc_10).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 10)
bootstrap_auc(optuna_10, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
from scipy.stats import shapiro

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_10 = auc_bootstrap
print(t_10)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 11.
column_to_drop_10 = '소득 중 사적이전소득의 비중(월평균)'

In [None]:
if not column_to_drop_10.startswith('Cat_'):
    comp_11 = comp_10.drop(column_to_drop_10, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']
else:
    comp_11 = comp_10.drop(comp_10.filter(regex='^' + column_to_drop_10).columns, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']

print(X_11.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_11 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [None]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(decimal.Decimal(auc_11).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 11)
bootstrap_auc(optuna_11, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_11 = auc_bootstrap
print(t_11)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 12
column_to_drop_11 = 'Cat_소득 계층'

In [None]:
if not column_to_drop_11.startswith('Cat_'):
    comp_12 = comp_11.drop(column_to_drop_11, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']
else:
    comp_12 = comp_11.drop(comp_11.filter(regex='^' + column_to_drop_11).columns, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']

print(X_12.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
X_train.shape

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_12 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [None]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(decimal.Decimal(auc_12).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 12)
bootstrap_auc(optuna_12, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

In [None]:
t_12 = auc_bootstrap
print(t_12)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 13.
column_to_drop_12 = '부채 중 금융기관 대출금의 비중'

In [None]:
if not column_to_drop_12.startswith('Cat_'):
    comp_13 = comp_12.drop(column_to_drop_12, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']
else:
    comp_13 = comp_12.drop(comp_12.filter(regex='^' + column_to_drop_12).columns, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']

print(X_13.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_13 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [None]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(decimal.Decimal(auc_13).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 13)
bootstrap_auc(optuna_13, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

In [None]:
t_13 = auc_bootstrap
print(t_13)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
#14.
column_to_drop_13 = 'Cat_가구주 성별'

In [None]:
if not column_to_drop_13.startswith('Cat_'):
    comp_14 = comp_13.drop(column_to_drop_13, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']
else:
    comp_14 = comp_13.drop(comp_13.filter(regex='^' + column_to_drop_13).columns, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']

print(X_14.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_14 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [None]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(decimal.Decimal(auc_14).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 14)
bootstrap_auc(optuna_14, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

In [None]:
t_14 = auc_bootstrap
print(t_14)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
## 15.
column_to_drop_14 = '부채 중 비금융기관 대출금의 비중'

In [None]:
if not column_to_drop_14.startswith('Cat_'):
    comp_15 = comp_14.drop(column_to_drop_14, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']
else:
    comp_15 = comp_14.drop(comp_14.filter(regex='^' + column_to_drop_14).columns, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']

print(X_15.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
X_train.shape

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_15 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [None]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(decimal.Decimal(auc_15).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 15)
bootstrap_auc(optuna_15, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

In [None]:
t_15 = auc_bootstrap
print(t_15)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 16.
column_to_drop_15 = 'Cat_현재 공공기관 접근용이성'

In [None]:
if not column_to_drop_15.startswith('Cat_'):
    comp_16 = comp_15.drop(column_to_drop_15, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']
else:
    comp_16 = comp_15.drop(comp_15.filter(regex='^' + column_to_drop_15).columns, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']

print(X_16.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_16 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [None]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(decimal.Decimal(auc_16).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 16)
bootstrap_auc(optuna_16, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_16 = auc_bootstrap
print(t_16)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 17.
column_to_drop_16 = 'Cat_현재 주변도로의 보행 안전'

In [None]:
if not column_to_drop_16.startswith('Cat_'):
    comp_17 = comp_16.drop(column_to_drop_16, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']
else:
    comp_17 = comp_16.drop(comp_16.filter(regex='^' + column_to_drop_16).columns, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']

print(X_17.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_17 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [None]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(decimal.Decimal(auc_17).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 17)
bootstrap_auc(optuna_17, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_17 = auc_bootstrap
print(t_17)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
## 18.
column_to_drop_17 ='Cat_이사 계획 중인 주택의 유형'

In [None]:
if not column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(column_to_drop_17, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']
else:
    comp_18 = comp_17.drop(comp_17.filter(regex='^' + column_to_drop_17).columns, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']

print(X_18.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_18 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [None]:
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(decimal.Decimal(auc_18).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 18)
bootstrap_auc(optuna_18, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_18 = auc_bootstrap
print(t_18)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 19
column_to_drop_18 = 'Cat_현재 대기오염 정도'

In [None]:
if not column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(column_to_drop_18, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']
else:
    comp_19 = comp_18.drop(comp_18.filter(regex='^' + column_to_drop_18).columns, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']

print(X_19.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_19 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [None]:
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(decimal.Decimal(auc_19).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 19)
bootstrap_auc(optuna_19, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_19 = auc_bootstrap
print(t_19)

In [None]:
# 20.
column_to_drop_19 = 'Cat_주택 보유 의식'

In [None]:
if not column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(column_to_drop_19, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']
else:
    comp_20 = comp_19.drop(comp_19.filter(regex='^' + column_to_drop_19).columns, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']

print(X_20.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_20 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [None]:
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(decimal.Decimal(auc_20).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 20)
bootstrap_auc(optuna_20, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_20 = auc_bootstrap
print(t_20)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 21
column_to_drop_20 = '총 가구원 수'

In [None]:
if not column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(column_to_drop_20, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']
else:
    comp_21 = comp_20.drop(comp_20.filter(regex='^' + column_to_drop_20).columns, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']

print(X_21.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_21 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [None]:
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(decimal.Decimal(auc_21).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 21)
bootstrap_auc(optuna_21, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_21 = auc_bootstrap
print(t_21)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 22
column_to_drop_21 = '총 이사 횟수'

In [None]:
if not column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(column_to_drop_21, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']
else:
    comp_22 = comp_21.drop(comp_21.filter(regex='^' + column_to_drop_21).columns, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']

print(X_22.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_22 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [None]:
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(decimal.Decimal(auc_22).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 22)
bootstrap_auc(optuna_22, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_22 = auc_bootstrap
print(t_22)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 23.
column_to_drop_22 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [None]:
if not column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(column_to_drop_22, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']
else:
    comp_23 = comp_22.drop(comp_22.filter(regex='^' + column_to_drop_22).columns, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']

print(X_23.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_23 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [None]:
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(decimal.Decimal(auc_23).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 23)
bootstrap_auc(optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_23 = auc_bootstrap
print(t_23)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 24
column_to_drop_23 = 'Cat_가족계획 시 중요 고려 사항 1순위'

In [None]:
if not column_to_drop_23.startswith('Cat_'):
    comp_24 = comp_23.drop(column_to_drop_23, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']
else:
    comp_24 = comp_23.drop(comp_23.filter(regex='^' + column_to_drop_23).columns, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']

print(X_24.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_24 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [None]:
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(decimal.Decimal(auc_24).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 24)
bootstrap_auc(optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_24 = auc_bootstrap
print(t_24)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
column_to_drop_24 = 'Cat_현재 의료시설 접근용이성'

In [None]:
if not column_to_drop_24.startswith('Cat_'):
    comp_25 = comp_24.drop(column_to_drop_24, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']
else:
    comp_25 = comp_24.drop(comp_24.filter(regex='^' + column_to_drop_24).columns, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']

print(X_25.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_25 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [None]:
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(decimal.Decimal(auc_25).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 25)
bootstrap_auc(optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_25 = auc_bootstrap
print(t_25)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 26
column_to_drop_25 = 'Cat_이사 예상 기간'

In [None]:
if not column_to_drop_25.startswith('Cat_'):
    comp_26 = comp_25.drop(column_to_drop_25, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']
else:
    comp_26 = comp_25.drop(comp_25.filter(regex='^' + column_to_drop_25).columns, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']

print(X_26.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_26 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [None]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(decimal.Decimal(auc_26).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 26)
bootstrap_auc(optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_26 = auc_bootstrap
print(t_26)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 27
column_to_drop_26 = '소득 중 근로/사업소득의 비중(월평균)'

In [None]:
if not column_to_drop_26.startswith('Cat_'):
    comp_27 = comp_26.drop(column_to_drop_26, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']
else:
    comp_27 = comp_26.drop(comp_26.filter(regex='^' + column_to_drop_26).columns, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']

print(X_27.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_27 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [None]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(decimal.Decimal(auc_27).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 27)
bootstrap_auc(optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_27 = auc_bootstrap
print(t_27)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 28
column_to_drop_27  = 'Cat_현재 교육환경'

In [None]:
if not column_to_drop_27.startswith('Cat_'):
    comp_28 = comp_27.drop(column_to_drop_27, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']
else:
    comp_28 = comp_27.drop(comp_27.filter(regex='^' + column_to_drop_27).columns, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']

print(X_28.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_28 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [None]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(decimal.Decimal(auc_28).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 28)
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
np.mean(auc_bootstrap)

In [None]:
t_28 = auc_bootstrap
print(t_28)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 29
column_to_drop_28 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [None]:
if not column_to_drop_28.startswith('Cat_'):
    comp_29 = comp_28.drop(column_to_drop_28, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']
else:
    comp_29 = comp_28.drop(comp_28.filter(regex='^' + column_to_drop_28).columns, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']

print(X_29.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_29 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [None]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(decimal.Decimal(auc_29).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 29)
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_29 = auc_bootstrap
print(t_29)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
column_to_drop_29 = '현재 주택 거주 기간(총 개월)'

In [None]:
if not column_to_drop_29.startswith('Cat_'):
    comp_30 = comp_29.drop(column_to_drop_29, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']
else:
    comp_30 = comp_29.drop(comp_29.filter(regex='^' + column_to_drop_29).columns, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']

print(X_30.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_30 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [None]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(decimal.Decimal(auc_30).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 30)
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_30 = auc_bootstrap
print(t_30)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 31
column_to_drop_30 = 'Cat_현재 상업시설 접근용이성'

In [None]:
if not column_to_drop_30.startswith('Cat_'):
    comp_31 = comp_30.drop(column_to_drop_30, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']
else:
    comp_31 = comp_30.drop(comp_30.filter(regex='^' + column_to_drop_30).columns, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']

print(X_31.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_31= GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [None]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(decimal.Decimal(auc_31).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 31)
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_31 = auc_bootstrap
print(t_31)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 32
column_to_drop_31 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [None]:
if not column_to_drop_31.startswith('Cat_'):
    comp_32 = comp_31.drop(column_to_drop_31, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']
else:
    comp_32 = comp_31.drop(comp_31.filter(regex='^' + column_to_drop_31).columns, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']

print(X_32.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_32 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [None]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(decimal.Decimal(auc_32).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 32)
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_32 = auc_bootstrap
print(t_32)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 33.
column_to_drop_32 = 'Cat_현재 문화시설 접근용이성'

In [None]:
if not column_to_drop_32.startswith('Cat_'):
    comp_33 = comp_32.drop(column_to_drop_32, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']
else:
    comp_33 = comp_32.drop(comp_32.filter(regex='^' + column_to_drop_32).columns, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']

print(X_33.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_33 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [None]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(decimal.Decimal(auc_33).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 33)
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_33 = auc_bootstrap
print(t_33)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 34
column_to_drop_33 = '자산 중 금융자산의 비중'

In [None]:
if not column_to_drop_33.startswith('Cat_'):
    comp_34 = comp_33.drop(column_to_drop_33, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']
else:
    comp_34 = comp_33.drop(comp_33.filter(regex='^' + column_to_drop_33).columns, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']

print(X_34.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_34 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [None]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(decimal.Decimal(auc_34).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 34)
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_34 = auc_bootstrap
print(t_34)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 35
column_to_drop_34 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [None]:
if not column_to_drop_34.startswith('Cat_'):
    comp_35 = comp_34.drop(column_to_drop_34, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']
else:
    comp_35 = comp_34.drop(comp_34.filter(regex='^' + column_to_drop_34).columns, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']

print(X_35.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_35 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [None]:
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(decimal.Decimal(auc_35).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 35)
bootstrap_auc(optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_35 = auc_bootstrap
print(t_35)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 36
column_to_drop_35 = 'Cat_현재 주차시설 이용편의성'

In [None]:
if not column_to_drop_35.startswith('Cat_'):
    comp_36 = comp_35.drop(column_to_drop_35, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']
else:
    comp_36 = comp_35.drop(comp_35.filter(regex='^' + column_to_drop_35).columns, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']

print(X_36.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_36 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [None]:
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(decimal.Decimal(auc_36).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 36)
bootstrap_auc(optuna_36, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_36 = auc_bootstrap
print(t_36)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 37
column_to_drop_36 = '현재 무주택 기간(총 개월)'

In [None]:
if not column_to_drop_36.startswith('Cat_'):
    comp_37 = comp_36.drop(column_to_drop_36, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']
else:
    comp_37 = comp_36.drop(comp_36.filter(regex='^' + column_to_drop_36).columns, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']

print(X_37.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_37 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [None]:
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(decimal.Decimal(auc_37).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 37)
bootstrap_auc(optuna_37, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_37 = auc_bootstrap
print(t_37)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 38
column_to_drop_37 = 'Cat_현재 청소/쓰레기 처리상태'

In [None]:
if not column_to_drop_37.startswith('Cat_'):
    comp_38 = comp_37.drop(column_to_drop_37, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']
else:
    comp_38 = comp_37.drop(comp_37.filter(regex='^' + column_to_drop_37).columns, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']

print(X_38.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_38 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [None]:
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(decimal.Decimal(auc_38).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 38)
bootstrap_auc(optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_38 = auc_bootstrap
print(t_38)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 38
column_to_drop_38 = 'Cat_현재 대중교통 접근용이성'

In [None]:
if not column_to_drop_38.startswith('Cat_'):
    comp_39 = comp_38.drop(column_to_drop_38, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']
else:
    comp_39 = comp_38.drop(comp_38.filter(regex='^' + column_to_drop_38).columns, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']

print(X_39.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_39 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [None]:
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(decimal.Decimal(auc_39).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 39)
bootstrap_auc(optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_39 = auc_bootstrap
print(t_39)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 40
column_to_drop_39 = '자산 중 기타자산의 비중'

In [None]:
if not column_to_drop_39.startswith('Cat_'):
    comp_40 = comp_39.drop(column_to_drop_39, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']
else:
    comp_40 = comp_39.drop(comp_39.filter(regex='^' + column_to_drop_39).columns, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']

print(X_40.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_40 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [None]:
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(decimal.Decimal(auc_40).quantize(decimal.Decimal('1.000')))

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 40)
bootstrap_auc(optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

In [None]:
t_40 = auc_bootstrap
print(t_40)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 41.
column_to_drop_40 = '소득 대비 생활비의 비율'

In [None]:
if not column_to_drop_40.startswith('Cat_'):
    comp_41 = comp_40.drop(column_to_drop_40, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']
else:
    comp_41 = comp_40.drop(comp_40.filter(regex='^' + column_to_drop_40).columns, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']

print(X_41.shape)

In [79]:
comp_41 = comp[[
    '현재 주택 거주 기간(총 개월)','현재 주택의 면적(㎡)','총 이사 횟수','가구주 나이','소득 대비 주택 임대료의 비율',
    '소득 중 사적이전소득의 비중(월평균)','장기부채부담지표','target',
    'Cat_현재 주택의 유형_고시원','Cat_현재 주택의 유형_기타','Cat_현재 주택의 유형_다가구 단독주택',
    'Cat_현재 주택의 유형_다세대주택','Cat_현재 주택의 유형_비거주용 건물 내 주택','Cat_현재 주택의 유형_아파트',
    'Cat_현재 주택의 유형_연립주택','Cat_현재 주택의 유형_영업겸용 단독주택','Cat_현재 주택의 유형_오피스텔',
    'Cat_현재 주택의 유형_일반 단독주택','Cat_현재 주택의 유형_판잣집 비닐하우스 컨테이너 움막',
    'Cat_현재 주택의 점유형태_무상','Cat_현재 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함)',
    'Cat_현재 주택의 점유형태_보증금 있는 월세','Cat_현재 주택의 점유형태_전세',
    'Cat_현재 의료시설 접근용이성_대체로 만족','Cat_현재 의료시설 접근용이성_매우 만족',
    'Cat_현재 의료시설 접근용이성_매우 불만족','Cat_현재 의료시설 접근용이성_약간 불만족',
    'Cat_현재 주택에 대한 전반적인 만족도_대체로 만족','Cat_현재 주택에 대한 전반적인 만족도_매우 만족','Cat_현재 주택에 대한 전반적인 만족도_매우 불만족','Cat_현재 주택에 대한 전반적인 만족도_약간 불만족',
    'Cat_이사 계획 중인 주택의 점유형태_무상 to 무상이나 기타','Cat_이사 계획 중인 주택의 점유형태_무상 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
    'Cat_이사 계획 중인 주택의 점유형태_무상 to 보증금 있는 월세','Cat_이사 계획 중인 주택의 점유형태_무상 to 이사 계획 없음 및 모름',
    'Cat_이사 계획 중인 주택의 점유형태_무상 to 자가','Cat_이사 계획 중인 주택의 점유형태_무상 to 전세',
    'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 무상이나 기타',
    'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 보증금 없는 월세(사글세, 연세, 일세 포함)',
    'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 보증금 있는 월세',
    'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 이사 계획 없음 및 모름',
    'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 자가',
    'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 전세',
    'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 무상이나 기타','Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
    'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 보증금 있는 월세','Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 이사 계획 없음 및 모름',
    'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 자가','Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 전세',
    'Cat_이사 계획 중인 주택의 점유형태_전세 to 무상이나 기타','Cat_이사 계획 중인 주택의 점유형태_전세 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
    'Cat_이사 계획 중인 주택의 점유형태_전세 to 보증금 있는 월세','Cat_이사 계획 중인 주택의 점유형태_전세 to 이사 계획 없음 및 모름',
    'Cat_이사 계획 중인 주택의 점유형태_전세 to 자가','Cat_이사 계획 중인 주택의 점유형태_전세 to 전세',
    'Cat_이사 계획 중인 거주 지역_국내 to 국외','Cat_이사 계획 중인 거주 지역_비수도권 to 비수도권',
    'Cat_이사 계획 중인 거주 지역_비수도권 to 수도권','Cat_이사 계획 중인 거주 지역_비수도권 to 이사 계획 없음 및 모름',
    'Cat_이사 계획 중인 거주 지역_수도권 to 비수도권','Cat_이사 계획 중인 거주 지역_수도권 to 수도권',
    'Cat_이사 계획 중인 거주 지역_수도권 to 이사 계획 없음 및 모름',
    'Cat_현재 가장 필요한 주거지원 1순위_공공분양주택공급','Cat_현재 가장 필요한 주거지원 1순위_없음',
    'Cat_현재 가장 필요한 주거지원 1순위_월세보조금 지원','Cat_현재 가장 필요한 주거지원 1순위_임대 후 분양전환 공공임대주택 공급',
    'Cat_현재 가장 필요한 주거지원 1순위_장기공공임대주택 공급','Cat_현재 가장 필요한 주거지원 1순위_전세자금 대출 지원',
    'Cat_현재 가장 필요한 주거지원 1순위_주거상담과 정보제공 등','Cat_현재 가장 필요한 주거지원 1순위_주택 구입자금 대출 지원',
    'Cat_현재 가장 필요한 주거지원 1순위_주택개량 및 개보수 지원', 
    'Cat_기초생활보장 수급가구 여부_아니오','Cat_기초생활보장 수급가구 여부_예'
]]

In [80]:
X_41 = comp_41.drop('target', axis=1)
y_41 = comp_41['target']
X_41.shape

(10564, 72)

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [82]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [83]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8470010972278204


In [84]:
optuna_41 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [85]:
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(auc_41)

0.8532940733569969


In [86]:
X_train = X_train.values
y_train = y_train.values

In [87]:
auc_bootstrap = []

In [88]:
rs = RandomState(seed = 41)
bootstrap_auc(optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84626228, 0.8533014 ])

In [89]:
np.mean(auc_bootstrap)

0.8498361312789072

In [90]:
t_41 = auc_bootstrap
print(t_41)

[0.8487935176221719, 0.8474521350973432, 0.8494953569251731, 0.8514762826718296, 0.8466629020114014, 0.8476426087268295, 0.8503298555089456, 0.8483928507403822, 0.8467086156824782, 0.8531139077121653, 0.8492076297013373, 0.8489763723064789, 0.8481001936108422, 0.8501236958158546, 0.8499722132587573, 0.8503370262808791, 0.8485524004159047, 0.8514108493779357, 0.8485080312645656, 0.8486931268151017, 0.8464899071385035, 0.8502926571295399, 0.8455586031336273, 0.8476184073715536, 0.8492614104908394, 0.8499157434297802, 0.8480127998279015, 0.851242336237496, 0.8470868739019755, 0.850454447671292, 0.8497293033595066, 0.8490471836793231, 0.8490700405148615, 0.849342978021584, 0.8487639381879459, 0.8483489297622889, 0.8548250331648202, 0.8546206661647128, 0.8493653866838765, 0.8535602882650317, 0.8480226596393102, 0.8448325624753505, 0.847593309669786, 0.8488930120827508, 0.8469219461475028, 0.850603241188914, 0.8471774048976372, 0.848492345200961, 0.8513248001147323, 0.8501891291097486, 0.848

In [91]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 42.
column_to_drop_41 = 'Cat_가구주 종사상 지위'

In [None]:
if not column_to_drop_41.startswith('Cat_'):
    comp_42 = comp_41.drop(column_to_drop_41, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']
else:
    comp_42 = comp_41.drop(comp_41.filter(regex='^' + column_to_drop_41).columns, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']

print(X_42.shape)

In [57]:
comp_42 = comp[[
    '현재 주택 거주 기간(총 개월)','현재 주택의 면적(㎡)','총 이사 횟수','가구주 나이','소득 대비 주택 임대료의 비율',
    '소득 중 사적이전소득의 비중(월평균)','장기부채부담지표','target',
    'Cat_현재 주택의 유형_고시원','Cat_현재 주택의 유형_기타','Cat_현재 주택의 유형_다가구 단독주택',
    'Cat_현재 주택의 유형_다세대주택','Cat_현재 주택의 유형_비거주용 건물 내 주택','Cat_현재 주택의 유형_아파트',
    'Cat_현재 주택의 유형_연립주택','Cat_현재 주택의 유형_영업겸용 단독주택','Cat_현재 주택의 유형_오피스텔',
    'Cat_현재 주택의 유형_일반 단독주택','Cat_현재 주택의 유형_판잣집 비닐하우스 컨테이너 움막',
    'Cat_현재 주택의 점유형태_무상','Cat_현재 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함)',
    'Cat_현재 주택의 점유형태_보증금 있는 월세','Cat_현재 주택의 점유형태_전세',
    'Cat_현재 의료시설 접근용이성_대체로 만족','Cat_현재 의료시설 접근용이성_매우 만족',
    'Cat_현재 의료시설 접근용이성_매우 불만족','Cat_현재 의료시설 접근용이성_약간 불만족',
    'Cat_현재 주택에 대한 전반적인 만족도_대체로 만족','Cat_현재 주택에 대한 전반적인 만족도_매우 만족','Cat_현재 주택에 대한 전반적인 만족도_매우 불만족','Cat_현재 주택에 대한 전반적인 만족도_약간 불만족',

    'Cat_이사 계획 중인 거주 지역_국내 to 국외','Cat_이사 계획 중인 거주 지역_비수도권 to 비수도권',
    'Cat_이사 계획 중인 거주 지역_비수도권 to 수도권','Cat_이사 계획 중인 거주 지역_비수도권 to 이사 계획 없음 및 모름',
    'Cat_이사 계획 중인 거주 지역_수도권 to 비수도권','Cat_이사 계획 중인 거주 지역_수도권 to 수도권',
    'Cat_이사 계획 중인 거주 지역_수도권 to 이사 계획 없음 및 모름',
    'Cat_현재 가장 필요한 주거지원 1순위_공공분양주택공급','Cat_현재 가장 필요한 주거지원 1순위_없음',
    'Cat_현재 가장 필요한 주거지원 1순위_월세보조금 지원','Cat_현재 가장 필요한 주거지원 1순위_임대 후 분양전환 공공임대주택 공급',
    'Cat_현재 가장 필요한 주거지원 1순위_장기공공임대주택 공급','Cat_현재 가장 필요한 주거지원 1순위_전세자금 대출 지원',
    'Cat_현재 가장 필요한 주거지원 1순위_주거상담과 정보제공 등','Cat_현재 가장 필요한 주거지원 1순위_주택 구입자금 대출 지원',
    'Cat_현재 가장 필요한 주거지원 1순위_주택개량 및 개보수 지원', 
    'Cat_기초생활보장 수급가구 여부_아니오','Cat_기초생활보장 수급가구 여부_예'
]]

In [58]:
X_42 = comp_42.drop('target', axis=1)
y_42 = comp_42['target']
X_42.shape

(10564, 48)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [60]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [61]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8470612768149044


In [62]:
optuna_42 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [63]:
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(auc_42)

0.8534751353483202


In [64]:
X_train = X_train.values
y_train = y_train.values

In [65]:
auc_bootstrap = []

In [66]:
rs = RandomState(seed = 42)
bootstrap_auc(optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84597012, 0.85322976])

In [68]:
np.mean(auc_bootstrap)

0.849700203918827

In [69]:
t_42 = auc_bootstrap
print(t_42)

[0.8480217632928186, 0.8486124556308486, 0.8483865763149403, 0.8503899107238895, 0.8467480549281129, 0.8512790864436557, 0.8479751532752502, 0.8487478039510952, 0.8480997454375965, 0.8496423577498117, 0.850615341866552, 0.8509339930443511, 0.8493107095478828, 0.8508802122548493, 0.8511921408339608, 0.8515013803735972, 0.8474606503890143, 0.8511948298734359, 0.8509223405399591, 0.8499453228640064, 0.8480782331217955, 0.8480764404288121, 0.8530762611595138, 0.8470877702484673, 0.8464406080814599, 0.8513543795489584, 0.8496566992936789, 0.8495034240435984, 0.8497189953748521, 0.8485407479115126, 0.8521852927467642, 0.8518428883869349, 0.8500847047434656, 0.850564698289771, 0.8488262342691191, 0.8528871320497652, 0.8534652755369116, 0.8478353232225448, 0.8476820479724642, 0.8497750170305833, 0.8512853608690976, 0.8492578251048727, 0.8510836829084651, 0.8457826897565524, 0.848297838012262, 0.8525070811372844, 0.8516967839087879, 0.8490202932845721, 0.8435292746764189, 0.8515094474920225, 0.

In [70]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 43.
column_to_drop_42 = 'Cat_남편/아내의 부모님과 동거 의향'

In [None]:
if not column_to_drop_42.startswith('Cat_'):
    comp_43 = comp_42.drop(column_to_drop_42, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']
else:
    comp_43 = comp_42.drop(comp_42.filter(regex='^' + column_to_drop_42).columns, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']

print(X_43.shape)

In [72]:
comp_43 = comp[[
    '현재 주택 거주 기간(총 개월)','현재 주택의 면적(㎡)','총 이사 횟수','가구주 나이','소득 대비 주택 임대료의 비율',
    '소득 중 사적이전소득의 비중(월평균)','장기부채부담지표','target',
    'Cat_현재 주택의 유형_고시원','Cat_현재 주택의 유형_기타','Cat_현재 주택의 유형_다가구 단독주택',
    'Cat_현재 주택의 유형_다세대주택','Cat_현재 주택의 유형_비거주용 건물 내 주택','Cat_현재 주택의 유형_아파트',
    'Cat_현재 주택의 유형_연립주택','Cat_현재 주택의 유형_영업겸용 단독주택','Cat_현재 주택의 유형_오피스텔',
    'Cat_현재 주택의 유형_일반 단독주택','Cat_현재 주택의 유형_판잣집 비닐하우스 컨테이너 움막',
    'Cat_현재 주택의 점유형태_무상','Cat_현재 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함)',
    'Cat_현재 주택의 점유형태_보증금 있는 월세','Cat_현재 주택의 점유형태_전세',
    'Cat_현재 의료시설 접근용이성_대체로 만족','Cat_현재 의료시설 접근용이성_매우 만족','Cat_현재 의료시설 접근용이성_매우 불만족','Cat_현재 의료시설 접근용이성_약간 불만족',
    'Cat_이사 계획 중인 거주 지역_국내 to 국외','Cat_이사 계획 중인 거주 지역_비수도권 to 비수도권',
    'Cat_이사 계획 중인 거주 지역_비수도권 to 수도권','Cat_이사 계획 중인 거주 지역_비수도권 to 이사 계획 없음 및 모름',
    'Cat_이사 계획 중인 거주 지역_수도권 to 비수도권','Cat_이사 계획 중인 거주 지역_수도권 to 수도권',
    'Cat_이사 계획 중인 거주 지역_수도권 to 이사 계획 없음 및 모름',
    'Cat_현재 가장 필요한 주거지원 1순위_공공분양주택공급','Cat_현재 가장 필요한 주거지원 1순위_없음',
    'Cat_현재 가장 필요한 주거지원 1순위_월세보조금 지원','Cat_현재 가장 필요한 주거지원 1순위_임대 후 분양전환 공공임대주택 공급',
    'Cat_현재 가장 필요한 주거지원 1순위_장기공공임대주택 공급','Cat_현재 가장 필요한 주거지원 1순위_전세자금 대출 지원',
    'Cat_현재 가장 필요한 주거지원 1순위_주거상담과 정보제공 등','Cat_현재 가장 필요한 주거지원 1순위_주택 구입자금 대출 지원',
    'Cat_현재 가장 필요한 주거지원 1순위_주택개량 및 개보수 지원', 
    'Cat_기초생활보장 수급가구 여부_아니오','Cat_기초생활보장 수급가구 여부_예'
]]

In [73]:
X_43 = comp_43.drop('target', axis=1)
y_43 = comp_43['target']
X_43.shape

(10564, 44)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [40]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [41]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8460522190872856


In [42]:
optuna_43 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [43]:
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(auc_43)

0.8533048295148973


In [44]:
X_train = X_train.values
y_train = y_train.values

In [45]:
auc_bootstrap = []

In [46]:
rs = RandomState(seed = 43)
bootstrap_auc(optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84610122, 0.85312209])

In [47]:
np.mean(auc_bootstrap)

0.8496291155749167

In [48]:
t_43 = auc_bootstrap
print(t_43)

[0.8475090530995661, 0.848492345200961, 0.8507143881538848, 0.8493671793768599, 0.8492156968197626, 0.8472544906959235, 0.8509725359434942, 0.8487899322362052, 0.8515210999964148, 0.8480235559858019, 0.8468013875443692, 0.8500779821447779, 0.8481862428740453, 0.8471684414327203, 0.8501720985264062, 0.8446810799182531, 0.8469622817396293, 0.8500098598114088, 0.8469923093471012, 0.8461268868093651, 0.8504289017962784, 0.8488939084292425, 0.8498193861819225, 0.8487827614642718, 0.8493860026531856, 0.8487504929905705, 0.8482279229859095, 0.849925155067943, 0.8494872898067477, 0.8486841633501846, 0.8455586031336273, 0.8514565630490122, 0.8491466781399017, 0.85194059015453, 0.8503244774299954, 0.8524631601591912, 0.8498556882148363, 0.8500735004123194, 0.8516645154350866, 0.8488365422537737, 0.8490310494424725, 0.85067494890825, 0.8510868201211861, 0.850098598114087, 0.8492246602846796, 0.8472652468538238, 0.8496459431357786, 0.8503567459036965, 0.8495231436664157, 0.8468188663009573, 0.8480

In [49]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
## 44
column_to_drop_43 = '소득 중 정부 보조금의 비중(월평균)'

In [None]:
if not column_to_drop_43.startswith('Cat_'):
    comp_44 = comp_43.drop(column_to_drop_43, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']
else:
    comp_44 = comp_43.drop(comp_43.filter(regex='^' + column_to_drop_43).columns, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']

print(X_44.shape)

In [13]:
alll = list(comp)
alll

['현재 주택 거주 기간(총 개월)',
 '현재 무주택 기간(총 개월)',
 '현재 주택의 면적(㎡)',
 '총 이사 횟수',
 '가구주 나이',
 '총 가구원 수',
 '소득 대비 주택 임대료의 비율',
 '소득 중 근로/사업소득의 비중(월평균)',
 '소득 중 재산소득의 비중(월평균)',
 '소득 중 사회보험 수혜금의 비중(월평균)',
 '소득 중 정부 보조금의 비중(월평균)',
 '소득 중 사적이전소득의 비중(월평균)',
 '소득 대비 생활비의 비율',
 '소득 대비 주거관리비의 비율',
 '자산 중 부동산 자산의 비중',
 '자산 중 금융자산의 비중',
 '자산 중 기타자산의 비중',
 '부채 중 금융기관 대출금의 비중',
 '부채 중 비금융기관 대출금의 비중',
 '부채 중 임대 보증금의 비중',
 '중기부채부담지표',
 '장기부채부담지표',
 'target',
 'Cat_현재 거주 지역_강원도',
 'Cat_현재 거주 지역_경기도',
 'Cat_현재 거주 지역_경상남도',
 'Cat_현재 거주 지역_경상북도',
 'Cat_현재 거주 지역_광주광역시',
 'Cat_현재 거주 지역_대구광역시',
 'Cat_현재 거주 지역_대전광역시',
 'Cat_현재 거주 지역_부산광역시',
 'Cat_현재 거주 지역_서울특별시',
 'Cat_현재 거주 지역_세종특별자치시',
 'Cat_현재 거주 지역_울산광역시',
 'Cat_현재 거주 지역_인천광역시',
 'Cat_현재 거주 지역_전라남도',
 'Cat_현재 거주 지역_전라북도',
 'Cat_현재 거주 지역_제주특별자치도',
 'Cat_현재 거주 지역_충청남도',
 'Cat_현재 거주 지역_충청북도',
 'Cat_현재 주택의 유형_고시원',
 'Cat_현재 주택의 유형_기타',
 'Cat_현재 주택의 유형_다가구 단독주택',
 'Cat_현재 주택의 유형_다세대주택',
 'Cat_현재 주택의 유형_비거주용 건물 내 주택',
 'Cat_현재 주택의 유형_아파트',
 'Cat_현재 주택의 유형_연립주택',
 'Cat_현재

In [14]:
comp_44 = comp[[
    '현재 주택 거주 기간(총 개월)','현재 주택의 면적(㎡)','총 이사 횟수','가구주 나이','소득 대비 주택 임대료의 비율',
    '소득 중 사적이전소득의 비중(월평균)','장기부채부담지표','target',
    'Cat_현재 주택의 유형_고시원','Cat_현재 주택의 유형_기타','Cat_현재 주택의 유형_다가구 단독주택',
    'Cat_현재 주택의 유형_다세대주택','Cat_현재 주택의 유형_비거주용 건물 내 주택','Cat_현재 주택의 유형_아파트',
    'Cat_현재 주택의 유형_연립주택','Cat_현재 주택의 유형_영업겸용 단독주택','Cat_현재 주택의 유형_오피스텔',
    'Cat_현재 주택의 유형_일반 단독주택','Cat_현재 주택의 유형_판잣집 비닐하우스 컨테이너 움막',
    'Cat_현재 주택의 점유형태_무상','Cat_현재 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함)',
    'Cat_현재 주택의 점유형태_보증금 있는 월세','Cat_현재 주택의 점유형태_전세',
    'Cat_이사 계획 중인 거주 지역_국내 to 국외','Cat_이사 계획 중인 거주 지역_비수도권 to 비수도권',
    'Cat_이사 계획 중인 거주 지역_비수도권 to 수도권','Cat_이사 계획 중인 거주 지역_비수도권 to 이사 계획 없음 및 모름',
    'Cat_이사 계획 중인 거주 지역_수도권 to 비수도권','Cat_이사 계획 중인 거주 지역_수도권 to 수도권',
    'Cat_이사 계획 중인 거주 지역_수도권 to 이사 계획 없음 및 모름',
    'Cat_현재 가장 필요한 주거지원 1순위_공공분양주택공급','Cat_현재 가장 필요한 주거지원 1순위_없음',
    'Cat_현재 가장 필요한 주거지원 1순위_월세보조금 지원','Cat_현재 가장 필요한 주거지원 1순위_임대 후 분양전환 공공임대주택 공급',
    'Cat_현재 가장 필요한 주거지원 1순위_장기공공임대주택 공급','Cat_현재 가장 필요한 주거지원 1순위_전세자금 대출 지원',
    'Cat_현재 가장 필요한 주거지원 1순위_주거상담과 정보제공 등','Cat_현재 가장 필요한 주거지원 1순위_주택 구입자금 대출 지원',
    'Cat_현재 가장 필요한 주거지원 1순위_주택개량 및 개보수 지원', 
    'Cat_기초생활보장 수급가구 여부_아니오','Cat_기초생활보장 수급가구 여부_예'
]]

In [15]:
X_44 = comp_44.drop('target', axis=1)
y_44 = comp_44['target']
X_44.shape

(10564, 40)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [17]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [18]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8452950759102513


In [19]:
optuna_44 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [21]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(auc_44)

0.8528037718260372


In [22]:
X_train = X_train.values
y_train = y_train.values

In [23]:
auc_bootstrap = []

In [24]:
rs = RandomState(seed = 44)
bootstrap_auc(optuna_44, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84552963, 0.8525938 ])

In [25]:
np.mean(auc_bootstrap)

0.8492280630400488

In [26]:
t_44 = auc_bootstrap
print(t_44)

[0.849838657631494, 0.848847298411674, 0.8482386791438098, 0.85072245527231, 0.846801835717615, 0.8472751066652325, 0.8482046179771252, 0.848569430999247, 0.8496029185041769, 0.8501425190921802, 0.8522220429529239, 0.851089957333907, 0.8482458499157435, 0.8489916101968378, 0.8482987343587538, 0.8517093327596716, 0.8469564554874334, 0.8494030332365279, 0.8492614104908393, 0.8511016098382991, 0.8483063533039332, 0.8495025276971065, 0.8511598723602597, 0.8499403929583019, 0.8490364275214227, 0.8501505862106056, 0.845783586103044, 0.8527244451615216, 0.848265121365315, 0.8506202717722562, 0.85035853859668, 0.8497920476139257, 0.8461188196909397, 0.8471316912265605, 0.8508165716539384, 0.8515426123122154, 0.8499184324692554, 0.8473925280556451, 0.8525043920978093, 0.8486532393962212, 0.8493886916926607, 0.8520087124878992, 0.8517980710623498, 0.8505808325266215, 0.8469681079918253, 0.8503352335878958, 0.8518249614571008, 0.8489387257538275, 0.8485945287010146, 0.8509492309347101, 0.84722849

In [31]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

NameError: name 'study' is not defined

In [None]:
# 45
column_to_drop_44 = '가구주 나이'

In [None]:
if not column_to_drop_44.startswith('Cat_'):
    comp_45 = comp_44.drop(column_to_drop_44, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']
else:
    comp_45 = comp_44.drop(comp_44.filter(regex='^' + column_to_drop_44).columns, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']

print(X_45.shape)

In [13]:
comp_45 = comp[[
    '현재 주택 거주 기간(총 개월)','현재 주택의 면적(㎡)','총 이사 횟수','가구주 나이','소득 대비 주택 임대료의 비율',
    '소득 중 사적이전소득의 비중(월평균)','장기부채부담지표','target',
    'Cat_현재 주택의 유형_고시원','Cat_현재 주택의 유형_기타','Cat_현재 주택의 유형_다가구 단독주택',
    'Cat_현재 주택의 유형_다세대주택','Cat_현재 주택의 유형_비거주용 건물 내 주택','Cat_현재 주택의 유형_아파트',
    'Cat_현재 주택의 유형_연립주택','Cat_현재 주택의 유형_영업겸용 단독주택','Cat_현재 주택의 유형_오피스텔',
    'Cat_현재 주택의 유형_일반 단독주택','Cat_현재 주택의 유형_판잣집 비닐하우스 컨테이너 움막',
    'Cat_현재 주택의 점유형태_무상','Cat_현재 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함)',
    'Cat_현재 주택의 점유형태_보증금 있는 월세','Cat_현재 주택의 점유형태_전세',
    'Cat_이사 계획 중인 거주 지역_국내 to 국외','Cat_이사 계획 중인 거주 지역_비수도권 to 비수도권',
    'Cat_이사 계획 중인 거주 지역_비수도권 to 수도권','Cat_이사 계획 중인 거주 지역_비수도권 to 이사 계획 없음 및 모름',
    'Cat_이사 계획 중인 거주 지역_수도권 to 비수도권','Cat_이사 계획 중인 거주 지역_수도권 to 수도권',
    'Cat_이사 계획 중인 거주 지역_수도권 to 이사 계획 없음 및 모름',
    'Cat_현재 가장 필요한 주거지원 1순위_공공분양주택공급','Cat_현재 가장 필요한 주거지원 1순위_없음',
    'Cat_현재 가장 필요한 주거지원 1순위_월세보조금 지원','Cat_현재 가장 필요한 주거지원 1순위_임대 후 분양전환 공공임대주택 공급',
    'Cat_현재 가장 필요한 주거지원 1순위_장기공공임대주택 공급','Cat_현재 가장 필요한 주거지원 1순위_전세자금 대출 지원',
    'Cat_현재 가장 필요한 주거지원 1순위_주거상담과 정보제공 등','Cat_현재 가장 필요한 주거지원 1순위_주택 구입자금 대출 지원',
    'Cat_현재 가장 필요한 주거지원 1순위_주택개량 및 개보수 지원',
]]

In [14]:
X_45 = comp_45.drop('target', axis=1)
y_45 = comp_45['target']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [16]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [17]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8440117112275516


In [18]:
optuna_45 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [19]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(auc_45)

0.851314043956832


In [20]:
X_train = X_train.values
y_train = y_train.values

In [21]:
auc_bootstrap = []

In [22]:
rs = RandomState(seed = 45)
bootstrap_auc(optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84487917, 0.8521867 ])

In [23]:
np.mean(auc_bootstrap)

0.8487899591266

In [24]:
t_45 = auc_bootstrap
print(t_45)

[0.8483444480298304, 0.8462703022480371, 0.8499910365350829, 0.8485147538632534, 0.8480576171524864, 0.8497884622279588, 0.8473952170951203, 0.8444910544620129, 0.8479626044243663, 0.8487155354773941, 0.8472500089634647, 0.848337277257897, 0.8511213294611164, 0.8510254203865045, 0.8500134451973755, 0.8520347065361585, 0.8490677996486322, 0.8484959305869276, 0.8500645369474024, 0.849154745258327, 0.8477922985909434, 0.8526657344663153, 0.8492318310566133, 0.8489234878634685, 0.8463052597612133, 0.8456329998924386, 0.8469855867484135, 0.8487711089598795, 0.8514090566849521, 0.8527222042952924, 0.8499202251622388, 0.8491215804381342, 0.8479850130866589, 0.8498485174429027, 0.8510424509698469, 0.8498028037718262, 0.8503594349431716, 0.8512728120182138, 0.849289645405328, 0.8489145243985516, 0.849818041662185, 0.8514027822595104, 0.8469793123229715, 0.8502868308773439, 0.8504517586318168, 0.8493671793768599, 0.8523851780144134, 0.8462559607041698, 0.8511123659961994, 0.8505835215660966, 0.8

In [25]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [26]:
# 46.
column_to_drop_45 = '현재 주택 거주 기간(총 개월)'

In [27]:
if not column_to_drop_45.startswith('Cat_'):
    comp_46 = comp_45.drop(column_to_drop_45, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']
else:
    comp_46 = comp_45.drop(comp_45.filter(regex='^' + column_to_drop_45).columns, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']

print(X_46.shape)

(10564, 37)


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [29]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [30]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8432517689999552


In [31]:
optuna_46 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [32]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(auc_46)

0.8505543903051164


In [33]:
X_train = X_train.values
y_train = y_train.values

In [34]:
auc_bootstrap = []

In [35]:
rs = RandomState(seed = 46)
bootstrap_auc(optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84446205, 0.85148596])

In [36]:
np.mean(auc_bootstrap)

0.8481225475959986

In [37]:
t_46 = auc_bootstrap
print(t_46)

[0.8456769208705317, 0.8460614535154711, 0.8486124556308485, 0.8475852425513608, 0.8484363235452297, 0.8469524219282206, 0.8490140188591302, 0.851102058011545, 0.8494178229536411, 0.8432419956258292, 0.8477107310601987, 0.8474010433473163, 0.8459108673048654, 0.8497512638485533, 0.8505409451077408, 0.8484107776702162, 0.8495513785809042, 0.8460668315944211, 0.846347836219569, 0.8462425155067943, 0.8489588935498906, 0.8481158796744469, 0.8457961349539278, 0.8490041590477215, 0.8474664766412106, 0.846135850274282, 0.8507896812591875, 0.8480903337994335, 0.8506458176472698, 0.847772578968126, 0.8481741421964075, 0.8460991000681224, 0.8483435516833387, 0.8476748772005307, 0.8495751317629343, 0.8466503531605178, 0.8471800939371124, 0.8449607400236636, 0.8460108099386899, 0.8489835430784123, 0.8473315764942095, 0.8455751855437239, 0.8481324620845433, 0.8489293141156644, 0.8483587895736977, 0.8507367968161774, 0.8462918145638377, 0.8466265999784875, 0.8477707862751426, 0.8468592018930838, 0.8

In [38]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [39]:
# 47.
column_to_drop_46 = '소득 중 사적이전소득의 비중(월평균)'

In [40]:
if not column_to_drop_46.startswith('Cat_'):
    comp_47 = comp_46.drop(column_to_drop_46, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']
else:
    comp_47 = comp_46.drop(comp_46.filter(regex='^' + column_to_drop_46).columns, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']

print(X_47.shape)

(10564, 36)


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [42]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [43]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8465343555465985


In [44]:
optuna_47 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [45]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(auc_47)

0.8500990462873328


In [46]:
X_train = X_train.values
y_train = y_train.values

In [47]:
auc_bootstrap = []

In [48]:
rs = RandomState(seed = 47)
bootstrap_auc(optuna_47, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84424877, 0.85111117])

In [49]:
np.mean(auc_bootstrap)

0.8477660302606576

In [50]:
t_47 = auc_bootstrap
print(t_47)

[0.8499726614320031, 0.8461143379584812, 0.8483959879531032, 0.8479348176831236, 0.8493246029185042, 0.8475345989745795, 0.845272668602775, 0.8487746943458463, 0.8487854505037467, 0.8483986769925782, 0.8477474812663585, 0.848053135420028, 0.8466781399017604, 0.8481553189200818, 0.8469918611738552, 0.8476950449965939, 0.8509779140224447, 0.8459373095263705, 0.8491852210390448, 0.8473916317091534, 0.8464571904915565, 0.8503258219497327, 0.8454855508945538, 0.8489862321178876, 0.8469707970313005, 0.8500134451973754, 0.846359936897207, 0.8503751210067763, 0.8468600982395755, 0.8488289233085942, 0.8484166039224122, 0.8479375067225987, 0.8484040550715284, 0.8504669965221756, 0.8455420207235308, 0.8490893119644329, 0.8507193180595891, 0.8450172098526406, 0.8509129289017963, 0.8485031013588613, 0.8445040514861424, 0.8468834032483596, 0.8479312322971567, 0.8472925854218207, 0.8493232583987667, 0.8474579613495393, 0.849770087124879, 0.847106593524793, 0.8451512136531498, 0.845729805313542, 0.849

In [51]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [52]:
# 48
column_to_drop_47 = 'Cat_현재 주택의 점유형태'

In [53]:
if not column_to_drop_47.startswith('Cat_'):
    comp_48 = comp_47.drop(column_to_drop_47, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']
else:
    comp_48 = comp_47.drop(comp_47.filter(regex='^' + column_to_drop_47).columns, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']

print(X_48.shape)

(10564, 32)


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [55]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [56]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8454840118231896


In [57]:
optuna_48 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [58]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(auc_48)

0.8494854971137642


In [59]:
X_train = X_train.values
y_train = y_train.values

In [60]:
auc_bootstrap = []

In [61]:
rs = RandomState(seed = 48)
bootstrap_auc(optuna_48, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84334984, 0.85015164])

In [62]:
np.mean(auc_bootstrap)

0.8468868770391882

In [63]:
t_48 = auc_bootstrap
print(t_48)

[0.8479679825033164, 0.8491318884227887, 0.846562959377577, 0.8458543974758883, 0.8484134667096914, 0.8495079057760567, 0.8472612132946112, 0.846858305546592, 0.8478935857445056, 0.8468435158294789, 0.848650550356746, 0.8479980101107885, 0.8467628446452261, 0.8466382524828797, 0.8455648775590692, 0.8480544799397653, 0.8494967014449104, 0.8482489871284644, 0.8450033164820193, 0.8459243125022409, 0.849510146642286, 0.8476018249614571, 0.846189631063784, 0.8450620271772257, 0.8464450898139184, 0.8485555376286258, 0.8498650998529991, 0.8465392061955469, 0.8433571761500125, 0.8462694059015453, 0.8468919185400308, 0.8486935749883475, 0.8457168082894124, 0.848581531676885, 0.8465616148578394, 0.8462962962962963, 0.8439985120648237, 0.8462514789717113, 0.8467843569610269, 0.8465535477394142, 0.8480863002402208, 0.8458028575526155, 0.8483354845649135, 0.8485645010935426, 0.8454129468287261, 0.844156269047363, 0.845927001541716, 0.8486873005629056, 0.8447120038722168, 0.8456621311534187, 0.85129

In [64]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [65]:
# 49
column_to_drop_48 = '총 이사 횟수'

In [66]:
if not column_to_drop_48.startswith('Cat_'):
    comp_49 = comp_48.drop(column_to_drop_48, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']
else:
    comp_49 = comp_48.drop(comp_48.filter(regex='^' + column_to_drop_48).columns, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']

print(X_49.shape)

(10564, 31)


In [67]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [68]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [69]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8431405067401139


In [70]:
optuna_49 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)

In [71]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(auc_49)

0.8489750277867413


In [72]:
X_train = X_train.values
y_train = y_train.values

In [73]:
auc_bootstrap = []

In [74]:
rs = RandomState(seed = 49)
bootstrap_auc(optuna_49, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84269425, 0.84943808])

In [75]:
np.mean(auc_bootstrap)

0.8461331863325087

In [76]:
t_49 = auc_bootstrap
print(t_49)

[0.8483480334157973, 0.8478900003585386, 0.8476188555447993, 0.8494438170019003, 0.8460063282062315, 0.8452309884909109, 0.8424442472482163, 0.8458575346886092, 0.8458064429385823, 0.8463361837151769, 0.8471128679502349, 0.8426611630992076, 0.8448164282385, 0.8453430318023736, 0.8436525223190277, 0.8452663941773332, 0.8472849664766413, 0.8445636585278405, 0.8477954358036643, 0.8466265999784877, 0.8458884586425728, 0.8476972858628231, 0.8470124771431644, 0.8417634720877704, 0.8448209099709584, 0.8427924778602417, 0.8465333799433509, 0.8475901724570649, 0.847679807106235, 0.8454187730809222, 0.8430815496038149, 0.8460269441755406, 0.843406475207056, 0.8456894697214156, 0.8458024093793696, 0.8466198773797999, 0.8448056720805994, 0.8452354702233695, 0.8508851421605537, 0.8454967552257, 0.8473593632354524, 0.8482220967337133, 0.8465409988885304, 0.8430564519020474, 0.8484018142052991, 0.8486102147646195, 0.8456809544297444, 0.8446631529884192, 0.8405946362625937, 0.8442920655408555, 0.84571

In [77]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [78]:
# 50
column_to_drop_49 = '현재 주택의 면적(㎡)'

In [79]:
if not column_to_drop_49.startswith('Cat_'):
    comp_50 = comp_49.drop(column_to_drop_49, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']
else:
    comp_50 = comp_49.drop(comp_49.filter(regex='^' + column_to_drop_49).columns, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']

print(X_50.shape)

(10564, 30)


In [80]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [81]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [82]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8419362152357921


In [83]:
optuna_50 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [84]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(auc_50)

0.848775590692338


In [85]:
X_train = X_train.values
y_train = y_train.values

In [86]:
auc_bootstrap = []

In [87]:
rs = RandomState(seed = 50)
bootstrap_auc(optuna_50, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84233033, 0.84901187])

In [88]:
np.mean(auc_bootstrap)

0.8458492134559535

In [89]:
t_50 = auc_bootstrap
print(t_50)

[0.8460457674518662, 0.843726470904593, 0.8480266931985228, 0.8467050302965115, 0.8448486967122011, 0.8446900433831702, 0.8446627048151734, 0.8465938833315407, 0.8474978487684199, 0.8433907891434513, 0.8436404216413897, 0.8470281632067692, 0.8456504786490266, 0.8488298196550859, 0.8458974221074897, 0.8468551683338711, 0.8461739450001793, 0.8437811480405866, 0.8458499157434297, 0.8461959054892261, 0.8447819188985695, 0.8470935965006633, 0.8456473414363057, 0.8501039761930372, 0.8452699795633001, 0.8464486751998852, 0.8466185328600624, 0.8458042020723533, 0.8491977698899287, 0.8450893657452225, 0.8500040335592127, 0.8452681868703165, 0.8419073356996881, 0.8469972392528056, 0.8455662220788067, 0.8472213258757306, 0.8456805062564985, 0.8450718869886342, 0.8449015811552114, 0.8473351618801764, 0.8477134200996738, 0.8493653866838766, 0.8447545803305727, 0.8454062242300384, 0.8458207844824495, 0.8475417697465133, 0.8449020293284573, 0.8479124090208312, 0.8456926069341364, 0.8466207737262916, 

In [90]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [91]:
# 51
column_to_drop_50 = '장기부채부담지표'

In [92]:
if not column_to_drop_50.startswith('Cat_'):
    comp_51 = comp_50.drop(column_to_drop_50, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']
else:
    comp_51 = comp_50.drop(comp_50.filter(regex='^' + column_to_drop_50).columns, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']

print(X_51.shape)

(10564, 29)


In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [94]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [95]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.06999999999999999, 'n_estimators': 66, 'subsample': 0.5, 'max_depth': 3, 'min_samples_split': 6, 'min_samples_leaf': 9}
0.8412595447624166


In [96]:
optuna_51 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [97]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(auc_51)

0.8459189344232906


In [98]:
X_train = X_train.values
y_train = y_train.values

In [99]:
auc_bootstrap = []

In [100]:
rs = RandomState(seed = 51)
bootstrap_auc(optuna_51, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84174124, 0.84649846])

In [101]:
np.mean(auc_bootstrap)

0.8442520770589078

In [102]:
t_51 = auc_bootstrap
print(t_51)

[0.8419476712918145, 0.8447079703130043, 0.8456612348069269, 0.8437264709045927, 0.8437860779462909, 0.8461044781470726, 0.8440401921766879, 0.8451619698110502, 0.845183482126851, 0.8437188519594135, 0.8430967874941737, 0.8450669570829299, 0.8437847334265535, 0.8430470402638844, 0.8428785271234449, 0.8471379656520024, 0.8440146463016742, 0.8437605320712775, 0.842161449930085, 0.8458145100570076, 0.8441392384640207, 0.8443480871965867, 0.8439299415582087, 0.8446958696353664, 0.8434772865799003, 0.8448052239073536, 0.8438640600910687, 0.8438878132730988, 0.8445618658348573, 0.8422022336954573, 0.8451350794162991, 0.8439021548169661, 0.8452251622387149, 0.8439873077336775, 0.8440536373740631, 0.8446165429708506, 0.8441217597074326, 0.8440478111218672, 0.8428511885554479, 0.8439595209924349, 0.8438559929726435, 0.8439003621239827, 0.8446627048151735, 0.8442082571438817, 0.8427754472768995, 0.842943960417339, 0.8424366283030367, 0.8452681868703166, 0.8452802875479546, 0.8449871822451687, 0.

In [103]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [104]:
# 52
column_to_drop_51 = 'Cat_현재 주택의 유형'

In [105]:
if not column_to_drop_51.startswith('Cat_'):
    comp_52 = comp_51.drop(column_to_drop_51, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']
else:
    comp_52 = comp_51.drop(comp_51.filter(regex='^' + column_to_drop_51).columns, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']

print(X_52.shape)

(10564, 18)


In [106]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [107]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [108]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.836146379148193


In [109]:
optuna_52 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [110]:
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(auc_52)

0.8426530959807823


In [111]:
X_train = X_train.values
y_train = y_train.values

In [112]:
auc_bootstrap = []

In [113]:
rs = RandomState(seed = 52)
bootstrap_auc(optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.83757747, 0.84377437])

In [114]:
np.mean(auc_bootstrap)

0.8408929336524327

In [115]:
t_52 = auc_bootstrap
print(t_52)

[0.8433491090315872, 0.8407653902692626, 0.8404489799576924, 0.8409002904162632, 0.8408469578000071, 0.8401020938654047, 0.8389036786060019, 0.8386186404216415, 0.8432249650424868, 0.8415133914165859, 0.8417329963070526, 0.8431272632748916, 0.8424308020508408, 0.8422371912086335, 0.8391577928363989, 0.842382399340289, 0.8408187228855187, 0.8398923487863469, 0.8408554730916783, 0.8402974974005951, 0.8430506256498513, 0.8388611021476462, 0.839360367143523, 0.840907461188197, 0.8410508766268688, 0.8397691011437382, 0.841182191387903, 0.8406376608941952, 0.8422833530529563, 0.8392492201785523, 0.8433437309526371, 0.838829281847191, 0.8410755261553907, 0.841023538058872, 0.8420408913269513, 0.8387015524721236, 0.8430425585314258, 0.8398865225341507, 0.8424079452153024, 0.8418620702018573, 0.8409634828439281, 0.8407895916245384, 0.8422802158402352, 0.8404216413896956, 0.8443081997777061, 0.8415362482521244, 0.8416841454232548, 0.8417549567960991, 0.8405816392384641, 0.8412050482234412, 0.841

In [116]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [118]:
# 53
column_to_drop_52 = '가구주 나이'

In [119]:
if not column_to_drop_52.startswith('Cat_'):
    comp_53 = comp_52.drop(column_to_drop_52, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']
else:
    comp_53 = comp_52.drop(comp_52.filter(regex='^' + column_to_drop_52).columns, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']

print(X_53.shape)

(10564, 17)


In [120]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [121]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [122]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8361680717900488


In [123]:
optuna_53 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [124]:
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(auc_53)

0.8409908214119249


In [125]:
X_train = X_train.values
y_train = y_train.values

In [126]:
auc_bootstrap = []

In [127]:
rs = RandomState(seed = 53)
bootstrap_auc(optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.83639455, 0.84246084])

In [128]:
np.mean(auc_bootstrap)

0.839512574844932

In [129]:
t_53 = auc_bootstrap
print(t_53)

[0.8390098956652684, 0.8346863683625543, 0.8394670323760351, 0.8403261804883295, 0.8402880857624322, 0.8403369366462299, 0.8382914739521709, 0.8406950270696638, 0.8392873149044495, 0.841745993331182, 0.8406717220608797, 0.8380239145243985, 0.8381534365924491, 0.8395880391524146, 0.8410199526729052, 0.8397834426876053, 0.8388570685884336, 0.8415712057653005, 0.841590925388118, 0.8391232834964684, 0.8394495536194471, 0.8423120361406905, 0.8411544046466601, 0.8407595640170663, 0.8369469542146213, 0.8396669176436843, 0.8379136639059193, 0.8394522426589222, 0.8382345559499479, 0.8380243626976444, 0.8388534832024667, 0.8391031157004051, 0.8401083682908465, 0.8372929439604174, 0.8390152737442185, 0.8405829837582015, 0.8394007027356494, 0.8396436126349001, 0.8394849593058691, 0.8400389014377397, 0.8406493133985873, 0.8392873149044494, 0.8384151697680255, 0.839700530637123, 0.8372696389516331, 0.8394688250690185, 0.8374905883618371, 0.8395028862357032, 0.8399591265999784, 0.8394885446918361, 0.

In [130]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [131]:
# 54
column_to_drop_53 = 'Cat_이사 계획 중인 거주 지역'

In [132]:
if not column_to_drop_53.startswith('Cat_'):
    comp_54 = comp_53.drop(column_to_drop_53, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']
else:
    comp_54 = comp_53.drop(comp_53.filter(regex='^' + column_to_drop_53).columns, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']

print(X_54.shape)

(10564, 10)


In [133]:
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [134]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [135]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8244379506471405


In [136]:
optuna_54 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [137]:
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(auc_54)

0.8293956831952961


In [138]:
X_train = X_train.values
y_train = y_train.values

In [139]:
auc_bootstrap = []

In [140]:
rs = RandomState(seed = 54)
bootstrap_auc(optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.82325602, 0.83155648])

In [141]:
np.mean(auc_bootstrap)

0.8274134017245707

In [142]:
t_54 = auc_bootstrap
print(t_54)

[0.8245827507081138, 0.8261997597791402, 0.8280525079774838, 0.8272955433652432, 0.8294754580330571, 0.8281224230038363, 0.8277459574773225, 0.8250546771359937, 0.82655292029687, 0.8246531139077121, 0.8287960273923488, 0.8306626689613136, 0.8281896489907139, 0.8250336129934389, 0.8288350184647377, 0.8316638879925424, 0.8270418773080923, 0.8262786382704097, 0.8258941056254705, 0.8270037825821951, 0.8255230181779067, 0.8280681940410886, 0.8310996378760174, 0.8257896812591876, 0.8267321895952099, 0.8278956473414363, 0.8270400846151087, 0.8287377648703884, 0.8274976694991216, 0.8262669857660176, 0.8261347746584919, 0.8274210318740812, 0.8272825463411136, 0.8280655050016134, 0.8241614678570147, 0.8267079882399341, 0.8258430138754437, 0.8282658384425083, 0.8270203649922915, 0.8332463877236385, 0.827541590477215, 0.8308621060557169, 0.8266102864723388, 0.8260478290487971, 0.8265869814635546, 0.8287619662256642, 0.8275111146964972, 0.8280906027033811, 0.8277647807536481, 0.8226928041303646, 0.

In [143]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [144]:
# 55
column_to_drop_54 = '소득 대비 주택 임대료의 비율'

In [145]:
if not column_to_drop_54.startswith('Cat_'):
    comp_55 = comp_54.drop(column_to_drop_54, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']
else:
    comp_55 = comp_54.drop(comp_54.filter(regex='^' + column_to_drop_54).columns, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']

print(X_55.shape)

(10564, 9)


In [146]:
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [147]:
def objective(trial):
    params = {
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),        
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = GradientBoostingClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [148]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.08, 'n_estimators': 53, 'subsample': 0.7000000000000001, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7767897129293744


In [149]:
optuna_55 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [150]:
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(auc_55)

0.7781246638700656


In [151]:
X_train = X_train.values
y_train = y_train.values

In [152]:
auc_bootstrap = []

In [153]:
rs = RandomState(seed = 55)
bootstrap_auc(optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7773556 , 0.77871715])

In [154]:
np.mean(auc_bootstrap)

0.7780430707934459

In [155]:
t_55 = auc_bootstrap
print(t_55)

[0.777830662220788, 0.7777437166110932, 0.7775545875013445, 0.7786364777168262, 0.7779256749489082, 0.7786221361729591, 0.7777580581549604, 0.7781246638700656, 0.7773555985801871, 0.778541464988706, 0.7786965329317701, 0.7781246638700656, 0.7781040479007565, 0.7779480836112007, 0.7775545875013445, 0.7779256749489082, 0.7783666774228245, 0.7781954752429099, 0.7781246638700656, 0.7775545875013445, 0.7775545875013445, 0.7781246638700656, 0.7775545875013445, 0.7786965329317701, 0.7786221361729591, 0.7781246638700656, 0.7781246638700656, 0.7775545875013445, 0.7775339715320354, 0.7787171489010791, 0.7786364777168262, 0.7778243877953461, 0.7787171489010791, 0.7773555985801871, 0.7778243877953461, 0.7775545875013445, 0.7786965329317701, 0.7781246638700656, 0.7779256749489082, 0.7781040479007565, 0.778147072532358, 0.7786364777168262, 0.7781246638700656, 0.7778450037646552, 0.7787171489010791, 0.7775447276899359, 0.7781246638700656, 0.7786965329317701, 0.7781246638700656, 0.7778243877953461, 0.

In [156]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc