In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000,
                  random_state=None, scores=None):
    """Bootstrap the test-set ROC AUC of ``clf`` and return a 95% percentile interval.

    Every round draws ``len(X_train)`` row indices with replacement, fits a
    fresh clone of ``clf`` on that resample and scores its class-1
    probabilities on the fixed test set.

    Parameters
    ----------
    clf : scikit-learn estimator
        Template model. It is cloned for every round, so the instance passed
        in keeps whatever fit it already had.
    X_train, y_train : array-like
        Training data to resample. DataFrames/Series are accepted; they are
        converted with ``np.asarray`` so positional indexing works.
    X_test, y_test : array-like
        Held-out data every bootstrap model is scored on.
    nsamples : int, default 2000
        Number of bootstrap rounds.
    random_state : object with ``randint(high, size=...)``, optional
        Source of the resampling indices. Defaults to the notebook-level
        ``rs`` so existing calls keep working.
    scores : list, optional
        List that receives one AUC per round. Defaults to the notebook-level
        ``auc_bootstrap`` so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        The 2.5th and 97.5th percentiles of every value in the score list
        (including entries that were already in it before the call).
    """
    # Local import keeps this cell self-contained; sklearn is already a
    # dependency of the notebook.
    from sklearn.base import clone

    # Fall back to the module-level objects the original notebook relies on.
    rng = rs if random_state is None else random_state
    collected = auc_bootstrap if scores is None else scores

    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)
    y_true = np.asarray(y_test).ravel()
    n_rows = X_train.shape[0]

    for _ in range(nsamples):
        idx = rng.randint(n_rows, size=n_rows)
        # Fit a clone so the caller's fitted estimator is not overwritten by
        # a model trained on the last bootstrap resample.
        model = clone(clf).fit(X_train[idx], y_train[idx])
        pred = model.predict_proba(X_test)[:, 1]
        collected.append(roc_auc_score(y_true, pred.ravel()))
    return np.percentile(collected, (2.5, 97.5))

In [4]:
class EarlyStoppingCallback(object):
    """Optuna callback that stops a study once the best value stops improving.

    After each trial the study's best value is compared with the best value
    this callback has seen so far. When ``early_stopping_rounds`` consecutive
    trials pass without a strictly better value, ``study.stop()`` is called.

    Parameters
    ----------
    early_stopping_rounds : int
        Number of consecutive non-improving trials tolerated before stopping.
    direction : {"minimize", "maximize"}
        Optimisation direction of the study; decides what "better" means.

    Raises
    ------
    ValueError
        If ``direction`` is neither ``"minimize"`` nor ``"maximize"``.
    """

    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        # Count of consecutive trials without a new best value.
        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            # Previously the exception was built but never raised, leaving the
            # instance without `_operator` and failing later with AttributeError.
            raise ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        try:
            best_value = study.best_value
        except ValueError:
            # Optuna raises ValueError from `best_value` while no trial has
            # completed (e.g. the first trials failed or were pruned); there is
            # nothing to compare yet.
            return

        if self._operator(best_value, self._score):
            self._iter = 0
            self._score = best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [5]:
# Raise Optuna's log threshold to WARNING for the rest of the notebook.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [6]:
# Read the CP949-encoded survey CSV and label the Q41 response column `target`.
고령가구 = pd.read_csv('고령가구_변수추가.csv', encoding='cp949')
고령가구 = 고령가구.rename(
    columns={'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?': 'target'}
)

In [7]:
# Replace every column label, by position, with the 57 short names below
# (the last one is 'target'). pandas raises if the list length differs from
# the number of columns in the frame.
# The 'Cat_' prefix is what the later feature-removal cells test
# (`startswith('Cat_')`) to decide whether a name stands for a group of
# one-hot columns rather than a single column.
고령가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [8]:
# Partition the frame by dtype: object columns go to `cat`, all others to `num`;
# the response is kept separately and removed from the numeric inputs.
target = 고령가구['target']
num = 고령가구.select_dtypes(exclude='object')
cat = 고령가구.select_dtypes(include='object')
num_고령 = num.drop(columns='target')

In [9]:
# Robust-scale the numeric inputs and wrap the result back into a DataFrame
# that keeps the original column names (the index is reset to 0..n-1).
# NOTE(review): the scaler is fit on all rows before the train/test split
# performed later, so test rows contribute to the scaling statistics.
scaler = RobustScaler()
num_scaled_고령 = scaler.fit_transform(num_고령)
num_df_scaled_고령 = pd.DataFrame(num_scaled_고령, columns=num_고령.columns)

In [10]:
# One-hot encode the object-dtype columns and densify the result into a
# DataFrame whose columns carry the encoder's generated feature names.
enc = OneHotEncoder()
X_cat = enc.fit_transform(cat).toarray()

new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(data=X_cat, columns=new_feature_names)

In [11]:
# Assemble the modelling table: scaled numerics, the response, then the one-hot block.
comp = pd.concat(
    [num_df_scaled_고령, target, cat2],
    axis='columns',
)

In [12]:
# Baseline design matrix (all features) and response vector.
y = comp['target']
X = comp.drop(columns='target')
X.shape

(10564, 210)

In [None]:
# Stratified 80/20 hold-out split, then a stratified 80/20 train/validation
# split of the remaining rows (both with random_state=0).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2, random_state=0, shuffle=True, stratify=y,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
# Show the best hyper-parameters and keep the validation AUC they achieved.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

In [None]:
# Refit ExtraTrees on the training split with the study's best hyper-parameters.
optuna_0 = ExtraTreesClassifier(random_state=0, **study.best_trial.params)
optuna_0.fit(X=X_train, y=y_train)

In [None]:
# Hold-out ROC AUC of the tuned model, from its class-1 probabilities.
optuna_proba_0 = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_true=y_test, y_score=optuna_proba_0)
print(auc_0)

In [None]:
# bootstrap_auc indexes its training inputs with an integer array, which needs
# plain ndarrays rather than a DataFrame/Series. np.asarray is used instead of
# `.values` so that re-running this cell on already-converted arrays is a no-op
# instead of an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [None]:
# Fresh module-level accumulator: bootstrap_auc appends one AUC per round to it.
auc_bootstrap = []

In [None]:
# Seed the module-level generator read by bootstrap_auc, then run 2000 rounds;
# the returned percentile pair is the cell's displayed value.
rs = RandomState(0)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, 2000)

In [None]:
# Keep a snapshot of the bootstrap AUCs for the full-feature model. A copy is
# stored (not an alias) so that re-running the bootstrap cell, which appends
# to `auc_bootstrap`, cannot silently change `t_0`.
t_0 = list(auc_bootstrap)
print(t_0)

In [None]:
# Drop this iteration's splits, study and validation score from the namespace
# before the next feature subset rebinds the same names.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [18]:
column_to_drop = 'Cat_가구주 동거 여부'

In [19]:
# Remove one survey variable from the modelling table.
# A name starting with 'Cat_' may have been expanded by the one-hot encoder
# into '<name>_<category>' columns, so every such column (and the bare name,
# if present) is dropped together; any other name is a single column.
# The match is literal rather than a regex, and anchored on the '_' separator,
# so regex metacharacters in a name or another variable sharing the same
# prefix cannot change which columns are removed.
if column_to_drop.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp.columns
        if c == column_to_drop or c.startswith(column_to_drop + '_')
    ]
else:
    cols_to_remove = [column_to_drop]

comp_1 = comp.drop(columns=cols_to_remove)
X_1 = comp_1.drop(columns='target')
y_1 = comp_1['target']

print(X_1.shape)

(10564, 208)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_1= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [None]:
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(auc_1)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_1 = auc_bootstrap
print(t_1)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [20]:
column_to_drop_1 = '부채 중 임대 보증금의 비중'

In [21]:
# Remove one survey variable from the previous feature table.
# 'Cat_' names are matched literally against the bare name and its
# '<name>_<category>' one-hot columns (no regex, '_'-anchored prefix);
# any other name is dropped as a single column.
if column_to_drop_1.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp_1.columns
        if c == column_to_drop_1 or c.startswith(column_to_drop_1 + '_')
    ]
else:
    cols_to_remove = [column_to_drop_1]

comp_2 = comp_1.drop(columns=cols_to_remove)
X_2 = comp_2.drop(columns='target')
y_2 = comp_2['target']

print(X_2.shape)

(10564, 207)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_2= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [None]:
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(auc_2)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_2 = auc_bootstrap
print(t_2)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [22]:
column_to_drop_2 = '소득 중 재산소득의 비중(월평균)'

In [23]:
# Remove one survey variable from the previous feature table.
# 'Cat_' names are matched literally against the bare name and its
# '<name>_<category>' one-hot columns (no regex, '_'-anchored prefix);
# any other name is dropped as a single column.
if column_to_drop_2.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp_2.columns
        if c == column_to_drop_2 or c.startswith(column_to_drop_2 + '_')
    ]
else:
    cols_to_remove = [column_to_drop_2]

comp_3 = comp_2.drop(columns=cols_to_remove)
X_3 = comp_3.drop(columns='target')
y_3 = comp_3['target']

print(X_3.shape)

(10564, 206)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_3 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [None]:
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(auc_3)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_3 = auc_bootstrap
print(t_3)

In [24]:
column_to_drop_3 = 'Cat_가구주 주민등록상 등재 여부'

In [25]:
# Remove one survey variable from the previous feature table.
# 'Cat_' names are matched literally against the bare name and its
# '<name>_<category>' one-hot columns (no regex, '_'-anchored prefix);
# any other name is dropped as a single column.
if column_to_drop_3.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp_3.columns
        if c == column_to_drop_3 or c.startswith(column_to_drop_3 + '_')
    ]
else:
    cols_to_remove = [column_to_drop_3]

comp_4 = comp_3.drop(columns=cols_to_remove)
X_4 = comp_4.drop(columns='target')
y_4 = comp_4['target']

print(X_4.shape)

(10564, 204)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_4= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [None]:
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(auc_4)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_4 = auc_bootstrap
print(t_4)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [26]:
column_to_drop_4 = '부채 중 비금융기관 대출금의 비중'

In [27]:
# Remove one survey variable from the previous feature table.
# 'Cat_' names are matched literally against the bare name and its
# '<name>_<category>' one-hot columns (no regex, '_'-anchored prefix);
# any other name is dropped as a single column.
if column_to_drop_4.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp_4.columns
        if c == column_to_drop_4 or c.startswith(column_to_drop_4 + '_')
    ]
else:
    cols_to_remove = [column_to_drop_4]

comp_5 = comp_4.drop(columns=cols_to_remove)
X_5 = comp_5.drop(columns='target')
y_5 = comp_5['target']

print(X_5.shape)

(10564, 203)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_5 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [None]:
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(auc_5)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_5 = auc_bootstrap
print(t_5)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [28]:
column_to_drop_5 = '소득 대비 생활비의 비율'

In [29]:
# Remove one survey variable from the previous feature table.
# 'Cat_' names are matched literally against the bare name and its
# '<name>_<category>' one-hot columns (no regex, '_'-anchored prefix);
# any other name is dropped as a single column.
if column_to_drop_5.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp_5.columns
        if c == column_to_drop_5 or c.startswith(column_to_drop_5 + '_')
    ]
else:
    cols_to_remove = [column_to_drop_5]

comp_6 = comp_5.drop(columns=cols_to_remove)
X_6 = comp_6.drop(columns='target')
y_6 = comp_6['target']

print(X_6.shape)

(10564, 202)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_6 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [None]:
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(auc_6)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_6 = auc_bootstrap
print(t_6)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [30]:
column_to_drop_6 = '중기부채부담지표'

In [31]:
# Remove one survey variable from the previous feature table.
# 'Cat_' names are matched literally against the bare name and its
# '<name>_<category>' one-hot columns (no regex, '_'-anchored prefix);
# any other name is dropped as a single column.
if column_to_drop_6.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp_6.columns
        if c == column_to_drop_6 or c.startswith(column_to_drop_6 + '_')
    ]
else:
    cols_to_remove = [column_to_drop_6]

comp_7 = comp_6.drop(columns=cols_to_remove)
X_7 = comp_7.drop(columns='target')
y_7 = comp_7['target']

print(X_7.shape)

(10564, 201)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_7 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [None]:
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(auc_7)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_7 = auc_bootstrap
print(t_7)

In [32]:
column_to_drop_7 = '장기부채부담지표'

In [33]:
# Remove one survey variable from the previous feature table.
# 'Cat_' names are matched literally against the bare name and its
# '<name>_<category>' one-hot columns (no regex, '_'-anchored prefix);
# any other name is dropped as a single column.
if column_to_drop_7.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp_7.columns
        if c == column_to_drop_7 or c.startswith(column_to_drop_7 + '_')
    ]
else:
    cols_to_remove = [column_to_drop_7]

comp_8 = comp_7.drop(columns=cols_to_remove)
X_8 = comp_8.drop(columns='target')
y_8 = comp_8['target']

print(X_8.shape)

(10564, 200)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_8 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [None]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(auc_8)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 8)
bootstrap_auc(optuna_8, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_8 = auc_bootstrap
print(t_8)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [34]:
column_to_drop_8 = '자산 중 부동산 자산의 비중'

In [35]:
# Remove one survey variable from the previous feature table.
# 'Cat_' names are matched literally against the bare name and its
# '<name>_<category>' one-hot columns (no regex, '_'-anchored prefix);
# any other name is dropped as a single column.
if column_to_drop_8.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp_8.columns
        if c == column_to_drop_8 or c.startswith(column_to_drop_8 + '_')
    ]
else:
    cols_to_remove = [column_to_drop_8]

comp_9 = comp_8.drop(columns=cols_to_remove)
X_9 = comp_9.drop(columns='target')
y_9 = comp_9['target']

print(X_9.shape)

(10564, 199)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_9 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [None]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(auc_9)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 9)
bootstrap_auc(optuna_9, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_9 = auc_bootstrap
print(t_9)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [36]:
column_to_drop_9 = '자산 중 기타자산의 비중'

In [37]:
# Remove one survey variable from the previous feature table.
# 'Cat_' names are matched literally against the bare name and its
# '<name>_<category>' one-hot columns (no regex, '_'-anchored prefix);
# any other name is dropped as a single column.
if column_to_drop_9.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp_9.columns
        if c == column_to_drop_9 or c.startswith(column_to_drop_9 + '_')
    ]
else:
    cols_to_remove = [column_to_drop_9]

comp_10 = comp_9.drop(columns=cols_to_remove)
X_10 = comp_10.drop(columns='target')
y_10 = comp_10['target']

print(X_10.shape)

(10564, 198)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_10 = ExtraTreesClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [None]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(auc_10)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 10)
bootstrap_auc(optuna_10, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_10 = auc_bootstrap
print(t_10)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [38]:
column_to_drop_10 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [39]:
# Remove one survey variable from the previous feature table.
# 'Cat_' names are matched literally against the bare name and its
# '<name>_<category>' one-hot columns (no regex, '_'-anchored prefix);
# any other name is dropped as a single column.
if column_to_drop_10.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp_10.columns
        if c == column_to_drop_10 or c.startswith(column_to_drop_10 + '_')
    ]
else:
    cols_to_remove = [column_to_drop_10]

comp_11 = comp_10.drop(columns=cols_to_remove)
X_11 = comp_11.drop(columns='target')
y_11 = comp_11['target']

print(X_11.shape)

(10564, 197)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Score one Optuna trial as the validation ROC AUC of an ExtraTrees model.

    Four tree hyper-parameters are sampled from ``trial``; the model is fit on
    the notebook-level ``X_train``/``y_train`` and its class-1 probabilities
    are evaluated against ``X_val``/``y_val``.
    """
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the objective: at most 200 trials, stopped
# early by the callback after 10 consecutive trials without a new best value.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_11 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [None]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(auc_11)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 11)
bootstrap_auc(optuna_11, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_11 = auc_bootstrap
print(t_11)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [40]:
column_to_drop_11 = '부채 중 금융기관 대출금의 비중'

In [41]:
# Remove one survey variable from the previous feature table.
# 'Cat_' names are matched literally against the bare name and its
# '<name>_<category>' one-hot columns (no regex, '_'-anchored prefix);
# any other name is dropped as a single column.
if column_to_drop_11.startswith('Cat_'):
    cols_to_remove = [
        c for c in comp_11.columns
        if c == column_to_drop_11 or c.startswith(column_to_drop_11 + '_')
    ]
else:
    cols_to_remove = [column_to_drop_11]

comp_12 = comp_11.drop(columns=cols_to_remove)
X_12 = comp_12.drop(columns='target')
y_12 = comp_12['target']

print(X_12.shape)

(10564, 196)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_12 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [None]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(auc_12)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 12)
bootstrap_auc(optuna_12, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_12 = auc_bootstrap
print(t_12)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [42]:
column_to_drop_12 = '소득 대비 주거관리비의 비율'

In [43]:
if not column_to_drop_12.startswith('Cat_'):
    comp_13 = comp_12.drop(column_to_drop_12, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']
else:
    comp_13 = comp_12.drop(comp_12.filter(regex='^' + column_to_drop_12).columns, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']

print(X_13.shape)

(10564, 195)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_13, y_13,
    test_size=0.2, shuffle=True, stratify=y_13, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_13 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [None]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(auc_13)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 13)
bootstrap_auc(optuna_13, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_13 = auc_bootstrap
print(t_13)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [44]:
column_to_drop_13 = '총 가구원 수'

In [45]:
if not column_to_drop_13.startswith('Cat_'):
    comp_14 = comp_13.drop(column_to_drop_13, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']
else:
    comp_14 = comp_13.drop(comp_13.filter(regex='^' + column_to_drop_13).columns, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']

print(X_14.shape)

(10564, 194)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_14, y_14,
    test_size=0.2, shuffle=True, stratify=y_14, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_14 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [None]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(auc_14)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 14)
bootstrap_auc(optuna_14, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_14 = auc_bootstrap
print(t_14)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [46]:
column_to_drop_14 = '현재 무주택 기간(총 개월)'

In [47]:
if not column_to_drop_14.startswith('Cat_'):
    comp_15 = comp_14.drop(column_to_drop_14, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']
else:
    comp_15 = comp_14.drop(comp_14.filter(regex='^' + column_to_drop_14).columns, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']

print(X_15.shape)

(10564, 193)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_15, y_15,
    test_size=0.2, shuffle=True, stratify=y_15, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_15 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [None]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(auc_15)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 15)
bootstrap_auc(optuna_15, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_15 = auc_bootstrap
print(t_15)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [48]:
column_to_drop_15 = 'Cat_가구주 장애 여부'

In [49]:
if not column_to_drop_15.startswith('Cat_'):
    comp_16 = comp_15.drop(column_to_drop_15, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']
else:
    comp_16 = comp_15.drop(comp_15.filter(regex='^' + column_to_drop_15).columns, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']

print(X_16.shape)

(10564, 191)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_16, y_16,
    test_size=0.2, shuffle=True, stratify=y_16, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_16 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [None]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(auc_16)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 16)
bootstrap_auc(optuna_16, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_16 = auc_bootstrap
print(t_16)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [50]:
column_to_drop_16 = 'Cat_현재 대중교통 접근용이성'

In [51]:
if not column_to_drop_16.startswith('Cat_'):
    comp_17 = comp_16.drop(column_to_drop_16, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']
else:
    comp_17 = comp_16.drop(comp_16.filter(regex='^' + column_to_drop_16).columns, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']

print(X_17.shape)

(10564, 187)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_17, y_17,
    test_size=0.2, shuffle=True, stratify=y_17, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_17 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [None]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(auc_17)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 17)
bootstrap_auc(optuna_17, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_17 = auc_bootstrap
print(t_17)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [52]:
column_to_drop_17 = '소득 대비 주택 임대료의 비율'

In [53]:
if not column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(column_to_drop_17, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']
else:
    comp_18 = comp_17.drop(comp_17.filter(regex='^' + column_to_drop_17).columns, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']

print(X_18.shape)

(10564, 186)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_18, y_18,
    test_size=0.2, shuffle=True, stratify=y_18, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_18 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [None]:
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(auc_18)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 18)
bootstrap_auc(optuna_18, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_18 = auc_bootstrap
print(t_18)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [54]:
column_to_drop_18 = 'Cat_이사 계획 첫 번째 이유'

In [55]:
if not column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(column_to_drop_18, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']
else:
    comp_19 = comp_18.drop(comp_18.filter(regex='^' + column_to_drop_18).columns, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']

print(X_19.shape)

(10564, 174)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_19, y_19,
    test_size=0.2, shuffle=True, stratify=y_19, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_19 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [None]:
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(auc_19)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 19)
bootstrap_auc(optuna_19, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_19 = auc_bootstrap
print(t_19)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [56]:
column_to_drop_19 = '소득 중 정부 보조금의 비중(월평균)'

In [57]:
if not column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(column_to_drop_19, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']
else:
    comp_20 = comp_19.drop(comp_19.filter(regex='^' + column_to_drop_19).columns, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']

print(X_20.shape)

(10564, 173)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_20, y_20,
    test_size=0.2, shuffle=True, stratify=y_20, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_20 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [None]:
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(auc_20)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 20)
bootstrap_auc(optuna_20, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_20 = auc_bootstrap
print(t_20)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [58]:
column_to_drop_20 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [59]:
if not column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(column_to_drop_20, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']
else:
    comp_21 = comp_20.drop(comp_20.filter(regex='^' + column_to_drop_20).columns, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']

print(X_21.shape)

(10564, 169)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_21, y_21,
    test_size=0.2, shuffle=True, stratify=y_21, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_21 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [None]:
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(auc_21)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 21)
bootstrap_auc(optuna_21, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_21 = auc_bootstrap
print(t_21)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [60]:
column_to_drop_21 = 'Cat_이사 예상 기간'

In [61]:
if not column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(column_to_drop_21, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']
else:
    comp_22 = comp_21.drop(comp_21.filter(regex='^' + column_to_drop_21).columns, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']

print(X_22.shape)

(10564, 165)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_22, y_22,
    test_size=0.2, shuffle=True, stratify=y_22, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_22 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [None]:
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(auc_22)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 22)
bootstrap_auc(optuna_22, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_22 = auc_bootstrap
print(t_22)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [62]:
column_to_drop_22 = 'Cat_현재 상업시설 접근용이성'

In [63]:
if not column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(column_to_drop_22, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']
else:
    comp_23 = comp_22.drop(comp_22.filter(regex='^' + column_to_drop_22).columns, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']

print(X_23.shape)

(10564, 161)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_23, y_23,
    test_size=0.2, shuffle=True, stratify=y_23, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_23 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [None]:
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(auc_23)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 23)
bootstrap_auc(optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_23 = auc_bootstrap
print(t_23)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [64]:
column_to_drop_23 = 'Cat_현재 문화시설 접근용이성'

In [65]:
if not column_to_drop_23.startswith('Cat_'):
    comp_24 = comp_23.drop(column_to_drop_23, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']
else:
    comp_24 = comp_23.drop(comp_23.filter(regex='^' + column_to_drop_23).columns, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']

print(X_24.shape)

(10564, 157)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_24, y_24,
    test_size=0.2, shuffle=True, stratify=y_24, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_24 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [None]:
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(auc_24)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 24)
bootstrap_auc(optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_24 = auc_bootstrap
print(t_24)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [66]:
column_to_drop_24 = 'Cat_소득 계층'

In [67]:
if not column_to_drop_24.startswith('Cat_'):
    comp_25 = comp_24.drop(column_to_drop_24, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']
else:
    comp_25 = comp_24.drop(comp_24.filter(regex='^' + column_to_drop_24).columns, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']

print(X_25.shape)

(10564, 155)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_25, y_25,
    test_size=0.2, shuffle=True, stratify=y_25, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Draws four integer hyperparameters from `trial`, fits an
    ExtraTreesClassifier (random_state=0) on the notebook-level
    X_train / y_train, and scores the class-index-1 probabilities it
    predicts for X_val against y_val.

    Returns the ROC AUC on the validation split.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_25 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [None]:
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(auc_25)

In [None]:
X_train = X_train.values
y_train = y_train.values

In [None]:
auc_bootstrap = []

In [None]:
rs = RandomState(seed = 25)
bootstrap_auc(optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
t_25 = auc_bootstrap
print(t_25)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [68]:
column_to_drop_25 = 'Cat_현재 교육환경'

In [69]:
if not column_to_drop_25.startswith('Cat_'):
    comp_26 = comp_25.drop(column_to_drop_25, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']
else:
    comp_26 = comp_25.drop(comp_25.filter(regex='^' + column_to_drop_25).columns, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']

print(X_26.shape)

(10564, 151)


In [None]:
# Stratified hold-out: 20% of all rows are set aside as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_26, y_26,
    test_size=0.2, shuffle=True, stratify=y_26, random_state=0,
)
# ... then 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [None]:
def objective(trial):
    """Return the validation ROC-AUC of one ExtraTreesClassifier trial.

    Four integer hyperparameters are sampled from ``trial``; the model is
    fitted on the notebook-level X_train / y_train and scored on the
    positive-class probabilities it predicts for X_val.
    """
    # (low, high) bounds per hyperparameter; dict order fixes the sampling order.
    bounds = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in bounds.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the validation AUC.
# NOTE(review): objective() never calls trial.report(), so the Hyperband
# pruner presumably has nothing to prune on - confirm this is intentional.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_26 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [None]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(auc_26)

In [None]:
# bootstrap_auc() indexes rows positionally (X_train[idx]), so plain arrays are needed.
# np.asarray keeps this cell re-runnable: an ndarray has no `.values` attribute,
# so the original form raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [None]:
auc_bootstrap = []

In [None]:
# bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`;
# resetting it here keeps the cell idempotent when it is re-run.
auc_bootstrap = []
rs = RandomState(seed = 26)  # global RNG that bootstrap_auc() draws row indices from
bootstrap_auc(optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
# Store a copy: `auc_bootstrap` is a global list that bootstrap_auc() appends to,
# so aliasing it would let later runs silently change t_26.
t_26 = list(auc_bootstrap)
print(t_26)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [70]:
column_to_drop_26 = 'Cat_현재 주택의 위치'

In [71]:
# Elimination step: remove the feature named by column_to_drop_26 from comp_26
# and rebuild the feature matrix / target for the next round.
if column_to_drop_26.startswith('Cat_'):
    # 'Cat_' feature: drop every column whose name starts with the prefix.
    # Literal prefix matching (not a regex built from the raw name) stays
    # correct even if the name contains regex metacharacters such as '('.
    _cols_to_drop = [c for c in comp_26.columns if str(c).startswith(column_to_drop_26)]
else:
    # Otherwise the name identifies exactly one column.
    _cols_to_drop = [column_to_drop_26]

comp_27 = comp_26.drop(columns=_cols_to_drop)
X_27 = comp_27.drop(columns='target')
y_27 = comp_27['target']

print(X_27.shape)

(10564, 147)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Return the validation ROC-AUC of one ExtraTreesClassifier trial.

    Four integer hyperparameters are sampled from ``trial``; the model is
    fitted on the notebook-level X_train / y_train and scored on the
    positive-class probabilities it predicts for X_val.
    """
    # (low, high) bounds per hyperparameter; dict order fixes the sampling order.
    bounds = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in bounds.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the validation AUC.
# NOTE(review): objective() never calls trial.report(), so the Hyperband
# pruner presumably has nothing to prune on - confirm this is intentional.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_27 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [None]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(auc_27)

In [None]:
# bootstrap_auc() indexes rows positionally (X_train[idx]), so plain arrays are needed.
# np.asarray keeps this cell re-runnable: an ndarray has no `.values` attribute,
# so the original form raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [None]:
# bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`.
# This step had no reset cell, so its 2000 values were appended to the list left
# over from the previous step (mixing both steps in the percentile interval and
# in t_27). Start from an empty accumulator here.
auc_bootstrap = []
rs = RandomState(seed = 27)  # global RNG that bootstrap_auc() draws row indices from
bootstrap_auc(optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
# Store a copy: `auc_bootstrap` is a global list that bootstrap_auc() appends to,
# so aliasing it would let later runs silently change t_27.
t_27 = list(auc_bootstrap)
print(t_27)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [72]:
column_to_drop_27 = '현재 주택의 면적(㎡)'

In [73]:
# Elimination step: remove the feature named by column_to_drop_27 from comp_27
# and rebuild the feature matrix / target for the next round.
if column_to_drop_27.startswith('Cat_'):
    # 'Cat_' feature: drop every column whose name starts with the prefix.
    # Literal prefix matching (not a regex built from the raw name) stays
    # correct even if the name contains regex metacharacters such as '('.
    _cols_to_drop = [c for c in comp_27.columns if str(c).startswith(column_to_drop_27)]
else:
    # Otherwise the name identifies exactly one column.
    _cols_to_drop = [column_to_drop_27]

comp_28 = comp_27.drop(columns=_cols_to_drop)
X_28 = comp_28.drop(columns='target')
y_28 = comp_28['target']

print(X_28.shape)

(10564, 146)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Return the validation ROC-AUC of one ExtraTreesClassifier trial.

    Four integer hyperparameters are sampled from ``trial``; the model is
    fitted on the notebook-level X_train / y_train and scored on the
    positive-class probabilities it predicts for X_val.
    """
    # (low, high) bounds per hyperparameter; dict order fixes the sampling order.
    bounds = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in bounds.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the validation AUC.
# NOTE(review): objective() never calls trial.report(), so the Hyperband
# pruner presumably has nothing to prune on - confirm this is intentional.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_28 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [None]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(auc_28)

In [None]:
# bootstrap_auc() indexes rows positionally (X_train[idx]), so plain arrays are needed.
# np.asarray keeps this cell re-runnable: an ndarray has no `.values` attribute,
# so the original form raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [None]:
auc_bootstrap = []

In [None]:
# bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`;
# resetting it here keeps the cell idempotent when it is re-run.
auc_bootstrap = []
rs = RandomState(seed = 28)  # global RNG that bootstrap_auc() draws row indices from
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
# Store a copy: `auc_bootstrap` is a global list that bootstrap_auc() appends to,
# so aliasing it would let later runs silently change t_28.
t_28 = list(auc_bootstrap)
print(t_28)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [74]:
column_to_drop_28 = 'Cat_현재 청소/쓰레기 처리상태'

In [75]:
# Elimination step: remove the feature named by column_to_drop_28 from comp_28
# and rebuild the feature matrix / target for the next round.
if column_to_drop_28.startswith('Cat_'):
    # 'Cat_' feature: drop every column whose name starts with the prefix.
    # Literal prefix matching (not a regex built from the raw name) stays
    # correct even if the name contains regex metacharacters such as '('.
    _cols_to_drop = [c for c in comp_28.columns if str(c).startswith(column_to_drop_28)]
else:
    # Otherwise the name identifies exactly one column.
    _cols_to_drop = [column_to_drop_28]

comp_29 = comp_28.drop(columns=_cols_to_drop)
X_29 = comp_29.drop(columns='target')
y_29 = comp_29['target']

print(X_29.shape)

(10564, 142)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [None]:
def objective(trial):
    """Return the validation ROC-AUC of one ExtraTreesClassifier trial.

    Four integer hyperparameters are sampled from ``trial``; the model is
    fitted on the notebook-level X_train / y_train and scored on the
    positive-class probabilities it predicts for X_val.
    """
    # (low, high) bounds per hyperparameter; dict order fixes the sampling order.
    bounds = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in bounds.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the validation AUC.
# NOTE(review): objective() never calls trial.report(), so the Hyperband
# pruner presumably has nothing to prune on - confirm this is intentional.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [None]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

In [None]:
optuna_29 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [None]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(auc_29)

In [None]:
# bootstrap_auc() indexes rows positionally (X_train[idx]), so plain arrays are needed.
# np.asarray keeps this cell re-runnable: an ndarray has no `.values` attribute,
# so the original form raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [None]:
auc_bootstrap = []

In [None]:
# bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`;
# resetting it here keeps the cell idempotent when it is re-run.
auc_bootstrap = []
rs = RandomState(seed = 29)  # global RNG that bootstrap_auc() draws row indices from
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

In [None]:
# Store a copy: `auc_bootstrap` is a global list that bootstrap_auc() appends to,
# so aliasing it would let later runs silently change t_29.
t_29 = list(auc_bootstrap)
print(t_29)

In [None]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
column_to_drop_29 = '현재 주택 거주 기간(총 개월)'

In [None]:
# Elimination step: remove the feature named by column_to_drop_29 from comp_29
# and rebuild the feature matrix / target for the next round.
if column_to_drop_29.startswith('Cat_'):
    # 'Cat_' feature: drop every column whose name starts with the prefix.
    # Literal prefix matching (not a regex built from the raw name) stays
    # correct even if the name contains regex metacharacters such as '('.
    _cols_to_drop = [c for c in comp_29.columns if str(c).startswith(column_to_drop_29)]
else:
    # Otherwise the name identifies exactly one column.
    _cols_to_drop = [column_to_drop_29]

comp_30 = comp_29.drop(columns=_cols_to_drop)
X_30 = comp_30.drop(columns='target')
y_30 = comp_30['target']

print(X_30.shape)

In [13]:
comp_30 = comp[['총 이사 횟수',
 '가구주 나이',
 '소득 중 근로/사업소득의 비중(월평균)',
 '소득 중 사적이전소득의 비중(월평균)',
 '자산 중 금융자산의 비중',
 'target',
 'Cat_현재 거주 지역_강원도',
 'Cat_현재 거주 지역_경기도',
 'Cat_현재 거주 지역_경상남도',
 'Cat_현재 거주 지역_경상북도',
 'Cat_현재 거주 지역_광주광역시',
 'Cat_현재 거주 지역_대구광역시',
 'Cat_현재 거주 지역_대전광역시',
 'Cat_현재 거주 지역_부산광역시',
 'Cat_현재 거주 지역_서울특별시',
 'Cat_현재 거주 지역_세종특별자치시',
 'Cat_현재 거주 지역_울산광역시',
 'Cat_현재 거주 지역_인천광역시',
 'Cat_현재 거주 지역_전라남도',
 'Cat_현재 거주 지역_전라북도',
 'Cat_현재 거주 지역_제주특별자치도',
 'Cat_현재 거주 지역_충청남도',
 'Cat_현재 거주 지역_충청북도',
 'Cat_현재 주택의 유형_고시원',
 'Cat_현재 주택의 유형_기타',
 'Cat_현재 주택의 유형_다가구 단독주택',
 'Cat_현재 주택의 유형_다세대주택',
 'Cat_현재 주택의 유형_비거주용 건물 내 주택',
 'Cat_현재 주택의 유형_아파트',
 'Cat_현재 주택의 유형_연립주택',
 'Cat_현재 주택의 유형_영업겸용 단독주택',
 'Cat_현재 주택의 유형_오피스텔',
 'Cat_현재 주택의 유형_일반 단독주택',
 'Cat_현재 주택의 유형_판잣집 비닐하우스 컨테이너 움막',
 'Cat_현재 주택의 점유형태_무상',
 'Cat_현재 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_현재 주택의 점유형태_보증금 있는 월세',
 'Cat_현재 주택의 점유형태_전세',
 'Cat_현재 주택의 구조_원룸형',
 'Cat_현재 주택의 구조_원룸형 아님',
 'Cat_현재 의료시설 접근용이성_대체로 만족',
 'Cat_현재 의료시설 접근용이성_매우 만족',
 'Cat_현재 의료시설 접근용이성_매우 불만족',
 'Cat_현재 의료시설 접근용이성_약간 불만족',
 'Cat_현재 공공기관 접근용이성_대체로 만족',
 'Cat_현재 공공기관 접근용이성_매우 만족',
 'Cat_현재 공공기관 접근용이성_매우 불만족',
 'Cat_현재 공공기관 접근용이성_약간 불만족',
 'Cat_현재 주차시설 이용편의성_대체로 만족',
 'Cat_현재 주차시설 이용편의성_매우 만족',
 'Cat_현재 주차시설 이용편의성_매우 불만족',
 'Cat_현재 주차시설 이용편의성_약간 불만족',
 'Cat_현재 주변도로의 보행 안전_대체로 만족',
 'Cat_현재 주변도로의 보행 안전_매우 만족',
 'Cat_현재 주변도로의 보행 안전_매우 불만족',
 'Cat_현재 주변도로의 보행 안전_약간 불만족',
 'Cat_현재 치안 및 범죄 등 방범 상태_대체로 만족',
 'Cat_현재 치안 및 범죄 등 방범 상태_매우 만족',
 'Cat_현재 치안 및 범죄 등 방범 상태_매우 불만족',
 'Cat_현재 치안 및 범죄 등 방범 상태_약간 불만족',
 'Cat_현재 자동차 경적/집주변의 소음 정도_대체로 만족',
 'Cat_현재 자동차 경적/집주변의 소음 정도_매우 만족',
 'Cat_현재 자동차 경적/집주변의 소음 정도_매우 불만족',
 'Cat_현재 자동차 경적/집주변의 소음 정도_약간 불만족',
 'Cat_현재 대기오염 정도_대체로 만족',
 'Cat_현재 대기오염 정도_매우 만족',
 'Cat_현재 대기오염 정도_매우 불만족',
 'Cat_현재 대기오염 정도_약간 불만족',
 'Cat_현재 주택에 대한 전반적인 만족도_대체로 만족',
 'Cat_현재 주택에 대한 전반적인 만족도_매우 만족',
 'Cat_현재 주택에 대한 전반적인 만족도_매우 불만족',
 'Cat_현재 주택에 대한 전반적인 만족도_약간 불만족',
 'Cat_이사 계획 중인 거주 지역_국내 to 국외',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 비수도권',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 수도권',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 거주 지역_수도권 to 비수도권',
 'Cat_이사 계획 중인 거주 지역_수도권 to 수도권',
 'Cat_이사 계획 중인 거주 지역_수도권 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 유형_공동주택 to 공동주택',
 'Cat_이사 계획 중인 주택의 유형_공동주택 to 기타',
 'Cat_이사 계획 중인 주택의 유형_공동주택 to 단독주택',
 'Cat_이사 계획 중인 주택의 유형_공동주택 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 유형_기타 to 공동주택',
 'Cat_이사 계획 중인 주택의 유형_기타 to 기타',
 'Cat_이사 계획 중인 주택의 유형_기타 to 단독주택',
 'Cat_이사 계획 중인 주택의 유형_기타 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 유형_단독주택 to 공동주택',
 'Cat_이사 계획 중인 주택의 유형_단독주택 to 기타',
 'Cat_이사 계획 중인 주택의 유형_단독주택 to 단독주택',
 'Cat_이사 계획 중인 주택의 유형_단독주택 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 유형_준주택 to 공동주택',
 'Cat_이사 계획 중인 주택의 유형_준주택 to 단독주택',
 'Cat_이사 계획 중인 주택의 유형_준주택 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 유형_준주택 to 준주택',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 전세',
 'Cat_주택 보유 의식_아니오',
 'Cat_주택 보유 의식_예',
 'Cat_현재 가장 필요한 주거지원 1순위_공공분양주택공급',
 'Cat_현재 가장 필요한 주거지원 1순위_없음',
 'Cat_현재 가장 필요한 주거지원 1순위_월세보조금 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_임대 후 분양전환 공공임대주택 공급',
 'Cat_현재 가장 필요한 주거지원 1순위_장기공공임대주택 공급',
 'Cat_현재 가장 필요한 주거지원 1순위_전세자금 대출 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_주거상담과 정보제공 등',
 'Cat_현재 가장 필요한 주거지원 1순위_주택 구입자금 대출 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_주택개량 및 개보수 지원',
 'Cat_가구주 성별_남',
 'Cat_가구주 성별_여',
 'Cat_기초생활보장 수급가구 여부_아니오',
 'Cat_기초생활보장 수급가구 여부_예',
 'Cat_가구주 최종 학력_고등학교 졸업',
 'Cat_가구주 최종 학력_대학 졸업 이상',
 'Cat_가구주 최종 학력_중학교 졸업 이하',
 'Cat_가구주 종사상 지위_무급가족종사자',
 'Cat_가구주 종사상 지위_무직 및 기타',
 'Cat_가구주 종사상 지위_사업자 및 자영자',
 'Cat_가구주 종사상 지위_상용근로자',
 'Cat_가구주 종사상 지위_임시일용근로자']]

In [14]:
X_30 = comp_30.drop('target', axis=1)
y_30 = comp_30['target']
X_30.shape

(10564, 141)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [16]:
def objective(trial):
    """Return the validation ROC-AUC of one ExtraTreesClassifier trial.

    Four integer hyperparameters are sampled from ``trial``; the model is
    fitted on the notebook-level X_train / y_train and scored on the
    positive-class probabilities it predicts for X_val.
    """
    # (low, high) bounds per hyperparameter; dict order fixes the sampling order.
    bounds = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in bounds.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the validation AUC.
# NOTE(review): objective() never calls trial.report(), so the Hyperband
# pruner presumably has nothing to prune on - confirm this is intentional.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [17]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 105, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.8469759057727619


In [17]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 105, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.8469759057727619


In [18]:
optuna_30 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [19]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(auc_30)

0.8548815029937972


In [20]:
# bootstrap_auc() indexes rows positionally (X_train[idx]), so plain arrays are needed.
# np.asarray keeps this cell re-runnable: an ndarray has no `.values` attribute,
# so the original form raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [21]:
auc_bootstrap = []

In [22]:
# bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`;
# resetting it here keeps the cell idempotent when it is re-run.
auc_bootstrap = []
rs = RandomState(seed = 30)  # global RNG that bootstrap_auc() draws row indices from
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84801121, 0.85492854])

In [23]:
np.mean(auc_bootstrap)

0.8515269645674232

In [24]:
# Store a copy: `auc_bootstrap` is a global list that bootstrap_auc() appends to,
# so aliasing it would let later runs silently change t_30.
t_30 = list(auc_bootstrap)
print(t_30)

[0.8534554157255029, 0.84968358968843, 0.8510334875049299, 0.852684557742641, 0.8520642859703847, 0.8510747194435482, 0.8495034240435985, 0.8524631601591911, 0.8518393030009681, 0.8513283855006992, 0.8508282241583307, 0.8519145961062708, 0.85126026316733, 0.8498709261051951, 0.8495688573374924, 0.8540505897959916, 0.8527625398874189, 0.8503065505001612, 0.8503441970528127, 0.8520893836721523, 0.850134451973755, 0.8527105517909004, 0.8473073751389337, 0.8526944175540496, 0.851442221505145, 0.8498843713025708, 0.8525581728873113, 0.8547398802481085, 0.8506964612240507, 0.8527741923918108, 0.8548071062349861, 0.8551432361693736, 0.8489692015345451, 0.8513507941629916, 0.8501425190921803, 0.8539008999318776, 0.8522623785450505, 0.849149367179377, 0.8516188017640101, 0.8511060915707576, 0.8503657093686134, 0.8511643540927181, 0.8512163421892367, 0.8492945753110323, 0.8523116776020938, 0.8517416012333727, 0.8525034957513177, 0.853789752966907, 0.8511186404216414, 0.8515721917464414, 0.852248

In [25]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [26]:
# 31
column_to_drop_30 = '자산 중 금융자산의 비중'

In [27]:
# Elimination step: remove the feature named by column_to_drop_30 from comp_30
# and rebuild the feature matrix / target for the next round.
if column_to_drop_30.startswith('Cat_'):
    # 'Cat_' feature: drop every column whose name starts with the prefix.
    # Literal prefix matching (not a regex built from the raw name) stays
    # correct even if the name contains regex metacharacters such as '('.
    _cols_to_drop = [c for c in comp_30.columns if str(c).startswith(column_to_drop_30)]
else:
    # Otherwise the name identifies exactly one column.
    _cols_to_drop = [column_to_drop_30]

comp_31 = comp_30.drop(columns=_cols_to_drop)
X_31 = comp_31.drop(columns='target')
y_31 = comp_31['target']

print(X_31.shape)

(10564, 140)


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [29]:
def objective(trial):
    """Return the validation ROC-AUC of one ExtraTreesClassifier trial.

    Four integer hyperparameters are sampled from ``trial``; the model is
    fitted on the notebook-level X_train / y_train and scored on the
    positive-class probabilities it predicts for X_val.
    """
    # (low, high) bounds per hyperparameter; dict order fixes the sampling order.
    bounds = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in bounds.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the validation AUC.
# NOTE(review): objective() never calls trial.report(), so the Hyperband
# pruner presumably has nothing to prune on - confirm this is intentional.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [30]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.8482956581127681


In [31]:
optuna_31= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [32]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(auc_31)

0.8547595998709261


In [33]:
# bootstrap_auc() indexes rows positionally (X_train[idx]), so plain arrays are needed.
# np.asarray keeps this cell re-runnable: an ndarray has no `.values` attribute,
# so the original form raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [34]:
auc_bootstrap = []

In [35]:
# bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`;
# resetting it here keeps the cell idempotent when it is re-run.
auc_bootstrap = []
rs = RandomState(seed = 31)  # global RNG that bootstrap_auc() draws row indices from
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84786745, 0.85493262])

In [36]:
np.mean(auc_bootstrap)

0.8514712913663907

In [37]:
# Store a copy: `auc_bootstrap` is a global list that bootstrap_auc() appends to,
# so aliasing it would let later runs silently change t_31.
t_31 = list(auc_bootstrap)
print(t_31)

[0.8515336488472984, 0.853218780251694, 0.8545211717041339, 0.8515892223297837, 0.8504607220967336, 0.8528593453085226, 0.8513274891542075, 0.8508129862679716, 0.8489709942275285, 0.8552310781255601, 0.8505799361801298, 0.8522749273959342, 0.8549236312789072, 0.8519782367071815, 0.8498476210964111, 0.8530260657559787, 0.8527741923918111, 0.8510998171453157, 0.8507125954609013, 0.8501452081316554, 0.8525406941307231, 0.8492067333548458, 0.8469846904019217, 0.8492560324118892, 0.8496925531533469, 0.8543445914452692, 0.8473414363056182, 0.8514915205621886, 0.8524434405363738, 0.8512557814348714, 0.8524649528521746, 0.8527759850847945, 0.8525599655802947, 0.8533451651070239, 0.8520633896238929, 0.8497839804955003, 0.8492730629952314, 0.8484555949948012, 0.8496701444910544, 0.8496271198594528, 0.8519647915098061, 0.8509384747768096, 0.8484851744290274, 0.8521064142554946, 0.8494137893944282, 0.8477017675952817, 0.850477752680076, 0.8487585601089958, 0.8501487935176222, 0.8508811086013409, 0

In [38]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [39]:
# 32
column_to_drop_31 = '총 이사 횟수'

In [40]:
# Elimination step: remove the feature named by column_to_drop_31 from comp_31
# and rebuild the feature matrix / target for the next round.
if column_to_drop_31.startswith('Cat_'):
    # 'Cat_' feature: drop every column whose name starts with the prefix.
    # Literal prefix matching (not a regex built from the raw name) stays
    # correct even if the name contains regex metacharacters such as '('.
    _cols_to_drop = [c for c in comp_31.columns if str(c).startswith(column_to_drop_31)]
else:
    # Otherwise the name identifies exactly one column.
    _cols_to_drop = [column_to_drop_31]

comp_32 = comp_31.drop(columns=_cols_to_drop)
X_32 = comp_32.drop(columns='target')
y_32 = comp_32['target']

print(X_32.shape)

(10564, 139)


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [42]:
def objective(trial):
    """Return the validation ROC-AUC of one ExtraTreesClassifier trial.

    Four integer hyperparameters are sampled from ``trial``; the model is
    fitted on the notebook-level X_train / y_train and scored on the
    positive-class probabilities it predicts for X_val.
    """
    # (low, high) bounds per hyperparameter; dict order fixes the sampling order.
    bounds = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in bounds.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the validation AUC.
# NOTE(review): objective() never calls trial.report(), so the Hyperband
# pruner presumably has nothing to prune on - confirm this is intentional.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [43]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 137, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.8479520746562768


In [44]:
optuna_32 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [45]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(auc_32)

0.8535674590369653


In [46]:
# bootstrap_auc() indexes rows positionally (X_train[idx]), so plain arrays are needed.
# np.asarray keeps this cell re-runnable: an ndarray has no `.values` attribute,
# so the original form raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [47]:
auc_bootstrap = []

In [48]:
# bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`;
# resetting it here keeps the cell idempotent when it is re-run.
auc_bootstrap = []
rs = RandomState(seed = 32)  # global RNG that bootstrap_auc() draws row indices from
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84747759, 0.85403633])

In [49]:
np.mean(auc_bootstrap)

0.8508678065953175

In [50]:
# Store a copy: `auc_bootstrap` is a global list that bootstrap_auc() appends to,
# so aliasing it would let later runs silently change t_32.
t_32 = list(auc_bootstrap)
print(t_32)

[0.853189200817468, 0.8502769710659351, 0.8521494388870963, 0.8509949446057868, 0.8501971962281739, 0.851083682908465, 0.8525743071241618, 0.8509797067154278, 0.8530153095980781, 0.8509474382417267, 0.8494989423111399, 0.8503836362984475, 0.850253666057151, 0.849638772363845, 0.8509555053601521, 0.8491170987056758, 0.8510379692373884, 0.8502482879782007, 0.8475583521566096, 0.8505517012656412, 0.8538005091248071, 0.8498198343551683, 0.8524631601591911, 0.8490812448460077, 0.849584991574343, 0.8520114015273744, 0.8493107095478828, 0.8525276971065934, 0.8511235703273458, 0.8523394643433366, 0.852803771826037, 0.8501496898641138, 0.8516931985228211, 0.8520526334659925, 0.8507699616363702, 0.8509729841167402, 0.853251945071887, 0.8517093327596715, 0.8512665375927718, 0.8505073321143022, 0.8514431178516366, 0.853592556738733, 0.8516972320820335, 0.8507681689433868, 0.8515063102793016, 0.8475260836829085, 0.8529256749489081, 0.8518527481983436, 0.8494577103725216, 0.8473051342727045, 0.85032

In [51]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [52]:
# 33.
column_to_drop_32 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [53]:
# Elimination step: remove the feature named by column_to_drop_32 from comp_32
# and rebuild the feature matrix / target for the next round.
if column_to_drop_32.startswith('Cat_'):
    # 'Cat_' feature: drop every column whose name starts with the prefix.
    # Literal prefix matching (not a regex built from the raw name) stays
    # correct even if the name contains regex metacharacters such as '('.
    _cols_to_drop = [c for c in comp_32.columns if str(c).startswith(column_to_drop_32)]
else:
    # Otherwise the name identifies exactly one column.
    _cols_to_drop = [column_to_drop_32]

comp_33 = comp_32.drop(columns=_cols_to_drop)
X_33 = comp_33.drop(columns='target')
y_33 = comp_33['target']

print(X_33.shape)

(10564, 135)


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [55]:
def objective(trial):
    """Return the validation ROC-AUC of one ExtraTreesClassifier trial.

    Four integer hyperparameters are sampled from ``trial``; the model is
    fitted on the notebook-level X_train / y_train and scored on the
    positive-class probabilities it predicts for X_val.
    """
    # (low, high) bounds per hyperparameter; dict order fixes the sampling order.
    bounds = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in bounds.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the validation AUC.
# NOTE(review): objective() never calls trial.report(), so the Hyperband
# pruner presumably has nothing to prune on - confirm this is intentional.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [56]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 83, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.8492956189260602


In [57]:
optuna_33 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [58]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(auc_33)

0.8544279516689971


In [59]:
# bootstrap_auc() indexes rows positionally (X_train[idx]), so plain arrays are needed.
# np.asarray keeps this cell re-runnable: an ndarray has no `.values` attribute,
# so the original form raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [60]:
auc_bootstrap = []

In [61]:
# bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`;
# resetting it here keeps the cell idempotent when it is re-run.
auc_bootstrap = []
rs = RandomState(seed = 33)  # global RNG that bootstrap_auc() draws row indices from
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84816231, 0.8546183 ])

In [62]:
np.mean(auc_bootstrap)

0.8513392003692948

In [63]:
# Store a copy: `auc_bootstrap` is a global list that bootstrap_auc() appends to,
# so aliasing it would let later runs silently change t_33.
t_33 = list(auc_bootstrap)
print(t_33)

[0.8503164103115701, 0.8503683984080888, 0.8506444731275322, 0.8482879782008534, 0.8518859130185366, 0.8546309741493672, 0.8495504822344125, 0.8521180667598867, 0.8521996342906314, 0.8522883725933097, 0.852543831343444, 0.852866516080456, 0.8518061381807751, 0.8498933347674879, 0.8493402889821089, 0.850411871212936, 0.8513875443691514, 0.853565666343982, 0.8522610340253128, 0.8528109425979706, 0.8523336380911405, 0.8501156286974292, 0.8521436126349002, 0.8504212828510989, 0.8501084579254956, 0.8531475207056038, 0.8512647448997884, 0.8525518984618695, 0.8515219963429064, 0.8529561507296259, 0.8511293965795419, 0.8517595281632069, 0.851376788211251, 0.8523493241547453, 0.8517362231544225, 0.8533236527912231, 0.8544422932128645, 0.8535782151948657, 0.850785199526729, 0.8515336488472984, 0.84941468574092, 0.8511186404216414, 0.8511580796672761, 0.85315872503675, 0.8490570434907317, 0.8529355347603169, 0.8515327525008067, 0.8534993367035962, 0.8516340396543689, 0.8507574127854863, 0.8515094

In [64]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [65]:
# 34
column_to_drop_33 = 'Cat_현재 주택의 구조'

In [66]:
# Elimination step: remove the feature named by column_to_drop_33 from comp_33
# and rebuild the feature matrix / target for the next round.
if column_to_drop_33.startswith('Cat_'):
    # 'Cat_' feature: drop every column whose name starts with the prefix.
    # Literal prefix matching (not a regex built from the raw name) stays
    # correct even if the name contains regex metacharacters such as '('.
    _cols_to_drop = [c for c in comp_33.columns if str(c).startswith(column_to_drop_33)]
else:
    # Otherwise the name identifies exactly one column.
    _cols_to_drop = [column_to_drop_33]

comp_34 = comp_33.drop(columns=_cols_to_drop)
X_34 = comp_34.drop(columns='target')
y_34 = comp_34['target']

print(X_34.shape)

(10564, 133)


In [67]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [68]:
def objective(trial):
    """Return the validation ROC-AUC of one ExtraTreesClassifier trial.

    Four integer hyperparameters are sampled from ``trial``; the model is
    fitted on the notebook-level X_train / y_train and scored on the
    positive-class probabilities it predicts for X_val.
    """
    # (low, high) bounds per hyperparameter; dict order fixes the sampling order.
    bounds = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in bounds.items()
    }

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximises the validation AUC.
# NOTE(review): objective() never calls trial.report(), so the Hyperband
# pruner presumably has nothing to prune on - confirm this is intentional.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [69]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 143, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.8479296822517801


In [70]:
optuna_34 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [71]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(auc_34)

0.8551378580904234


In [72]:
# bootstrap_auc() indexes rows positionally (X_train[idx]), so plain arrays are needed.
# np.asarray keeps this cell re-runnable: an ndarray has no `.values` attribute,
# so the original form raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [73]:
auc_bootstrap = []

In [74]:
# bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`;
# resetting it here keeps the cell idempotent when it is re-run.
auc_bootstrap = []
rs = RandomState(seed = 34)  # global RNG that bootstrap_auc() draws row indices from
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84898827, 0.85560447])

In [75]:
np.mean(auc_bootstrap)

0.8523497387149978

In [76]:
# Store a copy: `auc_bootstrap` is a global list that bootstrap_auc() appends to,
# so aliasing it would let later runs silently change t_34.
t_34 = list(auc_bootstrap)
print(t_34)

[0.8534536230325195, 0.851824961457101, 0.8511679394786849, 0.853732386791438, 0.853054748843713, 0.849775913377075, 0.8538937291599442, 0.8533989458965258, 0.853099566168298, 0.852801082786562, 0.8508739378294073, 0.8519755476677064, 0.8524143092753935, 0.8494568140260299, 0.8527186189093255, 0.8520911763651358, 0.8546332150155964, 0.8531210784840988, 0.8505548384783623, 0.8507027356494925, 0.8526908321680828, 0.8518348212685096, 0.8546269405901545, 0.8510334875049299, 0.8513794772507259, 0.8498628589867698, 0.8536257215589258, 0.8508784195618659, 0.8492972643505073, 0.8504938869169267, 0.8548868810727475, 0.851932523036105, 0.8541402244451616, 0.8546054282743538, 0.8508138826144633, 0.8511249148470832, 0.8500161342368505, 0.8523573912731705, 0.8513911297551182, 0.856342547775268, 0.8539609551468216, 0.8536382704098097, 0.8561677602093865, 0.8512728120182138, 0.8532151948657274, 0.856133699042702, 0.8533935678175755, 0.8527195152558174, 0.8511760065971102, 0.8521333046502457, 0.852778

In [77]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [78]:
# 35
column_to_drop_34 = 'Cat_현재 공공기관 접근용이성'

In [79]:
# Elimination step: remove the feature named by column_to_drop_34 from comp_34
# and rebuild the feature matrix / target for the next round.
if column_to_drop_34.startswith('Cat_'):
    # 'Cat_' feature: drop every column whose name starts with the prefix.
    # Literal prefix matching (not a regex built from the raw name) stays
    # correct even if the name contains regex metacharacters such as '('.
    _cols_to_drop = [c for c in comp_34.columns if str(c).startswith(column_to_drop_34)]
else:
    # Otherwise the name identifies exactly one column.
    _cols_to_drop = [column_to_drop_34]

comp_35 = comp_34.drop(columns=_cols_to_drop)
X_35 = comp_35.drop(columns='target')
y_35 = comp_35['target']

print(X_35.shape)

(10564, 129)


In [80]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [81]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and scores column 1 of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns:
        The ROC AUC on the validation split.
    """
    # (name, low, high) integer ranges, suggested in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [82]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 97, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 5}
0.8468198587039276


In [83]:
optuna_35 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [84]:
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(auc_35)

0.8542908106557672


In [85]:
# np.asarray keeps this cell re-runnable: it is a no-op on arrays, whereas a second
# .values call would raise AttributeError. bootstrap_auc indexes these positionally.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [86]:
auc_bootstrap = []

In [87]:
rs = RandomState(seed = 35)
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone so the fitted optuna_35 is not overwritten by the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_35), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84846045, 0.85506888])

In [88]:
np.mean(auc_bootstrap)

0.8518305703452727

In [89]:
t_35 = auc_bootstrap
print(t_35)

[0.8526298806066474, 0.852018572299308, 0.8517684916281238, 0.8510863719479401, 0.8543508658707111, 0.8513991968735435, 0.8531461761858665, 0.850809400882005, 0.8533478541464987, 0.8520158832598329, 0.8474893334767488, 0.8498727187981786, 0.8541612885877166, 0.8527679179663692, 0.8518106199132336, 0.8510137678821125, 0.8521673658169302, 0.8523237782797319, 0.8561260800975226, 0.8536902585063282, 0.8509420601627766, 0.8504714782546341, 0.8503450933993044, 0.8517003692947547, 0.8524057939837225, 0.8518034491413, 0.851740704886881, 0.8545494066186223, 0.8527930156681367, 0.8527401312251265, 0.8518406475207055, 0.8527132408303754, 0.850785199526729, 0.8514691118998962, 0.853484995159729, 0.8522888207665555, 0.8532743537341796, 0.8506408877415653, 0.8518115162597253, 0.8505189846186944, 0.8507009429565092, 0.8510178014413251, 0.851588325983292, 0.8521808110143057, 0.8519172851457459, 0.8527885339356781, 0.8476184073715536, 0.8501541715965724, 0.852329156358682, 0.8536046574163707, 0.8526200

In [90]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [91]:
# 36
column_to_drop_35 = 'Cat_현재 주변도로의 보행 안전'

In [92]:
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical feature - confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_35.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters in a column name cannot change
    # which columns are selected.
    drop_cols_35 = [col for col in comp_35.columns if str(col).startswith(column_to_drop_35)]
else:
    drop_cols_35 = [column_to_drop_35]
comp_36 = comp_35.drop(columns=drop_cols_35)
X_36 = comp_36.drop('target', axis=1)
y_36 = comp_36['target']

print(X_36.shape)

(10564, 125)


In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [94]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and scores column 1 of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns:
        The ROC AUC on the validation split.
    """
    # (name, low, high) integer ranges, suggested in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [95]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 96, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 5}
0.8498960152716198


In [96]:
optuna_36 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [97]:
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(auc_36)

0.8556469828977089


In [98]:
# np.asarray keeps this cell re-runnable: it is a no-op on arrays, whereas a second
# .values call would raise AttributeError. bootstrap_auc indexes these positionally.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [99]:
auc_bootstrap = []

In [100]:
rs = RandomState(seed = 36)
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone so the fitted optuna_36 is not overwritten by the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_36), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84892074, 0.85544192])

In [101]:
np.mean(auc_bootstrap)

0.8522447453927791

In [102]:
t_36 = auc_bootstrap
print(t_36)

[0.8527912229751533, 0.8529929009357857, 0.8523762145494963, 0.8536230325194507, 0.8538480154888675, 0.8545364095944927, 0.8501873364167654, 0.8521368900362125, 0.8539008999318776, 0.852690832168083, 0.8536830877343946, 0.8515049657595641, 0.8525510021153777, 0.850516743752465, 0.8532017496683517, 0.8511558388010468, 0.8507946111648919, 0.8515515757771324, 0.8517469793123231, 0.8524165501416228, 0.8530112760388656, 0.8542625757412785, 0.8539192750349575, 0.8528109425979706, 0.851928937650138, 0.8532268473701194, 0.854117815782869, 0.8521270302248037, 0.8492623068373311, 0.8492963680040156, 0.8512347172923167, 0.8534473486070775, 0.8507753397153204, 0.8538126098024452, 0.8523219855867483, 0.8513221110752571, 0.8525061847907928, 0.8505086766340397, 0.8494066186224947, 0.8502066078663368, 0.8520185722993081, 0.8534652755369114, 0.8494904270194688, 0.8500273385679967, 0.854918253199957, 0.8531999569753683, 0.8529274676418916, 0.8535728371159156, 0.8491126169732173, 0.8537861675809402, 0.85

In [103]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [104]:
# 37
column_to_drop_36 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [105]:
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical feature - confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_36.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters in a column name cannot change
    # which columns are selected.
    drop_cols_36 = [col for col in comp_36.columns if str(col).startswith(column_to_drop_36)]
else:
    drop_cols_36 = [column_to_drop_36]
comp_37 = comp_36.drop(columns=drop_cols_36)
X_37 = comp_37.drop('target', axis=1)
y_37 = comp_37['target']

print(X_37.shape)

(10564, 121)


In [106]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [107]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and scores column 1 of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns:
        The ROC AUC on the validation split.
    """
    # (name, low, high) integer ranges, suggested in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [108]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.8482851616731606


In [109]:
optuna_37 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [110]:
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(auc_37)

0.8550419490158117


In [111]:
# np.asarray keeps this cell re-runnable: it is a no-op on arrays, whereas a second
# .values call would raise AttributeError. bootstrap_auc indexes these positionally.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [112]:
auc_bootstrap = []

In [113]:
rs = RandomState(seed = 37)
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone so the fitted optuna_37 is not overwritten by the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_37), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84880318, 0.85531536])

In [114]:
np.mean(auc_bootstrap)

0.8521321857857375

In [115]:
t_37 = auc_bootstrap
print(t_37)

[0.8519226632246962, 0.8509187551539924, 0.851269226632247, 0.853586282313291, 0.8488284751353484, 0.8496150191818149, 0.8508111935749885, 0.8486254526549784, 0.8541859381162382, 0.8504176974651321, 0.8513014951059481, 0.8509685023842817, 0.8540738948047758, 0.855739306586354, 0.8514843497902549, 0.8520898318453982, 0.8541384317521781, 0.855255279480836, 0.8534303180237353, 0.8549442472482163, 0.8502617331755764, 0.8512226166146786, 0.8506139973468143, 0.8516017711806677, 0.8505718690617047, 0.8538731131906349, 0.8549442472482163, 0.8527719515255817, 0.8499175361227636, 0.8525756516438995, 0.8535602882650318, 0.8499273959341723, 0.8518249614571008, 0.8514986913341221, 0.8542450969846904, 0.8525420386504607, 0.8530825355849557, 0.8506561256319243, 0.8518280986698219, 0.8505960704169804, 0.8545131045857086, 0.8522767200889176, 0.8507681689433867, 0.8534675164031408, 0.8500932200351368, 0.850809400882005, 0.8540263884407155, 0.8525245598938727, 0.8513126994370944, 0.8505387042415117, 0.85

In [116]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [117]:
# 38
column_to_drop_37 = 'Cat_가구주 최종 학력'

In [118]:
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical feature - confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_37.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters in a column name cannot change
    # which columns are selected.
    drop_cols_37 = [col for col in comp_37.columns if str(col).startswith(column_to_drop_37)]
else:
    drop_cols_37 = [column_to_drop_37]
comp_38 = comp_37.drop(columns=drop_cols_37)
X_38 = comp_38.drop('target', axis=1)
y_38 = comp_38['target']

print(X_38.shape)

(10564, 118)


In [119]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [120]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and scores column 1 of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns:
        The ROC AUC on the validation split.
    """
    # (name, low, high) integer ranges, suggested in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [121]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 78, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 7}
0.847836613820592


In [122]:
optuna_38 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [123]:
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(auc_38)

0.854299774120684


In [124]:
# np.asarray keeps this cell re-runnable: it is a no-op on arrays, whereas a second
# .values call would raise AttributeError. bootstrap_auc indexes these positionally.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [125]:
auc_bootstrap = []

In [126]:
rs = RandomState(seed = 38)
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone so the fitted optuna_38 is not overwritten by the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_38), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.8476773 , 0.85430251])

In [127]:
np.mean(auc_bootstrap)

0.8510619023609767

In [128]:
t_38 = auc_bootstrap
print(t_38)

[0.8521817073607973, 0.8490458391595855, 0.8503585385966799, 0.8496795561292174, 0.8517931411566455, 0.8517613208561902, 0.8514431178516368, 0.8527374421856514, 0.8490821411924994, 0.85106261876591, 0.8500026890394751, 0.8507986447241045, 0.8483112832096376, 0.8496930013265929, 0.8495894733068015, 0.8487885877164676, 0.8506350614893694, 0.8541326054999819, 0.8500336129934387, 0.8510406582768637, 0.8512024488186153, 0.8512024488186154, 0.8519074253343372, 0.853167688501667, 0.8504168011186404, 0.8492811301136567, 0.8518523000250977, 0.8525429349969523, 0.8502850381843605, 0.8517236743035388, 0.8500103079846545, 0.852089831845398, 0.8509416119895307, 0.8510030117242121, 0.8514942096016637, 0.8513149403033238, 0.8499883474956079, 0.8459408949123373, 0.8534061166684594, 0.8519482090997095, 0.849874063317916, 0.8505776953139006, 0.8524698827578789, 0.8508358431035101, 0.8503657093686137, 0.850286830877344, 0.8493142949338496, 0.8513673765730883, 0.8500797748377613, 0.847718350005378, 0.8528

In [129]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [130]:
# 39 - feature removed from comp_38 to build the step-39 frame (comp_39)
column_to_drop_38 = '소득 중 사적이전소득의 비중(월평균)'

In [131]:
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical feature - confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_38.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters in a column name cannot change
    # which columns are selected.
    drop_cols_38 = [col for col in comp_38.columns if str(col).startswith(column_to_drop_38)]
else:
    drop_cols_38 = [column_to_drop_38]
comp_39 = comp_38.drop(columns=drop_cols_38)
X_39 = comp_39.drop('target', axis=1)
y_39 = comp_39['target']

print(X_39.shape)

(10564, 117)


In [132]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [133]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and scores column 1 of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns:
        The ROC AUC on the validation split.
    """
    # (name, low, high) integer ranges, suggested in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [134]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 81, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 4}
0.8499366015047696


In [135]:
optuna_39 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [136]:
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(auc_39)

0.8556559463626259


In [137]:
# np.asarray keeps this cell re-runnable: it is a no-op on arrays, whereas a second
# .values call would raise AttributeError. bootstrap_auc indexes these positionally.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [138]:
auc_bootstrap = []

In [139]:
rs = RandomState(seed = 39)
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone so the fitted optuna_39 is not overwritten by the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_39), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84816953, 0.85502562])

In [140]:
np.mean(auc_bootstrap)

0.8516863912193897

In [141]:
t_39 = auc_bootstrap
print(t_39)

[0.8531542433042916, 0.8523116776020938, 0.8498391058047399, 0.8517411530601269, 0.8520154350865872, 0.8496298088989279, 0.8517640098956653, 0.8544638055286652, 0.8502666630812806, 0.854392545982575, 0.8515260299021189, 0.8502787637589186, 0.8501895772829946, 0.8514726972858628, 0.854150532429816, 0.8529198486967122, 0.853565218170736, 0.8514408769854075, 0.853174411100355, 0.8511025061847908, 0.8514937614284178, 0.8529570470761176, 0.8517590799899609, 0.852938671973038, 0.8539735039977053, 0.8512979097199813, 0.8499690760460363, 0.8527056218851958, 0.8527118963106377, 0.853373848194758, 0.8529704922734932, 0.8498745114911621, 0.8547192642787996, 0.8549048080025815, 0.8514856943099924, 0.8519634469900685, 0.8531762037933384, 0.8532210211179234, 0.8554040729984583, 0.8510567925137141, 0.8508851421605536, 0.8545919830769784, 0.8520889354989064, 0.8517604245096986, 0.8537126671686208, 0.8514390842924241, 0.8508846939873077, 0.8531260083898032, 0.8538063353770033, 0.8514216055358359, 0.849

In [142]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [143]:
# 40
column_to_drop_39 = 'Cat_현재 주차시설 이용편의성'

In [144]:
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical feature - confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_39.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters in a column name cannot change
    # which columns are selected.
    drop_cols_39 = [col for col in comp_39.columns if str(col).startswith(column_to_drop_39)]
else:
    drop_cols_39 = [column_to_drop_39]
comp_40 = comp_39.drop(columns=drop_cols_39)
X_40 = comp_40.drop('target', axis=1)
y_40 = comp_40['target']

print(X_40.shape)

(10564, 113)


In [145]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [146]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and scores column 1 of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns:
        The ROC AUC on the validation split.
    """
    # (name, low, high) integer ranges, suggested in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [147]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.8484594025706481


In [148]:
optuna_40 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [149]:
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(auc_40)

0.8542052095658098


In [150]:
# np.asarray keeps this cell re-runnable: it is a no-op on arrays, whereas a second
# .values call would raise AttributeError. bootstrap_auc indexes these positionally.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [151]:
auc_bootstrap = []

In [152]:
rs = RandomState(seed = 40)
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone so the fitted optuna_40 is not overwritten by the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_40), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84839992, 0.8549501 ])

In [153]:
np.mean(auc_bootstrap)

0.8516316634398192

In [154]:
t_40 = auc_bootstrap
print(t_40)

[0.8520678713563514, 0.8522664121042631, 0.8525124592162345, 0.8530879136639059, 0.8501922663224697, 0.850411871212936, 0.854407335699688, 0.8490341866551934, 0.8535916603922411, 0.8518612634900148, 0.8511952780466817, 0.8521449571546378, 0.8514628374744541, 0.8516286615754185, 0.8520853501129397, 0.8498946792872253, 0.8529660105410347, 0.8525828224158332, 0.8551880534939587, 0.8514332580402282, 0.8532107131332688, 0.8518608153167688, 0.8477080420207235, 0.8498664443727366, 0.8521324083037539, 0.8504773045068301, 0.8526401885913018, 0.8521763292818472, 0.8509402674697931, 0.8533420278943027, 0.8523945896525762, 0.8508089527087591, 0.8543732745330034, 0.850286830877344, 0.852735201319422, 0.8525380050912481, 0.8520786275142519, 0.8495155247212364, 0.8504078376537235, 0.8496947940195761, 0.8503616758094008, 0.8512333727725789, 0.8530798465454805, 0.853025169409487, 0.8529373274533003, 0.8510469327023055, 0.8513055286651607, 0.8515210999964147, 0.8487177763436233, 0.8517044028539672, 0.85

In [155]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [156]:
# 41.
column_to_drop_40 = 'Cat_현재 대기오염 정도'

In [157]:
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical feature - confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_40.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters in a column name cannot change
    # which columns are selected.
    drop_cols_40 = [col for col in comp_40.columns if str(col).startswith(column_to_drop_40)]
else:
    drop_cols_40 = [column_to_drop_40]
comp_41 = comp_40.drop(columns=drop_cols_40)
X_41 = comp_41.drop('target', axis=1)
y_41 = comp_41['target']

print(X_41.shape)

(10564, 109)


In [158]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [159]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and scores column 1 of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns:
        The ROC AUC on the validation split.
    """
    # (name, low, high) integer ranges, suggested in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [160]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 176, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.8500674571185454


In [161]:
optuna_41 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [162]:
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(auc_41)

0.8534652755369114


In [163]:
# np.asarray keeps this cell re-runnable: it is a no-op on arrays, whereas a second
# .values call would raise AttributeError. bootstrap_auc indexes these positionally.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [164]:
auc_bootstrap = []

In [165]:
rs = RandomState(seed = 41)
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone so the fitted optuna_41 is not overwritten by the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_41), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84865895, 0.85479245])

In [166]:
np.mean(auc_bootstrap)

0.8517998343999856

In [167]:
t_41 = auc_bootstrap
print(t_41)

[0.8500752931053027, 0.8501962998816822, 0.8525675845254743, 0.8510711340575814, 0.8509330966978595, 0.8515453013516905, 0.8483498261087807, 0.8530229285432578, 0.8504992649958767, 0.8508022301100715, 0.8497638126994371, 0.8495715463769675, 0.8524748126635832, 0.851224409307662, 0.8536920511993117, 0.8493420816750923, 0.8501478971711306, 0.8516340396543688, 0.8492237639381879, 0.8520813165537272, 0.8520947617511024, 0.8505584238643289, 0.8512127568032699, 0.8500071707719337, 0.8516600337026281, 0.8523950378258219, 0.8514520813165538, 0.8527168262163423, 0.8508582517658025, 0.8549249757986447, 0.847327542934997, 0.8516654117815783, 0.8518814312860779, 0.8513951633143308, 0.8528450037646552, 0.850233050087842, 0.8544118174321466, 0.8543495213509734, 0.8530807428919724, 0.8515896705030297, 0.8539766412104263, 0.8521064142554945, 0.853878491269585, 0.8516958875622961, 0.8485568821483631, 0.8526513929224482, 0.8509779140224445, 0.8494030332365279, 0.8520517371195009, 0.8519540353519055, 0.8

In [168]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [169]:
# 42.
column_to_drop_41 = '소득 중 근로/사업소득의 비중(월평균)'

In [170]:
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical feature - confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_41.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters in a column name cannot change
    # which columns are selected.
    drop_cols_41 = [col for col in comp_41.columns if str(col).startswith(column_to_drop_41)]
else:
    drop_cols_41 = [column_to_drop_41]
comp_42 = comp_41.drop(columns=drop_cols_41)
X_42 = comp_42.drop('target', axis=1)
y_42 = comp_42['target']

print(X_42.shape)

(10564, 108)


In [171]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [172]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and scores column 1 of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns:
        The ROC AUC on the validation split.
    """
    # (name, low, high) integer ranges, suggested in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [173]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.8497693582336872


In [174]:
optuna_42 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [175]:
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(auc_42)

0.8535934530852246


In [176]:
# np.asarray keeps this cell re-runnable: it is a no-op on arrays, whereas a second
# .values call would raise AttributeError. bootstrap_auc indexes these positionally.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [177]:
auc_bootstrap = []

In [178]:
rs = RandomState(seed = 42)
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone so the fitted optuna_42 is not overwritten by the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_42), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84792948, 0.85396367])

In [179]:
np.mean(auc_bootstrap)

0.8509576346312431

In [180]:
t_42 = auc_bootstrap
print(t_42)

[0.848963823455595, 0.8491466781399017, 0.8504199383313614, 0.8508353949302643, 0.8495316589580868, 0.8503567459036966, 0.8495424151159872, 0.852454196694274, 0.8532201247714316, 0.8513005987594564, 0.8508941056254706, 0.8531699293678965, 0.8509218923667132, 0.8505723172349504, 0.850441898820408, 0.8512217202681869, 0.8502012297873867, 0.8513991968735434, 0.8513965078340685, 0.8500681223333693, 0.852108655121724, 0.8523228819332401, 0.8526652862930695, 0.848835197734036, 0.8499027464056506, 0.849671937184038, 0.8472656950270697, 0.8501564124628016, 0.8487997920476139, 0.8515533684701158, 0.8489907138503461, 0.8524976694991215, 0.854060897780646, 0.8535513248001146, 0.847414040371446, 0.8529731813129684, 0.8509828439281488, 0.8499448746907605, 0.8526836613961493, 0.8487478039510954, 0.848787691369976, 0.8492300383636299, 0.8477143164461655, 0.8500515399232728, 0.8519580689111184, 0.8516770642859703, 0.8531139077121652, 0.8494375425764583, 0.8473965616148578, 0.8502092969058119, 0.851435

In [181]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [182]:
# 43.
column_to_drop_42 = 'Cat_주택 보유 의식'

In [183]:
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical feature - confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_42.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters in a column name cannot change
    # which columns are selected.
    drop_cols_42 = [col for col in comp_42.columns if str(col).startswith(column_to_drop_42)]
else:
    drop_cols_42 = [column_to_drop_42]
comp_43 = comp_42.drop(columns=drop_cols_42)
X_43 = comp_43.drop('target', axis=1)
y_43 = comp_43['target']

print(X_43.shape)

(10564, 106)


In [184]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [185]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and scores column 1 of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns:
        The ROC AUC on the validation split.
    """
    # (name, low, high) integer ranges, suggested in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [186]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.8486861256661741


In [187]:
optuna_43 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [188]:
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(auc_43)

0.8515721917464415


In [189]:
# np.asarray keeps this cell re-runnable: it is a no-op on arrays, whereas a second
# .values call would raise AttributeError. bootstrap_auc indexes these positionally.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [190]:
auc_bootstrap = []

In [191]:
rs = RandomState(seed = 43)
# bootstrap_auc refits the estimator it receives on every resample; pass an unfitted
# clone so the fitted optuna_43 is not overwritten by the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_43), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84681237, 0.85296887])

In [192]:
np.mean(auc_bootstrap)

0.8498349915743431

In [193]:
t_43 = auc_bootstrap
print(t_43)

[0.8502698002940017, 0.8505230181779068, 0.8496410132300742, 0.8508887275465206, 0.8471402065182316, 0.8484623175934889, 0.8491139614929548, 0.8496298088989279, 0.847877003334409, 0.8470205442615897, 0.8478017102291062, 0.8517684916281238, 0.8509541608404145, 0.8491076870675129, 0.8511356710049837, 0.8477434477071457, 0.8479428848015489, 0.8491269585170843, 0.8508936574522248, 0.8475821053386398, 0.8497463339428488, 0.8516407622530565, 0.8505136065397441, 0.848960686242874, 0.849579165322147, 0.8491466781399017, 0.8501631350614893, 0.8521189631063784, 0.8456477896095514, 0.8482996307052455, 0.8492094223943207, 0.8486388978523538, 0.8488576063963287, 0.8504096303467067, 0.8494707073966512, 0.8501062170592665, 0.8508192606934137, 0.8518021046215626, 0.8515475422179197, 0.8518280986698219, 0.8512893944283102, 0.8499179842960094, 0.8511872109282564, 0.8478940339177512, 0.8474404825929511, 0.84815442257359, 0.8496275680326987, 0.8500242013552759, 0.8504979204761393, 0.8483834391022194, 0.85

In [194]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [195]:
## 44
column_to_drop_43 = 'Cat_현재 의료시설 접근용이성'

In [196]:
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical feature - confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_43.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters in a column name cannot change
    # which columns are selected.
    drop_cols_43 = [col for col in comp_43.columns if str(col).startswith(column_to_drop_43)]
else:
    drop_cols_43 = [column_to_drop_43]
comp_44 = comp_43.drop(columns=drop_cols_43)
X_44 = comp_44.drop('target', axis=1)
y_44 = comp_44['target']

print(X_44.shape)

(10564, 102)


In [197]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [198]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws an ExtraTreesClassifier configuration from ``trial``, fits it on the
    notebook-level X_train / y_train and returns the ROC AUC of the column-1
    class probabilities predicted for X_val against y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the listed order.
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise validation AUC, using a TPE sampler seeded with 10.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# receives no intermediate values — confirm the pruner is intentional.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback (defined near the top of the notebook) receives early_stopping_rounds=10;
# presumably it halts the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [199]:
# Report the best trial's hyperparameters and its objective value (validation AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 86, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.8487589009807873


In [200]:
# Rebuild the best configuration found by the study and fit it on the training split.
optuna_44 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [201]:
# Test-set ROC AUC of the step-44 model, computed from its column-1 class probabilities.
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(auc_44)

0.8527889821089241


In [202]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so convert
# the split to NumPy arrays. np.asarray leaves an existing array unchanged, so
# re-running this cell does not fail the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [203]:
# Fresh global accumulator: bootstrap_auc appends one test AUC per resample to this list.
auc_bootstrap = []

In [204]:
# Seeded RNG that bootstrap_auc reads as a global when drawing resample indices.
rs = RandomState(seed = 44)
# bootstrap_auc refits the estimator it is given on every resample, so pass an
# unfitted clone (same hyperparameters and random_state). optuna_44 then remains
# the model fitted on the training split instead of the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_44), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84636527, 0.85269782])

In [205]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.8495894089939406

In [206]:
# Keep step 44's bootstrap AUCs under a step-specific name.
# This binds the same list object (no copy) and prints every stored value.
t_44 = auc_bootstrap
print(t_44)

[0.8496378760173532, 0.8506471621670073, 0.849182083826324, 0.8481087089025134, 0.8478277042773655, 0.8500434728048475, 0.8498059409845469, 0.8496347388046323, 0.8499480119034815, 0.8525339715320354, 0.8496172600480443, 0.8476829443189559, 0.8521682621634219, 0.8509483345882184, 0.8508336022372809, 0.8506063784016351, 0.8464361263490014, 0.8518630561829981, 0.8503648130221217, 0.848208203363092, 0.8501169732171667, 0.8481584561328027, 0.8514251909218027, 0.8452166469470438, 0.8503930479366104, 0.8507072173819512, 0.8493246029185042, 0.8505319816428238, 0.8466274963249795, 0.8476793589329892, 0.8485048940518446, 0.8499211215087303, 0.8457262199275751, 0.849426338245312, 0.8496414614033201, 0.8491753612276361, 0.8507036319959844, 0.8486339679466495, 0.8502245347961709, 0.8487029866265103, 0.8482646731920692, 0.8515574020293285, 0.8517626653759278, 0.8505454268401994, 0.8504199383313613, 0.8487043311462479, 0.8510088379764083, 0.8497297515327524, 0.849169983148686, 0.8508389803162311, 0.8

In [207]:
# Clear this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [208]:
# Step 45: feature to remove from comp_44. The name starts with 'Cat_', so the
# following cell drops every column whose name begins with this string.
column_to_drop_44 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [209]:
# Build the step-45 dataset by removing the selected feature from comp_44.
# A name starting with 'Cat_' is treated as a prefix shared by several columns,
# so every column whose name starts with it is dropped; any other name is
# dropped as a single column. The prefix is compared literally (str.startswith)
# instead of being spliced into a regex, so metacharacters in a column name
# cannot change which columns match.
if column_to_drop_44.startswith('Cat_'):
    cols_to_drop_44 = [c for c in comp_44.columns if str(c).startswith(column_to_drop_44)]
else:
    cols_to_drop_44 = [column_to_drop_44]

comp_45 = comp_44.drop(columns=cols_to_drop_44)
X_45 = comp_45.drop(columns='target')
y_45 = comp_45['target']

print(X_45.shape)

(10564, 98)


In [210]:
# Hold out 20% of the rows for testing, then 20% of the remaining training rows
# for validation. Both splits shuffle, stratify on the target and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [211]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws an ExtraTreesClassifier configuration from ``trial``, fits it on the
    notebook-level X_train / y_train and returns the ROC AUC of the column-1
    class probabilities predicted for X_val against y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the listed order.
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise validation AUC, using a TPE sampler seeded with 10.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# receives no intermediate values — confirm the pruner is intentional.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback (defined near the top of the notebook) receives early_stopping_rounds=10;
# presumably it halts the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [212]:
# Report the best trial's hyperparameters and its objective value (validation AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 51, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 5}
0.8446799845492409


In [213]:
# Rebuild the best configuration found by the study and fit it on the training split.
optuna_45 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [214]:
# Test-set ROC AUC of the step-45 model, computed from its column-1 class probabilities.
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(auc_45)

0.8499175361227636


In [215]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so convert
# the split to NumPy arrays. np.asarray leaves an existing array unchanged, so
# re-running this cell does not fail the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [216]:
# Fresh global accumulator: bootstrap_auc appends one test AUC per resample to this list.
auc_bootstrap = []

In [217]:
# Seeded RNG that bootstrap_auc reads as a global when drawing resample indices.
rs = RandomState(seed = 45)
# bootstrap_auc refits the estimator it is given on every resample, so pass an
# unfitted clone (same hyperparameters and random_state). optuna_45 then remains
# the model fitted on the training split instead of the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_45), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84409215, 0.85041175])

In [218]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.8474387828959162

In [219]:
# Keep step 45's bootstrap AUCs under a step-specific name.
# This binds the same list object (no copy) and prints every stored value.
t_45 = auc_bootstrap
print(t_45)

[0.8495863360940807, 0.8446936287691369, 0.8485273027141372, 0.8459247606754867, 0.8488033774335808, 0.8490731777275824, 0.8493398408088632, 0.8466629020114015, 0.8489669606683159, 0.8485671901330178, 0.8446304363414722, 0.8474364490337385, 0.8456657165393855, 0.8465087304148291, 0.8471653042199994, 0.8489566526836614, 0.8477604782904882, 0.8483690975583521, 0.8479984582840343, 0.8496051593704061, 0.843738123408985, 0.8495379333835288, 0.8480347603169481, 0.8463711412283532, 0.8476851851851851, 0.8436713455953533, 0.8453708185436163, 0.8487908285826969, 0.8473293356279803, 0.8483825427557277, 0.8507242479652934, 0.8478138109067441, 0.8466104657416371, 0.8473284392814886, 0.8476820479724643, 0.847595998709261, 0.8483283137929798, 0.8496513212147287, 0.8456769208705317, 0.8466292890179627, 0.8467193718403786, 0.8472177404897637, 0.8484538023018178, 0.848737047793195, 0.8463971352766126, 0.8468860922878347, 0.8473642931411567, 0.8478353232225448, 0.8482826001219032, 0.8468488939084292, 0.

In [220]:
# Clear this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [221]:
# Step 46: feature to remove from comp_45. The name starts with 'Cat_', so the
# following cell drops every column whose name begins with this string.
column_to_drop_45 = 'Cat_이사 계획 중인 주택의 유형'

In [222]:
# Build the step-46 dataset by removing the selected feature from comp_45.
# A name starting with 'Cat_' is treated as a prefix shared by several columns,
# so every column whose name starts with it is dropped; any other name is
# dropped as a single column. The prefix is compared literally (str.startswith)
# instead of being spliced into a regex, so metacharacters in a column name
# cannot change which columns match.
if column_to_drop_45.startswith('Cat_'):
    cols_to_drop_45 = [c for c in comp_45.columns if str(c).startswith(column_to_drop_45)]
else:
    cols_to_drop_45 = [column_to_drop_45]

comp_46 = comp_45.drop(columns=cols_to_drop_45)
X_46 = comp_46.drop(columns='target')
y_46 = comp_46['target']

print(X_46.shape)

(10564, 82)


In [223]:
# Hold out 20% of the rows for testing, then 20% of the remaining training rows
# for validation. Both splits shuffle, stratify on the target and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [224]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws an ExtraTreesClassifier configuration from ``trial``, fits it on the
    notebook-level X_train / y_train and returns the ROC AUC of the column-1
    class probabilities predicted for X_val against y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the listed order.
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise validation AUC, using a TPE sampler seeded with 10.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# receives no intermediate values — confirm the pruner is intentional.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback (defined near the top of the notebook) receives early_stopping_rounds=10;
# presumably it halts the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [225]:
# Report the best trial's hyperparameters and its objective value (validation AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 165, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 6}
0.8471081609118187


In [226]:
# Rebuild the best configuration found by the study and fit it on the training split.
optuna_46 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [227]:
# Test-set ROC AUC of the step-46 model, computed from its column-1 class probabilities.
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(auc_46)

0.8514641819941917


In [228]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so convert
# the split to NumPy arrays. np.asarray leaves an existing array unchanged, so
# re-running this cell does not fail the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [229]:
# Fresh global accumulator: bootstrap_auc appends one test AUC per resample to this list.
auc_bootstrap = []

In [230]:
# Seeded RNG that bootstrap_auc reads as a global when drawing resample indices.
rs = RandomState(seed = 46)
# bootstrap_auc refits the estimator it is given on every resample, so pass an
# unfitted clone (same hyperparameters and random_state). optuna_46 then remains
# the model fitted on the training split instead of the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_46), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84743633, 0.85280702])

In [231]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.8501707620737873

In [232]:
# Keep step 46's bootstrap AUCs under a step-specific name.
# This binds the same list object (no copy) and prints every stored value.
t_46 = auc_bootstrap
print(t_46)

[0.8505575275178372, 0.8493479079272884, 0.849303986949195, 0.8497216844143272, 0.8482194076942382, 0.8486599619949089, 0.8526016456921588, 0.848865225341508, 0.8506355096626151, 0.8493062278154243, 0.8500815675307447, 0.8515699508802121, 0.8497723279911084, 0.8525666881789824, 0.8525218708543975, 0.8479079272883726, 0.8525097701767597, 0.8502917607830485, 0.8521064142554947, 0.8524394069771611, 0.8482754293499695, 0.8494232010325913, 0.8496042630239146, 0.8495921623462767, 0.8486102147646194, 0.8492340719228424, 0.8513705137858092, 0.8512347172923165, 0.8517581836434693, 0.8478801405471298, 0.8487478039510954, 0.8504822344125346, 0.8495746835896884, 0.850671811695529, 0.8500049299057044, 0.8480903337994335, 0.8529171596572371, 0.8500295794342261, 0.8487016421067728, 0.8495755799361802, 0.8487594564554874, 0.851618353590764, 0.8500613997346815, 0.8494187193001327, 0.850217812197483, 0.8503401634936001, 0.8492260048044171, 0.8513633430138754, 0.8487930694489262, 0.8534123910939012, 0.85

In [233]:
# Clear this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [234]:
# Step 47: feature to remove from comp_46. The name has no 'Cat_' prefix, so the
# following cell drops it as a single column by exact name.
column_to_drop_46 = '가구주 나이'

In [235]:
# Build the step-47 dataset by removing the selected feature from comp_46.
# A name starting with 'Cat_' is treated as a prefix shared by several columns,
# so every column whose name starts with it is dropped; any other name is
# dropped as a single column. The prefix is compared literally (str.startswith)
# instead of being spliced into a regex, so metacharacters in a column name
# cannot change which columns match.
if column_to_drop_46.startswith('Cat_'):
    cols_to_drop_46 = [c for c in comp_46.columns if str(c).startswith(column_to_drop_46)]
else:
    cols_to_drop_46 = [column_to_drop_46]

comp_47 = comp_46.drop(columns=cols_to_drop_46)
X_47 = comp_47.drop(columns='target')
y_47 = comp_47['target']

print(X_47.shape)

(10564, 81)


In [236]:
# Hold out 20% of the rows for testing, then 20% of the remaining training rows
# for validation. Both splits shuffle, stratify on the target and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [237]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws an ExtraTreesClassifier configuration from ``trial``, fits it on the
    notebook-level X_train / y_train and returns the ROC AUC of the column-1
    class probabilities predicted for X_val against y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the listed order.
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise validation AUC, using a TPE sampler seeded with 10.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# receives no intermediate values — confirm the pruner is intentional.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback (defined near the top of the notebook) receives early_stopping_rounds=10;
# presumably it halts the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [238]:
# Report the best trial's hyperparameters and its objective value (validation AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 139, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.8466204263513816


In [239]:
# Rebuild the best configuration found by the study and fit it on the training split.
optuna_47 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [240]:
# Test-set ROC AUC of the step-47 model, computed from its column-1 class probabilities.
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(auc_47)

0.8518222724176258


In [241]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so convert
# the split to NumPy arrays. np.asarray leaves an existing array unchanged, so
# re-running this cell does not fail the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [242]:
# Fresh global accumulator: bootstrap_auc appends one test AUC per resample to this list.
auc_bootstrap = []

In [243]:
# Seeded RNG that bootstrap_auc reads as a global when drawing resample indices.
rs = RandomState(seed = 47)
# bootstrap_auc refits the estimator it is given on every resample, so pass an
# unfitted clone (same hyperparameters and random_state). optuna_47 then remains
# the model fitted on the training split instead of the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_47), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84632036, 0.85215574])

In [244]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.849253817091535

In [245]:
# Keep step 47's bootstrap AUCs under a step-specific name.
# This binds the same list object (no copy) and prints every stored value.
t_47 = auc_bootstrap
print(t_47)

[0.8503881180309061, 0.8472007099064213, 0.8456652683661396, 0.8506059302283892, 0.8476426087268294, 0.8492425872145137, 0.8517909002904164, 0.847052812735291, 0.8491825319995696, 0.8506606073643828, 0.8497687426051415, 0.850513606539744, 0.8483740274640565, 0.8483112832096376, 0.849835968592019, 0.8489476892187444, 0.8517756624000574, 0.8488275787888566, 0.8504105266931985, 0.8473396436126348, 0.8516627227421032, 0.850668674482808, 0.8485918396615395, 0.849749022982324, 0.8473638449679108, 0.849507009429565, 0.8494765336488472, 0.8484260155605752, 0.8482915635868201, 0.8502267756624001, 0.8496136746620773, 0.8486716144993007, 0.8464809436735865, 0.8498978164999463, 0.8499247068946972, 0.8498296941665772, 0.8487917249291888, 0.8466288408447169, 0.852552794808361, 0.8499157434297802, 0.8483686493851064, 0.8515013803735972, 0.8499955182675415, 0.849205837008354, 0.848008318095443, 0.8478227743716612, 0.8520311211501919, 0.8472518016564484, 0.8492936789645406, 0.8503925997633646, 0.847506

In [246]:
# Clear this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [247]:
# Step 48: feature to remove from comp_47. The name starts with 'Cat_', so the
# following cell drops every column whose name begins with this string.
column_to_drop_47 = 'Cat_가구주 종사상 지위'

In [248]:
# Build the step-48 dataset by removing the selected feature from comp_47.
# A name starting with 'Cat_' is treated as a prefix shared by several columns,
# so every column whose name starts with it is dropped; any other name is
# dropped as a single column. The prefix is compared literally (str.startswith)
# instead of being spliced into a regex, so metacharacters in a column name
# cannot change which columns match.
if column_to_drop_47.startswith('Cat_'):
    cols_to_drop_47 = [c for c in comp_47.columns if str(c).startswith(column_to_drop_47)]
else:
    cols_to_drop_47 = [column_to_drop_47]

comp_48 = comp_47.drop(columns=cols_to_drop_47)
X_48 = comp_48.drop(columns='target')
y_48 = comp_48['target']

print(X_48.shape)

(10564, 76)


In [249]:
# Hold out 20% of the rows for testing, then 20% of the remaining training rows
# for validation. Both splits shuffle, stratify on the target and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [250]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws an ExtraTreesClassifier configuration from ``trial``, fits it on the
    notebook-level X_train / y_train and returns the ROC AUC of the column-1
    class probabilities predicted for X_val against y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the listed order.
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise validation AUC, using a TPE sampler seeded with 10.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# receives no intermediate values — confirm the pruner is intentional.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback (defined near the top of the notebook) receives early_stopping_rounds=10;
# presumably it halts the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [251]:
# Report the best trial's hyperparameters and its objective value (validation AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.8453629528863811


In [252]:
# Rebuild the best configuration found by the study and fit it on the training split.
optuna_48 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [253]:
# Test-set ROC AUC of the step-48 model, computed from its column-1 class probabilities.
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(auc_48)

0.8490597325302068


In [254]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so convert
# the split to NumPy arrays. np.asarray leaves an existing array unchanged, so
# re-running this cell does not fail the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [255]:
# Fresh global accumulator: bootstrap_auc appends one test AUC per resample to this list.
auc_bootstrap = []

In [256]:
# Seeded RNG that bootstrap_auc reads as a global when drawing resample indices.
rs = RandomState(seed = 48)
# bootstrap_auc refits the estimator it is given on every resample, so pass an
# unfitted clone (same hyperparameters and random_state). optuna_48 then remains
# the model fitted on the training split instead of the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_48), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84307823, 0.85001599])

In [257]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.8466549195977198

In [258]:
# Keep step 48's bootstrap AUCs under a step-specific name.
# This binds the same list object (no copy) and prints every stored value.
t_48 = auc_bootstrap
print(t_48)

[0.8473100641784089, 0.8469954465598221, 0.8477600301172421, 0.8480764404288121, 0.8488661216879997, 0.8467408841561794, 0.8436081531676886, 0.8488033774335806, 0.8476578466171882, 0.8464773582876197, 0.8494128930479365, 0.8470716360116166, 0.843594707970313, 0.8466306335377003, 0.8478003657093687, 0.8447787816858485, 0.8511939335269442, 0.8500627442544191, 0.8466709691298269, 0.8481759348893909, 0.8440823204617978, 0.8424384209960202, 0.8483856799684485, 0.8489270732494353, 0.8469972392528056, 0.8465141084937795, 0.8485532967623965, 0.8470958373668926, 0.8452574307124162, 0.8485792908106559, 0.8452269549316984, 0.8479410921085656, 0.8471074898712847, 0.8468233480334157, 0.8485308881001039, 0.848124843139364, 0.847903445555914, 0.8420825714388154, 0.8445636585278405, 0.8468116955290237, 0.8499193288157472, 0.8446264027822596, 0.8483059051306875, 0.8464845290595533, 0.8446694274138612, 0.8446703237603529, 0.8464567423183106, 0.8451695887562296, 0.845667061059123, 0.848151733534115, 0.84

In [259]:
# Clear this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [260]:
# Step 49: feature to remove from comp_48. The name starts with 'Cat_', so the
# following cell drops every column whose name begins with this string.
column_to_drop_48 = 'Cat_가구주 성별'

In [261]:
# Build the step-49 dataset by removing the selected feature from comp_48.
# A name starting with 'Cat_' is treated as a prefix shared by several columns,
# so every column whose name starts with it is dropped; any other name is
# dropped as a single column. The prefix is compared literally (str.startswith)
# instead of being spliced into a regex, so metacharacters in a column name
# cannot change which columns match.
if column_to_drop_48.startswith('Cat_'):
    cols_to_drop_48 = [c for c in comp_48.columns if str(c).startswith(column_to_drop_48)]
else:
    cols_to_drop_48 = [column_to_drop_48]

comp_49 = comp_48.drop(columns=cols_to_drop_48)
X_49 = comp_49.drop(columns='target')
y_49 = comp_49['target']

print(X_49.shape)

(10564, 74)


In [262]:
# Hold out 20% of the rows for testing, then 20% of the remaining training rows
# for validation. Both splits shuffle, stratify on the target and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [263]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws an ExtraTreesClassifier configuration from ``trial``, fits it on the
    notebook-level X_train / y_train and returns the ROC AUC of the column-1
    class probabilities predicted for X_val against y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the listed order.
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise validation AUC, using a TPE sampler seeded with 10.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# receives no intermediate values — confirm the pruner is intentional.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback (defined near the top of the notebook) receives early_stopping_rounds=10;
# presumably it halts the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [264]:
# Report the best trial's hyperparameters and its objective value (validation AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 64, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 6}
0.8464300909131623


In [265]:
# Rebuild the best configuration found by the study and fit it on the training split.
optuna_49 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)

In [266]:
# Test-set ROC AUC of the step-49 model, computed from its column-1 class probabilities.
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(auc_49)

0.8487805205980423


In [267]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so convert
# the split to NumPy arrays. np.asarray leaves an existing array unchanged, so
# re-running this cell does not fail the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [268]:
# Fresh global accumulator: bootstrap_auc appends one test AUC per resample to this list.
auc_bootstrap = []

In [269]:
# Seeded RNG that bootstrap_auc reads as a global when drawing resample indices.
rs = RandomState(seed = 49)
# bootstrap_auc refits the estimator it is given on every resample, so pass an
# unfitted clone (same hyperparameters and random_state). optuna_49 then remains
# the model fitted on the training split instead of the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_49), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84354929, 0.84943853])

In [270]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.8465165557437884

In [271]:
# Keep step 49's bootstrap AUCs under a step-specific name.
# This binds the same list object (no copy) and prints every stored value.
t_49 = auc_bootstrap
print(t_49)

[0.8488593990893121, 0.849166845935965, 0.8451180488329568, 0.8472208777024848, 0.8504091821734611, 0.844753235810835, 0.8448016385213869, 0.8479666379835791, 0.8435100032268473, 0.8471061453515473, 0.8469896203076261, 0.8470057545444767, 0.843756050338819, 0.8452533971532035, 0.8476233372772578, 0.8452883546663799, 0.8461192678641856, 0.8448948585565236, 0.8477873686852391, 0.8462340002151231, 0.8446259546090137, 0.8445730701660035, 0.8433558316302749, 0.8435279301566814, 0.8456899178946615, 0.8454855508945539, 0.8436372844286686, 0.8469976874260514, 0.845864257287297, 0.8484067441110035, 0.8480957118783837, 0.8472065361586174, 0.8462259330966979, 0.8455545695744147, 0.8476708436413181, 0.8452677386970707, 0.8446420888458642, 0.8455644293858234, 0.847930784123911, 0.8472146032770428, 0.8470622243734538, 0.8465853680398696, 0.8468282779391201, 0.8474368972069845, 0.8468345523645621, 0.8445036033128968, 0.8466248072855042, 0.8451126707540066, 0.845681850776236, 0.8473938725753828, 0.845

In [272]:
# Clear this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [273]:
# Step 50: feature to remove from comp_49. The name starts with 'Cat_', so the
# following cell drops every column whose name begins with this string.
column_to_drop_49 = 'Cat_현재 거주 지역'

In [274]:
# Build the step-50 dataset by removing the selected feature from comp_49.
# A name starting with 'Cat_' is treated as a prefix shared by several columns,
# so every column whose name starts with it is dropped; any other name is
# dropped as a single column. The prefix is compared literally (str.startswith)
# instead of being spliced into a regex, so metacharacters in a column name
# cannot change which columns match.
if column_to_drop_49.startswith('Cat_'):
    cols_to_drop_49 = [c for c in comp_49.columns if str(c).startswith(column_to_drop_49)]
else:
    cols_to_drop_49 = [column_to_drop_49]

comp_50 = comp_49.drop(columns=cols_to_drop_49)
X_50 = comp_50.drop(columns='target')
y_50 = comp_50['target']

print(X_50.shape)

(10564, 57)


In [275]:
# Hold out 20% of the rows for testing, then 20% of the remaining training rows
# for validation. Both splits shuffle, stratify on the target and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [276]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws an ExtraTreesClassifier configuration from ``trial``, fits it on the
    notebook-level X_train / y_train and returns the ROC AUC of the column-1
    class probabilities predicted for X_val against y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the listed order.
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise validation AUC, using a TPE sampler seeded with 10.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# receives no intermediate values — confirm the pruner is intentional.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback (defined near the top of the notebook) receives early_stopping_rounds=10;
# presumably it halts the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [277]:
# Report the best trial's hyperparameters and its objective value (validation AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 68, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 4}
0.8469661090957946


In [278]:
# Rebuild the best configuration found by the study and fit it on the training split.
optuna_50 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [279]:
# Test-set ROC AUC of the step-50 model, computed from its column-1 class probabilities.
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(auc_50)

0.847969327023054


In [280]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so convert
# the split to NumPy arrays. np.asarray leaves an existing array unchanged, so
# re-running this cell does not fail the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [281]:
# Fresh global accumulator: bootstrap_auc appends one test AUC per resample to this list.
auc_bootstrap = []

In [282]:
# Seeded RNG that bootstrap_auc reads as a global when drawing resample indices.
rs = RandomState(seed = 50)
# bootstrap_auc refits the estimator it is given on every resample, so pass an
# unfitted clone (same hyperparameters and random_state). optuna_50 then remains
# the model fitted on the training split instead of the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_50), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.8432941 , 0.84861488])

In [283]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.8460117831468933

In [284]:
# Keep step 50's bootstrap AUCs under a step-specific name.
# This binds the same list object (no copy) and prints every stored value.
t_50 = auc_bootstrap
print(t_50)

[0.8477748198343551, 0.8438506148936933, 0.8459821268509555, 0.8463128787063927, 0.8461694632677208, 0.8446770463590407, 0.8430842386432899, 0.8477922985909433, 0.8462541680111865, 0.8451664515435087, 0.8430976838406655, 0.8465405507152848, 0.8469699006848088, 0.8474431716324263, 0.8450777132408304, 0.8454918253199957, 0.8479998028037719, 0.8456388261446345, 0.8453708185436161, 0.8443911118281883, 0.8463433544871105, 0.8449809078197268, 0.8476215445842745, 0.8458969739342441, 0.846688447886415, 0.8467709117636514, 0.8444045570255638, 0.8472549388691692, 0.8459408949123374, 0.8453609587322075, 0.8469757269370046, 0.8467417805026712, 0.843732297156789, 0.845027966010541, 0.8448276325696461, 0.8462546161844324, 0.8458754616184432, 0.8463308056362269, 0.8467704635904054, 0.8468704062242302, 0.8457042594385286, 0.8478985156502097, 0.844030780538525, 0.8475713491807393, 0.8468677171847548, 0.8466879997131692, 0.8432249650424868, 0.8470183033953606, 0.8467122010684451, 0.8440137499551827, 0.8

In [285]:
# Clear this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [286]:
# Step 51: feature to remove from comp_50. The name starts with 'Cat_', so the
# following cell drops every column whose name begins with this string.
column_to_drop_50 = 'Cat_기초생활보장 수급가구 여부'

In [287]:
# Build the step-51 dataset by removing the selected feature from comp_50.
# A name starting with 'Cat_' is treated as a prefix shared by several columns,
# so every column whose name starts with it is dropped; any other name is
# dropped as a single column. The prefix is compared literally (str.startswith)
# instead of being spliced into a regex, so metacharacters in a column name
# cannot change which columns match.
if column_to_drop_50.startswith('Cat_'):
    cols_to_drop_50 = [c for c in comp_50.columns if str(c).startswith(column_to_drop_50)]
else:
    cols_to_drop_50 = [column_to_drop_50]

comp_51 = comp_50.drop(columns=cols_to_drop_50)
X_51 = comp_51.drop(columns='target')
y_51 = comp_51['target']

print(X_51.shape)

(10564, 55)


In [288]:
# Hold out 20% of the rows for testing, then 20% of the remaining training rows
# for validation. Both splits shuffle, stratify on the target and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [289]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws an ExtraTreesClassifier configuration from ``trial``, fits it on the
    notebook-level X_train / y_train and returns the ROC AUC of the column-1
    class probabilities predicted for X_val against y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the listed order.
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise validation AUC, using a TPE sampler seeded with 10.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# receives no intermediate values — confirm the pruner is intentional.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback (defined near the top of the notebook) receives early_stopping_rounds=10;
# presumably it halts the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [290]:
# Report the best trial's hyperparameters and its objective value (validation AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 68, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.8440369026826101


In [291]:
# Rebuild the best configuration found by the study and fit it on the training split.
optuna_51 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [292]:
# Test-set ROC AUC of the step-51 model, computed from its column-1 class probabilities.
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(auc_51)

0.847805295615073


In [293]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so convert
# the split to NumPy arrays. np.asarray leaves an existing array unchanged, so
# re-running this cell does not fail the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [294]:
# Fresh global accumulator: bootstrap_auc appends one test AUC per resample to this list.
auc_bootstrap = []

In [295]:
# Seeded RNG that bootstrap_auc reads as a global when drawing resample indices.
rs = RandomState(seed = 51)
# bootstrap_auc refits the estimator it is given on every resample, so pass an
# unfitted clone (same hyperparameters and random_state). optuna_51 then remains
# the model fitted on the training split instead of the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_51), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84208992, 0.8477973 ])

In [296]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.8450580187157147

In [297]:
# Keep step 51's bootstrap AUCs under a step-specific name.
# This binds the same list object (no copy) and prints every stored value.
t_51 = auc_bootstrap
print(t_51)

[0.8474879889570113, 0.8459767487720053, 0.8471384138252482, 0.844119967014449, 0.8426835717615002, 0.8467467104083755, 0.8455173711950091, 0.845234573876878, 0.8439707253235813, 0.8453439281488653, 0.8442754831307591, 0.847085529382238, 0.8441992936789645, 0.8430985801871572, 0.8449889749381521, 0.8444063497185472, 0.8445685884335449, 0.847291689075329, 0.8450866767057473, 0.8428305725861389, 0.8460820694847802, 0.8455178193682551, 0.8445914452690831, 0.842797407765946, 0.8447438241726722, 0.8475489405184469, 0.8455433652432686, 0.8470191997418522, 0.8451207378724319, 0.8447944677494532, 0.8462062134738804, 0.8459292424079452, 0.8456415151841097, 0.8433504535513248, 0.843907981069162, 0.8470595353339787, 0.8449468466530422, 0.8448482485389552, 0.8459946757018392, 0.8463711412283532, 0.8481468036284107, 0.846894607579506, 0.8448554193108888, 0.8475937578430318, 0.8435131404395684, 0.8428009931519128, 0.8453797820085333, 0.8445981678677708, 0.8471912982682586, 0.8463908608511707, 0.8434

In [298]:
# Clear this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [299]:
# Step 52: feature to remove from comp_51. The name starts with 'Cat_', so the
# following cell drops every column whose name begins with this string.
column_to_drop_51 = 'Cat_현재 주택의 유형'

In [300]:
# Build the step-52 dataset by removing the selected feature from comp_51.
# A name starting with 'Cat_' is treated as a prefix shared by several columns,
# so every column whose name starts with it is dropped; any other name is
# dropped as a single column. The prefix is compared literally (str.startswith)
# instead of being spliced into a regex, so metacharacters in a column name
# cannot change which columns match.
if column_to_drop_51.startswith('Cat_'):
    cols_to_drop_51 = [c for c in comp_51.columns if str(c).startswith(column_to_drop_51)]
else:
    cols_to_drop_51 = [column_to_drop_51]

comp_52 = comp_51.drop(columns=cols_to_drop_51)
X_52 = comp_52.drop(columns='target')
y_52 = comp_52['target']

print(X_52.shape)

(10564, 44)


In [301]:
# Hold out 20% of the rows for testing, then 20% of the remaining training rows
# for validation. Both splits shuffle, stratify on the target and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [302]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws an ExtraTreesClassifier configuration from ``trial``, fits it on the
    notebook-level X_train / y_train and returns the ROC AUC of the column-1
    class probabilities predicted for X_val against y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the listed order.
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **hyperparams)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise validation AUC, using a TPE sampler seeded with 10.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# receives no intermediate values — confirm the pruner is intentional.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback (defined near the top of the notebook) receives early_stopping_rounds=10;
# presumably it halts the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [303]:
# Report the best trial's hyperparameters and its objective value (validation AUC).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 54, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 5}
0.8395220341260246


In [304]:
# Rebuild the best configuration found by the study and fit it on the training split.
optuna_52 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [305]:
# Test-set ROC AUC of the step-52 model, computed from its column-1 class probabilities.
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(auc_52)

0.8422560144849593


In [306]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so convert
# the split to NumPy arrays. np.asarray leaves an existing array unchanged, so
# re-running this cell does not fail the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [307]:
# Fresh global accumulator: bootstrap_auc appends one test AUC per resample to this list.
auc_bootstrap = []

In [308]:
# Seeded RNG that bootstrap_auc reads as a global when drawing resample indices.
rs = RandomState(seed = 52)
# bootstrap_auc refits the estimator it is given on every resample, so pass an
# unfitted clone (same hyperparameters and random_state). optuna_52 then remains
# the model fitted on the training split instead of the last bootstrap refit.
bootstrap_auc(sklearn.base.clone(optuna_52), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.83867063, 0.84400954])

In [309]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.8414626071134057

In [310]:
# Keep step 52's bootstrap AUCs under a step-specific name.
# This binds the same list object (no copy) and prints every stored value.
t_52 = auc_bootstrap
print(t_52)

[0.8426020042307554, 0.8399792943960417, 0.8421932702305404, 0.8430654153669642, 0.8406318346419992, 0.8404857301638521, 0.839348266465885, 0.8406918898569431, 0.838637463697967, 0.8428637374063318, 0.8415918217346097, 0.8428377433580725, 0.8415183213222903, 0.8431066473055824, 0.8427131511957262, 0.84155507152845, 0.8409858915062205, 0.843615772112868, 0.8398636656986124, 0.8391107346455845, 0.8417728837259331, 0.8411176544405005, 0.8421260442436629, 0.8408657810763328, 0.841940500519881, 0.8414421318704958, 0.8440200243806245, 0.8418553476031694, 0.8428897314545909, 0.8401890394750995, 0.8404140224445161, 0.8399595747732244, 0.8404453945717256, 0.8414251012871535, 0.8412216306335376, 0.8420399949804597, 0.8422757341077766, 0.8415676203793339, 0.8422578071779426, 0.8424514180201499, 0.8427023950378258, 0.8421099100068122, 0.840614804058657, 0.8410584955720484, 0.8406820300455342, 0.8425580832526621, 0.8413291922125417, 0.8407008533218601, 0.844654637696748, 0.8435100032268473, 0.84121

In [311]:
# Clear this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [312]:
# Step 53: feature to remove from comp_52. The name starts with 'Cat_', so the
# following cell drops every column whose name begins with this string.
column_to_drop_52 = 'Cat_이사 계획 중인 주택의 점유형태'

In [313]:
# Build the step-53 dataset by removing the selected feature from comp_52.
# A name starting with 'Cat_' is treated as a prefix shared by several columns,
# so every column whose name starts with it is dropped; any other name is
# dropped as a single column. The prefix is compared literally (str.startswith)
# instead of being spliced into a regex, so metacharacters in a column name
# cannot change which columns match.
if column_to_drop_52.startswith('Cat_'):
    cols_to_drop_52 = [c for c in comp_52.columns if str(c).startswith(column_to_drop_52)]
else:
    cols_to_drop_52 = [column_to_drop_52]

comp_53 = comp_52.drop(columns=cols_to_drop_52)
X_53 = comp_53.drop(columns='target')
y_53 = comp_53['target']

print(X_53.shape)

(10564, 20)


In [314]:
# Hold out 20% of the rows for testing, then 20% of the remaining training rows
# for validation. Both splits shuffle, stratify on the target and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [315]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the global ``X_train`` / ``y_train`` and scores column 1 of
    ``predict_proba`` on the global ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna trial object supplying the hyperparameter suggestions.

    Returns
    -------
    The ROC-AUC on the validation split.
    """
    # The suggestions are requested in a fixed order so the seeded sampler
    # produces the same sequence of draws.
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this iteration: seeded TPE sampler, AUC maximised,
# at most 200 trials.
# NOTE(review): the objective never reports intermediate values, so the
# Hyperband pruner has nothing to act on.
# NOTE(review): EarlyStoppingCallback presumably halts the study after 10
# trials without improvement -- confirm against the class definition.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [316]:
# Report the winning hyperparameters and keep the best validation AUC.
# best_params / best_value are the params and value of study.best_trial.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'n_estimators': 112, 'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 4}
0.8396752821442965


In [317]:
# Refit an ExtraTreesClassifier with the study's best hyperparameters on the
# training split, using the same fixed random_state as the objective.
optuna_53 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [318]:
# ROC-AUC of the tuned model on the held-out test split, computed from
# column 1 of predict_proba.
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(auc_53)

0.8426324800114733


In [319]:
# bootstrap_auc indexes its training inputs with an integer array
# (X_train[idx], y_train[idx]); pandas objects would treat that as a label
# lookup, so plain arrays are needed.
# getattr keeps the cell re-runnable: once the names already hold arrays they
# are left as they are instead of raising AttributeError on `.values`.
X_train = getattr(X_train, 'values', X_train)
y_train = getattr(y_train, 'values', y_train)

In [320]:
# bootstrap_auc appends each resample's AUC to this global list, so it has to
# be emptied before every bootstrap run.
auc_bootstrap = []

In [321]:
# 95% percentile interval of the test AUC over 2000 bootstrap refits.
# - auc_bootstrap is reset here because bootstrap_auc appends to that global;
#   without the reset a re-run of this cell would pool the old and new scores.
# - rs is the global random generator bootstrap_auc draws its resamples from.
# - A clone is passed because bootstrap_auc refits the estimator it receives on
#   every resample; handing over optuna_53 itself would leave it trained on the
#   last bootstrap sample instead of the training split.  The clone has the
#   same hyperparameters and random_state, so the interval is unchanged.
auc_bootstrap = []
rs = RandomState(seed = 53)
bootstrap_auc(sklearn.base.clone(optuna_53), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.83873947, 0.84380412])

In [322]:
# Mean of the bootstrap AUCs that bootstrap_auc appended to the global list.
np.mean(auc_bootstrap)

0.8413505893478183

In [323]:
# Keep the bootstrap AUCs obtained with optuna_53.  A copy is stored rather
# than an alias: bootstrap_auc appends to the global `auc_bootstrap`, so an
# alias would silently grow if the bootstrap cell were re-run without the list
# being reset first.
t_53 = list(auc_bootstrap)
print(t_53)

[0.8420641963357356, 0.8413775949230935, 0.8401836613961493, 0.8422820085332186, 0.8431837331038687, 0.8408209637517479, 0.8393437847334264, 0.8418858233838872, 0.8407931770105052, 0.842582284607938, 0.8428664264458069, 0.8406676885016672, 0.8396243411853286, 0.8408120002868308, 0.8432043490731778, 0.8403916137822236, 0.8393966691764367, 0.840835305295615, 0.843746638700656, 0.8425482234412534, 0.8423322039367538, 0.8420310315155426, 0.8400447276899358, 0.8396996342906313, 0.8383882793732745, 0.8406336273349826, 0.8413982108924025, 0.8400554838478361, 0.8418661037610699, 0.8402257896812592, 0.8425921444193467, 0.8415496934494999, 0.84322137965652, 0.8392147108386219, 0.8423823993402889, 0.8434042343408267, 0.8399237209135564, 0.841982628804991, 0.8417585421820658, 0.8412798931554981, 0.8411615754185939, 0.8424585887920835, 0.8406515542648166, 0.8384151697680255, 0.8395176759528162, 0.8410800078878491, 0.8387701229787387, 0.8420489584453765, 0.842611864042164, 0.84123866121688, 0.839596

In [324]:
# Remove this iteration's split, study and validation score from the namespace
# (presumably so the next iteration cannot pick up stale objects -- confirm).
# Re-running this cell on its own raises NameError because the names are gone.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [325]:
# 54
# Feature removed from comp_53 to build comp_54.  Because the name starts with
# 'Cat_', the following cell drops every column whose name begins with it.
column_to_drop_53 = 'Cat_이사 계획 중인 거주 지역'

In [326]:
# Build the iteration-54 frame by removing the feature selected above.
# A name starting with 'Cat_' is treated as a group: every column whose name
# begins with that string is dropped (presumably the one-hot columns of one
# categorical feature -- confirm against the encoding cell).  Any other name is
# dropped as a single column.
# The prefix is compared literally instead of being pasted into a regex, so a
# column name containing regex metacharacters ('(', '+', '.', ...) can neither
# mis-match nor raise.
# NOTE(review): prefix matching also catches any other feature whose name merely
# starts with the same text -- confirm no such column exists.
if column_to_drop_53.startswith('Cat_'):
    cols_to_drop_54 = [c for c in comp_53.columns if str(c).startswith(column_to_drop_53)]
else:
    cols_to_drop_54 = [column_to_drop_53]

comp_54 = comp_53.drop(columns=cols_to_drop_54)
X_54 = comp_54.drop('target', axis=1)
y_54 = comp_54['target']

print(X_54.shape)

(10564, 13)


In [327]:
# Hold out 20% as the test split, then take 20% of the remainder as the
# validation split (64% / 16% / 20% overall).  Both splits are stratified on
# the target and use a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [328]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the global ``X_train`` / ``y_train`` and scores column 1 of
    ``predict_proba`` on the global ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna trial object supplying the hyperparameter suggestions.

    Returns
    -------
    The ROC-AUC on the validation split.
    """
    # The suggestions are requested in a fixed order so the seeded sampler
    # produces the same sequence of draws.
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this iteration: seeded TPE sampler, AUC maximised,
# at most 200 trials.
# NOTE(review): the objective never reports intermediate values, so the
# Hyperband pruner has nothing to act on.
# NOTE(review): EarlyStoppingCallback presumably halts the study after 10
# trials without improvement -- confirm against the class definition.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [329]:
# Report the winning hyperparameters and keep the best validation AUC.
# best_params / best_value are the params and value of study.best_trial.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'n_estimators': 116, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 7}
0.8357929990147341


In [330]:
# Refit an ExtraTreesClassifier with the study's best hyperparameters on the
# training split, using the same fixed random_state as the objective.
optuna_54 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [331]:
# ROC-AUC of the tuned model on the held-out test split, computed from
# column 1 of predict_proba.
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(auc_54)

0.8344443548097953


In [332]:
# bootstrap_auc indexes its training inputs with an integer array
# (X_train[idx], y_train[idx]); pandas objects would treat that as a label
# lookup, so plain arrays are needed.
# getattr keeps the cell re-runnable: once the names already hold arrays they
# are left as they are instead of raising AttributeError on `.values`.
X_train = getattr(X_train, 'values', X_train)
y_train = getattr(y_train, 'values', y_train)

In [333]:
# bootstrap_auc appends each resample's AUC to this global list, so it has to
# be emptied before every bootstrap run.
auc_bootstrap = []

In [334]:
# 95% percentile interval of the test AUC over 2000 bootstrap refits.
# - auc_bootstrap is reset here because bootstrap_auc appends to that global;
#   without the reset a re-run of this cell would pool the old and new scores.
# - rs is the global random generator bootstrap_auc draws its resamples from.
# - A clone is passed because bootstrap_auc refits the estimator it receives on
#   every resample; handing over optuna_54 itself would leave it trained on the
#   last bootstrap sample instead of the training split.  The clone has the
#   same hyperparameters and random_state, so the interval is unchanged.
auc_bootstrap = []
rs = RandomState(seed = 54)
bootstrap_auc(sklearn.base.clone(optuna_54), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.83022315, 0.8349239 ])

In [335]:
# Mean of the bootstrap AUCs that bootstrap_auc appended to the global list.
np.mean(auc_bootstrap)

0.8327421108063533

In [336]:
# Keep the bootstrap AUCs obtained with optuna_54.  A copy is stored rather
# than an alias: bootstrap_auc appends to the global `auc_bootstrap`, so an
# alias would silently grow if the bootstrap cell were re-run without the list
# being reset first.
t_54 = list(auc_bootstrap)
print(t_54)

[0.8340696819762647, 0.834041895235022, 0.8320242192822057, 0.8313017640098956, 0.8324437094403213, 0.8346899537485211, 0.8332764153311104, 0.8328300347782439, 0.8327018572299308, 0.8323289770893836, 0.8320797927646911, 0.8333588792083468, 0.8320125667778135, 0.8325656125631924, 0.83376133878312, 0.8342211645333619, 0.8324526729052383, 0.8343690617044924, 0.8321138539313757, 0.8337909182173461, 0.8312031658958087, 0.8347688322397906, 0.8349561686565559, 0.8331087985371626, 0.8350063640600911, 0.8318987307733677, 0.8318395719049156, 0.8321909397296618, 0.8323343551683339, 0.8307424437990749, 0.8319802983041125, 0.8324948011903481, 0.830957566957083, 0.8343358968842997, 0.833868900362124, 0.8315249542863289, 0.8348477107310602, 0.8338536624717651, 0.8335524900505539, 0.8317839984224302, 0.8320932379620667, 0.8337236922304686, 0.8339836327130615, 0.8312793553476032, 0.8323764834534437, 0.8335157398443942, 0.8335740023663547, 0.8313134165142878, 0.8329295292388226, 0.8332764153311104, 0.83

In [337]:
# Remove this iteration's split, study and validation score from the namespace
# (presumably so the next iteration cannot pick up stale objects -- confirm).
# Re-running this cell on its own raises NameError because the names are gone.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [338]:
# 55
# Feature removed from comp_54 to build comp_55.  Because the name starts with
# 'Cat_', the following cell drops every column whose name begins with it.
column_to_drop_54 = 'Cat_현재 주택의 점유형태'

In [339]:
# Build the iteration-55 frame by removing the feature selected above.
# A name starting with 'Cat_' is treated as a group: every column whose name
# begins with that string is dropped (presumably the one-hot columns of one
# categorical feature -- confirm against the encoding cell).  Any other name is
# dropped as a single column.
# The prefix is compared literally instead of being pasted into a regex, so a
# column name containing regex metacharacters ('(', '+', '.', ...) can neither
# mis-match nor raise.
# NOTE(review): prefix matching also catches any other feature whose name merely
# starts with the same text -- confirm no such column exists.
if column_to_drop_54.startswith('Cat_'):
    cols_to_drop_55 = [c for c in comp_54.columns if str(c).startswith(column_to_drop_54)]
else:
    cols_to_drop_55 = [column_to_drop_54]

comp_55 = comp_54.drop(columns=cols_to_drop_55)
X_55 = comp_55.drop('target', axis=1)
y_55 = comp_55['target']

print(X_55.shape)

(10564, 9)


In [340]:
# Hold out 20% as the test split, then take 20% of the remainder as the
# validation split (64% / 16% / 20% overall).  Both splits are stratified on
# the target and use a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [341]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the global ``X_train`` / ``y_train`` and scores column 1 of
    ``predict_proba`` on the global ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna trial object supplying the hyperparameter suggestions.

    Returns
    -------
    The ROC-AUC on the validation split.
    """
    # The suggestions are requested in a fixed order so the seeded sampler
    # produces the same sequence of draws.
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this iteration: seeded TPE sampler, AUC maximised,
# at most 200 trials.
# NOTE(review): the objective never reports intermediate values, so the
# Hyperband pruner has nothing to act on.
# NOTE(review): EarlyStoppingCallback presumably halts the study after 10
# trials without improvement -- confirm against the class definition.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [342]:
# Report the winning hyperparameters and keep the best validation AUC.
# best_params / best_value are the params and value of study.best_trial.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'n_estimators': 116, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 7}
0.7768177034349949


In [343]:
# Refit an ExtraTreesClassifier with the study's best hyperparameters on the
# training split, using the same fixed random_state as the objective.
optuna_55 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [344]:
# ROC-AUC of the tuned model on the held-out test split, computed from
# column 1 of predict_proba.
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(auc_55)

0.7781040479007565


In [345]:
# bootstrap_auc indexes its training inputs with an integer array
# (X_train[idx], y_train[idx]); pandas objects would treat that as a label
# lookup, so plain arrays are needed.
# getattr keeps the cell re-runnable: once the names already hold arrays they
# are left as they are instead of raising AttributeError on `.values`.
X_train = getattr(X_train, 'values', X_train)
y_train = getattr(y_train, 'values', y_train)

In [346]:
# bootstrap_auc appends each resample's AUC to this global list, so it has to
# be emptied before every bootstrap run.
auc_bootstrap = []

In [347]:
# 95% percentile interval of the test AUC over 2000 bootstrap refits.
# - auc_bootstrap is reset here because bootstrap_auc appends to that global;
#   without the reset a re-run of this cell would pool the old and new scores.
# - rs is the global random generator bootstrap_auc draws its resamples from.
# - A clone is passed because bootstrap_auc refits the estimator it receives on
#   every resample; handing over optuna_55 itself would leave it trained on the
#   last bootstrap sample instead of the training split.  The clone has the
#   same hyperparameters and random_state, so the interval is unchanged.
auc_bootstrap = []
rs = RandomState(seed = 55)
bootstrap_auc(sklearn.base.clone(optuna_55), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77780377, 0.77871715])

In [348]:
# Mean of the bootstrap AUCs that bootstrap_auc appended to the global list.
np.mean(auc_bootstrap)

0.7781590530995661

In [349]:
# Keep the bootstrap AUCs obtained with optuna_55.  A copy is stored rather
# than an alias: bootstrap_auc appends to the global `auc_bootstrap`, so an
# alias would silently grow if the bootstrap cell were re-run without the list
# being reset first.
t_55 = list(auc_bootstrap)
print(t_55)

[0.7781246638700656, 0.7781246638700656, 0.7781246638700656, 0.7787171489010791, 0.7781246638700656, 0.7787171489010791, 0.7781040479007565, 0.7781246638700656, 0.7781040479007565, 0.7787171489010791, 0.7786965329317701, 0.7781040479007565, 0.7781040479007565, 0.7786965329317701, 0.777803771826037, 0.7781040479007565, 0.7787171489010791, 0.7786965329317701, 0.7781040479007565, 0.7778243877953461, 0.777803771826037, 0.7781040479007565, 0.7778243877953461, 0.7781040479007565, 0.7781246638700656, 0.7781040479007565, 0.7781040479007565, 0.777803771826037, 0.777803771826037, 0.7786965329317701, 0.7786965329317701, 0.7781040479007565, 0.7786965329317701, 0.7781040479007565, 0.7781246638700656, 0.7778243877953461, 0.7781040479007565, 0.7781040479007565, 0.7781246638700656, 0.7781040479007565, 0.7786965329317701, 0.7787171489010791, 0.7781246638700656, 0.7781246638700656, 0.7786965329317701, 0.7781246638700656, 0.7781040479007565, 0.7781040479007565, 0.777803771826037, 0.7781040479007565, 0.77

In [350]:
# Remove this iteration's split, study and validation score from the namespace
# (presumably so the next iteration cannot pick up stale objects -- confirm).
# Re-running this cell on its own raises NameError because the names are gone.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc