In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000):
    for b in range(nsamples):
        idx = rs.randint(X_train.shape[0], size=X_train.shape[0])
        clf.fit(X_train[idx], y_train[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test.ravel(), pred.ravel())
        auc_bootstrap.append(roc_auc)
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [4]:
class EarlyStoppingCallback(object):
    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [5]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [6]:
신혼가구 = pd.read_csv('신혼가구_변수추가.csv', encoding='cp949')
신혼가구.rename(columns = {'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?':'target'}, inplace=True)

In [7]:
신혼가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'Cat_주택 마련 예상 소요연수','Cat_남편/아내의 부모님과 동거 의향','Cat_가족계획 시 중요 고려 사항 1순위',
    'target'    
]

In [8]:
cat = 신혼가구.select_dtypes(include = 'object')
num = 신혼가구.select_dtypes(exclude = 'object')
num_신혼 = num.drop('target',axis=1)
target = 신혼가구.target

In [9]:
scaler=RobustScaler()
scaler.fit(num_신혼)
num_scaled_신혼=scaler.transform(num_신혼)
num_df_scaled_신혼=pd.DataFrame(data=num_scaled_신혼, columns=num_신혼.columns)

In [10]:
enc = OneHotEncoder()
enc.fit(cat)

X_cat = enc.transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(X_cat, columns= new_feature_names)

In [11]:
comp =pd.concat([num_df_scaled_신혼, target,cat2],axis=1)

In [12]:
X=comp.drop('target', axis = 1)
y=comp.target
X.shape

(6119, 221)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [14]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [15]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 96, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.778616682943931


In [16]:
optuna_0 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_0.fit(X_train, y_train)

In [17]:
optuna_proba_0 = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_proba_0)
print(auc_0)

0.777012666609706


In [18]:
X_train = X_train.values
y_train = y_train.values

In [19]:
auc_bootstrap = []

In [20]:
rs = RandomState(seed = 0)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76042646, 0.77867963])

In [21]:
t_0 = auc_bootstrap
print(t_0)

[0.7704977998974709, 0.7582450444292549, 0.7685086295283663, 0.7715417805878333, 0.767862482911825, 0.7703562884483937, 0.7751356373889269, 0.7828920668147641, 0.774639012303486, 0.770799512987013, 0.7755548316814764, 0.7692081766917294, 0.7696781015037595, 0.7664527084757349, 0.7638974923103213, 0.7727566430280246, 0.7761021872863978, 0.7654701384142173, 0.7776481331168832, 0.7680573949077238, 0.770967724709501, 0.7652511961722488, 0.7739528152768285, 0.7775146317498292, 0.7691307458988381, 0.7701640464798359, 0.7701400162337662, 0.7664687286397813, 0.7704657595693779, 0.7654194078947367, 0.7752718087833219, 0.7746096420027341, 0.7687329118250171, 0.7735416310663021, 0.766159005468216, 0.766695680963773, 0.767827772556391, 0.7771007775119618, 0.7665061090225563, 0.7746496924128503, 0.7731197667464115, 0.7701053058783323, 0.7697688824333561, 0.7772956895078607, 0.7713014781271361, 0.7697048017771702, 0.7678651529391661, 0.7755761919002051, 0.7698463132262474, 0.768431198735475, 0.76617

In [22]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [13]:
column_to_drop = '부채 중 임대 보증금의 비중'

In [14]:
if not column_to_drop.startswith('Cat_'):
    comp_1 = comp.drop(column_to_drop, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']
else:
    comp_1 = comp.drop(comp.filter(regex='^' + column_to_drop).columns, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']

print(X_1.shape)

(6119, 220)


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [26]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [27]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7768846150635648


In [28]:
optuna_1= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [29]:
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(auc_1)

0.7779391660970608


In [30]:
X_train = X_train.values
y_train = y_train.values

In [31]:
auc_bootstrap = []

In [32]:
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75836373, 0.77687703])

In [33]:
t_1 = auc_bootstrap
print(t_1)

[0.7673818779904307, 0.7633688268967874, 0.7689758843130554, 0.7737846035543404, 0.7662124060150375, 0.7686127605946684, 0.771531100478469, 0.7565522470950103, 0.7637159304511278, 0.7600873632946004, 0.7671789559125085, 0.7626772898154478, 0.762252755468216, 0.7742171479835953, 0.771699312200957, 0.7699878246753247, 0.777511961722488, 0.7660655545112782, 0.7605919984620643, 0.7700625854408749, 0.7641004143882434, 0.7701133159603554, 0.766693010936432, 0.764754571086808, 0.7597723000683527, 0.7689091336295284, 0.7711599666780589, 0.7680760850991114, 0.7638307416267942, 0.7684632390635681, 0.7644662081339714, 0.7663165370813397, 0.7830816387559809, 0.7680814251537935, 0.775739063568011, 0.7665087790498974, 0.7669386534518112, 0.7652698863636362, 0.7675340695488722, 0.7605118976418318, 0.7568005596377306, 0.7636384996582365, 0.7696220309295968, 0.7698009227614491, 0.7667197112098428, 0.7680680750170882, 0.7677209714627479, 0.7661082749487356, 0.7673711978810663, 0.760789580485304, 0.76567

In [34]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [15]:
column_to_drop_1 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [16]:
if not column_to_drop_1.startswith('Cat_'):
    comp_2 = comp_1.drop(column_to_drop_1, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']
else:
    comp_2 = comp_1.drop(comp_1.filter(regex='^' + column_to_drop_1).columns, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']

print(X_2.shape)

(6119, 219)


In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [38]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [39]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7773938012838171


In [40]:
optuna_2= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [41]:
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(auc_2)

0.7755201213260423


In [42]:
X_train = X_train.values
y_train = y_train.values

In [43]:
auc_bootstrap = []

In [44]:
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75940998, 0.777862  ])

In [45]:
t_2 = auc_bootstrap
print(t_2)

[0.7642819762474367, 0.7762143284347232, 0.765625, 0.7701079759056733, 0.7728527640123035, 0.7684418788448395, 0.7723481288448393, 0.7676702409432673, 0.7687168916609706, 0.7729862653793574, 0.7711813268967873, 0.7681561859193439, 0.7746977529049898, 0.7642072154818865, 0.7694057587149692, 0.7678277725563909, 0.759614768455229, 0.7696540712576897, 0.7746924128503074, 0.7622153750854408, 0.7751596676349966, 0.767093515037594, 0.7666796607997266, 0.7664794087491456, 0.7789030459671908, 0.7728607740943266, 0.7749166951469583, 0.7744921607997266, 0.7728287337662338, 0.7630457535885167, 0.7662177460697197, 0.7751623376623377, 0.7671656057758032, 0.7774852614490773, 0.7720303955912509, 0.7781554383116882, 0.7701880767259057, 0.7652592062542721, 0.7733413790157211, 0.7687863123718386, 0.7741236970266575, 0.7727432928913192, 0.7702414772727272, 0.76799865430622, 0.7656490302460698, 0.7728180536568694, 0.7734561901913876, 0.7720651059466848, 0.772900824504443, 0.7657958817498292, 0.777891105604

In [46]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [17]:
column_to_drop_2 = 'Cat_가구주 동거 여부'

In [18]:
if not column_to_drop_2.startswith('Cat_'):
    comp_3 = comp_2.drop(column_to_drop_2, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']
else:
    comp_3 = comp_2.drop(comp_2.filter(regex='^' + column_to_drop_2).columns, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']

print(X_3.shape)

(6119, 217)


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [50]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [51]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.781463117388292


In [52]:
optuna_3 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [53]:
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(auc_3)

0.7760995172590567


In [54]:
X_train = X_train.values
y_train = y_train.values

In [55]:
auc_bootstrap = []

In [56]:
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76096307, 0.77827372])

In [57]:
t_3 = auc_bootstrap
print(t_3)

[0.7713895890293916, 0.7704951298701299, 0.7670427845181135, 0.7759126153451812, 0.7649094326725906, 0.7707461124401914, 0.7747057629870129, 0.7768871753246753, 0.7685833902939165, 0.777311709671907, 0.7734642002734108, 0.7712747778537252, 0.7631979451469584, 0.7746577024948736, 0.7731491370471634, 0.7590594027682843, 0.7727886833561175, 0.7686367908407381, 0.7680814251537936, 0.7745295411825017, 0.7654514482228297, 0.7697902426520848, 0.7734561901913874, 0.7777228938824334, 0.7698062628161313, 0.7763237995557074, 0.7686100905673274, 0.7730850563909775, 0.7615585483595352, 0.7692722573479153, 0.770532510252905, 0.7713762388926863, 0.7699290840738209, 0.771501730177717, 0.7702735176008202, 0.7748793147641833, 0.7732132177033493, 0.7679692840054683, 0.7752531185919344, 0.7694351290157211, 0.7712907980177717, 0.7725457108680793, 0.7721932672590567, 0.7717126623376624, 0.7800538277511961, 0.77000117481203, 0.773034325871497, 0.7776347829801776, 0.7665675196514012, 0.772639161825017, 0.7642

In [58]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [19]:
column_to_drop_3 = '소득 중 재산소득의 비중(월평균)'

In [20]:
if not column_to_drop_3.startswith('Cat_'):
    comp_4 = comp_3.drop(column_to_drop_3, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']
else:
    comp_4 = comp_3.drop(comp_3.filter(regex='^' + column_to_drop_3).columns, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']

print(X_4.shape)

(6119, 216)


In [61]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [62]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [63]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 111, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7801400679471449


In [64]:
optuna_4= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [65]:
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(auc_4)

0.7805477828092959


In [66]:
X_train = X_train.values
y_train = y_train.values

In [67]:
auc_bootstrap = []

In [68]:
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76207887, 0.77985391])

In [69]:
t_4 = auc_bootstrap
print(t_4)

[0.7704390592959671, 0.7702761876281614, 0.7689438439849625, 0.7735576512303486, 0.7653980476760082, 0.7739074248120301, 0.7697688824333561, 0.769598000683527, 0.7761128673957621, 0.7661136150034178, 0.7706446514012304, 0.7756776529391661, 0.7757684338687629, 0.7768898453520163, 0.7721665669856461, 0.7778563952494875, 0.769795582706767, 0.778814935064935, 0.7662738166438824, 0.7629576426862611, 0.7677823820915927, 0.7686528110047848, 0.7716058612440191, 0.7670801649008886, 0.7635130083732058, 0.7767937243677374, 0.7739127648667122, 0.7729115046138072, 0.772999615516063, 0.7752904989747095, 0.7638280715994532, 0.7699958347573479, 0.7755922120642514, 0.7703402682843473, 0.7714216293574846, 0.7625464584757349, 0.7689491840396445, 0.767961273923445, 0.7796640037593985, 0.7665888798701299, 0.7696567412850308, 0.7692268668831168, 0.7689892344497609, 0.7693870685235816, 0.7706686816473001, 0.7678811731032126, 0.7740489362611074, 0.7626132091592618, 0.7728394138755981, 0.7771248077580313, 0.77

In [70]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [21]:
column_to_drop_4 = '소득 중 사적이전소득의 비중(월평균)'

In [22]:
if not column_to_drop_4.startswith('Cat_'):
    comp_5 = comp_4.drop(column_to_drop_4, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']
else:
    comp_5 = comp_4.drop(comp_4.filter(regex='^' + column_to_drop_4).columns, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']

print(X_5.shape)

(6119, 215)


In [73]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [74]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [75]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 93, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 5}
0.7809581048255829


In [76]:
optuna_5 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [77]:
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(auc_5)

0.7758512047163362


In [78]:
X_train = X_train.values
y_train = y_train.values

In [79]:
auc_bootstrap = []

In [80]:
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76219889, 0.77980672])

In [81]:
t_5 = auc_bootstrap
print(t_5)

[0.7699130639097745, 0.7738673744019139, 0.7730583561175668, 0.7664473684210527, 0.770764802631579, 0.7670054041353384, 0.7707461124401914, 0.7700198650034177, 0.7692135167464116, 0.7739261150034178, 0.7724709501025291, 0.767229686431989, 0.7742304981203008, 0.7771461679767602, 0.7790632476076556, 0.7721238465481886, 0.7749781057758032, 0.7726071214969242, 0.7680974453178402, 0.7725457108680793, 0.7756055622009569, 0.7707247522214629, 0.7623168361244018, 0.7727005724538618, 0.7654541182501708, 0.7714750299043063, 0.7711679767600821, 0.7671495856117567, 0.7622607655502394, 0.7735362910116199, 0.770265507518797, 0.7676809210526315, 0.7668345223855093, 0.7673311474709501, 0.7739901956596036, 0.7655715994531783, 0.7740382561517429, 0.7720170454545454, 0.7701293361244019, 0.7762063183526999, 0.7619323521872865, 0.7764920112781956, 0.7720330656185919, 0.7767483339029393, 0.7674406185919345, 0.7748472744360902, 0.7687596120984279, 0.7740516062884483, 0.7763077793916611, 0.7736083817498292, 0.

In [82]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [23]:
column_to_drop_5 = 'Cat_기초생활보장 수급가구 여부'

In [24]:
if not column_to_drop_5.startswith('Cat_'):
    comp_6 = comp_5.drop(column_to_drop_5, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']
else:
    comp_6 = comp_5.drop(comp_5.filter(regex='^' + column_to_drop_5).columns, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']

print(X_6.shape)

(6119, 213)


In [85]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [86]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [87]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 71, 'max_depth': 9, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7775023163799364


In [88]:
optuna_6 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [89]:
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(auc_6)

0.7757043532125769


In [90]:
X_train = X_train.values
y_train = y_train.values

In [91]:
auc_bootstrap = []

In [92]:
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75949195, 0.77746931])

In [93]:
t_6 = auc_bootstrap
print(t_6)

[0.7609764823991797, 0.7709864149008886, 0.7616973897812713, 0.7665568395420369, 0.7696994617224882, 0.7720117053998633, 0.7720223855092277, 0.7669012730690362, 0.7835996240601505, 0.7702227870813398, 0.7685860603212576, 0.7711065661312372, 0.7585547676008203, 0.7670134142173616, 0.7730503460355435, 0.773298658578264, 0.7599592019822283, 0.7691120557074504, 0.7686127605946685, 0.7690773453520164, 0.7592382946001368, 0.7686741712235132, 0.7652725563909775, 0.7637693309979494, 0.7661376452494875, 0.7694858595352017, 0.7739768455228982, 0.7668024820574162, 0.7694244489063569, 0.7644448479152426, 0.7671469155844156, 0.7762169984620643, 0.7721932672590567, 0.7708689336978811, 0.7662043959330144, 0.7658786525974026, 0.76809744531784, 0.7653019266917294, 0.7773143796992481, 0.7621966848940532, 0.7708502435064934, 0.7614464072112099, 0.7643834372863978, 0.7608803614149009, 0.7609658022898155, 0.7704443993506493, 0.7689331638755982, 0.7698116028708134, 0.7687756322624744, 0.7631952751196172, 0.

In [94]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [25]:
column_to_drop_6 = '자산 중 부동산 자산의 비중'

In [26]:
if not column_to_drop_6.startswith('Cat_'):
    comp_7 = comp_6.drop(column_to_drop_6, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']
else:
    comp_7 = comp_6.drop(comp_6.filter(regex='^' + column_to_drop_6).columns, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']

print(X_7.shape)

(6119, 212)


In [97]:
if not column_to_drop_6.startswith('Cat_'):
    comp_7 = comp_6.drop(column_to_drop_6, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']
else:
    comp_7 = comp_6.drop(comp_6.filter(regex='^' + column_to_drop_6).columns, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']

print(X_7.shape)

(6119, 212)


In [98]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [99]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [100]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 149, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 5}
0.7800315528510255


In [101]:
optuna_7 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [102]:
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(auc_7)

0.7748472744360904


In [103]:
X_train = X_train.values
y_train = y_train.values

In [104]:
auc_bootstrap = []

In [105]:
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7616755 , 0.77873777])

In [106]:
t_7 = auc_bootstrap
print(t_7)

[0.7751222872522215, 0.7748873248462065, 0.7604211167122352, 0.7757150333219412, 0.7756162423103214, 0.7637693309979494, 0.7730209757347916, 0.7714162893028025, 0.7677797120642516, 0.7674860090567327, 0.7750288362952837, 0.7751863679084074, 0.7747645035885168, 0.7678891831852357, 0.7740622863978127, 0.7742865686944634, 0.7715017301777171, 0.7718968942241968, 0.7662150760423786, 0.7716031912166781, 0.7701426862611074, 0.7750261662679426, 0.7711759868421053, 0.7677369916267942, 0.7631098342447027, 0.7668612226589201, 0.777138157894737, 0.7666502904989747, 0.7664153280929598, 0.7663218771360218, 0.770265507518797, 0.7737472231715653, 0.7731785073479152, 0.7681188055365687, 0.7751997180451128, 0.7656036397812713, 0.7683777981886535, 0.7728794642857144, 0.7684979494190021, 0.776342489747095, 0.7653633373205742, 0.7721532168489406, 0.7601220736500343, 0.7596307886192754, 0.7685833902939166, 0.7792367993848257, 0.7739234449760766, 0.7725884313055366, 0.7685032894736843, 0.7668211722488039, 0.

In [107]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [27]:
column_to_drop_7 = 'Cat_가구주 주민등록상 등재 여부'

In [28]:
if not column_to_drop_7.startswith('Cat_'):
    comp_8 = comp_7.drop(column_to_drop_7, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']
else:
    comp_8 = comp_7.drop(comp_7.filter(regex='^' + column_to_drop_7).columns, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']

print(X_8.shape)

(6119, 210)


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [30]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [31]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 111, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7806784697701983


In [32]:
optuna_8 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [33]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(auc_8)

0.7819388670539986


In [34]:
X_train = X_train.values
y_train = y_train.values

In [35]:
auc_bootstrap = []

In [36]:
rs = RandomState(seed = 8)
bootstrap_auc(optuna_8, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76200157, 0.77978349])

In [37]:
t_8 = auc_bootstrap
print(t_8)

[0.7745829417293233, 0.7802781100478469, 0.7706099410457962, 0.7710104451469582, 0.7753679297676008, 0.7654327580314422, 0.7777522641831851, 0.7719716549897471, 0.7749460654477103, 0.7709864149008886, 0.7791620386192755, 0.7729061645591251, 0.7704123590225564, 0.773538961038961, 0.7752904989747096, 0.7785025418660287, 0.7740249060150377, 0.7663058569719754, 0.7662844967532467, 0.7725403708133971, 0.7752985090567328, 0.7667891319207109, 0.7701533663704716, 0.7762810791182503, 0.7669973940533151, 0.766159005468216, 0.7782515592959672, 0.759046052631579, 0.7718061132946002, 0.770663341592618, 0.7713628887559809, 0.7677743720095693, 0.7736484321599453, 0.771902234278879, 0.7689331638755981, 0.7736778024606973, 0.7723481288448394, 0.7739261150034177, 0.7632673658578264, 0.7757043532125769, 0.7617774906015038, 0.773637752050581, 0.7687863123718387, 0.7695846505468217, 0.7753652597402598, 0.7741290370813397, 0.7693042976760082, 0.7718114533492823, 0.7658653024606971, 0.770831553315106, 0.7782

In [38]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [39]:
column_to_drop_8 = 'Cat_가구주 장애 여부'

In [40]:
if not column_to_drop_8.startswith('Cat_'):
    comp_9 = comp_8.drop(column_to_drop_8, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']
else:
    comp_9 = comp_8.drop(comp_8.filter(regex='^' + column_to_drop_8).columns, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']

print(X_9.shape)

(6119, 208)


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [42]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [43]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 133, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7809163682501523


In [44]:
optuna_9 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [45]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(auc_9)

0.7791967489747095


In [46]:
X_train = X_train.values
y_train = y_train.values

In [47]:
auc_bootstrap = []

In [48]:
rs = RandomState(seed = 9)
bootstrap_auc(optuna_9, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7618654 , 0.77940735])

In [49]:
t_9 = auc_bootstrap
print(t_9)

[0.7712187072795625, 0.7733253588516746, 0.7705485304169515, 0.7722573479152425, 0.7748098940533151, 0.765956083390294, 0.7734909005468216, 0.7721905972317157, 0.7600927033492821, 0.7746283321941216, 0.7737151828434724, 0.7695205698906358, 0.7728100435748464, 0.7681321556732742, 0.7731544771018455, 0.7740142259056733, 0.7746790627136022, 0.7743105989405332, 0.7728367438482571, 0.7674673188653452, 0.7652458561175667, 0.7664874188311688, 0.7723908492822966, 0.7681161355092277, 0.7749140251196173, 0.7743800196514011, 0.7648613721804511, 0.772235987696514, 0.7726872223171565, 0.7740943267259057, 0.7760728169856459, 0.7747244531784006, 0.7684926093643198, 0.773207877648667, 0.7750101461038961, 0.7701400162337663, 0.7718114533492824, 0.7729702452153111, 0.7741130169172934, 0.7745482313738893, 0.7769592660628846, 0.7711492865686944, 0.7722146274777854, 0.7752264183185235, 0.7667490815105946, 0.7730289858168147, 0.7718942241968558, 0.7674032382091592, 0.7648159817156528, 0.7629095821941216, 0.

In [50]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [51]:
column_to_drop_9 = '소득 대비 주거관리비의 비율'

In [52]:
if not column_to_drop_9.startswith('Cat_'):
    comp_10 = comp_9.drop(column_to_drop_9, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']
else:
    comp_10 = comp_9.drop(comp_9.filter(regex='^' + column_to_drop_9).columns, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']

print(X_10.shape)

(6119, 207)


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [54]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [55]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 176, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 7}
0.7773437173933005


In [56]:
optuna_10 = ExtraTreesClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [57]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(auc_10)

0.7778724154135338


In [58]:
X_train = X_train.values
y_train = y_train.values

In [59]:
auc_bootstrap = []

In [60]:
rs = RandomState(seed = 10)
bootstrap_auc(optuna_10, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76211445, 0.77938212])

In [61]:
t_10 = auc_bootstrap
print(t_10)

[0.7795652127477786, 0.7669146232057416, 0.7709757347915243, 0.7620311431989064, 0.7675367395762133, 0.7756803229665071, 0.7617588004101163, 0.7657451512303486, 0.7689465140123035, 0.7701506963431306, 0.7713174982911823, 0.7732799683868763, 0.7737231929254955, 0.7620044429254956, 0.7751222872522214, 0.7702548274094327, 0.771101226076555, 0.778577302631579, 0.7837891960013671, 0.7761902981886535, 0.7705485304169514, 0.7718141233766234, 0.7784838516746411, 0.7716966421736158, 0.771405609193438, 0.7678491327751196, 0.7653072667464114, 0.7735923615857826, 0.7745001708817498, 0.7709570446001366, 0.7652912465823649, 0.7662150760423787, 0.7694751794258373, 0.7733120087149692, 0.7772743292891318, 0.7721265165755297, 0.7687596120984279, 0.7708929639439508, 0.7761689379699247, 0.7708315533151059, 0.765851952323992, 0.77096505468216, 0.7746443523581682, 0.7710932159945318, 0.7765160415242653, 0.7699424342105263, 0.771803443267259, 0.7722973983253589, 0.7725296907040329, 0.7735069207108681, 0.7722

In [62]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [63]:
column_to_drop_10 = '부채 중 비금융기관 대출금의 비중'

In [64]:
if not column_to_drop_10.startswith('Cat_'):
    comp_11 = comp_10.drop(column_to_drop_10, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']
else:
    comp_11 = comp_10.drop(comp_10.filter(regex='^' + column_to_drop_10).columns, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']

print(X_11.shape)

(6119, 206)


In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [66]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [67]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 78, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.7784914732176396


In [68]:
optuna_11 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [69]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(auc_11)

0.7776961936090226


In [70]:
X_train = X_train.values
y_train = y_train.values

In [71]:
auc_bootstrap = []

In [72]:
rs = RandomState(seed = 11)
bootstrap_auc(optuna_11, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75942426, 0.77773644])

In [73]:
t_11 = auc_bootstrap
print(t_11)

[0.7700118549213943, 0.7677022812713602, 0.7765160415242652, 0.7663378973000683, 0.7675821300410115, 0.772636491797676, 0.7787455143540669, 0.7736484321599453, 0.7702040968899522, 0.7709463644907725, 0.7596227785372522, 0.7716405715994532, 0.7793169002050582, 0.7708929639439507, 0.7693229878673957, 0.7666155801435407, 0.7750955869788105, 0.774238508202324, 0.765825252050581, 0.7724175495557075, 0.7710878759398495, 0.766159005468216, 0.7768818352699932, 0.7696620813397129, 0.7704604195146958, 0.7614036867737526, 0.7626452494873548, 0.7673017771701983, 0.7747111030416951, 0.7672510466507176, 0.7651096847231715, 0.7699798145933016, 0.7661696855775804, 0.7568913405673274, 0.770169386534518, 0.7704630895420368, 0.7762143284347233, 0.7670454545454545, 0.7723027383800409, 0.7613849965823649, 0.7686581510594668, 0.766060214456596, 0.7668505425495556, 0.7710718557758032, 0.7696514012303487, 0.7664527084757348, 0.7679879741968558, 0.7642873163021191, 0.7659934637730691, 0.7671629357484621, 0.772

In [74]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [75]:
column_to_drop_11 = '소득 대비 생활비의 비율'

In [76]:
if not column_to_drop_11.startswith('Cat_'):
    comp_12 = comp_11.drop(column_to_drop_11, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']
else:
    comp_12 = comp_11.drop(comp_11.filter(regex='^' + column_to_drop_11).columns, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']

print(X_12.shape)

(6119, 205)


In [77]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [78]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [79]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7779989816275596


In [80]:
optuna_12 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [81]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(auc_12)

0.7733226888243336


In [82]:
X_train = X_train.values
y_train = y_train.values

In [83]:
auc_bootstrap = []

In [84]:
rs = RandomState(seed = 12)
bootstrap_auc(optuna_12, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76040897, 0.77851122])

In [85]:
t_12 = auc_bootstrap
print(t_12)

[0.7713869190020506, 0.7702227870813397, 0.7733627392344498, 0.7674779989747095, 0.769632711038961, 0.7700465652768284, 0.7653606672932332, 0.7681748761107313, 0.7604104366028708, 0.7739581553315105, 0.7714616797676006, 0.7655502392344498, 0.7673978981544771, 0.7616786995898838, 0.7755681818181818, 0.7721932672590568, 0.7739127648667122, 0.7632967361585782, 0.7676034902597402, 0.7781340780929598, 0.7725083304853042, 0.7709570446001367, 0.7701400162337662, 0.7685193096377307, 0.7703269181476418, 0.769798252734108, 0.7725510509227616, 0.7733547291524264, 0.7781100478468901, 0.7671415755297335, 0.7716672718728641, 0.7634142173615859, 0.7688103426179084, 0.774873974709501, 0.7658492822966507, 0.771701982228298, 0.772639161825017, 0.765857292378674, 0.7744093899521531, 0.7703562884483938, 0.770695381920711, 0.7764359407040329, 0.7635664089200274, 0.7814662722146274, 0.7657932117224879, 0.7651337149692412, 0.768260316985646, 0.7726124615516063, 0.7686928614149009, 0.7685673701298701, 0.78334

In [86]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [87]:
column_to_drop_12 = '현재 무주택 기간(총 개월)'

In [88]:
if not column_to_drop_12.startswith('Cat_'):
    comp_13 = comp_12.drop(column_to_drop_12, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']
else:
    comp_13 = comp_12.drop(comp_12.filter(regex='^' + column_to_drop_12).columns, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']

print(X_13.shape)

(6119, 204)


In [89]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [90]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [91]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 161, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 3}
0.7803570981393835


In [92]:
optuna_13 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [93]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(auc_13)

0.7778537252221462


In [94]:
X_train = X_train.values
y_train = y_train.values

In [95]:
auc_bootstrap = []

In [96]:
rs = RandomState(seed = 13)
bootstrap_auc(optuna_13, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76288035, 0.78016971])

In [97]:
t_13 = auc_bootstrap
print(t_13)

[0.773867374401914, 0.7648800623718388, 0.7754346804511278, 0.7659854536910459, 0.7734935705741627, 0.7744894907723856, 0.7723481288448393, 0.7673418275803144, 0.7727566430280247, 0.7734001196172249, 0.7666022300068351, 0.7666049000341764, 0.7715391105604921, 0.773803293745728, 0.7800458176691729, 0.7646584501025291, 0.7647679212235132, 0.7701373462064252, 0.7703295881749828, 0.7721879272043746, 0.7738753844839372, 0.7737632433356116, 0.7766415328092959, 0.7703482783663705, 0.7742598684210527, 0.7661536654135339, 0.7754186602870814, 0.7741450572453862, 0.7762250085440874, 0.7738032937457279, 0.7722520078605605, 0.7800057672590568, 0.7700171949760766, 0.7689438439849623, 0.7856128246753247, 0.7657398111756665, 0.7679933142515379, 0.7717206724196856, 0.7758351845522897, 0.7730770463089542, 0.772137196684894, 0.7740916566985646, 0.7663966379015721, 0.772270698051948, 0.7625811688311689, 0.7711252563226245, 0.7750288362952836, 0.7697822325700614, 0.7735282809295967, 0.7679586038961039, 0.7

In [98]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [99]:
column_to_drop_13 = 'Cat_현재 주택의 위치'

In [100]:
if not column_to_drop_13.startswith('Cat_'):
    comp_14 = comp_13.drop(column_to_drop_13, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']
else:
    comp_14 = comp_13.drop(comp_13.filter(regex='^' + column_to_drop_13).columns, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']

print(X_14.shape)

(6119, 201)


In [101]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [102]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [103]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 53, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7798729538643895


In [104]:
optuna_14 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [105]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(auc_14)

0.7807453648325359


In [106]:
X_train = X_train.values
y_train = y_train.values

In [107]:
auc_bootstrap = []

In [108]:
rs = RandomState(seed = 14)
bootstrap_auc(optuna_14, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75893618, 0.77822553])

In [109]:
t_14 = auc_bootstrap
print(t_14)

[0.7744120599794942, 0.7680520548530416, 0.7698383031442242, 0.7677102913533835, 0.7654274179767601, 0.7668345223855092, 0.7732105476760082, 0.7583571855775801, 0.7670988550922762, 0.7739688354408749, 0.7747858638072455, 0.7682896872863978, 0.7610192028366372, 0.772366819036227, 0.7729115046138072, 0.7663325572453861, 0.7589392515379358, 0.7669813738892686, 0.7719022342788789, 0.7603757262474367, 0.7661696855775804, 0.7712587576896787, 0.7761288875598086, 0.7701720565618593, 0.7656436901913876, 0.7573986457621327, 0.7657024307928912, 0.7678170924470267, 0.7609150717703349, 0.7806599239576213, 0.7636625299043062, 0.7720090353725222, 0.7770180066643882, 0.7614063568010936, 0.7637426307245386, 0.7654033877306903, 0.7724923103212576, 0.7629736628503077, 0.7707728127136022, 0.7638601119275461, 0.7724843002392345, 0.7614250469924811, 0.7765053614149009, 0.7700252050580998, 0.7775520121326043, 0.7647839413875598, 0.7706206211551606, 0.767464648838004, 0.7725377007860561, 0.7708716037252221, 0

In [110]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [111]:
column_to_drop_14 = '총 이사 횟수'

In [112]:
if not column_to_drop_14.startswith('Cat_'):
    comp_15 = comp_14.drop(column_to_drop_14, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']
else:
    comp_15 = comp_14.drop(comp_14.filter(regex='^' + column_to_drop_14).columns, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']

print(X_15.shape)

(6119, 200)


In [113]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [114]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [115]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7802026728102905


In [116]:
optuna_15 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [117]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(auc_15)

0.7770954374572796


In [118]:
X_train = X_train.values
y_train = y_train.values

In [119]:
auc_bootstrap = []

In [120]:
rs = RandomState(seed = 15)
bootstrap_auc(optuna_15, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7627082 , 0.77898442])

In [121]:
t_15 = auc_bootstrap
print(t_15)

[0.7733173487696514, 0.7617668104921393, 0.7688503930280247, 0.7616546693438141, 0.7700572453861927, 0.7783637004442925, 0.7736564422419685, 0.773069036226931, 0.7625357783663704, 0.7695606203007519, 0.7734455100820232, 0.7703536184210527, 0.7645730092276144, 0.7715551307245387, 0.7665728597060834, 0.7694778494531784, 0.7748312542720437, 0.7776668233082706, 0.7705084800068353, 0.7729862653793576, 0.7805290926179084, 0.7675073692754613, 0.7738593643198908, 0.7786146830143541, 0.7738620343472317, 0.7727940234107997, 0.7727833433014354, 0.7657905416951469, 0.7722546778879016, 0.7743639994873548, 0.7606881194463432, 0.7731304468557758, 0.7729622351332877, 0.7662017259056734, 0.7718434936773753, 0.7665621795967191, 0.7699504442925496, 0.7584239362611073, 0.7750555365686945, 0.7690479750512645, 0.7733280288790157, 0.7780940276828434, 0.7699210739917977, 0.7741263670539986, 0.7647225307587149, 0.7809696471291866, 0.7696861115857827, 0.7704470693779906, 0.7727139225905674, 0.7744441003075873, 

In [122]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [123]:
column_to_drop_15 = '현재 주택 거주 기간(총 개월)'

In [124]:
if not column_to_drop_15.startswith('Cat_'):
    comp_16 = comp_15.drop(column_to_drop_15, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']
else:
    comp_16 = comp_15.drop(comp_15.filter(regex='^' + column_to_drop_15).columns, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']

print(X_16.shape)

(6119, 199)


In [125]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [126]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [127]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 140, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.77957245052129


In [128]:
optuna_16 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [129]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(auc_16)

0.7803608808954203


In [130]:
X_train = X_train.values
y_train = y_train.values

In [131]:
auc_bootstrap = []

In [132]:
rs = RandomState(seed = 16)
bootstrap_auc(optuna_16, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76252103, 0.7792382 ])

In [133]:
t_16 = auc_bootstrap
print(t_16)

[0.7735896915584415, 0.7727219326725906, 0.7613102358168147, 0.7744814806903623, 0.7742652084757347, 0.7799977571770336, 0.7765534219070402, 0.7721905972317158, 0.7632166353383458, 0.7707167421394394, 0.772732612781955, 0.7628428315105946, 0.769029284859877, 0.7698383031442243, 0.7740809765892002, 0.774169087491456, 0.7738139738550923, 0.775071556732741, 0.7700252050580997, 0.7807079844497609, 0.771165306732741, 0.7655288790157211, 0.7739821855775804, 0.775036846377307, 0.769066665242652, 0.764618399692413, 0.7726979024265209, 0.7755948820915927, 0.7707514524948735, 0.7706419813738893, 0.7792982100136706, 0.7706900418660287, 0.765793211722488, 0.7684231886534518, 0.7751516575529733, 0.7682175965481886, 0.7733654092617908, 0.7693576982228298, 0.7739474752221462, 0.7745882817840055, 0.7706793617566643, 0.770465759569378, 0.7751890379357484, 0.7680734150717703, 0.7688156826725905, 0.763822731544771, 0.7709089841079972, 0.7658519523239917, 0.765325956937799, 0.767432608509911, 0.7690853554

In [134]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [135]:
column_to_drop_16 = '소득 중 근로/사업소득의 비중(월평균)'

In [136]:
if not column_to_drop_16.startswith('Cat_'):
    comp_17 = comp_16.drop(column_to_drop_16, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']
else:
    comp_17 = comp_16.drop(comp_16.filter(regex='^' + column_to_drop_16).columns, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']

print(X_17.shape)

(6119, 198)


In [137]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [138]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [139]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7779864606549304


In [140]:
optuna_17 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [141]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(auc_17)

0.7747324632604238


In [142]:
X_train = X_train.values
y_train = y_train.values

In [143]:
auc_bootstrap = []

In [144]:
rs = RandomState(seed = 17)
bootstrap_auc(optuna_17, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75940043, 0.77723808])

In [145]:
t_17 = auc_bootstrap
print(t_17)

[0.7755601717361587, 0.7755895420369106, 0.7699451042378674, 0.7679052033492824, 0.7636491797676008, 0.7618202110389609, 0.7678010722829801, 0.7617481203007519, 0.7677877221462748, 0.7671255553656869, 0.7696460611756665, 0.7721158364661654, 0.7598604109706083, 0.7695606203007519, 0.7698676734449761, 0.7747511534518113, 0.7697955827067671, 0.773368079289132, 0.771501730177717, 0.7712881279904307, 0.7623622265892003, 0.7724041994190021, 0.7675367395762133, 0.7699451042378674, 0.7754507006151744, 0.7713869190020506, 0.7754106502050582, 0.7592409646274778, 0.7748980049555707, 0.760885701469583, 0.7680787551264525, 0.7684525589542036, 0.7707781527682844, 0.7660094839371155, 0.7697955827067668, 0.7718621838687627, 0.7703108979835955, 0.7696433911483254, 0.7608990516062883, 0.7662577964798358, 0.7586962790498974, 0.7619724025974026, 0.7655342190704033, 0.7687035415242653, 0.7709864149008887, 0.7697234919685577, 0.7691974965823649, 0.7629175922761449, 0.7665434894053316, 0.762653259569378, 0.7

In [146]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [147]:
column_to_drop_17 = '가구주 나이'

In [148]:
if not column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(column_to_drop_17, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']
else:
    comp_18 = comp_17.drop(comp_17.filter(regex='^' + column_to_drop_17).columns, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']

print(X_18.shape)

(6119, 197)


In [149]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [150]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [151]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 184, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7792302106027595


In [152]:
optuna_18 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [153]:
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(auc_18)

0.779882946001367


In [154]:
X_train = X_train.values
y_train = y_train.values

In [155]:
auc_bootstrap = []

In [156]:
rs = RandomState(seed = 18)
bootstrap_auc(optuna_18, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76260226, 0.77943071])

In [157]:
t_18 = auc_bootstrap
print(t_18)

[0.7725377007860561, 0.7755308014354066, 0.7780966977101845, 0.767293767088175, 0.7704577494873547, 0.7738193139097744, 0.7738566942925496, 0.7713522086466165, 0.7770687371838687, 0.77420379784689, 0.7701079759056733, 0.7670240943267259, 0.7676195104237866, 0.7736430921052632, 0.7690132646958304, 0.7677423316814764, 0.7697395121326043, 0.7682122564935064, 0.7683297376965139, 0.7737819335269993, 0.7717527127477786, 0.7706820317840054, 0.774566921565277, 0.7625784988038278, 0.7811859193438141, 0.7740756365345182, 0.7646611201298701, 0.7728153836295284, 0.7717794130211895, 0.7702281271360218, 0.7726872223171566, 0.7724923103212575, 0.7678678229665071, 0.7713468685919344, 0.7734989106288448, 0.764826661825017, 0.7727272727272727, 0.7702147769993165, 0.7687409219070404, 0.7718434936773753, 0.772804703520164, 0.7722439977785371, 0.7739020847573479, 0.7799950871496925, 0.7747351332877648, 0.7656703904647983, 0.7694351290157212, 0.7633928571428572, 0.7730743762816132, 0.7756002221462748, 0.770

In [158]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [159]:
column_to_drop_18 = '자산 중 기타자산의 비중'

In [160]:
if not column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(column_to_drop_18, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']
else:
    comp_19 = comp_18.drop(comp_18.filter(regex='^' + column_to_drop_18).columns, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']

print(X_19.shape)

(6119, 196)


In [161]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [162]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [163]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 74, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7826609571031478


In [164]:
optuna_19 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [165]:
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(auc_19)

0.7730423359535201


In [166]:
X_train = X_train.values
y_train = y_train.values

In [167]:
auc_bootstrap = []

In [168]:
rs = RandomState(seed = 19)
bootstrap_auc(optuna_19, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76087299, 0.77814075])

In [169]:
t_19 = auc_bootstrap
print(t_19)

[0.7727085825358853, 0.7630083732057417, 0.7745509014012304, 0.7691974965823649, 0.7608002605946685, 0.7675874700956937, 0.773719187884484, 0.7761582578605605, 0.7638761320915926, 0.7688984535201641, 0.7681348257006152, 0.7730236457621326, 0.7758325145249487, 0.772268028024607, 0.7678411226930963, 0.777210248632946, 0.7690319548872181, 0.7663298872180452, 0.7735736713943951, 0.770625961209843, 0.7693096377306903, 0.7693657083048531, 0.7673765379357484, 0.7831190191387559, 0.7708742737525631, 0.7615532083048531, 0.7765347317156527, 0.7739848556049214, 0.7672643967874231, 0.7744334201982227, 0.7751650076896787, 0.7625437884483938, 0.7774639012303486, 0.767696941216678, 0.7716939721462747, 0.7699824846206426, 0.7744627904989747, 0.7690880254613807, 0.7678945232399179, 0.7735816814764183, 0.7754693908065617, 0.7734268198906358, 0.769096035543404, 0.7688851033834587, 0.7741210269993165, 0.7652004656527683, 0.7685486799384824, 0.7677183014354068, 0.7710117801606289, 0.7693924085782639, 0.764

In [170]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [171]:
column_to_drop_19 = '자산 중 금융자산의 비중'

In [172]:
if not column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(column_to_drop_19, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']
else:
    comp_20 = comp_19.drop(comp_19.filter(regex='^' + column_to_drop_19).columns, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']

print(X_20.shape)

(6119, 195)


In [173]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [174]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [175]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 56, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 5}
0.7780741074633344


In [176]:
optuna_20 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [177]:
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(auc_20)

0.7706473214285714


In [178]:
X_train = X_train.values
y_train = y_train.values

In [179]:
auc_bootstrap = []

In [180]:
rs = RandomState(seed = 20)
bootstrap_auc(optuna_20, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75862672, 0.77685914])

In [181]:
t_20 = auc_bootstrap
print(t_20)

[0.7708021830143541, 0.7727940234107997, 0.77615692284689, 0.7683404178058784, 0.761382326555024, 0.775744403622693, 0.7738353340738209, 0.7726204716336296, 0.7716859620642516, 0.7716085312713602, 0.7681722060833902, 0.7700572453861928, 0.7650055536568695, 0.7680333646616542, 0.7691200657894737, 0.7566003075871497, 0.7510092703349283, 0.7691788063909775, 0.7751917079630896, 0.7593050452836638, 0.763021723342447, 0.7650215738209158, 0.7724709501025291, 0.7671549256664387, 0.7691574461722488, 0.7702174470266576, 0.7600900333219412, 0.7639575679254955, 0.7640817241968558, 0.7756643028024607, 0.7664633885850992, 0.7695446001367054, 0.7637933612440191, 0.766171020591251, 0.7674379485645932, 0.7714002691387559, 0.7701266660970609, 0.769897043745728, 0.7721452067669174, 0.7682976973684212, 0.7661536654135338, 0.7682389567669172, 0.7712721078263841, 0.7643246966848941, 0.7627387004442924, 0.7599084714627478, 0.7723828392002734, 0.7751676777170199, 0.7698196129528366, 0.7613716464456597, 0.7638

In [182]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [183]:
column_to_drop_20 = '소득 중 정부 보조금의 비중(월평균)'

In [184]:
if not column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(column_to_drop_20, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']
else:
    comp_21 = comp_20.drop(comp_20.filter(regex='^' + column_to_drop_20).columns, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']

print(X_21.shape)

(6119, 194)


In [185]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [186]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [187]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7768261838579622


In [188]:
optuna_21 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [189]:
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(auc_21)

0.7747164430963774


In [190]:
X_train = X_train.values
y_train = y_train.values

In [191]:
auc_bootstrap = []

In [192]:
rs = RandomState(seed = 21)
bootstrap_auc(optuna_21, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75967318, 0.77790179])

In [193]:
t_21 = auc_bootstrap
print(t_21)

[0.7723214285714286, 0.7651310449419002, 0.764319356630212, 0.766492758885851, 0.7766281826725907, 0.7721719070403281, 0.7663912978468901, 0.7689358339029392, 0.7748205741626794, 0.7729435449419002, 0.765659710355434, 0.7648907424812029, 0.7687516020164047, 0.7731277768284348, 0.7659293831168831, 0.7619003118591934, 0.7655475692071086, 0.7780299470266575, 0.7778750854408749, 0.7693203178400547, 0.7606961295283664, 0.7682362867395762, 0.7794877819548872, 0.7715284304511278, 0.7722386577238551, 0.7614036867737525, 0.7729809253246753, 0.7779338260423787, 0.7636224794941902, 0.7692188568010937, 0.7680467147983596, 0.7743933697881066, 0.7582824248120299, 0.7684445488721805, 0.7651657552973343, 0.7700625854408749, 0.7627013200615175, 0.766893262987013, 0.7653846975393028, 0.7681508458646616, 0.7677610218728641, 0.7725830912508544, 0.7664073180109363, 0.7715471206425154, 0.7715978511619959, 0.7697955827067668, 0.7660495343472318, 0.7666209201982228, 0.7679586038961039, 0.7674379485645932, 0.7

In [194]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [195]:
column_to_drop_21 = 'Cat_가구주 성별'

In [196]:
if not column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(column_to_drop_21, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']
else:
    comp_22 = comp_21.drop(comp_21.filter(regex='^' + column_to_drop_21).columns, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']

print(X_22.shape)

(6119, 192)


In [197]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [198]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [199]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 98, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7770515613652869


In [200]:
optuna_22 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [201]:
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(auc_22)

0.7758885850991115


In [202]:
X_train = X_train.values
y_train = y_train.values

In [203]:
auc_bootstrap = []

In [204]:
rs = RandomState(seed = 22)
bootstrap_auc(optuna_22, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76120564, 0.77817286])

In [205]:
t_22 = auc_bootstrap
print(t_22)

[0.7672056561859194, 0.7699451042378673, 0.7704470693779903, 0.7734348299726589, 0.7570034817156528, 0.7702948778195488, 0.7736217318865346, 0.7758325145249487, 0.7695899906015038, 0.7662337662337662, 0.7709276742993848, 0.7682683270676692, 0.7686928614149008, 0.7716138713260423, 0.7636705399863295, 0.7659774436090225, 0.7736110517771702, 0.7746603725222145, 0.7633447966507176, 0.772502990430622, 0.7713308484278878, 0.7701373462064252, 0.7694725093984962, 0.7733654092617908, 0.7750101461038961, 0.7695365900546822, 0.7616813696172249, 0.7605519480519481, 0.7739474752221465, 0.7741957877648669, 0.7691093856801093, 0.7658786525974025, 0.7679746240601504, 0.7608696813055367, 0.7698730134996583, 0.7631365345181134, 0.765293916609706, 0.7693737183868763, 0.7768497949419002, 0.7622020249487355, 0.779113978127136, 0.7729248547505125, 0.7725190105946685, 0.7719636449077238, 0.7707701426862612, 0.7743025888585099, 0.7700145249487355, 0.7748312542720438, 0.7672216763499657, 0.7696594113123718, 0.

In [206]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [207]:
column_to_drop_22 = '중기부채부담지표'

In [208]:
if not column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(column_to_drop_22, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']
else:
    comp_23 = comp_22.drop(comp_22.filter(regex='^' + column_to_drop_22).columns, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']

print(X_23.shape)

(6119, 191)


In [209]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [210]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [211]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 173, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 3}
0.7776233524486849


In [212]:
optuna_23 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [213]:
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(auc_23)

0.7760274265208476


In [214]:
X_train = X_train.values
y_train = y_train.values

In [215]:
auc_bootstrap = []

In [216]:
rs = RandomState(seed = 23)
bootstrap_auc(optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75964581, 0.77733574])

In [217]:
t_23 = auc_bootstrap
print(t_23)

[0.7645409688995215, 0.7740783065618593, 0.7688423829460014, 0.7643060064935064, 0.7688904434381408, 0.7722760381066303, 0.7743746795967191, 0.7760808270676691, 0.7605466079972658, 0.7729809253246753, 0.7678811731032125, 0.7741423872180451, 0.765590289644566, 0.766393967874231, 0.7692722573479153, 0.76609492481203, 0.7693336679767601, 0.7681374957279563, 0.7705058099794941, 0.771629891490089, 0.7717767429938482, 0.7748392643540671, 0.7684125085440875, 0.7716592617908407, 0.7650749743677376, 0.7720223855092276, 0.7702334671907041, 0.7661429853041695, 0.7722386577238551, 0.7673204673615858, 0.7683724581339713, 0.7693790584415584, 0.7660468643198906, 0.7701506963431306, 0.7736484321599452, 0.761152704203691, 0.7700892857142858, 0.7681908962747779, 0.771902234278879, 0.7735816814764183, 0.7694885295625427, 0.7599698820915927, 0.7681882262474367, 0.7711199162679426, 0.7681775461380724, 0.7668532125768969, 0.765924043062201, 0.7683991584073822, 0.7655689294258373, 0.7695499401913874, 0.77374

In [218]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [219]:
column_to_drop_23 = '소득 대비 주택 임대료의 비율'

In [220]:
if not column_to_drop_23.startswith('Cat_'):
    comp_24 = comp_23.drop(column_to_drop_23, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']
else:
    comp_24 = comp_23.drop(comp_23.filter(regex='^' + column_to_drop_23).columns, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']

print(X_24.shape)

(6119, 190)


In [221]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [222]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [223]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 139, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 5}
0.7794681090827136


In [224]:
optuna_24 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [225]:
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(auc_24)

0.7755575017088175


In [226]:
X_train = X_train.values
y_train = y_train.values

In [227]:
auc_bootstrap = []

In [228]:
rs = RandomState(seed = 24)
bootstrap_auc(optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76071676, 0.77826598])

In [229]:
t_24 = auc_bootstrap
print(t_24)

[0.7685807202665755, 0.7692108467190704, 0.7729969454887219, 0.7651977956254272, 0.7736244019138756, 0.7569420710868078, 0.7686074205399864, 0.7707381023581681, 0.7750528665413534, 0.7661056049213945, 0.7630617737525631, 0.7654300880041012, 0.7698997137730691, 0.7706206211551607, 0.7723134184894054, 0.7720250555365686, 0.7737952836637048, 0.7630110432330827, 0.7690266148325359, 0.7637933612440191, 0.7734107997265891, 0.768292357313739, 0.7625811688311688, 0.774436090225564, 0.7716913021189337, 0.774374679596719, 0.7761716079972659, 0.7740783065618592, 0.7731865174299385, 0.7705191601161995, 0.767154925666439, 0.7714082792207793, 0.7750288362952837, 0.7745936218386875, 0.7770527170198223, 0.7659373931989064, 0.7664713986671223, 0.7717180023923444, 0.7714269694121667, 0.7727780032467533, 0.7678598128844839, 0.7710291353383458, 0.7704470693779905, 0.7704203691045797, 0.766855882604238, 0.7727539730006835, 0.7740462662337663, 0.7679612739234449, 0.7731251068010936, 0.7757470736500341, 0.76

In [230]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [231]:
column_to_drop_24 = '장기부채부담지표'

In [232]:
if not column_to_drop_24.startswith('Cat_'):
    comp_25 = comp_24.drop(column_to_drop_24, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']
else:
    comp_25 = comp_24.drop(comp_24.filter(regex='^' + column_to_drop_24).columns, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']

print(X_25.shape)

(6119, 189)


In [233]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [234]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [235]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7809873204283841


In [236]:
optuna_25 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [237]:
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(auc_25)

0.7762036483253588


In [238]:
X_train = X_train.values
y_train = y_train.values

In [239]:
auc_bootstrap = []

In [240]:
rs = RandomState(seed = 25)
bootstrap_auc(optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76090953, 0.77895124])

In [241]:
t_25 = auc_bootstrap
print(t_25)

[0.7695018796992481, 0.7735523111756665, 0.7693336679767602, 0.7678170924470267, 0.7744627904989747, 0.7624610176008202, 0.7701320061517429, 0.7747564935064934, 0.7657878716678059, 0.778641383287765, 0.7638841421736159, 0.76485870215311, 0.7733760893711552, 0.7663085269993165, 0.7643727571770336, 0.7678170924470266, 0.7627333603896104, 0.7660228340738208, 0.7691814764183184, 0.7716192113807245, 0.7723000683527, 0.7709890849282297, 0.7697181519138756, 0.7611740644224197, 0.7704016789131921, 0.7686047505126452, 0.7795865729665072, 0.7790605775803144, 0.7669893839712918, 0.761457087320574, 0.7691227358168148, 0.7692375469924813, 0.7643140165755298, 0.7755895420369104, 0.7585908129699248, 0.7769298957621326, 0.7719583048530417, 0.7762223385167464, 0.7745642515379358, 0.7694297889610391, 0.7772449589883801, 0.7739581553315107, 0.7740809765892003, 0.7741530673274094, 0.7637906912166781, 0.7724228896103896, 0.7764199205399862, 0.7661563354408749, 0.7685513499658236, 0.7658839926520846, 0.7661

In [242]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [243]:
column_to_drop_25 = 'Cat_소득 계층'

In [244]:
if not column_to_drop_25.startswith('Cat_'):
    comp_26 = comp_25.drop(column_to_drop_25, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']
else:
    comp_26 = comp_25.drop(comp_25.filter(regex='^' + column_to_drop_25).columns, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']

print(X_26.shape)

(6119, 187)


In [245]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [246]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [247]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 163, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 6}
0.7770348667351146


In [248]:
optuna_26 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [249]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(auc_26)

0.774137047163363


In [250]:
X_train = X_train.values
y_train = y_train.values

In [251]:
auc_bootstrap = []

In [252]:
rs = RandomState(seed = 26)
bootstrap_auc(optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7596966 , 0.77671416])

In [253]:
t_26 = auc_bootstrap
print(t_26)

[0.7733600692071088, 0.7655181989063568, 0.7706152811004785, 0.7727566430280246, 0.7599378417634997, 0.7709303443267258, 0.770900974025974, 0.7718541737867395, 0.7635744190020505, 0.7690720052973342, 0.7691868164730007, 0.7600045924470267, 0.7669439935064936, 0.766527469241285, 0.7745909518113465, 0.7730369958988379, 0.7662204160970609, 0.7607388499658236, 0.7689678742310321, 0.7675661098769652, 0.7741557373547504, 0.776841784859877, 0.7614624273752564, 0.7718701939507859, 0.7737285329801777, 0.7628908920027341, 0.7683110475051265, 0.7686928614149009, 0.767093515037594, 0.7703429383116883, 0.7634649478810663, 0.7723000683526999, 0.7683804682159946, 0.7668238422761448, 0.7652031356801094, 0.7668532125768968, 0.7736591122693096, 0.7675100393028025, 0.7631418745727957, 0.7657745215311005, 0.7704604195146958, 0.7637746710526315, 0.7638254015721121, 0.7707514524948735, 0.778251559295967, 0.773301328605605, 0.767192306049214, 0.7706847018113465, 0.7685460099111414, 0.7643433868762817, 0.7752

In [254]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [255]:
column_to_drop_26 = '총 가구원 수'

In [256]:
if not column_to_drop_26.startswith('Cat_'):
    comp_27 = comp_26.drop(column_to_drop_26, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']
else:
    comp_27 = comp_26.drop(comp_26.filter(regex='^' + column_to_drop_26).columns, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']

print(X_27.shape)

(6119, 186)


In [257]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [258]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [259]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 188, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7796517500146078


In [260]:
optuna_27 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [261]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(auc_27)

0.7779097957963089


In [262]:
X_train = X_train.values
y_train = y_train.values

In [263]:
rs = RandomState(seed = 27)
bootstrap_auc(optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76010065, 0.77793676])

In [264]:
t_27 = auc_bootstrap
print(t_27)

[0.7733600692071088, 0.7655181989063568, 0.7706152811004785, 0.7727566430280246, 0.7599378417634997, 0.7709303443267258, 0.770900974025974, 0.7718541737867395, 0.7635744190020505, 0.7690720052973342, 0.7691868164730007, 0.7600045924470267, 0.7669439935064936, 0.766527469241285, 0.7745909518113465, 0.7730369958988379, 0.7662204160970609, 0.7607388499658236, 0.7689678742310321, 0.7675661098769652, 0.7741557373547504, 0.776841784859877, 0.7614624273752564, 0.7718701939507859, 0.7737285329801777, 0.7628908920027341, 0.7683110475051265, 0.7686928614149009, 0.767093515037594, 0.7703429383116883, 0.7634649478810663, 0.7723000683526999, 0.7683804682159946, 0.7668238422761448, 0.7652031356801094, 0.7668532125768968, 0.7736591122693096, 0.7675100393028025, 0.7631418745727957, 0.7657745215311005, 0.7704604195146958, 0.7637746710526315, 0.7638254015721121, 0.7707514524948735, 0.778251559295967, 0.773301328605605, 0.767192306049214, 0.7706847018113465, 0.7685460099111414, 0.7643433868762817, 0.7752

In [265]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [266]:
column_to_drop_27 = 'Cat_현재 공공기관 접근용이성'

In [267]:
if not column_to_drop_27.startswith('Cat_'):
    comp_28 = comp_27.drop(column_to_drop_27, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']
else:
    comp_28 = comp_27.drop(comp_27.filter(regex='^' + column_to_drop_27).columns, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']

print(X_28.shape)

(6119, 182)


In [268]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [269]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [270]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 120, 'max_depth': 9, 'min_samples_split': 10, 'min_samples_leaf': 5}
0.7772894598452408


In [271]:
optuna_28 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [272]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(auc_28)

0.7772583091250854


In [273]:
X_train = X_train.values
y_train = y_train.values

In [274]:
auc_bootstrap = []

In [275]:
rs = RandomState(seed = 28)
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76074613, 0.77755308])

In [276]:
t_28 = auc_bootstrap
print(t_28)

[0.7698436431989064, 0.7685379998291183, 0.7739047547846889, 0.7635984492481204, 0.7613716464456597, 0.7654621283321941, 0.7672777469241285, 0.767128225393028, 0.7695552802460697, 0.7662951768626111, 0.7670881749829118, 0.7749514055023924, 0.7670080741626795, 0.7684044984620642, 0.7651043446684894, 0.765360667293233, 0.7655502392344496, 0.7665942199248119, 0.766826512303486, 0.766994724025974, 0.7699985047846891, 0.7617668104921395, 0.771066515721121, 0.7755868720095694, 0.7768711551606289, 0.7664847488038277, 0.7658199119958989, 0.772273368079289, 0.7794637517088175, 0.7712560876623377, 0.7710051050922762, 0.7709356843814081, 0.7656543703007519, 0.7662230861244019, 0.756779199419002, 0.7689358339029391, 0.7677797120642516, 0.7671255553656868, 0.7772502990430623, 0.775309189166097, 0.7700385551948052, 0.7726845522898155, 0.7658305921052632, 0.7673471676349966, 0.7614116968557758, 0.7634676179084074, 0.7715204203691046, 0.771234727443609, 0.7687863123718387, 0.7769245557074504, 0.770583

In [277]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [278]:
column_to_drop_28 = 'Cat_현재 대기오염 정도'

In [279]:
if not column_to_drop_28.startswith('Cat_'):
    comp_29 = comp_28.drop(column_to_drop_28, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']
else:
    comp_29 = comp_28.drop(comp_28.filter(regex='^' + column_to_drop_28).columns, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']

print(X_29.shape)

(6119, 178)


In [280]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [281]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [282]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 198, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.7790674379585807


In [283]:
optuna_29 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [284]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(auc_29)

0.7762944292549556


In [285]:
X_train = X_train.values
y_train = y_train.values

In [286]:
auc_bootstrap = []

In [287]:
rs = RandomState(seed = 29)
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76166535, 0.77800111])

In [288]:
t_29 = auc_bootstrap
print(t_29)

[0.7790338773069037, 0.7706152811004785, 0.7695045497265891, 0.7696914516404647, 0.7686074205399864, 0.7686795112781957, 0.7714616797676008, 0.7769939764183185, 0.7669706937799042, 0.774804553998633, 0.7758538747436774, 0.7704951298701298, 0.7665354793233082, 0.7659507433356118, 0.7674806690020506, 0.7685727101845523, 0.7680707450444293, 0.7747965439166098, 0.7608563311688312, 0.7710745258031442, 0.7717340225563909, 0.7752718087833218, 0.7715524606971976, 0.7680707450444292, 0.7607094796650717, 0.7714296394395078, 0.7628641917293233, 0.7659373931989064, 0.7645943694463431, 0.7725964413875598, 0.7673978981544771, 0.7690613251879699, 0.7756295924470267, 0.7733734193438141, 0.7698703434723172, 0.7679479237867395, 0.7671522556390978, 0.7723855092276144, 0.7667437414559126, 0.7686154306220095, 0.7636091293574846, 0.7701480263157894, 0.7640176435406698, 0.766495428913192, 0.7749060150375939, 0.7661429853041696, 0.7679452537593985, 0.7733734193438142, 0.7655929596719071, 0.7698756835269993, 0

In [289]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [291]:
column_to_drop_29 = 'Cat_이사 예상 기간'

In [292]:
if not column_to_drop_29.startswith('Cat_'):
    comp_30 = comp_29.drop(column_to_drop_29, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']
else:
    comp_30 = comp_29.drop(comp_29.filter(regex='^' + column_to_drop_29).columns, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']

print(X_30.shape)

(6119, 174)


In [297]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [298]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [299]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 84, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 5}
0.7821309025951803


In [300]:
optuna_30 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [301]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(auc_30)

0.7797708048530418


In [302]:
X_train = X_train.values
y_train = y_train.values

In [303]:
auc_bootstrap = []

In [304]:
rs = RandomState(seed = 30)
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76160995, 0.77939233])

In [305]:
t_30 = auc_bootstrap
print(t_30)

[0.766727721291866, 0.7754266703691046, 0.772802033492823, 0.7654968386876282, 0.765830592105263, 0.7771034475393028, 0.7700545753588517, 0.7654728084415584, 0.7703669685577581, 0.7734802204374573, 0.76438877734108, 0.7647946214969242, 0.774908685064935, 0.7638440917634997, 0.774102336807929, 0.770535180280246, 0.7645062585440875, 0.7718595138414217, 0.7678144224196857, 0.7779044557416268, 0.7717927631578947, 0.7722840481886535, 0.7730957365003418, 0.7760007262474369, 0.7685914003759399, 0.7718408236500341, 0.7708155331510596, 0.7678437927204373, 0.7723427887901573, 0.7752210782638415, 0.777207578605605, 0.7691334159261791, 0.7704016789131921, 0.7744227400888586, 0.7632059552289816, 0.7712801179084073, 0.7643166866028708, 0.7681134654818866, 0.7728100435748462, 0.7654914986329461, 0.7696647513670538, 0.7763131194463432, 0.7685460099111414, 0.7727753332194122, 0.7713869190020506, 0.7644795582706767, 0.7690346249145591, 0.7741610774094327, 0.7656517002734109, 0.7796987141148325, 0.769651

In [306]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [307]:
# 31
column_to_drop_30 = 'Cat_현재 주변도로의 보행 안전'

In [308]:
if not column_to_drop_30.startswith('Cat_'):
    comp_31 = comp_30.drop(column_to_drop_30, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']
else:
    comp_31 = comp_30.drop(comp_30.filter(regex='^' + column_to_drop_30).columns, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']

print(X_31.shape)

(6119, 170)


In [309]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [310]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [311]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7813921652100602


In [312]:
optuna_31= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [313]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(auc_31)

0.7788015849282297


In [314]:
X_train = X_train.values
y_train = y_train.values

In [315]:
auc_bootstrap = []

In [316]:
rs = RandomState(seed = 31)
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76153445, 0.77951996])

In [317]:
t_31 = auc_bootstrap
print(t_31)

[0.7718221334586466, 0.7768658151059467, 0.7688557330827068, 0.7610031826725906, 0.7702254571086807, 0.7617053998632946, 0.7749914559125086, 0.7715044002050581, 0.7703776486671224, 0.7683858082706767, 0.7686207706766917, 0.7722653579972658, 0.7760701469583049, 0.7742812286397812, 0.7675661098769652, 0.7714082792207791, 0.7756536226930963, 0.7679479237867396, 0.7750154861585783, 0.7715391105604922, 0.7703589584757348, 0.7678357826384142, 0.7737472231715653, 0.7738967447026658, 0.7753145292207791, 0.7724041994190021, 0.7651390550239234, 0.7670668147641833, 0.7690399649692414, 0.7716432416267942, 0.7719262645249487, 0.7676462106971975, 0.7726952323991798, 0.7656169899179768, 0.7727926883971292, 0.7703829887218046, 0.7754987611073137, 0.7663165370813398, 0.7768684851332878, 0.769734172077922, 0.7780513072453862, 0.7690693352699933, 0.7728581040669856, 0.7651443950786057, 0.766225756151743, 0.7712854579630897, 0.7617134099453178, 0.7700812756322625, 0.7670961850649352, 0.7686074205399864, 0

In [318]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [319]:
# 32
column_to_drop_31 = 'Cat_주택 보유 의식'

In [320]:
if not column_to_drop_31.startswith('Cat_'):
    comp_32 = comp_31.drop(column_to_drop_31, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']
else:
    comp_32 = comp_31.drop(comp_31.filter(regex='^' + column_to_drop_31).columns, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']

print(X_32.shape)

(6119, 168)


In [321]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [322]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [323]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 4}
0.7805323917561916


In [324]:
optuna_32 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [325]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(auc_32)

0.7811592190704033


In [326]:
X_train = X_train.values
y_train = y_train.values

In [327]:
auc_bootstrap = []

In [328]:
rs = RandomState(seed = 32)
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76310696, 0.77954619])

In [329]:
t_32 = auc_bootstrap
print(t_32)

[0.7762703990088857, 0.7751676777170198, 0.7772396189336979, 0.7738353340738209, 0.7639775931305537, 0.7741557373547505, 0.7665381493506492, 0.7681294856459331, 0.7711279263499659, 0.7725563909774437, 0.7739848556049214, 0.7804383116883117, 0.7750181561859193, 0.7726204716336296, 0.7738620343472318, 0.773271958304853, 0.7697288320232399, 0.7708475734791524, 0.7669413234791524, 0.7641591549897472, 0.7713468685919345, 0.7692829374572795, 0.7730183057074504, 0.7754854109706084, 0.7665541695146958, 0.7681855562200958, 0.7654968386876282, 0.768094775290499, 0.7716539217361587, 0.7701506963431306, 0.7729809253246752, 0.7737178528708133, 0.7734428400546821, 0.7771915584415584, 0.7714136192754614, 0.771200017088175, 0.7753625897129186, 0.7690906954887219, 0.7685593600478469, 0.771061175666439, 0.771966314935065, 0.7721185064935066, 0.7712400674982911, 0.7711252563226247, 0.7705859107997266, 0.7666823308270676, 0.7687836423444976, 0.7687756322624744, 0.7736617822966507, 0.7822806305536568, 0.77

In [330]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [331]:
# 33.
column_to_drop_32 = 'Cat_현재 문화시설 접근용이성'

In [332]:
if not column_to_drop_32.startswith('Cat_'):
    comp_33 = comp_32.drop(column_to_drop_32, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']
else:
    comp_33 = comp_32.drop(comp_32.filter(regex='^' + column_to_drop_32).columns, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']

print(X_33.shape)

(6119, 164)


In [333]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [334]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [335]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7811960033055367


In [336]:
optuna_33 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [337]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(auc_33)

0.7745615815105946


In [338]:
X_train = X_train.values
y_train = y_train.values

In [339]:
auc_bootstrap = []

In [340]:
rs = RandomState(seed = 33)
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76132425, 0.77824662])

In [341]:
np.mean(auc_bootstrap)

0.7700029210099112

In [342]:
t_33 = auc_bootstrap
print(t_33)

[0.7732826384142174, 0.766727721291866, 0.7748098940533152, 0.7649067626452495, 0.765857292378674, 0.7706606715652768, 0.7726525119617225, 0.7654033877306903, 0.7677877221462748, 0.7689785543403964, 0.771469689849624, 0.7686768412508544, 0.7695499401913874, 0.7670694847915243, 0.7671869659945317, 0.7763638499658236, 0.7702067669172932, 0.777140827922078, 0.7617053998632947, 0.7657211209842789, 0.7766201725905674, 0.7663485774094326, 0.7733974495898838, 0.773106416609706, 0.7655689294258373, 0.765926713089542, 0.7602048444976077, 0.7711092361585783, 0.7727646531100479, 0.7758859150717705, 0.771998355263158, 0.7716031912166782, 0.7755922120642514, 0.7703402682843472, 0.7719262645249487, 0.7694057587149692, 0.769229536910458, 0.7740008757689678, 0.7716646018455229, 0.7699691344839371, 0.7713522086466165, 0.7693176478127137, 0.7712988080997949, 0.7748205741626795, 0.7765240516062885, 0.7680146744702666, 0.7761876281613124, 0.7696887816131237, 0.767659560833903, 0.776072816985646, 0.7698249

In [343]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [344]:
# 34
column_to_drop_33 = 'Cat_현재 교육환경'

In [345]:
if not column_to_drop_33.startswith('Cat_'):
    comp_34 = comp_33.drop(column_to_drop_33, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']
else:
    comp_34 = comp_33.drop(comp_33.filter(regex='^' + column_to_drop_33).columns, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']

print(X_34.shape)

(6119, 160)


In [346]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [347]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [348]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 53, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 5}
0.7797018339051244


In [349]:
optuna_34 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [350]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(auc_34)

0.7815276828434723


In [351]:
X_train = X_train.values
y_train = y_train.values

In [352]:
auc_bootstrap = []

In [353]:
rs = RandomState(seed = 34)
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76098089, 0.77833129])

In [354]:
t_34 = auc_bootstrap
print(t_34)

[0.7760834970950102, 0.7708582535885168, 0.7666769907723856, 0.7683804682159946, 0.7670347744360902, 0.775259793660287, 0.7762543788448393, 0.7704310492139439, 0.7643033364661653, 0.7613129058441559, 0.7729862653793576, 0.7674886790840738, 0.7603009654818865, 0.7666235902255638, 0.7703829887218046, 0.7725457108680793, 0.7693309979494191, 0.7683457578605606, 0.7702628374914559, 0.7709663896958304, 0.7690800153793576, 0.7647171907040329, 0.7761689379699248, 0.7704524094326725, 0.7607094796650717, 0.7670908450102529, 0.7691868164730007, 0.7687756322624744, 0.7695232399179768, 0.7670000640806562, 0.7702201170539986, 0.7727219326725906, 0.7650242438482571, 0.7689198137388927, 0.7563172846889953, 0.7787642045454546, 0.7774078306561859, 0.7700545753588517, 0.773173167293233, 0.768646135936432, 0.7679505938140807, 0.7726111265379357, 0.7612408151059467, 0.7650242438482571, 0.7700892857142857, 0.7772716592617909, 0.7676702409432672, 0.7723828392002734, 0.7704123590225564, 0.7660548744019138, 0.

In [355]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [356]:
# 35
column_to_drop_34 = 'Cat_현재 상업시설 접근용이성'

In [357]:
if not column_to_drop_34.startswith('Cat_'):
    comp_35 = comp_34.drop(column_to_drop_34, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']
else:
    comp_35 = comp_34.drop(comp_34.filter(regex='^' + column_to_drop_34).columns, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']

print(X_35.shape)

(6119, 156)


In [358]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [359]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [360]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 140, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7813253866893715


In [361]:
optuna_35 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [362]:
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(auc_35)

0.7786360432330827


In [363]:
X_train = X_train.values
y_train = y_train.values

In [364]:
auc_bootstrap = []

In [365]:
rs = RandomState(seed = 35)
bootstrap_auc(optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76468755, 0.78101824])

In [366]:
np.mean(auc_bootstrap)

0.7727988508202325

In [367]:
t_35 = auc_bootstrap
print(t_35)

[0.7786947838345865, 0.7740302460697198, 0.7748365943267259, 0.7720197154818865, 0.7692081766917294, 0.7760968472317156, 0.7754240003417634, 0.7721772470950102, 0.7720143754272042, 0.7696166908749146, 0.7700465652768285, 0.767061474709501, 0.7727886833561175, 0.7824648624401913, 0.7819735774094327, 0.7764145804853042, 0.7697448521872864, 0.7747938738892687, 0.7665194591592618, 0.7708555835611757, 0.7717153323650034, 0.7774745813397128, 0.7756910030758715, 0.7749353853383458, 0.7779071257689678, 0.7668905929596718, 0.7711359364319892, 0.7747111030416952, 0.7721932672590568, 0.7721825871496923, 0.7737312030075187, 0.7763077793916608, 0.7757871240601503, 0.7670240943267259, 0.7827745856117567, 0.7764199205399862, 0.7782916097060835, 0.7725350307587151, 0.7723641490088858, 0.7772476290157211, 0.7771007775119616, 0.7652752264183185, 0.7696060107655502, 0.7692295369104579, 0.7749300452836637, 0.7748552845181135, 0.7676141703691046, 0.7717153323650034, 0.7736163918318524, 0.7672029861585783, 

In [368]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [369]:
# 36
column_to_drop_35 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [370]:
if not column_to_drop_35.startswith('Cat_'):
    comp_36 = comp_35.drop(column_to_drop_35, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']
else:
    comp_36 = comp_35.drop(comp_35.filter(regex='^' + column_to_drop_35).columns, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']

print(X_36.shape)

(6119, 152)


In [371]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [372]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [373]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 196, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 5}
0.7804113556874431


In [374]:
optuna_36 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [375]:
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(auc_36)

0.7772556390977443


In [376]:
X_train = X_train.values
y_train = y_train.values

In [377]:
auc_bootstrap = []

In [378]:
rs = RandomState(seed = 36)
bootstrap_auc(optuna_36, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76446607, 0.78001999])

In [379]:
t_36 = auc_bootstrap
print(t_36)

[0.7726418318523582, 0.7727993634654818, 0.776475991114149, 0.767728981544771, 0.7755227913533835, 0.7755575017088174, 0.7744227400888586, 0.766695680963773, 0.774735133287765, 0.773231907894737, 0.764522278708134, 0.7695899906015038, 0.7763852101845524, 0.7726097915242651, 0.7760621368762817, 0.7728981544771019, 0.7797975051264525, 0.7700572453861927, 0.7677476717361587, 0.7774372009569378, 0.768959864149009, 0.7748899948735475, 0.768161525974026, 0.7677102913533834, 0.776409240430622, 0.7776481331168832, 0.774972765721121, 0.7760594668489406, 0.7733066686602871, 0.7667944719753931, 0.7761689379699248, 0.773336038961039, 0.774601631920711, 0.7715070702323992, 0.7749033450102529, 0.7698356331168832, 0.7731277768284347, 0.7741343771360218, 0.7779525162337662, 0.7760327665755297, 0.7724949803485988, 0.7653606672932332, 0.7678117523923446, 0.7723721590909091, 0.7701373462064252, 0.773135786910458, 0.7661483253588517, 0.7725457108680793, 0.7766682330827068, 0.7793062200956937, 0.7726738721

In [380]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [381]:
# 37
column_to_drop_36 = 'Cat_가족계획 시 중요 고려 사항 1순위'

In [382]:
if not column_to_drop_36.startswith('Cat_'):
    comp_37 = comp_36.drop(column_to_drop_36, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']
else:
    comp_37 = comp_36.drop(comp_36.filter(regex='^' + column_to_drop_36).columns, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']

print(X_37.shape)

(6119, 145)


In [383]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [384]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [385]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7833203949949499


In [386]:
optuna_37 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [387]:
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(auc_37)

0.7793542805878333


In [388]:
X_train = X_train.values
y_train = y_train.values

In [389]:
auc_bootstrap = []

In [390]:
rs = RandomState(seed = 37)
bootstrap_auc(optuna_37, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76373843, 0.7809298 ])

In [391]:
t_37 = auc_bootstrap
print(t_37)

[0.7801793190362267, 0.7672029861585783, 0.7738446791695147, 0.7783610304169515, 0.7728848043403964, 0.7778510551948052, 0.7786093429596719, 0.7742051328605606, 0.7775947325700616, 0.7714349794941899, 0.7769592660628845, 0.7707247522214629, 0.779311560150376, 0.7727005724538618, 0.7746510274265209, 0.773207877648667, 0.7718862141148326, 0.7724923103212578, 0.773538961038961, 0.7639161825017088, 0.7770393668831168, 0.7765187115516063, 0.775138307416268, 0.7766255126452495, 0.7773464200273411, 0.7814342318865345, 0.7649441430280246, 0.7742892387218046, 0.7749247052289815, 0.7684151785714285, 0.7758271744702666, 0.7776160927887902, 0.7809696471291866, 0.769496539644566, 0.7722600179425837, 0.7811698991797675, 0.7738433441558442, 0.7743186090225564, 0.7754346804511278, 0.765259206254272, 0.772097146274778, 0.7675848000683527, 0.7773571001367054, 0.7732639482228297, 0.7731891874572796, 0.7740649564251537, 0.7750635466507177, 0.7834981630211894, 0.7672056561859193, 0.7741076768626111, 0.7689

In [392]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [393]:
# 38
column_to_drop_37 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [394]:
if not column_to_drop_37.startswith('Cat_'):
    comp_38 = comp_37.drop(column_to_drop_37, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']
else:
    comp_38 = comp_37.drop(comp_37.filter(regex='^' + column_to_drop_37).columns, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']

print(X_38.shape)

(6119, 141)


In [395]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [396]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [397]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7821517708828956


In [398]:
optuna_38 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [399]:
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(auc_38)

0.7845982142857143


In [400]:
X_train = X_train.values
y_train = y_train.values

In [401]:
auc_bootstrap = []

In [402]:
rs = RandomState(seed = 38)
bootstrap_auc(optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76515754, 0.78141881])

In [403]:
t_38 = auc_bootstrap
print(t_38)

[0.7733734193438142, 0.776508031442242, 0.7696914516404647, 0.7726017814422419, 0.7702521573820916, 0.7703189080656188, 0.7747004229323309, 0.7771248077580315, 0.7745348812371838, 0.7743666695146959, 0.7734508501367054, 0.7687409219070404, 0.7679198884996582, 0.7643594070403281, 0.774372009569378, 0.7773864704374572, 0.7740088858509911, 0.7760968472317157, 0.7749487354750513, 0.7617454502734108, 0.7813541310663021, 0.7769058655160629, 0.7739020847573478, 0.7683884782980177, 0.7777656143198907, 0.7622954759056733, 0.7683003673957621, 0.7722520078605606, 0.7741824376281613, 0.7744174000341764, 0.7732639482228298, 0.7719396146616542, 0.7717927631578947, 0.7684285287081339, 0.7767109535201641, 0.7726952323991797, 0.7714883800410116, 0.783783855946685, 0.7783209800068353, 0.7704604195146958, 0.7737018327067668, 0.7723240985987697, 0.7732586081681476, 0.7745802717019823, 0.7698463132262475, 0.7720117053998633, 0.776406570403281, 0.7629015721120984, 0.7722813781613125, 0.7744254101161996, 0.7

In [404]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [405]:
# 38
column_to_drop_38 = 'Cat_남편/아내의 부모님과 동거 의향'

In [406]:
if not column_to_drop_38.startswith('Cat_'):
    comp_39 = comp_38.drop(column_to_drop_38, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']
else:
    comp_39 = comp_38.drop(comp_38.filter(regex='^' + column_to_drop_38).columns, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']

print(X_39.shape)

(6119, 136)


In [407]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [408]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [409]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 68, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.7815632851693252


In [410]:
optuna_39 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [411]:
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(auc_39)

0.7809295967190704


In [412]:
X_train = X_train.values
y_train = y_train.values

In [413]:
auc_bootstrap = []

In [414]:
rs = RandomState(seed = 39)
bootstrap_auc(optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76444965, 0.78129305])

In [415]:
t_39 = auc_bootstrap
print(t_39)

[0.7765507518796992, 0.7771835483595353, 0.7727619830827068, 0.77069271189337, 0.7771621881408066, 0.7742652084757348, 0.7756669728298018, 0.7753145292207793, 0.7773250598086124, 0.7706740217019824, 0.7722066173957621, 0.7676755809979494, 0.7738326640464798, 0.7761609278879016, 0.7693817284688995, 0.7818908065618592, 0.7716565917634998, 0.7728474239576214, 0.7753705997949419, 0.7754133202323991, 0.76819623632946, 0.7747858638072453, 0.7809576320061518, 0.7739234449760766, 0.7722626879699248, 0.7755628417634997, 0.779146018455229, 0.7762196684894053, 0.773207877648667, 0.7725083304853042, 0.7733627392344496, 0.7742304981203008, 0.7763371496924127, 0.7781447582023241, 0.7733867694805194, 0.7692722573479154, 0.7711492865686945, 0.7801419386534518, 0.7746443523581681, 0.7772743292891319, 0.7737004976930963, 0.7795224923103213, 0.7762303485987697, 0.7779979066985647, 0.7729275247778539, 0.7716752819548871, 0.7796266233766234, 0.7697128118591934, 0.7743853597060835, 0.7713949290840738, 0.772

In [416]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [417]:
# 40
column_to_drop_39 = 'Cat_현재 의료시설 접근용이성'

In [418]:
if not column_to_drop_39.startswith('Cat_'):
    comp_40 = comp_39.drop(column_to_drop_39, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']
else:
    comp_40 = comp_39.drop(comp_39.filter(regex='^' + column_to_drop_39).columns, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']

print(X_40.shape)

(6119, 132)


In [419]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [420]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [421]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7790924799038388


In [422]:
optuna_40 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [423]:
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(auc_40)

0.779259494617225


In [424]:
X_train = X_train.values
y_train = y_train.values

In [425]:
auc_bootstrap = []

In [426]:
rs = RandomState(seed = 40)
bootstrap_auc(optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7639278 , 0.78118255])

In [427]:
t_40 = auc_bootstrap
print(t_40)

[0.7797400995386193, 0.7687769672761449, 0.7715831660116199, 0.7750141511449076, 0.7717500427204375, 0.774473470608339, 0.7742772235987698, 0.771519085355434, 0.7645556540498974, 0.7720477507689678, 0.7704670945830485, 0.7738473491968558, 0.7801486137218046, 0.7706993869617225, 0.7775893925153794, 0.7664513734620644, 0.7679679489917977, 0.7753185342617908, 0.7672924320745045, 0.7729809253246753, 0.780896221377307, 0.7721905972317156, 0.7776414580485305, 0.7686915264012304, 0.7584786718215994, 0.769415103810663, 0.7634208924299385, 0.7752130681818182, 0.7717994382262474, 0.7657518262987013, 0.7728407488892687, 0.7764532958817498, 0.7771127926349966, 0.7723868442412851, 0.7812793703007518, 0.7689398389439508, 0.7725790862098427, 0.7775520121326043, 0.7738460141831852, 0.7737058377477786, 0.7760554618079288, 0.771272107826384, 0.7785412572624744, 0.7675300645078605, 0.7738954096889952, 0.7693803934552289, 0.7694444741114149, 0.7771915584415583, 0.763808046394395, 0.7730329908578264, 0.776

In [428]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [429]:
# 41.
column_to_drop_40 = 'Cat_현재 주택의 구조'

In [430]:
if not column_to_drop_40.startswith('Cat_'):
    comp_41 = comp_40.drop(column_to_drop_40, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']
else:
    comp_41 = comp_40.drop(comp_40.filter(regex='^' + column_to_drop_40).columns, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']

print(X_41.shape)

(6119, 130)


In [431]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [432]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [433]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7805240444411055


In [434]:
optuna_41 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [435]:
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(auc_41)

0.7798095202494874


In [436]:
X_train = X_train.values
y_train = y_train.values

In [437]:
auc_bootstrap = []

In [438]:
rs = RandomState(seed = 41)
bootstrap_auc(optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76324934, 0.77977838])

In [439]:
t_41 = auc_bootstrap
print(t_41)

[0.7764840011961722, 0.7742024628332194, 0.7750969219924811, 0.771568480861244, 0.7758084842788789, 0.7729275247778538, 0.7819962726418319, 0.7611807394907724, 0.7675313995215312, 0.7797227443609023, 0.7710571706254272, 0.7732492630724539, 0.7733186837833219, 0.7705258351845523, 0.773150472060834, 0.7771194677033493, 0.7746456873718387, 0.7687716272214626, 0.7709543745727956, 0.7781180579289132, 0.7639362077067671, 0.7789511064593302, 0.7622621005639099, 0.7708916289302802, 0.7680934402768286, 0.771916919429255, 0.7761115323820915, 0.7779298210013671, 0.7758979301948051, 0.7639936132945999, 0.770900974025974, 0.7789831467874231, 0.7743533193779903, 0.773428154904306, 0.7702321321770335, 0.7730583561175667, 0.769981149606972, 0.7720797910970608, 0.7785092169343815, 0.7721638969583048, 0.7688784283151058, 0.7726925623718386, 0.772035735645933, 0.7715698158749145, 0.7654567882775118, 0.766005478896104, 0.7684685791182502, 0.7726752071941217, 0.7766468728639782, 0.7676795860389611, 0.77334

In [440]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [441]:
# 42.
column_to_drop_41 = 'Cat_현재 대중교통 접근용이성'

In [442]:
if not column_to_drop_41.startswith('Cat_'):
    comp_42 = comp_41.drop(column_to_drop_41, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']
else:
    comp_42 = comp_41.drop(comp_41.filter(regex='^' + column_to_drop_41).columns, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']

print(X_42.shape)

(6119, 126)


In [443]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [444]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [445]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7787544136428518


In [446]:
optuna_42 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [447]:
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(auc_42)

0.7814315618591935


In [448]:
X_train = X_train.values
y_train = y_train.values

In [449]:
auc_bootstrap = []

In [450]:
rs = RandomState(seed = 42)
bootstrap_auc(optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76348063, 0.77960306])

In [451]:
t_42 = auc_bootstrap
print(t_42)

[0.7705071449931647, 0.7789951619104579, 0.77363508202324, 0.7719943502221464, 0.7655021787423103, 0.774174427546138, 0.7706499914559125, 0.7715284304511278, 0.7688223577409432, 0.7704564144736843, 0.7677516767771703, 0.7771595181134654, 0.7715391105604921, 0.7678678229665072, 0.7739408001537935, 0.7776961936090226, 0.7742718835440875, 0.7682856822453863, 0.773370749316473, 0.772851428998633, 0.7682135915071769, 0.7829254421565277, 0.768480594241285, 0.7704991349111414, 0.7795625427204373, 0.7761902981886535, 0.7723641490088858, 0.7684085035030759, 0.7717954331852358, 0.7674085782638415, 0.7692402170198224, 0.7683510979152427, 0.7694324589883801, 0.7691320809125085, 0.7774064956425154, 0.7705031399521531, 0.7692575721975393, 0.7735469711209843, 0.7687502670027341, 0.7742892387218046, 0.7695325850136705, 0.776911205570745, 0.7832792207792207, 0.7737352080485305, 0.7658345971462748, 0.7723694890635681, 0.7727045774948735, 0.7750168211722489, 0.775289163961039, 0.7717313525290499, 0.76871

In [452]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [453]:
# 43.
column_to_drop_42 = 'Cat_현재 청소/쓰레기 처리상태'

In [454]:
if not column_to_drop_42.startswith('Cat_'):
    comp_43 = comp_42.drop(column_to_drop_42, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']
else:
    comp_43 = comp_42.drop(comp_42.filter(regex='^' + column_to_drop_42).columns, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']

print(X_43.shape)

(6119, 122)


In [455]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [456]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [457]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7798458250903596


In [458]:
optuna_43 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [459]:
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(auc_43)

0.7824274820574163


In [460]:
X_train = X_train.values
y_train = y_train.values

In [461]:
auc_bootstrap = []

In [462]:
rs = RandomState(seed = 43)
bootstrap_auc(optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76594444, 0.78153636])

In [463]:
t_43 = auc_bootstrap
print(t_43)

[0.770616616114149, 0.7768070745044429, 0.7750675516917294, 0.7775720373376623, 0.7733800944121668, 0.7763812051435407, 0.7695579502734108, 0.7656209949589884, 0.774876644736842, 0.7735977016404648, 0.7727606480690362, 0.7802887901572112, 0.7725403708133971, 0.7781727934894053, 0.7635624038790156, 0.7794917869958988, 0.7752144031954888, 0.7668225072624744, 0.7749193651742993, 0.7728220586978811, 0.7760661419172932, 0.7772916844668489, 0.7806732740943267, 0.7790952879357486, 0.7754814059295968, 0.7757163683356116, 0.7723120834757349, 0.7764573009227616, 0.778275589542037, 0.7753505745898838, 0.7711653067327409, 0.7750461914730007, 0.7735603212576897, 0.7856448650034176, 0.768080090140123, 0.7764840011961722, 0.7805104024265208, 0.7750008010082023, 0.7788896958304852, 0.7736831425153794, 0.7722933932843473, 0.7727673231373888, 0.76715759569378, 0.7736043767088174, 0.7745121860047847, 0.7808014354066986, 0.7665688546650717, 0.7758738999487355, 0.7751650076896788, 0.7770166716507176, 0.766

In [464]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [465]:
## 44
column_to_drop_43 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [466]:
if not column_to_drop_43.startswith('Cat_'):
    comp_44 = comp_43.drop(column_to_drop_43, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']
else:
    comp_44 = comp_43.drop(comp_43.filter(regex='^' + column_to_drop_43).columns, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']

print(X_44.shape)

(6119, 118)


In [467]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [468]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [469]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 96, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7800023372482241


In [470]:
optuna_44 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [471]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(auc_44)

0.7827078349282297


In [472]:
X_train = X_train.values
y_train = y_train.values

In [473]:
auc_bootstrap = []

In [474]:
rs = RandomState(seed = 44)
bootstrap_auc(optuna_44, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76646069, 0.7824657 ])

In [475]:
t_44 = auc_bootstrap
print(t_44)

[0.7849573329630896, 0.7782662444463432, 0.7773824653964456, 0.7728113785885168, 0.7676555557928912, 0.7751156121838687, 0.7762917592276145, 0.7652258309125086, 0.775346569548872, 0.7722827131749829, 0.7699771445659603, 0.769467169343814, 0.7784544813738892, 0.7758525397300069, 0.7852590460526315, 0.7765961423444976, 0.777062062115516, 0.7752637987012987, 0.7751142771701982, 0.7711799918831168, 0.7768604750512647, 0.7763358146787422, 0.7753772748632946, 0.7779965716848941, 0.7792234492481203, 0.7756242523923444, 0.7707581275632263, 0.7766882582877648, 0.7756215823650034, 0.7778203498803827, 0.7706740217019821, 0.7746203221120984, 0.7709316793403964, 0.7788216101332878, 0.7735029156698565, 0.7693777234278879, 0.7787868997778538, 0.7727406228639782, 0.7779271509740259, 0.7695806455058101, 0.7848078114319891, 0.7743786846377305, 0.7793262453007518, 0.7664246731886535, 0.7754493656015037, 0.7714403195488722, 0.7800057672590568, 0.7820096227785373, 0.7640376687457279, 0.7696887816131236, 0.

In [476]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [477]:
# 45
column_to_drop_44 = 'Cat_현재 주차시설 이용편의성'

In [478]:
if not column_to_drop_44.startswith('Cat_'):
    comp_45 = comp_44.drop(column_to_drop_44, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']
else:
    comp_45 = comp_44.drop(comp_44.filter(regex='^' + column_to_drop_44).columns, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']

print(X_45.shape)

(6119, 114)


In [479]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [480]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [481]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 97, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 5}
0.7806805565989698


In [482]:
optuna_45 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [483]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(auc_45)

0.7807373547505126


In [484]:
X_train = X_train.values
y_train = y_train.values

In [485]:
auc_bootstrap = []

In [486]:
rs = RandomState(seed = 45)
bootstrap_auc(optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76558395, 0.78120858])

In [487]:
t_45 = auc_bootstrap
print(t_45)

[0.7771208027170199, 0.7715524606971975, 0.7722666930109365, 0.7750795668147641, 0.7742571983937115, 0.7674099132775121, 0.7773797953691046, 0.7786520633971292, 0.7788536504613808, 0.7587656997607657, 0.7715618057928914, 0.772485635252905, 0.7785465973171566, 0.7712988080997949, 0.7773704502734107, 0.7752210782638415, 0.7698596633629529, 0.7673992331681476, 0.7692949525803144, 0.766576864747095, 0.7720250555365687, 0.7778377050580998, 0.7604464819719753, 0.7752718087833219, 0.7677450017088177, 0.7713989341250853, 0.7757711038961039, 0.7717900931305536, 0.7766628930280246, 0.7738206489234449, 0.7779898966165414, 0.7745976268796992, 0.7723414537764867, 0.7769365708304853, 0.7752891639610389, 0.7753318843984962, 0.7716112012987013, 0.7728047035201641, 0.7765627670027341, 0.7762223385167464, 0.7695205698906358, 0.7745415563055367, 0.7811418638926861, 0.7714536696855776, 0.7692362119788105, 0.7699424342105264, 0.7729635701469584, 0.7771328178400547, 0.7731451320061516, 0.7693964136192755, 0

In [488]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [489]:
# 46.
column_to_drop_45 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [490]:
if not column_to_drop_45.startswith('Cat_'):
    comp_46 = comp_45.drop(column_to_drop_45, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']
else:
    comp_46 = comp_45.drop(comp_45.filter(regex='^' + column_to_drop_45).columns, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']

print(X_46.shape)

(6119, 110)


In [491]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [492]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [493]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 182, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.7782973981418876


In [494]:
optuna_46 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [495]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(auc_46)

0.7809496219241285


In [496]:
X_train = X_train.values
y_train = y_train.values

In [497]:
auc_bootstrap = []

In [498]:
rs = RandomState(seed = 46)
bootstrap_auc(optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76716407, 0.78109964])

In [499]:
t_46 = auc_bootstrap
print(t_46)

[0.7798989661654134, 0.778814935064935, 0.7788376302973343, 0.7739634953861927, 0.7746056369617225, 0.7782488892686261, 0.7698957087320574, 0.7750194911995898, 0.7779912316302119, 0.7755641767771702, 0.7753545796308954, 0.7729168446684894, 0.7733680792891319, 0.7710798658578262, 0.7790752627306904, 0.7734882305194806, 0.7701934167805877, 0.7777282339371155, 0.7753158642344498, 0.7725497159090908, 0.7755962171052632, 0.7773397449589885, 0.7697355070915928, 0.7723254336124401, 0.7711492865686944, 0.7777896445659603, 0.7775012816131237, 0.7754053101503761, 0.7777576042378674, 0.7744174000341764, 0.7748913298872181, 0.7767870492993849, 0.7694685043574847, 0.7806559189166097, 0.7777028686773753, 0.779284859876965, 0.778196823735475, 0.7784237760594668, 0.7697875726247436, 0.7757403985816815, 0.7708529135338346, 0.7732639482228297, 0.7749834458304853, 0.7782448842276146, 0.7748459394224196, 0.7712187072795625, 0.765924043062201, 0.7717714029391661, 0.7801859941045797, 0.7795852379528366, 0.7

In [500]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [501]:
# 47.
column_to_drop_46 = 'Cat_이사 계획 중인 주택의 유형'

In [502]:
if not column_to_drop_46.startswith('Cat_'):
    comp_47 = comp_46.drop(column_to_drop_46, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']
else:
    comp_47 = comp_46.drop(comp_46.filter(regex='^' + column_to_drop_46).columns, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']

print(X_47.shape)

(6119, 96)


In [503]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [504]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [505]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 194, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.7771767710915785


In [506]:
optuna_47 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [507]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(auc_47)

0.7808975563909775


In [508]:
X_train = X_train.values
y_train = y_train.values

In [509]:
auc_bootstrap = []

In [510]:
rs = RandomState(seed = 47)
bootstrap_auc(optuna_47, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76827227, 0.78226738])

In [511]:
t_47 = auc_bootstrap
print(t_47)

[0.7721438717532468, 0.777262314166097, 0.7773704502734109, 0.777264984193438, 0.7735896915584416, 0.7745509014012303, 0.7736778024606972, 0.7701453562884484, 0.7746670475905674, 0.770730092276145, 0.7744027148838004, 0.7754373504784691, 0.7692081766917294, 0.7703776486671223, 0.7698623333902939, 0.7787628695317841, 0.7707981779733424, 0.7826010338345866, 0.7776241028708135, 0.780585163192071, 0.769447144138756, 0.7744267451298701, 0.7734628652597404, 0.7749006749829118, 0.7785319121667806, 0.7777015336637048, 0.7798976311517429, 0.7745362162508544, 0.7697675474196854, 0.7788977059125085, 0.7809082365003418, 0.7720651059466849, 0.7731050815960355, 0.7774972765721121, 0.7684592340225564, 0.7720117053998633, 0.7722773731203008, 0.7839427225734792, 0.7773944805194806, 0.7791526935235816, 0.7775039516404647, 0.7805718130553658, 0.7754507006151743, 0.777309039644566, 0.7744721355946685, 0.7688677482057416, 0.7736350820232398, 0.7769405758714969, 0.7795772278708134, 0.7789137260765551, 0.775

In [512]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [513]:
# 48
column_to_drop_47 = '현재 주택의 면적(㎡)'

In [514]:
if not column_to_drop_47.startswith('Cat_'):
    comp_48 = comp_47.drop(column_to_drop_47, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']
else:
    comp_48 = comp_47.drop(comp_47.filter(regex='^' + column_to_drop_47).columns, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']

print(X_48.shape)

(6119, 95)


In [515]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [516]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [517]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7762940425212231


In [518]:
optuna_48 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [519]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(auc_48)

0.7798655908236499


In [520]:
X_train = X_train.values
y_train = y_train.values

In [521]:
auc_bootstrap = []

In [522]:
rs = RandomState(seed = 48)
bootstrap_auc(optuna_48, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76370759, 0.77935902])

In [523]:
t_48 = auc_bootstrap
print(t_48)

[0.7730677012132605, 0.7732866434552288, 0.7654501132091592, 0.7704470693779903, 0.7706539964969241, 0.775971355946685, 0.7679052033492823, 0.7691734663362952, 0.7704764396787422, 0.7739821855775804, 0.7760514567669172, 0.77276999316473, 0.7659988038277512, 0.7763798701298702, 0.7698810235816815, 0.7722586829289133, 0.7725977764012304, 0.7686821813055366, 0.7751142771701982, 0.7792100991114148, 0.7717260124743677, 0.773773923444976, 0.7736190618591934, 0.7769365708304854, 0.7695045497265892, 0.7716258864490773, 0.7724402447881066, 0.7733720843301437, 0.7720344006322625, 0.7763438247607655, 0.7688997885338344, 0.7731104216507176, 0.7757444036226931, 0.7731785073479152, 0.7759059402768285, 0.7614424021701983, 0.7719943502221464, 0.7703736436261107, 0.7743786846377307, 0.7703589584757349, 0.7715911760936431, 0.7733800944121667, 0.7715257604237868, 0.7806011833561175, 0.7699971697710184, 0.7745121860047846, 0.7768257646958303, 0.7684939443779905, 0.7732732933185236, 0.7800645078605605, 0.7

In [524]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [525]:
# 49
column_to_drop_48 = '부채 중 금융기관 대출금의 비중'

In [526]:
if not column_to_drop_48.startswith('Cat_'):
    comp_49 = comp_48.drop(column_to_drop_48, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']
else:
    comp_49 = comp_48.drop(comp_48.filter(regex='^' + column_to_drop_48).columns, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']

print(X_49.shape)

(6119, 94)


In [527]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [528]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [529]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 69, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 6}
0.7780407182029899


In [530]:
optuna_49 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)

In [531]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(auc_49)

0.7781340780929596


In [532]:
X_train = X_train.values
y_train = y_train.values

In [533]:
auc_bootstrap = []

In [534]:
rs = RandomState(seed = 49)
bootstrap_auc(optuna_49, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76345053, 0.77867209])

In [535]:
t_49 = auc_bootstrap
print(t_49)

[0.7758805750170881, 0.7656410201640464, 0.77276999316473, 0.7709196642173617, 0.7769539260082023, 0.7769873013499659, 0.7721345266575529, 0.7722693630382775, 0.7730930664730007, 0.775096921992481, 0.7695259099453178, 0.765958753417635, 0.7639856032125769, 0.7723321086807928, 0.772282713174983, 0.7738914046479836, 0.7717126623376623, 0.768796992481203, 0.7810764482228298, 0.7693042976760082, 0.7715364405331511, 0.7736337470095696, 0.7736244019138756, 0.772784678315106, 0.7740409261790842, 0.7677917271872864, 0.766823842276145, 0.7756442775974026, 0.7771755382775118, 0.7738446791695146, 0.7806519138755981, 0.774072966507177, 0.7705939208817498, 0.7720090353725222, 0.7732839734278878, 0.7804703520164047, 0.7736097167634998, 0.7736217318865345, 0.7676195104237867, 0.7697168169002051, 0.7738153088687627, 0.7675020292207793, 0.77076747265892, 0.7711519565960355, 0.7741797676008203, 0.7727432928913192, 0.7759673509056731, 0.7651497351332878, 0.7689705442583732, 0.7719262645249487, 0.77355231

In [536]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [537]:
# 50
column_to_drop_49 = 'Cat_가구주 종사상 지위'

In [538]:
if not column_to_drop_49.startswith('Cat_'):
    comp_50 = comp_49.drop(column_to_drop_49, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']
else:
    comp_50 = comp_49.drop(comp_49.filter(regex='^' + column_to_drop_49).columns, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']

print(X_50.shape)

(6119, 89)


In [539]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [540]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [541]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 88, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7761917879114183


In [542]:
optuna_50 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [543]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(auc_50)

0.7749954609535201


In [544]:
X_train = X_train.values
y_train = y_train.values

In [545]:
auc_bootstrap = []

In [546]:
rs = RandomState(seed = 50)
bootstrap_auc(optuna_50, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76022313, 0.77687736])

In [547]:
t_50 = auc_bootstrap
print(t_50)

[0.7734375, 0.7744254101161996, 0.7676115003417636, 0.7774558911483254, 0.773736543062201, 0.7740809765892002, 0.7658386021872865, 0.7722066173957621, 0.771936944634313, 0.7688130126452495, 0.7721612269309638, 0.7643487269309639, 0.7698436431989064, 0.7658199119958988, 0.7672750768967875, 0.7652405160628845, 0.7652325059808612, 0.771232057416268, 0.772134526657553, 0.7706713516746411, 0.7710051050922762, 0.768230946684894, 0.7704951298701298, 0.7704363892686261, 0.764022983595352, 0.7718888841421736, 0.7666636406356802, 0.7715738209159262, 0.7597108894395079, 0.765625, 0.7750768967874231, 0.7679639439507859, 0.7692482271018456, 0.7676048252734109, 0.7714242993848257, 0.7662497863978128, 0.7693763884142175, 0.7646050495557074, 0.7623648966165414, 0.7744734706083389, 0.7656436901913876, 0.768364448051948, 0.771200017088175, 0.7720517558099795, 0.7674779989747095, 0.772502990430622, 0.7759553357826384, 0.7619029818865345, 0.7699103938824333, 0.7691707963089542, 0.7705565404989747, 0.76811

In [548]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [549]:
# 51
column_to_drop_50 = 'Cat_이사 계획 첫 번째 이유'

In [550]:
if not column_to_drop_50.startswith('Cat_'):
    comp_51 = comp_50.drop(column_to_drop_50, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']
else:
    comp_51 = comp_50.drop(comp_50.filter(regex='^' + column_to_drop_50).columns, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']

print(X_51.shape)

(6119, 77)


In [551]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [552]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [553]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 88, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 5}
0.7757243382665966


In [554]:
optuna_51 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [555]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(auc_51)

0.7770740772385509


In [556]:
X_train = X_train.values
y_train = y_train.values

In [557]:
auc_bootstrap = []

In [558]:
rs = RandomState(seed = 51)
bootstrap_auc(optuna_51, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76170533, 0.77766702])

In [559]:
t_51 = auc_bootstrap
print(t_51)

[0.7725483808954203, 0.781111158578264, 0.7762463687628162, 0.7684472188995215, 0.7701613764524948, 0.7697154818865346, 0.7698463132262474, 0.7794771018455229, 0.7739421351674642, 0.7685727101845523, 0.7617053998632946, 0.768399158407382, 0.7725056604579632, 0.7692268668831168, 0.7595400076896788, 0.7740649564251538, 0.7686100905673274, 0.7748045539986329, 0.7718461637047163, 0.7678464627477786, 0.7655315490430622, 0.7680333646616541, 0.7689251537935748, 0.7678998632946001, 0.7689999145591252, 0.7672777469241285, 0.7653179468557758, 0.7664527084757349, 0.7725644010594668, 0.7689518540669857, 0.7662257561517429, 0.7672323564593302, 0.7672483766233764, 0.7734695403280929, 0.7719129143882433, 0.7667784518113465, 0.7736244019138756, 0.7716832920369106, 0.7733333689336979, 0.7740355861244017, 0.7679212235133287, 0.7680573949077238, 0.7662658065618592, 0.7722439977785371, 0.7733627392344498, 0.7707381023581681, 0.7697421821599453, 0.7664874188311688, 0.7695446001367054, 0.7676942711893371, 0

In [560]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [561]:
# 52
column_to_drop_51 = 'Cat_현재 거주 지역'

In [562]:
if not column_to_drop_51.startswith('Cat_'):
    comp_52 = comp_51.drop(column_to_drop_51, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']
else:
    comp_52 = comp_51.drop(comp_51.filter(regex='^' + column_to_drop_51).columns, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']

print(X_52.shape)

(6119, 60)


In [563]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [564]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [565]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 99, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 5}
0.7652108949156503


In [566]:
optuna_52 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [567]:
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(auc_52)

0.7713695638243334


In [568]:
X_train = X_train.values
y_train = y_train.values

In [569]:
auc_bootstrap = []

In [570]:
rs = RandomState(seed = 52)
bootstrap_auc(optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75979213, 0.77374055])

In [571]:
t_52 = auc_bootstrap
print(t_52)

[0.7636291545625427, 0.7686674961551605, 0.7603743912337662, 0.7642993314251538, 0.7766188375768968, 0.7652765614319891, 0.7639415477614491, 0.7652258309125086, 0.7689531890806564, 0.7663953028879016, 0.7669746988209158, 0.7644675431476418, 0.766179030673274, 0.7718021082535884, 0.7673404925666439, 0.7630283984107997, 0.7691214008031443, 0.7653112717874231, 0.76891313867054, 0.7718207984449762, 0.7692335419514695, 0.7663205421223515, 0.7663018519309637, 0.7686915264012304, 0.7684619040498974, 0.7621873397983595, 0.7625531335440876, 0.7665314742822966, 0.764942808014354, 0.7675647748632947, 0.7696874465994531, 0.7706833667976759, 0.7719089093472317, 0.7716739469412166, 0.762804116114149, 0.7649855284518112, 0.7755107762303487, 0.768218931561859, 0.7652872415413534, 0.7684405438311689, 0.7675327345352017, 0.7706726866883118, 0.7657144459159262, 0.7575681924982913, 0.769919738978127, 0.7729368698735475, 0.7650549491626794, 0.766307191985646, 0.7643100115345182, 0.7584252712747779, 0.76361

In [572]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [573]:
# 53
column_to_drop_52 = 'Cat_주택 마련 예상 소요연수'

In [574]:
if not column_to_drop_52.startswith('Cat_'):
    comp_53 = comp_52.drop(column_to_drop_52, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']
else:
    comp_53 = comp_52.drop(comp_52.filter(regex='^' + column_to_drop_52).columns, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']

print(X_53.shape)

(6119, 54)


In [575]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [576]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [577]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 148, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 7}
0.762811041828396


In [578]:
optuna_53 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [579]:
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(auc_53)

0.770515155075188


In [580]:
X_train = X_train.values
y_train = y_train.values

In [581]:
auc_bootstrap = []

In [582]:
rs = RandomState(seed = 53)
bootstrap_auc(optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75764476, 0.77090785])

In [583]:
t_53 = auc_bootstrap
print(t_53)

[0.7682483018626111, 0.7620031079118249, 0.7661496603725222, 0.7663792827238551, 0.7619416972829801, 0.7561076875427205, 0.7661923808099794, 0.7642512709330143, 0.7648253268113465, 0.7667904669343814, 0.757231769053315, 0.7606734342959671, 0.7596988743164729, 0.7658586273923446, 0.7666142451298702, 0.765706435833903, 0.7640910692925496, 0.7625291032980177, 0.7594852721291866, 0.7661576704545455, 0.7643473919172933, 0.7702748526144907, 0.7703789836807929, 0.770042560235817, 0.7589459266062886, 0.7652872415413535, 0.7666035650205059, 0.7660775696343131, 0.7673404925666438, 0.7629776678913192, 0.7687315768113465, 0.7620912188140806, 0.7701386812200958, 0.760267590140123, 0.7629029071257689, 0.765572934466849, 0.7665261342276145, 0.7628762068523581, 0.7663338922590568, 0.7686541460184554, 0.7622968109193436, 0.7661630105092276, 0.7637599859022557, 0.7625317733253589, 0.7566817434210525, 0.7664914238721805, 0.7701573714114831, 0.7712654327580314, 0.7638107164217363, 0.7679545988550922, 0.76

In [584]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [585]:
# 54
column_to_drop_53 = 'Cat_가구주 최종 학력'

In [586]:
if not column_to_drop_53.startswith('Cat_'):
    comp_54 = comp_53.drop(column_to_drop_53, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']
else:
    comp_54 = comp_53.drop(comp_53.filter(regex='^' + column_to_drop_53).columns, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']

print(X_54.shape)

(6119, 51)


In [587]:
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [588]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [589]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 174, 'max_depth': 7, 'min_samples_split': 10, 'min_samples_leaf': 4}
0.7604946618920024


In [590]:
optuna_54 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [591]:
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(auc_54)

0.768675506237184


In [592]:
X_train = X_train.values
y_train = y_train.values

In [593]:
auc_bootstrap = []

In [594]:
rs = RandomState(seed = 54)
bootstrap_auc(optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75605622, 0.76974352])

In [595]:
t_54 = auc_bootstrap
print(t_54)

[0.7598483958475735, 0.7645423039131921, 0.7573759505297335, 0.7629162572624744, 0.7649855284518114, 0.7616026038106629, 0.7603396808783324, 0.7646010445146959, 0.7637519758202325, 0.7664406933526999, 0.7708302183014354, 0.7628815469070405, 0.7644808932843474, 0.7595253225393029, 0.7628842169343815, 0.7657651764354068, 0.765041599025974, 0.7576376132091595, 0.7610819484791524, 0.7665074440362272, 0.7570101567840055, 0.7676875961209844, 0.7608042656356802, 0.7630470886021873, 0.7664326832706765, 0.7622140400717704, 0.7639362077067671, 0.7636344946172249, 0.7619390272556391, 0.7677196364490771, 0.7670494595864662, 0.7589192263328778, 0.7675594348086128, 0.7626225542549556, 0.7635650739063569, 0.764739885936432, 0.7596000833048531, 0.7625210932159945, 0.766309862012987, 0.7632927311175667, 0.7648680472488039, 0.7621472893882433, 0.7712307224025975, 0.7659307181305536, 0.7585507625598087, 0.7620057779391661, 0.7606841144053316, 0.7633488016917293, 0.7651243698735477, 0.7617734855604921, 0.

In [596]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [597]:
# 55
column_to_drop_54 = 'Cat_이사 계획 중인 주택의 점유형태'

In [598]:
if not column_to_drop_54.startswith('Cat_'):
    comp_55 = comp_54.drop(column_to_drop_54, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']
else:
    comp_55 = comp_54.drop(comp_54.filter(regex='^' + column_to_drop_54).columns, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']

print(X_55.shape)

(6119, 31)


In [599]:
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [600]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [601]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7617655406138615


In [602]:
optuna_55 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [603]:
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(auc_55)

0.7594625768967873


In [604]:
X_train = X_train.values
y_train = y_train.values

In [605]:
auc_bootstrap = []

In [606]:
rs = RandomState(seed = 55)
bootstrap_auc(optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74024866, 0.75989025])

In [607]:
t_55 = auc_bootstrap
print(t_55)

[0.7561704331852357, 0.7522668532125769, 0.753273453520164, 0.7485154647983597, 0.7523896744702666, 0.7564614661654135, 0.7514338046821599, 0.7497116370471634, 0.7472739020847572, 0.7520692711893369, 0.7515379357484622, 0.7528969796650716, 0.7456184851332878, 0.7524056946343132, 0.7469721889952152, 0.7503551136363636, 0.7502376324333561, 0.7478212576896788, 0.7581515934723171, 0.7493698735475052, 0.7497516874572796, 0.7485875555365687, 0.7550783920027341, 0.7502242822966507, 0.7449216079972659, 0.7507182373547505, 0.7540904818865345, 0.7463020121326044, 0.7520185406698565, 0.7451832706766917, 0.7444997436773753, 0.7490548103212575, 0.7492283620984278, 0.7595613679084073, 0.7405667934039644, 0.7514872052289815, 0.7492256920710869, 0.7470816601161996, 0.7536552674299385, 0.7505980861244019, 0.7437574760765551, 0.7490868506493508, 0.7543067541011621, 0.7514765251196173, 0.7539836807928914, 0.7541679126794258, 0.7504485645933014, 0.7510920411825017, 0.7553160244360904, 0.7504886150034175, 

In [608]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [609]:
column_to_drop_55 = 'Cat_이사 계획 중인 거주 지역'

In [610]:
if not column_to_drop_55.startswith('Cat_'):
    comp_56 = comp_55.drop(column_to_drop_55, axis=1)
    X_56 = comp_56.drop('target', axis=1)
    y_56 = comp_56['target']
else:
    comp_56 = comp_55.drop(comp_55.filter(regex='^' + column_to_drop_55).columns, axis=1)
    X_56 = comp_56.drop('target', axis=1)
    y_56 = comp_56['target']

print(X_56.shape)

(6119, 24)


In [611]:
X_train, X_test, y_train, y_test = train_test_split(X_56, y_56, test_size=0.2, shuffle=True, stratify=y_56, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [612]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [613]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 166, 'max_depth': 7, 'min_samples_split': 10, 'min_samples_leaf': 3}
0.7569825290695248


In [614]:
optuna_56 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_56.fit(X_train, y_train)

In [615]:
optuna_proba_56 = optuna_56.predict_proba(X_test)[:, 1]
auc_56 = roc_auc_score(y_test, optuna_proba_56)
print(auc_56)

0.7531212619617226


In [616]:
X_train = X_train.values
y_train = y_train.values

In [617]:
auc_bootstrap = []

In [618]:
rs = RandomState(seed = 56)
bootstrap_auc(optuna_56, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74146112, 0.75467849])

In [619]:
t_56 = auc_bootstrap
print(t_56)

[0.7496528964456596, 0.7462859919685578, 0.7468013072453863, 0.7550330015379357, 0.7504405545112782, 0.7526326469583048, 0.7480241797676009, 0.7474180835611757, 0.7418697667464116, 0.7493885637388928, 0.749433954203691, 0.745290071770335, 0.7447854366028709, 0.7552919941900205, 0.7515699760765551, 0.7457199461722487, 0.7460510295625428, 0.7538421693438141, 0.7498851888243335, 0.743589264354067, 0.7491188909774437, 0.7489159688995215, 0.7513430237525632, 0.7503551136363638, 0.7460510295625428, 0.7499412593984962, 0.7506354665071772, 0.7465156143198908, 0.7423984321599453, 0.7491536013328777, 0.7547526486671224, 0.7484487141148325, 0.7487424171223515, 0.7465770249487356, 0.7501281613123718, 0.75173017771702, 0.7467345565618593, 0.751767558099795, 0.746026999316473, 0.7420326384142174, 0.748229771872864, 0.744761406356801, 0.7446759654818865, 0.750032040328093, 0.7423129912850308, 0.7490200999658235, 0.7440698692754615, 0.7470683099794941, 0.7532841336295282, 0.7513590439166097, 0.7447934

In [620]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [621]:
column_to_drop_56 = 'Cat_현재 주택의 유형'

In [622]:
if not column_to_drop_56.startswith('Cat_'):
    comp_57 = comp_56.drop(column_to_drop_56, axis=1)
    X_57 = comp_57.drop('target', axis=1)
    y_57 = comp_57['target']
else:
    comp_57 = comp_56.drop(comp_56.filter(regex='^' + column_to_drop_56).columns, axis=1)
    X_57 = comp_57.drop('target', axis=1)
    y_57 = comp_57['target']

print(X_57.shape)

(6119, 13)


In [623]:
X_train, X_test, y_train, y_test = train_test_split(X_57, y_57, test_size=0.2, shuffle=True, stratify=y_57, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [624]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [625]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 135, 'max_depth': 9, 'min_samples_split': 10, 'min_samples_leaf': 4}
0.7400541740749089


In [626]:
optuna_57 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_57.fit(X_train, y_train)

In [627]:
optuna_proba_57 = optuna_57.predict_proba(X_test)[:, 1]
auc_57 = roc_auc_score(y_test, optuna_proba_57)
print(auc_57)

0.7347381237183869


In [628]:
X_train = X_train.values
y_train = y_train.values

In [629]:
auc_bootstrap = []

In [630]:
rs = RandomState(seed = 57)
bootstrap_auc(optuna_57, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72749421, 0.73768597])

In [631]:
t_57 = auc_bootstrap
print(t_57)

[0.7289735346889952, 0.7289788747436774, 0.7367246240601505, 0.7303299085782639, 0.7333310193096377, 0.7299213943950786, 0.7339558057074504, 0.7324258800410116, 0.7351840182843473, 0.735974346377307, 0.7398405459671907, 0.7314059295967191, 0.7240820446001367, 0.7374775717703349, 0.7336914730006835, 0.7363508202323992, 0.7371438183526999, 0.7338676948051949, 0.7362627093301436, 0.7345165114490771, 0.7375923829460013, 0.732772983595352, 0.7331814977785373, 0.7285810406698565, 0.731806433697881, 0.7330293062200957, 0.7271685962064252, 0.7335339413875598, 0.7360037166780589, 0.7361265379357486, 0.7325273410799727, 0.729966784859877, 0.728898773923445, 0.7300548957621326, 0.7371491584073822, 0.731870514354067, 0.7371037679425837, 0.724186175666439, 0.7380970181134655, 0.732740943267259, 0.7337528836295283, 0.7297264823991798, 0.734372329972659, 0.7340919771018456, 0.7324472402597403, 0.7364656314080656, 0.7371304682159946, 0.7349196855775803, 0.7279802845181135, 0.7335019010594669, 0.733557

In [632]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [633]:
column_to_drop_57 = 'Cat_현재 주택의 점유형태'

In [634]:
if not column_to_drop_57.startswith('Cat_'):
    comp_58 = comp_57.drop(column_to_drop_57, axis=1)
    X_58 = comp_58.drop('target', axis=1)
    y_58 = comp_58['target']
else:
    comp_58 = comp_57.drop(comp_57.filter(regex='^' + column_to_drop_57).columns, axis=1)
    X_58 = comp_58.drop('target', axis=1)
    y_58 = comp_58['target']

print(X_58.shape)

(6119, 9)


In [635]:
X_train, X_test, y_train, y_test = train_test_split(X_58, y_58, test_size=0.2, shuffle=True, stratify=y_58, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [636]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [637]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 7}
0.7172639170610773


In [638]:
optuna_58 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_58.fit(X_train, y_train)

In [639]:
optuna_proba_58 = optuna_58.predict_proba(X_test)[:, 1]
auc_58 = roc_auc_score(y_test, optuna_proba_58)
print(auc_58)

0.7064251537935747


In [640]:
X_train = X_train.values
y_train = y_train.values

In [641]:
auc_bootstrap = []

In [642]:
rs = RandomState(seed = 58)
bootstrap_auc(optuna_58, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.70295679, 0.70648122])

In [643]:
t_58 = auc_bootstrap
print(t_58)

[0.7036696855775801, 0.7057362867395761, 0.7064812243677374, 0.7064251537935747, 0.7058964883800409, 0.7039687286397811, 0.7043585526315788, 0.7030689294258372, 0.7057362867395761, 0.7043585526315788, 0.7057362867395761, 0.7037017259056731, 0.7057362867395761, 0.7064251537935747, 0.7057362867395761, 0.7035148239917975, 0.7055814251537935, 0.7064251537935747, 0.7043585526315788, 0.7043585526315788, 0.7036136150034176, 0.7043585526315788, 0.7039687286397811, 0.7058243976418317, 0.7051942711893369, 0.7033279220779219, 0.7039687286397811, 0.7039687286397811, 0.7034400632262474, 0.7064812243677374, 0.7026150247778535, 0.7043585526315788, 0.7057362867395761, 0.7051942711893369, 0.7055814251537935, 0.7064812243677374, 0.7057362867395761, 0.7058243976418317, 0.7059525589542036, 0.7030689294258372, 0.7043585526315788, 0.7036696855775801, 0.7064812243677374, 0.7064251537935747, 0.7057362867395761, 0.7043585526315788, 0.7064251537935747, 0.7064812243677374, 0.7039687286397811, 0.7057683270676691,

In [644]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc