In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000):
    for b in range(nsamples):
        idx = rs.randint(X_train.shape[0], size=X_train.shape[0])
        clf.fit(X_train[idx], y_train[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test.ravel(), pred.ravel())
        auc_bootstrap.append(roc_auc)
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [4]:
import decimal
context = decimal.getcontext()

context.rounding = decimal.ROUND_HALF_UP

In [5]:
class EarlyStoppingCallback(object):
    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [6]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [7]:
신혼가구 = pd.read_csv('신혼가구_변수추가.csv', encoding='cp949')
신혼가구.rename(columns = {'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?':'target'}, inplace=True)

In [8]:
신혼가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'Cat_주택 마련 예상 소요연수','Cat_남편/아내의 부모님과 동거 의향','Cat_가족계획 시 중요 고려 사항 1순위',
    'target'    
]

In [9]:
cat = 신혼가구.select_dtypes(include = 'object')
num = 신혼가구.select_dtypes(exclude = 'object')
num_신혼 = num.drop('target',axis=1)
target = 신혼가구.target

In [10]:
scaler=RobustScaler()
scaler.fit(num_신혼)
num_scaled_신혼=scaler.transform(num_신혼)
num_df_scaled_신혼=pd.DataFrame(data=num_scaled_신혼, columns=num_신혼.columns)

In [11]:
enc = OneHotEncoder()
enc.fit(cat)

X_cat = enc.transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(X_cat, columns= new_feature_names)

In [12]:
comp =pd.concat([num_df_scaled_신혼, target,cat2],axis=1)

In [13]:
X=comp.drop('target', axis = 1)
y=comp.target
X.shape

(6119, 221)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [15]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [16]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7886585029925125


In [17]:
optuna_0 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_0.fit(X_train, y_train)

In [18]:
optuna_proba_0 = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_proba_0)
print(auc_0)

0.7765213815789473


In [19]:
X_train = X_train.values
y_train = y_train.values

In [20]:
auc_bootstrap = []

In [21]:
rs = RandomState(seed = 0)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76491951, 0.78174683])

In [22]:
t_0 = auc_bootstrap
print(t_0)

[0.7781314080656185, 0.7736083817498292, 0.7778003246753247, 0.7762009782980177, 0.7744521103896104, 0.7726925623718388, 0.7765480818523582, 0.7819575572453862, 0.7737205228981545, 0.7766869232740943, 0.7743746795967191, 0.7722279776144908, 0.768692861414901, 0.7694351290157211, 0.7701320061517429, 0.7746096420027342, 0.7717874231032126, 0.7687409219070404, 0.7728100435748462, 0.7598096804511278, 0.7704497394053316, 0.7694938696172249, 0.7770180066643881, 0.7815383629528365, 0.7723080784347232, 0.7730316558441559, 0.7751009270334928, 0.7740729665071769, 0.7749700956937798, 0.7702147769993166, 0.7752157382091593, 0.7743292891319208, 0.77667624316473, 0.778078007518797, 0.7759179553998634, 0.770097295796309, 0.7713708988380041, 0.7730850563909775, 0.7752958390293916, 0.7796132732399179, 0.7732719583048531, 0.7693229878673958, 0.7687088815789473, 0.7758645548530417, 0.7777843045112782, 0.7732586081681477, 0.7765934723171566, 0.7828253161312372, 0.7729088345864662, 0.7722840481886535, 0.77

In [23]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [14]:
column_to_drop = 'Cat_가구주 동거 여부'

In [15]:
if not column_to_drop.startswith('Cat_'):
    comp_1 = comp.drop(column_to_drop, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']
else:
    comp_1 = comp.drop(comp.filter(regex='^' + column_to_drop).columns, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']

print(X_1.shape)

(6119, 219)


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [27]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [28]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 107, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 5}
0.788383041594671


In [29]:
optuna_1 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [30]:
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(auc_1)

0.7822566003075871


In [31]:
X_train = X_train.values
y_train = y_train.values

In [32]:
auc_bootstrap = []

In [33]:
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76491457, 0.78314118])

In [34]:
t_1 = auc_bootstrap
print(t_1)

[0.7659267130895421, 0.7703055579289132, 0.7770500469924813, 0.7786600734791524, 0.7753492395762134, 0.7710398154477102, 0.7825609834244702, 0.7705431903622693, 0.7734909005468217, 0.774534881237184, 0.7739554853041695, 0.7672403665413534, 0.7663085269993164, 0.7738273239917977, 0.7781794685577581, 0.7773330698906357, 0.7798028451811345, 0.7747778537252222, 0.7663806177375256, 0.7761182074504442, 0.7750955869788105, 0.7801926691729324, 0.7715978511619959, 0.7712560876623378, 0.7737846035543403, 0.7722119574504442, 0.7784544813738893, 0.7762303485987696, 0.7683911483253589, 0.7765694420710868, 0.7755575017088175, 0.7738566942925496, 0.7781180579289131, 0.7689999145591251, 0.780315490430622, 0.7739608253588517, 0.778812265037594, 0.7796399735133287, 0.7790285372522214, 0.7680573949077238, 0.764225905673274, 0.7760033962747779, 0.7726498419343814, 0.7813247607655502, 0.7763077793916608, 0.7789297462406015, 0.7693443480861244, 0.7749647556390977, 0.7758912551264524, 0.7669706937799042, 0.7

In [35]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [16]:
column_to_drop_1 = '부채 중 임대 보증금의 비중'

In [17]:
if not column_to_drop_1.startswith('Cat_'):
    comp_2 = comp_1.drop(column_to_drop_1, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']
else:
    comp_2 = comp_1.drop(comp_1.filter(regex='^' + column_to_drop_1).columns, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']

print(X_2.shape)

(6119, 218)


In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [39]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [40]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 88, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 5}
0.7897937378442224


In [41]:
optuna_2 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [42]:
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(auc_2)

0.78268380468216


In [43]:
X_train = X_train.values
y_train = y_train.values

In [44]:
auc_bootstrap = []

In [45]:
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76422824, 0.7829181 ])

In [46]:
t_2 = auc_bootstrap
print(t_2)

[0.7690159347231716, 0.7753465695488722, 0.7753492395762133, 0.7699050538277512, 0.7726391618250171, 0.774769843643199, 0.7791113080997949, 0.7736270719412167, 0.7716165413533834, 0.7717580528024608, 0.7775066216678059, 0.7747591635338347, 0.7767029434381407, 0.7617614704374573, 0.774078306561859, 0.7742198180109363, 0.7657771915584415, 0.7801259184894054, 0.7789430963773069, 0.7684071684894053, 0.7810203776486673, 0.7712373974709501, 0.7696941216678059, 0.7677930622009569, 0.7791006279904307, 0.7760140763841422, 0.7817386150034177, 0.7758245044429255, 0.7776881835269993, 0.7684418788448393, 0.7702174470266575, 0.7793996710526316, 0.7721612269309637, 0.7791860688653452, 0.7726765422077921, 0.7763638499658236, 0.7799043062200957, 0.7727593130553656, 0.783858616712235, 0.7747351332877648, 0.7701400162337662, 0.7688130126452495, 0.7700812756322624, 0.7678998632946001, 0.773870044429255, 0.7724495898838004, 0.7758965951811347, 0.7721719070403281, 0.772604451469583, 0.7693550281954887, 0.77

In [47]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [18]:
column_to_drop_2 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [19]:
if not column_to_drop_2.startswith('Cat_'):
    comp_3 = comp_2.drop(column_to_drop_2, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']
else:
    comp_3 = comp_2.drop(comp_2.filter(regex='^' + column_to_drop_2).columns, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']

print(X_3.shape)

(6119, 217)


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [51]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [52]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 97, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 6}
0.7862544762477149


In [53]:
optuna_3 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [54]:
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(auc_3)

0.7811431989063569


In [55]:
X_train = X_train.values
y_train = y_train.values

In [56]:
auc_bootstrap = []

In [57]:
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76330695, 0.78263488])

In [58]:
t_3 = auc_bootstrap
print(t_3)

[0.7686768412508544, 0.7694725093984962, 0.7715417805878332, 0.7799336765208476, 0.7725804212235134, 0.7780165968899522, 0.7685273197197539, 0.7789804767600821, 0.7752237482911825, 0.7731971975393028, 0.7790712576896788, 0.768161525974026, 0.7747938738892687, 0.7675020292207791, 0.7733947795625427, 0.7651497351332877, 0.771835483595352, 0.7776748333902939, 0.7672964371155161, 0.775239768455229, 0.7700465652768284, 0.776038106630212, 0.7751009270334928, 0.7862936816473001, 0.7735603212576897, 0.7824595223855092, 0.7739875256322626, 0.7708635936431989, 0.7667143711551607, 0.7634863080997949, 0.7722146274777854, 0.7683457578605605, 0.782051008202324, 0.7695499401913876, 0.7821925196514012, 0.7772716592617909, 0.7764519608680793, 0.7677423316814764, 0.7735149307928914, 0.765192455570745, 0.777143497949419, 0.776270399008886, 0.7811992694805194, 0.7709890849282297, 0.7788042549555708, 0.7770019865003419, 0.7710051050922762, 0.7699344241285031, 0.7721104964114832, 0.7778937756322626, 0.76908

In [59]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [20]:
column_to_drop_3 = 'Cat_기초생활보장 수급가구 여부'

In [21]:
if not column_to_drop_3.startswith('Cat_'):
    comp_4 = comp_3.drop(column_to_drop_3, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']
else:
    comp_4 = comp_3.drop(comp_3.filter(regex='^' + column_to_drop_3).columns, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']

print(X_4.shape)

(6119, 215)


In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [63]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [64]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7921351597258742


In [65]:
optuna_4 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [66]:
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(auc_4)

0.7822138798701298


In [67]:
X_train = X_train.values
y_train = y_train.values

In [68]:
auc_bootstrap = []

In [69]:
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76405449, 0.78249103])

In [70]:
t_4 = auc_bootstrap
print(t_4)

[0.7759633458646618, 0.7719850051264525, 0.7705218301435407, 0.7721291866028708, 0.7690826854066986, 0.7757604237867396, 0.7762303485987696, 0.7637292805878332, 0.7762223385167464, 0.7667357313738893, 0.7695045497265891, 0.7770900974025975, 0.7765961423444976, 0.7717206724196857, 0.7724869702665755, 0.7797120642515379, 0.7701400162337663, 0.7721078263841422, 0.7684338687628162, 0.7695926606288448, 0.7728500939849624, 0.7648693822624744, 0.7763398197197539, 0.7701800666438825, 0.7677583518455229, 0.7763638499658236, 0.7720250555365686, 0.7781687884483937, 0.7771488380041012, 0.7808708561175667, 0.7800965481886535, 0.774940725393028, 0.7739100948393711, 0.7693149777853725, 0.7796800239234449, 0.7718995642515379, 0.7731411269651401, 0.7684872693096376, 0.7860240088858511, 0.7742999188311689, 0.7779685363978128, 0.7740996667805878, 0.7741076768626111, 0.7765026913875599, 0.7692268668831168, 0.75991381151743, 0.7747992139439508, 0.7802914601845523, 0.7791246582365003, 0.7824435022214629, 0.

In [71]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [22]:
column_to_drop_4 = 'Cat_가구주 장애 여부'

In [23]:
if not column_to_drop_4.startswith('Cat_'):
    comp_5 = comp_4.drop(column_to_drop_4, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']
else:
    comp_5 = comp_4.drop(comp_4.filter(regex='^' + column_to_drop_4).columns, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']

print(X_5.shape)

(6119, 213)


In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [75]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [76]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 113, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7861167455487942


In [77]:
optuna_5 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [78]:
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(auc_5)

0.7815971035543405


In [79]:
X_train = X_train.values
y_train = y_train.values

In [80]:
auc_bootstrap = []

In [81]:
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76400449, 0.78202618])

In [82]:
t_5 = auc_bootstrap
print(t_5)

[0.771637901572112, 0.7673631877990431, 0.77677503417635, 0.7681027853725222, 0.772436239747095, 0.7703215780929598, 0.7797521146616542, 0.7756162423103213, 0.7748632946001367, 0.773434829972659, 0.777642793062201, 0.7751943779904307, 0.77553881151743, 0.7712854579630897, 0.7757684338687628, 0.774438760252905, 0.7708716037252221, 0.7759366455912509, 0.771629891490089, 0.7778377050580999, 0.7747297932330828, 0.7631765849282297, 0.7746096420027341, 0.7751596676349966, 0.7694271189336979, 0.7796373034859878, 0.7678224325017089, 0.7672430365686944, 0.7705271701982228, 0.7763772001025291, 0.7685112995557075, 0.7683243976418318, 0.7696647513670541, 0.7737418831168832, 0.7676008202323992, 0.7720837961380724, 0.7761582578605606, 0.7737125128161313, 0.7681668660287082, 0.7740035457963089, 0.7689465140123035, 0.7769539260082023, 0.7720891361927547, 0.7773944805194805, 0.7764012303485988, 0.7705138200615175, 0.7770740772385509, 0.7758031442241968, 0.7773998205741626, 0.7777949846206424, 0.7608643

In [83]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [24]:
column_to_drop_5 = '소득 중 재산소득의 비중(월평균)'

In [25]:
if not column_to_drop_5.startswith('Cat_'):
    comp_6 = comp_5.drop(column_to_drop_5, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']
else:
    comp_6 = comp_5.drop(comp_5.filter(regex='^' + column_to_drop_5).columns, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']

print(X_6.shape)

(6119, 212)


In [86]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [87]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [88]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 96, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7907995893120979


In [89]:
optuna_6 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [90]:
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(auc_6)

0.7799043062200958


In [91]:
X_train = X_train.values
y_train = y_train.values

In [92]:
auc_bootstrap = []

In [93]:
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76409167, 0.78218758])

In [94]:
t_6 = auc_bootstrap
print(t_6)

[0.7670214242993849, 0.773472210355434, 0.7688103426179085, 0.7720143754272044, 0.7742518583390294, 0.7771488380041013, 0.7706072710184553, 0.7693336679767602, 0.7814449119958989, 0.7741343771360218, 0.7748926649008885, 0.7728073735475052, 0.7677022812713602, 0.7682336167122352, 0.7725884313055366, 0.7758645548530417, 0.7726525119617225, 0.7725457108680793, 0.769298957621326, 0.7758805750170882, 0.7730049555707451, 0.7757577537593985, 0.7796907040328093, 0.7680440447710185, 0.7760968472317156, 0.7725884313055366, 0.7735496411483254, 0.7720410757006151, 0.769296287593985, 0.7731811773752563, 0.7706766917293233, 0.783954737696514, 0.7716379015721121, 0.7708101930963773, 0.7700812756322625, 0.7619296821599453, 0.7807106544771019, 0.7723855092276144, 0.786456553315106, 0.7697234919685578, 0.7738833945659604, 0.7681428357826384, 0.7597562799043063, 0.7648320018796992, 0.7714403195488722, 0.770970394736842, 0.7736751324333561, 0.7773170497265891, 0.7731891874572797, 0.774339969241285, 0.7789

In [95]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [26]:
column_to_drop_6 = 'Cat_가구주 주민등록상 등재 여부'

In [27]:
if not column_to_drop_6.startswith('Cat_'):
    comp_7 = comp_6.drop(column_to_drop_6, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']
else:
    comp_7 = comp_6.drop(comp_6.filter(regex='^' + column_to_drop_6).columns, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']

print(X_7.shape)

(6119, 210)


In [98]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [99]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [100]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 96, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.7880908855666574


In [101]:
optuna_7 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [102]:
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(auc_7)

0.7796186132946001


In [103]:
X_train = X_train.values
y_train = y_train.values

In [104]:
auc_bootstrap = []

In [105]:
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7637483 , 0.78199113])

In [106]:
t_7 = auc_bootstrap
print(t_7)

[0.7785559424128503, 0.7781180579289132, 0.7641511449077238, 0.7733840994531784, 0.7748179041353382, 0.7706580015379356, 0.7760274265208476, 0.7688744232740943, 0.7692669172932329, 0.7673177973342447, 0.7834367523923446, 0.7770740772385509, 0.7724469198564592, 0.7702708475734792, 0.7789350862952836, 0.7755494916267943, 0.7701213260423787, 0.7724602699931647, 0.7636571898496239, 0.772163896958305, 0.7751196172248803, 0.7722306476418319, 0.7708689336978811, 0.770401678913192, 0.7707434424128503, 0.7718381536226931, 0.780176649008886, 0.7760381066302119, 0.7680306946343131, 0.7709570446001368, 0.7761368976418318, 0.7776801734449761, 0.7742945787764867, 0.7687516020164047, 0.7752370984278879, 0.7727753332194122, 0.7687435919343814, 0.7779712064251537, 0.7698062628161312, 0.7706259612098428, 0.7742465182843473, 0.7749620856117567, 0.7703616285030758, 0.7702441473000683, 0.7664660586124401, 0.7760327665755296, 0.7766869232740943, 0.7699184039644567, 0.7732319078947368, 0.7736911525974026, 0.

In [107]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [28]:
column_to_drop_7 = '자산 중 부동산 자산의 비중'

In [29]:
if not column_to_drop_7.startswith('Cat_'):
    comp_8 = comp_7.drop(column_to_drop_7, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']
else:
    comp_8 = comp_7.drop(comp_7.filter(regex='^' + column_to_drop_7).columns, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']

print(X_8.shape)

(6119, 209)


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [31]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [112]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7864965483852119


In [32]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7864965483852119


In [33]:
optuna_8 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [34]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(auc_8)

0.7782515592959671


In [35]:
X_train = X_train.values
y_train = y_train.values

In [36]:
auc_bootstrap = []

In [37]:
rs = RandomState(seed = 8)
bootstrap_auc(optuna_8, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76141136, 0.78076986])

In [38]:
t_8 = auc_bootstrap
print(t_8)

[0.7747538234791523, 0.7773971505468218, 0.7786841037252221, 0.7659614234449761, 0.7711172462406014, 0.7757577537593985, 0.7724549299384824, 0.769298957621326, 0.7706473214285713, 0.7755361414900889, 0.7820483381749828, 0.7697261619958988, 0.7758698949077238, 0.7824381621667805, 0.7766415328092959, 0.7757791139781272, 0.7774452110389609, 0.7707140721120984, 0.7624716977101845, 0.7764786611414901, 0.7628081211551606, 0.7691280758714969, 0.7621059039644567, 0.7709089841079972, 0.7643594070403281, 0.7609738123718387, 0.7707461124401913, 0.7594358766233765, 0.7673418275803146, 0.7739688354408749, 0.776275739063568, 0.7679052033492823, 0.7725243506493507, 0.7698062628161313, 0.7682496368762817, 0.762920262303486, 0.7766281826725905, 0.7723828392002734, 0.7622073650034176, 0.7754507006151743, 0.773370749316473, 0.7741236970266576, 0.7624770377648669, 0.768158855946685, 0.7806919642857142, 0.764522278708134, 0.762720010252905, 0.7707007219753931, 0.7687916524265208, 0.7702121069719755, 0.7802

In [39]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [30]:
column_to_drop_8 = '소득 중 사적이전소득의 비중(월평균)'

In [31]:
if not column_to_drop_8.startswith('Cat_'):
    comp_9 = comp_8.drop(column_to_drop_8, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']
else:
    comp_9 = comp_8.drop(comp_8.filter(regex='^' + column_to_drop_8).columns, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']

print(X_9.shape)

(6119, 208)


In [121]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [122]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [123]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7886877185953138


In [124]:
optuna_9 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [125]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(auc_9)

0.7830549384825701


In [126]:
X_train = X_train.values
y_train = y_train.values

In [127]:
auc_bootstrap = []

In [128]:
rs = RandomState(seed = 9)
bootstrap_auc(optuna_9, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76309615, 0.7822594 ])

In [129]:
t_9 = auc_bootstrap
print(t_9)

[0.7749060150375939, 0.7753999700956937, 0.7751436474709501, 0.7737552332535886, 0.774307928913192, 0.7728794642857141, 0.7766628930280246, 0.7735656613123717, 0.7655956296992481, 0.7759766960013671, 0.7765614319890637, 0.7720784560833902, 0.7724896402939168, 0.7652939166097061, 0.7647786013328777, 0.7762196684894053, 0.776171607997266, 0.7743746795967192, 0.7784598214285713, 0.7715978511619959, 0.7732132177033492, 0.7709169941900205, 0.7801285885167464, 0.776475991114149, 0.7770340268284347, 0.7756963431305536, 0.7683297376965141, 0.7773357399179768, 0.7757524137047163, 0.7788282852016405, 0.7705218301435408, 0.7700438952494875, 0.7733280288790159, 0.7704497394053316, 0.7747484834244702, 0.7750368463773069, 0.7701453562884484, 0.7809215866370471, 0.7807346847231716, 0.7730850563909774, 0.7763077793916608, 0.7817359449760766, 0.7750929169514696, 0.7774158407382091, 0.7682683270676691, 0.7672350264866712, 0.7695819805194806, 0.7716913021189336, 0.7714776999316473, 0.7740836466165413, 0.

In [130]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [32]:
column_to_drop_9 = 'Cat_현재 주택의 위치'

In [33]:
if not column_to_drop_9.startswith('Cat_'):
    comp_10 = comp_9.drop(column_to_drop_9, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']
else:
    comp_10 = comp_9.drop(comp_9.filter(regex='^' + column_to_drop_9).columns, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']

print(X_10.shape)

(6119, 205)


In [133]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [134]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [135]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 9, 'min_samples_split': 9, 'min_samples_leaf': 4}
0.7901944089683554


In [136]:
optuna_10 = RandomForestClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [137]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(auc_10)

0.7854099025974026


In [138]:
X_train = X_train.values
y_train = y_train.values

In [139]:
auc_bootstrap = []

In [140]:
rs = RandomState(seed = 10)
bootstrap_auc(optuna_10, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76485683, 0.78249283])

In [141]:
t_10 = auc_bootstrap
print(t_10)

[0.7749247052289815, 0.7757283834586466, 0.7759473257006152, 0.7685860603212576, 0.777642793062201, 0.772535030758715, 0.7653419771018455, 0.7725270206766918, 0.7693630382775121, 0.7808761961722488, 0.7723828392002734, 0.7702334671907041, 0.7760194164388243, 0.7720143754272043, 0.7799443566302119, 0.7739100948393711, 0.7670561346548188, 0.7726097915242652, 0.7734802204374572, 0.7773517600820232, 0.779282189849624, 0.7726097915242651, 0.774174427546138, 0.7750208262132603, 0.7671762858851674, 0.7781661184210527, 0.7670214242993848, 0.7763237995557075, 0.7763211295283664, 0.7726765422077921, 0.7679559338687627, 0.7725644010594668, 0.7755441515721122, 0.7742678785030759, 0.7785239020847574, 0.7754320104237866, 0.7698569933356118, 0.7745962918660286, 0.7744948308270676, 0.7755227913533833, 0.7647839413875598, 0.7695713004101162, 0.7792341293574846, 0.774537551264525, 0.7739367951127819, 0.7731277768284348, 0.7729115046138073, 0.7789083860218728, 0.7738967447026659, 0.7732559381408066, 0.77

In [142]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [34]:
column_to_drop_10 = '부채 중 비금융기관 대출금의 비중'

In [35]:
if not column_to_drop_10.startswith('Cat_'):
    comp_11 = comp_10.drop(column_to_drop_10, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']
else:
    comp_11 = comp_10.drop(comp_10.filter(regex='^' + column_to_drop_10).columns, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']

print(X_11.shape)

(6119, 204)


In [145]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [146]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [147]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 3}
0.7878822026895049


In [148]:
optuna_11 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [149]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(auc_11)

0.7786040029049899


In [150]:
X_train = X_train.values
y_train = y_train.values

In [151]:
auc_bootstrap = []

In [152]:
rs = RandomState(seed = 11)
bootstrap_auc(optuna_11, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76447949, 0.78304419])

In [153]:
t_11 = auc_bootstrap
print(t_11)

[0.7755975521189337, 0.7762970992822966, 0.7744574504442925, 0.7718621838687628, 0.7684578990088857, 0.7757897940874915, 0.7811298487696514, 0.7766548829460013, 0.7815383629528366, 0.76639129784689, 0.7639695830485305, 0.777474581339713, 0.783490152939166, 0.7729462149692414, 0.7711279263499657, 0.7830415883458647, 0.7836877349624061, 0.7739261150034176, 0.7774185107655502, 0.7737712534176351, 0.7773170497265893, 0.7724228896103895, 0.7790525674982913, 0.7713094882091592, 0.7704924598427887, 0.7660815746753247, 0.7728073735475051, 0.7679479237867396, 0.7800698479152426, 0.7694938696172249, 0.7691948265550239, 0.7741183569719754, 0.7670588046821599, 0.7659560833902939, 0.7777495941558441, 0.7704844497607656, 0.7749380553656868, 0.7731918574846206, 0.7758405246069721, 0.7698249530075189, 0.7725537209501026, 0.7757737739234449, 0.77400621582365, 0.7765240516062885, 0.7718087833219411, 0.7742652084757349, 0.7733974495898837, 0.7790926179084073, 0.7650696343130554, 0.769632711038961, 0.7775

In [154]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [36]:
column_to_drop_11 = 'Cat_가구주 성별'

In [37]:
if not column_to_drop_11.startswith('Cat_'):
    comp_12 = comp_11.drop(column_to_drop_11, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']
else:
    comp_12 = comp_11.drop(comp_11.filter(regex='^' + column_to_drop_11).columns, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']

print(X_12.shape)

(6119, 202)


In [157]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [158]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [159]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.7897853905291363


In [160]:
optuna_12 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [161]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(auc_12)

0.7782221889952152


In [162]:
X_train = X_train.values
y_train = y_train.values

In [163]:
auc_bootstrap = []

In [164]:
rs = RandomState(seed = 12)
bootstrap_auc(optuna_12, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76678559, 0.78400854])

In [165]:
t_12 = auc_bootstrap
print(t_12)

[0.776409240430622, 0.773336038961039, 0.7758538747436774, 0.7719636449077238, 0.7807026443950785, 0.7834794728298018, 0.7719823350991114, 0.7772342788790156, 0.7703055579289132, 0.7802220394736843, 0.7746977529049897, 0.7766762431647299, 0.7699184039644567, 0.7709597146274777, 0.7751516575529733, 0.777776294429255, 0.7720971462747779, 0.7791753887559808, 0.7813888414217361, 0.778980476760082, 0.7846943352699932, 0.7793703007518797, 0.7741610774094327, 0.7746256621667806, 0.7813007305194806, 0.7824114618933697, 0.7738967447026659, 0.7787561944634314, 0.7819976076555022, 0.7745989618933697, 0.779912316302119, 0.7703883287764867, 0.7716218814080656, 0.7739875256322625, 0.7710745258031442, 0.7754747308612441, 0.7760861671223513, 0.7738593643198906, 0.7769592660628845, 0.7735629912850307, 0.7722653579972659, 0.7783049598427888, 0.7684071684894054, 0.7900397300068354, 0.7797708048530417, 0.7716485816814764, 0.7696113508202325, 0.7782088388585098, 0.773373419343814, 0.7715097402597403, 0.782

In [166]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [38]:
column_to_drop_12 = 'Cat_소득 계층'

In [39]:
if not column_to_drop_12.startswith('Cat_'):
    comp_13 = comp_12.drop(column_to_drop_12, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']
else:
    comp_13 = comp_12.drop(comp_12.filter(regex='^' + column_to_drop_12).columns, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']

print(X_13.shape)

(6119, 200)


In [169]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [170]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [171]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 197, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 5}
0.7886835449377709


In [172]:
optuna_13 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [173]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(auc_13)

0.7832391703691046


In [174]:
X_train = X_train.values
y_train = y_train.values

In [175]:
auc_bootstrap = []

In [176]:
rs = RandomState(seed = 13)
bootstrap_auc(optuna_13, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76620633, 0.78336553])

In [177]:
t_13 = auc_bootstrap
print(t_13)

[0.7835195232399179, 0.7682816772043746, 0.7699931647300069, 0.7703776486671223, 0.7696620813397129, 0.7784758415926178, 0.7833646616541353, 0.7784251110731373, 0.7761716079972658, 0.774737803315106, 0.7709383544087491, 0.7718034432672589, 0.7756028921736159, 0.7831937799043063, 0.7811538790157211, 0.769792912679426, 0.7674646488380041, 0.7709463644907724, 0.7721211765208476, 0.7744254101161996, 0.7754320104237867, 0.7761342276144908, 0.7712774478810662, 0.7794397214627478, 0.7774452110389611, 0.7715951811346549, 0.7765747821257689, 0.7779765464798358, 0.775976696001367, 0.7803635509227613, 0.776742993848257, 0.7832258202323992, 0.7757150333219412, 0.7773944805194806, 0.7827318651742994, 0.7684979494190021, 0.7681161355092275, 0.782149799213944, 0.7792715097402598, 0.7765560919343814, 0.7766655630553656, 0.7756963431305537, 0.7781634483937117, 0.7760167464114832, 0.7648667122351333, 0.7771274777853726, 0.7796987141148325, 0.7776614832535885, 0.7740783065618593, 0.7725403708133971, 0.77

In [178]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [40]:
column_to_drop_13 = 'Cat_현재 공공기관 접근용이성'

In [41]:
if not column_to_drop_13.startswith('Cat_'):
    comp_14 = comp_13.drop(column_to_drop_13, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']
else:
    comp_14 = comp_13.drop(comp_13.filter(regex='^' + column_to_drop_13).columns, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']

print(X_14.shape)

(6119, 196)


In [181]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [182]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [183]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 88, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 5}
0.7885332932662209


In [184]:
optuna_14 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [185]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(auc_14)

0.7783663704716336


In [186]:
X_train = X_train.values
y_train = y_train.values

In [187]:
auc_bootstrap = []

In [188]:
rs = RandomState(seed = 14)
bootstrap_auc(optuna_14, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76261281, 0.78162167])

In [189]:
t_14 = auc_bootstrap
print(t_14)

[0.7738219839371157, 0.7750849068694464, 0.7756002221462748, 0.7682816772043746, 0.7693256578947368, 0.7696727614490771, 0.7695419301093642, 0.7677476717361585, 0.7667143711551606, 0.7668131621667805, 0.7775493421052632, 0.7689037935748462, 0.7728954844497608, 0.7678251025290499, 0.7770500469924811, 0.773664452323992, 0.7695018796992481, 0.7695686303827752, 0.7720143754272044, 0.776339819719754, 0.769429788961039, 0.7709970950102529, 0.7744227400888585, 0.7700492353041695, 0.7737312030075189, 0.7732586081681476, 0.7741397171907041, 0.7681855562200958, 0.7736884825700615, 0.77764012303486, 0.7621032339371157, 0.766927973342447, 0.7777202238550923, 0.7651470651059467, 0.7607548701298701, 0.7692802674299384, 0.7758992652084757, 0.7703642985304169, 0.7807640550239234, 0.7750154861585781, 0.7766762431647299, 0.7746657125768968, 0.7739608253588517, 0.769063995215311, 0.7816478340738209, 0.7664073180109364, 0.7648587021531101, 0.7664660586124401, 0.7727833433014353, 0.7719342746069721, 0.7698

In [190]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [42]:
column_to_drop_14 = '총 이사 횟수'

In [43]:
if not column_to_drop_14.startswith('Cat_'):
    comp_15 = comp_14.drop(column_to_drop_14, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']
else:
    comp_15 = comp_14.drop(comp_14.filter(regex='^' + column_to_drop_14).columns, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']

print(X_15.shape)

(6119, 195)


In [193]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [194]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [195]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 179, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.7886167664170819


In [196]:
optuna_15 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [197]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(auc_15)

0.780214029391661


In [198]:
X_train = X_train.values
y_train = y_train.values

In [199]:
auc_bootstrap = []

In [200]:
rs = RandomState(seed = 15)
bootstrap_auc(optuna_15, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76716968, 0.78415766])

In [201]:
t_15 = auc_bootstrap
print(t_15)

[0.7764626409774436, 0.7675874700956937, 0.7778724154135339, 0.7681855562200958, 0.7784304511278195, 0.7779925666438824, 0.7752451085099111, 0.7773757903280929, 0.7655555792891319, 0.7762063183527, 0.7769832963089542, 0.771330848427888, 0.7671709458304853, 0.7716138713260423, 0.7686741712235132, 0.7778911056049215, 0.7824888926862611, 0.7814956425153794, 0.7675340695488722, 0.7751596676349966, 0.7824034518113465, 0.7762970992822966, 0.7794317113807245, 0.7854045625427205, 0.7797547846889952, 0.7742812286397812, 0.7762356886534518, 0.7661109449760766, 0.7763344796650717, 0.7756936731032127, 0.7614490772385509, 0.7745268711551607, 0.7745455613465482, 0.7707915029049897, 0.7748152341079972, 0.7674352785372522, 0.7729141746411483, 0.7705752306903623, 0.7814449119958989, 0.7815437030075189, 0.7807667250512645, 0.7784010808270677, 0.7762490387901573, 0.7832098000683527, 0.773904754784689, 0.7870119190020506, 0.7749220352016404, 0.7752531185919344, 0.7730289858168147, 0.7772583091250855, 0.77

In [202]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [44]:
column_to_drop_15 = '현재 주택 거주 기간(총 개월)'

In [45]:
if not column_to_drop_15.startswith('Cat_'):
    comp_16 = comp_15.drop(column_to_drop_15, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']
else:
    comp_16 = comp_15.drop(comp_15.filter(regex='^' + column_to_drop_15).columns, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']

print(X_16.shape)

(6119, 194)


In [205]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [206]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [207]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7874189267022261


In [208]:
optuna_16 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [209]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(auc_16)

0.7835729237867395


In [210]:
X_train = X_train.values
y_train = y_train.values

In [211]:
auc_bootstrap = []

In [212]:
rs = RandomState(seed = 16)
bootstrap_auc(optuna_16, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76301625, 0.78277746])

In [213]:
t_16 = auc_bootstrap
print(t_16)

[0.7742865686944636, 0.7672857570061518, 0.7576042378673957, 0.77106384569378, 0.7647705912508543, 0.7679185534859878, 0.7730183057074504, 0.7693229878673958, 0.7712907980177717, 0.7752931690020506, 0.7636278195488722, 0.7670641447368421, 0.7686181006493508, 0.7711386064593301, 0.7750315063226249, 0.7712747778537251, 0.7749193651742993, 0.7707461124401913, 0.7641911953178401, 0.7876847658920028, 0.7635183484278879, 0.7642152255639098, 0.7755548316814764, 0.7712187072795625, 0.7687756322624744, 0.7699878246753247, 0.7766628930280246, 0.7760194164388244, 0.776777704203691, 0.7686181006493507, 0.7765160415242651, 0.7701907467532467, 0.7660308441558441, 0.7728928144224196, 0.7751810278537252, 0.7707327623034861, 0.7686875213602188, 0.7721932672590568, 0.7729595651059468, 0.7776214328434725, 0.7699744745386194, 0.7702761876281614, 0.7745268711551606, 0.7681481758373205, 0.7750208262132603, 0.7674512987012987, 0.7725590610047846, 0.7740756365345182, 0.768433868762816, 0.7705939208817498, 0.7

In [214]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [46]:
column_to_drop_16 = 'Cat_주택 보유 의식'

In [47]:
if not column_to_drop_16.startswith('Cat_'):
    comp_17 = comp_16.drop(column_to_drop_16, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']
else:
    comp_17 = comp_16.drop(comp_16.filter(regex='^' + column_to_drop_16).columns, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']

print(X_17.shape)

(6119, 192)


In [217]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [218]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [219]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7896768754330169


In [220]:
optuna_17 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [221]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(auc_17)

0.7781607783663704


In [222]:
X_train = X_train.values
y_train = y_train.values

In [223]:
auc_bootstrap = []

In [224]:
rs = RandomState(seed = 17)
bootstrap_auc(optuna_17, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7658332 , 0.78243042])

In [225]:
t_17 = auc_bootstrap
print(t_17)

[0.7789858168147641, 0.7775012816131237, 0.779744104579631, 0.775042186431989, 0.7718888841421736, 0.7664767387218046, 0.7695392600820233, 0.7742465182843474, 0.7753705997949418, 0.7755147812713601, 0.774072966507177, 0.7812660201640464, 0.7646344198564593, 0.7729488849965824, 0.7763344796650717, 0.7727913533834587, 0.779613273239918, 0.7813995215311005, 0.772772663192071, 0.7770046565276829, 0.770262837491456, 0.7778937756322625, 0.7764412807587149, 0.7786707535885167, 0.7810043574846206, 0.7727165926179085, 0.775709693267259, 0.7714830399863295, 0.7805050623718386, 0.7721638969583049, 0.7716752819548872, 0.7775920625427204, 0.773605711722488, 0.773138456937799, 0.7791673786739576, 0.7851776102187287, 0.772468280075188, 0.7819094967532468, 0.7697795625427204, 0.7785666225222146, 0.7749594155844157, 0.7712907980177718, 0.7793756408065617, 0.7762837491455912, 0.7778697453861927, 0.7782408791866029, 0.77867876367054, 0.7717740729665072, 0.7712160372522215, 0.7692428870471633, 0.768959864

In [226]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [48]:
column_to_drop_17 = 'Cat_이사 예상 기간'

In [49]:
if not column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(column_to_drop_17, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']
else:
    comp_18 = comp_17.drop(comp_17.filter(regex='^' + column_to_drop_17).columns, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']

print(X_18.shape)

(6119, 188)


In [229]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [230]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [231]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 176, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.7895725339944407


In [232]:
optuna_18 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [233]:
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(auc_18)

0.7793996710526315


In [234]:
X_train = X_train.values
y_train = y_train.values

In [235]:
auc_bootstrap = []

In [236]:
rs = RandomState(seed = 18)
bootstrap_auc(optuna_18, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76583046, 0.78197698])

In [237]:
t_18 = auc_bootstrap
print(t_18)

[0.7751116071428571, 0.7722947282980178, 0.7783102998974709, 0.7738006237183869, 0.7713094882091592, 0.7735015806561859, 0.7775920625427205, 0.7728554340396446, 0.7819682373547505, 0.7791833988380041, 0.7735656613123719, 0.7730717062542721, 0.775402640123035, 0.7766548829460014, 0.7712747778537252, 0.7764252605946684, 0.7786921138072455, 0.7605092276144908, 0.7752664687286398, 0.776275739063568, 0.7735923615857826, 0.7727593130553656, 0.7709356843814081, 0.7748526144907724, 0.7799657168489406, 0.7768097445317839, 0.7690453050239234, 0.7734027896445659, 0.7736938226247436, 0.7698783535543404, 0.7689491840396446, 0.7739127648667123, 0.7753145292207793, 0.7801819890635682, 0.780243399692413, 0.7738273239917977, 0.7750582065960356, 0.7728207236842105, 0.7766175025632263, 0.7746496924128503, 0.7750475264866712, 0.7732933185235816, 0.7756456126110731, 0.7804917122351334, 0.7705138200615175, 0.7680654049897471, 0.7708716037252221, 0.7663298872180451, 0.776238358680793, 0.7786547334244703, 0.7

In [238]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [50]:
column_to_drop_18 = 'Cat_현재 교육환경'

In [51]:
if not column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(column_to_drop_18, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']
else:
    comp_19 = comp_18.drop(comp_18.filter(regex='^' + column_to_drop_18).columns, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']

print(X_19.shape)

(6119, 184)


In [241]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [242]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [243]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 111, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 5}
0.7881159275119157


In [244]:
optuna_19 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [245]:
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(auc_19)

0.7819548872180452


In [246]:
X_train = X_train.values
y_train = y_train.values

In [247]:
auc_bootstrap = []

In [248]:
rs = RandomState(seed = 19)
bootstrap_auc(optuna_19, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76506396, 0.78232262])

In [249]:
t_19 = auc_bootstrap
print(t_19)

[0.7795999231032126, 0.7681775461380724, 0.7783770505809979, 0.7686661611414901, 0.7688290328092959, 0.772067775974026, 0.7816558441558441, 0.7799043062200957, 0.7696754314764183, 0.771400269138756, 0.7779311560150376, 0.7764225905673274, 0.7783877306903623, 0.7727539730006836, 0.7700065148667122, 0.7839493976418319, 0.7703589584757348, 0.7702521573820916, 0.7753812799043063, 0.7738353340738208, 0.7681882262474368, 0.7742491883116882, 0.7795732228298018, 0.7804356416609706, 0.7721104964114832, 0.7669947240259741, 0.7783183099794942, 0.7781581083390294, 0.7757684338687629, 0.7737258629528367, 0.7766068224538618, 0.7756028921736159, 0.7814155416951469, 0.7746950828776489, 0.7770366968557758, 0.7688584031100478, 0.772070446001367, 0.7817332749487353, 0.7796907040328093, 0.773101076555024, 0.7738006237183869, 0.777546672077922, 0.7752290883458646, 0.7717447026657552, 0.7743906997607655, 0.7680467147983595, 0.7773651102187287, 0.7742972488038278, 0.7754133202323992, 0.7767002734107997, 0.77

In [250]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [52]:
column_to_drop_19 = 'Cat_현재 주택의 구조'

In [53]:
if not column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(column_to_drop_19, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']
else:
    comp_20 = comp_19.drop(comp_19.filter(regex='^' + column_to_drop_19).columns, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']

print(X_20.shape)

(6119, 182)


In [253]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [254]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [255]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7858704997537541


In [256]:
optuna_20 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [257]:
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(auc_20)

0.7763985603212578


In [258]:
X_train = X_train.values
y_train = y_train.values

In [259]:
auc_bootstrap = []

In [260]:
rs = RandomState(seed = 20)
bootstrap_auc(optuna_20, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76357902, 0.78193126])

In [261]:
t_20 = auc_bootstrap
print(t_20)

[0.7770046565276829, 0.7806358937115516, 0.7789991669514696, 0.7743132689678741, 0.7704043489405331, 0.7747271232057417, 0.7794637517088174, 0.7778911056049214, 0.7680039943609022, 0.7823393711551606, 0.7737739234449761, 0.7754907510252904, 0.7767803742310322, 0.769397748632946, 0.7751890379357484, 0.7754159902597402, 0.7573399051606289, 0.7692028366370471, 0.7662070659603554, 0.7694191088516746, 0.7695259099453179, 0.7733120087149692, 0.7760434466848942, 0.7801659688995215, 0.7731144266917293, 0.763822731544771, 0.7745135210184553, 0.7669226332877648, 0.7726658620984279, 0.7739634953861927, 0.7782168489405332, 0.7787989149008885, 0.7646611201298701, 0.7750955869788108, 0.7680039943609023, 0.7753492395762133, 0.7736911525974026, 0.7757630938140805, 0.7732719583048531, 0.7818934765892003, 0.7740329160970609, 0.7727246026999316, 0.7739501452494875, 0.7692589072112097, 0.7677289815447711, 0.7673685278537251, 0.7775226418318524, 0.7703936688311688, 0.7713869190020506, 0.7666502904989747, 0

In [262]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [54]:
column_to_drop_20 = 'Cat_현재 대기오염 정도'

In [55]:
if not column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(column_to_drop_20, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']
else:
    comp_21 = comp_20.drop(comp_20.filter(regex='^' + column_to_drop_20).columns, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']

print(X_21.shape)

(6119, 178)


In [265]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [266]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [267]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 176, 'max_depth': 9, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7865967161662452


In [268]:
optuna_21 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [269]:
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(auc_21)

0.7800378075871498


In [270]:
X_train = X_train.values
y_train = y_train.values

In [271]:
auc_bootstrap = []

In [272]:
rs = RandomState(seed = 21)
bootstrap_auc(optuna_21, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76690868, 0.78273527])

In [273]:
t_21 = auc_bootstrap
print(t_21)

[0.7738059637730691, 0.777341079972659, 0.7760274265208474, 0.7713228383458646, 0.7790151871155161, 0.776208988380041, 0.7744093899521531, 0.7755788619275461, 0.7804036013328777, 0.7785986628503077, 0.774505510936432, 0.7712427375256321, 0.7766041524265208, 0.775472060833903, 0.7697528622693095, 0.7773784603554339, 0.7780246069719753, 0.7800564977785373, 0.7790552375256322, 0.7810363978127137, 0.7724575999658236, 0.778945766404648, 0.7777629442925496, 0.7766228426179083, 0.7793195702323992, 0.7734001196172249, 0.778080677546138, 0.7768524649692412, 0.772705912508544, 0.7766655630553656, 0.7726418318523581, 0.7815437030075187, 0.7676195104237867, 0.7680493848257006, 0.7784811816473001, 0.7710024350649352, 0.7779071257689678, 0.7756910030758715, 0.7717046522556391, 0.7781581083390293, 0.7721291866028708, 0.7765240516062886, 0.7676889311346549, 0.7704817797334245, 0.7751650076896788, 0.7749060150375939, 0.7750475264866713, 0.7778136748120301, 0.7774505510936431, 0.77400621582365, 0.771021

In [274]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [56]:
column_to_drop_21 = 'Cat_현재 주변도로의 보행 안전'

In [57]:
if not column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(column_to_drop_21, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']
else:
    comp_22 = comp_21.drop(comp_21.filter(regex='^' + column_to_drop_21).columns, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']

print(X_22.shape)

(6119, 174)


In [58]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [59]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [60]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 5}
0.7903738762427066


In [61]:
optuna_22 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [62]:
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(auc_22)

0.7815383629528365


In [63]:
X_train = X_train.values
y_train = y_train.values

In [64]:
auc_bootstrap = []

In [65]:
rs = RandomState(seed = 22)
bootstrap_auc(optuna_22, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76691109, 0.78358681])

In [66]:
t_22 = auc_bootstrap
print(t_22)

[0.7717633928571428, 0.7762009782980177, 0.7730957365003418, 0.7745482313738894, 0.7689705442583732, 0.7782382091592618, 0.7749941259398496, 0.7728794642857143, 0.7715017301777171, 0.7751383074162679, 0.7795385124743677, 0.770732762303486, 0.7773864704374572, 0.7818507561517429, 0.7761582578605606, 0.7712080271701982, 0.782253930280246, 0.7863737824675324, 0.7729889354066986, 0.7810070275119616, 0.7698676734449761, 0.7766682330827068, 0.7720704460013671, 0.7749914559125085, 0.7769699461722489, 0.7776454630895421, 0.768161525974026, 0.7641698350991114, 0.7742758885850991, 0.7763211295283664, 0.7812313098086126, 0.7746149820574162, 0.7715684808612441, 0.7666930109364319, 0.7775653622693095, 0.772705912508544, 0.7727886833561175, 0.7776694933356119, 0.7799897470950101, 0.7690186047505125, 0.782854686431989, 0.7726818822624744, 0.7770740772385509, 0.7771381578947368, 0.7810310577580315, 0.7782889396787424, 0.7738246539644567, 0.7779231459330143, 0.7745161910457963, 0.7757043532125769, 0.78

In [67]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [58]:
column_to_drop_22 = 'Cat_남편/아내의 부모님과 동거 의향'

In [59]:
if not column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(column_to_drop_22, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']
else:
    comp_23 = comp_22.drop(comp_22.filter(regex='^' + column_to_drop_22).columns, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']

print(X_23.shape)

(6119, 169)


In [70]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [71]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [72]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 97, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 6}
0.7919097822185494


In [73]:
optuna_23 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [74]:
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(auc_23)

0.7819949376281612


In [75]:
X_train = X_train.values
y_train = y_train.values

In [76]:
auc_bootstrap = []

In [77]:
rs = RandomState(seed = 23)
bootstrap_auc(optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76554724, 0.78317335])

In [78]:
t_23 = auc_bootstrap
print(t_23)

[0.7698569933356116, 0.776043446684894, 0.7791273282638413, 0.7800458176691729, 0.7768551349965823, 0.7752878289473685, 0.7833833518455229, 0.7855113636363638, 0.7734962406015037, 0.7796720138414217, 0.772834073820916, 0.7820082877648667, 0.7746577024948735, 0.7746069719753931, 0.7762703990088857, 0.779250149521531, 0.7818480861244019, 0.7752691387559809, 0.7743453092959671, 0.7851455698906358, 0.7716966421736158, 0.7742652084757349, 0.7774478810663021, 0.7770019865003419, 0.7708582535885168, 0.7844540328092959, 0.779645313568011, 0.7733814294258373, 0.772639161825017, 0.7802781100478469, 0.7812393198906357, 0.7739181049213943, 0.7720250555365687, 0.7721905972317156, 0.7834741327751196, 0.773643092105263, 0.7793916609706084, 0.7686047505126452, 0.7800297975051265, 0.7759900461380725, 0.7707381023581681, 0.7681508458646616, 0.7766842532467533, 0.7755227913533835, 0.7761475777511962, 0.7693630382775118, 0.7823100008544087, 0.7762490387901572, 0.7774906015037594, 0.7804917122351334, 0.777

In [79]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [60]:
column_to_drop_23 = '부채 중 금융기관 대출금의 비중'

In [61]:
if not column_to_drop_23.startswith('Cat_'):
    comp_24 = comp_23.drop(column_to_drop_23, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']
else:
    comp_24 = comp_23.drop(comp_23.filter(regex='^' + column_to_drop_23).columns, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']

print(X_24.shape)

(6119, 168)


In [82]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [83]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [84]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7863546440287481


In [85]:
optuna_24 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [86]:
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(auc_24)

0.7795865729665071


In [87]:
X_train = X_train.values
y_train = y_train.values

In [88]:
auc_bootstrap = []

In [89]:
rs = RandomState(seed = 24)
bootstrap_auc(optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7636939 , 0.78087366])

In [90]:
t_24 = auc_bootstrap
print(t_24)

[0.7776881835269992, 0.7728741242310322, 0.7826971548188653, 0.7722867182159945, 0.7703963388585099, 0.7644421778879016, 0.7706953819207109, 0.7754960910799726, 0.7710878759398496, 0.7644154776144909, 0.7701960868079288, 0.7697768925153794, 0.7696033407382092, 0.7709597146274777, 0.7776828434723172, 0.771499060150376, 0.7775680322966507, 0.7613022257347914, 0.7701213260423787, 0.7727886833561175, 0.7692482271018455, 0.770567220608339, 0.7636037893028025, 0.777845715140123, 0.7764306006493507, 0.7715471206425153, 0.7774024906015037, 0.7750475264866713, 0.773402789644566, 0.7736564422419685, 0.7739608253588517, 0.7781714584757349, 0.7784251110731376, 0.7755147812713602, 0.7775867224880383, 0.7761983082706767, 0.7778403750854408, 0.7730209757347916, 0.7728474239576213, 0.7693737183868764, 0.7694725093984963, 0.7715631408065617, 0.7806278836295284, 0.7744921607997266, 0.7701827366712235, 0.776473321086808, 0.773333368933698, 0.7755121112440191, 0.7762223385167464, 0.774238508202324, 0.7675

In [91]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [62]:
column_to_drop_24 = 'Cat_현재 상업시설 접근용이성'

In [63]:
if not column_to_drop_24.startswith('Cat_'):
    comp_25 = comp_24.drop(column_to_drop_24, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']
else:
    comp_25 = comp_24.drop(comp_24.filter(regex='^' + column_to_drop_24).columns, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']

print(X_25.shape)

(6119, 164)


In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [95]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [96]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7939298324693862


In [97]:
optuna_25 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [98]:
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(auc_25)

0.7826277341079972


In [99]:
X_train = X_train.values
y_train = y_train.values

In [100]:
auc_bootstrap = []

In [101]:
rs = RandomState(seed = 25)
bootstrap_auc(optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76545605, 0.78356578])

In [102]:
t_25 = auc_bootstrap
print(t_25)

[0.7720544258373205, 0.7753679297676008, 0.7756376025290499, 0.7761983082706766, 0.7720971462747779, 0.759847060833903, 0.7745509014012302, 0.7791700487012987, 0.7728500939849624, 0.7762917592276146, 0.7762089883800409, 0.7704604195146958, 0.7772823393711552, 0.7777148838004102, 0.7794797718728639, 0.7700225350307587, 0.7700412252221464, 0.7731518070745045, 0.7800938781613125, 0.771768732911825, 0.7729221847231715, 0.775402640123035, 0.7803235005126453, 0.7787989149008885, 0.7860159988038278, 0.7793329203691046, 0.7841683398838004, 0.7836076341421736, 0.7734535201640464, 0.7746363422761449, 0.7752637987012987, 0.768431198735475, 0.7748365943267259, 0.7802433996924127, 0.7680734150717703, 0.7767136235475051, 0.7767937243677376, 0.7759206254272044, 0.7806599239576213, 0.7752184082365003, 0.7713842489747095, 0.7724736201298701, 0.7763291396103896, 0.7748552845181134, 0.769029284859877, 0.7650963345864662, 0.7738540242652086, 0.7784251110731374, 0.772340118762816, 0.7730450059808612, 0.770

In [103]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [64]:
column_to_drop_25 = 'Cat_현재 문화시설 접근용이성'

In [65]:
if not column_to_drop_25.startswith('Cat_'):
    comp_26 = comp_25.drop(column_to_drop_25, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']
else:
    comp_26 = comp_25.drop(comp_25.filter(regex='^' + column_to_drop_25).columns, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']

print(X_26.shape)

(6119, 160)


In [106]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [107]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [108]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 105, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.789593402282156


In [109]:
optuna_26 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [110]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(auc_26)

0.7837464755639098


In [111]:
X_train = X_train.values
y_train = y_train.values

In [112]:
auc_bootstrap = []

In [113]:
rs = RandomState(seed = 26)
bootstrap_auc(optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76736766, 0.78473852])

In [114]:
t_26 = auc_bootstrap
print(t_26)

[0.7756295924470267, 0.7725003204032809, 0.7768871753246753, 0.7787962448735474, 0.770193416780588, 0.7804997223171566, 0.7720143754272044, 0.7764866712235133, 0.770933014354067, 0.7826223940533151, 0.7787615345181135, 0.7700465652768285, 0.7798642558099795, 0.7775039516404648, 0.7828680365686944, 0.7671015251196172, 0.7746443523581681, 0.7762063183526999, 0.7758191643882433, 0.7725029904306221, 0.7764466208133971, 0.7835115131578948, 0.7769218856801094, 0.768861073137389, 0.7842110603212578, 0.773368079289132, 0.7764573009227614, 0.7736484321599454, 0.7813434509569378, 0.7760461167122351, 0.7747030929596718, 0.7774505510936434, 0.774040926179084, 0.782852016404648, 0.7757817840054682, 0.767563439849624, 0.7807480348598769, 0.7758538747436774, 0.7760567968215996, 0.7761182074504442, 0.7773250598086124, 0.7690559851332877, 0.7731785073479153, 0.7725110005126452, 0.7830042079630894, 0.7824595223855092, 0.7760087363294601, 0.7759606758373205, 0.7759900461380724, 0.7771301478127137, 0.7800

In [115]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [66]:
column_to_drop_26 = 'Cat_가족계획 시 중요 고려 사항 1순위'

In [67]:
if not column_to_drop_26.startswith('Cat_'):
    comp_27 = comp_26.drop(column_to_drop_26, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']
else:
    comp_27 = comp_26.drop(comp_26.filter(regex='^' + column_to_drop_26).columns, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']

print(X_27.shape)

(6119, 153)


In [118]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [119]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [120]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7890424794864732


In [121]:
optuna_27 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [122]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27) 
print(auc_27)

0.782285970608339


In [123]:
X_train = X_train.values
y_train = y_train.values

In [124]:
auc_bootstrap = []

In [125]:
rs = RandomState(seed = 27)
bootstrap_auc(optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76471939, 0.78318977])

In [126]:
t_27 = auc_bootstrap
print(t_27)

[0.7765320616883117, 0.7720731160287081, 0.7757871240601504, 0.7726765422077924, 0.7715150803144224, 0.777576042378674, 0.7819655673274094, 0.7738753844839371, 0.775375939849624, 0.7765560919343815, 0.7734668703007518, 0.7817359449760766, 0.7798509056732741, 0.7639535628844838, 0.7643166866028708, 0.7762250085440876, 0.7693924085782639, 0.7754533706425154, 0.7720624359193438, 0.7761502477785374, 0.7737098427887901, 0.7823954417293233, 0.7807320146958304, 0.7798188653451812, 0.7747351332877648, 0.7799710569036227, 0.7698650034176349, 0.7691440960355435, 0.7788149350649349, 0.7752824888926863, 0.7818827964798359, 0.7740462662337663, 0.7771968984962406, 0.7730183057074504, 0.7813220907382092, 0.7809376068010937, 0.7824114618933697, 0.7608910415242651, 0.7799069762474367, 0.7690025845864661, 0.7761662679425838, 0.773870044429255, 0.7838799769309639, 0.7735202708475735, 0.7730770463089542, 0.7646637901572113, 0.7716592617908408, 0.7704123590225563, 0.7693203178400547, 0.7724522599111416, 0.

In [127]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [68]:
column_to_drop_27 = '현재 무주택 기간(총 개월)'

In [69]:
if not column_to_drop_27.startswith('Cat_'):
    comp_28 = comp_27.drop(column_to_drop_27, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']
else:
    comp_28 = comp_27.drop(comp_27.filter(regex='^' + column_to_drop_27).columns, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']

print(X_28.shape)

(6119, 152)


In [130]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [131]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [132]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 3}
0.7900483309543485


In [133]:
optuna_28 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [134]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(auc_28)

0.7881680408407382


In [135]:
X_train = X_train.values
y_train = y_train.values

In [136]:
auc_bootstrap = []

In [137]:
rs = RandomState(seed = 28)
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76997875, 0.7859146 ])

In [138]:
t_28 = auc_bootstrap
print(t_28)

[0.778078007518797, 0.7722279776144907, 0.7799977571770336, 0.7819281869446344, 0.7724148795283664, 0.7714670198222828, 0.7783343301435406, 0.7771862183868764, 0.7782195189678742, 0.7717874231032126, 0.7673311474709501, 0.7767937243677375, 0.7757577537593985, 0.7895671351674641, 0.7744120599794941, 0.7798695958646616, 0.7734161397812714, 0.7795892429938482, 0.7802514097744361, 0.7820963986671224, 0.778312969924812, 0.7728500939849625, 0.780820125598086, 0.7787535244360902, 0.7880959501025291, 0.77771488380041, 0.7726738721804512, 0.7732212277853726, 0.7806786141490089, 0.7763932202665755, 0.777479921394395, 0.7788256151742994, 0.7716779519822282, 0.7777469241285031, 0.7762063183526998, 0.7749407253930279, 0.7798776059466849, 0.7757444036226931, 0.7831670796308954, 0.7813220907382091, 0.7721959372863978, 0.7833192711893371, 0.775066216678059, 0.7796186132946001, 0.7798055152084757, 0.7709543745727956, 0.7747912038619276, 0.7772716592617909, 0.7777549342105263, 0.7825289430963773, 0.7743

In [139]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [70]:
column_to_drop_28 = 'Cat_현재 의료시설 접근용이성'

In [71]:
if not column_to_drop_28.startswith('Cat_'):
    comp_29 = comp_28.drop(column_to_drop_28, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']
else:
    comp_29 = comp_28.drop(comp_28.filter(regex='^' + column_to_drop_28).columns, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']

print(X_29.shape)

(6119, 148)


In [142]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [143]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [144]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 162, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 3}
0.7907995893120976


In [145]:
optuna_29 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [146]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(auc_29)

0.7854926734449761


In [147]:
X_train = X_train.values
y_train = y_train.values

In [148]:
auc_bootstrap = []

In [149]:
rs = RandomState(seed = 29)
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77038299, 0.78638713])

In [150]:
t_29 = auc_bootstrap
print(t_29)

[0.7768257646958304, 0.781650504101162, 0.7782462192412851, 0.7778724154135338, 0.7780032467532467, 0.7786680835611757, 0.7780005767259057, 0.784352571770335, 0.7801045582706767, 0.7835115131578948, 0.7825636534518113, 0.7776668233082705, 0.7788042549555708, 0.7772449589883801, 0.7717206724196856, 0.7770553870471633, 0.7791780587833219, 0.785359172077922, 0.7791647086466166, 0.7802166994190021, 0.7759980562200957, 0.7810630980861244, 0.7756429425837321, 0.7790579075529733, 0.7681641960013672, 0.7836770548530418, 0.7716859620642514, 0.7745776016746412, 0.7708048530416952, 0.77734375, 0.7778911056049214, 0.7777228938824333, 0.7770660671565277, 0.7823553913192071, 0.7817332749487356, 0.7822672804169516, 0.7795518626110731, 0.7823714114832537, 0.7774131707108681, 0.7785479323308271, 0.7723107484620642, 0.7764920112781954, 0.777642793062201, 0.7707541225222146, 0.7822165498974709, 0.7780513072453862, 0.7756643028024607, 0.7804276315789473, 0.7752718087833219, 0.7799149863294601, 0.777215588

In [151]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [72]:
column_to_drop_29 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [73]:
if not column_to_drop_29.startswith('Cat_'):
    comp_30 = comp_29.drop(column_to_drop_29, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']
else:
    comp_30 = comp_29.drop(comp_29.filter(regex='^' + column_to_drop_29).columns, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']

print(X_30.shape)

(6119, 144)


In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [75]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [76]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 120, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 3}
0.7901359777627526


In [77]:
optuna_30 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [78]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(auc_30)

0.7858504571086807


In [79]:
X_train = X_train.values
y_train = y_train.values

In [80]:
auc_bootstrap = []

In [81]:
rs = RandomState(seed = 30)
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77011799, 0.7869975 ])

In [82]:
t_30 = auc_bootstrap
print(t_30)

[0.7754880809979494, 0.7832979109706083, 0.7796960440874915, 0.77791246582365, 0.7738940746753247, 0.7854633031442242, 0.7838799769309637, 0.7744174000341764, 0.7802994702665756, 0.7792100991114149, 0.7783637004442927, 0.7729809253246753, 0.7800912081339714, 0.7696754314764185, 0.7805611329460014, 0.7817386150034178, 0.7732906484962405, 0.7800538277511962, 0.7728180536568694, 0.7829454673615859, 0.7827906057758032, 0.7758138243335612, 0.7812099495898839, 0.7818320659603555, 0.7761902981886535, 0.7861708603896105, 0.7790739277170198, 0.7757230434039645, 0.7744093899521531, 0.7790899478810663, 0.7808494958988381, 0.7739554853041696, 0.7783610304169515, 0.7857703562884484, 0.7782034988038278, 0.779215439166097, 0.7709864149008885, 0.7765881322624746, 0.7779391660970609, 0.7731518070745045, 0.7784758415926178, 0.7843205314422419, 0.7763157894736842, 0.7821204289131922, 0.7770954374572796, 0.7689865644224196, 0.7782008287764866, 0.7815063226247436, 0.7824274820574163, 0.7816585141831853, 0.

In [83]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc 

In [84]:
column_to_drop_30 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [85]:
if not column_to_drop_30.startswith('Cat_'):
    comp_31 = comp_30.drop(column_to_drop_30, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']
else:
    comp_31 = comp_30.drop(comp_30.filter(regex='^' + column_to_drop_30).columns, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']

print(X_31.shape)

(6119, 140)


In [86]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [87]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [88]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 138, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 5}
0.7893262881994005


In [89]:
optuna_31 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [90]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(auc_31)

0.7843045112781954


In [91]:
X_train = X_train.values
y_train = y_train.values

In [92]:
auc_bootstrap = []

In [93]:
rs = RandomState(seed = 31)
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76951189, 0.78613388])

In [94]:
t_31 = auc_bootstrap
print(t_31)

[0.7788896958304854, 0.7771942284688995, 0.7727486329460014, 0.7748980049555707, 0.7740008757689678, 0.7690773453520163, 0.7800404776144907, 0.7826891447368421, 0.7750955869788106, 0.7761235475051265, 0.7777309039644567, 0.7780806775461382, 0.7854152426520848, 0.7825262730690362, 0.779749444634313, 0.7811138286056049, 0.7794370514354066, 0.7787027939166097, 0.7788523154477103, 0.780617203520164, 0.7814796223513328, 0.7753892899863294, 0.775501431134655, 0.7809135765550239, 0.7759313055365687, 0.7773384099453178, 0.7760701469583048, 0.7784624914559125, 0.7761075273410799, 0.7754400205058098, 0.7833780117908407, 0.7799416866028708, 0.7893321727614491, 0.7740943267259057, 0.7829107570061518, 0.782347381237184, 0.7803849111414901, 0.7750849068694463, 0.7791193181818181, 0.782483552631579, 0.7696247009569378, 0.7696060107655504, 0.78068128417635, 0.7747297932330827, 0.7730129656527682, 0.7736083817498292, 0.7734882305194806, 0.7776908535543404, 0.7735629912850307, 0.7778644053315105, 0.7834

In [95]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [96]:
column_to_drop_31 = '총 가구원 수'

In [97]:
if not column_to_drop_31.startswith('Cat_'):
    comp_32 = comp_31.drop(column_to_drop_31, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']
else:
    comp_32 = comp_31.drop(comp_31.filter(regex='^' + column_to_drop_31).columns, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']

print(X_32.shape)

(6119, 139)


In [98]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [99]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [100]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7886710239651417


In [101]:
optuna_32 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [102]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(auc_32)

0.7835782638414217


In [103]:
X_train = X_train.values
y_train = y_train.values

In [104]:
auc_bootstrap = []

In [105]:
rs = RandomState(seed = 32)
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76797529, 0.78511426])

In [106]:
t_32 = auc_bootstrap
print(t_32)

[0.7772769993164731, 0.7780459671907038, 0.7774959415584415, 0.7691494360902256, 0.7757337235133288, 0.7809856672932332, 0.7733814294258372, 0.7707300922761449, 0.7761155374231032, 0.7816398239917977, 0.774705762987013, 0.784755745898838, 0.7793035500683527, 0.7791593685919345, 0.7746763926862611, 0.7738113038277512, 0.7780566473000684, 0.7860240088858509, 0.7766976033834587, 0.7772769993164731, 0.7704363892686261, 0.7701213260423786, 0.7830175580997949, 0.7821337790498974, 0.7664126580656185, 0.7808388157894737, 0.7778323650034176, 0.77487130468216, 0.7778163448393712, 0.7782649094326726, 0.7738486842105263, 0.7817813354408748, 0.7827398752563226, 0.7857383159603555, 0.7751463174982912, 0.7799577067669172, 0.7778483851674641, 0.7748499444634314, 0.7743266191045796, 0.775175687799043, 0.773135786910458, 0.7802353896103895, 0.7795919130211892, 0.7817546351674641, 0.7706179511278195, 0.7759820360560492, 0.7800698479152426, 0.7835649137047163, 0.7776267728981544, 0.7857596761790842, 0.784

In [107]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [108]:
column_to_drop_32 = '자산 중 금융자산의 비중'

In [109]:
if not column_to_drop_32.startswith('Cat_'):
    comp_33 = comp_32.drop(column_to_drop_32, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']
else:
    comp_33 = comp_32.drop(comp_32.filter(regex='^' + column_to_drop_32).columns, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']

print(X_33.shape)

(6119, 138)


In [110]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [111]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [112]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 151, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 3}
0.790728637133866


In [113]:
optuna_33 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [114]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(auc_33)

0.7828680365686944


In [115]:
X_train = X_train.values
y_train = y_train.values

In [116]:
auc_bootstrap = [] 

In [117]:
rs = RandomState(seed = 33)
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76985666, 0.78600959])

In [118]:
t_33 = auc_bootstrap
print(t_33)

[0.7811031484962406, 0.7792982100136705, 0.7858958475734793, 0.780147278708134, 0.7700091848940533, 0.7819602272727273, 0.7754213303144224, 0.7715951811346549, 0.77133618848257, 0.7794610816814765, 0.7751569976076553, 0.7756002221462748, 0.7815303528708134, 0.7848305066643882, 0.7736003716678058, 0.7801579588174983, 0.780550452836637, 0.783281890806562, 0.777207578605605, 0.7745856117566644, 0.775941985645933, 0.7791273282638415, 0.7804329716336296, 0.7771194677033493, 0.778812265037594, 0.7803982612781954, 0.7764679810321258, 0.7745802717019822, 0.7884831040669856, 0.777979216507177, 0.7798562457279563, 0.7771915584415583, 0.7827078349282297, 0.7771648581681476, 0.7839520676691729, 0.7766361927546138, 0.7755681818181818, 0.7770927674299384, 0.7795091421736158, 0.7735629912850307, 0.7783396701982228, 0.7786600734791524, 0.7832017899863296, 0.7831270292207793, 0.7798322154818866, 0.7787108039986329, 0.7817866754955571, 0.7742732185577581, 0.7774158407382092, 0.7741851076555024, 0.780958

In [119]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [120]:
column_to_drop_33 = 'Cat_현재 대중교통 접근용이성'

In [121]:
if not column_to_drop_33.startswith('Cat_'):
    comp_34 = comp_33.drop(column_to_drop_33, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']
else:
    comp_34 = comp_33.drop(comp_33.filter(regex='^' + column_to_drop_33).columns, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']

print(X_34.shape)

(6119, 134)


In [122]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [123]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [124]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.7876651724972663


In [125]:
optuna_34 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [126]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(auc_34)

0.7830389183185236


In [127]:
X_train = X_train.values
y_train = y_train.values

In [128]:
auc_bootstrap = []

In [129]:
rs = RandomState(seed = 34)
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77027071, 0.78631244])

In [130]:
t_34 = auc_bootstrap
print(t_34)

[0.7809269266917293, 0.7761742780246069, 0.775677652939166, 0.7798322154818865, 0.7709890849282297, 0.7836556946343131, 0.7817386150034176, 0.7830736286739576, 0.7757363935406698, 0.7734722103554339, 0.77618228810663, 0.775173017771702, 0.7668158321941216, 0.7774078306561859, 0.7801339285714286, 0.7836583646616541, 0.7773250598086124, 0.7795091421736159, 0.7832178101503758, 0.7740329160970607, 0.7804596719070404, 0.7710531655844155, 0.7824515123034861, 0.7786574034518113, 0.776307779391661, 0.7792688397129186, 0.7726044514695831, 0.7798962961380724, 0.7751463174982911, 0.7802567498291184, 0.7806652640123035, 0.7832044600136705, 0.7738914046479838, 0.7781554383116883, 0.7717927631578947, 0.7822165498974709, 0.7766628930280246, 0.7800564977785374, 0.7824248120300752, 0.7836904049897472, 0.7792715097402597, 0.7779017857142857, 0.7702655075187971, 0.7754133202323993, 0.7741183569719754, 0.784854536910458, 0.7762463687628162, 0.7755575017088175, 0.7894549940191387, 0.7761048573137389, 0.783

In [131]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [132]:
column_to_drop_34 = 'Cat_현재 청소/쓰레기 처리상태'

In [133]:
if not column_to_drop_34.startswith('Cat_'):
    comp_35 = comp_34.drop(column_to_drop_34, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']
else:
    comp_35 = comp_34.drop(comp_34.filter(regex='^' + column_to_drop_34).columns, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']

print(X_35.shape)

(6119, 130)


In [134]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [135]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [136]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 80, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.7914798954916151


In [137]:
optuna_35 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [138]:
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(auc_35)

0.787463153622693


In [139]:
X_train = X_train.values
y_train = y_train.values

In [140]:
auc_bootstrap = []

In [141]:
rs = RandomState(seed = 35)
bootstrap_auc(optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76987795, 0.78692621])

In [142]:
t_35 = auc_bootstrap
print(t_35)

[0.780216699419002, 0.7788896958304853, 0.7807960953520164, 0.7841897001025291, 0.7748499444634314, 0.7786841037252221, 0.7824568523581682, 0.782785265721121, 0.7793222402597403, 0.7798428955912509, 0.7754987611073137, 0.7803502007860561, 0.7747591635338347, 0.7820243079289132, 0.7884564037935748, 0.7728100435748462, 0.7764306006493508, 0.7772983595352017, 0.7750955869788108, 0.7742838986671223, 0.7784678315105946, 0.77934627050581, 0.7752157382091591, 0.7817065746753247, 0.77667624316473, 0.7775359919685576, 0.7725617310321258, 0.7777015336637048, 0.7783076298701299, 0.7805397727272727, 0.7768471249145591, 0.7771221377306903, 0.7782408791866029, 0.7749594155844156, 0.7838105562200958, 0.7759019352358169, 0.7814262218045113, 0.7759339755639099, 0.7798669258373206, 0.7832365003417636, 0.7831457194121667, 0.7695018796992482, 0.7762650589542037, 0.769298957621326, 0.776542741797676, 0.7832471804511278, 0.774372009569378, 0.7828760466507176, 0.7761662679425837, 0.7715871710526314, 0.773720

In [143]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [144]:
column_to_drop_35 = '소득 대비 주거관리비의 비율'

In [145]:
if not column_to_drop_35.startswith('Cat_'):
    comp_36 = comp_35.drop(column_to_drop_35, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']
else:
    comp_36 = comp_35.drop(comp_35.filter(regex='^' + column_to_drop_35).columns, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']

print(X_36.shape)

(6119, 129)


In [146]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [147]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [148]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 137, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 3}
0.7912670389569195


In [149]:
optuna_36 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [150]:
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(auc_36)

0.7846436047505126


In [151]:
X_train = X_train.values
y_train = y_train.values

In [152]:
auc_bootstrap = []

In [153]:
rs = RandomState(seed = 36)
bootstrap_auc(optuna_36, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77037218, 0.78674058])

In [154]:
t_36 = auc_bootstrap
print(t_36)

[0.774102336807929, 0.7860373590225564, 0.7825396232057418, 0.7788095950102529, 0.7846783151059467, 0.7834340823650033, 0.7762970992822966, 0.7757257134313054, 0.7848465268284348, 0.7844300025632263, 0.7802754400205059, 0.7772182587149692, 0.7804676819890637, 0.7817920155502391, 0.7806919642857142, 0.7772529690704031, 0.7827505553656869, 0.7726178016062883, 0.7752531185919344, 0.7823874316473, 0.7832525205058101, 0.7797948350991114, 0.7787081339712919, 0.7746443523581681, 0.7744334201982227, 0.7785719625768968, 0.7810604280587833, 0.7816104536910458, 0.7775733723513328, 0.7689224837662336, 0.7793275803144224, 0.7703562884483937, 0.7807613849965823, 0.7778563952494874, 0.782352721291866, 0.7698917036910458, 0.7773784603554341, 0.7772663192071088, 0.7848358467190704, 0.7833433014354068, 0.7756803229665072, 0.7801125683526999, 0.7791673786739576, 0.7772369489063569, 0.7797200743335613, 0.7782649094326726, 0.7731491370471634, 0.7834340823650034, 0.7851749401913876, 0.7852817412850308, 0.77

In [155]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [156]:
column_to_drop_36 = '소득 대비 생활비의 비율'

In [157]:
if not column_to_drop_36.startswith('Cat_'):
    comp_37 = comp_36.drop(column_to_drop_36, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']
else:
    comp_37 = comp_36.drop(comp_36.filter(regex='^' + column_to_drop_36).columns, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']

print(X_37.shape)

(6119, 128)


In [158]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [159]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [160]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7907662000517534


In [161]:
optuna_37 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [162]:
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(auc_37)

0.7877461765208476


In [163]:
X_train = X_train.values
y_train = y_train.values

In [164]:
auc_bootstrap = []

In [165]:
rs = RandomState(seed = 37)
bootstrap_auc(optuna_37, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77096132, 0.78720209])

In [166]:
t_37 = auc_bootstrap
print(t_37)

[0.7832712106971975, 0.7803929212235134, 0.779108638072454, 0.7795011320915926, 0.7809402768284348, 0.7827959458304854, 0.7861281399521531, 0.7744788106630213, 0.7795678827751196, 0.777578712406015, 0.7813514610389611, 0.7733654092617909, 0.7799897470950102, 0.7817252648667122, 0.7776107527341081, 0.782320680963773, 0.7783930707450444, 0.7771675281954886, 0.7813888414217361, 0.7744708005809979, 0.7807026443950786, 0.7864912636705399, 0.7806973043403964, 0.7819869275461382, 0.7823420411825017, 0.7872308612440191, 0.7719609748803827, 0.7784384612098428, 0.7824808826042379, 0.7776027426520846, 0.7791219882091593, 0.7775039516404648, 0.7849773581681476, 0.7794504015721122, 0.7763291396103895, 0.7897380169172934, 0.7794530715994532, 0.7771808783321941, 0.781821385850991, 0.7783343301435406, 0.7750955869788105, 0.775509441216678, 0.7824007817840055, 0.7757684338687629, 0.7767296437115516, 0.7813514610389611, 0.7775920625427204, 0.787092019822283, 0.7804116114149009, 0.7793195702323992, 0.774

In [167]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [168]:
column_to_drop_37 = '자산 중 기타자산의 비중'

In [169]:
if not column_to_drop_37.startswith('Cat_'):
    comp_38 = comp_37.drop(column_to_drop_37, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']
else:
    comp_38 = comp_37.drop(comp_37.filter(regex='^' + column_to_drop_37).columns, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']

print(X_38.shape)

(6119, 127)


In [170]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [171]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [172]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 94, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.790257013831501


In [173]:
optuna_38 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [174]:
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(auc_38)

0.7826330741626795


In [175]:
X_train = X_train.values
y_train = y_train.values

In [176]:
auc_bootstrap = []

In [177]:
rs = RandomState(seed = 38)
bootstrap_auc(optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7691993 , 0.78572517])

In [178]:
t_38 = auc_bootstrap
print(t_38)

[0.7789270762132604, 0.7778083347573479, 0.7849292976760082, 0.774040926179084, 0.7788095950102529, 0.7750982570061518, 0.7727886833561176, 0.7794183612440192, 0.7786146830143542, 0.7835862739234449, 0.7751943779904306, 0.7766869232740943, 0.7680280246069721, 0.7818160457963089, 0.7811592190704032, 0.7789244061859194, 0.7691360859535201, 0.7792287893028025, 0.7761529178058784, 0.7719743250170881, 0.7826304041353384, 0.7776027426520848, 0.772935534859877, 0.7783263200615175, 0.7794690917634997, 0.7702708475734791, 0.7777522641831853, 0.7756028921736158, 0.7812473299726589, 0.7781314080656185, 0.7729408749145591, 0.7782675794600137, 0.776080827067669, 0.7776347829801776, 0.7772369489063569, 0.7677022812713602, 0.778780224709501, 0.785257711038961, 0.7768044044771019, 0.7774452110389609, 0.7779204759056733, 0.7753999700956938, 0.7854526230348599, 0.7758245044429255, 0.7735256109022556, 0.7791860688653451, 0.7800404776144907, 0.770933014354067, 0.7784891917293232, 0.7786520633971292, 0.776

In [179]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [180]:
column_to_drop_38 = '가구주 나이'

In [181]:
if not column_to_drop_38.startswith('Cat_'):
    comp_39 = comp_38.drop(column_to_drop_38, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']
else:
    comp_39 = comp_38.drop(comp_38.filter(regex='^' + column_to_drop_38).columns, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']

print(X_39.shape)

(6119, 126)


In [182]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [183]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [184]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7896059232547852


In [185]:
optuna_39 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [186]:
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(auc_39)

0.7825796736158577


In [187]:
X_train = X_train.values
y_train = y_train.values

In [188]:
auc_bootstrap = []

In [189]:
rs = RandomState(seed = 39)
bootstrap_auc(optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77047971, 0.78689991])

In [190]:
t_39 = auc_bootstrap
print(t_39)

[0.779744104579631, 0.7793756408065617, 0.7798509056732741, 0.7776855134996583, 0.7824328221120984, 0.78325252050581, 0.7789057159945317, 0.7775279818865345, 0.7868997778537252, 0.7769325657894737, 0.7783263200615175, 0.7746523624401913, 0.7810524179767602, 0.781316750683527, 0.778879015721121, 0.7846996753246753, 0.7692055066643881, 0.7767456638755981, 0.7850414388243335, 0.7764039003759398, 0.7729542250512644, 0.7748365943267259, 0.781020377648667, 0.78438995215311, 0.7697768925153793, 0.7846222445317839, 0.7830736286739576, 0.78268380468216, 0.7701480263157894, 0.7831644096035543, 0.782483552631579, 0.7771728682501708, 0.7788202751196174, 0.7827719155844156, 0.7694004186602871, 0.7726124615516063, 0.7803849111414901, 0.7859759483937115, 0.7775813824333561, 0.7723000683526999, 0.7735896915584416, 0.7815063226247436, 0.7814582621326043, 0.7806973043403964, 0.7844780630553656, 0.7763211295283663, 0.7779765464798359, 0.7728874743677375, 0.7789377563226246, 0.7789591165413534, 0.77522107

In [191]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [192]:
column_to_drop_39 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [193]:
if not column_to_drop_39.startswith('Cat_'):
    comp_40 = comp_39.drop(column_to_drop_39, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']
else:
    comp_40 = comp_39.drop(comp_39.filter(regex='^' + column_to_drop_39).columns, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']

print(X_40.shape)

(6119, 122)


In [194]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [195]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [196]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 97, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 5}
0.7872645013731334


In [197]:
optuna_40 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [198]:
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(auc_40)

0.7810978084415584


In [199]:
X_train = X_train.values
y_train = y_train.values

In [200]:
auc_bootstrap = []

In [201]:
rs = RandomState(seed = 40)
bootstrap_auc(optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76793951, 0.78468753])

In [202]:
t_40 = auc_bootstrap
print(t_40)

[0.7805290926179084, 0.7781207279562542, 0.7798856160287081, 0.7761075273410799, 0.7812927204374572, 0.77820883885851, 0.7745909518113466, 0.7835328733766234, 0.7714563397129186, 0.7825396232057417, 0.7729729152426521, 0.776574782125769, 0.7800191173957622, 0.7766949333561177, 0.7723187585440875, 0.7769779562542721, 0.7755922120642516, 0.7757177033492824, 0.7736858125427204, 0.7771194677033493, 0.7776054126794258, 0.7762009782980178, 0.7854018925153794, 0.7743693395420369, 0.7663565874914557, 0.7728153836295284, 0.7670160842447027, 0.7768337747778538, 0.7782649094326726, 0.7775546821599454, 0.7725670710868079, 0.7762623889268627, 0.7766628930280246, 0.7767830442583732, 0.7887874871838688, 0.7773731203007519, 0.7756109022556391, 0.7790231971975393, 0.7790098470608339, 0.7807613849965824, 0.7748686346548188, 0.780248739747095, 0.7780860176008202, 0.7677823820915926, 0.7788416353383458, 0.7742037978468899, 0.7694511491797676, 0.7791647086466164, 0.7743105989405331, 0.7780219369446343, 0.7

In [204]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

NameError: name 'X_train' is not defined

In [206]:
comp_41 = comp[[
 '현재 주택의 면적(㎡)',
 '소득 대비 주택 임대료의 비율',
 '소득 중 근로/사업소득의 비중(월평균)',
 '소득 중 정부 보조금의 비중(월평균)',                                
 '중기부채부담지표',
 '장기부채부담지표',
                
 'target',
 'Cat_현재 거주 지역_강원도',
 'Cat_현재 거주 지역_경기도',
 'Cat_현재 거주 지역_경상남도',
 'Cat_현재 거주 지역_경상북도',
 'Cat_현재 거주 지역_광주광역시',
 'Cat_현재 거주 지역_대구광역시',
 'Cat_현재 거주 지역_대전광역시',
 'Cat_현재 거주 지역_부산광역시',
 'Cat_현재 거주 지역_서울특별시',
 'Cat_현재 거주 지역_세종특별자치시',
 'Cat_현재 거주 지역_울산광역시',
 'Cat_현재 거주 지역_인천광역시',
 'Cat_현재 거주 지역_전라남도',
 'Cat_현재 거주 지역_전라북도',
 'Cat_현재 거주 지역_제주특별자치도',
 'Cat_현재 거주 지역_충청남도',
 'Cat_현재 거주 지역_충청북도',
 'Cat_현재 주택의 유형_고시원',
 'Cat_현재 주택의 유형_기타',
 'Cat_현재 주택의 유형_다가구 단독주택',
 'Cat_현재 주택의 유형_다세대주택',
 'Cat_현재 주택의 유형_비거주용 건물 내 주택',
 'Cat_현재 주택의 유형_아파트',
 'Cat_현재 주택의 유형_연립주택',
 'Cat_현재 주택의 유형_영업겸용 단독주택',
 'Cat_현재 주택의 유형_오피스텔',
 'Cat_현재 주택의 유형_일반 단독주택',
 'Cat_현재 주택의 유형_판잣집 비닐하우스 컨테이너 움막',
                
 'Cat_현재 주택의 점유형태_무상',
 'Cat_현재 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_현재 주택의 점유형태_보증금 있는 월세',
 'Cat_현재 주택의 점유형태_전세',                
                
 'Cat_현재 주차시설 이용편의성_대체로 만족',
 'Cat_현재 주차시설 이용편의성_매우 만족',
 'Cat_현재 주차시설 이용편의성_매우 불만족',
 'Cat_현재 주차시설 이용편의성_약간 불만족',                                
                
 'Cat_현재 주택에 대한 전반적인 만족도_대체로 만족',
 'Cat_현재 주택에 대한 전반적인 만족도_매우 만족',
 'Cat_현재 주택에 대한 전반적인 만족도_매우 불만족',
 'Cat_현재 주택에 대한 전반적인 만족도_약간 불만족',                
                
 'Cat_이사 계획 첫 번째 이유_가구상황에 적합한 주택규모로 이사',
 'Cat_이사 계획 첫 번째 이유_결혼이나 세대독립을 위해',
 'Cat_이사 계획 첫 번째 이유_계약 만기로 인해',
 'Cat_이사 계획 첫 번째 이유_교통과 편의 및 문화시설 및 공원과 녹지 등이 좋은 지역으로 가기 위해',
 'Cat_이사 계획 첫 번째 이유_높은 집값 혹은 집세 부담',
 'Cat_이사 계획 첫 번째 이유_부모 혹은 자녀 등과 가까이 살기 위해',
 'Cat_이사 계획 첫 번째 이유_시설이나 설비가 더 양호한 집으로 이사',
 'Cat_이사 계획 첫 번째 이유_이사 계획 없음 및 모름',
 'Cat_이사 계획 첫 번째 이유_자가로 이사 또는 자가 마련을 위해',
 'Cat_이사 계획 첫 번째 이유_재개발이나 재건축으로 인해',
 'Cat_이사 계획 첫 번째 이유_직주근접 혹은 직장변동',
 'Cat_이사 계획 첫 번째 이유_집주인이 나가라고 해서',
    
 'Cat_이사 계획 중인 거주 지역_국내 to 국외',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 비수도권',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 수도권',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 거주 지역_수도권 to 비수도권',
 'Cat_이사 계획 중인 거주 지역_수도권 to 수도권',
 'Cat_이사 계획 중인 거주 지역_수도권 to 이사 계획 없음 및 모름',
                
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 전세',                

 'Cat_현재 가장 필요한 주거지원 1순위_공공분양주택공급',
 'Cat_현재 가장 필요한 주거지원 1순위_없음',
 'Cat_현재 가장 필요한 주거지원 1순위_월세보조금 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_임대 후 분양전환 공공임대주택 공급',
 'Cat_현재 가장 필요한 주거지원 1순위_장기공공임대주택 공급',
 'Cat_현재 가장 필요한 주거지원 1순위_전세자금 대출 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_주거상담과 정보제공 등',
 'Cat_현재 가장 필요한 주거지원 1순위_주택 구입자금 대출 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_주택개량 및 개보수 지원',

 'Cat_가구주 최종 학력_고등학교 졸업',
 'Cat_가구주 최종 학력_대학 졸업 이상',
 'Cat_가구주 최종 학력_중학교 졸업 이하',                
                
 'Cat_가구주 종사상 지위_무급가족종사자',
 'Cat_가구주 종사상 지위_무직 및 기타',
 'Cat_가구주 종사상 지위_사업자 및 자영자',
 'Cat_가구주 종사상 지위_상용근로자',
 'Cat_가구주 종사상 지위_임시일용근로자',                
                
 'Cat_주택 마련 예상 소요연수_10년 이상',
 'Cat_주택 마련 예상 소요연수_1~3년',
 'Cat_주택 마련 예상 소요연수_1년 미만',
 'Cat_주택 마련 예상 소요연수_3~5년',
 'Cat_주택 마련 예상 소요연수_5~10년',
 'Cat_주택 마련 예상 소요연수_계획 없음',
                
]]

In [207]:
X_41 = comp_41.drop('target', axis=1)
y_41 = comp_41['target']
X_41.shape

(6119, 108)

In [208]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [209]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [210]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 103, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 9}
0.7891176053222482


In [19]:
optuna_41 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [20]:
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(decimal.Decimal(auc_41).quantize(decimal.Decimal('1.000')))

0.786


In [21]:
X_train = X_train.values
y_train = y_train.values

In [22]:
auc_bootstrap = []

In [23]:
rs = RandomState(seed = 41)
bootstrap_auc(optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77081393, 0.78598403])

In [24]:
np.mean(auc_bootstrap)

0.7784223609449761

In [25]:
t_41 = auc_bootstrap
print(t_41)

[0.7847637559808613, 0.7800404776144909, 0.7810390678400547, 0.777044706937799, 0.7834714627477785, 0.7752878289473684, 0.7852203306561859, 0.7725911013328777, 0.7815543831168831, 0.7854953434723171, 0.7762597188995215, 0.7795438525290499, 0.7771568480861244, 0.7802193694463431, 0.7785986628503077, 0.7814369019138756, 0.77430258885851, 0.7774719113123717, 0.7811325187969924, 0.7769298957621326, 0.770935684381408, 0.7857463260423787, 0.7772235987696514, 0.771531100478469, 0.7749861158578264, 0.7828893967874231, 0.7782382091592619, 0.7809055664730007, 0.7776694933356119, 0.7713148282638413, 0.7830576085099111, 0.7826090439166097, 0.7826063738892686, 0.7795278323650034, 0.7770794172932332, 0.7784411312371838, 0.7764519608680792, 0.7781127178742311, 0.7843685919343814, 0.7770420369104579, 0.7751409774436091, 0.7799310064935066, 0.7788202751196173, 0.7799229964114833, 0.775941985645933, 0.7783076298701299, 0.7772369489063569, 0.7772956895078607, 0.7746336722488038, 0.7758885850991114, 0.773

In [26]:
column_to_drop_41 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [27]:
if not column_to_drop_41.startswith('Cat_'):
    comp_42 = comp_41.drop(column_to_drop_41, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']
else:
    comp_42 = comp_41.drop(comp_41.filter(regex='^' + column_to_drop_41).columns, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']

print(X_42.shape)

(6119, 104)


In [28]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [30]:
X_train.shape

(3916, 104)

In [31]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [32]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 53, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 5}
0.7921142914381588


In [33]:
optuna_42 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [34]:
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(decimal.Decimal(auc_42).quantize(decimal.Decimal('1.000')))

0.784


In [35]:
X_train = X_train.values
y_train = y_train.values

In [36]:
auc_bootstrap = []

In [37]:
rs = RandomState(seed = 42)
bootstrap_auc(optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76796281, 0.78572243])

In [38]:
np.mean(auc_bootstrap)

0.7774525949995728

In [39]:
t_42 = auc_bootstrap
print(t_42)

[0.7723107484620643, 0.7757363935406698, 0.7811245087149693, 0.7651363849965823, 0.7744200700615176, 0.7775253118591934, 0.7789190661312372, 0.777544002050581, 0.7705218301435407, 0.7738486842105263, 0.7739127648667122, 0.7785052118933697, 0.771870193950786, 0.7723321086807928, 0.7825209330143541, 0.769963794429255, 0.7854018925153793, 0.7805184125085441, 0.7799523667122351, 0.7730183057074504, 0.7693256578947367, 0.7784010808270677, 0.7759847060833903, 0.772169237012987, 0.7819655673274095, 0.7766895933014354, 0.7795411825017088, 0.7705004699248121, 0.7762810791182502, 0.7753572496582366, 0.7736964926520847, 0.7783316601161995, 0.7793596206425154, 0.7757711038961038, 0.780016447368421, 0.7726071214969241, 0.7780299470266576, 0.7816024436090225, 0.7774745813397128, 0.774102336807929, 0.7773357399179768, 0.7762570488721804, 0.7819468771360218, 0.7812953904647983, 0.7715230903964456, 0.7790739277170198, 0.7774238508202325, 0.7821417891319207, 0.7734882305194805, 0.7753171992481203, 0.776

In [40]:
column_to_drop_42 = '소득 중 근로/사업소득의 비중(월평균)'

In [41]:
if not column_to_drop_42.startswith('Cat_'):
    comp_43 = comp_42.drop(column_to_drop_42, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']
else:
    comp_43 = comp_42.drop(comp_42.filter(regex='^' + column_to_drop_42).columns, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']

print(X_43.shape)

(6119, 103)


In [42]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [44]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [45]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 97, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7899606841459443


In [46]:
optuna_43 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [47]:
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(decimal.Decimal(auc_43).quantize(decimal.Decimal('1.000')))

0.786


In [48]:
X_train = X_train.values
y_train = y_train.values

In [49]:
auc_bootstrap = []

In [50]:
rs = RandomState(seed = 43)
bootstrap_auc(optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77121223, 0.78687401])

In [51]:
np.mean(auc_bootstrap)

0.7793185809872694

In [52]:
t_43 = auc_bootstrap
print(t_43)

[0.7757283834586466, 0.7772476290157212, 0.7798669258373205, 0.7796212833219412, 0.7761208774777854, 0.7790846078263841, 0.7728207236842106, 0.7754880809979494, 0.7819895975734792, 0.7786066729323309, 0.7806599239576213, 0.7839040071770333, 0.7829935278537252, 0.7824915627136022, 0.7793542805878333, 0.785153579972659, 0.7763318096377307, 0.7701079759056733, 0.7808388157894738, 0.7712133672248803, 0.7756189123376622, 0.7872922718728639, 0.7844139823991798, 0.7860106587491456, 0.7798615857826384, 0.7813808313397129, 0.7770046565276828, 0.784085569036227, 0.7804089413875598, 0.784187030075188, 0.7803074803485989, 0.7807480348598769, 0.7842244104579631, 0.7833967019822283, 0.7762864191729323, 0.782318010936432, 0.7819201768626111, 0.7766842532467534, 0.7906378161312373, 0.7831136790840738, 0.7776508031442242, 0.7806866242310321, 0.7700198650034177, 0.7823126708817499, 0.7773197197539303, 0.7814315618591935, 0.7754800709159262, 0.7800912081339713, 0.7763558398838003, 0.7857890464798358, 0.7

In [53]:
column_to_drop_43 = 'Cat_현재 주차시설 이용편의성'

In [54]:
if not column_to_drop_43.startswith('Cat_'):
    comp_44 = comp_43.drop(column_to_drop_43, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']
else:
    comp_44 = comp_43.drop(comp_43.filter(regex='^' + column_to_drop_43).columns, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']

print(X_44.shape)

(6119, 99)


In [55]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [57]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [58]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 106, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 3}
0.7878863763470481


In [59]:
optuna_44 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [60]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(decimal.Decimal(auc_44).quantize(decimal.Decimal('1.000')))

0.788


In [61]:
X_train = X_train.values
y_train = y_train.values

In [62]:
auc_bootstrap = []

In [63]:
rs = RandomState(seed = 44)
bootstrap_auc(optuna_44, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76946033, 0.7862003 ])

In [64]:
np.mean(auc_bootstrap)

0.7779071678218984

In [65]:
t_44 = auc_bootstrap
print(t_44)

[0.7887607869104579, 0.7777976546479837, 0.7828279861585783, 0.7757844540328094, 0.7700171949760766, 0.7722600179425837, 0.7862883415926178, 0.7710291353383458, 0.7784811816473001, 0.7862029007177033, 0.7742198180109363, 0.777143497949419, 0.7825756685748462, 0.784753075871497, 0.7821471291866029, 0.7791700487012987, 0.7802647599111415, 0.7737525632262474, 0.7740809765892003, 0.7787588644907724, 0.7796773538961039, 0.7702174470266575, 0.7738193139097744, 0.7763958902939166, 0.7812740302460698, 0.7823286910457963, 0.778441131237184, 0.7828813867053999, 0.7770633971291867, 0.7822312350478469, 0.777082087320574, 0.7790365473342447, 0.7754774008885851, 0.7829000768967874, 0.7739581553315106, 0.7798776059466848, 0.7776000726247436, 0.7794584116541354, 0.7834100521189336, 0.7792448094668489, 0.7863150418660287, 0.7802594198564592, 0.7802727699931649, 0.7713548786739576, 0.7806652640123035, 0.7810657681134654, 0.7780860176008203, 0.7807186645591251, 0.7662417763157896, 0.7751089371155161, 0.7

In [66]:
column_to_drop_44 = 'Cat_가구주 종사상 지위'

In [67]:
if not column_to_drop_44.startswith('Cat_'):
    comp_45 = comp_44.drop(column_to_drop_44, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']
else:
    comp_45 = comp_44.drop(comp_44.filter(regex='^' + column_to_drop_44).columns, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']

print(X_45.shape)

(6119, 94)


In [68]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [70]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [71]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 50, 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 7}
0.784916819005167


In [72]:
optuna_45 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [73]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(decimal.Decimal(auc_45).quantize(decimal.Decimal('1.000')))

0.778


In [74]:
X_train = X_train.values
y_train = y_train.values

In [75]:
auc_bootstrap = []

In [76]:
rs = RandomState(seed = 45)
bootstrap_auc(optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7653638 , 0.78245355])

In [77]:
np.mean(auc_bootstrap)

0.7748281383501366

In [78]:
t_45 = auc_bootstrap
print(t_45)

[0.7799043062200957, 0.7727472979323309, 0.7730877264183185, 0.7656557053144224, 0.776710953520164, 0.7744801456766918, 0.7720984812884484, 0.778829620215311, 0.7736123867908408, 0.7659667634996582, 0.7741664174641147, 0.7680760850991114, 0.7841069292549556, 0.7742571983937117, 0.773365409261791, 0.7819361970266574, 0.776241028708134, 0.7733840994531784, 0.7725216806220094, 0.7610539131920712, 0.7693083027170198, 0.7689344988892686, 0.7648813973855092, 0.7781207279562543, 0.7671629357484621, 0.772633821770335, 0.7745762666609707, 0.773870044429255, 0.7746083069890636, 0.772705912508544, 0.7770500469924813, 0.7752691387559809, 0.7757297184723171, 0.7734855604921395, 0.7785225670710867, 0.7750849068694464, 0.7753358894395078, 0.7789911568694464, 0.7765147065105946, 0.7706433163875599, 0.7675767899863295, 0.7816224688140807, 0.7774772513670539, 0.7701093109193439, 0.7741997928058784, 0.7793516105604921, 0.768898453520164, 0.7798001751537936, 0.7774078306561859, 0.7744133949931646, 0.77559

In [79]:
column_to_drop_45 = '소득 중 정부 보조금의 비중(월평균)'

In [80]:
if not column_to_drop_45.startswith('Cat_'):
    comp_46 = comp_45.drop(column_to_drop_45, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']
else:
    comp_46 = comp_45.drop(comp_45.filter(regex='^' + column_to_drop_45).columns, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']

print(X_46.shape)

(6119, 93)


In [81]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [83]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [84]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7877069090726968


In [85]:
optuna_46 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [86]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(decimal.Decimal(auc_46).quantize(decimal.Decimal('1.000')))

0.783


In [87]:
X_train = X_train.values
y_train = y_train.values

In [88]:
auc_bootstrap = []

In [89]:
rs = RandomState(seed = 46)
bootstrap_auc(optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76925977, 0.78613181])

In [90]:
np.mean(auc_bootstrap)

0.7779555754175923

In [91]:
t_46 = auc_bootstrap
print(t_46)

[0.784953327922078, 0.7812606801093644, 0.7844567028366369, 0.7824435022214629, 0.7774692412850307, 0.7830068779904306, 0.7729115046138072, 0.7647986265379357, 0.7759900461380725, 0.7775279818865346, 0.7736671223513328, 0.7755681818181819, 0.7729515550239234, 0.7740889866712234, 0.7766014823991798, 0.7773063696172249, 0.7694191088516746, 0.7793996710526314, 0.7795411825017089, 0.7744254101161996, 0.7804703520164047, 0.7778964456596036, 0.7760487867395762, 0.7715524606971975, 0.7813754912850308, 0.7807747351332878, 0.7810257177033493, 0.7779631963431306, 0.7824461722488038, 0.78181871582365, 0.7806599239576213, 0.777741584073821, 0.774671052631579, 0.7807774051606289, 0.7872789217361587, 0.7833112611073137, 0.783989448051948, 0.7797254143882434, 0.7777202238550922, 0.7873817177887903, 0.7731971975393028, 0.774169087491456, 0.778713474025974, 0.7838185663021189, 0.7786627435064934, 0.7740676264524948, 0.7795331724196854, 0.7779258159603555, 0.784656954887218, 0.7829775076896788, 0.785869

In [92]:
column_to_drop_46 = 'Cat_이사 계획 첫 번째 이유'

In [93]:
if not column_to_drop_46.startswith('Cat_'):
    comp_47 = comp_46.drop(column_to_drop_46, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']
else:
    comp_47 = comp_46.drop(comp_46.filter(regex='^' + column_to_drop_46).columns, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']

print(X_47.shape)

(6119, 81)


In [94]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [96]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [97]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 96, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7874231003597693


In [98]:
optuna_47 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [99]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(decimal.Decimal(auc_47).quantize(decimal.Decimal('1.000')))

0.785


In [100]:
X_train = X_train.values
y_train = y_train.values

In [101]:
auc_bootstrap = []

In [102]:
rs = RandomState(seed = 47)
bootstrap_auc(optuna_47, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76903102, 0.78530631])

In [103]:
np.mean(auc_bootstrap)

0.7775500550025632

In [104]:
t_47 = auc_bootstrap
print(t_47)

[0.7722039473684211, 0.773103746582365, 0.7735309509569378, 0.7774906015037594, 0.7773357399179768, 0.784518113465482, 0.7808735261449078, 0.7774719113123718, 0.7752344284005468, 0.7868316921565277, 0.7744574504442925, 0.7727619830827068, 0.7747885338345865, 0.7729275247778536, 0.7716939721462748, 0.7728634441216679, 0.7758031442241969, 0.7805277576042379, 0.7745282061688312, 0.7805010573308271, 0.7782435492139439, 0.7769966464456595, 0.7746496924128503, 0.7761115323820915, 0.7779364960697198, 0.7834647876794258, 0.7804142814422419, 0.7770313568010937, 0.7664794087491456, 0.7767990644224196, 0.7800324675324676, 0.7753599196855776, 0.7769245557074503, 0.7797494446343132, 0.7757817840054682, 0.7763825401572112, 0.7745188610731374, 0.7802353896103896, 0.7776094177204375, 0.7784411312371837, 0.7773357399179768, 0.7843632518796992, 0.7713468685919345, 0.7800071022727273, 0.7758805750170882, 0.7735843515037595, 0.7800511577238552, 0.7765667720437457, 0.7762543788448394, 0.7819094967532467, 0

In [105]:
column_to_drop_47 = '소득 대비 주택 임대료의 비율'

In [106]:
if not column_to_drop_47.startswith('Cat_'):
    comp_48 = comp_47.drop(column_to_drop_47, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']
else:
    comp_48 = comp_47.drop(comp_47.filter(regex='^' + column_to_drop_47).columns, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']

print(X_48.shape)

(6119, 80)


In [107]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [109]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [110]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7868826117079442


In [111]:
optuna_48 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [112]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(decimal.Decimal(auc_48).quantize(decimal.Decimal('1.000')))

0.786


In [113]:
X_train = X_train.values
y_train = y_train.values

In [114]:
auc_bootstrap = []

In [115]:
rs = RandomState(seed = 48)
bootstrap_auc(optuna_48, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76863165, 0.78515885])

In [116]:
np.mean(auc_bootstrap)

0.77719231739683

In [117]:
t_48 = auc_bootstrap
print(t_48)

[0.7808067754613807, 0.7725363657723855, 0.7788990409261791, 0.7740636214114832, 0.7780299470266575, 0.782874711637047, 0.7741076768626111, 0.7792434744531784, 0.7733293638926864, 0.780347530758715, 0.7802207044600137, 0.77101445018797, 0.7767042784518114, 0.7777495941558441, 0.7777856395249487, 0.7725577259911142, 0.7842577857997266, 0.7718875491285031, 0.7786720886021874, 0.7784304511278195, 0.7698409731715652, 0.7791126431134656, 0.7847971313226246, 0.7844046373034861, 0.7746803977272727, 0.7810911333732058, 0.7848478618421053, 0.7725657360731374, 0.777326394822283, 0.7755775269138755, 0.7727272727272727, 0.7743479793233082, 0.776658887987013, 0.7818707813568011, 0.7758498697026658, 0.7694244489063569, 0.7785692925495556, 0.774876644736842, 0.7778230199077237, 0.776090172163363, 0.7859332279562543, 0.7851041844668489, 0.7732439230177717, 0.7846743100649349, 0.7836797248803828, 0.7756149072966507, 0.7842404306220097, 0.7703349282296651, 0.774520196086808, 0.7800044322453862, 0.766701

In [118]:
column_to_drop_48 = 'Cat_주택 마련 예상 소요연수'

In [119]:
if not column_to_drop_48.startswith('Cat_'):
    comp_49 = comp_48.drop(column_to_drop_48, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']
else:
    comp_49 = comp_48.drop(comp_48.filter(regex='^' + column_to_drop_48).columns, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']

print(X_49.shape)

(6119, 74)


In [120]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [122]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [123]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 180, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 4}
0.7868492224475997


In [124]:
optuna_49 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)

In [125]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(decimal.Decimal(auc_49).quantize(decimal.Decimal('1.000')))

0.781


In [126]:
X_train = X_train.values
y_train = y_train.values

In [127]:
auc_bootstrap = []

In [128]:
rs = RandomState(seed = 49)
bootstrap_auc(optuna_49, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76752976, 0.78382578])

In [129]:
np.mean(auc_bootstrap)

0.7759090989191729

In [130]:
t_49 = auc_bootstrap
print(t_49)

[0.7733800944121667, 0.7813527960526316, 0.7791887388926863, 0.774755158492823, 0.7759793660287082, 0.7795358424470268, 0.775007476076555, 0.7737552332535885, 0.7796666737867396, 0.7775546821599453, 0.7715751559295967, 0.7720063653451812, 0.7672550516917294, 0.7723160885167464, 0.7775426670369104, 0.7772529690704033, 0.7787001238892686, 0.7738526892515379, 0.779517152255639, 0.772456264952153, 0.7736978276657552, 0.7762810791182502, 0.779870930878332, 0.7727619830827068, 0.773919439935065, 0.7668892579460014, 0.7726471719070402, 0.7773717852870814, 0.7804449867566643, 0.776908535543404, 0.7732919835099112, 0.7813100756151743, 0.7682523069036227, 0.7777549342105263, 0.7752090631408066, 0.7786627435064936, 0.7766081574675325, 0.7748526144907724, 0.7752023880724539, 0.774572261619959, 0.7775600222146275, 0.7762717340225564, 0.7783543553485989, 0.7793169002050581, 0.7794130211893371, 0.7766976033834585, 0.7740716314935066, 0.7710571706254271, 0.770935684381408, 0.7790485624572796, 0.778985

In [131]:
column_to_drop_49 = 'Cat_현재 거주 지역' 

In [132]:
if not column_to_drop_49.startswith('Cat_'):
    comp_50 = comp_49.drop(column_to_drop_49, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']
else:
    comp_50 = comp_49.drop(comp_49.filter(regex='^' + column_to_drop_49).columns, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']

print(X_50.shape)

(6119, 57)


In [133]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [134]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [135]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [136]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 107, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 5}
0.7779196821342415


In [137]:
optuna_50 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [138]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(decimal.Decimal(auc_50).quantize(decimal.Decimal('1.000')))

0.775


In [139]:
X_train = X_train.values
y_train = y_train.values

In [140]:
auc_bootstrap = []

In [141]:
rs = RandomState(seed = 50)
bootstrap_auc(optuna_50, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76269101, 0.77821812])

In [142]:
np.mean(auc_bootstrap)

0.7706949707365004

In [143]:
t_50 = auc_bootstrap
print(t_50)

[0.7779925666438825, 0.7757937991285031, 0.7786894437799043, 0.7738673744019139, 0.7716766169685578, 0.769850318267259, 0.7622834607826384, 0.7723307736671223, 0.7728273987525631, 0.7681682010423787, 0.770503139952153, 0.7697675474196857, 0.7791032980177717, 0.7701026358509913, 0.7700492353041696, 0.767160265721121, 0.7681828861927547, 0.7692549021701982, 0.7722439977785371, 0.7737245279391661, 0.7648213217703349, 0.774876644736842, 0.774755158492823, 0.769452484193438, 0.7696380510936431, 0.7787375042720438, 0.7717193374060151, 0.7748699696684895, 0.7678184274606972, 0.7711933420198223, 0.7692749273752564, 0.7714670198222829, 0.7727059125085441, 0.7664954289131921, 0.7767469988892687, 0.7730917314593301, 0.7705965909090909, 0.761216784859877, 0.7692602422248804, 0.7689158086978811, 0.7719623098940533, 0.77445077537594, 0.7743493143369787, 0.7683777981886534, 0.7694057587149692, 0.7682603169856459, 0.7747124380553656, 0.7709957599965824, 0.7730970715140124, 0.7689411739576214, 0.773402

In [144]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [145]:
column_to_drop_50 = 'Cat_가구주 최종 학력'

In [146]:
if not column_to_drop_50.startswith('Cat_'):
    comp_51 = comp_50.drop(column_to_drop_50, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']
else:
    comp_51 = comp_50.drop(comp_50.filter(regex='^' + column_to_drop_50).columns, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']

print(X_51.shape)

(6119, 54)


In [147]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [148]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [149]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 10}
0.7765382014874915


In [150]:
optuna_51 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [151]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(decimal.Decimal(auc_51).quantize(decimal.Decimal('1.000')))

0.773


In [152]:
X_train = X_train.values
y_train = y_train.values

In [153]:
auc_bootstrap = []

In [154]:
rs = RandomState(seed = 51)
bootstrap_auc(optuna_51, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76261314, 0.77668579])

In [155]:
np.mean(auc_bootstrap)

0.7699746073724795

In [156]:
t_51 = auc_bootstrap
print(t_51)

[0.7674286034688996, 0.7709717297505125, 0.7730022855434039, 0.7671242203520163, 0.7653246219241284, 0.7684405438311689, 0.7744894907723854, 0.7756469476247436, 0.7789284112269309, 0.7681975713431306, 0.7656703904647983, 0.7643140165755298, 0.7707794877819549, 0.7714016041524264, 0.7665074440362271, 0.7710825358851676, 0.7723774991455913, 0.7657277960526315, 0.7683230626281613, 0.7710224602699932, 0.772717927631579, 0.7739848556049214, 0.7671936410628846, 0.7685540199931646, 0.7729235197368421, 0.7673191323479153, 0.76958064550581, 0.7666356053485986, 0.7703055579289133, 0.7660802396616542, 0.7658012218045114, 0.7743146039815447, 0.7719796650717703, 0.7720397406869446, 0.766211071001367, 0.7696420561346548, 0.7739394651401229, 0.7739461402084757, 0.7709396894224196, 0.7761676029562543, 0.7703402682843472, 0.7753892899863295, 0.7733053336466166, 0.7741851076555023, 0.7704964648838004, 0.7722239725734792, 0.767931903622693, 0.7710024350649352, 0.7676929361756666, 0.7632954011449077, 0.77

In [157]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [158]:
column_to_drop_51 = '장기부채부담지표' 

In [159]:
if not column_to_drop_51.startswith('Cat_'):
    comp_52 = comp_51.drop(column_to_drop_51, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']
else:
    comp_52 = comp_51.drop(comp_51.filter(regex='^' + column_to_drop_51).columns, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']

print(X_52.shape)

(6119, 53)


In [160]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [161]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [162]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 51, 'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7801484152622309


In [163]:
optuna_52 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [164]:
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(decimal.Decimal(auc_52).quantize(decimal.Decimal('1.000')))

0.777


In [165]:
X_train = X_train.values
y_train = y_train.values

In [166]:
auc_bootstrap = []

In [167]:
rs = RandomState(seed = 52)
bootstrap_auc(optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76321877, 0.77791777])

In [168]:
np.mean(auc_bootstrap)

0.7708076031698565

In [169]:
t_52 = auc_bootstrap
print(t_52)

[0.7661616754955571, 0.778562617481203, 0.7686768412508543, 0.764603714542037, 0.7769926414046481, 0.7751289623205742, 0.7657972167634997, 0.77205309082365, 0.7689064636021873, 0.7758578797846889, 0.7715965161483254, 0.762574493762816, 0.7722319826555024, 0.7667063610731375, 0.7701587064251538, 0.7658612974196856, 0.7692028366370471, 0.7639642429938482, 0.7738046287593985, 0.7762049833390294, 0.7778350350307588, 0.7699330891148325, 0.7714683548359534, 0.7749153601332878, 0.768875758287765, 0.7704604195146958, 0.7707447774265209, 0.7720477507689678, 0.7680360346889952, 0.7738113038277512, 0.7721478767942583, 0.7743760146103896, 0.769635381066302, 0.7724949803485988, 0.7713802439336979, 0.7713935940704033, 0.77276999316473, 0.7716405715994531, 0.7708088580827068, 0.7751116071428572, 0.775823169429255, 0.7804356416609707, 0.7684018284347232, 0.7663111970266575, 0.7757670988550924, 0.7739835205912509, 0.7736350820232399, 0.7675153793574846, 0.7660041438824334, 0.7680907702494872, 0.7677623

In [170]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [171]:
column_to_drop_52 = '중기부채부담지표'

In [172]:
if not column_to_drop_52.startswith('Cat_'):
    comp_53 = comp_52.drop(column_to_drop_52, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']
else:
    comp_53 = comp_52.drop(comp_52.filter(regex='^' + column_to_drop_52).columns, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']

print(X_53.shape)

(6119, 52)


In [173]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [174]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [175]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 92, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7716842377649229


In [176]:
optuna_53 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [177]:
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(decimal.Decimal(auc_53).quantize(decimal.Decimal('1.000')))

0.776


In [178]:
X_train = X_train.values
y_train = y_train.values

In [179]:
auc_bootstrap = []

In [180]:
rs = RandomState(seed = 53)
bootstrap_auc(optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76072984, 0.77466294])

In [181]:
np.mean(auc_bootstrap)

0.7680958352913534

In [182]:
t_53 = auc_bootstrap
print(t_53)

[0.7722199675324677, 0.7651737653793573, 0.7674379485645932, 0.7666382753759399, 0.7703108979835953, 0.7635770890293916, 0.772218632518797, 0.7691574461722488, 0.7685460099111415, 0.7745268711551606, 0.7595226525119617, 0.761486457621326, 0.7659240430622009, 0.7709023090396446, 0.7694377990430622, 0.7720477507689678, 0.7672016511449078, 0.7684125085440875, 0.7664914238721805, 0.7737899436090226, 0.7647011705399864, 0.7751943779904307, 0.7714323094668488, 0.7724495898838004, 0.7636625299043062, 0.7657277960526316, 0.768277672163363, 0.7688984535201641, 0.7685206446514012, 0.7618629314764183, 0.7704403943096377, 0.7648533620984278, 0.7663953028879015, 0.763378171992481, 0.7688170176862612, 0.7678237675153794, 0.7717994382262474, 0.7681455058099794, 0.7694444741114148, 0.7707487824675324, 0.7655662593984962, 0.7662537914388242, 0.7671616007347916, 0.7672724068694463, 0.7630057031784006, 0.7625958539815447, 0.768881098342447, 0.7716552567498292, 0.7672603917464115, 0.7736404220779222, 0.76

In [183]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [184]:
column_to_drop_53 = 'Cat_이사 계획 중인 주택의 점유형태'

In [185]:
if not column_to_drop_53.startswith('Cat_'):
    comp_54 = comp_53.drop(column_to_drop_53, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']
else:
    comp_54 = comp_53.drop(comp_53.filter(regex='^' + column_to_drop_53).columns, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']

print(X_54.shape)

(6119, 32)


In [186]:
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [187]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [188]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 148, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 7}
0.771076970592409


In [189]:
optuna_54 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [190]:
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(decimal.Decimal(auc_54).quantize(decimal.Decimal('1.000')))

0.767


In [191]:
X_train = X_train.values
y_train = y_train.values

In [192]:
auc_bootstrap = []

In [193]:
rs = RandomState(seed = 54)
bootstrap_auc(optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75728907, 0.7689055 ])

In [194]:
np.mean(auc_bootstrap)

0.7634302288480435

In [195]:
t_54 = auc_bootstrap
print(t_54)

[0.7612581702836637, 0.7626746197881067, 0.755789954289132, 0.7602168596206426, 0.7640750491285031, 0.7618282211209843, 0.7576496283321942, 0.7628348214285715, 0.7605159026828434, 0.7632126302973343, 0.7660361842105263, 0.7641458048530417, 0.7605372629015721, 0.763390187115516, 0.7639055023923444, 0.7642686261107312, 0.7649508180963773, 0.7612862055707451, 0.7617174149863295, 0.7640536889097743, 0.7618709415584416, 0.763506333304853, 0.7647639161825018, 0.7626305643369788, 0.7688997885338346, 0.7630991541353384, 0.7650829844497608, 0.7650456040669857, 0.7617841656698565, 0.7658733125427205, 0.7669893839712918, 0.759434541609706, 0.7670547996411483, 0.7630163832877649, 0.765790541695147, 0.7622367353041696, 0.7625971889952152, 0.764522278708134, 0.7672016511449077, 0.764336711807929, 0.7626666097060834, 0.7598884462576897, 0.766628930280246, 0.7653032617053999, 0.7628868869617225, 0.7651043446684894, 0.7566310129015721, 0.7605172376965139, 0.7619096569548871, 0.7622674406185919, 0.76285

In [196]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [197]:
column_to_drop_54 = 'Cat_이사 계획 중인 거주 지역'

In [198]:
if not column_to_drop_54.startswith('Cat_'):
    comp_55 = comp_54.drop(column_to_drop_54, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']
else:
    comp_55 = comp_54.drop(comp_54.filter(regex='^' + column_to_drop_54).columns, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']

print(X_55.shape)

(6119, 25)


In [199]:
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [200]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [201]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 148, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 7}
0.7660143239926877


In [202]:
optuna_55 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [203]:
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(decimal.Decimal(auc_55).quantize(decimal.Decimal('1.000')))

0.759


In [204]:
X_train = X_train.values
y_train = y_train.values

In [205]:
auc_bootstrap = []

In [206]:
rs = RandomState(seed = 55)
bootstrap_auc(optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74947914, 0.76121191])

In [207]:
np.mean(auc_bootstrap)

0.7553266231096206

In [208]:
t_55 = auc_bootstrap
print(t_55)

[0.7539289452323991, 0.7570795774948735, 0.7523282638414217, 0.7550717169343814, 0.7549435556220095, 0.7585200572453862, 0.7581075380211894, 0.7559648410799726, 0.7519037294941899, 0.756942071086808, 0.756942071086808, 0.7558219946172249, 0.7567818694463432, 0.7571890486158579, 0.7552879891490087, 0.754138542378674, 0.7590767579460014, 0.7544055451127819, 0.7560556220095694, 0.7572865046138073, 0.7508504037081339, 0.7552105583561177, 0.7580247671736158, 0.7602929553998632, 0.7574533813226249, 0.7565789473684211, 0.7526553421907041, 0.7519584650546822, 0.7542186431989063, 0.751349698820916, 0.7526099517259056, 0.7530318160457963, 0.7501842318865345, 0.7569701063738893, 0.7580861778024607, 0.754091816900205, 0.7501521915584416, 0.7503390934723171, 0.7548394245557074, 0.7521200017088174, 0.7553947902426521, 0.7572678144224196, 0.7568352699931646, 0.7560783172419685, 0.7504952900717703, 0.7580060769822282, 0.7540050410116199, 0.7526833774777854, 0.7539396253417635, 0.7561317177887901, 0.75

In [209]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [210]:
column_to_drop_55 = 'Cat_현재 주택의 유형'

In [211]:
if not column_to_drop_55.startswith('Cat_'):
    comp_56 = comp_55.drop(column_to_drop_55, axis=1)
    X_56 = comp_56.drop('target', axis=1)
    y_56 = comp_56['target']
else:
    comp_56 = comp_55.drop(comp_55.filter(regex='^' + column_to_drop_55).columns, axis=1)
    X_56 = comp_56.drop('target', axis=1)
    y_56 = comp_56['target']

print(X_56.shape)

(6119, 14)


In [212]:
X_train, X_test, y_train, y_test = train_test_split(X_56, y_56, test_size=0.2, shuffle=True, stratify=y_56, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [213]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [214]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 107, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7670702593510798


In [215]:
optuna_56 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_56.fit(X_train, y_train)

In [216]:
optuna_proba_56 = optuna_56.predict_proba(X_test)[:, 1]
auc_56 = roc_auc_score(y_test, optuna_proba_56)
print(decimal.Decimal(auc_56).quantize(decimal.Decimal('1.000')))

0.748


In [217]:
X_train = X_train.values
y_train = y_train.values

In [218]:
auc_bootstrap = []

In [219]:
rs = RandomState(seed = 56)
bootstrap_auc(optuna_56, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.73122644, 0.75043258])

In [220]:
np.mean(auc_bootstrap)

0.7412193046981802

In [221]:
t_56 = auc_bootstrap
print(t_56)

[0.7440805493848258, 0.7462339264354068, 0.736862130468216, 0.7496902768284347, 0.7410100179425837, 0.7403585312713603, 0.7381437435919344, 0.7520172056561859, 0.7376471185064934, 0.7419925880041012, 0.7464889140464798, 0.748778462491456, 0.7382438696172248, 0.7372639695830486, 0.7452313311688311, 0.7435879293403964, 0.7342775440020505, 0.7427161654135339, 0.7430018583390294, 0.7379261363636364, 0.74542624316473, 0.7392491349111416, 0.7455677546138072, 0.7353388798701299, 0.7396082535885167, 0.7427895911654135, 0.7447146808783323, 0.7412116050068353, 0.7299093792720438, 0.7362827345352017, 0.7469494937628162, 0.7398392109535202, 0.7448762175324676, 0.7368955058099793, 0.7438802973342448, 0.7355057565789473, 0.7407817306049214, 0.7416414794087491, 0.7430592639268625, 0.747446118848257, 0.7451298701298701, 0.742219540328093, 0.7438042015550239, 0.7479841293574846, 0.7387578498803827, 0.7402023346719071, 0.7451966208133971, 0.7447040007689678, 0.7411355092276146, 0.7413584565105947, 0.737

In [222]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [223]:
# 57
column_to_drop_56 = '현재 주택의 면적(㎡)'

In [224]:
if not column_to_drop_56.startswith('Cat_'):
    comp_57 = comp_56.drop(column_to_drop_56, axis=1)
    X_57 = comp_57.drop('target', axis=1)
    y_57 = comp_57['target']
else:
    comp_57 = comp_56.drop(comp_56.filter(regex='^' + column_to_drop_56).columns, axis=1)
    X_57 = comp_57.drop('target', axis=1)
    y_57 = comp_57['target']

print(X_57.shape)

(6119, 13)


In [225]:
X_train, X_test, y_train, y_test = train_test_split(X_57, y_57, test_size=0.2, shuffle=True, stratify=y_57, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [226]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [227]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 107, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7403546774180085


In [228]:
optuna_57 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_57.fit(X_train, y_train)

In [229]:
optuna_proba_57 = optuna_57.predict_proba(X_test)[:, 1]
auc_57 = roc_auc_score(y_test, optuna_proba_57)
print(decimal.Decimal(auc_57).quantize(decimal.Decimal('1.000')))

0.735


In [230]:
X_train = X_train.values
y_train = y_train.values

In [231]:
auc_bootstrap = []

In [232]:
rs = RandomState(seed = 57)
bootstrap_auc(optuna_57, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72788089, 0.73780352])

In [233]:
np.mean(auc_bootstrap)

0.7332470522898155

In [234]:
t_57 = auc_bootstrap
print(t_57)

[0.7292966079972659, 0.7337875939849624, 0.7382251794258374, 0.7307971633629529, 0.7336460825358851, 0.7301189764183186, 0.733371069719754, 0.7342895591250854, 0.7340706168831169, 0.7367726845522898, 0.7373307202665756, 0.732303058783322, 0.7249711637047164, 0.7371438183526999, 0.7332135381066301, 0.7351172676008203, 0.7361292079630896, 0.7355471420027341, 0.7279963046821599, 0.7344604408749146, 0.7383319805194805, 0.7322576683185237, 0.7355204417293234, 0.7296704118250171, 0.7339130852699932, 0.7345538918318524, 0.7277533321941217, 0.7343616498632947, 0.7362760594668489, 0.7352961594326726, 0.7321081467874231, 0.730367288961039, 0.730134996582365, 0.7292992780246069, 0.7359663362952837, 0.7327089029391661, 0.7362600393028025, 0.733539281442242, 0.7376831638755981, 0.7290055750170882, 0.7330853767942583, 0.7293740387901573, 0.7351920283663704, 0.7362146488380041, 0.7335686517429939, 0.7358942455570745, 0.7360891575529734, 0.7352614490772386, 0.7295022001025291, 0.730631621667806, 0.734

In [235]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [236]:
# 58
column_to_drop_57 = 'Cat_현재 주택의 점유형태'

In [237]:
if not column_to_drop_57.startswith('Cat_'):
    comp_58 = comp_57.drop(column_to_drop_57, axis=1)
    X_58 = comp_58.drop('target', axis=1)
    y_58 = comp_58['target']
else:
    comp_58 = comp_57.drop(comp_57.filter(regex='^' + column_to_drop_57).columns, axis=1)
    X_58 = comp_58.drop('target', axis=1)
    y_58 = comp_58['target']

print(X_58.shape)

(6119, 9)


In [238]:
X_train, X_test, y_train, y_test = train_test_split(X_58, y_58, test_size=0.2, shuffle=True, stratify=y_58, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [239]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [240]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 125, 'max_depth': 3, 'min_samples_split': 4, 'min_samples_leaf': 9}
0.7172639170610773


In [241]:
optuna_58 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_58.fit(X_train, y_train)

In [242]:
optuna_proba_58 = optuna_58.predict_proba(X_test)[:, 1]
auc_58 = roc_auc_score(y_test, optuna_proba_58)
print(decimal.Decimal(auc_58).quantize(decimal.Decimal('1.000')))

0.706


In [243]:
X_train = X_train.values
y_train = y_train.values

In [244]:
auc_bootstrap = []

In [245]:
rs = RandomState(seed = 58)
bootstrap_auc(optuna_58, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.70177931, 0.70648122])

In [246]:
np.mean(auc_bootstrap)

0.7055986895505809

In [247]:
t_58 = auc_bootstrap
print(t_58)

[0.7058243976418317, 0.7064812243677374, 0.7057683270676691, 0.7059525589542036, 0.7058964883800409, 0.7064251537935747, 0.7064251537935747, 0.7059525589542036, 0.7057683270676691, 0.7064251537935747, 0.7064812243677374, 0.7017793062200957, 0.7064812243677374, 0.7058964883800409, 0.7058243976418317, 0.7064251537935747, 0.7064812243677374, 0.7057683270676691, 0.7064251537935747, 0.7064251537935747, 0.7064251537935747, 0.7064251537935747, 0.7064251537935747, 0.7018353767942582, 0.7019074675324675, 0.7033279220779219, 0.7058964883800409, 0.7064251537935747, 0.7058964883800409, 0.7064251537935747, 0.7058243976418317, 0.7064812243677374, 0.7058243976418317, 0.7058964883800409, 0.7064812243677374, 0.7059525589542036, 0.7058243976418317, 0.7058243976418317, 0.7059525589542036, 0.7019074675324675, 0.7064251537935747, 0.7057683270676691, 0.7059525589542036, 0.7057683270676691, 0.7049753289473684, 0.7058964883800409, 0.7064251537935747, 0.7064812243677374, 0.7043585526315788, 0.7057683270676691,

In [248]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 59.