In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000,
                  rs=None, auc_bootstrap=None):
    """Bootstrap a 95% percentile interval for the test-set ROC AUC.

    Each round draws the training rows with replacement, refits a copy of
    ``clf`` on that resample and scores its positive-class probabilities on
    the fixed test set.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``.
        Deep-copied once, so the caller's fitted model is not overwritten by
        the bootstrap refits.
    X_train, y_train : training features / labels.
        Converted with ``np.asarray`` so DataFrames / Series work as well as
        ndarrays.
    X_test, y_test : held-out features / labels.
        ``X_test`` is handed to ``predict_proba`` unchanged; ``y_test`` is
        flattened with ``np.ravel``.
    nsamples : int
        Number of bootstrap rounds.
    rs : object with ``randint(high, size=...)``, optional
        Random source (e.g. ``numpy.random.RandomState``).  When omitted the
        module-level ``rs`` is used, as the original implementation did.
    auc_bootstrap : list, optional
        Receives one AUC per round.  When omitted the module-level
        ``auc_bootstrap`` list is used (created if it does not exist yet).

    Returns
    -------
    numpy.ndarray
        The 2.5th and 97.5th percentiles of ``auc_bootstrap``; scores already
        present in the list before the call are included.

    Raises
    ------
    NameError
        If ``rs`` is not passed and no module-level ``rs`` is defined.
    """
    import copy  # local import: keeps this cell self-contained

    module_scope = globals()
    if rs is None:
        rs = module_scope.get('rs')
        if rs is None:
            raise NameError(
                "bootstrap_auc needs a random state: pass rs=... or define a module-level 'rs'"
            )
    if auc_bootstrap is None:
        auc_bootstrap = module_scope.setdefault('auc_bootstrap', [])

    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    y_true = np.ravel(y_test)

    # fit() would otherwise replace the caller's trained model with one trained
    # on the last bootstrap resample.
    model = copy.deepcopy(clf)
    n_rows = X_train.shape[0]

    for _ in range(nsamples):
        idx = rs.randint(n_rows, size=n_rows)
        model.fit(X_train[idx], y_train[idx])
        pred = model.predict_proba(X_test)[:, 1]
        auc_bootstrap.append(roc_auc_score(y_true, np.ravel(pred)))
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [4]:
class EarlyStoppingCallback(object):
    """Optuna study callback that stops optimisation when the best value stalls.

    After every trial the callback compares ``study.best_value`` with the best
    value it has recorded so far.  A strict improvement resets the patience
    counter; anything else increments it.  Once the counter reaches
    ``early_stopping_rounds`` the callback calls ``study.stop()``.

    Parameters
    ----------
    early_stopping_rounds : int
        Number of consecutive non-improving trials tolerated before stopping.
    direction : str
        ``"minimize"`` or ``"maximize"``; selects which comparison counts as
        an improvement.

    Raises
    ------
    ValueError
        If ``direction`` is neither ``"minimize"`` nor ``"maximize"``.
    """

    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        # Trials seen since the last improvement.
        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            # The exception used to be built but never raised, leaving an
            # object without _operator/_score that failed on first use.
            raise ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        """Update the patience counter after a trial; stop the study if exhausted."""
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [5]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [6]:
# Read the CP949-encoded survey extract and label the question-41 column as the target.
청년가구 = (
    pd.read_csv('청년가구_변수추가.csv', encoding='cp949')
    .rename(columns={'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?': 'target'})
)

In [7]:
# Relabel every column positionally with a short name.  The list needs exactly one
# entry per column of the loaded frame, in the frame's column order; pandas raises
# on a length mismatch but cannot detect a wrong order.
# The 'Cat_' prefix is what later cells test (startswith('Cat_')) to decide whether
# a feature was one-hot encoded — presumably it marks the categorical survey items.
# The final entry keeps the label column named 'target'.
청년가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [8]:
# Separate the label, then partition the frame by dtype: object columns become the
# categorical block, everything else (minus the label) the numeric block.
target = 청년가구['target']
cat = 청년가구.select_dtypes(include=['object'])
num = 청년가구.select_dtypes(exclude=['object'])
num_청년 = num.drop(columns=['target'])

In [9]:
# Scale the numeric features with RobustScaler and keep the original column labels.
# NOTE(review): the scaler is fitted on every row before the train/validation/test
# split performed further down, so test rows contribute to the fitted statistics.
scaler = RobustScaler()
num_scaled_청년 = scaler.fit_transform(num_청년)
num_df_scaled_청년 = pd.DataFrame(num_scaled_청년, columns=num_청년.columns)

In [10]:
# One-hot encode the categorical block; the dense result is wrapped in a frame whose
# columns are the encoder's generated feature names.
enc = OneHotEncoder()
X_cat = enc.fit_transform(cat).toarray()

new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(data=X_cat, columns=new_feature_names)

In [11]:
# Full modelling table: scaled numerics, the label, then the one-hot block, joined column-wise on the index.
comp = pd.concat(objs=[num_df_scaled_청년, target, cat2], axis='columns')

In [12]:
# Baseline design matrix (all features) and label vector.
y = comp['target']
X = comp.drop(columns='target')
X.shape

(8444, 213)

In [14]:
# Stratified 64% / 16% / 20% train / validation / test partition: hold out 20% for
# testing, then take 20% of the remainder for validation.  Both draws use seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, shuffle=True, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, shuffle=True, test_size=0.2, random_state=0
)

In [15]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Hyper-parameters are drawn from ``trial``; the forest (seed 0) is fitted on
    the module-level ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``.
    Returns the ROC AUC of the positive-class probabilities.
    """
    # (name, low, high) — the order is preserved so the sampler receives the
    # suggestions in the same sequence as before.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    params = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **params)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler.  The run ends after 200 trials
# or as soon as 10 consecutive trials fail to improve the best value.
# The objective never calls trial.report()/should_prune(), so the Hyperband pruner
# has nothing to act on.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(sampler=sampler, pruner=HyperbandPruner(), direction=direction)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [16]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7881542833655235


In [17]:
optuna_0 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_0.fit(X_train, y_train)

In [18]:
optuna_proba_0 = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_proba_0)
print(auc_0)

0.7772348896127398


In [19]:
# Convert to plain ndarrays for positional bootstrap indexing.  np.asarray makes the
# cell safe to re-run: on an ndarray it is a no-op, whereas `.values` raises
# AttributeError the second time.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [20]:
auc_bootstrap = []

In [21]:
rs = RandomState(seed = 0)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76565589, 0.77777315])

In [22]:
t_0 = auc_bootstrap
print(t_0)

[0.77359160106768, 0.7733950868621065, 0.7735689807274702, 0.77156142553384, 0.7669709102424901, 0.7741514544878755, 0.768127375135722, 0.7716109075280493, 0.773119401465798, 0.7692697023163227, 0.7723078967607673, 0.7704798905175534, 0.7721000723850887, 0.7718455935577272, 0.7715105297683678, 0.773519498733261, 0.7694153207564242, 0.7703795127578719, 0.7694464237242128, 0.7679577225841477, 0.7719883844553023, 0.7805204940282302, 0.7693050465979009, 0.7695694218241041, 0.7777622262938835, 0.7661339576547231, 0.7758903931415129, 0.771029847538907, 0.7717438020267825, 0.7715911147303656, 0.7736156351791531, 0.7712546371697431, 0.7704502013210279, 0.7716250452406804, 0.7716943200325734, 0.7758239458921461, 0.7713012916214259, 0.7713437047593197, 0.7712461545421643, 0.7749290286825914, 0.7699992082880927, 0.7736410830618892, 0.7694195620702136, 0.769401183043793, 0.7737541847629388, 0.7761194240861382, 0.7708361608758596, 0.7715274950235251, 0.7734883957654723, 0.770274893684401, 0.7718003

In [23]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [13]:
column_to_drop = 'Cat_가구주 동거 여부'

In [14]:
# Remove one feature from the modelling table.
# A 'Cat_' feature was one-hot encoded into columns named "<feature>_<category>", so
# all of its dummies are matched by the literal prefix "<feature>_".  Matching
# literally (instead of splicing the name into a regex) keeps names containing regex
# metacharacters safe and avoids catching a longer feature that shares the prefix.
if column_to_drop.startswith('Cat_'):
    dropped_columns = [c for c in comp.columns if c.startswith(column_to_drop + '_')]
else:
    dropped_columns = [column_to_drop]

comp_1 = comp.drop(columns=dropped_columns)
X_1 = comp_1.drop(columns='target')
y_1 = comp_1['target']

print(X_1.shape)

(8444, 212)


In [26]:
# Stratified 64% / 16% / 20% train / validation / test partition of the reduced
# feature set; both draws use seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X_1, y_1, stratify=y_1, shuffle=True, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, shuffle=True, test_size=0.2, random_state=0
)

In [27]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Hyper-parameters are drawn from ``trial``; the forest (seed 0) is fitted on
    the module-level ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``.
    Returns the ROC AUC of the positive-class probabilities.
    """
    # (name, low, high) — suggestion order is preserved for the sampler.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    params = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **params)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; stop after 200 trials or once
# 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(sampler=sampler, pruner=HyperbandPruner(), direction=direction)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [28]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7877719781312567


In [29]:
optuna_1 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [30]:
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(auc_1)

0.7765633482627579


In [31]:
# ndarray conversion for positional bootstrap indexing; np.asarray is a no-op on an
# ndarray, so re-running this cell no longer raises AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [32]:
auc_bootstrap = []

In [33]:
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76711377, 0.77828391])

In [34]:
t_1 = auc_bootstrap
print(t_1)

[0.7686151262214984, 0.7738064942996743, 0.7751623009410062, 0.7694025968150562, 0.7711457767824829, 0.7740906623235613, 0.777473816956207, 0.7710355026239595, 0.7721834848896127, 0.7721071412414043, 0.7721467268367715, 0.7723404134998191, 0.7729568177705394, 0.7706198538726023, 0.7709817793159609, 0.7756274316865724, 0.7677286916395223, 0.7736580483170467, 0.7770383754071661, 0.7733116743575822, 0.7730571955302207, 0.779950744209193, 0.7775671258595729, 0.7731505044335868, 0.7703272032211363, 0.7730656781577996, 0.7717932840209916, 0.7696316277596815, 0.7722810351067679, 0.7705717856496561, 0.7739125271444083, 0.7663545059717698, 0.7682249253528773, 0.7729794381107491, 0.7676014522258414, 0.7738884930329353, 0.7714652890879479, 0.7751467494571118, 0.7742786939015562, 0.7748470299493304, 0.7734530514838942, 0.7733752940644227, 0.7725609618168657, 0.7718766965255157, 0.7732197792254795, 0.7730529542164315, 0.7714751854867896, 0.7663898502533477, 0.7730642643865363, 0.7713055329352153, 0

In [35]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [15]:
column_to_drop_1 = '부채 중 임대 보증금의 비중'

In [16]:
# Remove one more feature.  One-hot dummies of a 'Cat_' feature are matched by the
# literal prefix "<feature>_" rather than by an unescaped regex built from the name.
if column_to_drop_1.startswith('Cat_'):
    dropped_columns = [c for c in comp_1.columns if c.startswith(column_to_drop_1 + '_')]
else:
    dropped_columns = [column_to_drop_1]

comp_2 = comp_1.drop(columns=dropped_columns)
X_2 = comp_2.drop(columns='target')
y_2 = comp_2['target']

print(X_2.shape)

(8444, 211)


In [39]:
# Stratified 64% / 16% / 20% train / validation / test partition of the reduced
# feature set; both draws use seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X_2, y_2, stratify=y_2, shuffle=True, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, shuffle=True, test_size=0.2, random_state=0
)

In [40]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Hyper-parameters are drawn from ``trial``; the forest (seed 0) is fitted on
    the module-level ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``.
    Returns the ROC AUC of the positive-class probabilities.
    """
    # (name, low, high) — suggestion order is preserved for the sampler.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    params = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **params)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; stop after 200 trials or once
# 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(sampler=sampler, pruner=HyperbandPruner(), direction=direction)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [41]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7874007221812171


In [42]:
optuna_2 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [43]:
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(auc_2)

0.7767740001809628


In [44]:
# ndarray conversion for positional bootstrap indexing; np.asarray is a no-op on an
# ndarray, so re-running this cell no longer raises AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [45]:
auc_bootstrap = []

In [46]:
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76530969, 0.77789682])

In [47]:
t_2 = auc_bootstrap
print(t_2)

[0.7649322520810713, 0.7702961002533478, 0.7734417413137893, 0.7744879320484981, 0.7672819399203764, 0.7664181256786102, 0.7744115884002897, 0.773331467155266, 0.7745572068403909, 0.7760148050126674, 0.7737598398479912, 0.7691071186210641, 0.7686320914766559, 0.7713126017915309, 0.7717282505428881, 0.7677767598624683, 0.776378144227289, 0.7764035921100253, 0.771100536102063, 0.7766920014477017, 0.7663912640246109, 0.7748130994390157, 0.7696754546688382, 0.7689572588671735, 0.7726839599167572, 0.7712956365363735, 0.7734700167390518, 0.7707541621425987, 0.7767400696706479, 0.7678912753347811, 0.7700190010857764, 0.7739916983351429, 0.7720378664495116, 0.7641419539449873, 0.7705519928519725, 0.7703724439015562, 0.7767838965798045, 0.7725779270720232, 0.7663234030039812, 0.771736733170467, 0.7712221204306913, 0.7741542820304017, 0.7644869141331886, 0.773650979460731, 0.7738771828628302, 0.7749177185124865, 0.7674162481903728, 0.7707343693449149, 0.7679916530944624, 0.7690194648027506, 0.77

In [48]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [17]:
column_to_drop_2 = '소득 중 재산소득의 비중(월평균)'

In [18]:
# Remove one more feature.  One-hot dummies of a 'Cat_' feature are matched by the
# literal prefix "<feature>_" rather than by an unescaped regex built from the name.
if column_to_drop_2.startswith('Cat_'):
    dropped_columns = [c for c in comp_2.columns if c.startswith(column_to_drop_2 + '_')]
else:
    dropped_columns = [column_to_drop_2]

comp_3 = comp_2.drop(columns=dropped_columns)
X_3 = comp_3.drop(columns='target')
y_3 = comp_3['target']

print(X_3.shape)

(8444, 210)


In [53]:
# Stratified 64% / 16% / 20% train / validation / test partition of the reduced
# feature set; both draws use seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X_3, y_3, stratify=y_3, shuffle=True, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, shuffle=True, test_size=0.2, random_state=0
)

In [54]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Hyper-parameters are drawn from ``trial``; the forest (seed 0) is fitted on
    the module-level ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``.
    Returns the ROC AUC of the positive-class probabilities.
    """
    # (name, low, high) — suggestion order is preserved for the sampler.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    params = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **params)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; stop after 200 trials or once
# 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(sampler=sampler, pruner=HyperbandPruner(), direction=direction)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [55]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 74, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7848350783836223


In [56]:
optuna_3 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [57]:
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(auc_3)

0.7728437160694897


In [58]:
# ndarray conversion for positional bootstrap indexing; np.asarray is a no-op on an
# ndarray, so re-running this cell no longer raises AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [59]:
auc_bootstrap = []

In [60]:
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76331621, 0.77617919])

In [61]:
t_3 = auc_bootstrap
print(t_3)

[0.7708220231632283, 0.7651824895946435, 0.7677159676981542, 0.7651259387441187, 0.7676127623959463, 0.7690788431958018, 0.7737570123054651, 0.7699157957835686, 0.7637390291349981, 0.7714370136626855, 0.7648686323742309, 0.7705661305646037, 0.7696231451321028, 0.7724294810893955, 0.7762735251538182, 0.767525108577633, 0.7692682885450597, 0.7703993055555556, 0.7691721520991677, 0.7713677388707926, 0.7659600637893593, 0.7657084125045239, 0.7678615861382555, 0.7718752827542525, 0.7697885563698879, 0.7732664336771624, 0.7690618779406442, 0.7680411350886717, 0.7727419245385451, 0.7658229279768367, 0.7712023276330076, 0.7708927117263844, 0.7701405854144047, 0.7717961115635179, 0.7588374841657619, 0.768977051664857, 0.7718682138979371, 0.7707541621425986, 0.7676622443901555, 0.76869288364097, 0.7733993281758959, 0.7689318109844372, 0.7649364933948606, 0.7768333785740138, 0.7687437794064423, 0.7736962201411509, 0.7738192182410424, 0.7741952813970322, 0.764621222403185, 0.7712843263662685, 0.77

In [62]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [174]:
column_to_drop_3 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [176]:
# Remove one more feature.  One-hot dummies of a 'Cat_' feature are matched by the
# literal prefix "<feature>_" rather than by an unescaped regex built from the name.
if column_to_drop_3.startswith('Cat_'):
    dropped_columns = [c for c in comp_3.columns if c.startswith(column_to_drop_3 + '_')]
else:
    dropped_columns = [column_to_drop_3]

comp_4 = comp_3.drop(columns=dropped_columns)
X_4 = comp_4.drop(columns='target')
y_4 = comp_4['target']

print(X_4.shape)

(8444, 209)


In [177]:
# Stratified 64% / 16% / 20% train / validation / test partition of the reduced
# feature set; both draws use seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X_4, y_4, stratify=y_4, shuffle=True, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, shuffle=True, test_size=0.2, random_state=0
)

In [178]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Hyper-parameters are drawn from ``trial``; the forest (seed 0) is fitted on
    the module-level ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``.
    Returns the ROC AUC of the positive-class probabilities.
    """
    # (name, low, high) — suggestion order is preserved for the sampler.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    params = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **params)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; stop after 200 trials or once
# 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(sampler=sampler, pruner=HyperbandPruner(), direction=direction)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [179]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7862560163352618


In [180]:
optuna_4 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [181]:
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(auc_4)

0.772104313698878


In [182]:
# ndarray conversion for positional bootstrap indexing; np.asarray is a no-op on an
# ndarray, so re-running this cell no longer raises AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [183]:
auc_bootstrap = []

In [184]:
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76584675, 0.77749686])

In [185]:
t_4 = auc_bootstrap
print(t_4)

[0.7726146851248642, 0.7695171122873689, 0.7699666915490408, 0.7799493304379299, 0.7762028365906622, 0.7677329329533116, 0.778264115092291, 0.7692598059174811, 0.7724111020629751, 0.7747989617263844, 0.7728069580166487, 0.772611857582338, 0.7781057727108215, 0.7748625814332248, 0.7732918815598987, 0.7760063223850886, 0.7704926144589215, 0.7727914065327541, 0.7727800963626492, 0.7669793928700687, 0.7732310893955845, 0.7747961341838581, 0.7727094077994933, 0.7713733939558451, 0.7720053497104595, 0.773543532844734, 0.7703766852153455, 0.7734375, 0.7726669946615997, 0.7699596226927253, 0.7757108441910967, 0.7684256808722403, 0.7730501266739052, 0.7714087382374231, 0.7718258007600435, 0.7768786192544337, 0.7767952067499095, 0.769306460369164, 0.7667164314151285, 0.7705053384002896, 0.7746505157437567, 0.7707739549402824, 0.7674798678972132, 0.7687466069489686, 0.7762084916757148, 0.7769478940463264, 0.7658017214078899, 0.7758592901737242, 0.771323911961636, 0.7799945711183496, 0.77154587404

In [186]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [21]:
column_to_drop_4 = '자산 중 부동산 자산의 비중'

In [22]:
# Remove one more feature.  One-hot dummies of a 'Cat_' feature are matched by the
# literal prefix "<feature>_" rather than by an unescaped regex built from the name.
if column_to_drop_4.startswith('Cat_'):
    dropped_columns = [c for c in comp_4.columns if c.startswith(column_to_drop_4 + '_')]
else:
    dropped_columns = [column_to_drop_4]

comp_5 = comp_4.drop(columns=dropped_columns)
X_5 = comp_5.drop(columns='target')
y_5 = comp_5['target']

print(X_5.shape)

(8444, 208)


In [77]:
# Stratified 64% / 16% / 20% train / validation / test partition of the reduced
# feature set; both draws use seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X_5, y_5, stratify=y_5, shuffle=True, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, shuffle=True, test_size=0.2, random_state=0
)

In [78]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Hyper-parameters are drawn from ``trial``; the forest (seed 0) is fitted on
    the module-level ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``.
    Returns the ROC AUC of the positive-class probabilities.
    """
    # (name, low, high) — suggestion order is preserved for the sampler.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    params = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **params)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; stop after 200 trials or once
# 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(sampler=sampler, pruner=HyperbandPruner(), direction=direction)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [79]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 97, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7893365567778519


In [80]:
optuna_5 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [81]:
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(auc_5)

0.7761462857401376


In [82]:
# ndarray conversion for positional bootstrap indexing; np.asarray is a no-op on an
# ndarray, so re-running this cell no longer raises AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [85]:
auc_bootstrap = []

In [86]:
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76615372, 0.77767634])

In [87]:
t_5 = auc_bootstrap
print(t_5)

[0.7721764160332971, 0.7770822023163229, 0.7739521127397757, 0.7722315531125589, 0.7738969756605139, 0.7747848240137531, 0.7754705030763663, 0.7743380722946074, 0.7698832790445169, 0.7688865703040173, 0.7759851158161418, 0.7772758889793703, 0.7665057794969237, 0.7696796959826275, 0.7730784020991676, 0.7731335391784293, 0.7714949782844736, 0.7744398638255519, 0.7734459826275788, 0.7727405107672819, 0.7666301913680782, 0.7672904225479551, 0.7681386853058271, 0.7687593308903367, 0.7688554673362288, 0.7676947611292074, 0.7747975479551212, 0.7673427320846906, 0.7759017033116178, 0.7794870272348896, 0.7706905424357582, 0.7684808179515019, 0.7706693358668115, 0.7663050239775606, 0.7680425488599348, 0.7687338830076004, 0.7698988305284111, 0.7760105636988781, 0.7753559876040536, 0.7687126764386536, 0.7719643503438292, 0.7699214508686212, 0.7749474077090119, 0.7766920014477017, 0.7742023502533477, 0.7746618259138618, 0.7719374886898299, 0.7754422276511037, 0.7739252510857764, 0.7721142100977199,

In [88]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [23]:
column_to_drop_5 = 'Cat_가구주 장애 여부'

In [24]:
# Remove one more feature.  One-hot dummies of a 'Cat_' feature are matched by the
# literal prefix "<feature>_" rather than by an unescaped regex built from the name.
if column_to_drop_5.startswith('Cat_'):
    dropped_columns = [c for c in comp_5.columns if c.startswith(column_to_drop_5 + '_')]
else:
    dropped_columns = [column_to_drop_5]

comp_6 = comp_5.drop(columns=dropped_columns)
X_6 = comp_6.drop(columns='target')
y_6 = comp_6['target']

print(X_6.shape)

(8444, 206)


In [91]:
# Stratified 64% / 16% / 20% train / validation / test partition of the reduced
# feature set; both draws use seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X_6, y_6, stratify=y_6, shuffle=True, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, shuffle=True, test_size=0.2, random_state=0
)

In [92]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Hyper-parameters are drawn from ``trial``; the forest (seed 0) is fitted on
    the module-level ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``.
    Returns the ROC AUC of the positive-class probabilities.
    """
    # (name, low, high) — suggestion order is preserved for the sampler.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    params = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **params)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; stop after 200 trials or once
# 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(sampler=sampler, pruner=HyperbandPruner(), direction=direction)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [93]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 74, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7878338541229299


In [94]:
optuna_6 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [95]:
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(auc_6)

0.7769507215888527


In [96]:
# ndarray conversion for positional bootstrap indexing; np.asarray is a no-op on an
# ndarray, so re-running this cell no longer raises AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [97]:
auc_bootstrap = []

In [98]:
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76547047, 0.77775039])

In [99]:
t_6 = auc_bootstrap
print(t_6)

[0.7725185486789722, 0.7660830618892509, 0.7710538816503801, 0.7716999751176259, 0.7670628053745927, 0.7788819331342742, 0.768209373868983, 0.7694916644046326, 0.7731095050669562, 0.7685373688020267, 0.7703879953854506, 0.7760176325551936, 0.7708686776149114, 0.7684907143503439, 0.7771528908794788, 0.7773847493666304, 0.7695326637712632, 0.7737768051031487, 0.7721665196344552, 0.7683012690010859, 0.7738432523525154, 0.7685204035468693, 0.7769931347267464, 0.7781891852153456, 0.7702423769453492, 0.7751354392870068, 0.771903558179515, 0.7747721000723852, 0.7684737490951865, 0.7730586093014838, 0.7756557071118348, 0.7721368304379297, 0.7696514205573652, 0.769401183043793, 0.7711076049583785, 0.7673780763662685, 0.7718950755519364, 0.7705887509048137, 0.7761618372240318, 0.7757249819037279, 0.7667998439196525, 0.7755920874049946, 0.7690095684039089, 0.7710142960550127, 0.7711938450054288, 0.7796001289359392, 0.7702876176257691, 0.7741048000361925, 0.7710157098262758, 0.772033625135722, 0.7

In [100]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [25]:
column_to_drop_6 = '부채 중 비금융기관 대출금의 비중'

In [26]:
# Remove one more feature.  One-hot dummies of a 'Cat_' feature are matched by the
# literal prefix "<feature>_" rather than by an unescaped regex built from the name.
if column_to_drop_6.startswith('Cat_'):
    dropped_columns = [c for c in comp_6.columns if c.startswith(column_to_drop_6 + '_')]
else:
    dropped_columns = [column_to_drop_6]

comp_7 = comp_6.drop(columns=dropped_columns)
X_7 = comp_7.drop(columns='target')
y_7 = comp_7['target']

print(X_7.shape)

(8444, 205)


In [103]:
# Stratified 64% / 16% / 20% train / validation / test partition of the reduced
# feature set; both draws use seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X_7, y_7, stratify=y_7, shuffle=True, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, shuffle=True, test_size=0.2, random_state=0
)

In [104]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Hyper-parameters are drawn from ``trial``; the forest (seed 0) is fitted on
    the module-level ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``.
    Returns the ROC AUC of the positive-class probabilities.
    """
    # (name, low, high) — suggestion order is preserved for the sampler.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    params = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **params)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; stop after 200 trials or once
# 10 consecutive trials fail to improve the best value.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(sampler=sampler, pruner=HyperbandPruner(), direction=direction)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [105]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7858891800989132


In [106]:
optuna_7 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [107]:
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(auc_7)

0.7757193268186753


In [108]:
# ndarray conversion for positional bootstrap indexing; np.asarray is a no-op on an
# ndarray, so re-running this cell no longer raises AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [109]:
auc_bootstrap = []

In [110]:
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76497428, 0.77783606])

In [111]:
t_7 = auc_bootstrap
print(t_7)

[0.7752414721317409, 0.7701405854144047, 0.7714695304017374, 0.7709747104596453, 0.7779658093557726, 0.7747141354505972, 0.7682786486608759, 0.7678516897394136, 0.7736580483170467, 0.7725807546145493, 0.7727885789902279, 0.7718144905899386, 0.7733328809265291, 0.7686193675352877, 0.7637743734165761, 0.7780987038545061, 0.7750817159790084, 0.770051517824828, 0.7762636287549765, 0.7709294697792255, 0.7652065237061165, 0.7693389771082157, 0.7701829985522982, 0.7749064083423813, 0.7667574307817591, 0.7683055103148752, 0.7754874683315237, 0.7682263391241404, 0.7704403049221862, 0.7682023050126673, 0.766980806641332, 0.7695934559355773, 0.7682517870068766, 0.7714596340028954, 0.777191062703583, 0.770535027596815, 0.7689770516648571, 0.7716405967245747, 0.7701363441006153, 0.7722852764205574, 0.7620905718422005, 0.7739054582880927, 0.7664803316141875, 0.7681952361563518, 0.7728889567499095, 0.7681825122149837, 0.7720689694173, 0.7703795127578719, 0.769767349800941, 0.7722668973941368, 0.77048

In [112]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [27]:
column_to_drop_7 = 'Cat_현재 주택의 위치'

In [28]:
# Remove one more feature.  One-hot dummies of a 'Cat_' feature are matched by the
# literal prefix "<feature>_" rather than by an unescaped regex built from the name.
if column_to_drop_7.startswith('Cat_'):
    dropped_columns = [c for c in comp_7.columns if c.startswith(column_to_drop_7 + '_')]
else:
    dropped_columns = [column_to_drop_7]

comp_8 = comp_7.drop(columns=dropped_columns)
X_8 = comp_8.drop(columns='target')
y_8 = comp_8['target']

print(X_8.shape)

(8444, 201)


In [115]:
# Stratified 64% / 16% / 20% train / validation / test partition of the reduced
# feature set; both draws use seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X_8, y_8, stratify=y_8, shuffle=True, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, shuffle=True, test_size=0.2, random_state=0
)

In [117]:
def objective(trial):
    """Score one Optuna trial: validation ROC-AUC of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    RandomForestClassifier (random_state=0) on the notebook-level
    X_train / y_train, and returns roc_auc_score on X_val / y_val using
    column 1 of predict_proba. The split variables are read from the
    global namespace at call time.
    """
    # (name, low, high) bounds, suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters with the highest validation ROC-AUC (TPE sampler, seed=10).
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/should_prune(), so the
# Hyperband pruner looks like a no-op here - confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the study after 10 trials without a new best value - verify against EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [118]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7882935043467885


In [119]:
optuna_8 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [120]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(auc_8)

0.7766425194534926


In [121]:
X_train = X_train.values
y_train = y_train.values

In [122]:
auc_bootstrap = []

In [123]:
# bootstrap_auc() draws indices from the global `rs` and appends to the global `auc_bootstrap`.
rs = RandomState(seed = 8)
# bootstrap_auc() calls clf.fit() on every resample, so give it an unfitted copy
# (same hyperparameters and random_state); otherwise optuna_8 would be left as the
# model from the last bootstrap draw rather than the one scored for auc_8.
bootstrap_auc(sklearn.base.clone(optuna_8), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7655734 , 0.77764665])

In [124]:
t_8 = auc_bootstrap
print(t_8)

[0.770604302388708, 0.7690378438291712, 0.7671843897032211, 0.7702791349981903, 0.7665368824647122, 0.766203232446616, 0.7768319648027506, 0.7667673271806007, 0.770167447068404, 0.7748597538906985, 0.7695270086862106, 0.7676509342200506, 0.7726599258052841, 0.7740072498190373, 0.7674572475570033, 0.7697447294607311, 0.7690830845095911, 0.7714016693811074, 0.7740355252442996, 0.770263583514296, 0.7701547231270359, 0.7695595254252624, 0.773321570756424, 0.7658186866630474, 0.7737117716250452, 0.7744681392508144, 0.7733908455483169, 0.7671886310170104, 0.7732339169381106, 0.7705364413680782, 0.7717409744842563, 0.769921450868621, 0.7720944173000361, 0.7718880066956206, 0.7704671665761852, 0.7724393774882374, 0.7718045941910967, 0.7692909088852696, 0.7703724439015562, 0.7751905763662688, 0.7730501266739052, 0.7663460233441911, 0.7738149769272529, 0.7762056641331885, 0.7752640924719507, 0.7618615408975751, 0.7670642191458559, 0.7751071638617445, 0.775845152461093, 0.776164664766558, 0.77290

In [125]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [29]:
column_to_drop_8 = 'Cat_현재 대중교통 접근용이성'

In [30]:
# Drop this round's eliminated feature from comp_8 and rebuild the features / target.
# A name starting with 'Cat_' is used as a prefix that may cover several columns
# (presumably the dummy columns of one categorical variable - confirm); any other
# name is dropped as a single column.
if column_to_drop_8.startswith('Cat_'):
    # Literal prefix match: a regex filter ('^' + name) would interpret characters
    # such as '(' , ')' or '.' in a column name as regex syntax.
    dropped_cols_8 = [col for col in comp_8.columns if str(col).startswith(column_to_drop_8)]
    if not dropped_cols_8:
        # Mirror DataFrame.drop's KeyError for a missing single column instead of
        # silently passing every feature through to the next round.
        raise KeyError(f"no columns in comp_8 start with {column_to_drop_8!r}")
else:
    dropped_cols_8 = [column_to_drop_8]

comp_9 = comp_8.drop(columns=dropped_cols_8)
X_9 = comp_9.drop(columns='target')
y_9 = comp_9['target']

print(X_9.shape)

(8444, 197)


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [34]:
def objective(trial):
    """Score one Optuna trial: validation ROC-AUC of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    RandomForestClassifier (random_state=0) on the notebook-level
    X_train / y_train, and returns roc_auc_score on X_val / y_val using
    column 1 of predict_proba. The split variables are read from the
    global namespace at call time.
    """
    # (name, low, high) bounds, suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters with the highest validation ROC-AUC (TPE sampler, seed=10).
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/should_prune(), so the
# Hyperband pruner looks like a no-op here - confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the study after 10 trials without a new best value - verify against EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [35]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7870604042270141


In [36]:
optuna_9 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [37]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(auc_9)

0.7790939988237424


In [38]:
X_train = X_train.values
y_train = y_train.values

In [39]:
auc_bootstrap = []

In [40]:
# bootstrap_auc() draws indices from the global `rs` and appends to the global `auc_bootstrap`.
rs = RandomState(seed = 9)
# bootstrap_auc() calls clf.fit() on every resample, so give it an unfitted copy
# (same hyperparameters and random_state); otherwise optuna_9 would be left as the
# model from the last bootstrap draw rather than the one scored for auc_9.
bootstrap_auc(sklearn.base.clone(optuna_9), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76745576, 0.77865767])

In [41]:
t_9 = auc_bootstrap
print(t_9)

[0.7787207632102786, 0.7659982356134636, 0.7755440191820485, 0.7700840345638797, 0.7764149022801302, 0.7730303338762214, 0.768764985975389, 0.7728140268729643, 0.7764983147846545, 0.7684398185848716, 0.776471453130655, 0.7747650312160697, 0.7702961002533477, 0.7663050239775606, 0.7712362581433225, 0.7718314558450959, 0.7726556844914947, 0.7679449986427797, 0.7769493078175896, 0.7781552547050308, 0.772115623868983, 0.7723644476112921, 0.769637282844734, 0.7723927230365545, 0.776660898479913, 0.7715204261672095, 0.7774992648389432, 0.7765435554650743, 0.7750859572927977, 0.7745317589576548, 0.7745331727289179, 0.7754224348534202, 0.7795520607129933, 0.7749064083423814, 0.7737513572204127, 0.7736721860296779, 0.7742419358487151, 0.7741571095729279, 0.7751707835685848, 0.7774610930148389, 0.7718413522439376, 0.7700331387984074, 0.7726599258052842, 0.7733866042345278, 0.7686476429605502, 0.7780138775787188, 0.7757193268186753, 0.7766227266558089, 0.7721368304379298, 0.7689728103510677, 0.77

In [42]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [31]:
column_to_drop_9 = '현재 주택 거주 기간(총 개월)'

In [32]:
# Drop this round's eliminated feature from comp_9 and rebuild the features / target.
# A name starting with 'Cat_' is used as a prefix that may cover several columns
# (presumably the dummy columns of one categorical variable - confirm); any other
# name is dropped as a single column.
if column_to_drop_9.startswith('Cat_'):
    # Literal prefix match: a regex filter ('^' + name) would interpret characters
    # such as '(' , ')' or '.' in a column name as regex syntax.
    dropped_cols_9 = [col for col in comp_9.columns if str(col).startswith(column_to_drop_9)]
    if not dropped_cols_9:
        # Mirror DataFrame.drop's KeyError for a missing single column instead of
        # silently passing every feature through to the next round.
        raise KeyError(f"no columns in comp_9 start with {column_to_drop_9!r}")
else:
    dropped_cols_9 = [column_to_drop_9]

comp_10 = comp_9.drop(columns=dropped_cols_9)
X_10 = comp_10.drop(columns='target')
y_10 = comp_10['target']

print(X_10.shape)

(8444, 196)


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [46]:
def objective(trial):
    """Score one Optuna trial: validation ROC-AUC of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    RandomForestClassifier (random_state=0) on the notebook-level
    X_train / y_train, and returns roc_auc_score on X_val / y_val using
    column 1 of predict_proba. The split variables are read from the
    global namespace at call time.
    """
    # (name, low, high) bounds, suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters with the highest validation ROC-AUC (TPE sampler, seed=10).
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/should_prune(), so the
# Hyperband pruner looks like a no-op here - confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the study after 10 trials without a new best value - verify against EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [47]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7885896251640819


In [48]:
optuna_10 = RandomForestClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [49]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(auc_10)

0.778146772077452


In [50]:
X_train = X_train.values
y_train = y_train.values

In [51]:
auc_bootstrap = []

In [52]:
# bootstrap_auc() draws indices from the global `rs` and appends to the global `auc_bootstrap`.
rs = RandomState(seed = 10)
# bootstrap_auc() calls clf.fit() on every resample, so give it an unfitted copy
# (same hyperparameters and random_state); otherwise optuna_10 would be left as the
# model from the last bootstrap draw rather than the one scored for auc_10.
bootstrap_auc(sklearn.base.clone(optuna_10), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76618326, 0.7785287 ])

In [53]:
t_10 = auc_bootstrap
print(t_10)

[0.7661056822294607, 0.7720788658161418, 0.7728974393774882, 0.7742730388165038, 0.7754535378212088, 0.7743310034382916, 0.7714992195982627, 0.7723672751538184, 0.7810336929967427, 0.768798916485704, 0.7735477741585233, 0.774818754524068, 0.7716900787187839, 0.7742108328809265, 0.7740680419833514, 0.7765477967788635, 0.7718003528773073, 0.7730260925624323, 0.7761081139160333, 0.7697107989504162, 0.7777367784111472, 0.7687819512305467, 0.7721947950597177, 0.7735845322113646, 0.7691580143865364, 0.7715119435396307, 0.7659812703583062, 0.7733031917300036, 0.7733837766920014, 0.7692188065508505, 0.7781679786463986, 0.7741882125407166, 0.7724832043973942, 0.7757476022439378, 0.7737117716250452, 0.769484595548317, 0.7732395720231632, 0.7728267508143323, 0.7772348896127398, 0.7709860206297503, 0.7692626334600072, 0.7703328583061889, 0.7738079080709374, 0.7618855750090481, 0.772729200597177, 0.7758819105139341, 0.772327689558451, 0.7727829239051754, 0.7692682885450597, 0.7682263391241404, 0.76

In [54]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [33]:
column_to_drop_10 = 'Cat_현재 주택의 구조'

In [34]:
# Drop this round's eliminated feature from comp_10 and rebuild the features / target.
# A name starting with 'Cat_' is used as a prefix that may cover several columns
# (presumably the dummy columns of one categorical variable - confirm); any other
# name is dropped as a single column.
if column_to_drop_10.startswith('Cat_'):
    # Literal prefix match: a regex filter ('^' + name) would interpret characters
    # such as '(' , ')' or '.' in a column name as regex syntax.
    dropped_cols_10 = [col for col in comp_10.columns if str(col).startswith(column_to_drop_10)]
    if not dropped_cols_10:
        # Mirror DataFrame.drop's KeyError for a missing single column instead of
        # silently passing every feature through to the next round.
        raise KeyError(f"no columns in comp_10 start with {column_to_drop_10!r}")
else:
    dropped_cols_10 = [column_to_drop_10]

comp_11 = comp_10.drop(columns=dropped_cols_10)
X_11 = comp_11.drop(columns='target')
y_11 = comp_11['target']

print(X_11.shape)

(8444, 194)


In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [58]:
def objective(trial):
    """Score one Optuna trial: validation ROC-AUC of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    RandomForestClassifier (random_state=0) on the notebook-level
    X_train / y_train, and returns roc_auc_score on X_val / y_val using
    column 1 of predict_proba. The split variables are read from the
    global namespace at call time.
    """
    # (name, low, high) bounds, suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters with the highest validation ROC-AUC (TPE sampler, seed=10).
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/should_prune(), so the
# Hyperband pruner looks like a no-op here - confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the study after 10 trials without a new best value - verify against EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [59]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7897277014395007


In [60]:
optuna_11 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [61]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(auc_11)

0.7723828266377126


In [62]:
X_train = X_train.values
y_train = y_train.values

In [63]:
auc_bootstrap = []

In [64]:
# bootstrap_auc() draws indices from the global `rs` and appends to the global `auc_bootstrap`.
rs = RandomState(seed = 11)
# bootstrap_auc() calls clf.fit() on every resample, so give it an unfitted copy
# (same hyperparameters and random_state); otherwise optuna_11 would be left as the
# model from the last bootstrap draw rather than the one scored for auc_11.
bootstrap_auc(sklearn.base.clone(optuna_11), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76652836, 0.77891265])

In [65]:
t_11 = auc_bootstrap
print(t_11)

[0.7794120973579443, 0.7741048000361925, 0.7724789630836049, 0.7740510767281941, 0.7821859165761853, 0.7709676416033296, 0.7716618032935214, 0.7745656894679696, 0.7723262757871878, 0.7648403569489686, 0.7739959396489322, 0.7769266874773797, 0.7752315757328991, 0.7771288567680057, 0.7751000950054288, 0.7725015834238148, 0.7737980116720956, 0.7709294697792255, 0.7728521986970684, 0.7734431550850526, 0.7720491766196163, 0.7725468241042345, 0.7735802908975751, 0.7737937703583062, 0.7680241698335143, 0.7704077881831342, 0.7722004501447701, 0.7741132826637713, 0.7741118688925083, 0.7731660559174811, 0.7690661192544336, 0.7733823629207384, 0.7693997692725298, 0.7784648706116539, 0.7766990703040174, 0.7728097855591749, 0.7712489820846906, 0.7701363441006153, 0.7711429492399566, 0.7732226067680058, 0.7750096136445892, 0.7740256288454579, 0.7719049719507781, 0.7783333898841839, 0.7706127850162867, 0.7740001809627218, 0.7708686776149114, 0.7684327497285558, 0.7766863463626492, 0.7786543159609122,

In [66]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [35]:
column_to_drop_11 = 'Cat_현재 상업시설 접근용이성'

In [36]:
# Drop this round's eliminated feature from comp_11 and rebuild the features / target.
# A name starting with 'Cat_' is used as a prefix that may cover several columns
# (presumably the dummy columns of one categorical variable - confirm); any other
# name is dropped as a single column.
if column_to_drop_11.startswith('Cat_'):
    # Literal prefix match: a regex filter ('^' + name) would interpret characters
    # such as '(' , ')' or '.' in a column name as regex syntax.
    dropped_cols_11 = [col for col in comp_11.columns if str(col).startswith(column_to_drop_11)]
    if not dropped_cols_11:
        # Mirror DataFrame.drop's KeyError for a missing single column instead of
        # silently passing every feature through to the next round.
        raise KeyError(f"no columns in comp_11 start with {column_to_drop_11!r}")
else:
    dropped_cols_11 = [column_to_drop_11]

comp_12 = comp_11.drop(columns=dropped_cols_11)
X_12 = comp_12.drop(columns='target')
y_12 = comp_12['target']

print(X_12.shape)

(8444, 190)


In [69]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [70]:
def objective(trial):
    """Score one Optuna trial: validation ROC-AUC of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    RandomForestClassifier (random_state=0) on the notebook-level
    X_train / y_train, and returns roc_auc_score on X_val / y_val using
    column 1 of predict_proba. The split variables are read from the
    global namespace at call time.
    """
    # (name, low, high) bounds, suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters with the highest validation ROC-AUC (TPE sampler, seed=10).
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/should_prune(), so the
# Hyperband pruner looks like a no-op here - confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the study after 10 trials without a new best value - verify against EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [71]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7858361435346218


In [72]:
optuna_12 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [73]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(auc_12)

0.7759681505609843


In [74]:
X_train = X_train.values
y_train = y_train.values

In [75]:
auc_bootstrap = []

In [76]:
# bootstrap_auc() draws indices from the global `rs` and appends to the global `auc_bootstrap`.
rs = RandomState(seed = 12)
# bootstrap_auc() calls clf.fit() on every resample, so give it an unfitted copy
# (same hyperparameters and random_state); otherwise optuna_12 would be left as the
# model from the last bootstrap draw rather than the one scored for auc_12.
bootstrap_auc(sklearn.base.clone(optuna_12), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76585643, 0.77783168])

In [77]:
t_12 = auc_bootstrap
print(t_12)

[0.7683776126492943, 0.7723771715526601, 0.7684398185848715, 0.7692739436301124, 0.7739026307455664, 0.779587404994571, 0.7757278094462541, 0.7716943200325734, 0.767100977198697, 0.7781948403003982, 0.7661848534201955, 0.7668705324828086, 0.7691905311255881, 0.779208514296055, 0.7643978465436121, 0.7708008165942816, 0.7719897982265654, 0.7691679107853782, 0.7727320281397032, 0.7677046575280492, 0.7692993915128483, 0.7719855569127759, 0.7743748303474485, 0.772303655446978, 0.7711782935215346, 0.7692499095186393, 0.7735124298769454, 0.7735138436482085, 0.7745190350162867, 0.7703667888165038, 0.7668196367173363, 0.7746434468874412, 0.7708404021896489, 0.768705607582338, 0.7743267621245025, 0.7778937070213536, 0.7658568584871517, 0.7808216273072748, 0.7662484731270358, 0.7740581455845096, 0.7650905944625407, 0.772255587224032, 0.7766764499638074, 0.7705251311979732, 0.7688681912775969, 0.7756500520267824, 0.7728493711545422, 0.7749219598262758, 0.77566701728194, 0.7734162934310531, 0.76920

In [78]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [37]:
column_to_drop_12 = 'Cat_현재 치안 및 범죄 등 방범 상태' 

In [38]:
# Drop this round's eliminated feature from comp_12 and rebuild the features / target.
# A name starting with 'Cat_' is used as a prefix that may cover several columns
# (presumably the dummy columns of one categorical variable - confirm); any other
# name is dropped as a single column.
if column_to_drop_12.startswith('Cat_'):
    # Literal prefix match: a regex filter ('^' + name) would interpret characters
    # such as '(' , ')' or '.' in a column name as regex syntax.
    dropped_cols_12 = [col for col in comp_12.columns if str(col).startswith(column_to_drop_12)]
    if not dropped_cols_12:
        # Mirror DataFrame.drop's KeyError for a missing single column instead of
        # silently passing every feature through to the next round.
        raise KeyError(f"no columns in comp_12 start with {column_to_drop_12!r}")
else:
    dropped_cols_12 = [column_to_drop_12]

comp_13 = comp_12.drop(columns=dropped_cols_12)
X_13 = comp_13.drop(columns='target')
y_13 = comp_13['target']

print(X_13.shape)

(8444, 186)


In [81]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [82]:
def objective(trial):
    """Score one Optuna trial: validation ROC-AUC of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    RandomForestClassifier (random_state=0) on the notebook-level
    X_train / y_train, and returns roc_auc_score on X_val / y_val using
    column 1 of predict_proba. The split variables are read from the
    global namespace at call time.
    """
    # (name, low, high) bounds, suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters with the highest validation ROC-AUC (TPE sampler, seed=10).
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/should_prune(), so the
# Hyperband pruner looks like a no-op here - confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the study after 10 trials without a new best value - verify against EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [83]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7878051259839387


In [84]:
optuna_13 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [85]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(auc_13)

0.7773041644046327


In [86]:
X_train = X_train.values
y_train = y_train.values

In [87]:
auc_bootstrap = []

In [88]:
# bootstrap_auc() draws indices from the global `rs` and appends to the global `auc_bootstrap`.
rs = RandomState(seed = 13)
# bootstrap_auc() calls clf.fit() on every resample, so give it an unfitted copy
# (same hyperparameters and random_state); otherwise optuna_13 would be left as the
# model from the last bootstrap draw rather than the one scored for auc_13.
bootstrap_auc(sklearn.base.clone(optuna_13), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76498311, 0.77734573])

In [89]:
t_13 = auc_bootstrap
print(t_13)

[0.7677428293521534, 0.7705350275968149, 0.7710934672457475, 0.7732890540173725, 0.7758550488599348, 0.7743592788635542, 0.7666584667933405, 0.7754973647303655, 0.7733456048678973, 0.7720039359391966, 0.7760628732356134, 0.774865408975751, 0.7700486902823018, 0.7696231451321027, 0.7740312839305105, 0.7700529315960912, 0.7734502239413681, 0.7712249479732176, 0.7745444828990228, 0.7718993168657257, 0.7671179424538546, 0.770676404723127, 0.768598160966341, 0.7728635088671734, 0.7706043023887079, 0.7729723692544335, 0.7703272032211363, 0.7739294923995657, 0.7658625135722041, 0.7688681912775968, 0.7693785627035831, 0.7749445801664857, 0.7710609505066957, 0.7664972968693449, 0.7632837947882737, 0.7688413296235975, 0.7724832043973942, 0.7742193155085053, 0.769306460369164, 0.7659699601882013, 0.7691806347267462, 0.7685910921100254, 0.7728097855591749, 0.7748258233803836, 0.7737202542526239, 0.7716674583785741, 0.7724492738870793, 0.7729511626854868, 0.7744950009048137, 0.7711797072927977, 0.7

In [90]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [39]:
column_to_drop_13 = 'Cat_현재 교육환경'

In [40]:
# Drop this round's eliminated feature from comp_13 and rebuild the features / target.
# A name starting with 'Cat_' is used as a prefix that may cover several columns
# (presumably the dummy columns of one categorical variable - confirm); any other
# name is dropped as a single column.
if column_to_drop_13.startswith('Cat_'):
    # Literal prefix match: a regex filter ('^' + name) would interpret characters
    # such as '(' , ')' or '.' in a column name as regex syntax.
    dropped_cols_13 = [col for col in comp_13.columns if str(col).startswith(column_to_drop_13)]
    if not dropped_cols_13:
        # Mirror DataFrame.drop's KeyError for a missing single column instead of
        # silently passing every feature through to the next round.
        raise KeyError(f"no columns in comp_13 start with {column_to_drop_13!r}")
else:
    dropped_cols_13 = [column_to_drop_13]

comp_14 = comp_13.drop(columns=dropped_cols_13)
X_14 = comp_14.drop(columns='target')
y_14 = comp_14['target']

print(X_14.shape)

(8444, 182)


In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [94]:
def objective(trial):
    """Score one Optuna trial: validation ROC-AUC of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    RandomForestClassifier (random_state=0) on the notebook-level
    X_train / y_train, and returns roc_auc_score on X_val / y_val using
    column 1 of predict_proba. The split variables are read from the
    global namespace at call time.
    """
    # (name, low, high) bounds, suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters with the highest validation ROC-AUC (TPE sampler, seed=10).
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/should_prune(), so the
# Hyperband pruner looks like a no-op here - confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the study after 10 trials without a new best value - verify against EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [95]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7860328207938689


In [96]:
optuna_14 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [97]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(auc_14)

0.775088784835324


In [98]:
X_train = X_train.values
y_train = y_train.values

In [99]:
auc_bootstrap = []

In [100]:
# bootstrap_auc() draws indices from the global `rs` and appends to the global `auc_bootstrap`.
rs = RandomState(seed = 14)
# bootstrap_auc() calls clf.fit() on every resample, so give it an unfitted copy
# (same hyperparameters and random_state); otherwise optuna_14 would be left as the
# model from the last bootstrap draw rather than the one scored for auc_14.
bootstrap_auc(sklearn.base.clone(optuna_14), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7658271 , 0.77803215])

In [101]:
t_14 = auc_bootstrap
print(t_14)

[0.7700698968512486, 0.7753658840028954, 0.7694831817770539, 0.7797429198335142, 0.7704077881831344, 0.7733611563517916, 0.7685317137169743, 0.7742504184762938, 0.7718484211002534, 0.7743338309808179, 0.7725793408432863, 0.7784210437024972, 0.7730374027325371, 0.7740242150741947, 0.7757207405899385, 0.7685373688020268, 0.7762791802388709, 0.775374366630474, 0.7693573561346363, 0.7637050986246833, 0.765827169290626, 0.7696104211907348, 0.769449251266739, 0.7745034835323925, 0.778594937567861, 0.7731830211726385, 0.7736764273434673, 0.7799620543792978, 0.7763046281216068, 0.7723375859572927, 0.7694704578356859, 0.7753885043431052, 0.7671942861020631, 0.7755906736337315, 0.769543973941368, 0.7747721000723852, 0.7710991223307999, 0.7666810871335505, 0.7730798158704306, 0.7707555759138617, 0.7684440598986609, 0.7786274543069128, 0.7688795014477019, 0.7667404655266015, 0.7705124072566052, 0.7677866562613102, 0.7746830324828087, 0.7719544539449874, 0.7710157098262759, 0.7729949895946435, 0.77

In [102]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [41]:
column_to_drop_14 = 'Cat_현재 주변도로의 보행 안전'

In [42]:
# Drop this round's eliminated feature from comp_14 and rebuild the features / target.
# A name starting with 'Cat_' is used as a prefix that may cover several columns
# (presumably the dummy columns of one categorical variable - confirm); any other
# name is dropped as a single column.
if column_to_drop_14.startswith('Cat_'):
    # Literal prefix match: a regex filter ('^' + name) would interpret characters
    # such as '(' , ')' or '.' in a column name as regex syntax.
    dropped_cols_14 = [col for col in comp_14.columns if str(col).startswith(column_to_drop_14)]
    if not dropped_cols_14:
        # Mirror DataFrame.drop's KeyError for a missing single column instead of
        # silently passing every feature through to the next round.
        raise KeyError(f"no columns in comp_14 start with {column_to_drop_14!r}")
else:
    dropped_cols_14 = [column_to_drop_14]

comp_15 = comp_14.drop(columns=dropped_cols_14)
X_15 = comp_15.drop(columns='target')
y_15 = comp_15['target']

print(X_15.shape)

(8444, 178)


In [105]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [106]:
def objective(trial):
    """Score one Optuna trial: validation ROC-AUC of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    RandomForestClassifier (random_state=0) on the notebook-level
    X_train / y_train, and returns roc_auc_score on X_val / y_val using
    column 1 of predict_proba. The split variables are read from the
    global namespace at call time.
    """
    # (name, low, high) bounds, suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters with the highest validation ROC-AUC (TPE sampler, seed=10).
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/should_prune(), so the
# Hyperband pruner looks like a no-op here - confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the study after 10 trials without a new best value - verify against EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [107]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 68, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7861388939224516


In [108]:
optuna_15 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [109]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(auc_15)

0.7754832270177344


In [110]:
X_train = X_train.values
y_train = y_train.values

In [111]:
auc_bootstrap = []

In [112]:
# bootstrap_auc() draws indices from the global `rs` and appends to the global `auc_bootstrap`.
rs = RandomState(seed = 15)
# bootstrap_auc() calls clf.fit() on every resample, so give it an unfitted copy
# (same hyperparameters and random_state); otherwise optuna_15 would be left as the
# model from the last bootstrap draw rather than the one scored for auc_15.
bootstrap_auc(sklearn.base.clone(optuna_15), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76417323, 0.77701173])

In [113]:
t_15 = auc_bootstrap
print(t_15)

[0.7755935011762577, 0.7703215481360839, 0.7697206953492581, 0.774192453854506, 0.7735576705573652, 0.7692230478646399, 0.769401183043793, 0.7750590956387984, 0.7662527144408252, 0.7707767824828085, 0.7684921281216069, 0.7711217426710097, 0.7717551121968875, 0.7735237400470503, 0.7720138323380384, 0.7686561255881288, 0.7718724552117263, 0.7668860839667029, 0.7710793295331162, 0.7736566345457836, 0.7661664743937748, 0.770650956840391, 0.770830505790807, 0.7743182794969236, 0.7738955618892509, 0.7670628053745927, 0.7737160129388345, 0.771383290354687, 0.7714341861201592, 0.7720901759862469, 0.7695976972493666, 0.775726395674991, 0.7739973534201954, 0.7684030605320303, 0.7722117603148752, 0.7691198425624322, 0.7755878460912051, 0.7726867874592834, 0.7761038726022439, 0.7702522733441912, 0.7672211477560622, 0.7691594281577996, 0.7687140902099168, 0.7698239006514658, 0.7764530741042344, 0.7716957338038364, 0.7755044335866812, 0.7721594507781397, 0.7681033410242489, 0.7715812183315237, 0.766

In [114]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [43]:
column_to_drop_15 = 'Cat_현재 의료시설 접근용이성'

In [44]:
# Drop this round's eliminated feature from comp_15 and rebuild the features / target.
# A name starting with 'Cat_' is used as a prefix that may cover several columns
# (presumably the dummy columns of one categorical variable - confirm); any other
# name is dropped as a single column.
if column_to_drop_15.startswith('Cat_'):
    # Literal prefix match: a regex filter ('^' + name) would interpret characters
    # such as '(' , ')' or '.' in a column name as regex syntax.
    dropped_cols_15 = [col for col in comp_15.columns if str(col).startswith(column_to_drop_15)]
    if not dropped_cols_15:
        # Mirror DataFrame.drop's KeyError for a missing single column instead of
        # silently passing every feature through to the next round.
        raise KeyError(f"no columns in comp_15 start with {column_to_drop_15!r}")
else:
    dropped_cols_15 = [column_to_drop_15]

comp_16 = comp_15.drop(columns=dropped_cols_15)
X_16 = comp_16.drop(columns='target')
y_16 = comp_16['target']

print(X_16.shape)

(8444, 174)


In [117]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [118]:
def objective(trial):
    """Score one Optuna trial: validation ROC-AUC of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    RandomForestClassifier (random_state=0) on the notebook-level
    X_train / y_train, and returns roc_auc_score on X_val / y_val using
    column 1 of predict_proba. The split variables are read from the
    global namespace at call time.
    """
    # (name, low, high) bounds, suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters with the highest validation ROC-AUC (TPE sampler, seed=10).
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/should_prune(), so the
# Hyperband pruner looks like a no-op here - confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the study after 10 trials without a new best value - verify against EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [119]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7862891641879439


In [120]:
optuna_16 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [121]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(auc_16)

0.7758154632645675


In [122]:
X_train = X_train.values
y_train = y_train.values

In [123]:
auc_bootstrap = []

In [124]:
# bootstrap_auc() draws indices from the global `rs` and appends to the global `auc_bootstrap`.
rs = RandomState(seed = 16)
# bootstrap_auc() calls clf.fit() on every resample, so give it an unfitted copy
# (same hyperparameters and random_state); otherwise optuna_16 would be left as the
# model from the last bootstrap draw rather than the one scored for auc_16.
bootstrap_auc(sklearn.base.clone(optuna_16), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76635539, 0.77840341])

In [125]:
t_16 = auc_bootstrap
print(t_16)

[0.7710496403365907, 0.7711585007238508, 0.773002058450959, 0.7781552547050308, 0.7739916983351429, 0.7738814241766196, 0.773331467155266, 0.7706608532392327, 0.7718823516105682, 0.7686547118168657, 0.7648149090662324, 0.7685571615997104, 0.7728649226384365, 0.7722315531125588, 0.7734912233079987, 0.7692527370611654, 0.7690887395946435, 0.7750477854686935, 0.7766849325913863, 0.7744327949692363, 0.7753333672638437, 0.7699370023525154, 0.7680807206840391, 0.7679336884726746, 0.769695247466522, 0.7765322452949692, 0.7726330641512849, 0.7688427433948606, 0.7710496403365907, 0.77224569082519, 0.7743550375497648, 0.7758536350886718, 0.7716646308360477, 0.7721311753528773, 0.7744582428519725, 0.7774596792435757, 0.7757094304198335, 0.7722329668838219, 0.7766015200868621, 0.7724789630836048, 0.773803666757148, 0.7716773547774158, 0.7756698448244661, 0.7741302479189287, 0.77679096543612, 0.7715600117625769, 0.7708927117263843, 0.773143435577271, 0.7763074556641332, 0.7720421077633008, 0.771858

In [126]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [45]:
column_to_drop_16 = 'Cat_가구주 주민등록상 등재 여부'

In [46]:
# Drop this round's eliminated feature from comp_16 and rebuild the features / target.
# A name starting with 'Cat_' is used as a prefix that may cover several columns
# (presumably the dummy columns of one categorical variable - confirm); any other
# name is dropped as a single column.
if column_to_drop_16.startswith('Cat_'):
    # Literal prefix match: a regex filter ('^' + name) would interpret characters
    # such as '(' , ')' or '.' in a column name as regex syntax.
    dropped_cols_16 = [col for col in comp_16.columns if str(col).startswith(column_to_drop_16)]
    if not dropped_cols_16:
        # Mirror DataFrame.drop's KeyError for a missing single column instead of
        # silently passing every feature through to the next round.
        raise KeyError(f"no columns in comp_16 start with {column_to_drop_16!r}")
else:
    dropped_cols_16 = [column_to_drop_16]

comp_17 = comp_16.drop(columns=dropped_cols_16)
X_17 = comp_17.drop(columns='target')
y_17 = comp_17['target']

print(X_17.shape)

(8444, 172)


In [131]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [132]:
def objective(trial):
    """Score one Optuna trial: validation ROC-AUC of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    RandomForestClassifier (random_state=0) on the notebook-level
    X_train / y_train, and returns roc_auc_score on X_val / y_val using
    column 1 of predict_proba. The split variables are read from the
    global namespace at call time.
    """
    # (name, low, high) bounds, suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters with the highest validation ROC-AUC (TPE sampler, seed=10).
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/should_prune(), so the
# Hyperband pruner looks like a no-op here - confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# presumably stops the study after 10 trials without a new best value - verify against EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [133]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7871775266398242


In [134]:
optuna_17 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [135]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(auc_17)

0.7725934785559175


In [136]:
X_train = X_train.values
y_train = y_train.values

In [137]:
auc_bootstrap = []

In [138]:
# bootstrap_auc() draws indices from the global `rs` and appends to the global `auc_bootstrap`.
rs = RandomState(seed = 17)
# bootstrap_auc() calls clf.fit() on every resample, so give it an unfitted copy
# (same hyperparameters and random_state); otherwise optuna_17 would be left as the
# model from the last bootstrap draw rather than the one scored for auc_17.
bootstrap_auc(sklearn.base.clone(optuna_17), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7642523 , 0.77668006])

In [139]:
t_17 = auc_bootstrap
print(t_17)

[0.7669016354505973, 0.7724139296055013, 0.7686900560984437, 0.7669129456207021, 0.7719671778863554, 0.7672381130112196, 0.772329103329714, 0.7711471905537459, 0.7671971136445892, 0.7720463490770901, 0.7723729302388709, 0.7674021104777415, 0.7670203922366993, 0.7754577791349982, 0.7662117150741947, 0.7720237287368803, 0.7691170150199059, 0.7724676529134998, 0.7735788771263119, 0.7679435848715164, 0.7694520788092654, 0.7765195213536011, 0.7680255836047774, 0.7728734052660151, 0.7707357831161781, 0.767892689106044, 0.774250418476294, 0.76986207247557, 0.7788395199963807, 0.7708375746471228, 0.7744907595910242, 0.7701589644408252, 0.7700585866811437, 0.776092562432139, 0.7706721634093378, 0.768740951863916, 0.7661933360477743, 0.7719360749185669, 0.7710227786825914, 0.7697221091205211, 0.7705152347991313, 0.7729327836590661, 0.7748837880021715, 0.7712235342019544, 0.768268752262034, 0.7733851904632645, 0.7745162074737604, 0.7674204895041622, 0.7699115544697792, 0.7705859233622874, 0.77248

In [140]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [47]:
column_to_drop_17 = 'Cat_현재 청소/쓰레기 처리상태'

In [48]:
# Drop this round's eliminated feature from comp_17 and rebuild the features / target.
# A name starting with 'Cat_' is used as a prefix that may cover several columns
# (presumably the dummy columns of one categorical variable - confirm); any other
# name is dropped as a single column.
if column_to_drop_17.startswith('Cat_'):
    # Literal prefix match: a regex filter ('^' + name) would interpret characters
    # such as '(' , ')' or '.' in a column name as regex syntax.
    dropped_cols_17 = [col for col in comp_17.columns if str(col).startswith(column_to_drop_17)]
    if not dropped_cols_17:
        # Mirror DataFrame.drop's KeyError for a missing single column instead of
        # silently passing every feature through to the next round.
        raise KeyError(f"no columns in comp_17 start with {column_to_drop_17!r}")
else:
    dropped_cols_17 = [column_to_drop_17]

comp_18 = comp_17.drop(columns=dropped_cols_17)
X_18 = comp_18.drop(columns='target')
y_18 = comp_18['target']

print(X_18.shape)

(8444, 168)


In [143]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [144]:
def objective(trial):
    """Return the validation ROC AUC of a random forest built from one trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the module-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the order listed above.
    suggested = {name: trial.suggest_int(name, low, high)
                 for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **suggested)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective with a seeded TPE sampler for at most 200 trials; the
# callback receives 10 as its early_stopping_rounds argument.
# NOTE(review): objective never calls trial.report(), so confirm the pruner is intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [145]:
# Show the best trial's hyperparameters and keep its validation AUC.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'n_estimators': 98, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7847599432508763


In [146]:
# Rebuild the forest with the best trial's hyperparameters and fit it on the
# training split.
optuna_18 = RandomForestClassifier(random_state=0, **study.best_params)
optuna_18.fit(X_train, y_train)

In [147]:
# Test-set ROC AUC computed from column 1 of predict_proba.
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_true=y_test, y_score=optuna_proba_18)
print(auc_18)

0.7746307229460732


In [148]:
# Convert the training split to NumPy arrays: bootstrap_auc indexes X_train and
# y_train with integer position arrays. np.asarray (unlike `.values`) also
# accepts an array, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [149]:
# Reset the global list that bootstrap_auc appends each resample's AUC to.
auc_bootstrap = []

In [150]:
# Seed the global generator that bootstrap_auc draws resample indices from.
rs = RandomState(seed = 18)
# Bootstrap a clone instead of optuna_18 itself: bootstrap_auc refits the
# estimator it receives on every resample, which would otherwise leave
# optuna_18 fitted on the last bootstrap sample rather than the training split.
bootstrap_auc(sklearn.base.clone(optuna_18), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76554996, 0.77679962])

In [151]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy is made); the print shows every stored value.
t_18 = auc_bootstrap
print(t_18)

[0.7716434242671009, 0.7722767937929786, 0.7733526737242127, 0.7748696502895404, 0.7681316164495114, 0.7735336364458921, 0.7698550036192544, 0.7713818765834238, 0.7691481179876946, 0.7728719914947519, 0.7687593308903365, 0.7708757464712269, 0.7711754659790082, 0.7738234595548318, 0.7694336997828448, 0.7756458107129931, 0.777615194082519, 0.7703894091567136, 0.7731604008324284, 0.7728338196706479, 0.7728748190372783, 0.773402155718422, 0.7711104325009048, 0.7753178157799493, 0.7736113938653637, 0.77072305917481, 0.7704304085233442, 0.7695976972493667, 0.7689374660694898, 0.7699935532030402, 0.7728960256062253, 0.7720774520448788, 0.7668620498552299, 0.7721778298045602, 0.766850739685125, 0.777155718422005, 0.7760049086138257, 0.7718484211002533, 0.7773310260586319, 0.7687904338581252, 0.7716109075280493, 0.771972832971408, 0.7728804741223309, 0.7752513685305827, 0.7774257487332609, 0.7729129908613825, 0.7727914065327542, 0.7735930148389432, 0.7745048973036555, 0.7675307636626855, 0.7629

In [152]:
# Unbind the split data, the study and the stored best validation AUC.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [49]:
# Name of the feature removed from comp_18 in the following cell.
column_to_drop_18 = 'Cat_현재 공공기관 접근용이성'

In [50]:
# Build comp_19 by removing the selected feature from comp_18.
if column_to_drop_18.startswith('Cat_'):
    # 'Cat_' names act as a prefix: drop every column that starts with it.
    # The prefix is matched literally rather than as a regex pattern, so
    # characters such as '(' or '.' in a name cannot change the match.
    cols_to_drop_18 = [c for c in comp_18.columns if str(c).startswith(column_to_drop_18)]
else:
    # Any other name is dropped as a single column.
    cols_to_drop_18 = [column_to_drop_18]
comp_19 = comp_18.drop(columns=cols_to_drop_18)

# Separate the predictors from the 'target' column.
X_19 = comp_19.drop(columns='target')
y_19 = comp_19['target']

print(X_19.shape)

(8444, 164)


In [155]:
# Two-stage stratified split with a fixed seed: hold out 20% of X_19/y_19 as
# the test set, then 20% of the remaining rows as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_19, y_19,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_19,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [156]:
def objective(trial):
    """Return the validation ROC AUC of a random forest built from one trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the module-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the order listed above.
    suggested = {name: trial.suggest_int(name, low, high)
                 for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **suggested)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective with a seeded TPE sampler for at most 200 trials; the
# callback receives 10 as its early_stopping_rounds argument.
# NOTE(review): objective never calls trial.report(), so confirm the pruner is intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [157]:
# Show the best trial's hyperparameters and keep its validation AUC.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'n_estimators': 86, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.784194219898435


In [158]:
# Rebuild the forest with the best trial's hyperparameters and fit it on the
# training split.
optuna_19 = RandomForestClassifier(random_state=0, **study.best_params)
optuna_19.fit(X_train, y_train)

In [159]:
# Test-set ROC AUC computed from column 1 of predict_proba.
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_true=y_test, y_score=optuna_proba_19)
print(auc_19)

0.7762961454940283


In [160]:
# Convert the training split to NumPy arrays: bootstrap_auc indexes X_train and
# y_train with integer position arrays. np.asarray (unlike `.values`) also
# accepts an array, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [161]:
# Reset the global list that bootstrap_auc appends each resample's AUC to.
auc_bootstrap = []

In [162]:
# Seed the global generator that bootstrap_auc draws resample indices from.
rs = RandomState(seed = 19)
# Bootstrap a clone instead of optuna_19 itself: bootstrap_auc refits the
# estimator it receives on every resample, which would otherwise leave
# optuna_19 fitted on the last bootstrap sample rather than the training split.
bootstrap_auc(sklearn.base.clone(optuna_19), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76547609, 0.77784472])

In [163]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy is made); the print shows every stored value.
t_19 = auc_bootstrap
print(t_19)

[0.7747424108758595, 0.7692583921462179, 0.7663488508867173, 0.7704459600072385, 0.7701773434672456, 0.7697051438653637, 0.7750986812341657, 0.7738941481179877, 0.7751905763662685, 0.7720746245023524, 0.7629572136264928, 0.7744879320484981, 0.774216487965979, 0.7720534179334057, 0.774569930781759, 0.7704615114911328, 0.769059050398118, 0.7720081772529859, 0.7759497715345638, 0.7694761129207384, 0.776648174538545, 0.7744426913680782, 0.7757560848715165, 0.7743734165761853, 0.7692555646036916, 0.7686745046145493, 0.7754097109120521, 0.7771528908794788, 0.7706240951863915, 0.7726853736880201, 0.7700416214259862, 0.7748328922366992, 0.7699511400651466, 0.7730147823923271, 0.772057659247195, 0.7753559876040536, 0.7693220118530584, 0.7718470073289901, 0.7700826207926168, 0.7754408138798407, 0.7750718195801665, 0.7750053723308, 0.7747862377850162, 0.7712093964893232, 0.7720958310712993, 0.7676876922728918, 0.7702324805465075, 0.774547310441549, 0.7715006333695258, 0.769625972674629, 0.7726896

In [164]:
# Unbind the split data, the study and the stored best validation AUC.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [51]:
# Name of the feature removed from comp_19 in the following cell.
column_to_drop_19 = 'Cat_소득 계층'

In [52]:
# Build comp_20 by removing the selected feature from comp_19.
if column_to_drop_19.startswith('Cat_'):
    # 'Cat_' names act as a prefix: drop every column that starts with it.
    # The prefix is matched literally rather than as a regex pattern, so
    # characters such as '(' or '.' in a name cannot change the match.
    cols_to_drop_19 = [c for c in comp_19.columns if str(c).startswith(column_to_drop_19)]
else:
    # Any other name is dropped as a single column.
    cols_to_drop_19 = [column_to_drop_19]
comp_20 = comp_19.drop(columns=cols_to_drop_19)

# Separate the predictors from the 'target' column.
X_20 = comp_20.drop(columns='target')
y_20 = comp_20['target']

print(X_20.shape)

(8444, 162)


In [179]:
# Two-stage stratified split with a fixed seed: hold out 20% of X_20/y_20 as
# the test set, then 20% of the remaining rows as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_20, y_20,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_20,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [180]:
def objective(trial):
    """Return the validation ROC AUC of a random forest built from one trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the module-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the order listed above.
    suggested = {name: trial.suggest_int(name, low, high)
                 for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **suggested)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective with a seeded TPE sampler for at most 200 trials; the
# callback receives 10 as its early_stopping_rounds argument.
# NOTE(review): objective never calls trial.report(), so confirm the pruner is intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [181]:
# Show the best trial's hyperparameters and keep its validation AUC.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7877344105648836


In [169]:
# Rebuild the forest with the best trial's hyperparameters and fit it on the
# training split.
optuna_20 = RandomForestClassifier(random_state=0, **study.best_params)
optuna_20.fit(X_train, y_train)

In [170]:
# Test-set ROC AUC computed from column 1 of predict_proba.
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_true=y_test, y_score=optuna_proba_20)
print(auc_20)

0.7774695756424177


In [171]:
# Convert the training split to NumPy arrays: bootstrap_auc indexes X_train and
# y_train with integer position arrays. np.asarray (unlike `.values`) also
# accepts an array, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [172]:
# Reset the global list that bootstrap_auc appends each resample's AUC to.
auc_bootstrap = []

In [173]:
# Seed the global generator that bootstrap_auc draws resample indices from.
rs = RandomState(seed = 20)
# Bootstrap a clone instead of optuna_20 itself: bootstrap_auc refits the
# estimator it receives on every resample, which would otherwise leave
# optuna_20 fitted on the last bootstrap sample rather than the training split.
bootstrap_auc(sklearn.base.clone(optuna_20), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76759855, 0.77906671])

In [174]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy is made); the print shows every stored value.
t_20 = auc_bootstrap
print(t_20)

[0.7714638753166847, 0.7736651171733624, 0.7754040558269997, 0.7772363033840028, 0.7774526103872601, 0.7726980976293883, 0.7687904338581252, 0.7734898095367355, 0.7656702406804199, 0.7765958650018095, 0.7741571095729279, 0.7753135744661599, 0.7750209238146941, 0.773010541078538, 0.7758027393231993, 0.7744087608577633, 0.7756359143141514, 0.7670246335504887, 0.7723686889250814, 0.7738079080709374, 0.7660986133731451, 0.7747763413861745, 0.772186312432139, 0.7719982808541441, 0.7774045421643141, 0.7752980229822655, 0.7712588784835323, 0.776069942091929, 0.7759243236518277, 0.7739068720593557, 0.7752881265834238, 0.7769507215888527, 0.7754464689648933, 0.7707541621425987, 0.7729596453130655, 0.7657494118711544, 0.7736354279768368, 0.7750025447882737, 0.7711797072927977, 0.7716872511762576, 0.7771345118530584, 0.7714992195982628, 0.7754252623959464, 0.7747212043069128, 0.7708601949873326, 0.7710340888526964, 0.7732423995656895, 0.7770694783749548, 0.7785058699782845, 0.7737329781939921, 0.

In [182]:
# Unbind the split data, the study and the stored best validation AUC.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [53]:
# Name of the feature removed from comp_20 in the following cell.
column_to_drop_20 = 'Cat_현재 대기오염 정도'

In [54]:
# Build comp_21 by removing the selected feature from comp_20.
if column_to_drop_20.startswith('Cat_'):
    # 'Cat_' names act as a prefix: drop every column that starts with it.
    # The prefix is matched literally rather than as a regex pattern, so
    # characters such as '(' or '.' in a name cannot change the match.
    cols_to_drop_20 = [c for c in comp_20.columns if str(c).startswith(column_to_drop_20)]
else:
    # Any other name is dropped as a single column.
    cols_to_drop_20 = [column_to_drop_20]
comp_21 = comp_20.drop(columns=cols_to_drop_20)

# Separate the predictors from the 'target' column.
X_21 = comp_21.drop(columns='target')
y_21 = comp_21['target']

print(X_21.shape)

(8444, 158)


In [185]:
# Two-stage stratified split with a fixed seed: hold out 20% of X_21/y_21 as
# the test set, then 20% of the remaining rows as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_21, y_21,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_21,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [186]:
def objective(trial):
    """Return the validation ROC AUC of a random forest built from one trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the module-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the order listed above.
    suggested = {name: trial.suggest_int(name, low, high)
                 for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **suggested)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective with a seeded TPE sampler for at most 200 trials; the
# callback receives 10 as its early_stopping_rounds argument.
# NOTE(review): objective never calls trial.report(), so confirm the pruner is intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [187]:
# Show the best trial's hyperparameters and keep its validation AUC.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7863112627563986


In [188]:
# Rebuild the forest with the best trial's hyperparameters and fit it on the
# training split.
optuna_21 = RandomForestClassifier(random_state=0, **study.best_params)
optuna_21.fit(X_train, y_train)

In [189]:
# Test-set ROC AUC computed from column 1 of predict_proba.
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_true=y_test, y_score=optuna_proba_21)
print(auc_21)

0.7779573267281941


In [190]:
# Convert the training split to NumPy arrays: bootstrap_auc indexes X_train and
# y_train with integer position arrays. np.asarray (unlike `.values`) also
# accepts an array, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [191]:
# Reset the global list that bootstrap_auc appends each resample's AUC to.
auc_bootstrap = []

In [192]:
# Seed the global generator that bootstrap_auc draws resample indices from.
rs = RandomState(seed = 21)
# Bootstrap a clone instead of optuna_21 itself: bootstrap_auc refits the
# estimator it receives on every resample, which would otherwise leave
# optuna_21 fitted on the last bootstrap sample rather than the training split.
bootstrap_auc(sklearn.base.clone(optuna_21), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76690358, 0.77852994])

In [193]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy is made); the print shows every stored value.
t_21 = auc_bootstrap
print(t_21)

[0.772210346543612, 0.779868745475932, 0.7738757690915672, 0.7781170828809265, 0.7753574013753167, 0.7787900380021715, 0.7695920421643141, 0.7730190237061165, 0.7681839259862469, 0.7669892892689105, 0.7770143412956931, 0.772976610568223, 0.778040739232718, 0.7703413409337676, 0.7754450551936302, 0.7674077655627941, 0.7712998778501627, 0.7727857514477018, 0.7672013549583786, 0.7755185712993122, 0.7779629818132465, 0.779243858577633, 0.772657098262758, 0.7727603035649657, 0.7794997511762576, 0.777557229460731, 0.7706608532392327, 0.7741995227108215, 0.7754987785016287, 0.7723248620159247, 0.7690972222222223, 0.7789653456387984, 0.7763300760043431, 0.7683026827723489, 0.7744766218783932, 0.771336635903004, 0.7682206840390879, 0.7801995679515019, 0.7729313698878031, 0.7778300873145132, 0.7703441684762937, 0.7767810690372783, 0.7741429718602967, 0.7715091159971046, 0.7697362468331523, 0.7674445236156351, 0.7735223262757871, 0.7760614594643505, 0.7707895064241767, 0.7699511400651465, 0.77368

In [194]:
# Unbind the split data, the study and the stored best validation AUC.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [55]:
# Name of the feature removed from comp_21 in the following cell.
column_to_drop_21 = 'Cat_가구주 성별'

In [56]:
# Build comp_22 by removing the selected feature from comp_21.
if column_to_drop_21.startswith('Cat_'):
    # 'Cat_' names act as a prefix: drop every column that starts with it.
    # The prefix is matched literally rather than as a regex pattern, so
    # characters such as '(' or '.' in a name cannot change the match.
    cols_to_drop_21 = [c for c in comp_21.columns if str(c).startswith(column_to_drop_21)]
else:
    # Any other name is dropped as a single column.
    cols_to_drop_21 = [column_to_drop_21]
comp_22 = comp_21.drop(columns=cols_to_drop_21)

# Separate the predictors from the 'target' column.
X_22 = comp_22.drop(columns='target')
y_22 = comp_22['target']

print(X_22.shape)

(8444, 156)


In [197]:
# Two-stage stratified split with a fixed seed: hold out 20% of X_22/y_22 as
# the test set, then 20% of the remaining rows as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_22, y_22,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_22,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [198]:
def objective(trial):
    """Return the validation ROC AUC of a random forest built from one trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the module-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the order listed above.
    suggested = {name: trial.suggest_int(name, low, high)
                 for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **suggested)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective with a seeded TPE sampler for at most 200 trials; the
# callback receives 10 as its early_stopping_rounds argument.
# NOTE(review): objective never calls trial.report(), so confirm the pruner is intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [199]:
# Show the best trial's hyperparameters and keep its validation AUC.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7872460322020339


In [200]:
# Rebuild the forest with the best trial's hyperparameters and fit it on the
# training split.
optuna_22 = RandomForestClassifier(random_state=0, **study.best_params)
optuna_22.fit(X_train, y_train)

In [201]:
# Test-set ROC AUC computed from column 1 of predict_proba.
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_true=y_test, y_score=optuna_proba_22)
print(auc_22)

0.7782909767462902


In [202]:
# Convert the training split to NumPy arrays: bootstrap_auc indexes X_train and
# y_train with integer position arrays. np.asarray (unlike `.values`) also
# accepts an array, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [203]:
# Reset the global list that bootstrap_auc appends each resample's AUC to.
auc_bootstrap = []

In [204]:
# Seed the global generator that bootstrap_auc draws resample indices from.
rs = RandomState(seed = 22)
# Bootstrap a clone instead of optuna_22 itself: bootstrap_auc refits the
# estimator it receives on every resample, which would otherwise leave
# optuna_22 fitted on the last bootstrap sample rather than the training split.
bootstrap_auc(sklearn.base.clone(optuna_22), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76533026, 0.7776774 ])

In [205]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy is made); the print shows every stored value.
t_22 = auc_bootstrap
print(t_22)

[0.7711189151284836, 0.7671462178791171, 0.7749502352515382, 0.7688752601339124, 0.7716929062613103, 0.7744921733622874, 0.7744822769634455, 0.7751029225479551, 0.7756443969417302, 0.7756132939739414, 0.7720011083966704, 0.7679817566956206, 0.7681598918747738, 0.7730204374773797, 0.7753672977741584, 0.7719926257690917, 0.770074138165038, 0.7719007306369887, 0.7694676302931596, 0.7655953108034745, 0.7698705551031487, 0.7712970503076366, 0.7672239752985885, 0.773130711635903, 0.770617026330076, 0.772163692091929, 0.7751467494571118, 0.774770686301122, 0.7726330641512849, 0.7687466069489686, 0.7711090187296418, 0.769920037097358, 0.7669101180781759, 0.7693036328266377, 0.769968105320304, 0.7722471045964532, 0.7778315010857764, 0.7721368304379298, 0.7720562454759319, 0.7713988418385813, 0.7692456682048497, 0.7802122918928701, 0.7719714192001448, 0.7692513232899023, 0.7707060939196526, 0.7709987445711184, 0.7758522213174086, 0.7700939309627217, 0.7688469847086501, 0.7770228239232717, 0.7707

In [206]:
# Unbind the split data, the study and the stored best validation AUC.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [57]:
# Name of the feature removed from comp_22 in the following cell.
column_to_drop_22 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [58]:
# Build comp_23 by removing the selected feature from comp_22.
if column_to_drop_22.startswith('Cat_'):
    # 'Cat_' names act as a prefix: drop every column that starts with it.
    # The prefix is matched literally rather than as a regex pattern, so
    # characters such as '(' or '.' in a name cannot change the match.
    cols_to_drop_22 = [c for c in comp_22.columns if str(c).startswith(column_to_drop_22)]
else:
    # Any other name is dropped as a single column.
    cols_to_drop_22 = [column_to_drop_22]
comp_23 = comp_22.drop(columns=cols_to_drop_22)

# Separate the predictors from the 'target' column.
X_23 = comp_23.drop(columns='target')
y_23 = comp_23['target']

print(X_23.shape)

(8444, 152)


In [209]:
# Two-stage stratified split with a fixed seed: hold out 20% of X_23/y_23 as
# the test set, then 20% of the remaining rows as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_23, y_23,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_23,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [210]:
def objective(trial):
    """Return the validation ROC AUC of a random forest built from one trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the module-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the order listed above.
    suggested = {name: trial.suggest_int(name, low, high)
                 for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **suggested)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective with a seeded TPE sampler for at most 200 trials; the
# callback receives 10 as its early_stopping_rounds argument.
# NOTE(review): objective never calls trial.report(), so confirm the pruner is intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [211]:
# Show the best trial's hyperparameters and keep its validation AUC.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'n_estimators': 112, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7857831069703305


In [212]:
# Rebuild the forest with the best trial's hyperparameters and fit it on the
# training split.
optuna_23 = RandomForestClassifier(random_state=0, **study.best_params)
optuna_23.fit(X_train, y_train)

In [213]:
# Test-set ROC AUC computed from column 1 of predict_proba.
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_true=y_test, y_score=optuna_proba_23)
print(auc_23)

0.7756118802026782


In [214]:
# Convert the training split to NumPy arrays: bootstrap_auc indexes X_train and
# y_train with integer position arrays. np.asarray (unlike `.values`) also
# accepts an array, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [215]:
# Reset the global list that bootstrap_auc appends each resample's AUC to.
auc_bootstrap = []

In [216]:
# Seed the global generator that bootstrap_auc draws resample indices from.
rs = RandomState(seed = 23)
# Bootstrap a clone instead of optuna_23 itself: bootstrap_auc refits the
# estimator it receives on every resample, which would otherwise leave
# optuna_23 fitted on the last bootstrap sample rather than the training split.
bootstrap_auc(sklearn.base.clone(optuna_23), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76744756, 0.77880492])

In [217]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy is made); the print shows every stored value.
t_23 = auc_bootstrap
print(t_23)

[0.7706424742128122, 0.7730882984980094, 0.7724761355410785, 0.7702946864820847, 0.7750364752985884, 0.7725270313065509, 0.7733583288092654, 0.7735039472493666, 0.7754139522258415, 0.7755864323199421, 0.7727108215707564, 0.7725114798226566, 0.7695779044516828, 0.7733441910966341, 0.775064750723851, 0.773485568222946, 0.7747607899022801, 0.7723913092652914, 0.774663239685125, 0.7725171349077091, 0.7767004840752805, 0.7740920760948244, 0.7756401556279406, 0.7717946977922547, 0.7721707609482448, 0.7718159043612015, 0.7688526397937026, 0.7745261038726022, 0.7761491132826637, 0.7756500520267824, 0.7735209125045241, 0.7731109188382193, 0.7782768390336591, 0.7718809378393052, 0.773403569489685, 0.774038352786826, 0.7748738916033296, 0.7743140381831343, 0.7738107356134636, 0.7735407053022078, 0.773023265019906, 0.774523276330076, 0.7767838965798045, 0.7774808858125226, 0.7767810690372783, 0.7690350162866448, 0.7729808518820123, 0.7777905017191459, 0.7701985500361924, 0.7760755971769815, 0.7781

In [218]:
# Unbind the split data, the study and the stored best validation AUC.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [59]:
# Name of the feature removed from comp_23 in the following cell.
column_to_drop_23 = '소득 중 정부 보조금의 비중(월평균)'

In [60]:
# Build comp_24 by removing the selected feature from comp_23.
if column_to_drop_23.startswith('Cat_'):
    # 'Cat_' names act as a prefix: drop every column that starts with it.
    # The prefix is matched literally rather than as a regex pattern, so
    # characters such as '(' or '.' in a name cannot change the match.
    cols_to_drop_23 = [c for c in comp_23.columns if str(c).startswith(column_to_drop_23)]
else:
    # Any other name is dropped as a single column.
    cols_to_drop_23 = [column_to_drop_23]
comp_24 = comp_23.drop(columns=cols_to_drop_23)

# Separate the predictors from the 'target' column.
X_24 = comp_24.drop(columns='target')
y_24 = comp_24['target']

print(X_24.shape)

(8444, 151)


In [221]:
# Two-stage stratified split with a fixed seed: hold out 20% of X_24/y_24 as
# the test set, then 20% of the remaining rows as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_24, y_24,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_24,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [222]:
def objective(trial):
    """Return the validation ROC AUC of a random forest built from one trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the module-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the order listed above.
    suggested = {name: trial.suggest_int(name, low, high)
                 for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **suggested)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective with a seeded TPE sampler for at most 200 trials; the
# callback receives 10 as its early_stopping_rounds argument.
# NOTE(review): objective never calls trial.report(), so confirm the pruner is intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [223]:
# Show the best trial's hyperparameters and keep its validation AUC.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7858250942503945


In [224]:
# Rebuild the forest with the best trial's hyperparameters and fit it on the
# training split.
optuna_24 = RandomForestClassifier(random_state=0, **study.best_params)
optuna_24.fit(X_train, y_train)

In [225]:
# Test-set ROC AUC computed from column 1 of predict_proba.
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_true=y_test, y_score=optuna_proba_24)
print(auc_24)

0.7774483690734708


In [226]:
# Convert the training split to NumPy arrays: bootstrap_auc indexes X_train and
# y_train with integer position arrays. np.asarray (unlike `.values`) also
# accepts an array, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [227]:
# Reset the global list that bootstrap_auc appends each resample's AUC to.
auc_bootstrap = []

In [228]:
# Seed the global generator that bootstrap_auc draws resample indices from.
rs = RandomState(seed = 24)
# Bootstrap a clone instead of optuna_24 itself: bootstrap_auc refits the
# estimator it receives on every resample, which would otherwise leave
# optuna_24 fitted on the last bootstrap sample rather than the training split.
bootstrap_auc(sklearn.base.clone(optuna_24), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76690874, 0.77897924])

In [229]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy is made); the print shows every stored value.
t_24 = auc_bootstrap
print(t_24)

[0.7725242037640245, 0.7760133912414042, 0.7707640585414404, 0.772435136174448, 0.77310667752443, 0.770793747737966, 0.7774314038183134, 0.77017027461093, 0.7762466634998191, 0.7644543973941368, 0.774630722946073, 0.7761886988780311, 0.7725242037640246, 0.773673599800941, 0.7724817906261311, 0.7726768910604416, 0.768315406713717, 0.7713564287006878, 0.7745628619254434, 0.7723234482446617, 0.7693205980817952, 0.7727659586500182, 0.774169833514296, 0.774545896670286, 0.7710807433043793, 0.7756005700325733, 0.7716872511762577, 0.7752683337857402, 0.7756868100796237, 0.7718173181324648, 0.7682164427252984, 0.7771811663047412, 0.7751410943720594, 0.7700133460007238, 0.7762311120159247, 0.7681895810712993, 0.7759200823380383, 0.7752414721317408, 0.7695171122873687, 0.7692089101520085, 0.7730091273072746, 0.7735718082699964, 0.7707174040897575, 0.7745769996380747, 0.7678898615635179, 0.771267361111111, 0.7702225841476656, 0.7754973647303656, 0.7785482831161782, 0.772279621335505, 0.7739563540

In [230]:
# Unbind the split data, the study and the stored best validation AUC.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [61]:
# Name of the feature removed from comp_24 in the following cell.
column_to_drop_24 = 'Cat_현재 주택의 유형'

In [62]:
# Build comp_25 by removing the selected feature from comp_24.
if column_to_drop_24.startswith('Cat_'):
    # 'Cat_' names act as a prefix: drop every column that starts with it.
    # The prefix is matched literally rather than as a regex pattern, so
    # characters such as '(' or '.' in a name cannot change the match.
    cols_to_drop_24 = [c for c in comp_24.columns if str(c).startswith(column_to_drop_24)]
else:
    # Any other name is dropped as a single column.
    cols_to_drop_24 = [column_to_drop_24]
comp_25 = comp_24.drop(columns=cols_to_drop_24)

# Separate the predictors from the 'target' column.
X_25 = comp_25.drop(columns='target')
y_25 = comp_25['target']

print(X_25.shape)

(8444, 140)


In [235]:
# Two-stage stratified split with a fixed seed: hold out 20% of X_25/y_25 as
# the test set, then 20% of the remaining rows as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_25, y_25,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_25,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [236]:
def objective(trial):
    """Return the validation ROC AUC of a random forest built from one trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the module-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the order listed above.
    suggested = {name: trial.suggest_int(name, low, high)
                 for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **suggested)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective with a seeded TPE sampler for at most 200 trials; the
# callback receives 10 as its early_stopping_rounds argument.
# NOTE(review): objective never calls trial.report(), so confirm the pruner is intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [237]:
# Show the best trial's hyperparameters and keep its validation AUC.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7882515170667244


In [238]:
# Rebuild the forest with the best trial's hyperparameters and fit it on the
# training split.
optuna_25 = RandomForestClassifier(random_state=0, **study.best_params)
optuna_25.fit(X_train, y_train)

In [239]:
# Test-set ROC AUC computed from column 1 of predict_proba.
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_true=y_test, y_score=optuna_proba_25)
print(auc_25)

0.7756033975750996


In [240]:
# Convert the training split to NumPy arrays: bootstrap_auc indexes X_train and
# y_train with integer position arrays. np.asarray (unlike `.values`) also
# accepts an array, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [241]:
# Reset the global list that bootstrap_auc appends each resample's AUC to.
auc_bootstrap = []

In [242]:
# Seed the global generator that bootstrap_auc draws resample indices from.
rs = RandomState(seed = 25)
# Bootstrap a clone instead of optuna_25 itself: bootstrap_auc refits the
# estimator it receives on every resample, which would otherwise leave
# optuna_25 fitted on the last bootstrap sample rather than the training split.
bootstrap_auc(sklearn.base.clone(optuna_25), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76612689, 0.77852602])

In [243]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy is made); the print shows every stored value.
t_25 = auc_bootstrap
print(t_25)

[0.7762424221860296, 0.7682517870068766, 0.7746208265472312, 0.7738969756605139, 0.7740821796959827, 0.7716929062613102, 0.773273502533478, 0.7734530514838943, 0.7753404361201592, 0.7735831184401013, 0.7707584034563879, 0.7748866155446978, 0.7711076049583786, 0.7723488961273978, 0.7710241924538546, 0.7759540128483532, 0.7753036780673181, 0.771254637169743, 0.7754846407889975, 0.7701575506695622, 0.7766538296235974, 0.771338049674267, 0.7748244096091205, 0.7741175239775606, 0.7680764793702497, 0.7703894091567137, 0.7718144905899386, 0.7704784767462903, 0.7724761355410786, 0.7732848127035831, 0.7684864730365545, 0.7746491019724936, 0.7767711726384364, 0.7722753800217155, 0.7689445349258053, 0.7763442137169743, 0.7723460685848715, 0.7734092245747375, 0.7678007939739414, 0.7687607446615996, 0.7768065169200145, 0.7725454103329713, 0.7734558790264205, 0.7676071073108939, 0.779574681053203, 0.7728168544154905, 0.773342777325371, 0.7702494458016649, 0.772990748280854, 0.7708220231632283, 0.772

In [244]:
# Unbind the split data, the study and the stored best validation AUC.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [63]:
# Name of the feature removed from comp_25 in the following cell.
column_to_drop_25 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [64]:
# Build comp_26 by removing the selected feature from comp_25.
if column_to_drop_25.startswith('Cat_'):
    # 'Cat_' names act as a prefix: drop every column that starts with it.
    # The prefix is matched literally rather than as a regex pattern, so
    # characters such as '(' or '.' in a name cannot change the match.
    cols_to_drop_25 = [c for c in comp_25.columns if str(c).startswith(column_to_drop_25)]
else:
    # Any other name is dropped as a single column.
    cols_to_drop_25 = [column_to_drop_25]
comp_26 = comp_25.drop(columns=cols_to_drop_25)

# Separate the predictors from the 'target' column.
X_26 = comp_26.drop(columns='target')
y_26 = comp_26['target']

print(X_26.shape)

(8444, 136)


In [247]:
# Two-stage stratified split with a fixed seed: hold out 20% of X_26/y_26 as
# the test set, then 20% of the remaining rows as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_26, y_26,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_26,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [248]:
def objective(trial):
    """Return the validation ROC AUC of a random forest built from one trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the module-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggestions are requested in the order listed above.
    suggested = {name: trial.suggest_int(name, low, high)
                 for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **suggested)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise the objective with a seeded TPE sampler for at most 200 trials; the
# callback receives 10 as its early_stopping_rounds argument.
# NOTE(review): objective never calls trial.report(), so confirm the pruner is intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [249]:
# Show the best trial's hyperparameters and keep its validation AUC.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7877432499922656


In [250]:
# Rebuild the forest with the best trial's hyperparameters and fit it on the
# training split.
optuna_26 = RandomForestClassifier(random_state=0, **study.best_params)
optuna_26.fit(X_train, y_train)

In [251]:
# Test-set ROC AUC computed from column 1 of predict_proba.
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_true=y_test, y_score=optuna_proba_26)
print(auc_26)

0.7759610817046688


In [252]:
# Convert the training split to NumPy arrays: bootstrap_auc indexes X_train and
# y_train with integer position arrays. np.asarray (unlike `.values`) also
# accepts an array, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [253]:
# Reset the global list that bootstrap_auc appends each resample's AUC to.
auc_bootstrap = []

In [254]:
# Seed the global generator that bootstrap_auc draws resample indices from.
rs = RandomState(seed = 26)
# Bootstrap a clone instead of optuna_26 itself: bootstrap_auc refits the
# estimator it receives on every resample, which would otherwise leave
# optuna_26 fitted on the last bootstrap sample rather than the training split.
bootstrap_auc(sklearn.base.clone(optuna_26), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76612523, 0.77837446])

In [255]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy is made); the print shows every stored value.
t_26 = auc_bootstrap
print(t_26)

[0.7725920647846544, 0.773248054650742, 0.7763371448606587, 0.7718455935577271, 0.776733000814332, 0.7722301393412958, 0.7746123439196526, 0.7732791576185306, 0.7749756831342745, 0.7717395607129931, 0.7730289201049583, 0.7734276036011583, 0.7686787459283387, 0.7688073991132827, 0.7713437047593196, 0.7779248099891423, 0.7746194127759681, 0.7686363327904452, 0.7731349529496923, 0.7682758211183496, 0.7658992716250453, 0.7754337450235251, 0.7725425827904451, 0.7736933925986247, 0.771832869616359, 0.7725044109663409, 0.7688512260224393, 0.7785906962540717, 0.7602809446254071, 0.7706085437024972, 0.7750378890698516, 0.7724902732537098, 0.7773593014838942, 0.770970469145856, 0.7718201456749909, 0.7740736970684039, 0.7646169810893956, 0.7749530627940644, 0.7740058360477742, 0.7723531374411872, 0.7740468354144046, 0.7727857514477017, 0.7734261898298951, 0.768822950597177, 0.7696259726746291, 0.7735647394136809, 0.774569930781759, 0.7713733939558451, 0.7753955731994209, 0.7736849099710459, 0.774

In [256]:
# Unbind the split data, the study and the stored best validation AUC.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [65]:
# Name of the feature removed from comp_26 in the following cell.
column_to_drop_26 = 'Cat_현재 문화시설 접근용이성'

In [66]:
# Build comp_27 by removing the selected feature from comp_26.
if column_to_drop_26.startswith('Cat_'):
    # 'Cat_' names act as a prefix: drop every column that starts with it.
    # The prefix is matched literally rather than as a regex pattern, so
    # characters such as '(' or '.' in a name cannot change the match.
    cols_to_drop_26 = [c for c in comp_26.columns if str(c).startswith(column_to_drop_26)]
else:
    # Any other name is dropped as a single column.
    cols_to_drop_26 = [column_to_drop_26]
comp_27 = comp_26.drop(columns=cols_to_drop_26)

# Separate the predictors from the 'target' column.
X_27 = comp_27.drop(columns='target')
y_27 = comp_27['target']

print(X_27.shape)

(8444, 132)


In [259]:
# Two-stage stratified split with a fixed seed: hold out 20% of X_27/y_27 as
# the test set, then 20% of the remaining rows as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_27, y_27,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_27,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [260]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest built from the trial's parameters.

    Args:
        trial: Optuna trial used to draw the four integer hyper-parameters.

    Returns:
        ROC AUC on (X_val, y_val) of a forest fitted on the notebook-level
        (X_train, y_train) split.
    """
    # (name, low, high), kept in the original suggestion order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials.
# objective() never reports intermediate values, so no trial is pruned part-way.
# NOTE(review): the pruner may still affect how the sampler uses past trials - confirm before removing it.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is constructed with early_stopping_rounds=10 and runs after each trial.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [261]:
# Report the best trial's hyper-parameters and keep its objective value (validation AUC) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7848814853773772


In [262]:
# Refit a forest with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_27 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [263]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27) 
print(auc_27)

0.7749785106768006


In [264]:
# bootstrap_auc() indexes with X_train[idx] / y_train[idx], which needs positional NumPy
# indexing rather than pandas label lookup. np.asarray is used instead of `.values` so
# re-running this cell on already-converted arrays is a no-op, not an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [265]:
# Start with an empty list: bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`.
# Rebinding (not clearing in place) leaves lists saved from earlier rounds untouched.
auc_bootstrap = []

In [266]:
# Seed the global RandomState that bootstrap_auc() draws its resampling indices from.
rs = RandomState(seed = 27)
# bootstrap_auc() refits the estimator it receives on every resample. Passing an unfitted
# clone (same hyper-parameters and random_state) leaves optuna_27 fitted on the original
# training split. The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_27), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76583297, 0.7775807 ])

In [267]:
# Keep this round's bootstrap AUCs under their own name. list(...) takes a snapshot, so
# t_27 does not grow if bootstrap_auc() appends to the same global list again.
t_27 = list(auc_bootstrap)
print(t_27)

[0.770959158975751, 0.7722570009952952, 0.7679831704668839, 0.7758974619978285, 0.7777608125226203, 0.7715826321027868, 0.7705194761129207, 0.770737196887441, 0.7720831071299312, 0.7770694783749548, 0.7746745498552298, 0.7696867648389432, 0.7717706636807817, 0.767973274068042, 0.769720695349258, 0.7673611111111112, 0.7716504931234165, 0.7697235228917843, 0.7744045195439739, 0.7682970276872965, 0.7727518209373869, 0.7790105863192183, 0.7668083265472312, 0.7767923792073833, 0.7700826207926168, 0.7712023276330076, 0.7706665083242852, 0.7750110274158524, 0.7742277981360839, 0.7765760722041258, 0.775382849258053, 0.7719459713174086, 0.7729144046326457, 0.7706693358668113, 0.769815418023887, 0.7750322339847993, 0.7708729189287007, 0.7723672751538183, 0.7746010337495477, 0.7727730275063337, 0.7722909315056098, 0.7707640585414405, 0.7657451705573652, 0.7709492625769091, 0.7719558677162505, 0.7710496403365906, 0.7756500520267825, 0.7721820711183497, 0.7696980750090482, 0.7734700167390518, 0.770

In [268]:
# Remove this round's splits, study and best validation score from the namespace;
# the following cells create them again for the next feature set.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [67]:
# Feature removed in this elimination round. The 'Cat_' prefix makes the next cell
# drop every column whose name starts with this string.
column_to_drop_27 = 'Cat_기초생활보장 수급가구 여부'

In [68]:
# Build comp_28 by removing the feature named in column_to_drop_27 from comp_27.
if column_to_drop_27.startswith('Cat_'):
    # A 'Cat_' feature can span several columns that share the feature name as a prefix,
    # so remove all of them. The prefix is compared literally (not as a regex pattern), so
    # characters such as '(' or '/' in a name cannot alter which columns are matched.
    dropped_cols_27 = [c for c in comp_27.columns if str(c).startswith(column_to_drop_27)]
else:
    # Any other feature is a single column addressed by its exact name.
    dropped_cols_27 = [column_to_drop_27]

comp_28 = comp_27.drop(columns=dropped_cols_27)
# Separate the predictors from the 'target' label column.
X_28 = comp_28.drop(columns='target')
y_28 = comp_28['target']

print(X_28.shape)

(8444, 130)


In [271]:
# Hold out 20% of the rows as the test set (stratified on the label, fixed seed) ...
X_train, X_test, y_train, y_test = train_test_split(
    X_28, y_28,
    test_size=0.2, shuffle=True, stratify=y_28, random_state=0,
)
# ... then split 20% of the remainder off as the validation set used during tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [272]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest built from the trial's parameters.

    Args:
        trial: Optuna trial used to draw the four integer hyper-parameters.

    Returns:
        ROC AUC on (X_val, y_val) of a forest fitted on the notebook-level
        (X_train, y_train) split.
    """
    # (name, low, high), kept in the original suggestion order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials.
# objective() never reports intermediate values, so no trial is pruned part-way.
# NOTE(review): the pruner may still affect how the sampler uses past trials - confirm before removing it.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is constructed with early_stopping_rounds=10 and runs after each trial.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [273]:
# Report the best trial's hyper-parameters and keep its objective value (validation AUC) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7847953009604038


In [274]:
# Refit a forest with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_28 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [275]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(auc_28)

0.7724860319399205


In [276]:
# bootstrap_auc() indexes with X_train[idx] / y_train[idx], which needs positional NumPy
# indexing rather than pandas label lookup. np.asarray is used instead of `.values` so
# re-running this cell on already-converted arrays is a no-op, not an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [277]:
# Start with an empty list: bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`.
# Rebinding (not clearing in place) leaves lists saved from earlier rounds untouched.
auc_bootstrap = []

In [278]:
# Seed the global RandomState that bootstrap_auc() draws its resampling indices from.
rs = RandomState(seed = 28)
# bootstrap_auc() refits the estimator it receives on every resample. Passing an unfitted
# clone (same hyper-parameters and random_state) leaves optuna_28 fitted on the original
# training split. The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_28), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76476914, 0.77690248])

In [279]:
# Keep this round's bootstrap AUCs under their own name. list(...) takes a snapshot, so
# t_28 does not grow if bootstrap_auc() appends to the same global list again.
t_28 = list(auc_bootstrap)
print(t_28)

[0.7716971475750996, 0.7703964780130292, 0.7716716996923634, 0.769992139431777, 0.7730727470141151, 0.7664591250452408, 0.7717749049945711, 0.7638125452406805, 0.7724945145674991, 0.7694676302931596, 0.7741062138074556, 0.7692456682048499, 0.7665651578899747, 0.7722767937929786, 0.7687748823742309, 0.7721891399746652, 0.7693601836771624, 0.7714398412052116, 0.7719756605139341, 0.7674261445892147, 0.7729978171371696, 0.7759525990770901, 0.7691990137531669, 0.7712023276330076, 0.7705364413680782, 0.7664704352153455, 0.7732395720231632, 0.7761434581976113, 0.7705534066232356, 0.7691269114187478, 0.7690816707383279, 0.7755510880383641, 0.774346554922186, 0.7748102718964893, 0.7682291666666667, 0.7756415693992037, 0.777199545331162, 0.7726896150018097, 0.775361642689106, 0.7710453990228013, 0.771314015562794, 0.7700006220593558, 0.7739464576547233, 0.7693870453311618, 0.7742136604234527, 0.7765718308903365, 0.7737230817951501, 0.7704148570394499, 0.771643424267101, 0.774547310441549, 0.7699

In [280]:
# Remove this round's splits, study and best validation score from the namespace;
# the following cells create them again for the next feature set.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [69]:
# Feature removed in this elimination round. It has no 'Cat_' prefix, so the next cell
# drops this single column by its exact name.
column_to_drop_28 = '총 가구원 수'

In [70]:
# Build comp_29 by removing the feature named in column_to_drop_28 from comp_28.
if column_to_drop_28.startswith('Cat_'):
    # A 'Cat_' feature can span several columns that share the feature name as a prefix,
    # so remove all of them. The prefix is compared literally (not as a regex pattern), so
    # characters such as '(' or '/' in a name cannot alter which columns are matched.
    dropped_cols_28 = [c for c in comp_28.columns if str(c).startswith(column_to_drop_28)]
else:
    # Any other feature is a single column addressed by its exact name.
    dropped_cols_28 = [column_to_drop_28]

comp_29 = comp_28.drop(columns=dropped_cols_28)
# Separate the predictors from the 'target' label column.
X_29 = comp_29.drop(columns='target')
y_29 = comp_29['target']

print(X_29.shape)

(8444, 129)


In [283]:
# Hold out 20% of the rows as the test set (stratified on the label, fixed seed) ...
X_train, X_test, y_train, y_test = train_test_split(
    X_29, y_29,
    test_size=0.2, shuffle=True, stratify=y_29, random_state=0,
)
# ... then split 20% of the remainder off as the validation set used during tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [284]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest built from the trial's parameters.

    Args:
        trial: Optuna trial used to draw the four integer hyper-parameters.

    Returns:
        ROC AUC on (X_val, y_val) of a forest fitted on the notebook-level
        (X_train, y_train) split.
    """
    # (name, low, high), kept in the original suggestion order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials.
# objective() never reports intermediate values, so no trial is pruned part-way.
# NOTE(review): the pruner may still affect how the sampler uses past trials - confirm before removing it.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is constructed with early_stopping_rounds=10 and runs after each trial.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [285]:
# Report the best trial's hyper-parameters and keep its objective value (validation AUC) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7847621531077216


In [286]:
# Refit a forest with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_29 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [287]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(auc_29)

0.7736198764929425


In [288]:
# bootstrap_auc() indexes with X_train[idx] / y_train[idx], which needs positional NumPy
# indexing rather than pandas label lookup. np.asarray is used instead of `.values` so
# re-running this cell on already-converted arrays is a no-op, not an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [289]:
# Start with an empty list: bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`.
# Rebinding (not clearing in place) leaves lists saved from earlier rounds untouched.
auc_bootstrap = []

In [290]:
# Seed the global RandomState that bootstrap_auc() draws its resampling indices from.
rs = RandomState(seed = 29)
# bootstrap_auc() refits the estimator it receives on every resample. Passing an unfitted
# clone (same hyper-parameters and random_state) leaves optuna_29 fitted on the original
# training split. The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_29), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76485732, 0.77590891])

In [291]:
# Keep this round's bootstrap AUCs under their own name. list(...) takes a snapshot, so
# t_29 does not grow if bootstrap_auc() appends to the same global list again.
t_29 = list(auc_bootstrap)
print(t_29)

[0.7731858487151647, 0.772162278320666, 0.7689106044154903, 0.7691778071842201, 0.7689530175533841, 0.7707895064241765, 0.7720590730184582, 0.7695595254252623, 0.7665255722946073, 0.7707343693449149, 0.7664294358487151, 0.7722895177343467, 0.7757334645313065, 0.7685062658342382, 0.7738531487513571, 0.7729455076004343, 0.7670359437205936, 0.7765435554650741, 0.7687155039811799, 0.7684539562975028, 0.7714313585776331, 0.7671759070756424, 0.7723686889250814, 0.7684963694353963, 0.7678658274520449, 0.766238576728194, 0.7760091499276149, 0.770559061708288, 0.767749898208469, 0.7742942453854507, 0.7686419878754976, 0.7623012237604054, 0.7703045828809265, 0.7743960369163952, 0.7681344439920377, 0.7713931867535286, 0.7693700800760044, 0.7687126764386537, 0.7707527483713356, 0.7746109301483894, 0.7692470819761129, 0.7706453017553383, 0.7679563088128847, 0.771548701592472, 0.7726500294064423, 0.7693078741404271, 0.7703272032211365, 0.7710793295331162, 0.7718074217336229, 0.7695849733079987, 0.76

In [292]:
# Remove this round's splits, study and best validation score from the namespace;
# the following cells create them again for the next feature set.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [71]:
# Feature removed in this elimination round. The 'Cat_' prefix makes the next cell
# drop every column whose name starts with this string.
column_to_drop_29 = 'Cat_현재 주차시설 이용편의성'

In [72]:
# Build comp_30 by removing the feature named in column_to_drop_29 from comp_29.
if column_to_drop_29.startswith('Cat_'):
    # A 'Cat_' feature can span several columns that share the feature name as a prefix,
    # so remove all of them. The prefix is compared literally (not as a regex pattern), so
    # characters such as '(' or '/' in a name cannot alter which columns are matched.
    dropped_cols_29 = [c for c in comp_29.columns if str(c).startswith(column_to_drop_29)]
else:
    # Any other feature is a single column addressed by its exact name.
    dropped_cols_29 = [column_to_drop_29]

comp_30 = comp_29.drop(columns=dropped_cols_29)
# Separate the predictors from the 'target' label column.
X_30 = comp_30.drop(columns='target')
y_30 = comp_30['target']

print(X_30.shape)

(8444, 125)


In [295]:
# Hold out 20% of the rows as the test set (stratified on the label, fixed seed) ...
X_train, X_test, y_train, y_test = train_test_split(
    X_30, y_30,
    test_size=0.2, shuffle=True, stratify=y_30, random_state=0,
)
# ... then split 20% of the remainder off as the validation set used during tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [296]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest built from the trial's parameters.

    Args:
        trial: Optuna trial used to draw the four integer hyper-parameters.

    Returns:
        ROC AUC on (X_val, y_val) of a forest fitted on the notebook-level
        (X_train, y_train) split.
    """
    # (name, low, high), kept in the original suggestion order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials.
# objective() never reports intermediate values, so no trial is pruned part-way.
# NOTE(review): the pruner may still affect how the sampler uses past trials - confirm before removing it.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is constructed with early_stopping_rounds=10 and runs after each trial.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [297]:
# Report the best trial's hyper-parameters and keep its objective value (validation AUC) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 139, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 5}
0.7861477333498336


In [298]:
# Refit a forest with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_30 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [299]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(auc_30)

0.77429848669924


In [300]:
# bootstrap_auc() indexes with X_train[idx] / y_train[idx], which needs positional NumPy
# indexing rather than pandas label lookup. np.asarray is used instead of `.values` so
# re-running this cell on already-converted arrays is a no-op, not an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [301]:
# Start with an empty list: bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`.
# Rebinding (not clearing in place) leaves lists saved from earlier rounds untouched.
auc_bootstrap = []

In [302]:
# Seed the global RandomState that bootstrap_auc() draws its resampling indices from.
rs = RandomState(seed = 30)
# bootstrap_auc() refits the estimator it receives on every resample. Passing an unfitted
# clone (same hyper-parameters and random_state) leaves optuna_30 fitted on the original
# training split. The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_30), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76606143, 0.77764672])

In [303]:
# Keep this round's bootstrap AUCs under their own name. list(...) takes a snapshot, so
# t_30 does not grow if bootstrap_auc() appends to the same global list again.
t_30 = list(auc_bootstrap)
print(t_30)

[0.766781464893232, 0.7722612423090842, 0.7740341114730367, 0.7684440598986609, 0.7746095163771263, 0.773933733713355, 0.7691014635360116, 0.7742320394498734, 0.775326298407528, 0.7726669946615997, 0.7646494978284475, 0.7720039359391965, 0.7689671552660151, 0.7764559016467607, 0.7796397145313065, 0.7682192702678249, 0.7728564400108578, 0.7718894204668838, 0.7744172434853419, 0.7746999977379659, 0.7731533319761129, 0.7688992942453854, 0.7743465549221861, 0.7700932240770901, 0.7633940689467971, 0.7688540535649656, 0.7694322860115815, 0.7730713332428519, 0.7689756378935939, 0.7718752827542527, 0.7719191096634093, 0.7717777325370974, 0.7694025968150561, 0.7720152461093015, 0.7651188698878031, 0.7708262644770179, 0.7704148570394498, 0.7690774294245386, 0.7656829646217878, 0.7702338943177706, 0.7711083118440102, 0.7758239458921462, 0.7732056415128483, 0.7737244955664133, 0.771667458378574, 0.765992580528411, 0.7714695304017373, 0.7757928429243576, 0.7678092766015202, 0.7700486902823018, 0.76

In [304]:
# Remove this round's splits, study and best validation score from the namespace;
# the following cells create them again for the next feature set.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [73]:
# Feature removed in this elimination round. The 'Cat_' prefix makes the next cell
# drop every column whose name starts with this string.
column_to_drop_30 = 'Cat_이사 계획 첫 번째 이유'

In [74]:
# Build comp_31 by removing the feature named in column_to_drop_30 from comp_30.
if column_to_drop_30.startswith('Cat_'):
    # A 'Cat_' feature can span several columns that share the feature name as a prefix,
    # so remove all of them. The prefix is compared literally (not as a regex pattern), so
    # characters such as '(' or '/' in a name cannot alter which columns are matched.
    dropped_cols_30 = [c for c in comp_30.columns if str(c).startswith(column_to_drop_30)]
else:
    # Any other feature is a single column addressed by its exact name.
    dropped_cols_30 = [column_to_drop_30]

comp_31 = comp_30.drop(columns=dropped_cols_30)
# Separate the predictors from the 'target' label column.
X_31 = comp_31.drop(columns='target')
y_31 = comp_31['target']

print(X_31.shape)

(8444, 112)


In [307]:
# Hold out 20% of the rows as the test set (stratified on the label, fixed seed) ...
X_train, X_test, y_train, y_test = train_test_split(
    X_31, y_31,
    test_size=0.2, shuffle=True, stratify=y_31, random_state=0,
)
# ... then split 20% of the remainder off as the validation set used during tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [308]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest built from the trial's parameters.

    Args:
        trial: Optuna trial used to draw the four integer hyper-parameters.

    Returns:
        ROC AUC on (X_val, y_val) of a forest fitted on the notebook-level
        (X_train, y_train) split.
    """
    # (name, low, high), kept in the original suggestion order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials.
# objective() never reports intermediate values, so no trial is pruned part-way.
# NOTE(review): the pruner may still affect how the sampler uses past trials - confirm before removing it.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is constructed with early_stopping_rounds=10 and runs after each trial.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [309]:
# Report the best trial's hyper-parameters and keep its objective value (validation AUC) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7844439337219734


In [310]:
# Refit a forest with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_31 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [311]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(auc_31)

0.7721919675171914


In [312]:
# bootstrap_auc() indexes with X_train[idx] / y_train[idx], which needs positional NumPy
# indexing rather than pandas label lookup. np.asarray is used instead of `.values` so
# re-running this cell on already-converted arrays is a no-op, not an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [313]:
# Start with an empty list: bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`.
# Rebinding (not clearing in place) leaves lists saved from earlier rounds untouched.
auc_bootstrap = []

In [314]:
# Seed the global RandomState that bootstrap_auc() draws its resampling indices from.
rs = RandomState(seed = 31)
# bootstrap_auc() refits the estimator it receives on every resample. Passing an unfitted
# clone (same hyper-parameters and random_state) leaves optuna_31 fitted on the original
# training split. The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_31), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76479897, 0.77585508])

In [315]:
# Keep this round's bootstrap AUCs under their own name. list(...) takes a snapshot, so
# t_31 does not grow if bootstrap_auc() appends to the same global list again.
t_31 = list(auc_bootstrap)
print(t_31)

[0.7679068268186754, 0.7661466815960911, 0.7719671778863553, 0.7696768684401013, 0.7651231112015924, 0.7654963468150561, 0.7716109075280493, 0.7747650312160694, 0.7761208378574014, 0.7681174787368802, 0.7718215594462541, 0.7731335391784293, 0.7707994028230184, 0.7742772801302932, 0.769601938563156, 0.7701914811798769, 0.7717961115635179, 0.7705442171100254, 0.7641772982265653, 0.7689841205211725, 0.7714044969236338, 0.7692753574013753, 0.7717791463083605, 0.7722753800217155, 0.7688031577994934, 0.7742447633912414, 0.7684949556641332, 0.7677428293521534, 0.7708290920195441, 0.7678983441910967, 0.7698168317951501, 0.7718710414404633, 0.7733229845276872, 0.766840843286283, 0.7729808518820123, 0.7661869740770901, 0.7700840345638799, 0.7720194874230908, 0.7723135518458197, 0.7721693471769815, 0.7680298249185669, 0.7689487762395947, 0.7695361981994209, 0.770026069942092, 0.7708729189287007, 0.7715699081614188, 0.7721283478103511, 0.772110675669562, 0.7707866788816504, 0.7667998439196526, 0.7

In [316]:
# Remove this round's splits, study and best validation score from the namespace;
# the following cells create them again for the next feature set.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [75]:
# Feature removed in this elimination round. The 'Cat_' prefix makes the next cell
# drop every column whose name starts with this string.
column_to_drop_31 = 'Cat_이사 계획 중인 주택의 유형'

In [76]:
# Build comp_32 by removing the feature named in column_to_drop_31 from comp_31.
if column_to_drop_31.startswith('Cat_'):
    # A 'Cat_' feature can span several columns that share the feature name as a prefix,
    # so remove all of them. The prefix is compared literally (not as a regex pattern), so
    # characters such as '(' or '/' in a name cannot alter which columns are matched.
    dropped_cols_31 = [c for c in comp_31.columns if str(c).startswith(column_to_drop_31)]
else:
    # Any other feature is a single column addressed by its exact name.
    dropped_cols_31 = [column_to_drop_31]

comp_32 = comp_31.drop(columns=dropped_cols_31)
# Separate the predictors from the 'target' label column.
X_32 = comp_32.drop(columns='target')
y_32 = comp_32['target']

print(X_32.shape)

(8444, 93)


In [319]:
# Hold out 20% of the rows as the test set (stratified on the label, fixed seed) ...
X_train, X_test, y_train, y_test = train_test_split(
    X_32, y_32,
    test_size=0.2, shuffle=True, stratify=y_32, random_state=0,
)
# ... then split 20% of the remainder off as the validation set used during tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [320]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest built from the trial's parameters.

    Args:
        trial: Optuna trial used to draw the four integer hyper-parameters.

    Returns:
        ROC AUC on (X_val, y_val) of a forest fitted on the notebook-level
        (X_train, y_train) split.
    """
    # (name, low, high), kept in the original suggestion order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials.
# objective() never reports intermediate values, so no trial is pruned part-way.
# NOTE(review): the pruner may still affect how the sampler uses past trials - confirm before removing it.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is constructed with early_stopping_rounds=10 and runs after each trial.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [321]:
# Report the best trial's hyper-parameters and keep its objective value (validation AUC) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 97, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7873587349011532


In [322]:
# Refit a forest with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_32 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [323]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(auc_32)

0.7753984007419472


In [324]:
# bootstrap_auc() indexes with X_train[idx] / y_train[idx], which needs positional NumPy
# indexing rather than pandas label lookup. np.asarray is used instead of `.values` so
# re-running this cell on already-converted arrays is a no-op, not an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [325]:
# Start with an empty list: bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`.
# Rebinding (not clearing in place) leaves lists saved from earlier rounds untouched.
auc_bootstrap = []

In [326]:
# Seed the global RandomState that bootstrap_auc() draws its resampling indices from.
rs = RandomState(seed = 32)
# bootstrap_auc() refits the estimator it receives on every resample. Passing an unfitted
# clone (same hyper-parameters and random_state) leaves optuna_32 fitted on the original
# training split. The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_32), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76700143, 0.77830526])

In [327]:
# Keep this round's bootstrap AUCs under their own name. list(...) takes a snapshot, so
# t_32 does not grow if bootstrap_auc() appends to the same global list again.
t_32 = list(auc_bootstrap)
print(t_32)

[0.7720449353058271, 0.7783475275968151, 0.7740326977017735, 0.7739902845638799, 0.7729723692544336, 0.7716490793521534, 0.7732918815598986, 0.7731391942634817, 0.7769083084509592, 0.7716052524429966, 0.7749360975389069, 0.7719742467426711, 0.7752372308179514, 0.771347946073109, 0.7773225434310531, 0.7678220005428882, 0.7736552207745204, 0.7734035694896851, 0.7730176099348535, 0.7706707496380746, 0.771336635903004, 0.7738616313789359, 0.7709450212631197, 0.7747862377850162, 0.7719417300036192, 0.7716858374049946, 0.7714695304017372, 0.7703123586228737, 0.7722527596815058, 0.7749558903365907, 0.7699709328628302, 0.7757787052117264, 0.773071333242852, 0.7742348669923995, 0.7743917956026058, 0.772459170285921, 0.7722485183677162, 0.7736467381469418, 0.7755298814694174, 0.7766637260224394, 0.7780520494028231, 0.7695835595367355, 0.770995917028592, 0.77064600864097, 0.7733116743575823, 0.7795789223669923, 0.7702381356315599, 0.7745572068403908, 0.7742546597900832, 0.7713882385541079, 0.7755

In [328]:
# Remove this round's splits, study and best validation score from the namespace;
# the following cells create them again for the next feature set.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [77]:
# Feature removed in this elimination round. It has no 'Cat_' prefix, so the next cell
# drops this single column by its exact name.
column_to_drop_32 = '자산 중 기타자산의 비중'

In [78]:
# Build comp_33 by removing the feature named in column_to_drop_32 from comp_32.
if column_to_drop_32.startswith('Cat_'):
    # A 'Cat_' feature can span several columns that share the feature name as a prefix,
    # so remove all of them. The prefix is compared literally (not as a regex pattern), so
    # characters such as '(' or '/' in a name cannot alter which columns are matched.
    dropped_cols_32 = [c for c in comp_32.columns if str(c).startswith(column_to_drop_32)]
else:
    # Any other feature is a single column addressed by its exact name.
    dropped_cols_32 = [column_to_drop_32]

comp_33 = comp_32.drop(columns=dropped_cols_32)
# Separate the predictors from the 'target' label column.
X_33 = comp_33.drop(columns='target')
y_33 = comp_33['target']

print(X_33.shape)

(8444, 92)


In [79]:
# Hold out 20% of the rows as the test set (stratified on the label, fixed seed) ...
X_train, X_test, y_train, y_test = train_test_split(
    X_33, y_33,
    test_size=0.2, shuffle=True, stratify=y_33, random_state=0,
)
# ... then split 20% of the remainder off as the validation set used during tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [80]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest built from the trial's parameters.

    Args:
        trial: Optuna trial used to draw the four integer hyper-parameters.

    Returns:
        ROC AUC on (X_val, y_val) of a forest fitted on the notebook-level
        (X_train, y_train) split.
    """
    # (name, low, high), kept in the original suggestion order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials.
# objective() never reports intermediate values, so no trial is pruned part-way.
# NOTE(review): the pruner may still affect how the sampler uses past trials - confirm before removing it.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is constructed with early_stopping_rounds=10 and runs after each trial.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [81]:
# Report the best trial's hyper-parameters and keep its objective value (validation AUC) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7855842198542378


In [82]:
# Refit a forest with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_33 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [83]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(auc_33)

0.7776477108215707


In [84]:
# bootstrap_auc() indexes with X_train[idx] / y_train[idx], which needs positional NumPy
# indexing rather than pandas label lookup. np.asarray is used instead of `.values` so
# re-running this cell on already-converted arrays is a no-op, not an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [85]:
# Start with an empty list: bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`.
# Rebinding (not clearing in place) leaves lists saved from earlier rounds untouched.
auc_bootstrap = [] 

In [86]:
# Seed the global RandomState that bootstrap_auc() draws its resampling indices from.
rs = RandomState(seed = 33)
# bootstrap_auc() refits the estimator it receives on every resample. Passing an unfitted
# clone (same hyper-parameters and random_state) leaves optuna_33 fitted on the original
# training split. The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_33), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76666967, 0.77845968])

In [87]:
# Keep this round's bootstrap AUCs under their own name. list(...) takes a snapshot, so
# t_33 does not grow if bootstrap_auc() appends to the same global list again.
t_33 = list(auc_bootstrap)
print(t_33)

[0.7687904338581253, 0.7751213015743756, 0.7719657641150923, 0.7722866901918206, 0.7707626447701774, 0.7740581455845096, 0.7726189264386536, 0.776471453130655, 0.7765350728374956, 0.7745784134093376, 0.7759540128483533, 0.772221656713717, 0.7748710640608035, 0.7727504071661238, 0.7716929062613102, 0.7699468987513572, 0.7729158184039089, 0.7707442657437568, 0.7800058812884546, 0.7744342087404994, 0.7726288228374955, 0.7694464237242127, 0.7721184514115093, 0.7760374253528773, 0.7734869819942092, 0.7680185147484618, 0.7700345525696707, 0.7761816300217156, 0.7685925058812885, 0.7692555646036917, 0.7704813042888164, 0.7741995227108216, 0.7721481406080347, 0.7716929062613103, 0.7748102718964893, 0.7721848986608758, 0.7694054243575823, 0.7732494684220051, 0.7730925398117988, 0.7691198425624322, 0.7655868281758957, 0.7723531374411872, 0.7703059966521897, 0.7732310893955844, 0.7729073357763301, 0.7724704804560261, 0.7736863237423091, 0.7720922966431415, 0.7701674470684039, 0.7721255202678248, 0

In [88]:
# Remove this round's splits, study and best validation score from the namespace;
# the following cells create them again for the next feature set.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [89]:
# Feature removed in this elimination round. The 'Cat_' prefix makes the next cell
# drop every column whose name starts with this string.
column_to_drop_33 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [90]:
# Build comp_34 by removing the feature named in column_to_drop_33 from comp_33.
if column_to_drop_33.startswith('Cat_'):
    # A 'Cat_' feature can span several columns that share the feature name as a prefix,
    # so remove all of them. The prefix is compared literally (not as a regex pattern), so
    # characters such as '(' or '/' in a name cannot alter which columns are matched.
    dropped_cols_33 = [c for c in comp_33.columns if str(c).startswith(column_to_drop_33)]
else:
    # Any other feature is a single column addressed by its exact name.
    dropped_cols_33 = [column_to_drop_33]

comp_34 = comp_33.drop(columns=dropped_cols_33)
# Separate the predictors from the 'target' label column.
X_34 = comp_34.drop(columns='target')
y_34 = comp_34['target']

print(X_34.shape)

(8444, 88)


In [91]:
# Hold out 20% of the rows as the test set (stratified on the label, fixed seed) ...
X_train, X_test, y_train, y_test = train_test_split(
    X_34, y_34,
    test_size=0.2, shuffle=True, stratify=y_34, random_state=0,
)
# ... then split 20% of the remainder off as the validation set used during tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [92]:
# Display the training split for a visual check; nothing is modified.
X_train

Unnamed: 0,현재 무주택 기간(총 개월),현재 주택의 면적(㎡),총 이사 횟수,가구주 나이,소득 대비 주택 임대료의 비율,소득 중 근로/사업소득의 비중(월평균),소득 중 사적이전소득의 비중(월평균),소득 대비 생활비의 비율,소득 대비 주거관리비의 비율,자산 중 금융자산의 비중,...,Cat_현재 가장 필요한 주거지원 1순위_주택 구입자금 대출 지원,Cat_현재 가장 필요한 주거지원 1순위_주택개량 및 개보수 지원,Cat_가구주 최종 학력_고등학교 졸업,Cat_가구주 최종 학력_대학 졸업 이상,Cat_가구주 최종 학력_중학교 졸업 이하,Cat_가구주 종사상 지위_무급가족종사자,Cat_가구주 종사상 지위_무직 및 기타,Cat_가구주 종사상 지위_사업자 및 자영자,Cat_가구주 종사상 지위_상용근로자,Cat_가구주 종사상 지위_임시일용근로자
7981,-0.523810,1.949374,0.0,0.6875,-0.603015,0.0,0.000000,-0.884298,2.125654e-01,-12.923077,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4854,3.428571,0.927927,4.0,0.8125,-0.129508,0.0,0.000000,0.361157,-1.264398e-01,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
237,-0.285714,-0.269141,0.0,0.4375,0.194976,0.0,0.000000,0.047521,4.214660e-01,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3241,0.000000,-0.271585,1.0,-0.1875,-0.279893,0.0,0.000000,-0.851570,-6.406283e-01,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1972,0.023810,-0.470747,1.0,0.4375,-0.262537,0.0,0.000000,0.152066,-6.743455e-02,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8359,0.238095,-0.072423,2.0,-0.3125,-0.470428,0.0,0.000000,-0.095041,-6.896716e-02,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1565,0.285714,-0.069629,3.0,-0.3125,0.372728,-1.0,1.845805,-0.213843,-2.318063e-01,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2994,0.285714,-0.271585,1.0,-0.3125,-0.479494,0.0,0.000000,-0.266116,-8.092147e-01,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3821,1.714286,0.728415,1.0,1.3125,-0.071463,0.0,0.000000,-0.266116,-2.246022e-16,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [93]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest built from the trial's parameters.

    Args:
        trial: Optuna trial used to draw the four integer hyper-parameters.

    Returns:
        ROC AUC on (X_val, y_val) of a forest fitted on the notebook-level
        (X_train, y_train) split.
    """
    # (name, low, high), kept in the original suggestion order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials.
# objective() never reports intermediate values, so no trial is pruned part-way.
# NOTE(review): the pruner may still affect how the sampler uses past trials - confirm before removing it.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is constructed with early_stopping_rounds=10 and runs after each trial.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [94]:
# Report the best trial's hyper-parameters and keep its objective value (validation AUC) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 97, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 5}
0.7817898956505598


In [95]:
# Refit a forest with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_34 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [96]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(auc_34)

0.775042130383641


In [97]:
# bootstrap_auc() indexes with X_train[idx] / y_train[idx], which needs positional NumPy
# indexing rather than pandas label lookup. np.asarray is used instead of `.values` so
# re-running this cell on already-converted arrays is a no-op, not an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [98]:
# Start with an empty list: bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`.
# Rebinding (not clearing in place) leaves lists saved from earlier rounds untouched.
auc_bootstrap = []

In [99]:
# Seed the global RandomState that bootstrap_auc() draws its resampling indices from.
rs = RandomState(seed = 34)
# bootstrap_auc() refits the estimator it receives on every resample. Passing an unfitted
# clone (same hyper-parameters and random_state) leaves optuna_34 fitted on the original
# training split. The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_34), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76582816, 0.77733055])

In [100]:
# Keep this round's bootstrap AUCs under their own name. list(...) takes a snapshot, so
# t_34 does not grow if bootstrap_auc() appends to the same global list again.
t_34 = list(auc_bootstrap)
print(t_34)

[0.7678064490589938, 0.7734516377126313, 0.7746858600253347, 0.7723785853239233, 0.7684426461273977, 0.7662201977017735, 0.7653224529496924, 0.772764544878755, 0.7645349823561347, 0.7700274837133549, 0.7750053723307999, 0.7751432150289541, 0.7723121380745567, 0.7706113712450235, 0.7717197679153095, 0.771654734437206, 0.7700041564875135, 0.7695581116539992, 0.7670981496561708, 0.773814976927253, 0.77209158975751, 0.7686250226203402, 0.7703342720774521, 0.7781213241947158, 0.7694379410966341, 0.7751623009410061, 0.7736672378302569, 0.766379953854506, 0.7710962947882736, 0.7714327723488963, 0.7682942001447701, 0.7686236088490772, 0.7713408772167935, 0.7720760382736157, 0.7741189377488238, 0.7696047661056822, 0.7711825348353238, 0.7707767824828086, 0.7721297615816142, 0.7727475796235975, 0.7764756944444444, 0.7694761129207384, 0.771985556912776, 0.7692343580347448, 0.7721792435758233, 0.7701886536373507, 0.7708375746471228, 0.7742108328809266, 0.7759907709011943, 0.7688441571661239, 0.7711

In [101]:
# Remove this round's splits, study and best validation score from the namespace;
# the following cells create them again for the next feature set.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [102]:
# Feature removed in this elimination round. It has no 'Cat_' prefix, so the next cell
# drops this single column by its exact name.
column_to_drop_34 = '소득 대비 주거관리비의 비율'

In [103]:
# Build comp_35 by removing the feature named in column_to_drop_34 from comp_34.
if column_to_drop_34.startswith('Cat_'):
    # A 'Cat_' feature can span several columns that share the feature name as a prefix,
    # so remove all of them. The prefix is compared literally (not as a regex pattern), so
    # characters such as '(' or '/' in a name cannot alter which columns are matched.
    dropped_cols_34 = [c for c in comp_34.columns if str(c).startswith(column_to_drop_34)]
else:
    # Any other feature is a single column addressed by its exact name.
    dropped_cols_34 = [column_to_drop_34]

comp_35 = comp_34.drop(columns=dropped_cols_34)
# Separate the predictors from the 'target' label column.
X_35 = comp_35.drop(columns='target')
y_35 = comp_35['target']

print(X_35.shape)

(8444, 87)


In [104]:
# Hold out 20% of the rows as the test set (stratified on the label, fixed seed) ...
X_train, X_test, y_train, y_test = train_test_split(
    X_35, y_35,
    test_size=0.2, shuffle=True, stratify=y_35, random_state=0,
)
# ... then split 20% of the remainder off as the validation set used during tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [105]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest built from the trial's parameters.

    Args:
        trial: Optuna trial used to draw the four integer hyper-parameters.

    Returns:
        ROC AUC on (X_val, y_val) of a forest fitted on the notebook-level
        (X_train, y_train) split.
    """
    # (name, low, high), kept in the original suggestion order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials.
# objective() never reports intermediate values, so no trial is pruned part-way.
# NOTE(review): the pruner may still affect how the sampler uses past trials - confirm before removing it.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is constructed with early_stopping_rounds=10 and runs after each trial.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [106]:
# Report the best trial's hyper-parameters and keep its objective value (validation AUC) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 111, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.7829368113533605


In [107]:
# Refit a forest with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_35 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [108]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(auc_35)

0.774799668612016


In [109]:
# bootstrap_auc() indexes with X_train[idx] / y_train[idx], which needs positional NumPy
# indexing rather than pandas label lookup. np.asarray is used instead of `.values` so
# re-running this cell on already-converted arrays is a no-op, not an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [110]:
# Start with an empty list: bootstrap_auc() appends every resampled AUC to the global `auc_bootstrap`.
# Rebinding (not clearing in place) leaves lists saved from earlier rounds untouched.
auc_bootstrap = []

In [111]:
# Seed the global RandomState that bootstrap_auc() draws its resampling indices from.
rs = RandomState(seed = 35)
# bootstrap_auc() refits the estimator it receives on every resample. Passing an unfitted
# clone (same hyper-parameters and random_state) leaves optuna_35 fitted on the original
# training split. The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_35), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76620564, 0.77738478])

In [112]:
# Keep this round's bootstrap AUCs under their own name. list(...) takes a snapshot, so
# t_35 does not grow if bootstrap_auc() appends to the same global list again.
t_35 = list(auc_bootstrap)
print(t_35)

[0.7698132973669924, 0.7707767824828086, 0.7778046394317771, 0.7733866042345277, 0.7724464463445531, 0.7773253709735795, 0.7713451185305827, 0.769802694082519, 0.7657706184401014, 0.7695482152551574, 0.7728408885269635, 0.7685204035468693, 0.7713224981903728, 0.7728875429786464, 0.7713380496742671, 0.7738446661237786, 0.7723460685848715, 0.7679534812703583, 0.7672918363192183, 0.7698352108215707, 0.7674473511581615, 0.7709294697792255, 0.7689007080166486, 0.7729059220050669, 0.7697136264929425, 0.7705675443358669, 0.7717155266015201, 0.7725835821570756, 0.7686264363916033, 0.7705279587404994, 0.7725595480456026, 0.7723305171009773, 0.7694563201230545, 0.7686801596996019, 0.7729313698878031, 0.7710171235975389, 0.7729511626854868, 0.7722980003619253, 0.7749389250814331, 0.7701759296959827, 0.7662456455845096, 0.7742546597900833, 0.7728394747557003, 0.7655571389793703, 0.769613248733261, 0.7757886016105682, 0.7727815101339125, 0.772385654180239, 0.7747339282482808, 0.7690180510314876, 0.

In [113]:
# Remove this round's splits, study and best validation score from the namespace;
# the following cells create them again for the next feature set.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [114]:
# Feature removed in this elimination round. It has no 'Cat_' prefix, so the next cell
# drops this single column by its exact name.
column_to_drop_35 = '소득 대비 생활비의 비율'

In [115]:
# Build comp_36 by removing the feature named in column_to_drop_35 from comp_35.
if column_to_drop_35.startswith('Cat_'):
    # A 'Cat_' feature can span several columns that share the feature name as a prefix,
    # so remove all of them. The prefix is compared literally (not as a regex pattern), so
    # characters such as '(' or '/' in a name cannot alter which columns are matched.
    dropped_cols_35 = [c for c in comp_35.columns if str(c).startswith(column_to_drop_35)]
else:
    # Any other feature is a single column addressed by its exact name.
    dropped_cols_35 = [column_to_drop_35]

comp_36 = comp_35.drop(columns=dropped_cols_35)
# Separate the predictors from the 'target' label column.
X_36 = comp_36.drop(columns='target')
y_36 = comp_36['target']

print(X_36.shape)

(8444, 86)


In [116]:
# Hold out 20% of the rows as the test set (stratified on the label, fixed seed) ...
X_train, X_test, y_train, y_test = train_test_split(
    X_36, y_36,
    test_size=0.2, shuffle=True, stratify=y_36, random_state=0,
)
# ... then split 20% of the remainder off as the validation set used during tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [117]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Four integer hyper-parameters are sampled from ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val``.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    positive_proba = forest.predict_proba(X_val)[:, 1]  # column 1 of predict_proba
    return roc_auc_score(y_val, positive_proba)

# Hyper-parameter search: seeded TPE sampler, at most 200 trials, with the
# notebook's EarlyStoppingCallback configured for 10 rounds.
direction = "maximize"  # the objective returns an AUC
sampler = optuna.samplers.TPESampler(seed=10)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [118]:
# Report the winning trial and keep its validation AUC in optuna_auc.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7813324552835468


In [119]:
# Fit a forest with the study's best hyper-parameters on the training split.
optuna_36 = RandomForestClassifier(random_state=0, **study.best_trial.params)
optuna_36.fit(X_train, y_train)

In [120]:
# Held-out check: ROC AUC of the tuned forest on the test split.
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]  # column 1 of predict_proba
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(auc_36)

0.7751743179967426


In [121]:
# bootstrap_auc selects rows positionally (X_train[idx]), which needs NumPy
# arrays instead of pandas objects. np.asarray returns an existing ndarray
# unchanged, so this cell can be re-run safely; `.values` would raise
# AttributeError on the second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [122]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this notebook-level list.
auc_bootstrap = []

In [123]:
# Seeded generator; bootstrap_auc reads the notebook-level `rs` to draw row indices.
rs = RandomState(seed=36)
# bootstrap_auc refits the estimator it receives on every resample, so hand it
# an unfitted copy: optuna_36 stays the model fitted on the training split
# (the one whose test AUC was reported), and the percentile interval is the
# same because every fit starts from identical hyper-parameters and seed.
bootstrap_auc(sklearn.base.clone(optuna_36), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76397926, 0.77583625])

In [124]:
# Keep this iteration's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_36 = auc_bootstrap
print(t_36)

[0.7682051325551936, 0.7733031917300037, 0.7725567205030763, 0.7645081207021354, 0.7702013775787188, 0.7735209125045241, 0.7694393548678973, 0.7676276069942092, 0.7640585414404633, 0.7706099574737604, 0.7708488848172277, 0.763122624864278, 0.766980806641332, 0.7671999411871153, 0.7706566119254434, 0.7677810011762577, 0.770935124864278, 0.7725510654180239, 0.770287617625769, 0.7712270686301123, 0.7728931980636988, 0.7720845209011944, 0.7689544313246471, 0.7708700913861743, 0.7656758957654723, 0.7664237807636627, 0.7685189897756062, 0.7710863983894318, 0.7704205121245024, 0.765867461771625, 0.7713168431053203, 0.7721269340390879, 0.7652772122692726, 0.7711542594100615, 0.7674268514748462, 0.7686985387260225, 0.7670034269815418, 0.7723771715526602, 0.7710765019905899, 0.768233407980456, 0.766792775063337, 0.7744624841657619, 0.7734714305103149, 0.7697249366630474, 0.771911333921462, 0.769076722538907, 0.7632724846181687, 0.766391264024611, 0.7664287289630836, 0.7696514205573652, 0.7666047

In [125]:
# Remove the previous iteration's splits, study and best validation AUC before they are rebuilt.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [126]:
# One-hot group removed to build the next feature set (expected time until moving).
column_to_drop_36 = 'Cat_이사 예상 기간'

In [127]:
# Derive feature set 37 by removing the feature named in column_to_drop_36.
if column_to_drop_36.startswith('Cat_'):
    # One-hot group: collect every dummy column that starts with the prefix.
    # The prefix is matched literally rather than as a regular expression, so
    # characters such as '(' or '.' in a feature name cannot change the match.
    dropped_cols_36 = [
        col for col in comp_36.columns if str(col).startswith(column_to_drop_36)
    ]
else:
    # Single column carrying exactly this name.
    dropped_cols_36 = [column_to_drop_36]

comp_37 = comp_36.drop(columns=dropped_cols_36)
X_37 = comp_37.drop(columns='target')
y_37 = comp_37['target']

print(X_37.shape)

(8444, 82)


In [128]:
# Stratified 64/16/20 split: hold out 20% for testing, then take 20% of the
# remainder as the validation set scored by the Optuna objective.
X_train, X_test, y_train, y_test = train_test_split(
    X_37, y_37,
    test_size=0.2, shuffle=True, stratify=y_37, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [129]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Four integer hyper-parameters are sampled from ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val``.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    positive_proba = forest.predict_proba(X_val)[:, 1]  # column 1 of predict_proba
    return roc_auc_score(y_val, positive_proba)

# Hyper-parameter search: seeded TPE sampler, at most 200 trials, with the
# notebook's EarlyStoppingCallback configured for 10 rounds.
direction = "maximize"  # the objective returns an AUC
sampler = optuna.samplers.TPESampler(seed=10)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [130]:
# Report the winning trial and keep its validation AUC in optuna_auc.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7831555871810624


In [131]:
# Fit a forest with the study's best hyper-parameters on the training split.
optuna_37 = RandomForestClassifier(random_state=0, **study.best_trial.params)
optuna_37.fit(X_train, y_train)

In [132]:
# Held-out check: ROC AUC of the tuned forest on the test split.
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]  # column 1 of predict_proba
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(auc_37)

0.7752980229822657


In [133]:
# bootstrap_auc selects rows positionally (X_train[idx]), which needs NumPy
# arrays instead of pandas objects. np.asarray returns an existing ndarray
# unchanged, so this cell can be re-run safely; `.values` would raise
# AttributeError on the second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [134]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this notebook-level list.
auc_bootstrap = []

In [135]:
# Seeded generator; bootstrap_auc reads the notebook-level `rs` to draw row indices.
rs = RandomState(seed=37)
# bootstrap_auc refits the estimator it receives on every resample, so hand it
# an unfitted copy: optuna_37 stays the model fitted on the training split
# (the one whose test AUC was reported), and the percentile interval is the
# same because every fit starts from identical hyper-parameters and seed.
bootstrap_auc(sklearn.base.clone(optuna_37), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76546339, 0.77690628])

In [136]:
# Keep this iteration's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_37 = auc_bootstrap
print(t_37)

[0.7717579397394138, 0.7751934039087948, 0.7718597312703583, 0.768291372602244, 0.7646353601158161, 0.7683507509952949, 0.768575540626131, 0.7749615454216431, 0.7683436821389793, 0.7701193788454579, 0.7680595141150922, 0.7670232197792255, 0.7708601949873326, 0.7714158070937386, 0.7728267508143323, 0.7689367591838581, 0.7680383075461455, 0.7742956591567136, 0.7716179763843648, 0.772729200597177, 0.7718342833876222, 0.7765605207202315, 0.7720505903908795, 0.7762014228193992, 0.7724619978284474, 0.771444082519001, 0.7705830958197611, 0.7708941254976476, 0.7728776465798046, 0.7660053044697792, 0.7697701773434672, 0.7717240092290989, 0.7731844349439015, 0.7718045941910966, 0.7661268887984074, 0.7691580143865364, 0.7685882645674993, 0.770515941684763, 0.771135880383641, 0.7727744412775969, 0.7695425601701049, 0.7708700913861745, 0.7717593535106768, 0.7778611902823018, 0.7734629478827362, 0.771750870883098, 0.7730798158704306, 0.7730161961635903, 0.7738312352967789, 0.773650979460731, 0.77003

In [137]:
# Remove the previous iteration's splits, study and best validation AUC before they are rebuilt.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [138]:
# Feature removed to build the next feature set (mid-term debt burden indicator).
column_to_drop_37 = '중기부채부담지표'

In [139]:
# Derive feature set 38 by removing the feature named in column_to_drop_37.
if column_to_drop_37.startswith('Cat_'):
    # One-hot group: collect every dummy column that starts with the prefix.
    # The prefix is matched literally rather than as a regular expression, so
    # characters such as '(' or '.' in a feature name cannot change the match.
    dropped_cols_37 = [
        col for col in comp_37.columns if str(col).startswith(column_to_drop_37)
    ]
else:
    # Single column carrying exactly this name.
    dropped_cols_37 = [column_to_drop_37]

comp_38 = comp_37.drop(columns=dropped_cols_37)
X_38 = comp_38.drop(columns='target')
y_38 = comp_38['target']

print(X_38.shape)

(8444, 81)


In [140]:
# Stratified 64/16/20 split: hold out 20% for testing, then take 20% of the
# remainder as the validation set scored by the Optuna objective.
X_train, X_test, y_train, y_test = train_test_split(
    X_38, y_38,
    test_size=0.2, shuffle=True, stratify=y_38, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [141]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Four integer hyper-parameters are sampled from ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val``.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    positive_proba = forest.predict_proba(X_val)[:, 1]  # column 1 of predict_proba
    return roc_auc_score(y_val, positive_proba)

# Hyper-parameter search: seeded TPE sampler, at most 200 trials, with the
# notebook's EarlyStoppingCallback configured for 10 rounds.
direction = "maximize"  # the objective returns an AUC
sampler = optuna.samplers.TPESampler(seed=10)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [142]:
# Report the winning trial and keep its validation AUC in optuna_auc.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.781524712829103


In [143]:
# Fit a forest with the study's best hyper-parameters on the training split.
optuna_38 = RandomForestClassifier(random_state=0, **study.best_trial.params)
optuna_38.fit(X_train, y_train)

In [144]:
# Held-out check: ROC AUC of the tuned forest on the test split.
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]  # column 1 of predict_proba
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(auc_38)

0.7760784247195078


In [145]:
# bootstrap_auc selects rows positionally (X_train[idx]), which needs NumPy
# arrays instead of pandas objects. np.asarray returns an existing ndarray
# unchanged, so this cell can be re-run safely; `.values` would raise
# AttributeError on the second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [146]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this notebook-level list.
auc_bootstrap = []

In [147]:
# Seeded generator; bootstrap_auc reads the notebook-level `rs` to draw row indices.
rs = RandomState(seed=38)
# bootstrap_auc refits the estimator it receives on every resample, so hand it
# an unfitted copy: optuna_38 stays the model fitted on the training split
# (the one whose test AUC was reported), and the percentile interval is the
# same because every fit starts from identical hyper-parameters and seed.
bootstrap_auc(sklearn.base.clone(optuna_38), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76607536, 0.77822319])

In [148]:
# Keep this iteration's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_38 = auc_bootstrap
print(t_38)

[0.7709252284654361, 0.7750265788997467, 0.7692442544335867, 0.7708008165942815, 0.7719572814875135, 0.7682489594643505, 0.7748229958378574, 0.7730444715888527, 0.7684341634998191, 0.7708969530401738, 0.7768178270901194, 0.7717225954578357, 0.7727687861925443, 0.774263142417662, 0.7784719394679696, 0.7727108215707563, 0.7744031057727108, 0.7713634975570032, 0.7702720661418748, 0.7695878008505248, 0.7671688382193268, 0.7751170602605864, 0.7721750022620341, 0.7749629591929064, 0.7714384274339486, 0.7783567171100253, 0.7784266987875497, 0.7701653264115094, 0.7677866562613103, 0.7779262237604054, 0.7682390630655085, 0.7719431437748825, 0.7694534925805284, 0.7742150741947158, 0.7719756605139342, 0.7686999524972856, 0.770510993485342, 0.7712857401375317, 0.7738050805284111, 0.7731505044335867, 0.7695453877126313, 0.7693644249909519, 0.7737131853963083, 0.7699129682410424, 0.7736905650560985, 0.7747254456207021, 0.7740807659247196, 0.766781464893232, 0.7717918702497286, 0.7702904451682953, 0.

In [149]:
# Remove the previous iteration's splits, study and best validation AUC before they are rebuilt.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [150]:
# One-hot group removed to build the next feature set (tenure type of the planned dwelling).
column_to_drop_38 = 'Cat_이사 계획 중인 주택의 점유형태'

In [151]:
# Derive feature set 39 by removing the feature named in column_to_drop_38.
if column_to_drop_38.startswith('Cat_'):
    # One-hot group: collect every dummy column that starts with the prefix.
    # The prefix is matched literally rather than as a regular expression, so
    # characters such as '(' or '.' in a feature name cannot change the match.
    dropped_cols_38 = [
        col for col in comp_38.columns if str(col).startswith(column_to_drop_38)
    ]
else:
    # Single column carrying exactly this name.
    dropped_cols_38 = [column_to_drop_38]

comp_39 = comp_38.drop(columns=dropped_cols_38)
X_39 = comp_39.drop(columns='target')
y_39 = comp_39['target']

print(X_39.shape)

(8444, 57)


In [152]:
# Stratified 64/16/20 split: hold out 20% for testing, then take 20% of the
# remainder as the validation set scored by the Optuna objective.
X_train, X_test, y_train, y_test = train_test_split(
    X_39, y_39,
    test_size=0.2, shuffle=True, stratify=y_39, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [153]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Four integer hyper-parameters are sampled from ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val``.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    positive_proba = forest.predict_proba(X_val)[:, 1]  # column 1 of predict_proba
    return roc_auc_score(y_val, positive_proba)

# Hyper-parameter search: seeded TPE sampler, at most 200 trials, with the
# notebook's EarlyStoppingCallback configured for 10 rounds.
direction = "maximize"  # the objective returns an AUC
sampler = optuna.samplers.TPESampler(seed=10)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [154]:
# Report the winning trial and keep its validation AUC in optuna_auc.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7817412787999594


In [155]:
# Fit a forest with the study's best hyper-parameters on the training split.
optuna_39 = RandomForestClassifier(random_state=0, **study.best_trial.params)
optuna_39.fit(X_train, y_train)

In [156]:
# Held-out check: ROC AUC of the tuned forest on the test split.
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]  # column 1 of predict_proba
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(auc_39)

0.776918204849801


In [157]:
# bootstrap_auc selects rows positionally (X_train[idx]), which needs NumPy
# arrays instead of pandas objects. np.asarray returns an existing ndarray
# unchanged, so this cell can be re-run safely; `.values` would raise
# AttributeError on the second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [158]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this notebook-level list.
auc_bootstrap = []

In [159]:
# Seeded generator; bootstrap_auc reads the notebook-level `rs` to draw row indices.
rs = RandomState(seed=39)
# bootstrap_auc refits the estimator it receives on every resample, so hand it
# an unfitted copy: optuna_39 stays the model fitted on the training split
# (the one whose test AUC was reported), and the percentile interval is the
# same because every fit starts from identical hyper-parameters and seed.
bootstrap_auc(sklearn.base.clone(optuna_39), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76707887, 0.77875494])

In [160]:
# Keep this iteration's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_39 = auc_bootstrap
print(t_39)

[0.7727928203040173, 0.7691325665038002, 0.7718045941910967, 0.7719558677162506, 0.7728861292073833, 0.7780704284292436, 0.7694379410966341, 0.7699016580709374, 0.7754775719326819, 0.7690632917119073, 0.7733512599529497, 0.7781142553384004, 0.768408715617083, 0.7705194761129207, 0.7745147937024973, 0.7739040445168295, 0.7754280899384727, 0.7730699194715889, 0.7746377918023888, 0.7701646195258777, 0.7707216454035468, 0.7722117603148752, 0.7762848353239233, 0.775255609844372, 0.7689586726384365, 0.7789582767824828, 0.773778218874412, 0.7729751967969598, 0.7773649565689468, 0.7708326264477018, 0.7709634002895404, 0.7736410830618892, 0.7738729415490408, 0.772117037640246, 0.7743564513210279, 0.7743988644589215, 0.7766729155356495, 0.7701547231270358, 0.7753262984075281, 0.7721679334057184, 0.7755765359211002, 0.7725722719869708, 0.7742687975027145, 0.7695736631378937, 0.7739450438834601, 0.7684709215526602, 0.7735944286102063, 0.7736481519182049, 0.7771825800760044, 0.7732395720231632, 0.7

In [161]:
# Remove the previous iteration's splits, study and best validation AUC before they are rebuilt.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [162]:
# One-hot group removed to build the next feature set (householder's highest education level).
column_to_drop_39 = 'Cat_가구주 최종 학력'

In [163]:
# Derive feature set 40 by removing the feature named in column_to_drop_39.
if column_to_drop_39.startswith('Cat_'):
    # One-hot group: collect every dummy column that starts with the prefix.
    # The prefix is matched literally rather than as a regular expression, so
    # characters such as '(' or '.' in a feature name cannot change the match.
    dropped_cols_39 = [
        col for col in comp_39.columns if str(col).startswith(column_to_drop_39)
    ]
else:
    # Single column carrying exactly this name.
    dropped_cols_39 = [column_to_drop_39]

comp_40 = comp_39.drop(columns=dropped_cols_39)
X_40 = comp_40.drop(columns='target')
y_40 = comp_40['target']

print(X_40.shape)

(8444, 54)


In [164]:
# Stratified 64/16/20 split: hold out 20% for testing, then take 20% of the
# remainder as the validation set scored by the Optuna objective.
X_train, X_test, y_train, y_test = train_test_split(
    X_40, y_40,
    test_size=0.2, shuffle=True, stratify=y_40, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [165]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Four integer hyper-parameters are sampled from ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val``.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    positive_proba = forest.predict_proba(X_val)[:, 1]  # column 1 of predict_proba
    return roc_auc_score(y_val, positive_proba)

# Hyper-parameter search: seeded TPE sampler, at most 200 trials, with the
# notebook's EarlyStoppingCallback configured for 10 rounds.
direction = "maximize"  # the objective returns an AUC
sampler = optuna.samplers.TPESampler(seed=10)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [166]:
# Report the winning trial and keep its validation AUC in optuna_auc.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7827732819467954


In [167]:
# Fit a forest with the study's best hyper-parameters on the training split.
optuna_40 = RandomForestClassifier(random_state=0, **study.best_trial.params)
optuna_40.fit(X_train, y_train)

In [168]:
# Held-out check: ROC AUC of the tuned forest on the test split.
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]  # column 1 of predict_proba
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(auc_40)

0.7773960595367354


In [169]:
# bootstrap_auc selects rows positionally (X_train[idx]), which needs NumPy
# arrays instead of pandas objects. np.asarray returns an existing ndarray
# unchanged, so this cell can be re-run safely; `.values` would raise
# AttributeError on the second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [170]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this notebook-level list.
auc_bootstrap = []

In [171]:
# Seeded generator; bootstrap_auc reads the notebook-level `rs` to draw row indices.
rs = RandomState(seed=40)
# bootstrap_auc refits the estimator it receives on every resample, so hand it
# an unfitted copy: optuna_40 stays the model fitted on the training split
# (the one whose test AUC was reported), and the percentile interval is the
# same because every fit starts from identical hyper-parameters and seed.
bootstrap_auc(sklearn.base.clone(optuna_40), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76648659, 0.77686632])

In [172]:
# Keep this iteration's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_40 = auc_bootstrap
print(t_40)

[0.7712221204306914, 0.7722315531125589, 0.772541169019182, 0.7693149429967427, 0.7726853736880201, 0.7738757690915672, 0.7741189377488238, 0.7739690779949331, 0.7725383414766558, 0.7698988305284111, 0.7703385133912414, 0.7719869706840392, 0.7699532607220413, 0.7680934446254071, 0.774854098805646, 0.7729115770901195, 0.7739125271444083, 0.7723955505790807, 0.7724888594824466, 0.7726076162685487, 0.7740129049040898, 0.7759405820213536, 0.7737400470503076, 0.7724535152008687, 0.7708474710459645, 0.7731208152370612, 0.7762735251538182, 0.7702056188925082, 0.774192453854506, 0.7674105931053203, 0.7698069353963084, 0.7705406826818675, 0.7697178678067318, 0.7705251311979733, 0.7725227899927615, 0.7703059966521897, 0.7719855569127758, 0.7720689694173, 0.7733526737242129, 0.7715105297683678, 0.7720760382736156, 0.7713507736156351, 0.772011004795512, 0.777732537097358, 0.7718045941910967, 0.7752004727651104, 0.7691297389612739, 0.7741910400832429, 0.7748724778320666, 0.7696245589033659, 0.77737

In [173]:
# Remove the previous iteration's splits, study and best validation AUC before they are rebuilt.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [14]:
# Feature set 41, selected from `comp` by explicit column name: 9 columns
# without the 'Cat_' prefix, the target, and 44 one-hot 'Cat_' columns.
comp_41 = comp[[
 # Housing history and householder age
 '현재 무주택 기간(총 개월)', 
 '현재 주택의 면적(㎡)', 
 '총 이사 횟수', 
 '가구주 나이', 

 # Income, asset and debt ratios
 '소득 대비 주택 임대료의 비율', 
 '소득 중 근로/사업소득의 비중(월평균)', 
 '소득 중 사적이전소득의 비중(월평균)', 
 '자산 중 금융자산의 비중', 
 '장기부채부담지표', 
 'target',

 # One-hot: current region of residence
 'Cat_현재 거주 지역_강원도',
 'Cat_현재 거주 지역_경기도',
 'Cat_현재 거주 지역_경상남도',
 'Cat_현재 거주 지역_경상북도',
 'Cat_현재 거주 지역_광주광역시',
 'Cat_현재 거주 지역_대구광역시',
 'Cat_현재 거주 지역_대전광역시',
 'Cat_현재 거주 지역_부산광역시',
 'Cat_현재 거주 지역_서울특별시',
 'Cat_현재 거주 지역_세종특별자치시',
 'Cat_현재 거주 지역_울산광역시',
 'Cat_현재 거주 지역_인천광역시',
 'Cat_현재 거주 지역_전라남도',
 'Cat_현재 거주 지역_전라북도',
 'Cat_현재 거주 지역_제주특별자치도',
 'Cat_현재 거주 지역_충청남도',
 'Cat_현재 거주 지역_충청북도',
    
 # One-hot: tenure type of the current dwelling
 'Cat_현재 주택의 점유형태_무상',
 'Cat_현재 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_현재 주택의 점유형태_보증금 있는 월세',
 'Cat_현재 주택의 점유형태_전세',

 # One-hot: region the household plans to move to
 'Cat_이사 계획 중인 거주 지역_국내 to 국외',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 비수도권',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 수도권',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 거주 지역_수도권 to 비수도권',
 'Cat_이사 계획 중인 거주 지역_수도권 to 수도권',
 'Cat_이사 계획 중인 거주 지역_수도권 to 이사 계획 없음 및 모름',

 # One-hot: attitude toward owning a home (no / yes)
 'Cat_주택 보유 의식_아니오',
 'Cat_주택 보유 의식_예',
    
 # One-hot: most needed housing support (first priority)
 'Cat_현재 가장 필요한 주거지원 1순위_공공분양주택공급',
 'Cat_현재 가장 필요한 주거지원 1순위_없음',
 'Cat_현재 가장 필요한 주거지원 1순위_월세보조금 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_임대 후 분양전환 공공임대주택 공급',
 'Cat_현재 가장 필요한 주거지원 1순위_장기공공임대주택 공급',
 'Cat_현재 가장 필요한 주거지원 1순위_전세자금 대출 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_주거상담과 정보제공 등',
 'Cat_현재 가장 필요한 주거지원 1순위_주택 구입자금 대출 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_주택개량 및 개보수 지원',
    
 # One-hot: householder's employment status
 'Cat_가구주 종사상 지위_무급가족종사자',
 'Cat_가구주 종사상 지위_무직 및 기타',
 'Cat_가구주 종사상 지위_사업자 및 자영자',
 'Cat_가구주 종사상 지위_상용근로자',
 'Cat_가구주 종사상 지위_임시일용근로자']]

In [15]:
# Separate the label from the predictors of feature set 41 and show the predictor shape.
y_41 = comp_41['target']
X_41 = comp_41.drop(columns='target')
X_41.shape

(8444, 53)

In [16]:
# Stratified 64/16/20 split: hold out 20% for testing, then take 20% of the
# remainder as the validation set scored by the Optuna objective.
X_train, X_test, y_train, y_test = train_test_split(
    X_41, y_41,
    test_size=0.2, shuffle=True, stratify=y_41, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [17]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Four integer hyper-parameters are sampled from ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val``.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    positive_proba = forest.predict_proba(X_val)[:, 1]  # column 1 of predict_proba
    return roc_auc_score(y_val, positive_proba)

# Hyper-parameter search: seeded TPE sampler, at most 200 trials, with the
# notebook's EarlyStoppingCallback configured for 10 rounds.
direction = "maximize"  # the objective returns an AUC
sampler = optuna.samplers.TPESampler(seed=10)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [18]:
# Report the winning trial and keep its validation AUC in optuna_auc.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7830008972018794


In [19]:
# Fit a forest with the study's best hyper-parameters on the training split.
optuna_41 = RandomForestClassifier(random_state=0, **study.best_trial.params)
optuna_41.fit(X_train, y_train)

In [20]:
# Held-out check: ROC AUC of the tuned forest on the test split.
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]  # column 1 of predict_proba
auc_41 = roc_auc_score(y_test, optuna_proba_41)
# Three-decimal display through plain string formatting; this avoids relying
# on the `decimal` module, which the notebook's import cell does not import.
print(f"{auc_41:.3f}")

0.779


In [21]:
# bootstrap_auc selects rows positionally (X_train[idx]), which needs NumPy
# arrays instead of pandas objects. np.asarray returns an existing ndarray
# unchanged, so this cell can be re-run safely; `.values` would raise
# AttributeError on the second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [22]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this notebook-level list.
auc_bootstrap = []

In [23]:
# Seeded generator; bootstrap_auc reads the notebook-level `rs` to draw row indices.
rs = RandomState(seed=41)
# bootstrap_auc refits the estimator it receives on every resample, so hand it
# an unfitted copy: optuna_41 stays the model fitted on the training split
# (the one whose test AUC was reported), and the percentile interval is the
# same because every fit starts from identical hyper-parameters and seed.
bootstrap_auc(sklearn.base.clone(optuna_41), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76653352, 0.77792219])

In [24]:
# Mean of the bootstrap test AUCs accumulated by bootstrap_auc.
np.mean(auc_bootstrap)

0.7723475763719236

In [25]:
# Keep this iteration's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_41 = auc_bootstrap
print(t_41)

[0.7742504184762937, 0.7751396806007963, 0.767903999276149, 0.7718187319037279, 0.7719007306369889, 0.7701137237604054, 0.7767428972131741, 0.7775388504343106, 0.7763654202859211, 0.7734869819942093, 0.768740951863916, 0.7744596566232356, 0.7736368417480999, 0.7746886875678611, 0.7713903592110025, 0.7673604042254796, 0.7691509455302208, 0.7750590956387985, 0.7728437160694897, 0.777731123326095, 0.7689501900108577, 0.7750336477560622, 0.7739733193087223, 0.7727221317408614, 0.7765251764386537, 0.7751665422547955, 0.7720732107310895, 0.7728635088671734, 0.7689445349258053, 0.7737244955664133, 0.7714320654632646, 0.7664577112739777, 0.7704572701773436, 0.7660788205754615, 0.7714737717155266, 0.7736990476836771, 0.7754224348534202, 0.7747876515562795, 0.776494073470865, 0.7750845435215346, 0.7724478601158162, 0.7777452610387261, 0.7719883844553022, 0.7740454216431415, 0.7742716250452408, 0.7749474077090119, 0.7721099687839306, 0.7690746018820123, 0.7711896036916396, 0.7673794901375316, 0.7

In [26]:
# Feature removed to build the next feature set (floor area of the current dwelling).
column_to_drop_41 = '현재 주택의 면적(㎡)'

In [27]:
# Derive feature set 42 by removing the feature named in column_to_drop_41.
if column_to_drop_41.startswith('Cat_'):
    # One-hot group: collect every dummy column that starts with the prefix.
    # The prefix is matched literally rather than as a regular expression, so
    # characters such as '(' or '.' in a feature name cannot change the match.
    dropped_cols_41 = [
        col for col in comp_41.columns if str(col).startswith(column_to_drop_41)
    ]
else:
    # Single column carrying exactly this name.
    dropped_cols_41 = [column_to_drop_41]

comp_42 = comp_41.drop(columns=dropped_cols_41)
X_42 = comp_42.drop(columns='target')
y_42 = comp_42['target']

print(X_42.shape)

(8444, 52)


In [28]:
# Remove the previous iteration's splits, study and best validation AUC before they are rebuilt.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [29]:
# Stratified 64/16/20 split: hold out 20% for testing, then take 20% of the
# remainder as the validation set scored by the Optuna objective.
X_train, X_test, y_train, y_test = train_test_split(
    X_42, y_42,
    test_size=0.2, shuffle=True, stratify=y_42, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [30]:
# Sanity check of the training split's dimensions after the two-stage split.
X_train.shape

(5404, 52)

In [31]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Four integer hyper-parameters are sampled from ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val``.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    positive_proba = forest.predict_proba(X_val)[:, 1]  # column 1 of predict_proba
    return roc_auc_score(y_val, positive_proba)

# Hyper-parameter search: seeded TPE sampler, at most 200 trials, with the
# notebook's EarlyStoppingCallback configured for 10 rounds.
direction = "maximize"  # the objective returns an AUC
sampler = optuna.samplers.TPESampler(seed=10)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [32]:
# Report the winning trial and keep its validation AUC in optuna_auc.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'n_estimators': 88, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 5}
0.7848350783836223


In [33]:
# Fit a forest with the study's best hyper-parameters on the training split.
optuna_42 = RandomForestClassifier(random_state=0, **study.best_trial.params)
optuna_42.fit(X_train, y_train)

In [34]:
# Held-out check: ROC AUC of the tuned forest on the test split.
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]  # column 1 of predict_proba
auc_42 = roc_auc_score(y_test, optuna_proba_42)
# Three-decimal display through plain string formatting; this avoids relying
# on the `decimal` module, which the notebook's import cell does not import.
print(f"{auc_42:.3f}")

0.775


In [35]:
# bootstrap_auc selects rows positionally (X_train[idx]), which needs NumPy
# arrays instead of pandas objects. np.asarray returns an existing ndarray
# unchanged, so this cell can be re-run safely; `.values` would raise
# AttributeError on the second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [36]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this notebook-level list.
auc_bootstrap = []

In [37]:
# Seeded generator; bootstrap_auc reads the notebook-level `rs` to draw row indices.
rs = RandomState(seed=42)
# bootstrap_auc refits the estimator it receives on every resample, so hand it
# an unfitted copy: optuna_42 stays the model fitted on the training split
# (the one whose test AUC was reported), and the percentile interval is the
# same because every fit starts from identical hyper-parameters and seed.
bootstrap_auc(sklearn.base.clone(optuna_42), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76570654, 0.77655497])

In [38]:
# Mean of the bootstrap test AUCs accumulated by bootstrap_auc.
np.mean(auc_bootstrap)

0.7710921856620975

In [39]:
# Keep this iteration's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_42 = auc_bootstrap
print(t_42)

[0.7690293612015926, 0.768281476203402, 0.7691622557003258, 0.769035016286645, 0.7679068268186754, 0.7703837540716613, 0.7728529055827, 0.7669398072747013, 0.7739082858306189, 0.7697998665399929, 0.7698550036192544, 0.7717932840209917, 0.7735972561527324, 0.7692894951140065, 0.7687628653184944, 0.7783404587404995, 0.7694874230908433, 0.7704360636083967, 0.7710807433043794, 0.7697631084871516, 0.7715451671643141, 0.7744709667933406, 0.7719120408070937, 0.7686504705030764, 0.7717593535106767, 0.7636909609120522, 0.766121233713355, 0.7735590843286283, 0.7718979030944626, 0.7682418906080347, 0.7699893118892508, 0.7669284971045965, 0.7691566006152732, 0.7727207179695983, 0.7677824149475208, 0.7682037187839306, 0.7733540874954761, 0.7690604641693811, 0.7737011683405718, 0.7747141354505972, 0.7740326977017734, 0.7698960029858849, 0.7721792435758235, 0.7696796959826275, 0.7690336025153819, 0.7733003641874774, 0.7657126538183133, 0.774311210640608, 0.7730854709554833, 0.7736340142055737, 0.7715

In [40]:
# Feature removed to build the next feature set (share of financial assets in total assets).
column_to_drop_42 = '자산 중 금융자산의 비중'

In [41]:
# Derive feature set 43 by removing the feature named in column_to_drop_42.
if column_to_drop_42.startswith('Cat_'):
    # One-hot group: collect every dummy column that starts with the prefix.
    # The prefix is matched literally rather than as a regular expression, so
    # characters such as '(' or '.' in a feature name cannot change the match.
    dropped_cols_42 = [
        col for col in comp_42.columns if str(col).startswith(column_to_drop_42)
    ]
else:
    # Single column carrying exactly this name.
    dropped_cols_42 = [column_to_drop_42]

comp_43 = comp_42.drop(columns=dropped_cols_42)
X_43 = comp_43.drop(columns='target')
y_43 = comp_43['target']

print(X_43.shape)

(8444, 51)


In [42]:
# Remove the previous iteration's splits, study and best validation AUC before they are rebuilt.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [43]:
# Stratified 64/16/20 split: hold out 20% for testing, then take 20% of the
# remainder as the validation set scored by the Optuna objective.
X_train, X_test, y_train, y_test = train_test_split(
    X_43, y_43,
    test_size=0.2, shuffle=True, stratify=y_43, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [44]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Four integer hyper-parameters are sampled from ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val``.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    positive_proba = forest.predict_proba(X_val)[:, 1]  # column 1 of predict_proba
    return roc_auc_score(y_val, positive_proba)

# Hyper-parameter search: seeded TPE sampler, at most 200 trials, with the
# notebook's EarlyStoppingCallback configured for 10 rounds.
direction = "maximize"  # the objective returns an AUC
sampler = optuna.samplers.TPESampler(seed=10)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [45]:
# Report the winning trial and keep its validation AUC in optuna_auc.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'n_estimators': 132, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 4}
0.7825677652601664


In [46]:
# Fit a forest with the study's best hyper-parameters on the training split.
optuna_43 = RandomForestClassifier(random_state=0, **study.best_trial.params)
optuna_43.fit(X_train, y_train)

In [47]:
# Held-out check: ROC AUC of the tuned forest on the test split.
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]  # column 1 of predict_proba
auc_43 = roc_auc_score(y_test, optuna_proba_43)
# Three-decimal display through plain string formatting; this avoids relying
# on the `decimal` module, which the notebook's import cell does not import.
print(f"{auc_43:.3f}")

0.775


In [48]:
# bootstrap_auc selects rows positionally (X_train[idx]), which needs NumPy
# arrays instead of pandas objects. np.asarray returns an existing ndarray
# unchanged, so this cell can be re-run safely; `.values` would raise
# AttributeError on the second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [49]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this notebook-level list.
auc_bootstrap = []

In [50]:
# Seeded generator; bootstrap_auc reads the notebook-level `rs` to draw row indices.
rs = RandomState(seed=43)
# bootstrap_auc refits the estimator it receives on every resample, so hand it
# an unfitted copy: optuna_43 stays the model fitted on the training split
# (the one whose test AUC was reported), and the percentile interval is the
# same because every fit starts from identical hyper-parameters and seed.
bootstrap_auc(sklearn.base.clone(optuna_43), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7656444 , 0.77680797])

In [51]:
# Mean of the bootstrap test AUCs accumulated by bootstrap_auc.
np.mean(auc_bootstrap)

0.7714522095831071

In [52]:
# Keep this iteration's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_43 = auc_bootstrap
print(t_43)

[0.7717409744842563, 0.7725948923271806, 0.7688568811074918, 0.769849348534202, 0.775773050126674, 0.7735916010676802, 0.76986207247557, 0.7757716363554108, 0.7686547118168657, 0.7693135292254796, 0.7702296530039812, 0.772894611834962, 0.7730176099348534, 0.7682517870068766, 0.7735647394136808, 0.7733880180057908, 0.7677781736337315, 0.7742900040716612, 0.7742928316141875, 0.7725864096996018, 0.7715826321027868, 0.7768446887441187, 0.7645986020629749, 0.771193845005429, 0.7698705551031488, 0.773673599800941, 0.7702056188925082, 0.769341804650742, 0.770026069942092, 0.7719770742851972, 0.7741260066051394, 0.7674770403546869, 0.7710157098262759, 0.7676862785016285, 0.7684991969779225, 0.768222097810351, 0.7707541621425986, 0.7709252284654361, 0.7726528569489686, 0.7740326977017735, 0.7734777924809988, 0.7676113486246834, 0.7685826094824466, 0.7731646421462179, 0.7745784134093376, 0.7692781849439015, 0.7730289201049584, 0.7673766625950054, 0.7653111427795873, 0.7674247308179516, 0.7666245

In [53]:
# One-hot group removed to build the next feature set (tenure type of the current dwelling).
column_to_drop_43 = 'Cat_현재 주택의 점유형태'

In [54]:
# Derive feature set 44 by removing the feature named in column_to_drop_43.
if column_to_drop_43.startswith('Cat_'):
    # One-hot group: collect every dummy column that starts with the prefix.
    # The prefix is matched literally rather than as a regular expression, so
    # characters such as '(' or '.' in a feature name cannot change the match.
    dropped_cols_43 = [
        col for col in comp_43.columns if str(col).startswith(column_to_drop_43)
    ]
else:
    # Single column carrying exactly this name.
    dropped_cols_43 = [column_to_drop_43]

comp_44 = comp_43.drop(columns=dropped_cols_43)
X_44 = comp_44.drop(columns='target')
y_44 = comp_44['target']

print(X_44.shape)

(8444, 47)


In [55]:
# Remove the previous iteration's splits, study and best validation AUC before they are rebuilt.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [56]:
# Stratified 64/16/20 split: hold out 20% for testing, then take 20% of the
# remainder as the validation set scored by the Optuna objective.
X_train, X_test, y_train, y_test = train_test_split(
    X_44, y_44,
    test_size=0.2, shuffle=True, stratify=y_44, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [57]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    Four integer hyper-parameters are sampled from ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val``.
    """
    # (low, high) bounds handed to trial.suggest_int, in sampling order.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **sampled)
    forest.fit(X_train, y_train)
    positive_proba = forest.predict_proba(X_val)[:, 1]  # column 1 of predict_proba
    return roc_auc_score(y_val, positive_proba)

# Hyper-parameter search: seeded TPE sampler, at most 200 trials, with the
# notebook's EarlyStoppingCallback configured for 10 rounds.
direction = "maximize"  # the objective returns an AUC
sampler = optuna.samplers.TPESampler(seed=10)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [58]:
# Report the winning trial and keep its validation AUC in optuna_auc.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7820727573267804


In [59]:
# Fit a forest with the study's best hyper-parameters on the training split.
optuna_44 = RandomForestClassifier(random_state=0, **study.best_trial.params)
optuna_44.fit(X_train, y_train)

In [60]:
# Held-out check: ROC AUC of the tuned forest on the test split.
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]  # column 1 of predict_proba
auc_44 = roc_auc_score(y_test, optuna_proba_44)
# Three-decimal display through plain string formatting; this avoids relying
# on the `decimal` module, which the notebook's import cell does not import.
print(f"{auc_44:.3f}")

0.775


In [61]:
# bootstrap_auc selects rows positionally (X_train[idx]), which needs NumPy
# arrays instead of pandas objects. np.asarray returns an existing ndarray
# unchanged, so this cell can be re-run safely; `.values` would raise
# AttributeError on the second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [62]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this notebook-level list.
auc_bootstrap = []

In [63]:
# Seeded generator; bootstrap_auc reads the notebook-level `rs` to draw row indices.
rs = RandomState(seed=44)
# bootstrap_auc refits the estimator it receives on every resample, so hand it
# an unfitted copy: optuna_44 stays the model fitted on the training split
# (the one whose test AUC was reported), and the percentile interval is the
# same because every fit starts from identical hyper-parameters and seed.
bootstrap_auc(sklearn.base.clone(optuna_44), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76386694, 0.77545089])

In [64]:
# Mean of the bootstrap test AUCs accumulated by bootstrap_auc.
np.mean(auc_bootstrap)

0.7698987404004931

In [65]:
# Keep this iteration's bootstrap AUCs under their own name (same list object as auc_bootstrap).
t_44 = auc_bootstrap
print(t_44)

[0.7701943087224032, 0.7696415241585233, 0.7689572588671734, 0.7704452531216069, 0.7689332247557004, 0.7747127216793341, 0.7704318222946074, 0.7668592223127036, 0.7717607672819399, 0.771678768548679, 0.76818533975751, 0.7700006220593558, 0.7701165513029316, 0.7703257894498734, 0.7688837427614912, 0.7686730908432863, 0.7669087043069127, 0.7656532754252624, 0.7641108509771988, 0.7724393774882374, 0.7687423656351792, 0.7704459600072384, 0.7676014522258415, 0.7720845209011944, 0.7696938336952588, 0.767819173000362, 0.7743041417842925, 0.7665326411509229, 0.7757108441910966, 0.7700713106225117, 0.7667630858668114, 0.7717183541440464, 0.765531691096634, 0.7682772348896127, 0.768175443358668, 0.7674558337857402, 0.7640161283025697, 0.7732819851610568, 0.768858294878755, 0.767417661961636, 0.7742468840481362, 0.7661092166576184, 0.7711811210640608, 0.7704487875497648, 0.7676353827361564, 0.770274893684401, 0.767277698606587, 0.7721693471769815, 0.7710934672457475, 0.7662442318132464, 0.7695255

In [66]:
# Feature removed to build the next feature set (long-term debt burden indicator).
column_to_drop_44 = '장기부채부담지표'

In [67]:
# Derive feature set 45 by removing the feature named in column_to_drop_44.
if column_to_drop_44.startswith('Cat_'):
    # One-hot group: collect every dummy column that starts with the prefix.
    # The prefix is matched literally rather than as a regular expression, so
    # characters such as '(' or '.' in a feature name cannot change the match.
    dropped_cols_44 = [
        col for col in comp_44.columns if str(col).startswith(column_to_drop_44)
    ]
else:
    # Single column carrying exactly this name.
    dropped_cols_44 = [column_to_drop_44]

comp_45 = comp_44.drop(columns=dropped_cols_44)
X_45 = comp_45.drop(columns='target')
y_45 = comp_45['target']

print(X_45.shape)

(8444, 46)


In [68]:
# Remove the previous iteration's splits, study and best validation AUC before they are rebuilt.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [69]:
# Stratified 64/16/20 split: hold out 20% for testing, then take 20% of the
# remainder as the validation set scored by the Optuna objective.
X_train, X_test, y_train, y_test = train_test_split(
    X_45, y_45,
    test_size=0.2, shuffle=True, stratify=y_45, random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [70]:
def objective(trial):
    """Return the validation ROC AUC of a random forest for one Optuna trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    # (name, low, high) triples; parameters are suggested in this fixed order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximizes the value returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined earlier in the notebook; at most 200 trials run.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [71]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.780559005387631


In [72]:
optuna_45 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [73]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(decimal.Decimal(auc_45).quantize(decimal.Decimal('1.000')))

0.777


In [74]:
# bootstrap_auc indexes these positionally (X_train[idx]), so plain arrays are
# needed. np.asarray yields the same arrays as `.values` but is a no-op when the
# cell is run again, instead of failing with AttributeError on an ndarray.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [75]:
auc_bootstrap = []

In [76]:
# Reset the list that bootstrap_auc appends to, so re-running this cell always
# yields exactly `nsamples` scores instead of extending an earlier run.
auc_bootstrap = []
rs = RandomState(seed = 45)  # notebook-level RNG read inside bootstrap_auc
# bootstrap_auc refits the estimator it receives; pass an unfitted clone (same
# hyperparameters and random_state) so optuna_45 stays the model fitted on the
# training split. The bootstrap scores themselves are unchanged.
bootstrap_auc(sklearn.base.clone(optuna_45), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76426774, 0.77567002])

In [77]:
np.mean(auc_bootstrap)

0.7700874820451049

In [78]:
t_45 = auc_bootstrap
print(t_45)

[0.768875967019544, 0.7671858034744843, 0.7660873032030402, 0.767419075732899, 0.7664640732446616, 0.7709591589757511, 0.7688611224212811, 0.7716985613463626, 0.7700232423995657, 0.7718074217336228, 0.766521330980818, 0.7717727843376765, 0.7641864877397757, 0.7640217833876222, 0.7679223783025697, 0.7695121640879479, 0.7700444489685124, 0.770883522213174, 0.769237185577271, 0.766326937432139, 0.7761462857401374, 0.7642522281035108, 0.7655726904632646, 0.7686858147846544, 0.775631673000362, 0.7736566345457836, 0.7773416293431054, 0.7696726271263121, 0.7690088615182773, 0.7677668634636263, 0.7706721634093376, 0.7698507623054651, 0.771077915761853, 0.7765930374592834, 0.7710270199963808, 0.7672932500904814, 0.7699256921824104, 0.7706976112920738, 0.7681726158161419, 0.7706905424357582, 0.7702275323470865, 0.7713111880202679, 0.772669822204126, 0.7656193449149475, 0.7664888142417662, 0.7681012203673543, 0.7703257894498734, 0.7706820598081796, 0.7664082292797683, 0.7661191130564604, 0.769180

In [79]:
column_to_drop_45 = '소득 중 근로/사업소득의 비중(월평균)'

In [80]:
# Elimination step: build comp_46 by removing the feature named in
# column_to_drop_45 from comp_45, then split into predictors and target.
if column_to_drop_45.startswith('Cat_'):
    # A 'Cat_' feature may span several columns sharing that prefix (presumably
    # one-hot dummies), so drop all of them. Literal prefix matching replaces the
    # former unescaped regex, so characters like '(' or '/' in a name are safe.
    cols_to_drop_45 = [c for c in comp_45.columns if str(c).startswith(column_to_drop_45)]
else:
    cols_to_drop_45 = [column_to_drop_45]

comp_46 = comp_45.drop(columns=cols_to_drop_45)
X_46 = comp_46.drop(columns='target')
y_46 = comp_46['target']

print(X_46.shape)

(8444, 45)


In [81]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [83]:
def objective(trial):
    """Return the validation ROC AUC of a random forest for one Optuna trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    # (name, low, high) triples; parameters are suggested in this fixed order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximizes the value returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined earlier in the notebook; at most 200 trials run.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [84]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7782077177040471


In [85]:
optuna_46 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [86]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(decimal.Decimal(auc_46).quantize(decimal.Decimal('1.000')))

0.774


In [87]:
# bootstrap_auc indexes these positionally (X_train[idx]), so plain arrays are
# needed. np.asarray yields the same arrays as `.values` but is a no-op when the
# cell is run again, instead of failing with AttributeError on an ndarray.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [88]:
auc_bootstrap = []

In [89]:
# Reset the list that bootstrap_auc appends to, so re-running this cell always
# yields exactly `nsamples` scores instead of extending an earlier run.
auc_bootstrap = []
rs = RandomState(seed = 46)  # notebook-level RNG read inside bootstrap_auc
# bootstrap_auc refits the estimator it receives; pass an unfitted clone (same
# hyperparameters and random_state) so optuna_46 stays the model fitted on the
# training split. The bootstrap scores themselves are unchanged.
bootstrap_auc(sklearn.base.clone(optuna_46), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76519  , 0.7756942])

In [90]:
np.mean(auc_bootstrap)

0.7704773807201186

In [91]:
t_46 = auc_bootstrap
print(t_46)

[0.7658766512848354, 0.7708644363011221, 0.7754040558269997, 0.771135880383641, 0.7738785966340933, 0.7681881673000363, 0.769483181777054, 0.7768036893774882, 0.771903558179515, 0.7717946977922547, 0.7660752861473037, 0.7695213536011581, 0.7706672152099168, 0.773484154451683, 0.7741274203764024, 0.7689445349258053, 0.7714143933224756, 0.7734028626040536, 0.7728316990137531, 0.767205596272168, 0.7727673724212814, 0.7675145052931596, 0.772587823470865, 0.7700267768277235, 0.7710312613101702, 0.7703017553384004, 0.7715684943901556, 0.7658427207745205, 0.7715812183315237, 0.7720562454759319, 0.7721530888074557, 0.7743875542888166, 0.771484375, 0.7700684830799854, 0.7684864730365545, 0.7712970503076366, 0.7727928203040174, 0.7692032550669562, 0.7714638753166847, 0.7728564400108577, 0.7713458254162142, 0.77209158975751, 0.7721686402913499, 0.7736552207745204, 0.7693269600524792, 0.7710793295331162, 0.7709139182953313, 0.7717692499095186, 0.7754789857039449, 0.7694520788092654, 0.768953017553

In [92]:
column_to_drop_46 = '현재 무주택 기간(총 개월)'

In [93]:
# Elimination step: build comp_47 by removing the feature named in
# column_to_drop_46 from comp_46, then split into predictors and target.
if column_to_drop_46.startswith('Cat_'):
    # A 'Cat_' feature may span several columns sharing that prefix (presumably
    # one-hot dummies), so drop all of them. Literal prefix matching replaces the
    # former unescaped regex, so characters like '(' or '/' in a name are safe.
    cols_to_drop_46 = [c for c in comp_46.columns if str(c).startswith(column_to_drop_46)]
else:
    cols_to_drop_46 = [column_to_drop_46]

comp_47 = comp_46.drop(columns=cols_to_drop_46)
X_47 = comp_47.drop(columns='target')
y_47 = comp_47['target']

print(X_47.shape)

(8444, 44)


In [94]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [96]:
def objective(trial):
    """Return the validation ROC AUC of a random forest for one Optuna trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    # (name, low, high) triples; parameters are suggested in this fixed order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximizes the value returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined earlier in the notebook; at most 200 trials run.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [97]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7823755077146102


In [98]:
optuna_47 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [99]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(decimal.Decimal(auc_47).quantize(decimal.Decimal('1.000')))

0.777


In [100]:
# bootstrap_auc indexes these positionally (X_train[idx]), so plain arrays are
# needed. np.asarray yields the same arrays as `.values` but is a no-op when the
# cell is run again, instead of failing with AttributeError on an ndarray.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [101]:
auc_bootstrap = []

In [102]:
# Reset the list that bootstrap_auc appends to, so re-running this cell always
# yields exactly `nsamples` scores instead of extending an earlier run.
auc_bootstrap = []
rs = RandomState(seed = 47)  # notebook-level RNG read inside bootstrap_auc
# bootstrap_auc refits the estimator it receives; pass an unfitted clone (same
# hyperparameters and random_state) so optuna_47 stays the model fitted on the
# training split. The bootstrap scores themselves are unchanged.
bootstrap_auc(sklearn.base.clone(optuna_47), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76517915, 0.77714596])

In [103]:
np.mean(auc_bootstrap)

0.7713676985783117

In [104]:
t_47 = auc_bootstrap
print(t_47)

[0.7728345265562794, 0.7734120521172638, 0.7734367931143684, 0.773731564422729, 0.7724938076818675, 0.7707774893684403, 0.768043962631198, 0.7684744559808179, 0.7686540049312341, 0.7737555985342021, 0.7698097629388346, 0.774003008505248, 0.770286203854506, 0.768222097810351, 0.7733335878121608, 0.7742249705935578, 0.7736099800941006, 0.7751828006243215, 0.769886106587043, 0.7751234222312704, 0.7726083231541803, 0.775437279451683, 0.772358085640608, 0.7711655695801666, 0.7716667514929424, 0.7702494458016649, 0.7711153807003257, 0.7654164687386898, 0.7736926857129931, 0.7698938823289903, 0.7661169923995657, 0.7673335425714802, 0.7693764420466883, 0.7743543306641332, 0.7747332213626492, 0.7712673611111112, 0.7740595593557728, 0.7723156725027145, 0.769926399068042, 0.7772652856948968, 0.7680941515110389, 0.7752888334690555, 0.7665990884002896, 0.7692986846272168, 0.7693538217064786, 0.7750527336681143, 0.7715175986246833, 0.7701610850977199, 0.7732120034835324, 0.7745791202949692, 0.770163

In [105]:
column_to_drop_47 = '소득 대비 주택 임대료의 비율'

In [106]:
# Elimination step: build comp_48 by removing the feature named in
# column_to_drop_47 from comp_47, then split into predictors and target.
if column_to_drop_47.startswith('Cat_'):
    # A 'Cat_' feature may span several columns sharing that prefix (presumably
    # one-hot dummies), so drop all of them. Literal prefix matching replaces the
    # former unescaped regex, so characters like '(' or '/' in a name are safe.
    cols_to_drop_47 = [c for c in comp_47.columns if str(c).startswith(column_to_drop_47)]
else:
    cols_to_drop_47 = [column_to_drop_47]

comp_48 = comp_47.drop(columns=cols_to_drop_47)
X_48 = comp_48.drop(columns='target')
y_48 = comp_48['target']

print(X_48.shape)

(8444, 43)


In [107]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [109]:
def objective(trial):
    """Return the validation ROC AUC of a random forest for one Optuna trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    # (name, low, high) triples; parameters are suggested in this fixed order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximizes the value returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined earlier in the notebook; at most 200 trials run.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [110]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 68, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7844096809408687


In [111]:
optuna_48 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [112]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(decimal.Decimal(auc_48).quantize(decimal.Decimal('1.000')))

0.775


In [113]:
# bootstrap_auc indexes these positionally (X_train[idx]), so plain arrays are
# needed. np.asarray yields the same arrays as `.values` but is a no-op when the
# cell is run again, instead of failing with AttributeError on an ndarray.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [114]:
auc_bootstrap = []

In [115]:
# Reset the list that bootstrap_auc appends to, so re-running this cell always
# yields exactly `nsamples` scores instead of extending an earlier run.
auc_bootstrap = []
rs = RandomState(seed = 48)  # notebook-level RNG read inside bootstrap_auc
# bootstrap_auc refits the estimator it receives; pass an unfitted clone (same
# hyperparameters and random_state) so optuna_48 stays the model fitted on the
# training split. The bootstrap scores themselves are unchanged.
bootstrap_auc(sklearn.base.clone(optuna_48), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7634338 , 0.77552527])

In [116]:
np.mean(auc_bootstrap)

0.7696379483775561

In [117]:
t_48 = auc_bootstrap
print(t_48)

[0.7683868021625045, 0.76730173271806, 0.7711563800669563, 0.7711641558089033, 0.7688710188201231, 0.7741465062884545, 0.7761590096815056, 0.7682015981270358, 0.7724563427433948, 0.7732346238237423, 0.7646084984618169, 0.7698387452497286, 0.770898366811437, 0.7680057908070939, 0.7671752001900108, 0.7695482152551574, 0.7673858521082156, 0.7649449760224394, 0.7714667028592109, 0.7674021104777417, 0.7761717336228736, 0.7664866935848715, 0.7693071672547955, 0.7696252657889975, 0.7694132000995295, 0.7730600230727469, 0.7644013809717698, 0.7753121606948968, 0.7716017180148389, 0.7690915671371699, 0.7666704838490771, 0.7717932840209917, 0.7709598658613825, 0.7727673724212812, 0.774941045738328, 0.7669730308993847, 0.7705788545059719, 0.7722753800217156, 0.7675986246833152, 0.7699200370973579, 0.7699143820123054, 0.7697567465164676, 0.7701554300126673, 0.7670776499728555, 0.7758020324375678, 0.7713847041259501, 0.7723488961273978, 0.7701313959011944, 0.7654956399294246, 0.7728331127850163, 0.7

In [118]:
column_to_drop_48 = 'Cat_가구주 종사상 지위'

In [119]:
# Elimination step: build comp_49 by removing the feature named in
# column_to_drop_48 from comp_48, then split into predictors and target.
if column_to_drop_48.startswith('Cat_'):
    # A 'Cat_' feature may span several columns sharing that prefix (presumably
    # one-hot dummies), so drop all of them. Literal prefix matching replaces the
    # former unescaped regex, so characters like '(' or '/' in a name are safe.
    cols_to_drop_48 = [c for c in comp_48.columns if str(c).startswith(column_to_drop_48)]
else:
    cols_to_drop_48 = [column_to_drop_48]

comp_49 = comp_48.drop(columns=cols_to_drop_48)
X_49 = comp_49.drop(columns='target')
y_49 = comp_49['target']

print(X_49.shape)

(8444, 38)


In [120]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [122]:
def objective(trial):
    """Return the validation ROC AUC of a random forest for one Optuna trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    # (name, low, high) triples; parameters are suggested in this fixed order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximizes the value returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined earlier in the notebook; at most 200 trials run.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [123]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 5}
0.7825080991253386


In [124]:
optuna_49 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)

In [125]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(decimal.Decimal(auc_49).quantize(decimal.Decimal('1.000')))

0.773


In [126]:
# bootstrap_auc indexes these positionally (X_train[idx]), so plain arrays are
# needed. np.asarray yields the same arrays as `.values` but is a no-op when the
# cell is run again, instead of failing with AttributeError on an ndarray.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [127]:
auc_bootstrap = []

In [128]:
# Reset the list that bootstrap_auc appends to, so re-running this cell always
# yields exactly `nsamples` scores instead of extending an earlier run.
auc_bootstrap = []
rs = RandomState(seed = 49)  # notebook-level RNG read inside bootstrap_auc
# bootstrap_auc refits the estimator it receives; pass an unfitted clone (same
# hyperparameters and random_state) so optuna_49 stays the model fitted on the
# training split. The bootstrap scores themselves are unchanged.
bootstrap_auc(sklearn.base.clone(optuna_49), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76264072, 0.77349827])

In [129]:
np.mean(auc_bootstrap)

0.7682146387531668

In [130]:
t_49 = auc_bootstrap
print(t_49)

[0.7676968817861021, 0.7686646082157075, 0.7707315418023886, 0.7675851938563156, 0.7676078141965255, 0.7699037787278321, 0.770665094553022, 0.7671645969055374, 0.7685465583152371, 0.7681273751357219, 0.7692923226565328, 0.7673172842019544, 0.7695885077361562, 0.7728486642689105, 0.7647025142508144, 0.7755143299855229, 0.7675067295512124, 0.7692244616359031, 0.7691771002985885, 0.7680941515110387, 0.7679577225841476, 0.7683260099981903, 0.7640161283025696, 0.7655472425805284, 0.7740729901827723, 0.7720647281035106, 0.7715027540264205, 0.7720442284201954, 0.7653952621697431, 0.7702939795964532, 0.770522303655447, 0.7642896930419834, 0.7668047921190735, 0.7709747104596454, 0.7763053350072385, 0.7673893865363735, 0.7679563088128847, 0.7645236721860297, 0.7704148570394499, 0.7685628166847629, 0.7653486077180601, 0.7694853024339485, 0.7688073991132827, 0.770229653003981, 0.7695036814603691, 0.7686624875588128, 0.7694110794426348, 0.7649944580166486, 0.7679541881559899, 0.7744009851158161, 0.

In [131]:
column_to_drop_49 = '총 이사 횟수' 

In [132]:
# Elimination step: build comp_50 by removing the feature named in
# column_to_drop_49 from comp_49, then split into predictors and target.
if column_to_drop_49.startswith('Cat_'):
    # A 'Cat_' feature may span several columns sharing that prefix (presumably
    # one-hot dummies), so drop all of them. Literal prefix matching replaces the
    # former unescaped regex, so characters like '(' or '/' in a name are safe.
    cols_to_drop_49 = [c for c in comp_49.columns if str(c).startswith(column_to_drop_49)]
else:
    cols_to_drop_49 = [column_to_drop_49]

comp_50 = comp_49.drop(columns=cols_to_drop_49)
X_50 = comp_50.drop(columns='target')
y_50 = comp_50['target']

print(X_50.shape)

(8444, 37)


In [133]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc, comp_49, X_49, y_49

In [134]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [135]:
def objective(trial):
    """Return the validation ROC AUC of a random forest for one Optuna trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    # (name, low, high) triples; parameters are suggested in this fixed order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximizes the value returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined earlier in the notebook; at most 200 trials run.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [136]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7837677175272586


In [137]:
optuna_50 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [138]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(decimal.Decimal(auc_50).quantize(decimal.Decimal('1.000')))

0.768


In [139]:
# bootstrap_auc indexes these positionally (X_train[idx]), so plain arrays are
# needed. np.asarray yields the same arrays as `.values` but is a no-op when the
# cell is run again, instead of failing with AttributeError on an ndarray.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [140]:
auc_bootstrap = []

In [141]:
# Reset the list that bootstrap_auc appends to, so re-running this cell always
# yields exactly `nsamples` scores instead of extending an earlier run.
auc_bootstrap = []
rs = RandomState(seed = 50)  # notebook-level RNG read inside bootstrap_auc
# bootstrap_auc refits the estimator it receives; pass an unfitted clone (same
# hyperparameters and random_state) so optuna_50 stays the model fitted on the
# training split. The bootstrap scores themselves are unchanged.
bootstrap_auc(sklearn.base.clone(optuna_50), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75482858, 0.7675461 ])

In [142]:
np.mean(auc_bootstrap)

0.7612704039003121

In [143]:
t_50 = auc_bootstrap
print(t_50)

[0.7646608079985523, 0.7610295365092291, 0.7614416508324287, 0.759269391286645, 0.7620156419652552, 0.7616954227741586, 0.7570370464621787, 0.7577700868621063, 0.762726062024973, 0.7523836183496201, 0.7543501741766195, 0.7645625508957654, 0.761206257917119, 0.7569034450778139, 0.7666471566232357, 0.7632533987061165, 0.7632003822837495, 0.7609722787730727, 0.7621329849800941, 0.7658801857129931, 0.7594977153456388, 0.7531838128845458, 0.7646862558812885, 0.7577177773253709, 0.7541586081704669, 0.7689777585504887, 0.7636633923724213, 0.7625521681596091, 0.7636747025425262, 0.7566263459102425, 0.7658010145222583, 0.7660562002352514, 0.7591690135269634, 0.7642939343557728, 0.760687403863554, 0.7559618734165762, 0.7597748145132102, 0.7638648547774158, 0.7625557025877668, 0.7631961409699602, 0.7647145313065509, 0.7594461126945349, 0.7658702893141514, 0.750228324058994, 0.7620417967336229, 0.767565401058632, 0.7647216001628664, 0.7571692340752805, 0.764285451728194, 0.7592361676619616, 0.7626

In [144]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [145]:
column_to_drop_50 = 'Cat_현재 거주 지역'

In [146]:
# Elimination step: build comp_51 by removing the feature named in
# column_to_drop_50 from comp_50, then split into predictors and target.
if column_to_drop_50.startswith('Cat_'):
    # A 'Cat_' feature may span several columns sharing that prefix (presumably
    # one-hot dummies), so drop all of them. Literal prefix matching replaces the
    # former unescaped regex, so characters like '(' or '/' in a name are safe.
    cols_to_drop_50 = [c for c in comp_50.columns if str(c).startswith(column_to_drop_50)]
else:
    cols_to_drop_50 = [column_to_drop_50]

comp_51 = comp_50.drop(columns=cols_to_drop_50)
X_51 = comp_51.drop(columns='target')
y_51 = comp_51['target']

print(X_51.shape)

(8444, 20)


In [147]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [148]:
def objective(trial):
    """Return the validation ROC AUC of a random forest for one Optuna trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    # (name, low, high) triples; parameters are suggested in this fixed order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximizes the value returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined earlier in the notebook; at most 200 trials run.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [149]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7695472887266364


In [150]:
optuna_51 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [151]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(decimal.Decimal(auc_51).quantize(decimal.Decimal('1.000')))

0.760


In [152]:
# bootstrap_auc indexes these positionally (X_train[idx]), so plain arrays are
# needed. np.asarray yields the same arrays as `.values` but is a no-op when the
# cell is run again, instead of failing with AttributeError on an ndarray.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [153]:
auc_bootstrap = []

In [154]:
# Reset the list that bootstrap_auc appends to, so re-running this cell always
# yields exactly `nsamples` scores instead of extending an earlier run.
auc_bootstrap = []
rs = RandomState(seed = 51)  # notebook-level RNG read inside bootstrap_auc
# bootstrap_auc refits the estimator it receives; pass an unfitted clone (same
# hyperparameters and random_state) so optuna_51 stays the model fitted on the
# training split. The bootstrap scores themselves are unchanged.
bootstrap_auc(sklearn.base.clone(optuna_51), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74874623, 0.76071958])

In [155]:
np.mean(auc_bootstrap)

0.754990067549991

In [156]:
t_51 = auc_bootstrap
print(t_51)

[0.7548386321480275, 0.7548315632917119, 0.7552465051574375, 0.7546880655085052, 0.7534545500814334, 0.7537076151375317, 0.7560488203492581, 0.7532325879931235, 0.7522542582790446, 0.752844507781397, 0.7515565621606949, 0.7557236529587406, 0.7535103940463264, 0.7548011672095548, 0.7611801031487513, 0.7536920636536375, 0.7593061493394861, 0.7620212970503075, 0.7579658941820485, 0.7518244718150561, 0.7535457383279044, 0.7588268808812886, 0.7532502601339124, 0.7585780571389793, 0.7559802524429967, 0.7547036169923996, 0.755030905039812, 0.7592248574918564, 0.7577029327271083, 0.7534213264567499, 0.7581023231089397, 0.7573198007148029, 0.7577177773253709, 0.7573530243394861, 0.7587971916847631, 0.7550902834328629, 0.7551213864006514, 0.7574611778411147, 0.7551871267643864, 0.7549722335323924, 0.7513225830166484, 0.7592220299493304, 0.7559548045602605, 0.753109589893232, 0.7555320869525877, 0.7484943336047774, 0.7555935860025335, 0.7519050567770539, 0.754355829261672, 0.7498515540173725, 0.7

In [157]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [158]:
column_to_drop_51 = '가구주 나이' 

In [159]:
# Elimination step: build comp_52 by removing the feature named in
# column_to_drop_51 from comp_51, then split into predictors and target.
if column_to_drop_51.startswith('Cat_'):
    # A 'Cat_' feature may span several columns sharing that prefix (presumably
    # one-hot dummies), so drop all of them. Literal prefix matching replaces the
    # former unescaped regex, so characters like '(' or '/' in a name are safe.
    cols_to_drop_51 = [c for c in comp_51.columns if str(c).startswith(column_to_drop_51)]
else:
    cols_to_drop_51 = [column_to_drop_51]

comp_52 = comp_51.drop(columns=cols_to_drop_51)
X_52 = comp_52.drop(columns='target')
y_52 = comp_52['target']

print(X_52.shape)

(8444, 19)


In [160]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [161]:
def objective(trial):
    """Return the validation ROC AUC of a random forest for one Optuna trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    # (name, low, high) triples; parameters are suggested in this fixed order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximizes the value returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined earlier in the notebook; at most 200 trials run.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [162]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 184, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7687296416938111


In [163]:
optuna_52 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [164]:
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(decimal.Decimal(auc_52).quantize(decimal.Decimal('1.000')))

0.758


In [165]:
# bootstrap_auc indexes these positionally (X_train[idx]), so plain arrays are
# needed. np.asarray yields the same arrays as `.values` but is a no-op when the
# cell is run again, instead of failing with AttributeError on an ndarray.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [166]:
auc_bootstrap = []

In [167]:
# Reset the list that bootstrap_auc appends to, so re-running this cell always
# yields exactly `nsamples` scores instead of extending an earlier run.
auc_bootstrap = []
rs = RandomState(seed = 52)  # notebook-level RNG read inside bootstrap_auc
# bootstrap_auc refits the estimator it receives; pass an unfitted clone (same
# hyperparameters and random_state) so optuna_52 stays the model fitted on the
# training split. The bootstrap scores themselves are unchanged.
bootstrap_auc(sklearn.base.clone(optuna_52), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74822353, 0.75792984])

In [168]:
np.mean(auc_bootstrap)

0.7532862809050398

In [169]:
t_52 = auc_bootstrap
print(t_52)

[0.7544088456840391, 0.7487071061798768, 0.7516102854686935, 0.7503668736427795, 0.7505655085052481, 0.7529102481451321, 0.7520308824194716, 0.7529286271715526, 0.753215622737966, 0.7511543442363373, 0.7542490895313065, 0.7546958412504524, 0.7578004829442634, 0.7564446763029316, 0.7534828255066957, 0.7551623857672819, 0.7535181697882737, 0.7541741596543612, 0.7537613384455303, 0.7547679435848715, 0.748282974800941, 0.7540292480998914, 0.7570214949782845, 0.7545707224936663, 0.7507422299131378, 0.7551581444534926, 0.7523404983260947, 0.751728335369164, 0.7533866890608034, 0.7515388900199059, 0.7524309796869345, 0.7548428734618169, 0.7552486258143323, 0.7541847629388346, 0.753954318222946, 0.7558409959735795, 0.7509846916847629, 0.7549453718783931, 0.7546463592562431, 0.7533174142689105, 0.7534121369435396, 0.752966798995657, 0.7554592777325372, 0.7555158285830619, 0.7501265325280492, 0.755162385767282, 0.7489545161509229, 0.7517693347357944, 0.749487507917119, 0.7531618994299674, 0.7534

In [170]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [171]:
column_to_drop_52 = 'Cat_주택 보유 의식'

In [172]:
# Elimination step: build comp_53 by removing the feature named in
# column_to_drop_52 from comp_52, then split into predictors and target.
if column_to_drop_52.startswith('Cat_'):
    # A 'Cat_' feature may span several columns sharing that prefix (presumably
    # one-hot dummies), so drop all of them. Literal prefix matching replaces the
    # former unescaped regex, so characters like '(' or '/' in a name are safe.
    cols_to_drop_52 = [c for c in comp_52.columns if str(c).startswith(column_to_drop_52)]
else:
    cols_to_drop_52 = [column_to_drop_52]

comp_53 = comp_52.drop(columns=cols_to_drop_52)
X_53 = comp_53.drop(columns='target')
y_53 = comp_53['target']

print(X_53.shape)

(8444, 17)


In [173]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [174]:
def objective(trial):
    """Return the validation ROC AUC of a random forest for one Optuna trial.

    Four integer hyperparameters are suggested through ``trial``; the forest is
    fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val`` using column 1 of ``predict_proba``.
    """
    # (name, low, high) triples; parameters are suggested in this fixed order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    hyperparams = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    forest = RandomForestClassifier(random_state=0, **hyperparams)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Seeded TPE search that maximizes the value returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined earlier in the notebook; at most 200 trials run.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [175]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 88, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 5}
0.7645552221127115


In [176]:
optuna_53 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [177]:
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(decimal.Decimal(auc_53).quantize(decimal.Decimal('1.000')))

0.753


In [178]:
# bootstrap_auc indexes these positionally (X_train[idx]), so plain arrays are
# needed. np.asarray yields the same arrays as `.values` but is a no-op when the
# cell is run again, instead of failing with AttributeError on an ndarray.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [179]:
auc_bootstrap = []

In [180]:
# Reset the list that bootstrap_auc appends to, so re-running this cell always
# yields exactly `nsamples` scores instead of extending an earlier run.
auc_bootstrap = []
rs = RandomState(seed = 53)  # notebook-level RNG read inside bootstrap_auc
# bootstrap_auc refits the estimator it receives; pass an unfitted clone (same
# hyperparameters and random_state) so optuna_53 stays the model fitted on the
# training split. The bootstrap scores themselves are unchanged.
bootstrap_auc(sklearn.base.clone(optuna_53), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7429219 , 0.75444039])

In [181]:
np.mean(auc_bootstrap)

0.749282742589011

In [182]:
t_53 = auc_bootstrap
print(t_53)

[0.75271444082519, 0.7490110670014477, 0.7516823878031126, 0.7441590040264204, 0.753506152732537, 0.7517120769996379, 0.7488576728193992, 0.74772524203764, 0.7460145788092652, 0.7468762723941368, 0.7492542356587043, 0.7531972437115453, 0.7492754422276511, 0.749942035378212, 0.747144182048498, 0.750614990499457, 0.7490810486789722, 0.7499604144046327, 0.7465072780944624, 0.7509988293973942, 0.7507351610568221, 0.7477259489232717, 0.7436832699963808, 0.7511826196615997, 0.7463814524520449, 0.7472120430691276, 0.753493428791169, 0.7517707485070576, 0.747876515562794, 0.7509210719779226, 0.7514561844010132, 0.7457127386445891, 0.7532742942453855, 0.7470077531216068, 0.7479889103782121, 0.7486936753528772, 0.7493397688201231, 0.7538433371787912, 0.7509705539721316, 0.7490244978284474, 0.7444064139974664, 0.7516526986065871, 0.7562248348715164, 0.754184056053203, 0.747044511174448, 0.743756786102063, 0.7485381605139341, 0.7507662640246109, 0.7566312941096633, 0.7514448742309084, 0.7451076445

In [183]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [184]:
column_to_drop_53 = '소득 중 사적이전소득의 비중(월평균)'

In [185]:
# Elimination step: build comp_54 by removing the feature named in
# column_to_drop_53 from comp_53, then split into predictors and target.
if column_to_drop_53.startswith('Cat_'):
    # A 'Cat_' feature may span several columns sharing that prefix (presumably
    # one-hot dummies), so drop all of them. Literal prefix matching replaces the
    # former unescaped regex, so characters like '(' or '/' in a name are safe.
    cols_to_drop_53 = [c for c in comp_53.columns if str(c).startswith(column_to_drop_53)]
else:
    cols_to_drop_53 = [column_to_drop_53]

comp_54 = comp_53.drop(columns=cols_to_drop_53)
X_54 = comp_54.drop(columns='target')
y_54 = comp_54['target']

print(X_54.shape)

(8444, 16)


In [186]:
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [187]:
def objective(trial):
    """Score one Optuna trial of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    ``RandomForestClassifier`` (``random_state=0``) on the module-level
    ``X_train`` / ``y_train`` and returns the ROC AUC of its positive-class
    probabilities on the module-level ``X_val`` / ``y_val``.
    """
    # (low, high) bounds per hyperparameter; dict order fixes the order in
    # which the trial is asked for suggestions.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    suggested = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **suggested)
    forest.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# objective returns a validation ROC AUC, so the study maximises it.
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never calls trial.report() / trial.should_prune(),
# so the Hyperband pruner appears to have no effect here - confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook; 10 is its
# early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run up to 200 trials, with the callback invoked by Optuna after each one.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [188]:
# Show the best trial's hyperparameters and keep its objective value
# (the validation ROC AUC returned by objective) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 83, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 5}
0.7560649521124021


In [189]:
# Rebuild the forest with the best trial's hyperparameters and fit it on the
# training split only (X_val / y_val are not added back in).
optuna_54 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [190]:
# ROC AUC of the refitted model on the held-out test split, printed
# quantised to three decimal places.
# NOTE(review): `decimal` is not among the imports in the notebook's first
# cell - confirm it is imported in an earlier cell.
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(decimal.Decimal(auc_54).quantize(decimal.Decimal('1.000')))

0.749


In [191]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so
# convert the pandas objects to NumPy arrays first. np.asarray returns an
# existing ndarray unchanged, which keeps this cell safe to re-run; `.values`
# would raise AttributeError on the second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [192]:
# Fresh accumulator: bootstrap_auc appends one AUC per resample to this
# module-level list rather than to a local one.
auc_bootstrap = []

In [193]:
# bootstrap_auc draws its resampling indices from the module-level `rs`,
# seeded here with this step's number.
rs = RandomState(seed = 54)
# Returns the 2.5th / 97.5th percentiles of the 2000 bootstrap test AUCs.
# Note: bootstrap_auc calls clf.fit on the object passed in, so afterwards
# optuna_54 is fitted on the last bootstrap resample, not on X_train itself.
bootstrap_auc(optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.73945302, 0.7495764 ])

In [194]:
# Mean of the AUC values that the preceding bootstrap_auc call appended to auc_bootstrap.
np.mean(auc_bootstrap)

0.7458346410717518

In [195]:
# Keep this step's bootstrap AUCs under their own name. This binds t_54 to the
# same list object (no copy); the next step assigns a brand-new list to
# auc_bootstrap instead of clearing this one, so t_54 keeps its values.
t_54 = auc_bootstrap
print(t_54)

[0.745220039359392, 0.7455409654361203, 0.7455763097176982, 0.7491220480456025, 0.7440254026420559, 0.7428929718602968, 0.7469745294969237, 0.7437991992399566, 0.7469533229279768, 0.7455833785740138, 0.7447195643322475, 0.7447492535287731, 0.7434217223127035, 0.7432563110749186, 0.7485989526782483, 0.7409518639160333, 0.7358665286825914, 0.7472671801483894, 0.7438797842019544, 0.7415781645855954, 0.7416969213716975, 0.7477068630112197, 0.7446389793702497, 0.7461206116539993, 0.7459537866449512, 0.7492747353420195, 0.7497497624864278, 0.7462209894136808, 0.7470169426348172, 0.7481536147303656, 0.7461573697068404, 0.748972188291712, 0.7370032007781397, 0.749597075190011, 0.7428194557546145, 0.7463567114549403, 0.7486682274701412, 0.7389669290626131, 0.7458604777415853, 0.7458958220231633, 0.7498105546507419, 0.7454702768729642, 0.7463213671733624, 0.7471752850162867, 0.7417902302750634, 0.7486385382736157, 0.7451691435939196, 0.7491135654180239, 0.7439844032754253, 0.7469462540716613, 0.

In [196]:
# Remove this step's split data, study and best validation score from the
# namespace; the next step assigns all of these names again.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [197]:
# Feature removed from comp_54 in the next cell to build comp_55. The 'Cat_'
# prefix makes that cell drop every column whose name starts with this string.
column_to_drop_54 = 'Cat_이사 계획 중인 거주 지역'

In [198]:
# Build the step-55 frame by removing the feature named in column_to_drop_54.
# A name beginning with 'Cat_' is treated as a prefix: every column whose name
# starts with it is removed (presumably the dummy columns of one categorical
# feature - confirm against the encoding step). The prefix is compared
# literally with str.startswith instead of being spliced into a regex, so
# characters such as '(' or ')' in a feature name cannot change what matches.
if column_to_drop_54.startswith('Cat_'):
    comp_55 = comp_54.loc[:, ~comp_54.columns.astype(str).str.startswith(column_to_drop_54)]
else:
    comp_55 = comp_54.drop(column_to_drop_54, axis=1)

# Features / label for this step.
X_55 = comp_55.drop('target', axis=1)
y_55 = comp_55['target']

print(X_55.shape)

(8444, 9)


In [199]:
# Stratified 80/20 split into train+validation and test, then a second
# stratified 80/20 split of the first part into train and validation
# (64% / 16% / 20% of the rows overall). Both splits use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [200]:
def objective(trial):
    """Score one Optuna trial of a random forest.

    Draws four integer hyperparameters from ``trial``, fits a
    ``RandomForestClassifier`` (``random_state=0``) on the module-level
    ``X_train`` / ``y_train`` and returns the ROC AUC of its positive-class
    probabilities on the module-level ``X_val`` / ``y_val``.
    """
    # (low, high) bounds per hyperparameter; dict order fixes the order in
    # which the trial is asked for suggestions.
    search_space = {
        'n_estimators': (50, 200),
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    suggested = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **suggested)
    forest.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    val_scores = forest.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# objective returns a validation ROC AUC, so the study maximises it.
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never calls trial.report() / trial.should_prune(),
# so the Hyperband pruner appears to have no effect here - confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook; 10 is its
# early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run up to 200 trials, with the callback invoked by Optuna after each one.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [201]:
# Show the best trial's hyperparameters and keep its objective value
# (the validation ROC AUC returned by objective) in optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 50, 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 7}
0.7246617814098002


In [202]:
# Rebuild the forest with the best trial's hyperparameters and fit it on the
# training split only (X_val / y_val are not added back in).
optuna_55 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [203]:
# ROC AUC of the refitted model on the held-out test split, printed
# quantised to three decimal places.
# NOTE(review): `decimal` is not among the imports in the notebook's first
# cell - confirm it is imported in an earlier cell.
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(decimal.Decimal(auc_55).quantize(decimal.Decimal('1.000')))

0.721


In [204]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so
# convert the pandas objects to NumPy arrays first. np.asarray returns an
# existing ndarray unchanged, which keeps this cell safe to re-run; `.values`
# would raise AttributeError on the second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [205]:
# Fresh accumulator: bootstrap_auc appends one AUC per resample to this
# module-level list rather than to a local one.
auc_bootstrap = []

In [206]:
# bootstrap_auc draws its resampling indices from the module-level `rs`,
# seeded here with this step's number.
rs = RandomState(seed = 55)
# Returns the 2.5th / 97.5th percentiles of the 2000 bootstrap test AUCs.
# Note: bootstrap_auc calls clf.fit on the object passed in, so afterwards
# optuna_55 is fitted on the last bootstrap resample, not on X_train itself.
bootstrap_auc(optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71437084, 0.72131529])

In [207]:
# Mean of the AUC values that the preceding bootstrap_auc call appended to auc_bootstrap.
np.mean(auc_bootstrap)

0.7188125537233079

In [208]:
# Keep this step's bootstrap AUCs under their own name. This binds t_55 to the
# same list object as auc_bootstrap (no copy is made).
t_55 = auc_bootstrap
print(t_55)

[0.7163529508233804, 0.7213152879569308, 0.7198562760133912, 0.7206225400380021, 0.7201814434039087, 0.7213322532120883, 0.7206225400380021, 0.7212417718512486, 0.7199297921190734, 0.7213152879569308, 0.7212304616811436, 0.7212643921914584, 0.7148600083695258, 0.7201220650108576, 0.7136498201682953, 0.7210494989594642, 0.7206225400380021, 0.7163529508233804, 0.7151597278773072, 0.7143708435124864, 0.712549906125588, 0.7162850898027505, 0.7162794347176981, 0.7152332439829894, 0.7199227232627577, 0.7149674549855229, 0.7150409710912051, 0.7161182647937024, 0.7212248065960911, 0.7210636366720955, 0.7151597278773072, 0.7160871618259138, 0.7162850898027505, 0.7199297921190734, 0.7201814434039087, 0.7212474269363011, 0.7180918894770176, 0.7163699160785377, 0.7172012135812522, 0.716101299538545, 0.7201955811165399, 0.7201814434039087, 0.7161239198787549, 0.7162850898027505, 0.7213152879569308, 0.7201220650108576, 0.7162568143774881, 0.7205037832519, 0.7211230150651464, 0.7201220650108576, 0.72

In [209]:
# Remove this step's split data, study and best validation score from the
# namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc