In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000):
    for b in range(nsamples):
        idx = rs.randint(X_train.shape[0], size=X_train.shape[0])
        clf.fit(X_train[idx], y_train[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test.ravel(), pred.ravel())
        auc_bootstrap.append(roc_auc)
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [4]:
import decimal
context = decimal.getcontext()

context.rounding = decimal.ROUND_HALF_UP

In [5]:
class EarlyStoppingCallback(object):
    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [6]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [7]:
중장년가구 = pd.read_csv('중장년가구_변수추가.csv', encoding='cp949')
중장년가구.rename(columns = {'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?':'target'}, inplace=True)

In [8]:
중장년가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [9]:
from scipy import stats

In [10]:
cat = 중장년가구.select_dtypes(include = 'object')
num = 중장년가구.select_dtypes(exclude = 'object')
num_중장년 = num.drop('target',axis=1)
target = 중장년가구.target

In [11]:
scaler=RobustScaler()
scaler.fit(num_중장년)
num_scaled_중장년=scaler.transform(num_중장년)
num_df_scaled_중장년=pd.DataFrame(data=num_scaled_중장년, columns=num_중장년.columns)

In [12]:
enc = OneHotEncoder()
enc.fit(cat)

X_cat = enc.transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(X_cat, columns= new_feature_names)

In [13]:
comp = pd.concat([num_df_scaled_중장년, target,cat2],axis=1)

In [14]:
X=comp.drop('target', axis = 1)
y=comp.target
X.shape

(19949, 214)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [16]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [17]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 69, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7860011962319932


In [18]:
optuna_0 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_0.fit(X_train, y_train)

In [19]:
optuna_proba_0 = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_proba_0)
print(auc_0)

0.790380193705317


In [20]:
X_train = X_train.values
y_train = y_train.values

In [21]:
auc_bootstrap = []

In [22]:
rs = RandomState(seed = 0)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78048973, 0.78994048])

In [23]:
t_0 = auc_bootstrap
print(t_0)

[0.7848922110005854, 0.7851502980813325, 0.783008730053065, 0.7872802429452677, 0.7841990477458458, 0.7845226472073762, 0.7832824027405307, 0.7880735239109624, 0.7892411236007295, 0.7891999141999142, 0.7871986166321141, 0.7817510508397207, 0.7843097319698304, 0.779973235022496, 0.7872300520083773, 0.7868858478464389, 0.7866515354200083, 0.7886113594241673, 0.7882402106540037, 0.7846172174989909, 0.7812269517934542, 0.7877105641883967, 0.78398428336852, 0.7847664694955336, 0.7869835880919624, 0.7882909299165457, 0.7866972355888612, 0.7808127444826952, 0.7811038519166598, 0.785537824946692, 0.784250559496865, 0.7820078171063393, 0.786934453806375, 0.7828227594237447, 0.7871893709332132, 0.7828951400379972, 0.785311965730685, 0.7828504965204472, 0.786237592272075, 0.785500313825437, 0.7810351695819676, 0.7844761545500462, 0.785022179110849, 0.7846792957630396, 0.7843226759482917, 0.7851621854084909, 0.7862093268497209, 0.784136969481797, 0.7840130771165252, 0.7878046061543599, 0.786378391

In [24]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [15]:
column_to_drop = '부채 중 임대 보증금의 비중'

In [16]:
if not column_to_drop.startswith('Cat_'):
    comp_1 = comp.drop(column_to_drop, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']
else:
    comp_1 = comp.drop(comp.filter(regex='^' + column_to_drop).columns, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']

print(X_1.shape)

(19949, 213)


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [28]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [29]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 174, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7853605640535176


In [30]:
optuna_1 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [31]:
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(auc_1)

0.7923276020566661


In [32]:
X_train = X_train.values
y_train = y_train.values

In [33]:
auc_bootstrap = []

In [34]:
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78404361, 0.79177603])

In [35]:
t_1 = auc_bootstrap
print(t_1)

[0.7884259171204984, 0.7900410086370677, 0.786538473730592, 0.7854987288484825, 0.787000230349984, 0.7879852935271655, 0.7887989150304421, 0.7863717869875506, 0.7911816637186095, 0.7851130511229033, 0.7869516243900481, 0.7840439841671368, 0.7871301984602478, 0.7878584953708105, 0.7863474840075826, 0.7895092488688548, 0.7916283630569345, 0.7863004630246009, 0.7888311428951824, 0.7871478973695722, 0.7885461112062097, 0.7877528302405149, 0.7851246742872359, 0.78956472306226, 0.7854054793709967, 0.7867286709651242, 0.7865561726399165, 0.7928738907802947, 0.7855040121049973, 0.7887714420965652, 0.7870517421010033, 0.7863591071719149, 0.7877742274293998, 0.7848473033202099, 0.7852081497401695, 0.789208367410338, 0.7851207118448498, 0.7869542660183053, 0.7882380973513978, 0.7899540990673997, 0.786251857064665, 0.7874302874302874, 0.7897271832000896, 0.7892738797911212, 0.7875914267539883, 0.78486526639236, 0.7876355419458868, 0.7860600748531783, 0.7907521349639575, 0.7910992449169789, 0.78714

In [36]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [17]:
column_to_drop_1 = 'Cat_가구주 동거 여부'

In [18]:
if not column_to_drop_1.startswith('Cat_'):
    comp_2 = comp_1.drop(column_to_drop_1, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']
else:
    comp_2 = comp_1.drop(comp_1.filter(regex='^' + column_to_drop_1).columns, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']

print(X_2.shape)

(19949, 211)


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [40]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [41]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7849387044617637


In [42]:
optuna_2 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [43]:
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(auc_2)

0.7900845955033147


In [44]:
X_train = X_train.values
y_train = y_train.values

In [45]:
auc_bootstrap = []

In [46]:
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78345075, 0.79157517])

In [47]:
t_2 = auc_bootstrap
print(t_2)

[0.7898199043519241, 0.787643466830659, 0.7905009161166796, 0.7879253285657228, 0.7887479316050744, 0.7898138286069321, 0.7880254462766778, 0.7859665612128667, 0.7868449026084494, 0.7864740180011116, 0.7886100386100385, 0.7901831282373154, 0.7845170997880357, 0.7857021342243018, 0.7887038164131761, 0.7857401736712082, 0.7860333944077788, 0.7845384969769207, 0.7885284122968852, 0.7899591181610885, 0.7894405665341626, 0.7887600830950585, 0.7865609275707798, 0.7881424704084802, 0.7827268683180013, 0.7867714653428939, 0.7882832691945993, 0.7889452612359016, 0.7879599338958945, 0.786255819507051, 0.7898228101430071, 0.7859261443005285, 0.7877187532359947, 0.7877565285200753, 0.7872628081987687, 0.7877974737580649, 0.784327430879155, 0.7867978816254678, 0.7885947171661456, 0.7883020247552267, 0.7885653950924886, 0.7891587047990989, 0.7885836223274647, 0.7879274418683285, 0.7857026625499531, 0.788838539454303, 0.78912330698045, 0.7907243978672551, 0.789436075766125, 0.7857277580183984, 0.7901

In [48]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [19]:
column_to_drop_2 = '소득 중 재산소득의 비중(월평균)'

In [20]:
if not column_to_drop_2.startswith('Cat_'):
    comp_3 = comp_2.drop(column_to_drop_2, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']
else:
    comp_3 = comp_2.drop(comp_2.filter(regex='^' + column_to_drop_2).columns, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']

print(X_3.shape)

(19949, 210)


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [52]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [53]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7878809893639372


In [54]:
optuna_3 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [55]:
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(auc_3)

0.7924850431008067


In [56]:
X_train = X_train.values
y_train = y_train.values

In [57]:
auc_bootstrap = []

In [58]:
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78323948, 0.79147148])

In [59]:
t_3 = auc_bootstrap
print(t_3)

[0.7883754620207823, 0.7845834046572964, 0.790642507391276, 0.7859034262975149, 0.7860463383862398, 0.7822056750628178, 0.7881554143869414, 0.7892363686698661, 0.7874112677068341, 0.7894202259965806, 0.7879461974289561, 0.7906258651332543, 0.7852232070212365, 0.7856894544086661, 0.786293330628306, 0.7853951770207928, 0.7892984469339148, 0.7887814802839433, 0.7830121641697996, 0.7869352462948522, 0.7894363399289507, 0.7890696819268248, 0.7886414739863017, 0.7888409169197347, 0.7879398575211384, 0.788574112465738, 0.7848673796949659, 0.7894580012806613, 0.7861313988161279, 0.7847403173757853, 0.7877615476137643, 0.787562368843157, 0.7885352805303544, 0.7864711122100285, 0.7855203902001932, 0.7898579437988305, 0.7868327511184654, 0.7887318176727044, 0.7865255297521307, 0.7879406500096156, 0.7857864021657125, 0.7858038369122113, 0.7880288803934123, 0.7894706810962969, 0.7887981225419649, 0.7924792315186404, 0.7876239187815544, 0.7874897240660788, 0.7846703142269644, 0.7864494508583179, 0.7

In [60]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [21]:
column_to_drop_3 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [22]:
if not column_to_drop_3.startswith('Cat_'):
    comp_4 = comp_3.drop(column_to_drop_3, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']
else:
    comp_4 = comp_3.drop(comp_3.filter(regex='^' + column_to_drop_3).columns, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']

print(X_4.shape)

(19949, 209)


In [63]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [64]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [65]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.7890268623842001


In [66]:
optuna_4 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [67]:
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(auc_4)

0.7913272174355919


In [68]:
X_train = X_train.values
y_train = y_train.values

In [69]:
auc_bootstrap = []

In [70]:
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78274977, 0.79137045])

In [71]:
t_4 = auc_bootstrap
print(t_4)

[0.7878827983507786, 0.7847218259779836, 0.787075252592494, 0.7869016976159833, 0.7889426196076442, 0.7852448683729472, 0.7859636554217835, 0.7864499791839693, 0.7889883197764971, 0.7857243239016638, 0.7852707563298695, 0.7835795859194874, 0.7894022629244305, 0.7861535884934898, 0.7864843203513154, 0.7905138600951408, 0.7860891327640096, 0.7885379221586118, 0.7917847474497721, 0.7899591181610887, 0.787155293928693, 0.7861266438852645, 0.7903170587899652, 0.7837526125703466, 0.7862299315501287, 0.7864829995371867, 0.7850594260692783, 0.7850747475131712, 0.7909235766378624, 0.7868559974471304, 0.7877546793802952, 0.787848985509084, 0.7886007929111378, 0.7861810614273669, 0.7864193362961835, 0.791044563212051, 0.7879134412385644, 0.7861427578176347, 0.7896957478238267, 0.7839026570553664, 0.7834253148292557, 0.7900822180378831, 0.7908485543953524, 0.7936309814388631, 0.7916698366205756, 0.7877298480746756, 0.7880552966759864, 0.7912062308614032, 0.7877039601177532, 0.7854773316595977, 0.7

In [72]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [23]:
column_to_drop_4 = 'Cat_가구주 주민등록상 등재 여부'

In [24]:
if not column_to_drop_4.startswith('Cat_'):
    comp_5 = comp_4.drop(column_to_drop_4, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']
else:
    comp_5 = comp_4.drop(comp_4.filter(regex='^' + column_to_drop_4).columns, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']

print(X_5.shape)

(19949, 207)


In [75]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [76]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [77]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 111, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 5}
0.7862335905080766


In [78]:
optuna_5 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [79]:
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(auc_5)

0.792832153053828


In [80]:
X_train = X_train.values
y_train = y_train.values

In [81]:
auc_bootstrap = []

In [82]:
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.784947  , 0.79263108])

In [83]:
t_5 = auc_bootstrap
print(t_5)

[0.7886348699156582, 0.7882209267677248, 0.7884858820819411, 0.7893594685466606, 0.7892606716498342, 0.7886800417588595, 0.7927914719786642, 0.7879456691033045, 0.7845210622304217, 0.7902264509407367, 0.7884776930343432, 0.7937723085506336, 0.7881403571058743, 0.7884370119591795, 0.7859351258366036, 0.7925893874169736, 0.7904776697880147, 0.7905608810781224, 0.7894046403898621, 0.7879517448482967, 0.7928802306881124, 0.7886216617743712, 0.7897361647361647, 0.7885646026040115, 0.7916103999847842, 0.7893996212961731, 0.7917300657448442, 0.7904544234593496, 0.7920763832093881, 0.7907442100791854, 0.791026335977075, 0.7924868922405869, 0.7883746695323051, 0.7897380138759449, 0.7929721593514698, 0.790181543260361, 0.7882161718368614, 0.7859073887399011, 0.7882930432191515, 0.7851331274976595, 0.7876733172299675, 0.7904623483441218, 0.7910358458388015, 0.7845020425069685, 0.7873240939743402, 0.7867638046209475, 0.7868279961876021, 0.7894709452591224, 0.788571206674655, 0.7884555033569811, 0.

In [84]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [25]:
column_to_drop_5 = '소득 중 사적이전소득의 비중(월평균)'

In [26]:
if not column_to_drop_5.startswith('Cat_'):
    comp_6 = comp_5.drop(column_to_drop_5, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']
else:
    comp_6 = comp_5.drop(comp_5.filter(regex='^' + column_to_drop_5).columns, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']

print(X_6.shape)

(19949, 206)


In [87]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [88]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [89]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7859017166218045


In [90]:
optuna_6 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [91]:
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(auc_6)

0.7927851320708463


In [92]:
X_train = X_train.values
y_train = y_train.values

In [93]:
auc_bootstrap = []

In [94]:
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7849701 , 0.79276953])

In [95]:
t_6 = auc_bootstrap
print(t_6)

[0.7863754852671109, 0.7916941396005435, 0.7901464096045376, 0.788366480484707, 0.7866887823784375, 0.7872184288440445, 0.7888686540164372, 0.7888768430640352, 0.7860410551297251, 0.7912429494941811, 0.7892054616192548, 0.786405863992071, 0.7883921042788038, 0.7905759383591896, 0.7871254435293844, 0.7905260115851249, 0.787480214204352, 0.7860209787549689, 0.7896674824014727, 0.7902771702032787, 0.7870290240979896, 0.7934981075375164, 0.7856670005684785, 0.7892197264118447, 0.7883075721745674, 0.7896521609575797, 0.7853787989255969, 0.7861010200911678, 0.790516765886224, 0.7873660958636328, 0.7903038506486783, 0.7927077323629047, 0.7914701295243167, 0.7913134809686535, 0.792060269277018, 0.7905595602639937, 0.7862745750676785, 0.7877282630977213, 0.7883099496399989, 0.7892302929248742, 0.7876902236508148, 0.7888300862438793, 0.7866129676474505, 0.7891766678712491, 0.7929063828078606, 0.7869360387833294, 0.7903558907253488, 0.7876379194113184, 0.7884335778424449, 0.787589841777034, 0.789

In [96]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [27]:
column_to_drop_6 = 'Cat_가구주 장애 여부'

In [28]:
if not column_to_drop_6.startswith('Cat_'):
    comp_7 = comp_6.drop(column_to_drop_6, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']
else:
    comp_7 = comp_6.drop(comp_6.filter(regex='^' + column_to_drop_6).columns, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']

print(X_7.shape)

(19949, 204)


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [30]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [31]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 197, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 6}
0.7870686413438086


In [32]:
optuna_7 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [33]:
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(auc_7)

0.7922708070491322


In [34]:
X_train = X_train.values
y_train = y_train.values

In [35]:
auc_bootstrap = []

In [36]:
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78578622, 0.79266152])

In [37]:
t_7 = auc_bootstrap
print(t_7)

[0.7902779626917558, 0.7871513314863069, 0.7884113881650827, 0.7889291473035316, 0.7877433203787885, 0.7916748557142645, 0.7886097744472129, 0.7897086918022879, 0.787099555572462, 0.7852871344250655, 0.7886549462904142, 0.7910184110923028, 0.7890173776873284, 0.7893599968723122, 0.7872316369853316, 0.7886723810369131, 0.7882959490102348, 0.7895013239840825, 0.7906118645034901, 0.7884634282417533, 0.7901802224462323, 0.7896146498363248, 0.788749516582029, 0.7863149919800166, 0.7874178717774777, 0.7889566202374083, 0.7879607263843716, 0.7898341691445139, 0.7894273583928757, 0.7900964828304728, 0.7892207830631477, 0.7915763229802639, 0.7887489882563774, 0.7909410113843611, 0.7865891929931339, 0.7901144459026233, 0.788728119393144, 0.7908126282510518, 0.7907980992956363, 0.7904385736898052, 0.7877950962926332, 0.7944416971510568, 0.7892928995145744, 0.7889228073957139, 0.7916397220584414, 0.7911214345943409, 0.7880766938648711, 0.7901339939517279, 0.7901664859792938, 0.7914236368669867, 0.

In [38]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [34]:
column_to_drop_7 = '부채 중 비금융기관 대출금의 비중'

In [35]:
if not column_to_drop_7.startswith('Cat_'):
    comp_8 = comp_7.drop(column_to_drop_7, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']
else:
    comp_8 = comp_7.drop(comp_7.filter(regex='^' + column_to_drop_7).columns, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']

print(X_8.shape)

(19949, 203)


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [37]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [43]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7855417737998777


In [38]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7855417737998777


In [39]:
optuna_8 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [40]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(auc_8)

0.7907386626598449


In [41]:
X_train = X_train.values
y_train = y_train.values

In [42]:
auc_bootstrap = []

In [43]:
rs = RandomState(seed = 8)
bootstrap_auc(optuna_8, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78304983, 0.79142022])

In [44]:
t_8 = auc_bootstrap
print(t_8)

[0.7874041353105392, 0.7887244211135837, 0.7860104122419393, 0.7870699693359793, 0.7864045431779423, 0.7880719389340078, 0.7916730065744844, 0.789076550160294, 0.7883810094401227, 0.7874154943120462, 0.7864510358352723, 0.7858442538245494, 0.7871494823465268, 0.7899715338138983, 0.7875179894884328, 0.7883556498088518, 0.7888926928335797, 0.7875169328371299, 0.7884570883339357, 0.7888781638781639, 0.7920637033937526, 0.7854781241480749, 0.7900151206801453, 0.7894418873482913, 0.7868150522091408, 0.7866705551434615, 0.7874133810094401, 0.7880299370447155, 0.7861887221493133, 0.7852784170518161, 0.7883308185032323, 0.7868290528389051, 0.7874413822689685, 0.7855748077422955, 0.7882771934496072, 0.7868742246821065, 0.7869542660183054, 0.7892596149985311, 0.7896252163493542, 0.7881237148478528, 0.7867173119636174, 0.778921074487577, 0.7854987288484826, 0.7859314275570434, 0.7858730475725549, 0.7906649612314638, 0.7872995268315466, 0.7887011747849186, 0.7915116030879578, 0.7873262072769462, 0

In [45]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [31]:
column_to_drop_8 = '자산 중 부동산 자산의 비중'

In [32]:
if not column_to_drop_8.startswith('Cat_'):
    comp_9 = comp_8.drop(column_to_drop_8, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']
else:
    comp_9 = comp_8.drop(comp_8.filter(regex='^' + column_to_drop_8).columns, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']

print(X_9.shape)

(19949, 202)


In [52]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [53]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [54]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 176, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7871297325567044


In [55]:
optuna_9 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [56]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(auc_9)

0.7937955548792988


In [57]:
X_train = X_train.values
y_train = y_train.values

In [58]:
auc_bootstrap = []

In [59]:
rs = RandomState(seed = 9)
bootstrap_auc(optuna_9, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.786811  , 0.79348915])

In [60]:
t_9 = auc_bootstrap
print(t_9)

[0.79262742686388, 0.7882359840487919, 0.7920063800605672, 0.7901823357488382, 0.7868689414255916, 0.7888509551071127, 0.789339920497556, 0.7899805153499735, 0.7909727109234498, 0.7893177308201939, 0.7900531601270517, 0.7909531628743451, 0.7917836907984691, 0.7937709877365049, 0.7897012952431671, 0.7890083961512532, 0.7915306228114111, 0.789446113953503, 0.7897150317101055, 0.7921236683551954, 0.7898685103118601, 0.7886578520814974, 0.7925767076013381, 0.7904295921537301, 0.7897926955808728, 0.7914468831956516, 0.7885101850619093, 0.7904097799417997, 0.7903532490970915, 0.7897395988528992, 0.7930255202422691, 0.7892149714809813, 0.7910757344254881, 0.7904597067158644, 0.7916550435023341, 0.7914532231034694, 0.7887310251842271, 0.7907220204018233, 0.7901995063325112, 0.7894680394680395, 0.7927101098283365, 0.7898693028003372, 0.7899440608800214, 0.7922179744839843, 0.7945291350463765, 0.7910704511689733, 0.7921960489694481, 0.7918204094312469, 0.7904528384823951, 0.7910236943488174, 0.7

In [61]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [33]:
column_to_drop_9 = 'Cat_이사 예상 기간'

In [34]:
if not column_to_drop_9.startswith('Cat_'):
    comp_10 = comp_9.drop(column_to_drop_9, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']
else:
    comp_10 = comp_9.drop(comp_9.filter(regex='^' + column_to_drop_9).columns, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']

print(X_10.shape)

(19949, 198)


In [64]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [65]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [66]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7861052164053023


In [67]:
optuna_10 = RandomForestClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [68]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(auc_10)

0.7916804031336051


In [69]:
X_train = X_train.values
y_train = y_train.values

In [70]:
auc_bootstrap = []

In [71]:
rs = RandomState(seed = 10)
bootstrap_auc(optuna_10, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78384555, 0.79198519])

In [72]:
t_10 = auc_bootstrap
print(t_10)

[0.7875230085821219, 0.7876577316232488, 0.7863604279860438, 0.7864111472485857, 0.7858400272193375, 0.789221311388799, 0.7892987110967407, 0.788940770467864, 0.7883110062913019, 0.7847202410010292, 0.787808568596746, 0.7870063060949761, 0.7898267725853934, 0.7895433258733751, 0.787227146217294, 0.7881163182887321, 0.7876548258321658, 0.7895581189916164, 0.7870290240979896, 0.7867225952201322, 0.7873029609482811, 0.7873734924227536, 0.7883749336951308, 0.7917543687248121, 0.7918494673420782, 0.78703272237755, 0.7883685937873129, 0.7853246455463204, 0.7911401901549684, 0.7852493591409848, 0.7931015991360819, 0.7896606141680034, 0.7867828243444007, 0.785057576929498, 0.7905421255174949, 0.7894910216338786, 0.7906760560701447, 0.7874252683365983, 0.7879757836654387, 0.790966371015632, 0.7847495630746864, 0.7859927133326148, 0.7865416436845007, 0.7906221668536939, 0.7895824219715846, 0.7900996527843819, 0.7857034550384305, 0.7914249576811152, 0.7880642782120616, 0.7892641057665688, 0.78276

In [73]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [35]:
column_to_drop_10 = 'Cat_현재 공공기관 접근용이성'

In [36]:
if not column_to_drop_10.startswith('Cat_'):
    comp_11 = comp_10.drop(column_to_drop_10, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']
else:
    comp_11 = comp_10.drop(comp_10.filter(regex='^' + column_to_drop_10).columns, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']

print(X_11.shape)

(19949, 194)


In [76]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [77]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [78]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 3}
0.785552506039981


In [79]:
optuna_11 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [80]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(auc_11)

0.7924126624865542


In [81]:
X_train = X_train.values
y_train = y_train.values

In [82]:
auc_bootstrap = []

In [83]:
rs = RandomState(seed = 11)
bootstrap_auc(optuna_11, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.785661 , 0.7926559])

In [84]:
t_11 = auc_bootstrap
print(t_11)

[0.7876770155095278, 0.7890202834784115, 0.7883115346169534, 0.7886200767974167, 0.7914408074506597, 0.7906739427675389, 0.7885358088560059, 0.7899297960874314, 0.7875660671227174, 0.7919659631482292, 0.788484032942161, 0.7880846187496434, 0.7897808082537148, 0.7862344223181662, 0.7882111527431724, 0.7915271886946764, 0.7864412618107199, 0.7883627822051468, 0.7869323405037691, 0.786948718598965, 0.7899107763639783, 0.788090166168984, 0.7867579930387812, 0.790711189725968, 0.7898917566405251, 0.7890572662740151, 0.792931478276306, 0.7910125995101365, 0.7874014936822819, 0.7901421829993258, 0.7889243923726683, 0.7911100755928342, 0.7871756344662748, 0.7918534297844643, 0.7867355391985935, 0.7865614558964313, 0.7878106818993519, 0.7893919605742266, 0.7864911885847846, 0.7920520802294202, 0.7922856001673736, 0.7891484024488952, 0.7893053151673841, 0.7874017578451076, 0.7941933840948618, 0.788361461391018, 0.7866967072632096, 0.7914368450082736, 0.7860801512279344, 0.787380096493397, 0.7907

In [85]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [37]:
column_to_drop_11 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [38]:
if not column_to_drop_11.startswith('Cat_'):
    comp_12 = comp_11.drop(column_to_drop_11, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']
else:
    comp_12 = comp_11.drop(comp_11.filter(regex='^' + column_to_drop_11).columns, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']

print(X_12.shape)

(19949, 190)


In [88]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [89]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [90]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7848276670545409


In [91]:
optuna_12 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [92]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(auc_12)

0.7893552419414488


In [93]:
X_train = X_train.values
y_train = y_train.values

In [94]:
auc_bootstrap = []

In [95]:
rs = RandomState(seed = 12)
bootstrap_auc(optuna_12, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78395744, 0.7917869 ])

In [96]:
t_12 = auc_bootstrap
print(t_12)

[0.7875253860475535, 0.7831614161663423, 0.7878534762771217, 0.7891172312354577, 0.7887154395775086, 0.7852649447477034, 0.7920660808591843, 0.7893420338001619, 0.7864206571103123, 0.7900996527843819, 0.7875829735435648, 0.7874041353105392, 0.7903424184212361, 0.7851252026128873, 0.7875364808862347, 0.7860397343155965, 0.789233462878783, 0.7853647982958328, 0.7870929515018186, 0.7866768950512792, 0.7869613984146003, 0.7862508004133619, 0.7869156982457475, 0.7852063006003892, 0.7887389500689994, 0.7881606976434562, 0.7871867293049559, 0.7881205448939439, 0.7887828010980721, 0.7878709110236204, 0.7880344278127529, 0.7864581682315672, 0.7881900197171133, 0.7851241459615843, 0.7877874355706869, 0.7871830310253956, 0.7867931266946044, 0.7855283150849653, 0.7910680737035417, 0.786735803361419, 0.7856094130724672, 0.7915565107683333, 0.7873185465549997, 0.7922663162810946, 0.789245614368767, 0.7883289693634521, 0.789650840143451, 0.7899403626004611, 0.7898209610032271, 0.789586912739622, 0.78

In [97]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [39]:
column_to_drop_12 = '부채 중 금융기관 대출금의 비중'

In [40]:
if not column_to_drop_12.startswith('Cat_'):
    comp_13 = comp_12.drop(column_to_drop_12, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']
else:
    comp_13 = comp_12.drop(comp_12.filter(regex='^' + column_to_drop_12).columns, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']

print(X_13.shape)

(19949, 189)


In [100]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [101]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [102]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 164, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 6}
0.7865856905391588


In [103]:
optuna_13 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [104]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(auc_13)

0.7907542482665636


In [105]:
X_train = X_train.values
y_train = y_train.values

In [106]:
auc_bootstrap = []

In [107]:
rs = RandomState(seed = 13)
bootstrap_auc(optuna_13, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78495431, 0.79225293])

In [108]:
t_13 = auc_bootstrap
print(t_13)

[0.7891071930480797, 0.7904834813701809, 0.7911940793714193, 0.790380986193794, 0.7918986016276657, 0.7895406842451176, 0.7909801074825706, 0.7900948978535186, 0.7885749049542153, 0.791042449909445, 0.7887672154913534, 0.79132299083038, 0.7896236313723999, 0.7901596177458247, 0.7900682174081188, 0.7919807562664706, 0.7872538266626936, 0.787850042160387, 0.7854968797087024, 0.7916555718279857, 0.7877021109779729, 0.7894421515111171, 0.7893058434930356, 0.7886406814978243, 0.7860767171112, 0.7864626589996049, 0.788568036720746, 0.789892813291828, 0.7918565997383731, 0.7912730640563153, 0.7892561808817966, 0.7836588347672092, 0.7868655073088571, 0.7872266178916425, 0.7902774343661043, 0.7884642207302307, 0.7879248002400712, 0.7863522389384457, 0.7881039026359223, 0.788761403909187, 0.786642818046759, 0.788043409348828, 0.7884169355844233, 0.7873127349728335, 0.7867513889681378, 0.7900122148890623, 0.7882544754465937, 0.7860587540390496, 0.7871043105033253, 0.7877850581052551, 0.7862122326

In [109]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [41]:
column_to_drop_13 = 'Cat_현재 상업시설 접근용이성'

In [42]:
if not column_to_drop_13.startswith('Cat_'):
    comp_14 = comp_13.drop(column_to_drop_13, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']
else:
    comp_14 = comp_13.drop(comp_13.filter(regex='^' + column_to_drop_13).columns, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']

print(X_14.shape)

(19949, 185)


In [112]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [113]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [114]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7876044277920439


In [115]:
optuna_14 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [116]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(auc_14)

0.7907516066383061


In [117]:
X_train = X_train.values
y_train = y_train.values

In [118]:
auc_bootstrap = []

In [119]:
rs = RandomState(seed = 14)
bootstrap_auc(optuna_14, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78457891, 0.79260229])

In [120]:
t_14 = auc_bootstrap
print(t_14)

[0.7889188449533279, 0.7879887276439002, 0.790040744474242, 0.7841565175309018, 0.7887027597618731, 0.7897580902507011, 0.7869669458339409, 0.7893071643071643, 0.7895927243217884, 0.7891399492384714, 0.7873243581371661, 0.7884914295012817, 0.789502908961037, 0.7934431616697627, 0.7914228443785094, 0.7912894421515111, 0.7884502201004664, 0.7884903728499788, 0.7891293827254419, 0.7914896775734214, 0.7884753155689117, 0.7897559769480952, 0.7923434518262105, 0.7875713503792321, 0.7876601090886804, 0.7863263509815234, 0.7922982799830091, 0.7848222078517646, 0.7870110610258394, 0.7891201370265409, 0.7922187669724615, 0.7880198988573374, 0.7917604444698041, 0.7916624400614548, 0.7883812736029485, 0.7879567639419855, 0.786180797264541, 0.7915699830724461, 0.789239538623775, 0.7919461509362988, 0.7861028692309482, 0.7865794189685815, 0.7851236176359329, 0.78970050275469, 0.7872599024076856, 0.7857760998155087, 0.7870948006415988, 0.7857790056065919, 0.785807271028946, 0.7885466395318613, 0.7908

In [121]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [43]:
column_to_drop_14 = '소득 중 정부 보조금의 비중(월평균)'

In [44]:
if not column_to_drop_14.startswith('Cat_'):
    comp_15 = comp_14.drop(column_to_drop_14, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']
else:
    comp_15 = comp_14.drop(comp_14.filter(regex='^' + column_to_drop_14).columns, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']

print(X_15.shape)

(19949, 184)


In [124]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [125]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [126]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 117, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7869088960776551


In [127]:
optuna_15 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [128]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(auc_15)

0.7921072902599997


In [129]:
X_train = X_train.values
y_train = y_train.values

In [130]:
auc_bootstrap = []

In [131]:
rs = RandomState(seed = 15)
bootstrap_auc(optuna_15, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78482636, 0.79250269])

In [132]:
t_15 = auc_bootstrap
print(t_15)

[0.7916259855915029, 0.788406104908568, 0.7869014334531577, 0.7889280906522287, 0.7870712901501078, 0.7912654033343689, 0.7872839412248279, 0.7900291213099095, 0.7925183276168498, 0.7898780201735867, 0.7902391307563722, 0.7882885524511141, 0.7920063800605672, 0.7916666666666667, 0.7895578548287908, 0.7860215070806204, 0.7851981115527913, 0.7857309279723073, 0.7875557647725135, 0.7874963281367222, 0.7888876737398907, 0.791795049799976, 0.7867841451585296, 0.7877874355706868, 0.787211032284924, 0.7920452119959509, 0.7887421200229082, 0.787556293098165, 0.7893710917109933, 0.787843966415395, 0.790392345195301, 0.7891090421878599, 0.7880555608388121, 0.7887846502378522, 0.7879739345256587, 0.7895494016183671, 0.7913232549932057, 0.791847618202298, 0.7896553309114885, 0.7859253518120513, 0.7907706263617593, 0.787122801901127, 0.7892176131092388, 0.7912791398013073, 0.785801195283954, 0.7866351573248125, 0.7902380741050691, 0.7866016086459435, 0.7876075406863585, 0.7890102452910335, 0.790351

In [133]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [45]:
column_to_drop_15 = 'Cat_현재 교육환경'

In [46]:
if not column_to_drop_15.startswith('Cat_'):
    comp_16 = comp_15.drop(column_to_drop_15, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']
else:
    comp_16 = comp_15.drop(comp_15.filter(regex='^' + column_to_drop_15).columns, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']

print(X_16.shape)

(19949, 180)


In [136]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [137]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [138]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7848470676424198


In [139]:
optuna_16 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [140]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(auc_16)

0.7916209664978138


In [141]:
X_train = X_train.values
y_train = y_train.values

In [142]:
auc_bootstrap = []

In [143]:
rs = RandomState(seed = 16)
bootstrap_auc(optuna_16, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78306964, 0.79153835])

In [144]:
t_16 = auc_bootstrap
print(t_16)

[0.7898777560107609, 0.7845007216928399, 0.7889476387013332, 0.7871243868780815, 0.7877034317921018, 0.7866544412110914, 0.7887743478876483, 0.7863517106127944, 0.7910831309846088, 0.7906755277444933, 0.7876619582284606, 0.790279811831536, 0.7881245073363301, 0.7842759191281359, 0.7871386516706714, 0.7851595437802334, 0.7870205708875659, 0.7877829448026492, 0.7849490060081193, 0.7903994775915958, 0.7874508921306951, 0.7870374773084132, 0.78697196492763, 0.7816295359398808, 0.7881831514836442, 0.7848401709239148, 0.7871962391666826, 0.7866396480928501, 0.7898772276851094, 0.7898560946590503, 0.7867550872476982, 0.7883714995783961, 0.7889265056752742, 0.7880027282736642, 0.7878109460621776, 0.7859190119042336, 0.7912934045938973, 0.7878117385506547, 0.7852987575893979, 0.7869471336220105, 0.7884805988254264, 0.7880135589495195, 0.7870681201961991, 0.7873087725304475, 0.7903783445655366, 0.7863414082625905, 0.7894788701438947, 0.7875507456788245, 0.7886441156145589, 0.7914434490789171, 0.

In [145]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [47]:
column_to_drop_16 = 'Cat_현재 주택의 위치'

In [48]:
if not column_to_drop_16.startswith('Cat_'):
    comp_17 = comp_16.drop(column_to_drop_16, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']
else:
    comp_17 = comp_16.drop(comp_16.filter(regex='^' + column_to_drop_16).columns, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']

print(X_17.shape)

(19949, 176)


In [148]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [149]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [150]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 191, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7870934080517393


In [151]:
optuna_17 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [152]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(auc_17)

0.791018146929477


In [153]:
X_train = X_train.values
y_train = y_train.values

In [154]:
auc_bootstrap = []

In [155]:
rs = RandomState(seed = 17)
bootstrap_auc(optuna_17, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78547286, 0.79277797])

In [156]:
t_17 = auc_bootstrap
print(t_17)

[0.7885603759987996, 0.7878991764459745, 0.7882740234956984, 0.7888945419733596, 0.792758187462621, 0.7883963308840156, 0.7889341663972205, 0.7934146320845828, 0.7905529561933503, 0.7894823042606294, 0.7894793984695463, 0.7890863241848463, 0.790994900600812, 0.7854913322893619, 0.7863567297064833, 0.7869405295513671, 0.791237930400492, 0.7913705401390131, 0.7914302409376301, 0.7935430152178922, 0.7881847364605985, 0.7832435708051471, 0.786742143269237, 0.7875185178140843, 0.7856400559602531, 0.7900069316325474, 0.7907328510776785, 0.7894062253668166, 0.7897020877316443, 0.7900560659181348, 0.788834577011917, 0.7896574442140945, 0.7892889370721883, 0.790116823368055, 0.7894162635541946, 0.7897620526930871, 0.7887965375650106, 0.7868195429771785, 0.7926461824245076, 0.7906010338276348, 0.7884240679807182, 0.7895612889455255, 0.7894109802976799, 0.7896759356118963, 0.7906287709243375, 0.7882742876585241, 0.7891528932169326, 0.7883073080117415, 0.7876001441272377, 0.7886055478420011, 0.790

In [157]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [49]:
column_to_drop_17 = 'Cat_현재 대중교통 접근용이성'

In [50]:
if not column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(column_to_drop_17, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']
else:
    comp_18 = comp_17.drop(comp_17.filter(regex='^' + column_to_drop_17).columns, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']

print(X_18.shape)

(19949, 172)


In [160]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [161]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [162]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 107, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 6}
0.7869431566902927


In [163]:
optuna_18 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [164]:
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(auc_18)

0.7907846269915236


In [165]:
X_train = X_train.values
y_train = y_train.values

In [166]:
auc_bootstrap = []

In [167]:
rs = RandomState(seed = 18)
bootstrap_auc(optuna_18, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78350287, 0.79131968])

In [168]:
t_18 = auc_bootstrap
print(t_18)

[0.7921508771262467, 0.7911787579275263, 0.7899266261335227, 0.7836152479009622, 0.7854913322893617, 0.7896714448438586, 0.7892622566267886, 0.7911216987571668, 0.7881886989029846, 0.7874186642659549, 0.7879364234044037, 0.7878923082125052, 0.7878738168147035, 0.7877678875215821, 0.7887101563209937, 0.7885083359221291, 0.788907485951821, 0.7916698366205756, 0.7859528247459282, 0.7888971836016172, 0.7855436365288583, 0.7857269655299213, 0.7864750746524146, 0.7891064005596025, 0.7871661246045483, 0.7893457320797221, 0.7894239242761409, 0.786970908276327, 0.7879113279359584, 0.7906192610626108, 0.7858424046847692, 0.7891830077790669, 0.7836353242757184, 0.7846748049950019, 0.7891650447069166, 0.7859869017504486, 0.7872601665705115, 0.7873148482754395, 0.7826661108680812, 0.7885313180879683, 0.7869830597663109, 0.7861871371723589, 0.7896133290221959, 0.7905275965620793, 0.7861448711202407, 0.7897707700663367, 0.7857385886942538, 0.785121504333327, 0.7897113334305453, 0.7892532750907135, 0.

In [169]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [51]:
column_to_drop_18 = 'Cat_현재 의료시설 접근용이성'

In [52]:
if not column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(column_to_drop_18, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']
else:
    comp_19 = comp_18.drop(comp_18.filter(regex='^' + column_to_drop_18).columns, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']

print(X_19.shape)

(19949, 168)


In [172]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [173]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [174]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 200, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 6}
0.7860701302357337


In [175]:
optuna_19 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [176]:
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(auc_19)

0.7899694205112924


In [177]:
X_train = X_train.values
y_train = y_train.values

In [178]:
auc_bootstrap = []

In [179]:
rs = RandomState(seed = 19)
bootstrap_auc(optuna_19, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78500955, 0.79187448])

In [180]:
t_19 = auc_bootstrap
print(t_19)

[0.7878851758162102, 0.7877031676292761, 0.7857549667894494, 0.7861588717500048, 0.7858453104758524, 0.7882911940793715, 0.7918742986476976, 0.7874482505024376, 0.7894973615416966, 0.788239946491178, 0.7893631668262211, 0.7916944037633692, 0.7870794791977058, 0.7884293512372329, 0.7892759930937272, 0.7904554801106525, 0.7889806590545505, 0.7871851443280016, 0.7868641864947283, 0.7920008326412267, 0.7875763694729212, 0.7865448136384097, 0.7904951045345133, 0.7890540963201061, 0.784303127899187, 0.7863876367570949, 0.7893909039229237, 0.7891381000986913, 0.7873985878911987, 0.7901221066245696, 0.7883432341560421, 0.7851561096634988, 0.7885075434336518, 0.7885281481340595, 0.783499016257637, 0.7892707098372123, 0.7897071068253334, 0.7862270257590454, 0.7884216905152867, 0.7891563273336673, 0.791127510339333, 0.7866768950512792, 0.7901242199271756, 0.7896429152586788, 0.7872902811326457, 0.7938954084274281, 0.787800379549148, 0.788818198916721, 0.7899979500964723, 0.7874928940199877, 0.789

In [181]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [53]:
column_to_drop_19 = '소득 중 근로/사업소득의 비중(월평균)'

In [54]:
if not column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(column_to_drop_19, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']
else:
    comp_20 = comp_19.drop(comp_19.filter(regex='^' + column_to_drop_19).columns, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']

print(X_20.shape)

(19949, 167)


In [184]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [185]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [186]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 187, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 3}
0.786887844375914


In [187]:
optuna_20 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [188]:
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(auc_20)

0.7927766788604227


In [189]:
X_train = X_train.values
y_train = y_train.values

In [190]:
auc_bootstrap = []

In [191]:
rs = RandomState(seed = 20)
bootstrap_auc(optuna_20, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78591197, 0.79308105])

In [192]:
t_20 = auc_bootstrap
print(t_20)

[0.7896141215106732, 0.7886335491015294, 0.7886031703765695, 0.7876830912545199, 0.7874012295194561, 0.7916888563440287, 0.7887278552303183, 0.7894086028322482, 0.7873766623766624, 0.7887093638325166, 0.7897562411109209, 0.7904744998341058, 0.7885196949236359, 0.7886877024808059, 0.7908271572064676, 0.7865432286614552, 0.7905344647955486, 0.7907267753326866, 0.7883339884571412, 0.7892228963657535, 0.7885458470433839, 0.791263290031763, 0.7917105176957394, 0.7901559194662642, 0.7890839467194146, 0.7861535884934899, 0.7873085083676217, 0.7897340514335588, 0.7915512275118186, 0.7862291390616514, 0.7889098634172526, 0.7902404515705009, 0.7911494358538693, 0.7920306830405351, 0.7877026393036245, 0.7911713613684057, 0.7865757206890212, 0.7889278264894029, 0.7896917853814405, 0.7906388091117156, 0.7891349301447824, 0.789282068838719, 0.7899398342748096, 0.7896526892832312, 0.7915171505072982, 0.7896806905427595, 0.791330123226675, 0.7915942860524141, 0.7919651706597519, 0.7877419995646597, 0.

In [193]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [55]:
column_to_drop_20 = 'Cat_현재 주변도로의 보행 안전'

In [56]:
if not column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(column_to_drop_20, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']
else:
    comp_21 = comp_20.drop(comp_20.filter(regex='^' + column_to_drop_20).columns, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']

print(X_21.shape)

(19949, 163)


In [196]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [197]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [198]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.787755091931956


In [199]:
optuna_21 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [200]:
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(auc_21)

0.7936682283972926


In [201]:
X_train = X_train.values
y_train = y_train.values

In [202]:
auc_bootstrap = []

In [203]:
rs = RandomState(seed = 21)
bootstrap_auc(optuna_21, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78546366, 0.79299743])

In [204]:
t_21 = auc_bootstrap
print(t_21)

[0.7878281166458505, 0.7880365411153589, 0.790392345195301, 0.7917741809367425, 0.7924158324404629, 0.7904544234593496, 0.7899741754421559, 0.785921653532491, 0.7875731995190124, 0.7887561206526723, 0.7899461741826274, 0.7894273583928757, 0.7869983812102038, 0.7915525483259473, 0.7887870277032838, 0.788808160729343, 0.7910044104625387, 0.7906855659318712, 0.7917276882794124, 0.7910897350552522, 0.7921749159433887, 0.7879786894565219, 0.791893318371151, 0.7925642919485284, 0.7917165934407314, 0.7886050195163495, 0.791014184487091, 0.7901424471621517, 0.7906409224143214, 0.7922705428863065, 0.7881432628969575, 0.793671662514027, 0.7883131195939077, 0.7889146183481159, 0.7921418955901715, 0.788986470636717, 0.7903994775915958, 0.7916423636866987, 0.7909071985426664, 0.7897499012031033, 0.7893658084544783, 0.791067809540716, 0.7890781351372484, 0.7875383300260148, 0.791291555454117, 0.7893496945221083, 0.7876495425756509, 0.7877174324218659, 0.7875414999799235, 0.7903096622308445, 0.790385

In [205]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [57]:
column_to_drop_21 = 'Cat_현재 문화시설 접근용이성'

In [58]:
if not column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(column_to_drop_21, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']
else:
    comp_22 = comp_21.drop(comp_21.filter(regex='^' + column_to_drop_21).columns, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']

print(X_22.shape)

(19949, 159)


In [208]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [209]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [210]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 96, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7848417015223682


In [211]:
optuna_22 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [212]:
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(auc_22)

0.791810107081043


In [213]:
X_train = X_train.values
y_train = y_train.values

In [214]:
auc_bootstrap = []

In [215]:
rs = RandomState(seed = 22)
bootstrap_auc(optuna_22, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78574579, 0.79296273])

In [216]:
t_22 = auc_bootstrap
print(t_22)

[0.7893692425712129, 0.7895435900362008, 0.7903490224918797, 0.7885358088560059, 0.7889669225876121, 0.7906720936277587, 0.7877414712390082, 0.790007459958199, 0.7907526632896091, 0.7869761915328417, 0.788794952588056, 0.7911185288032578, 0.790268716992855, 0.7879039313768378, 0.7917543687248121, 0.7912387228889692, 0.7861842313812757, 0.7884605224506702, 0.7891901401753618, 0.7902164127533585, 0.7894344907891705, 0.7940121683964049, 0.7889685075645666, 0.7880640140492358, 0.7927383752506906, 0.7857803264207206, 0.7901939589131707, 0.7913015936414951, 0.789732994782256, 0.7896912570557891, 0.7903934018466038, 0.78909477739527, 0.7902272434292139, 0.7920135124568622, 0.7874014936822819, 0.7887096279953423, 0.7910041462997128, 0.791206495024229, 0.788584414815942, 0.7898447356575434, 0.7895459675016325, 0.7911528699706041, 0.7904000059172472, 0.7891486666117208, 0.7906549230440856, 0.7919617365430172, 0.7911029431965393, 0.7897570335993982, 0.7870393264481934, 0.7892001783627398, 0.79055

In [217]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [59]:
column_to_drop_22 = 'Cat_현재 주택의 구조'

In [60]:
if not column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(column_to_drop_22, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']
else:
    comp_23 = comp_22.drop(comp_22.filter(regex='^' + column_to_drop_22).columns, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']

print(X_23.shape)

(19949, 157)


In [220]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [221]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [222]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 74, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7879115349703852


In [223]:
optuna_23 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [224]:
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(auc_23)

0.7944673209451535


In [225]:
X_train = X_train.values
y_train = y_train.values

In [226]:
auc_bootstrap = []

In [227]:
rs = RandomState(seed = 23)
bootstrap_auc(optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.785239  , 0.79306102])

In [228]:
t_23 = auc_bootstrap
print(t_23)

[0.7903009448575951, 0.785778741443766, 0.7918621471577136, 0.7904618200184703, 0.7878722318377491, 0.791927659538497, 0.7893370147064729, 0.7887133262749026, 0.787768151684408, 0.7890052261973444, 0.7924501736078091, 0.7891185520495866, 0.7868414684917149, 0.7868771304731896, 0.7900468202192341, 0.789225537994011, 0.7850361797406132, 0.7915734171891808, 0.7888509551071127, 0.7906303559012919, 0.7908364029053684, 0.7903154738130108, 0.7895100413573319, 0.7870805358490087, 0.7880032565993157, 0.7897028802201216, 0.7876786004864824, 0.7931523183986238, 0.789259879161357, 0.7889626959824003, 0.7893087492841186, 0.7902935482984743, 0.7890208118040631, 0.788937864676781, 0.7875660671227174, 0.7919643781712747, 0.787361869258421, 0.7879646888267577, 0.7903051714628069, 0.7889101275800783, 0.7872836770620022, 0.7869563793209113, 0.7886496630338995, 0.7903654005870754, 0.7886863816666774, 0.7894376607430794, 0.7894561521408812, 0.7892725589769926, 0.7905373705866316, 0.7907925518762957, 0.7913

In [229]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [61]:
column_to_drop_23 = '소득 대비 생활비의 비율'

In [62]:
if not column_to_drop_23.startswith('Cat_'):
    comp_24 = comp_23.drop(column_to_drop_23, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']
else:
    comp_24 = comp_23.drop(comp_23.filter(regex='^' + column_to_drop_23).columns, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']

print(X_24.shape)

(19949, 156)


In [232]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [233]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [234]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7871206514304632


In [235]:
optuna_24 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [236]:
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(auc_24)

0.7937670252941189


In [237]:
X_train = X_train.values
y_train = y_train.values

In [238]:
auc_bootstrap = []

In [239]:
rs = RandomState(seed = 24)
bootstrap_auc(optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78558946, 0.7930739 ])

In [240]:
t_24 = auc_bootstrap
print(t_24)

[0.7871848801651756, 0.7902761135519756, 0.7876553541578173, 0.7888160856141152, 0.788734195138136, 0.7857702882333424, 0.787595917522026, 0.7880143514379967, 0.7904956328601649, 0.7921868032705471, 0.7877507169379091, 0.7914418641019627, 0.7929071752963379, 0.7911629081579821, 0.7917091968816107, 0.7896870304505773, 0.7897155600357572, 0.7905923164543853, 0.7892376894839949, 0.7886779284562535, 0.790569334288546, 0.7908047033662797, 0.7918235793851558, 0.7888123873345547, 0.7903167946271394, 0.7881847364605986, 0.7909505212460877, 0.7931906220083559, 0.7896577083769203, 0.7894587937691386, 0.7917588594928496, 0.789018962664283, 0.788217756813816, 0.7907550407550409, 0.786690367355392, 0.7848330385276198, 0.7892701815115608, 0.7937873658317008, 0.7883358375969213, 0.7834089367340599, 0.7891668938466967, 0.792305412379304, 0.7913306515523264, 0.7912624975432857, 0.786477716280672, 0.7898344333073397, 0.7876756946953992, 0.7862357431322949, 0.7868538841445245, 0.789235576181389, 0.790596

In [241]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [63]:
column_to_drop_24 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [64]:
if not column_to_drop_24.startswith('Cat_'):
    comp_25 = comp_24.drop(column_to_drop_24, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']
else:
    comp_25 = comp_24.drop(comp_24.filter(regex='^' + column_to_drop_24).columns, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']

print(X_25.shape)

(19949, 152)


In [244]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [245]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [246]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 137, 'max_depth': 9, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7863396745737133


In [247]:
optuna_25 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [248]:
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(auc_25)

0.792849323637501


In [249]:
X_train = X_train.values
y_train = y_train.values

In [250]:
auc_bootstrap = []

In [251]:
rs = RandomState(seed = 25)
bootstrap_auc(optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78535928, 0.79245537])

In [252]:
t_25 = auc_bootstrap
print(t_25)

[0.7894339624635192, 0.7896920495442663, 0.787843966415395, 0.789438981557208, 0.7853769497858168, 0.7921276307975815, 0.7883165537106424, 0.7886589087328002, 0.789917116271796, 0.7863797118723228, 0.7907616448256841, 0.7893938097140067, 0.7875967100105031, 0.7915512275118186, 0.7936238490425682, 0.7881522444330327, 0.7885907547237597, 0.7864782446063234, 0.7911673989260196, 0.7924071150672135, 0.7897393346900737, 0.788434370330922, 0.7887738195619969, 0.7936235848797426, 0.7898550380077474, 0.7915726247007036, 0.7870969139442046, 0.7898180552121439, 0.7892865596067566, 0.787297941854592, 0.7884193130498549, 0.7858273474037021, 0.7892516901137591, 0.7851212401705012, 0.7879377442185322, 0.7891402134012973, 0.7894207543222321, 0.7875232727449477, 0.7903075489282387, 0.7886314357989235, 0.7876772796723537, 0.7935250521457418, 0.7891827436162412, 0.790276641877627, 0.7877779257089603, 0.7868433176314951, 0.7897792232767602, 0.7899784020473676, 0.7867614271555159, 0.7912881213373824, 0.788

In [253]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [65]:
column_to_drop_25 = '소득 대비 주거관리비의 비율'

In [66]:
if not column_to_drop_25.startswith('Cat_'):
    comp_26 = comp_25.drop(column_to_drop_25, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']
else:
    comp_26 = comp_25.drop(comp_25.filter(regex='^' + column_to_drop_25).columns, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']

print(X_26.shape)

(19949, 151)


In [256]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [257]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [258]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 114, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7848891710459023


In [259]:
optuna_26 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [260]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(auc_26)

0.7934294252028243


In [261]:
X_train = X_train.values
y_train = y_train.values

In [262]:
auc_bootstrap = []

In [263]:
rs = RandomState(seed = 26)
bootstrap_auc(optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78579915, 0.79289658])

In [264]:
t_26 = auc_bootstrap
print(t_26)

[0.7897464670863685, 0.7900486693590141, 0.7857914212594015, 0.7869864938830455, 0.7907904385736899, 0.7884893161986759, 0.7877443770300913, 0.7899488158108847, 0.7888432943851663, 0.7872268820544683, 0.7905484654253128, 0.7891085138622084, 0.7897903181154412, 0.7878909873983766, 0.7864713763728542, 0.7888866170885875, 0.7883839152312059, 0.7887273269046667, 0.790178637469278, 0.7910926408463355, 0.7898439431690665, 0.7888060474267371, 0.7854115551159886, 0.7894738510502057, 0.7895686855046461, 0.7872905452954714, 0.7896024983463408, 0.7917990122423619, 0.7873560576762547, 0.7874796858787007, 0.7888086890549945, 0.789920286225705, 0.7882325499320573, 0.7854955588945738, 0.7906467339964875, 0.7905178225375269, 0.7872070698425376, 0.7940893039415207, 0.7903854769618317, 0.7886937782257979, 0.7903130963475791, 0.7893047868417327, 0.7878582312079849, 0.7877232440040322, 0.7875301409784169, 0.7878339282280169, 0.789393545551181, 0.7883059871976128, 0.788865748225354, 0.7911666064375423, 0.7

In [265]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [67]:
column_to_drop_26 = '현재 무주택 기간(총 개월)'

In [68]:
if not column_to_drop_26.startswith('Cat_'):
    comp_27 = comp_26.drop(column_to_drop_26, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']
else:
    comp_27 = comp_26.drop(comp_26.filter(regex='^' + column_to_drop_26).columns, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']

print(X_27.shape)

(19949, 150)


In [268]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [269]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [270]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7859124488619079


In [271]:
optuna_27 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [272]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27) 
print(auc_27)

0.7939453352014929


In [273]:
X_train = X_train.values
y_train = y_train.values

In [274]:
auc_bootstrap = []

In [275]:
rs = RandomState(seed = 27)
bootstrap_auc(optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78576485, 0.79349235])

In [276]:
t_27 = auc_bootstrap
print(t_27)

[0.7892199905746704, 0.7919181496767703, 0.7900626699887784, 0.7887695929567851, 0.791544887604001, 0.7911293594791131, 0.7885923397007141, 0.7923405460351273, 0.7867783335763632, 0.7873135274613108, 0.7893539211273202, 0.7875000264162826, 0.7921976339464024, 0.7857689674192138, 0.7871333684141566, 0.788187378088856, 0.789614385673499, 0.7907531916152606, 0.789290257886317, 0.7892358403442147, 0.7928767965713779, 0.7854131400929429, 0.7867109720557997, 0.7909423321984899, 0.7888673332023086, 0.7906105436893613, 0.7894609070717444, 0.7909668993412836, 0.7888786922038153, 0.7917086685559591, 0.7861765706593293, 0.7876178430365623, 0.7892577658587511, 0.7905619377294253, 0.789707635150985, 0.7896801622171081, 0.7901881473310044, 0.7889603185169687, 0.7902890575304368, 0.7931380536060338, 0.7901649010023394, 0.7872733747117984, 0.786634628999161, 0.7878566462310305, 0.7880259746023294, 0.7870279674466866, 0.7877480753096517, 0.7878413247871376, 0.7849881021063287, 0.7870187217477858, 0.789

In [277]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [69]:
column_to_drop_27 = 'Cat_소득 계층'

In [70]:
if not column_to_drop_27.startswith('Cat_'):
    comp_28 = comp_27.drop(column_to_drop_27, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']
else:
    comp_28 = comp_27.drop(comp_27.filter(regex='^' + column_to_drop_27).columns, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']

print(X_28.shape)

(19949, 148)


In [280]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [281]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [282]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 74, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7874083580209255


In [283]:
optuna_28 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [284]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(auc_28)

0.7933063253260298


In [285]:
X_train = X_train.values
y_train = y_train.values

In [286]:
auc_bootstrap = []

In [287]:
rs = RandomState(seed = 28)
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78638763, 0.79391106])

In [288]:
t_28 = auc_bootstrap
print(t_28)

[0.7911346427356281, 0.7895356651514286, 0.791336198971667, 0.7892382178096464, 0.7890076036627762, 0.791429184286327, 0.7925880666028449, 0.794648008317959, 0.7881805098553867, 0.7904393661782825, 0.7938441608392347, 0.792196048969448, 0.7908802539344412, 0.7886797775960337, 0.7894778134925917, 0.7911095472671827, 0.7885849431415934, 0.7886697394086557, 0.7882108885803467, 0.7892265946453139, 0.7897784307882829, 0.7920742699067823, 0.7918217302453756, 0.7906287709243375, 0.7941888933268244, 0.7909090476824467, 0.7891861777329758, 0.7923096389845159, 0.7919688689393123, 0.7919820770805992, 0.7916032675884892, 0.7918156545003836, 0.788885032111633, 0.7912585351008996, 0.7909040285887576, 0.7919149797228615, 0.7917876532408552, 0.7888802771807697, 0.7919699255906154, 0.7902137711251012, 0.7858194225189298, 0.7876690906247557, 0.7905999771763319, 0.7930963158795672, 0.7902222243355248, 0.792422964836758, 0.792104912794568, 0.7934579547880041, 0.7914611479882416, 0.7888398602684319, 0.7917

In [289]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [71]:
column_to_drop_28 = 'Cat_이사 계획 중인 주택의 유형'

In [72]:
if not column_to_drop_28.startswith('Cat_'):
    comp_29 = comp_28.drop(column_to_drop_28, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']
else:
    comp_29 = comp_28.drop(comp_28.filter(regex='^' + column_to_drop_28).columns, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']

print(X_29.shape)

(19949, 129)


In [75]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [77]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [78]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7855492038122569


In [79]:
optuna_29 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [80]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(auc_29)

0.7951536159664239


In [81]:
X_train = X_train.values
y_train = y_train.values

In [82]:
auc_bootstrap = []

In [83]:
rs = RandomState(seed = 29)
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78707422, 0.79440753])

In [84]:
t_29 = auc_bootstrap
print(t_29)

[0.7906670745340696, 0.7913768800468307, 0.7927930569556185, 0.7931488842818891, 0.7953874000672031, 0.7898357541214684, 0.7923923219489722, 0.791108226453054, 0.789282068838719, 0.79127834731283, 0.7917609727954555, 0.7925053836383886, 0.7895729121098579, 0.7937252875676521, 0.7913084618749644, 0.7908345537655882, 0.7918132770349519, 0.7908879146563875, 0.7922124270646438, 0.7927217329926689, 0.7892778422335073, 0.7915007724121024, 0.7921875957590243, 0.7912130990948724, 0.7912566859611194, 0.7899033798048576, 0.7897343155963845, 0.7940842848478316, 0.7915525483259473, 0.7873867005640405, 0.7910781118909198, 0.7893716200366446, 0.7924710424710424, 0.7876912803021178, 0.7891296468882677, 0.7901189366706609, 0.7918489390164267, 0.7890683611126961, 0.791051695608346, 0.7892862954439308, 0.7900087807723276, 0.7909856549019112, 0.792971895188644, 0.794796996151676, 0.785943843209853, 0.7923558674790201, 0.7871740494893205, 0.7934880693501383, 0.7938602747716048, 0.7898920208033509, 0.79161

In [85]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [73]:
column_to_drop_29 = 'Cat_기초생활보장 수급가구 여부'

In [74]:
if not column_to_drop_29.startswith('Cat_'):
    comp_30 = comp_29.drop(column_to_drop_29, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']
else:
    comp_30 = comp_29.drop(comp_29.filter(regex='^' + column_to_drop_29).columns, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']

print(X_30.shape)

(19949, 127)


In [88]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [89]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [90]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7858344337319259


In [91]:
optuna_30 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [92]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(auc_30)

0.794108323664974


In [93]:
X_train = X_train.values
y_train = y_train.values

In [94]:
auc_bootstrap = []

In [95]:
rs = RandomState(seed = 30)
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78680552, 0.79363281])

In [96]:
t_30 = auc_bootstrap
print(t_30)

[0.7901767883294977, 0.7886832117127685, 0.789828093399522, 0.7891560631708415, 0.7888768430640352, 0.7927875095362781, 0.7902264509407366, 0.7893745258277278, 0.793425462760438, 0.7897425046439824, 0.7915940218895885, 0.7925645561113541, 0.7878389473217059, 0.7915668131185373, 0.7909172367300447, 0.7888792205294668, 0.7895301177320881, 0.7889531861206738, 0.7898022054425995, 0.7909917306469031, 0.7882188134651189, 0.791520584624033, 0.7887738195619969, 0.7925539895983245, 0.7910791685422227, 0.7900103657492821, 0.79079228771347, 0.7917850116125978, 0.789972854628027, 0.7893964513422642, 0.7893684500827358, 0.7906308842269433, 0.7881443195482604, 0.7891956875947024, 0.7909608235962915, 0.79305272901332, 0.7881794532040838, 0.7911071698017511, 0.7882037561840518, 0.7885838864902905, 0.7899157954576674, 0.793100542484779, 0.7917681051917504, 0.7929576303960539, 0.7912857438719507, 0.7923151864038563, 0.787110650411143, 0.7910789043793969, 0.7908126282510518, 0.7917622936095843, 0.7915966

In [97]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc 

In [75]:
column_to_drop_30 = 'Cat_현재 대기오염 정도'

In [76]:
if not column_to_drop_30.startswith('Cat_'):
    comp_31 = comp_30.drop(column_to_drop_30, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']
else:
    comp_31 = comp_30.drop(comp_30.filter(regex='^' + column_to_drop_30).columns, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']

print(X_31.shape)

(19949, 123)


In [100]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [101]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [102]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7862051087939561


In [103]:
optuna_31 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [104]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(auc_31)

0.7945830242628273


In [105]:
X_train = X_train.values
y_train = y_train.values

In [106]:
auc_bootstrap = []

In [107]:
rs = RandomState(seed = 31)
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78787701, 0.79489455])

In [108]:
t_31 = auc_bootstrap
print(t_31)

[0.7923640565266181, 0.7917691618430536, 0.7919796996151676, 0.7925740659730807, 0.7917369339783134, 0.7854115551159886, 0.7886446439402104, 0.7917292732563669, 0.7915430384642208, 0.7932725124843351, 0.790171505072983, 0.7911634364836335, 0.7927573949741437, 0.7898436790062405, 0.794662273110549, 0.7922834868647677, 0.7919815487549478, 0.7911444167601803, 0.7897720908804652, 0.7916634967127578, 0.788682683387117, 0.7889476387013332, 0.793129864558436, 0.7921363481708308, 0.7911465300627862, 0.7928807590137639, 0.7924715707966941, 0.7889360155370007, 0.791510282273829, 0.7904739715084542, 0.7917736526110911, 0.7917963706141046, 0.7912997445017149, 0.7934458032980201, 0.7952386763963118, 0.7919287161898, 0.7910382233042332, 0.7909967497405921, 0.7905981280365517, 0.7916391937327898, 0.7909745600632301, 0.7924554568643238, 0.7914056737948363, 0.7915522841631215, 0.7914814885258235, 0.7905397480520633, 0.7899453816941502, 0.7932450395504582, 0.7929964623314376, 0.794287426060825, 0.790028

In [109]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [77]:
column_to_drop_31 = 'Cat_이사 계획 첫 번째 이유'

In [78]:
if not column_to_drop_31.startswith('Cat_'):
    comp_32 = comp_31.drop(column_to_drop_31, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']
else:
    comp_32 = comp_31.drop(comp_31.filter(regex='^' + column_to_drop_31).columns, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']

print(X_32.shape)

(19949, 110)


In [112]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [113]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [114]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7849242572154707


In [115]:
optuna_32 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [116]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(auc_32)

0.795163654153802


In [117]:
X_train = X_train.values
y_train = y_train.values

In [118]:
auc_bootstrap = []

In [119]:
rs = RandomState(seed = 32)
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78779193, 0.79439322])

In [120]:
t_32 = auc_bootstrap
print(t_32)

[0.7891534215425839, 0.7909386339189295, 0.7895137396368923, 0.7917786717047801, 0.7943928270282952, 0.7916109283104357, 0.7888599366431879, 0.7918745628105234, 0.7885973587944031, 0.7896085740913328, 0.7913055560838812, 0.7866689701665072, 0.7885130908529923, 0.7898603212642622, 0.7897657509726477, 0.7908995378207202, 0.7900919920624355, 0.7894883800056214, 0.7890894941387552, 0.7924456828397715, 0.7930133687522851, 0.7920721566041763, 0.7906105436893613, 0.7923991901824414, 0.792687391825323, 0.790661527114729, 0.7936161883206218, 0.7937736293647624, 0.7924166249289402, 0.7926422199821214, 0.7914439774045685, 0.7919289803526257, 0.7914820168514749, 0.7894511330471923, 0.7891439116808575, 0.7933039478605981, 0.7913940506305039, 0.7916613834101519, 0.7942303668904654, 0.7899543632302253, 0.7923717172485646, 0.7905297098646852, 0.7899844777923596, 0.7902943407869516, 0.7914413357763112, 0.7911644931349365, 0.791216533211607, 0.7887563848154981, 0.7902238093124793, 0.793372894358116, 0.7

In [121]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [79]:
column_to_drop_32 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [80]:
if not column_to_drop_32.startswith('Cat_'):
    comp_33 = comp_32.drop(column_to_drop_32, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']
else:
    comp_33 = comp_32.drop(comp_32.filter(regex='^' + column_to_drop_32).columns, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']

print(X_33.shape)

(19949, 106)


In [124]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [125]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [126]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 101, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7869551272657926


In [127]:
optuna_33 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [128]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(auc_33)

0.7940877189645663


In [129]:
X_train = X_train.values
y_train = y_train.values

In [130]:
auc_bootstrap = [] 

In [131]:
rs = RandomState(seed = 33)
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78812101, 0.79504399])

In [132]:
t_33 = auc_bootstrap
print(t_33)

[0.7913713326274903, 0.7920224939929372, 0.7912255147476822, 0.792842191241206, 0.7915232262522902, 0.7895470241529355, 0.7921846899679412, 0.7936970221452981, 0.7911766446249204, 0.7908702157470631, 0.7921786142229491, 0.7885788673966014, 0.7910252793257719, 0.7923777929935565, 0.7920935537930611, 0.7913203492021226, 0.7893304106358293, 0.7932466245274127, 0.7915020932262311, 0.7928778532226808, 0.7919110172804755, 0.7950595740004607, 0.7915646998159314, 0.7944562261064724, 0.7932241706872248, 0.7904943120460362, 0.7923664339920498, 0.7905672209859402, 0.7904618200184703, 0.792117064284552, 0.7935873945726162, 0.79324926615567, 0.7909000661463715, 0.7897483162261487, 0.7924308897215302, 0.7881768115758263, 0.7927143364335483, 0.7924829297982008, 0.7894944557506134, 0.7914878284336412, 0.7915850403535133, 0.7918336175725338, 0.7917015361596642, 0.7916539868510313, 0.7943846379806971, 0.7883770469977366, 0.7896125365337189, 0.7909071985426664, 0.7926018030697835, 0.7933927065700465, 0.7

In [133]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [81]:
column_to_drop_33 = 'Cat_현재 청소/쓰레기 처리상태'

In [82]:
if not column_to_drop_33.startswith('Cat_'):
    comp_34 = comp_33.drop(column_to_drop_33, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']
else:
    comp_34 = comp_33.drop(comp_33.filter(regex='^' + column_to_drop_33).columns, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']

print(X_34.shape)

(19949, 102)


In [136]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [137]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [138]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7856899612689967


In [139]:
optuna_34 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [140]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(auc_34)

0.7930865418550147


In [141]:
X_train = X_train.values
y_train = y_train.values

In [142]:
auc_bootstrap = []

In [143]:
rs = RandomState(seed = 34)
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78776767, 0.79452975])

In [144]:
t_34 = auc_bootstrap
print(t_34)

[0.7945838167513044, 0.7927790563258543, 0.7896297071173919, 0.7929431014406384, 0.7912257789105079, 0.7902166769161842, 0.7884504842632922, 0.7901722975614601, 0.7912231372822505, 0.7908502714537198, 0.7894947199134391, 0.7930101987983762, 0.7924678725171336, 0.7932212648961418, 0.7892765214193785, 0.7959321038138771, 0.7899408909261126, 0.791699687019884, 0.7903767595885822, 0.7902251301266081, 0.7911922302316392, 0.7898317916790824, 0.792803623468648, 0.7897897897897899, 0.7945682311445859, 0.7915779079572183, 0.7924443620256428, 0.790748700847223, 0.7913731817672705, 0.7906509606016996, 0.7913454446705679, 0.7923474142685966, 0.7897340514335588, 0.7933979898265613, 0.7908842163768273, 0.791950905867162, 0.7957981732612274, 0.7917715393084851, 0.791104264010668, 0.7907405117996251, 0.7904126857328826, 0.7919178855139446, 0.7896278579776117, 0.7913958997702839, 0.7912823097552162, 0.7902716227839379, 0.7904335545961162, 0.7915818703996045, 0.7934148962474085, 0.7913731817672705, 0.79

In [145]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [83]:
column_to_drop_34 = 'Cat_현재 주차시설 이용편의성'

In [84]:
if not column_to_drop_34.startswith('Cat_'):
    comp_35 = comp_34.drop(column_to_drop_34, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']
else:
    comp_35 = comp_34.drop(comp_34.filter(regex='^' + column_to_drop_34).columns, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']

print(X_35.shape)

(19949, 98)


In [148]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [149]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [150]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.786704983515692


In [151]:
optuna_35 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [152]:
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(auc_35)

0.7933483272153223


In [153]:
X_train = X_train.values
y_train = y_train.values

In [154]:
auc_bootstrap = []

In [155]:
rs = RandomState(seed = 35)
bootstrap_auc(optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78760263, 0.79454581])

In [156]:
t_35 = auc_bootstrap
print(t_35)

[0.7898796051505411, 0.7918283343160191, 0.7898484339371038, 0.7899041722933349, 0.78991315382941, 0.7907901744108641, 0.7924258706278411, 0.7927485455194815, 0.789070474415302, 0.7905843915696132, 0.79103531751315, 0.7896149139991504, 0.7910361100016272, 0.7889022026953061, 0.793731363312644, 0.7907838345030463, 0.7921514054518981, 0.7905880898491735, 0.7899517216019679, 0.7891034947685194, 0.7900645191285585, 0.7916706291090528, 0.7886834758755942, 0.7868145238834894, 0.7933448930985878, 0.7929446864175926, 0.7922597122104511, 0.789785563184578, 0.792962649489743, 0.7920547218576776, 0.7935398452639832, 0.7928561918709702, 0.7881385079660942, 0.7955178965031182, 0.7900610850118239, 0.7956106176549526, 0.7908091941343174, 0.7862233274794852, 0.7915425101385694, 0.7921030636547878, 0.7915049990173143, 0.7942158379350498, 0.7886575879186716, 0.7906322050410721, 0.7913829557918227, 0.7899791945358448, 0.7886103027728644, 0.7905178225375269, 0.791964114008449, 0.7920919688161068, 0.790151

In [157]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [85]:
column_to_drop_35 = '현재 주택 거주 기간(총 개월)'

In [86]:
if not column_to_drop_35.startswith('Cat_'):
    comp_36 = comp_35.drop(column_to_drop_35, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']
else:
    comp_36 = comp_35.drop(comp_35.filter(regex='^' + column_to_drop_35).columns, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']

print(X_36.shape)

(19949, 97)


In [160]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [161]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [162]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 98, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7854307363926547


In [163]:
optuna_36 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [164]:
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(auc_36)

0.7932664367393432


In [165]:
X_train = X_train.values
y_train = y_train.values

In [166]:
auc_bootstrap = []

In [167]:
rs = RandomState(seed = 36)
bootstrap_auc(optuna_36, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7862541 , 0.79320279])

In [168]:
t_36 = auc_bootstrap
print(t_36)

[0.7903086055795415, 0.790019347285357, 0.7882964773358861, 0.7894149427400659, 0.7903680422153329, 0.7924332671869618, 0.7861321913046051, 0.7891233069804497, 0.7921640852675336, 0.7912957820593288, 0.7868242979080418, 0.7879749911769618, 0.788423275492241, 0.7922795244223815, 0.7914371091710993, 0.7905804291272271, 0.7895906110191824, 0.7882914582421972, 0.7910577713533378, 0.7898151494210608, 0.7884290870744073, 0.7844727204333116, 0.7925917648824053, 0.7902280359176911, 0.7922752978171697, 0.7891021739543908, 0.7918888276031133, 0.7894434723252457, 0.7872147305644842, 0.7893584118953577, 0.7915570390939849, 0.7886145293780762, 0.7889080142774724, 0.7893837715266286, 0.787720338212949, 0.7842859573155139, 0.7885088642477805, 0.7872844695504794, 0.784827755271105, 0.7891935742920966, 0.7883617255538438, 0.7908039108778024, 0.7944805290864404, 0.7904417436437141, 0.7933219109327485, 0.7857185123194975, 0.7923558674790203, 0.789085531696369, 0.7882098319290437, 0.7884893161986759, 0.78

In [169]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [87]:
column_to_drop_36 = '자산 중 기타자산의 비중'

In [88]:
if not column_to_drop_36.startswith('Cat_'):
    comp_37 = comp_36.drop(column_to_drop_36, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']
else:
    comp_37 = comp_36.drop(comp_36.filter(regex='^' + column_to_drop_36).columns, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']

print(X_37.shape)

(19949, 96)


In [172]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [173]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [174]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7854699503468783


In [175]:
optuna_37 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [176]:
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(auc_37)

0.7953259822102187


In [177]:
X_train = X_train.values
y_train = y_train.values

In [178]:
auc_bootstrap = []

In [179]:
rs = RandomState(seed = 37)
bootstrap_auc(optuna_37, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78665247, 0.79396675])

In [180]:
t_37 = auc_bootstrap
print(t_37)

[0.7895266836153534, 0.7908463090113337, 0.7933853100109258, 0.7949644753831946, 0.7891832719418925, 0.7926963733613981, 0.7902370174537663, 0.7924966662651391, 0.7901878831681788, 0.7918072012899598, 0.7885086000849548, 0.7884747872432603, 0.7898418298664603, 0.7922937892149715, 0.7917313865589728, 0.7926620321940518, 0.7909021794489776, 0.7915327361140169, 0.7900151206801453, 0.7886348699156582, 0.7901012377613362, 0.7902953974382546, 0.7948701692544057, 0.7912627617061114, 0.7907687772219792, 0.7902428290359325, 0.7881942463223253, 0.7876136164313504, 0.7867170478007917, 0.791378729186611, 0.7896381603278154, 0.7947000483946296, 0.7904464985745774, 0.7905635227063798, 0.7859615421191776, 0.7915927010754598, 0.7919538116582452, 0.789346788731025, 0.7917651994006674, 0.7914595630112872, 0.7886850608525486, 0.7884239358993054, 0.7909027077746289, 0.7891983292229597, 0.7898825109416242, 0.7893700350596904, 0.7880901661689839, 0.7891103630019887, 0.7944234699160807, 0.7946289885945058, 0

In [181]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [89]:
column_to_drop_37 = '총 이사 횟수'

In [90]:
if not column_to_drop_37.startswith('Cat_'):
    comp_38 = comp_37.drop(column_to_drop_37, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']
else:
    comp_38 = comp_37.drop(comp_37.filter(regex='^' + column_to_drop_37).columns, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']

print(X_38.shape)

(19949, 95)


In [184]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [185]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [186]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7857242218816342


In [187]:
optuna_38 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [188]:
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(auc_38)

0.7940933984653197


In [189]:
X_train = X_train.values
y_train = y_train.values

In [190]:
auc_bootstrap = []

In [191]:
rs = RandomState(seed = 38)
bootstrap_auc(optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78771245, 0.79433503])

In [192]:
t_38 = auc_bootstrap
print(t_38)

[0.7941453064605775, 0.7910271284655521, 0.7935797338506698, 0.7910918483578584, 0.7945566079802533, 0.791810107081043, 0.7904319696191617, 0.7932915322077885, 0.7918037671732253, 0.7914899417362472, 0.7876574674604232, 0.7909671635041092, 0.7913697476505358, 0.7888826546462014, 0.7915208487868586, 0.7873177540665224, 0.7902432252801711, 0.7897639018328675, 0.7913129526430018, 0.7895462316644584, 0.791865845437274, 0.7918336175725339, 0.7904042325224592, 0.7920251356211948, 0.7910073162536216, 0.789649783492148, 0.7896902004044861, 0.7887304968585758, 0.7914326184030618, 0.7918064088014827, 0.7895235136614447, 0.7882211909305505, 0.7937731010391108, 0.7910802251935257, 0.7918616188320623, 0.7926142187225931, 0.7898106586530231, 0.7914157119822145, 0.789898096548343, 0.7912617050548085, 0.7919987193386209, 0.7902504897578788, 0.7892746722795982, 0.7910723003087535, 0.7896888795903574, 0.788347328679841, 0.7900085166095018, 0.790826893043642, 0.7904105724302769, 0.7905445029829267, 0.790

In [193]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [91]:
column_to_drop_38 = '자산 중 금융자산의 비중'

In [92]:
if not column_to_drop_38.startswith('Cat_'):
    comp_39 = comp_38.drop(column_to_drop_38, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']
else:
    comp_39 = comp_38.drop(comp_38.filter(regex='^' + column_to_drop_38).columns, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']

print(X_39.shape)

(19949, 94)


In [196]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [197]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [198]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7861588776058189


In [199]:
optuna_39 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [200]:
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(auc_39)

0.7943336545553293


In [201]:
X_train = X_train.values
y_train = y_train.values

In [202]:
auc_bootstrap = []

In [203]:
rs = RandomState(seed = 39)
bootstrap_auc(optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78764788, 0.79427085])

In [204]:
t_39 = auc_bootstrap
print(t_39)

[0.7945815713672856, 0.7851003713072678, 0.7922966950060545, 0.7915005082492768, 0.7903585323536063, 0.792135291519528, 0.791431297588933, 0.7905096334899291, 0.7909584461308599, 0.7913409539025302, 0.7920763832093881, 0.7903292102799493, 0.7898167343980151, 0.790527860724905, 0.7908324404629823, 0.7950545549067717, 0.7928996466558043, 0.7901899964707846, 0.7901833924001411, 0.7928844572933242, 0.7932970796271289, 0.7908222701941914, 0.7901168233680549, 0.7891307035395706, 0.7935781488737155, 0.7901752033525432, 0.7916611192473262, 0.7924285122560984, 0.7887975942163135, 0.7887246852764094, 0.792652786495151, 0.7912371379120147, 0.7887260060905381, 0.7908781406318353, 0.7882333424205344, 0.7902238093124794, 0.7908229306012557, 0.7922488815345958, 0.7894276225557013, 0.7902338474998574, 0.7895552132005335, 0.7894430760810071, 0.794710879070485, 0.7905141242579667, 0.789673029820813, 0.7923685472946557, 0.7939994885807693, 0.7925405172942117, 0.7919313578180573, 0.7915385476961831, 0.791

In [205]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [93]:
column_to_drop_39 = '가구주 나이'

In [94]:
if not column_to_drop_39.startswith('Cat_'):
    comp_40 = comp_39.drop(column_to_drop_39, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']
else:
    comp_40 = comp_39.drop(comp_39.filter(regex='^' + column_to_drop_39).columns, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']

print(X_40.shape)

(19949, 93)


In [208]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [209]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [210]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 112, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7857287624447549


In [211]:
optuna_40 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [212]:
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(auc_40)

0.7936978146337752


In [213]:
X_train = X_train.values
y_train = y_train.values

In [214]:
auc_bootstrap = []

In [215]:
rs = RandomState(seed = 40)
bootstrap_auc(optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78866076, 0.79517845])

In [216]:
t_40 = auc_bootstrap
print(t_40)

[0.7932947021616972, 0.7903215495580027, 0.7903236628606085, 0.7913932581420267, 0.7948515457751911, 0.7896743506349418, 0.7944086767978394, 0.7890671723799803, 0.7961394716320825, 0.7931951127763937, 0.7894622278858732, 0.7928437762181605, 0.7915356419051001, 0.792782490442589, 0.7948926230945936, 0.7927243746209264, 0.7957971166099246, 0.792206087156826, 0.7925228183848873, 0.7900948978535185, 0.7947528809597775, 0.789794808883479, 0.7889127692083356, 0.7936177732975762, 0.7919733597073498, 0.7918785252529095, 0.7934273119002183, 0.790517822537527, 0.7910654320752843, 0.7895882335537509, 0.79164104287257, 0.7936264906708256, 0.7879617830356747, 0.7944012802387187, 0.7910133919986136, 0.7900491976846657, 0.7885648667668372, 0.7923315644990521, 0.7918153903375579, 0.7932989287669091, 0.7935990177369487, 0.7924868922405868, 0.790677905209925, 0.7932122833600667, 0.7914529589406437, 0.7898782843364124, 0.790328417791472, 0.7934726158248326, 0.7904406869924112, 0.7934629738816932, 0.79318

In [111]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [112]:
column_to_drop_40 = 'Cat_주택 보유 의식'

In [113]:
if not column_to_drop_40.startswith('Cat_'):
    comp_41 = comp_40.drop(column_to_drop_40, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']
else:
    comp_41 = comp_40.drop(comp_40.filter(regex='^' + column_to_drop_40).columns, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']

print(X_41.shape)

(19949, 91)


In [114]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [115]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [116]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 137, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.7841408036879278


In [117]:
optuna_41 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [118]:
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(auc_41)

0.7901347864402052


In [119]:
X_train = X_train.values
y_train = y_train.values

In [120]:
auc_bootstrap = []

In [121]:
rs = RandomState(seed = 41)
bootstrap_auc(optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78495319, 0.79129249])

In [122]:
t_41 = auc_bootstrap
print(t_41)

[0.7864085056203283, 0.7899278148662385, 0.7906289030057503, 0.7894500763958893, 0.7873631900725496, 0.7895818936459331, 0.7869233589676939, 0.7844838152719926, 0.7874302874302874, 0.7878215125752072, 0.7883504986337497, 0.7874851012166283, 0.7888078965665173, 0.7856717554993418, 0.7873856439127376, 0.7910258076514234, 0.7888469926647268, 0.7856157529802851, 0.7850773891414284, 0.7893739975020764, 0.7868705264025462, 0.7880177855547312, 0.7897160883614085, 0.7917814454144504, 0.7891861777329758, 0.7883668767289458, 0.7875494248646959, 0.7902602637824312, 0.7889228073957137, 0.787687053696906, 0.7909234445564495, 0.7871506710792424, 0.7875646142271757, 0.7875170649185427, 0.7888644274112255, 0.7875317259553714, 0.7906641687429865, 0.7893627705819823, 0.7875677841810848, 0.7918758836246521, 0.7883627822051468, 0.7886083215516713, 0.7858764816892895, 0.7871777477688808, 0.7868003911723125, 0.7866915560881079, 0.7853964978349215, 0.7889409025492771, 0.7893589402210093, 0.7865205106584416, 

In [123]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [124]:
column_to_drop_41 = '총 가구원 수'

In [125]:
if not column_to_drop_41.startswith('Cat_'):
    comp_42 = comp_41.drop(column_to_drop_41, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']
else:
    comp_42 = comp_41.drop(comp_41.filter(regex='^' + column_to_drop_41).columns, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']

print(X_42.shape)

(19949, 90)


In [126]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [127]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [128]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7850829705354603


In [129]:
optuna_42 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [130]:
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(auc_42)

0.7918781290086708


In [131]:
X_train = X_train.values
y_train = y_train.values

In [132]:
auc_bootstrap = []

In [133]:
rs = RandomState(seed = 42)
bootstrap_auc(optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78586006, 0.79300849])

In [134]:
t_42 = auc_bootstrap
print(t_42)

[0.7897277115257411, 0.7911428317832259, 0.791435524194145, 0.7877375087966221, 0.7910770552396168, 0.7925370831774772, 0.7898851525698817, 0.7926880522323871, 0.7895694779931233, 0.7910007121829781, 0.7917855399382492, 0.7867261614182797, 0.7910820743333059, 0.7896081778470941, 0.7902697736441578, 0.7940148100246623, 0.7904564046805427, 0.7936781345032577, 0.793286777276925, 0.7885846789787676, 0.7907730038271911, 0.7885615647315155, 0.7876183713622138, 0.7933190051416652, 0.7914220518900322, 0.790171505072983, 0.787981727329018, 0.7879211019605108, 0.7911003015682818, 0.7903251157561504, 0.7882024353699232, 0.7893700350596903, 0.7908567434429503, 0.7893461283239609, 0.7888823904833757, 0.7924965341837263, 0.7903520603643757, 0.7895188908119942, 0.7905511070535702, 0.7883380829809401, 0.7901262011483686, 0.7903683063781586, 0.7904298563165558, 0.789206122026319, 0.7888640311669868, 0.787837890670403, 0.7877484715538904, 0.7898119794671519, 0.7894640770256534, 0.791586229086229, 0.7893

In [135]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [136]:
column_to_drop_42 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [137]:
if not column_to_drop_42.startswith('Cat_'):
    comp_43 = comp_42.drop(column_to_drop_42, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']
else:
    comp_43 = comp_42.drop(comp_42.filter(regex='^' + column_to_drop_42).columns, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']

print(X_43.shape)

(19949, 86)


In [138]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [139]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [140]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.785302568679113


In [141]:
optuna_43 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [142]:
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(auc_43)

0.7944567544321239


In [143]:
X_train = X_train.values
y_train = y_train.values

In [144]:
auc_bootstrap = []

In [145]:
rs = RandomState(seed = 43)
bootstrap_auc(optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78742722, 0.79408701])

In [146]:
t_43 = auc_bootstrap
print(t_43)

[0.7915076406455717, 0.7930362188367114, 0.7899332302041662, 0.7921136301678173, 0.7914188819361233, 0.7896504438992122, 0.7924356446523935, 0.7916251931030256, 0.7890387748762133, 0.7936325664158177, 0.7890033770575642, 0.7919346598533791, 0.7883132516753206, 0.794838469715317, 0.7904340829217676, 0.7907100009932523, 0.7887046089016531, 0.7910267322213135, 0.7919086398150437, 0.7902449423385384, 0.7915216412753359, 0.7905677493115917, 0.7893876018876019, 0.7919468113433632, 0.7910929050091611, 0.7891247598759914, 0.788184208134947, 0.7911363597939953, 0.7883815377657741, 0.7888646915740511, 0.7914355241941449, 0.7940863981504375, 0.792821982785037, 0.7876782042422437, 0.7913602377888093, 0.7914429207532656, 0.789715560035757, 0.7903442675610164, 0.7902016196351171, 0.7901795620391681, 0.790521917061326, 0.7928377004731685, 0.78785202338158, 0.7897204470480332, 0.7893806015727197, 0.7898286217251735, 0.7895774028778955, 0.7915032819589469, 0.790066104105513, 0.7882845900087279, 0.79210

In [149]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [147]:
column_to_drop_43 = 'Cat_현재 거주 지역'

In [148]:
if not column_to_drop_43.startswith('Cat_'):
    comp_44 = comp_43.drop(column_to_drop_43, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']
else:
    comp_44 = comp_43.drop(comp_43.filter(regex='^' + column_to_drop_43).columns, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']

print(X_44.shape)

(19949, 69)


In [150]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [151]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [152]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7807578777738197


In [153]:
optuna_44 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [154]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(auc_44)

0.7884696360681582


In [155]:
X_train = X_train.values
y_train = y_train.values

In [156]:
auc_bootstrap = []

In [157]:
rs = RandomState(seed = 44)
bootstrap_auc(optuna_44, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78425293, 0.79039412])

In [158]:
t_44 = auc_bootstrap
print(t_44)

[0.7890403598531677, 0.7889460537243789, 0.7874761196805533, 0.7878002474677352, 0.7892230284471664, 0.7879641605011063, 0.7878397398101832, 0.7882124735573013, 0.7870903098735611, 0.7882531546324649, 0.7848664551250759, 0.7889876593694327, 0.7900208001808987, 0.788198472927537, 0.7874211738127994, 0.7886690790015913, 0.7871548976844545, 0.7856636985331567, 0.7892062541077319, 0.7856589436022934, 0.7878707789422075, 0.787335452975847, 0.7840821556954563, 0.7870843662099819, 0.7894253771716826, 0.7873273960096621, 0.7869306234454018, 0.786483792025664, 0.7874866861935826, 0.7878003795491479, 0.7872749596887528, 0.7921973697835767, 0.7880000866454068, 0.7863247660045689, 0.784675861646305, 0.7855420515519038, 0.7884346344937478, 0.7892321420646544, 0.7877474149025873, 0.7854750862755789, 0.7863221243763115, 0.7895669684462787, 0.7860134501144354, 0.7865964574708417, 0.7881584522594375, 0.790212978636624, 0.7860820003677147, 0.7893272406819205, 0.7898465847973237, 0.7889052405678021, 0.78

In [159]:
column_to_drop_44 = 'Cat_가구주 성별'

In [160]:
if not column_to_drop_44.startswith('Cat_'):
    comp_45 = comp_44.drop(column_to_drop_44, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']
else:
    comp_45 = comp_44.drop(comp_44.filter(regex='^' + column_to_drop_44).columns, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']

print(X_45.shape)

(19949, 67)


In [161]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [163]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [164]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 65, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7839546405999818


In [165]:
optuna_45 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [166]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(auc_45)

0.7909853907390854


In [167]:
X_train = X_train.values
y_train = y_train.values

In [168]:
auc_bootstrap = []

In [169]:
rs = RandomState(seed = 45)
bootstrap_auc(optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78469629, 0.79099583])

In [170]:
t_45 = auc_bootstrap
print(t_45)

[0.7884879953845471, 0.7877545472988823, 0.7884206338639836, 0.7883925005230423, 0.7859051433558824, 0.7863472198447566, 0.7849960269911008, 0.7883021568366395, 0.7878057948870757, 0.7861059071034441, 0.7881165824515579, 0.7900004596433168, 0.7868845270323102, 0.7848749083354994, 0.7877894167918799, 0.7875915588354012, 0.7864803579089293, 0.7878324753324754, 0.7878046061543598, 0.7897393346900736, 0.7912747811146826, 0.7869418503654957, 0.790091463736784, 0.7830365992311806, 0.7859335408596493, 0.7877233760854452, 0.7892151035623942, 0.7891114196532916, 0.7896439719099817, 0.785929710498676, 0.7862786695914774, 0.7893100700982474, 0.7879084221448754, 0.7889694321344568, 0.7846894660318307, 0.7863918633623067, 0.7873505102569143, 0.7874578924455773, 0.7855631845779629, 0.7879369517300552, 0.7861230776871172, 0.786543492824281, 0.7865564368027422, 0.79112552911814, 0.790189600226546, 0.7866580074092389, 0.7891927818036193, 0.7888047266126084, 0.789902058990729, 0.7890542284015192, 0.7902

In [171]:
column_to_drop_45 = 'Cat_가구주 종사상 지위'

In [172]:
if not column_to_drop_45.startswith('Cat_'):
    comp_46 = comp_45.drop(column_to_drop_45, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']
else:
    comp_46 = comp_45.drop(comp_45.filter(regex='^' + column_to_drop_45).columns, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']

print(X_46.shape)

(19949, 62)


In [173]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [174]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [175]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [176]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7828473623662444


In [177]:
optuna_46 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [178]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(auc_46)

0.7910663566451744


In [179]:
X_train = X_train.values
y_train = y_train.values

In [180]:
auc_bootstrap = []

In [181]:
rs = RandomState(seed = 46)
bootstrap_auc(optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78450786, 0.79068539])

In [182]:
t_46 = auc_bootstrap
print(t_46)

[0.7850977296790105, 0.7849418736118243, 0.7842942784445248, 0.7898780201735867, 0.786739105396741, 0.7908482902325268, 0.7838784861568112, 0.7886365869740254, 0.7878731564076391, 0.7881496028047752, 0.7876539012622755, 0.7899741754421559, 0.7897273152815025, 0.7875508777602374, 0.788544790392081, 0.788754535675718, 0.78938152614261, 0.788647153487055, 0.7868397514333475, 0.7903513999573113, 0.7867990703581836, 0.7882383615142237, 0.7869484544361393, 0.7889584693771886, 0.7850386892874578, 0.7883902551390236, 0.7873262072769462, 0.7877764728134187, 0.7859817505753467, 0.7876611657399835, 0.7872953002263348, 0.7853798555768999, 0.7885548285794591, 0.7856179983643039, 0.788718081205766, 0.7870352319243945, 0.7865203785770287, 0.7883421775047391, 0.7900535563712904, 0.7869804181380537, 0.7859530889087539, 0.7883701787642674, 0.7901341260331407, 0.7876405610395758, 0.7889354872113492, 0.7870424964021023, 0.7853032483574356, 0.7885968304687516, 0.7860575653063339, 0.7881209411381824, 0.7887

In [183]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [184]:
column_to_drop_46 = '중기부채부담지표'

In [185]:
if not column_to_drop_46.startswith('Cat_'):
    comp_47 = comp_46.drop(column_to_drop_46, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']
else:
    comp_47 = comp_46.drop(comp_46.filter(regex='^' + column_to_drop_46).columns, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']

print(X_47.shape)

(19949, 61)


In [186]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [187]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [188]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 53, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 5}
0.7826568651044101


In [189]:
optuna_47 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [190]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(auc_47)

0.7872669027225677


In [191]:
X_train = X_train.values
y_train = y_train.values

In [192]:
auc_bootstrap = []

In [193]:
rs = RandomState(seed = 47)
bootstrap_auc(optuna_47, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78238384, 0.78897166])

In [194]:
t_47 = auc_bootstrap
print(t_47)

[0.7856058468743199, 0.7843867354335335, 0.7831648502830768, 0.7868048819403499, 0.7850192733197658, 0.784482494457864, 0.7892962015498962, 0.7854979363600052, 0.7828561760212006, 0.7874707043426256, 0.7852073572516922, 0.7848809840804916, 0.7852718129811725, 0.7846701821455516, 0.7888324637093109, 0.7879546506393798, 0.7856432259141619, 0.7850554636268922, 0.7870841020471562, 0.7841908586982479, 0.7867553514105238, 0.7868846591137231, 0.7859911283556604, 0.786726821825344, 0.7872486754875917, 0.7836406075322331, 0.7874582886898157, 0.7875691049952134, 0.782248073196349, 0.7887557244084337, 0.782750378809492, 0.783861579735964, 0.7856828503380228, 0.7866432142909976, 0.784537044081379, 0.7858828215971073, 0.7876943181746138, 0.7836592310114479, 0.7869220381535652, 0.7867852018098324, 0.7853574017367122, 0.7875934079751814, 0.7845098353103279, 0.7869676062410051, 0.7864320161118191, 0.7882100960918694, 0.7860723584245752, 0.7849991969450096, 0.7871772194432292, 0.7876956389887424, 0.784

In [195]:
column_to_drop_47 = '소득 대비 주택 임대료의 비율'

In [196]:
if not column_to_drop_47.startswith('Cat_'):
    comp_48 = comp_47.drop(column_to_drop_47, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']
else:
    comp_48 = comp_47.drop(comp_47.filter(regex='^' + column_to_drop_47).columns, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']

print(X_48.shape)

(19949, 60)


In [197]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [198]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [199]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [200]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 62, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 7}
0.7825606877219458


In [201]:
optuna_48 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [202]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(auc_48)

0.7883207803158543


In [203]:
X_train = X_train.values
y_train = y_train.values

In [204]:
auc_bootstrap = []

In [205]:
rs = RandomState(seed = 48)
bootstrap_auc(optuna_48, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78281146, 0.78860888])

In [206]:
t_48 = auc_bootstrap
print(t_48)

[0.7868970747665329, 0.7864605456969989, 0.784655653190136, 0.7835152622714199, 0.7852834361455051, 0.7831286599759506, 0.787132972169918, 0.7870644219166386, 0.7845748193654598, 0.7852394530350195, 0.7850051406085888, 0.7840910051501184, 0.789609234498397, 0.7858871802837321, 0.787530008897004, 0.7845527617695105, 0.7870604594742527, 0.7871934654570124, 0.7842549181834897, 0.7850965409462946, 0.7825121039406754, 0.7848038485353757, 0.7852444721287086, 0.7834641467646393, 0.7843584700111794, 0.7855131257224853, 0.7857891758753828, 0.7843553000572705, 0.7859316917198691, 0.7837549900357783, 0.7884551071127426, 0.7846951455325839, 0.7837831233767194, 0.7867589176086713, 0.7847450723066487, 0.7855012383953272, 0.784579706377736, 0.783611417539989, 0.7847837721606193, 0.7828136458062566, 0.7860250732787679, 0.7837384798591694, 0.7873089046118603, 0.7864361106356179, 0.7845054766237032, 0.7858268190780506, 0.7867211423245906, 0.7866720080390031, 0.785314871521768, 0.7861523997607742, 0.7870

In [207]:
column_to_drop_48 = 'Cat_현재 주택의 유형'

In [208]:
if not column_to_drop_48.startswith('Cat_'):
    comp_49 = comp_48.drop(column_to_drop_48, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']
else:
    comp_49 = comp_48.drop(comp_48.filter(regex='^' + column_to_drop_48).columns, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']

print(X_49.shape)

(19949, 49)


In [209]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [210]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [211]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [212]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7835187465404005


In [213]:
optuna_49 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)

In [214]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(auc_49)

0.7861527960050128


In [215]:
X_train = X_train.values
y_train = y_train.values

In [216]:
auc_bootstrap = []

In [217]:
rs = RandomState(seed = 49)
bootstrap_auc(optuna_49, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78033357, 0.78651375])

In [218]:
t_49 = auc_bootstrap
print(t_49)

[0.784235238052972, 0.7821248412381419, 0.7823244162529877, 0.7862999346989494, 0.7833562362503249, 0.7820445357391171, 0.7843706215011633, 0.7840272098277025, 0.7816745757016692, 0.7816324417309639, 0.7832524202598096, 0.7830334292772717, 0.784079381985786, 0.7851172777281151, 0.7835498676015917, 0.7834936009197092, 0.7840669663329762, 0.7834807890226609, 0.7826996595469502, 0.7866414972326301, 0.7826818285562127, 0.7841890095584677, 0.785802119853844, 0.7854488020744179, 0.7838482395132642, 0.7814534714165257, 0.780646718146718, 0.7829087444235228, 0.7816920104481679, 0.7851006354700936, 0.7823703805846663, 0.7850322172982271, 0.785125466775713, 0.7863674283009259, 0.7831722468421976, 0.7840005293823029, 0.7842189920391891, 0.7817048223452165, 0.7848700213232234, 0.7823286428581997, 0.7852327168829631, 0.7836404754508203, 0.785054671138415, 0.7831375094306128, 0.7842691829760795, 0.7839459797587877, 0.7867395016409795, 0.7833336503287243, 0.7832969316959465, 0.7847556388196784, 0.784

In [219]:
column_to_drop_49 = 'Cat_이사 계획 중인 주택의 점유형태' 

In [220]:
if not column_to_drop_49.startswith('Cat_'):
    comp_50 = comp_49.drop(column_to_drop_49, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']
else:
    comp_50 = comp_49.drop(comp_49.filter(regex='^' + column_to_drop_49).columns, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']

print(X_50.shape)

(19949, 25)


In [221]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc, comp_49, X_49, y_49

In [222]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [223]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [224]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7832923375520668


In [225]:
optuna_50 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [226]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(auc_50)

0.7851695819676114


In [227]:
X_train = X_train.values
y_train = y_train.values

In [228]:
auc_bootstrap = []

In [229]:
rs = RandomState(seed = 50)
bootstrap_auc(optuna_50, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77751231, 0.78452929])

In [230]:
t_50 = auc_bootstrap
print(t_50)

[0.7825778804842843, 0.7782482517704192, 0.7777929671402577, 0.781086417170161, 0.7834335038768536, 0.7841246859104002, 0.7814707740816114, 0.7776665652281416, 0.7814343196116595, 0.7805415813420739, 0.7809582981996775, 0.7844744374916789, 0.7818757356934697, 0.7823455492790468, 0.784261654335546, 0.7821780700475282, 0.778052903360785, 0.7806677190913645, 0.7828160232716883, 0.7802245859511869, 0.7805195237461248, 0.7825012732648201, 0.782392966506267, 0.779672485645392, 0.7821021232351282, 0.7811356835371613, 0.7801808670035271, 0.7804871637999717, 0.7779040476084812, 0.779991462257472, 0.7791684629738816, 0.7819710984735615, 0.7815183233902445, 0.781099493230035, 0.7797084117896925, 0.7804319537693922, 0.7808984653196476, 0.7801537903138889, 0.7802585308742943, 0.7827178867819262, 0.7818859059622606, 0.7818765281819469, 0.7806465860653052, 0.781748805455702, 0.7794266821360417, 0.7807798562108907, 0.7819850991033257, 0.780280192226005, 0.7792576179275688, 0.7799573852529518, 0.783013

In [231]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [232]:
column_to_drop_50 = '장기부채부담지표'

In [233]:
if not column_to_drop_50.startswith('Cat_'):
    comp_51 = comp_50.drop(column_to_drop_50, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']
else:
    comp_51 = comp_50.drop(comp_50.filter(regex='^' + column_to_drop_50).columns, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']

print(X_51.shape)

(19949, 24)


In [234]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [235]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [236]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7784240283298117


In [237]:
optuna_51 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [238]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(auc_51)

0.7813795058253186


In [239]:
X_train = X_train.values
y_train = y_train.values

In [240]:
auc_bootstrap = []

In [241]:
rs = RandomState(seed = 51)
bootstrap_auc(optuna_51, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77368798, 0.78025826])

In [242]:
t_51 = auc_bootstrap
print(t_51)

[0.7776429226552378, 0.7759110711696918, 0.7768777750304843, 0.7787345755326052, 0.7755409790508314, 0.7782399306414085, 0.775976715631888, 0.7766867853074748, 0.7734242423281831, 0.7773236818803322, 0.7772065256671169, 0.7756226053639848, 0.7772201300526425, 0.7757787255939965, 0.7793368667752905, 0.7795654997009678, 0.7746473162113556, 0.7779704845591545, 0.777121201074403, 0.7761299300708168, 0.7771717882555321, 0.7775785990071705, 0.7778247987607594, 0.7765961774582464, 0.7785186224225633, 0.7752176437521265, 0.7752455129302419, 0.7761220051860446, 0.781032792116536, 0.7752736462711831, 0.7766224616594075, 0.7761157973596396, 0.7766953705993114, 0.7784741109864263, 0.7771469569499125, 0.7741139714661882, 0.7760259819988884, 0.7785863801873654, 0.7768727559367953, 0.7756430779829795, 0.7783835031371976, 0.7791979171289516, 0.7774313282318208, 0.777687434091375, 0.7777681358346382, 0.7804124057202875, 0.77469446927575, 0.7775681645755537, 0.7744334764039198, 0.7758319544033829, 0.779

In [243]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [244]:
column_to_drop_51 = 'Cat_가구주 최종 학력' 

In [245]:
if not column_to_drop_51.startswith('Cat_'):
    comp_52 = comp_51.drop(column_to_drop_51, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']
else:
    comp_52 = comp_51.drop(comp_51.filter(regex='^' + column_to_drop_51).columns, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']

print(X_52.shape)

(19949, 21)


In [246]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [247]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [248]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7735856455463062


In [249]:
optuna_52 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [250]:
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(auc_52)

0.7768271878493553


In [251]:
X_train = X_train.values
y_train = y_train.values

In [252]:
auc_bootstrap = []

In [253]:
rs = RandomState(seed = 52)
bootstrap_auc(optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77031844, 0.77672338])

In [254]:
t_52 = auc_bootstrap
print(t_52)

[0.7751114502961792, 0.7761716677972836, 0.7718988340909522, 0.7735021703617762, 0.7716820884924334, 0.7731253420908594, 0.7733928069519203, 0.7743729831168255, 0.7736561772891822, 0.7741657473800331, 0.7756591919153496, 0.7766159896701769, 0.7733649377738048, 0.770799256328813, 0.7729470321834854, 0.7728301401330958, 0.7720569355421573, 0.7727551178905859, 0.773847827419256, 0.7728392537505837, 0.7756103217925878, 0.7713106755594441, 0.7726654346112475, 0.7722298301116035, 0.7755275067467187, 0.7738737153761786, 0.7719968384993016, 0.7719767621245452, 0.7744334764039198, 0.7731394748020365, 0.7748286639912255, 0.7726960774990332, 0.7722744736291535, 0.7766756904687939, 0.7742316560050551, 0.7729314465767667, 0.7728411028903639, 0.7770319140393032, 0.7743551521260882, 0.7711251011743623, 0.7724547647577205, 0.7716744277704869, 0.7710214172652596, 0.7760280953014943, 0.7738969617048435, 0.777014875537043, 0.7731446259771383, 0.772777307567948, 0.7736922355148957, 0.7698124760932643, 0.7

In [255]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [256]:
column_to_drop_52 = 'Cat_이사 계획 중인 거주 지역'

In [257]:
if not column_to_drop_52.startswith('Cat_'):
    comp_53 = comp_52.drop(column_to_drop_52, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']
else:
    comp_53 = comp_52.drop(comp_52.filter(regex='^' + column_to_drop_52).columns, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']

print(X_53.shape)

(19949, 14)


In [258]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [259]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [260]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7714278461178392


In [261]:
optuna_53 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [263]:
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(auc_53)

0.7701424154626125


In [264]:
X_train = X_train.values
y_train = y_train.values

In [265]:
auc_bootstrap = []

In [266]:
rs = RandomState(seed = 53)
bootstrap_auc(optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76258756, 0.77065637])

In [267]:
t_53 = auc_bootstrap
print(t_53)

[0.7691618430534687, 0.7680844549686915, 0.7665865196653374, 0.7678147447236118, 0.767297910155053, 0.7641234654781452, 0.767397235377531, 0.767025426200303, 0.7665846705255572, 0.7682858791233176, 0.7681857614123624, 0.7659693032229978, 0.7676599453077286, 0.7661098378462912, 0.7647908728573753, 0.7670205391880268, 0.769392325118926, 0.7691190486756989, 0.767880257104395, 0.7657324812497226, 0.7671893392336742, 0.7668247945341541, 0.7694983864934604, 0.764754814631662, 0.7684003937082755, 0.7665964257713026, 0.7660574015253818, 0.7679731103376423, 0.763046077393368, 0.7667949441348456, 0.7711933872648158, 0.7659343016485873, 0.7660354760108455, 0.7662536745049061, 0.7642950392334629, 0.7672062456545214, 0.762823256049857, 0.7649349736788161, 0.7657171598058297, 0.766437928075859, 0.765377710574755, 0.7641424852015984, 0.7655623603899466, 0.7664202291665346, 0.7644298943560026, 0.7638504531977439, 0.7657732944062994, 0.7643498530198036, 0.7685785715342366, 0.7665883688051176, 0.7619429

In [268]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [269]:
column_to_drop_53 = 'Cat_현재 주택의 점유형태'

In [270]:
if not column_to_drop_53.startswith('Cat_'):
    comp_54 = comp_53.drop(column_to_drop_53, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']
else:
    comp_54 = comp_53.drop(comp_53.filter(regex='^' + column_to_drop_53).columns, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']

print(X_54.shape)

(19949, 10)


In [271]:
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [272]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [273]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 99, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 5}
0.7662258055062171


In [274]:
optuna_54 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [275]:
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(auc_54)

0.7612252030355479


In [276]:
X_train = X_train.values
y_train = y_train.values

In [277]:
auc_bootstrap = []

In [278]:
rs = RandomState(seed = 54)
bootstrap_auc(optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75468618, 0.76208276])

In [279]:
t_54 = auc_bootstrap
print(t_54)

[0.7599705616946996, 0.7604308654185502, 0.7567566246753439, 0.7519920518688993, 0.7605865894043234, 0.7574923181450275, 0.7561462764664736, 0.7588977964593728, 0.7582712022367195, 0.7611579735963971, 0.7593674779635371, 0.7620388245388245, 0.7604509417933063, 0.7585002314066354, 0.7613247924208515, 0.7603134450425091, 0.7598627832617981, 0.7587353363215432, 0.7617710955149379, 0.7568284769639451, 0.7555690806922335, 0.7582277474518855, 0.758690032396929, 0.7566690546986113, 0.7598189322327252, 0.7615923893633253, 0.7568514591297844, 0.759454651696031, 0.7599416358652812, 0.7567307367184214, 0.7618638166667724, 0.7537287903667214, 0.7607913155942713, 0.7606748197881202, 0.7568662522480257, 0.7594267825179155, 0.7577760290198714, 0.7579231677138081, 0.7570407317944264, 0.7587614884412914, 0.7567448694295985, 0.7618527218280913, 0.754847255770901, 0.759511446703565, 0.7591696200070583, 0.759507220098353, 0.7588552662444288, 0.7578317673761024, 0.758938213371711, 0.7601579852195615, 0.760

In [280]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [281]:
column_to_drop_54 = '현재 주택의 면적(㎡)'

In [282]:
if not column_to_drop_54.startswith('Cat_'):
    comp_55 = comp_54.drop(column_to_drop_54, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']
else:
    comp_55 = comp_54.drop(comp_54.filter(regex='^' + column_to_drop_54).columns, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']

print(X_55.shape)

(19949, 9)


In [283]:
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [284]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = RandomForestClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [285]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7352387324894216


In [286]:
optuna_55 = RandomForestClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [287]:
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(auc_55)

0.7423720342439556


In [288]:
X_train = X_train.values
y_train = y_train.values

In [289]:
auc_bootstrap = []

In [290]:
rs = RandomState(seed = 55)
bootstrap_auc(optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74119704, 0.74239105])

In [291]:
t_55 = auc_bootstrap
print(t_55)

[0.7423720342439556, 0.7423720342439556, 0.74155524278677, 0.7423720342439556, 0.7423720342439556, 0.7422558026006303, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7414199914199915, 0.7423720342439556, 0.74155524278677, 0.7422367828771771, 0.7414199914199915, 0.7422367828771771, 0.7422367828771771, 0.7423720342439556, 0.7422367828771771, 0.74155524278677, 0.7423720342439556, 0.7422558026006303, 0.7423720342439556, 0.7414199914199915, 0.7422367828771771, 0.7422367828771771, 0.7414199914199915, 0.7423720342439556, 0.7423720342439556, 0.7423419196818213, 0.7409777828497042, 0.74155524278677, 0.7422066683150428, 0.74155524278677, 0.7423720342439556, 0.7422367828771771, 0.7424121869934679, 0.7423720342439556, 0.7423910539674088, 0.7423720342439556, 0.7423720342439556, 0.74155524278677, 0.74155524278677, 0.7423820724313337, 0.7415742625102232, 0.74155524278677, 0.7424121869934679, 0.7422066683150428, 0.7414199914199915, 0.7423720342439556, 0.74155524278677, 0.742236782877177

In [292]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc