In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000, rng=None, scores=None):
    """Bootstrap the test-set ROC AUC of ``clf`` and return a 95% percentile interval.

    For each of ``nsamples`` rounds the training rows are resampled with
    replacement, ``clf`` is refit on the resample (so the estimator passed in
    is mutated and ends up fitted on the last resample), and the ROC AUC of
    its positive-class probabilities on ``X_test`` is appended to ``scores``.

    Args:
        clf: estimator exposing ``fit`` and ``predict_proba``.
        X_train, y_train: training data; arrays or pandas objects (converted
            with ``np.asarray`` so positional row indexing works either way).
        X_test, y_test: held-out data used for scoring.
        nsamples: number of bootstrap rounds.
        rng: object with a ``randint(high, size=...)`` method, e.g.
            ``numpy.random.RandomState``. Defaults to the notebook-level ``rs``.
        scores: list that receives one AUC per round. Defaults to the
            notebook-level ``auc_bootstrap`` list.

    Returns:
        ``np.percentile(scores, (2.5, 97.5))`` computed over the *whole*
        ``scores`` list, including any values it already held on entry.
    """
    # Fall back to the notebook globals so existing cells that set `rs` and
    # `auc_bootstrap` before calling keep working unchanged.
    rng = rs if rng is None else rng
    scores = auc_bootstrap if scores is None else scores

    # Positional indexing below requires plain arrays; a DataFrame would treat
    # `X_train[idx]` as column selection.
    X_arr = np.asarray(X_train)
    y_arr = np.asarray(y_train)
    y_true = np.ravel(y_test)
    n_rows = X_arr.shape[0]

    for _ in range(nsamples):
        idx = rng.randint(n_rows, size=n_rows)
        clf.fit(X_arr[idx], y_arr[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_true, np.ravel(pred)))
    return np.percentile(scores, (2.5, 97.5))

In [4]:
import decimal
context = decimal.getcontext()

context.rounding = decimal.ROUND_HALF_UP

In [5]:
class EarlyStoppingCallback(object):
    """Optuna callback that stops a study once its best value stops improving.

    Args:
        early_stopping_rounds: number of consecutive callback invocations
            without a strictly better ``study.best_value`` after which
            ``study.stop()`` is called.
        direction: ``"minimize"`` or ``"maximize"``; selects whether a smaller
            or a larger best value counts as an improvement.

    Raises:
        ValueError: if ``direction`` is neither ``"minimize"`` nor ``"maximize"``.
    """

    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        # Consecutive invocations since the best value last improved.
        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            # The exception used to be constructed but never raised, which left
            # `_operator`/`_score` unset and deferred the failure to the first
            # callback invocation as an AttributeError.
            raise ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        """Update the stall counter from ``study.best_value``; stop when it hits the limit."""
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [6]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [7]:
# Load the cp949-encoded survey CSV and give the question-41 column
# (intention to move into public rental housing) the short label 'target'.
청년가구 = pd.read_csv('청년가구_변수추가.csv', encoding='cp949').rename(
    columns={'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?': 'target'}
)

In [8]:
# Relabel every column positionally: the list below replaces all existing
# labels in order, so it must list exactly one name per column of the frame.
# Labels for categorical features carry a 'Cat_' prefix (later cells branch on
# startswith('Cat_')); the last label is the prediction target.
청년가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [9]:
# Partition the frame by dtype: object columns go to `cat`, everything else to
# `num`. The target is pulled out separately and removed from the numeric
# features.
target = 청년가구['target']
cat = 청년가구.select_dtypes(include='object')
num = 청년가구.select_dtypes(exclude='object')
num_청년 = num.drop(columns='target')

In [10]:
# Scale the numeric features with RobustScaler.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split made in later cells, so test rows influence the scaling statistics.
scaler = RobustScaler()
num_scaled_청년 = scaler.fit_transform(num_청년)
# Keep the source row labels: the frame is later concatenated column-wise with
# `target`, which aligns on the index, so a fresh 0..n-1 index would only be
# correct when the source frame happens to use the default index.
num_df_scaled_청년 = pd.DataFrame(
    data=num_scaled_청년,
    columns=num_청년.columns,
    index=num_청년.index,
)

In [11]:
# One-hot encode every object-dtype column and densify the result.
enc = OneHotEncoder()
X_cat = enc.fit_transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
# Keep the source row labels so the later column-wise pd.concat (which aligns
# on the index) stays correct even if the frame does not use the default
# 0..n-1 index.
cat2 = pd.DataFrame(X_cat, columns=new_feature_names, index=cat.index)

In [12]:
comp = pd.concat([num_df_scaled_청년, target,cat2],axis=1)

In [13]:
X=comp.drop('target', axis = 1)
y=comp.target
X.shape

(8444, 213)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [16]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Draws ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and returns the ROC AUC of its positive-class probabilities on
    ``X_val``/``y_val``.
    """
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggest in the listed order so the parameters are requested from the
    # trial in the same sequence as before.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler; the callback stops the
# study after 10 consecutive trials without a better best value.
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on - confirm.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [17]:
print(study.best_trial.params)

{'max_depth': 4, 'min_samples_split': 7, 'min_samples_leaf': 3}


In [18]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.7595974082798916


In [19]:
optuna_0 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)

In [20]:
optuna_0.fit(X_train, y_train)

In [21]:
optuna_0_proba = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_0_proba)
print(decimal.Decimal(auc_0).quantize(decimal.Decimal('1.000')))

0.741


In [22]:
X_train = X_train.values
y_train = y_train.values

In [23]:
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

In [24]:
auc_bootstrap = []

In [25]:
rs = RandomState(seed = 2024)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71988098, 0.74487699])

In [26]:
np.mean(auc_bootstrap)

0.7336892311629117

In [27]:
t_0 = auc_bootstrap
print(t_0)

[0.733295585640608, 0.7356417390517553, 0.726833944082519, 0.7303853374954759, 0.7298099325913863, 0.7253473635993486, 0.7390142903999276, 0.7358983385360116, 0.726374468422005, 0.7366638956749909, 0.7361061629116902, 0.7309183292616721, 0.7356063947701774, 0.7338490770901195, 0.7274192453854506, 0.7480263753166847, 0.7425777008686211, 0.7380267711726385, 0.7315580607582338, 0.7376344496471228, 0.7368900990770901, 0.7369070643322475, 0.7409575190010858, 0.734039229325009, 0.7324657019091566, 0.7131981202497285, 0.7456738599348534, 0.7341403139703221, 0.7333797050307637, 0.745798271806008, 0.7365211047774158, 0.7299739300579081, 0.7332757928429243, 0.7406153863554108, 0.7365218116630474, 0.7286124683315236, 0.7421047943811074, 0.730526714621788, 0.729808518820123, 0.7360764737151646, 0.7349921111563518, 0.7420616743575823, 0.7305677139884184, 0.7281643028411148, 0.7302623393955845, 0.7348047864639883, 0.7339310758233804, 0.7430286939015562, 0.7338321118349619, 0.742643441232356, 0.73657

In [28]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [29]:
# 1.
column_to_drop = '현재 주택 거주 기간(총 개월)'

In [30]:
# Remove the selected feature from the previous feature table.
if column_to_drop.startswith('Cat_'):
    # A 'Cat_' label may have been expanded into several encoded columns that
    # share it as a prefix. Match the prefix literally instead of building a
    # regex from the raw label, which may contain regex metacharacters.
    drop_cols = [c for c in comp.columns if str(c).startswith(column_to_drop)]
else:
    drop_cols = [column_to_drop]

comp_1 = comp.drop(columns=drop_cols)
X_1 = comp_1.drop(columns='target')
y_1 = comp_1['target']

print(X_1.shape)

(8444, 212)


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [32]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Draws ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and returns the ROC AUC of its positive-class probabilities on
    ``X_val``/``y_val``.
    """
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggest in the listed order so the parameters are requested from the
    # trial in the same sequence as before.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler; the callback stops the
# study after 10 consecutive trials without a better best value.
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on - confirm.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [33]:
print(study.best_trial.params)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}


In [34]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.7598449122465847


In [35]:
optuna_1 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [36]:
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(decimal.Decimal(auc_1).quantize(decimal.Decimal('1.000')))

0.741


In [37]:
X_train = X_train.values
y_train = y_train.values

In [38]:
auc_bootstrap = []

In [39]:
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72032915, 0.74475899])

In [40]:
from scipy.stats import shapiro

In [41]:
shapiro(auc_bootstrap)

ShapiroResult(statistic=0.9942549467086792, pvalue=5.270025553727464e-07)

In [42]:
np.mean(auc_bootstrap)

0.733511958384229

In [43]:
t_1 = auc_bootstrap
print(t_1)

[0.740730608713355, 0.7328057138979371, 0.7257764431777054, 0.7266847912142599, 0.7353186923181324, 0.7301725649203763, 0.7435404790988057, 0.7368236518277235, 0.743575116494752, 0.7415392858758595, 0.723635993485342, 0.7152445541530944, 0.7365592766015201, 0.7367331704668838, 0.7426830268277235, 0.724768424267101, 0.7318104189287007, 0.7322324296507419, 0.7266282403637352, 0.7311742218602968, 0.7415025278230185, 0.7393783365001809, 0.7323730998914224, 0.7144514284744843, 0.7374344010133912, 0.7416071468964893, 0.7389534982356134, 0.7411137407256605, 0.7315658365001809, 0.746394176393413, 0.733849783975751, 0.7323844100615273, 0.7375050895765473, 0.7382996290264205, 0.7218765551483896, 0.7443746041440462, 0.7271301291621426, 0.7340243847267464, 0.7311388775787186, 0.7242177603601158, 0.736599569082519, 0.7299399995475931, 0.7393260269634453, 0.7278956863011219, 0.7345043600705755, 0.7392461488870793, 0.7242700698968512, 0.7318697973217517, 0.7370936821389793, 0.7428067318132465, 0.7383

In [44]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [45]:
#### 2. 
column_to_drop_1 = 'Cat_가구주 성별'

In [46]:
# Remove the selected feature from the previous feature table.
if column_to_drop_1.startswith('Cat_'):
    # A 'Cat_' label may have been expanded into several encoded columns that
    # share it as a prefix. Match the prefix literally instead of building a
    # regex from the raw label, which may contain regex metacharacters.
    drop_cols = [c for c in comp_1.columns if str(c).startswith(column_to_drop_1)]
else:
    drop_cols = [column_to_drop_1]

comp_2 = comp_1.drop(columns=drop_cols)
X_2 = comp_2.drop(columns='target')
y_2 = comp_2['target']

print(X_2.shape)

(8444, 210)


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [48]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Draws ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and returns the ROC AUC of its positive-class probabilities on
    ``X_val``/``y_val``.
    """
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggest in the listed order so the parameters are requested from the
    # trial in the same sequence as before.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler; the callback stops the
# study after 10 consecutive trials without a better best value.
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on - confirm.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [49]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7598449122465847


In [50]:
optuna_2 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [51]:
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(decimal.Decimal(auc_2).quantize(decimal.Decimal('1.000')))

0.741


In [52]:
X_train = X_train.values
y_train = y_train.values

In [53]:
auc_bootstrap = []

In [54]:
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72139434, 0.74496669])

In [55]:
shapiro(auc_bootstrap)

ShapiroResult(statistic=0.9943565130233765, pvalue=6.626692652389465e-07)

In [56]:
np.mean(auc_bootstrap)

0.7336341934010813

In [57]:
t_2 = auc_bootstrap
print(t_2)

[0.7458329092019544, 0.7375192272891786, 0.7336525628845458, 0.7303026318765834, 0.7213951660332971, 0.7358269430872241, 0.7270615612558813, 0.7397211760314876, 0.7297371233713354, 0.7294960753709736, 0.7300531012486428, 0.7305401454487876, 0.7335649090662324, 0.742866817091929, 0.7355222753800217, 0.720107220412595, 0.735932975931958, 0.7324699432229461, 0.7363917447068404, 0.7303379761581614, 0.7227651103872602, 0.7263518480817951, 0.7198965684943901, 0.7371431641331886, 0.7408069523615636, 0.7449026477108216, 0.7411398954940283, 0.7420659156713717, 0.736787600660514, 0.7407327293702498, 0.7383795071027868, 0.7349694908161419, 0.7096615714350344, 0.7328396444082519, 0.7387986902823018, 0.7369113056460369, 0.7237165784473398, 0.7355505508052842, 0.7217443675352877, 0.7372605071480275, 0.7346259443992038, 0.7405340945077814, 0.7257078752714441, 0.7413307546145494, 0.7201149961545422, 0.7328382306369887, 0.7341572792254796, 0.7331195711183496, 0.7346471509681506, 0.7286930532935214, 0.7

In [58]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [59]:
#### 3.
column_to_drop_2 = 'Cat_기초생활보장 수급가구 여부'

In [60]:
# Remove the selected feature from the previous feature table.
if column_to_drop_2.startswith('Cat_'):
    # A 'Cat_' label may have been expanded into several encoded columns that
    # share it as a prefix. Match the prefix literally instead of building a
    # regex from the raw label, which may contain regex metacharacters.
    drop_cols = [c for c in comp_2.columns if str(c).startswith(column_to_drop_2)]
else:
    drop_cols = [column_to_drop_2]

comp_3 = comp_2.drop(columns=drop_cols)
X_3 = comp_3.drop(columns='target')
y_3 = comp_3['target']

print(X_3.shape)

(8444, 208)


In [61]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [62]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Draws ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and returns the ROC AUC of its positive-class probabilities on
    ``X_val``/``y_val``.
    """
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggest in the listed order so the parameters are requested from the
    # trial in the same sequence as before.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler; the callback stops the
# study after 10 consecutive trials without a better best value.
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on - confirm.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [63]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 7, 'min_samples_leaf': 6}
0.7598449122465847


In [64]:
optuna_3 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [65]:
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(decimal.Decimal(auc_3).quantize(decimal.Decimal('1.000')))

0.741


In [66]:
X_train = X_train.values
y_train = y_train.values

In [67]:
auc_bootstrap = []

In [68]:
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7200737 , 0.74506677])

In [69]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9926771521568298, pvalue=1.9145163676625998e-08),
 0.7338251306324646)

In [70]:
t_3 = auc_bootstrap
print(t_3)

[0.7475732616268549, 0.7311346362649295, 0.7306023513843649, 0.7398095367354325, 0.7405517666485704, 0.7367960832880928, 0.7368490997104596, 0.7352621414676076, 0.7419436244571118, 0.7328495408070937, 0.7357230308993847, 0.7440989187477379, 0.7372880756876583, 0.7413357028139703, 0.7338024226384364, 0.7501618768096272, 0.7348627510857763, 0.7395585923362288, 0.7329265913409337, 0.7317694195620703, 0.7363401420557365, 0.7252095209011943, 0.7413074273887079, 0.7391924255790807, 0.7408762271534564, 0.7373587642508144, 0.7184743146036916, 0.7403114255338401, 0.7265617931143685, 0.7285792447068404, 0.733565615951864, 0.7338632148027506, 0.7337161825913863, 0.7221381028320665, 0.7153682591386175, 0.7241654508233804, 0.7325922344372058, 0.7357894781487514, 0.7474120917028592, 0.7354374491042346, 0.7423055499004705, 0.7292034247195078, 0.7313877013210278, 0.7447464259862467, 0.725889544878755, 0.7349977662414042, 0.7272241449511401, 0.730663143548679, 0.7332319659337676, 0.7365889657980456, 0.

In [71]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [72]:
### 4. 
column_to_drop_3 = 'Cat_현재 주택의 구조'

In [73]:
# Remove the selected feature from the previous feature table.
if column_to_drop_3.startswith('Cat_'):
    # A 'Cat_' label may have been expanded into several encoded columns that
    # share it as a prefix. Match the prefix literally instead of building a
    # regex from the raw label, which may contain regex metacharacters.
    drop_cols = [c for c in comp_3.columns if str(c).startswith(column_to_drop_3)]
else:
    drop_cols = [column_to_drop_3]

comp_4 = comp_3.drop(columns=drop_cols)
X_4 = comp_4.drop(columns='target')
y_4 = comp_4['target']

print(X_4.shape)

(8444, 206)


In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [75]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Draws ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and returns the ROC AUC of its positive-class probabilities on
    ``X_val``/``y_val``.
    """
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggest in the listed order so the parameters are requested from the
    # trial in the same sequence as before.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler; the callback stops the
# study after 10 consecutive trials without a better best value.
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on - confirm.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [76]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7566826071007121


In [77]:
optuna_4 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [78]:
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(decimal.Decimal(auc_4).quantize(decimal.Decimal('1.000')))

0.733


In [79]:
X_train = X_train.values
y_train = y_train.values

In [80]:
auc_bootstrap = []

In [81]:
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69271735, 0.73089827])

In [82]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989854693412781, pvalue=0.31897687911987305),
 0.7120449939773343)

In [83]:
t_4 = auc_bootstrap
print(t_4)

[0.7062204521806007, 0.7210897914404633, 0.715463688698878, 0.7162384353510676, 0.7210636366720955, 0.7315375610749186, 0.7081870080076005, 0.7086896036916394, 0.7237731292978645, 0.7090819252171553, 0.7062593308903367, 0.7118550375497649, 0.7085333819670647, 0.72123328922367, 0.70763846475751, 0.7150430917480999, 0.7227799549855229, 0.7202238565418023, 0.7150020923814694, 0.7122148423362287, 0.7097654836228737, 0.7053234143141514, 0.7150494537187838, 0.702339650063337, 0.723311532980456, 0.7241428304831703, 0.7136618372240319, 0.7165480512576908, 0.7211795659156714, 0.7231885348805647, 0.7276108113916033, 0.7234995645584509, 0.7079311154089758, 0.7066014635360117, 0.7076469473850886, 0.7268671677072024, 0.7262988316594281, 0.7087051551755338, 0.7140916236880204, 0.7141587778230184, 0.7018405888074557, 0.7268657539359391, 0.7179321333242853, 0.7072235228917844, 0.6960384715436121, 0.7159012509048135, 0.7188524984165763, 0.727607983849077, 0.6960328164585595, 0.7272432308631922, 0.72655

In [84]:
## 5. Drop 'Cat_가구주 장애 여부' (householder disability status)
column_to_drop_4 = 'Cat_가구주 장애 여부'

In [85]:
# Remove the selected feature from the previous feature table.
if column_to_drop_4.startswith('Cat_'):
    # A 'Cat_' label may have been expanded into several encoded columns that
    # share it as a prefix. Match the prefix literally instead of building a
    # regex from the raw label, which may contain regex metacharacters.
    drop_cols = [c for c in comp_4.columns if str(c).startswith(column_to_drop_4)]
else:
    drop_cols = [column_to_drop_4]

comp_5 = comp_4.drop(columns=drop_cols)
X_5 = comp_5.drop(columns='target')
y_5 = comp_5['target']

print(X_5.shape)

(8444, 204)


In [86]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [87]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Draws ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and returns the ROC AUC of its positive-class probabilities on
    ``X_val``/``y_val``.
    """
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggest in the listed order so the parameters are requested from the
    # trial in the same sequence as before.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler; the callback stops the
# study after 10 consecutive trials without a better best value.
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on - confirm.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [88]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7559168917037554


In [89]:
optuna_5 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [90]:
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(decimal.Decimal(auc_5).quantize(decimal.Decimal('1.000')))

0.729


In [91]:
X_train = X_train.values
y_train = y_train.values

In [92]:
auc_bootstrap = []

In [93]:
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69244949, 0.73070527])

In [94]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9980096220970154, pvalue=0.014448744244873524),
 0.7119285628449602)

In [95]:
t_5 = auc_bootstrap
print(t_5)

[0.7122028252804923, 0.7122431177614912, 0.7233744458016649, 0.7232309480184581, 0.7147886129207384, 0.7162398491223307, 0.713656889024611, 0.7288754297864639, 0.7036120442001448, 0.7130814841205211, 0.715502567408614, 0.7157952180600795, 0.7134398751357218, 0.7000938744118711, 0.7123109787821208, 0.6991918883460005, 0.7171870758686212, 0.7167862717155267, 0.7169693550941008, 0.7087857401375318, 0.7209986031939921, 0.7085256062251175, 0.7186361914133189, 0.7111580483170468, 0.7121462744299673, 0.7225155797593197, 0.6977689275696706, 0.7001044776963445, 0.7037251459011943, 0.7182827485975389, 0.7052866562613102, 0.7304228024339486, 0.7058924572475571, 0.707007215888527, 0.7027432817589576, 0.6977321695168295, 0.7237575778139704, 0.7135996312884545, 0.7161507815327542, 0.6976515845548317, 0.6995580551031488, 0.7227177490499458, 0.6988101701049584, 0.7289425839214622, 0.6871225230727469, 0.7086429492399566, 0.725312726203402, 0.6938570224846182, 0.7054944806369887, 0.7195714011038725, 0.7

In [96]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [97]:
## 6
column_to_drop_5 = 'Cat_가구주 동거 여부'

In [98]:
# Remove the selected feature from the previous feature table.
if column_to_drop_5.startswith('Cat_'):
    # A 'Cat_' label may have been expanded into several encoded columns that
    # share it as a prefix. Match the prefix literally instead of building a
    # regex from the raw label, which may contain regex metacharacters.
    drop_cols = [c for c in comp_5.columns if str(c).startswith(column_to_drop_5)]
else:
    drop_cols = [column_to_drop_5]

comp_6 = comp_5.drop(columns=drop_cols)
X_6 = comp_6.drop(columns='target')
y_6 = comp_6['target']

print(X_6.shape)

(8444, 203)


In [99]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [100]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Draws ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and returns the ROC AUC of its positive-class probabilities on
    ``X_val``/``y_val``.
    """
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggest in the listed order so the parameters are requested from the
    # trial in the same sequence as before.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler; the callback stops the
# study after 10 consecutive trials without a better best value.
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on - confirm.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [101]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7550981397425075


In [102]:
optuna_6 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [103]:
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(decimal.Decimal(auc_6).quantize(decimal.Decimal('1.000')))

0.731


In [104]:
X_train = X_train.values
y_train = y_train.values

In [105]:
auc_bootstrap = []

In [106]:
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69309988, 0.73068564])

In [107]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9994840025901794, pvalue=0.8941740989685059),
 0.7117371573018458)

In [108]:
t_6 = auc_bootstrap
print(t_6)

[0.7148246640879479, 0.7053474484256242, 0.7103062511310171, 0.7017084011943537, 0.710955172140789, 0.720478335369164, 0.7090119435396308, 0.7169120973579441, 0.7194802128574014, 0.7095180736518277, 0.705853578537821, 0.7207010043431052, 0.7025425262395947, 0.6998860500361925, 0.7161536090752805, 0.7024159937115455, 0.6997884998190372, 0.7110074816775245, 0.7339233000814332, 0.7085065203130656, 0.7142245181867535, 0.709763362965979, 0.7066551868440101, 0.7195827112739777, 0.7140619344914948, 0.7138647134002896, 0.7005123507057546, 0.7313615465526602, 0.7121137576909158, 0.7219627951954397, 0.7130185712993123, 0.7080173554560261, 0.6988342042164314, 0.7133076875226203, 0.7087044482899023, 0.718885015155628, 0.7136158896579804, 0.7123816673452767, 0.7125336477560623, 0.6989451852605862, 0.7167898061436845, 0.7121434468874411, 0.7311452395494028, 0.7151915377307276, 0.7151194353963083, 0.7066573075009047, 0.7006205042073833, 0.714990075325733, 0.718933083378574, 0.7068488735070576, 0.7200

In [109]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [110]:
## 7 .
column_to_drop_6 = 'Cat_가구주 최종 학력'

In [111]:
# Remove the selected feature from the previous feature table.
if column_to_drop_6.startswith('Cat_'):
    # A 'Cat_' label may have been expanded into several encoded columns that
    # share it as a prefix. Match the prefix literally instead of building a
    # regex from the raw label, which may contain regex metacharacters.
    drop_cols = [c for c in comp_6.columns if str(c).startswith(column_to_drop_6)]
else:
    drop_cols = [column_to_drop_6]

comp_7 = comp_6.drop(columns=drop_cols)
X_7 = comp_7.drop(columns='target')
y_7 = comp_7['target']

print(X_7.shape)

(8444, 200)


In [112]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [113]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Draws ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and returns the ROC AUC of its positive-class probabilities on
    ``X_val``/``y_val``.
    """
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Suggest in the listed order so the parameters are requested from the
    # trial in the same sequence as before.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler; the callback stops the
# study after 10 consecutive trials without a better best value.
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on - confirm.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [114]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7651894510273625


In [115]:
optuna_7 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [116]:
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(decimal.Decimal(auc_7).quantize(decimal.Decimal('1.000')))

0.727


In [117]:
X_train = X_train.values
y_train = y_train.values

In [118]:
auc_bootstrap = []

In [119]:
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69252755, 0.73115719])

In [120]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9995228052139282, pvalue=0.9261836409568787),
 0.7120343549951366)

In [121]:
t_7 = auc_bootstrap
print(t_7)

[0.7097251911418748, 0.7232804300126674, 0.7111128076366268, 0.7274383312975027, 0.7230549334962, 0.6984609686029678, 0.7012305465074194, 0.717469830121245, 0.6934215809355772, 0.7232578096724575, 0.6964364481541802, 0.713622958514296, 0.7090812183315237, 0.7029998812432139, 0.7180897688201232, 0.7151512452497285, 0.7194307308631922, 0.7121349642598624, 0.7031066209735795, 0.7215337156170829, 0.7149229211907348, 0.7211986518277236, 0.7079897869163952, 0.7302262882283749, 0.7128616426891061, 0.7071796959826275, 0.7094799018277235, 0.7024301314241765, 0.7025799911780674, 0.7087440338852696, 0.7200824794154903, 0.7047501300669562, 0.7111948063698877, 0.7055248767191458, 0.7072468501176258, 0.7254442069308723, 0.7063915185034384, 0.7187499999999999, 0.7193133878483531, 0.7069025968150561, 0.693254755926529, 0.7168435294516828, 0.7412593591657618, 0.7179667707202316, 0.7190532539359392, 0.7157330121245025, 0.7126311979732176, 0.7121321367173363, 0.723139759771987, 0.7097287255700326, 0.7091

In [122]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [123]:
## 8 .
column_to_drop_7 = 'Cat_가구주 주민등록상 등재 여부'

In [124]:
# Remove the selected feature from the previous feature table.
if column_to_drop_7.startswith('Cat_'):
    # A 'Cat_' label may have been expanded into several encoded columns that
    # share it as a prefix. Match the prefix literally instead of building a
    # regex from the raw label, which may contain regex metacharacters.
    drop_cols = [c for c in comp_7.columns if str(c).startswith(column_to_drop_7)]
else:
    drop_cols = [column_to_drop_7]

comp_8 = comp_7.drop(columns=drop_cols)
X_8 = comp_8.drop(columns='target')
y_8 = comp_8['target']

print(X_8.shape)

(8444, 198)


In [125]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [126]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Reads the notebook-level X_train, y_train, X_val and y_val that are in
    scope when the study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the three tree hyperparameters.

    Returns
    -------
    float
        ROC AUC computed on the validation split from column 1 of predict_proba.
    """
    # Draw order is kept fixed (depth, split, leaf) so a seeded sampler
    # produces the same sequence of suggestions.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in tree.classes_.
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search configuration: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed seed so the TPE sampler proposes a repeatable sequence of hyperparameters.
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks like it has nothing to act on here - confirm it is intended.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience count (trials without improvement) - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials; the callback presumably stops the study sooner.
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [127]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7682059056214338


In [128]:
optuna_8 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [129]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(decimal.Decimal(auc_8).quantize(decimal.Decimal('1.000')))

0.729


In [130]:
# Convert the training split to NumPy arrays so bootstrap_auc can index rows
# positionally (X_train[idx]). np.asarray replaces `.values` so that re-running
# this cell on already-converted arrays is a no-op instead of an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [131]:
auc_bootstrap = []

In [132]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and appends
# each AUC to the notebook-level `auc_bootstrap`, so both must be set before the call.
rs = RandomState(seed = 8)
# bootstrap_auc refits the estimator it receives on every resample. Pass an unfitted
# copy with the same hyperparameters so optuna_8 keeps its fit on the training split.
bootstrap_auc(sklearn.clone(optuna_8), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.6923478 , 0.73006517])

In [133]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9969931244850159, pvalue=0.0006095560383982956),
 0.7119591572085369)

In [134]:
t_8 = auc_bootstrap
print(t_8)

[0.7305691277596815, 0.7093321627307274, 0.7282038884364822, 0.7105430578175896, 0.7159973873507058, 0.6969553022077452, 0.7099344292888164, 0.7185775199058995, 0.7097053983441911, 0.731241375995295, 0.714903128393051, 0.707903546869345, 0.7204302671462179, 0.7213555804379299, 0.717741274203764, 0.7002458548226564, 0.705227277868259, 0.7053743100796235, 0.7262175398117987, 0.7082916270810713, 0.7112068234256244, 0.7059702146670286, 0.7042143107582337, 0.7235370294969234, 0.7081120781306549, 0.6976890494933045, 0.7236119593738689, 0.7076285683586683, 0.7132136717336229, 0.6998097063879839, 0.6976134127307275, 0.7097223635993486, 0.7063901047321752, 0.7223600649203764, 0.7077303598896127, 0.7140272970955484, 0.7327180600796236, 0.6947964734889613, 0.7142796552660152, 0.7087447407709011, 0.7066219632193268, 0.6978339610477743, 0.717840945077814, 0.6939206421914584, 0.7211534111473037, 0.732604958378574, 0.7041238293973942, 0.7079883731451321, 0.7124742693630112, 0.697271986970684, 0.70116

In [135]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [136]:
#9.
column_to_drop_8 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [137]:
# Elimination step: drop the feature named by column_to_drop_8 from comp_8.
# A name starting with 'Cat_' is treated as a prefix and every column that begins
# with it is removed; any other name is removed as a single column.
if column_to_drop_8.startswith('Cat_'):
    # Literal prefix match. The earlier filter(regex='^' + name) compiled the column
    # name as a regular expression, so a name containing characters such as '(' or
    # '+' could select the wrong columns or fail to compile.
    dropped_cols_9 = [col for col in comp_8.columns if str(col).startswith(column_to_drop_8)]
else:
    dropped_cols_9 = [column_to_drop_8]

comp_9 = comp_8.drop(columns=dropped_cols_9)
# Separate the predictors from the 'target' label column.
X_9 = comp_9.drop(columns='target')
y_9 = comp_9['target']

print(X_9.shape)

(8444, 194)


In [138]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [139]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Reads the notebook-level X_train, y_train, X_val and y_val that are in
    scope when the study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the three tree hyperparameters.

    Returns
    -------
    float
        ROC AUC computed on the validation split from column 1 of predict_proba.
    """
    # Draw order is kept fixed (depth, split, leaf) so a seeded sampler
    # produces the same sequence of suggestions.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in tree.classes_.
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search configuration: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed seed so the TPE sampler proposes a repeatable sequence of hyperparameters.
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks like it has nothing to act on here - confirm it is intended.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience count (trials without improvement) - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials; the callback presumably stops the study sooner.
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [140]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7680269072169503


In [141]:
optuna_9 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [142]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(decimal.Decimal(auc_9).quantize(decimal.Decimal('1.000')))

0.730


In [143]:
# Convert the training split to NumPy arrays so bootstrap_auc can index rows
# positionally (X_train[idx]). np.asarray replaces `.values` so that re-running
# this cell on already-converted arrays is a no-op instead of an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [144]:
auc_bootstrap = []

In [145]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and appends
# each AUC to the notebook-level `auc_bootstrap`, so both must be set before the call.
rs = RandomState(seed = 9)
# bootstrap_auc refits the estimator it receives on every resample. Pass an unfitted
# copy with the same hyperparameters so optuna_9 keeps its fit on the training split.
bootstrap_auc(sklearn.clone(optuna_9), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69203738, 0.73113034])

In [146]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9977496266365051, pvalue=0.006260082125663757),
 0.7121802353363645)

In [147]:
t_9 = auc_bootstrap
print(t_9)

[0.7213633561798769, 0.7183873676710097, 0.7190172027687296, 0.7154665162414044, 0.7231948968512487, 0.7114478714259862, 0.7104419731722765, 0.7114280786283026, 0.7032324466159972, 0.702227255247919, 0.6995347278773072, 0.7225360794426348, 0.7217592121335505, 0.718968427660152, 0.7200711692453855, 0.7125527336681143, 0.7228251956659427, 0.6973949850705755, 0.7161140234799132, 0.7207971407889975, 0.7024880960459645, 0.7235327881831343, 0.7173659179334057, 0.7337494062160694, 0.7158659066232357, 0.7119603635088673, 0.7018957258867173, 0.716962286237785, 0.7146953040173725, 0.7245747376040536, 0.717098715164676, 0.7113757690915672, 0.7278808417028592, 0.7078335651918205, 0.7142033116178067, 0.7113722346634094, 0.70890732446616, 0.713186103193992, 0.7060670579985522, 0.7344774984165762, 0.7111792548859934, 0.7061900560984439, 0.7155994107401374, 0.7134306856225118, 0.7161062477379659, 0.7001475977198697, 0.701121686120159, 0.7113086149565689, 0.7220638798407528, 0.7177935837404996, 0.70043

In [148]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [149]:
# 10.
column_to_drop_9 = 'Cat_현재 주택의 위치'

In [150]:
# Elimination step: drop the feature named by column_to_drop_9 from comp_9.
# A name starting with 'Cat_' is treated as a prefix and every column that begins
# with it is removed; any other name is removed as a single column.
if column_to_drop_9.startswith('Cat_'):
    # Literal prefix match. The earlier filter(regex='^' + name) compiled the column
    # name as a regular expression, so a name containing characters such as '(' or
    # '+' could select the wrong columns or fail to compile.
    dropped_cols_10 = [col for col in comp_9.columns if str(col).startswith(column_to_drop_9)]
else:
    dropped_cols_10 = [column_to_drop_9]

comp_10 = comp_9.drop(columns=dropped_cols_10)
# Separate the predictors from the 'target' label column.
X_10 = comp_10.drop(columns='target')
y_10 = comp_10['target']

print(X_10.shape)

(8444, 190)


In [151]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [152]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Reads the notebook-level X_train, y_train, X_val and y_val that are in
    scope when the study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the three tree hyperparameters.

    Returns
    -------
    float
        ROC AUC computed on the validation split from column 1 of predict_proba.
    """
    # Draw order is kept fixed (depth, split, leaf) so a seeded sampler
    # produces the same sequence of suggestions.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in tree.classes_.
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search configuration: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed seed so the TPE sampler proposes a repeatable sequence of hyperparameters.
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks like it has nothing to act on here - confirm it is intended.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience count (trials without improvement) - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials; the callback presumably stops the study sooner.
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [153]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7680269072169503


In [154]:
optuna_10 = DecisionTreeClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [155]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(decimal.Decimal(auc_10).quantize(decimal.Decimal('1.000')))

0.731


In [156]:
# Convert the training split to NumPy arrays so bootstrap_auc can index rows
# positionally (X_train[idx]). np.asarray replaces `.values` so that re-running
# this cell on already-converted arrays is a no-op instead of an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [157]:
auc_bootstrap = []

In [158]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and appends
# each AUC to the notebook-level `auc_bootstrap`, so both must be set before the call.
rs = RandomState(seed = 10)
# bootstrap_auc refits the estimator it receives on every resample. Pass an unfitted
# copy with the same hyperparameters so optuna_10 keeps its fit on the training split.
bootstrap_auc(sklearn.clone(optuna_10), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69366395, 0.73088742])

In [159]:
from scipy.stats import shapiro

In [160]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992727041244507, pvalue=0.6465475559234619),
 0.7122039562975027)

In [161]:
t_10 = auc_bootstrap
print(t_10)

[0.7127782301845821, 0.7268678745928339, 0.7093689207835685, 0.7236176144589215, 0.71788901330076, 0.719397507238509, 0.7158270279134996, 0.7019961036463989, 0.7275881910513935, 0.7157867354325008, 0.7080032177433947, 0.7196795546055013, 0.7049487649294246, 0.7125499061255881, 0.7084457281487513, 0.6951046756243213, 0.6984355207202315, 0.7054216714169381, 0.7110124298769454, 0.7126757317680058, 0.7130694670647846, 0.7068283738237424, 0.7170336816865726, 0.7231390528863554, 0.7194498167752442, 0.7129238486246834, 0.7227587484165762, 0.7262210742399566, 0.7201658919200142, 0.7225735443811074, 0.7264175884455303, 0.7236218557727108, 0.7031928610206296, 0.718054424538545, 0.7196993474031849, 0.7074468987513571, 0.7071302139884184, 0.7085305544245385, 0.7148911113373145, 0.7090501153637349, 0.7136788024791894, 0.7222985658704306, 0.706393639160333, 0.7076731021534564, 0.7152742433496199, 0.7232790162414042, 0.7107763300760044, 0.7072072645222585, 0.6991374581523706, 0.7286888119797322, 0.71

In [162]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [163]:
# 11.
column_to_drop_10 = 'Cat_현재 주택의 점유형태'

In [164]:
# Elimination step: drop the feature named by column_to_drop_10 from comp_10.
# A name starting with 'Cat_' is treated as a prefix and every column that begins
# with it is removed; any other name is removed as a single column.
if column_to_drop_10.startswith('Cat_'):
    # Literal prefix match. The earlier filter(regex='^' + name) compiled the column
    # name as a regular expression, so a name containing characters such as '(' or
    # '+' could select the wrong columns or fail to compile.
    dropped_cols_11 = [col for col in comp_10.columns if str(col).startswith(column_to_drop_10)]
else:
    dropped_cols_11 = [column_to_drop_10]

comp_11 = comp_10.drop(columns=dropped_cols_11)
# Separate the predictors from the 'target' label column.
X_11 = comp_11.drop(columns='target')
y_11 = comp_11['target']

print(X_11.shape)

(8444, 186)


In [165]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [166]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Reads the notebook-level X_train, y_train, X_val and y_val that are in
    scope when the study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the three tree hyperparameters.

    Returns
    -------
    float
        ROC AUC computed on the validation split from column 1 of predict_proba.
    """
    # Draw order is kept fixed (depth, split, leaf) so a seeded sampler
    # produces the same sequence of suggestions.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in tree.classes_.
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search configuration: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed seed so the TPE sampler proposes a repeatable sequence of hyperparameters.
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks like it has nothing to act on here - confirm it is intended.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience count (trials without improvement) - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials; the callback presumably stops the study sooner.
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [167]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7671042919839652


In [168]:
optuna_11 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [169]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(decimal.Decimal(auc_11).quantize(decimal.Decimal('1.000')))

0.728


In [170]:
# Convert the training split to NumPy arrays so bootstrap_auc can index rows
# positionally (X_train[idx]). np.asarray replaces `.values` so that re-running
# this cell on already-converted arrays is a no-op instead of an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [171]:
auc_bootstrap = []

In [172]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and appends
# each AUC to the notebook-level `auc_bootstrap`, so both must be set before the call.
rs = RandomState(seed = 11)
# bootstrap_auc refits the estimator it receives on every resample. Pass an unfitted
# copy with the same hyperparameters so optuna_11 keeps its fit on the training split.
bootstrap_auc(sklearn.clone(optuna_11), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69260311, 0.73208748])

In [173]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9980385899543762, pvalue=0.01587357185781002),
 0.7125075184355772)

In [174]:
t_11 = auc_bootstrap
print(t_11)

[0.7134441164495116, 0.7027807466974303, 0.7357484787821208, 0.70454796077633, 0.709177354777416, 0.6962752782301845, 0.7096750022620341, 0.7204500599439015, 0.6974225536102062, 0.6998160683586682, 0.6866482028139703, 0.728830189106044, 0.7078943573561347, 0.7125562680962723, 0.7183894883279045, 0.7114994740770901, 0.7100213762214983, 0.6951280028501629, 0.707733187432139, 0.7167495136626855, 0.7163161927705393, 0.7056089961093014, 0.7123611676619618, 0.7255763945439739, 0.6820188088128845, 0.7025114232718059, 0.7127810577271081, 0.712240290218965, 0.7131790343376764, 0.7346662368802026, 0.7310978782120883, 0.7233808077723489, 0.7085468127940644, 0.7113708208921462, 0.707421450868621, 0.7192031136898299, 0.713374134771987, 0.7106490906623235, 0.6954453944987331, 0.7139764013300759, 0.7109191209735795, 0.7023361156351792, 0.7106809005157437, 0.7086733453221137, 0.7015217833876222, 0.7004649893684401, 0.7106285909790083, 0.722739662504524, 0.7139113678519726, 0.7131726723669924, 0.700606

In [175]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [176]:
# 12
column_to_drop_11 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [177]:
# Elimination step: drop the feature named by column_to_drop_11 from comp_11.
# A name starting with 'Cat_' is treated as a prefix and every column that begins
# with it is removed; any other name is removed as a single column.
if column_to_drop_11.startswith('Cat_'):
    # Literal prefix match. The earlier filter(regex='^' + name) compiled the column
    # name as a regular expression, so a name containing characters such as '(' or
    # '+' could select the wrong columns or fail to compile.
    dropped_cols_12 = [col for col in comp_11.columns if str(col).startswith(column_to_drop_11)]
else:
    dropped_cols_12 = [column_to_drop_11]

comp_12 = comp_11.drop(columns=dropped_cols_12)
# Separate the predictors from the 'target' label column.
X_12 = comp_12.drop(columns='target')
y_12 = comp_12['target']

print(X_12.shape)

(8444, 182)


In [178]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [179]:
X_train.shape

(5404, 182)

In [180]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Reads the notebook-level X_train, y_train, X_val and y_val that are in
    scope when the study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the three tree hyperparameters.

    Returns
    -------
    float
        ROC AUC computed on the validation split from column 1 of predict_proba.
    """
    # Draw order is kept fixed (depth, split, leaf) so a seeded sampler
    # produces the same sequence of suggestions.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in tree.classes_.
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search configuration: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed seed so the TPE sampler proposes a repeatable sequence of hyperparameters.
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks like it has nothing to act on here - confirm it is intended.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience count (trials without improvement) - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials; the callback presumably stops the study sooner.
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [181]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.766249077384767


In [182]:
optuna_12 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [183]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(decimal.Decimal(auc_12).quantize(decimal.Decimal('1.000')))

0.727


In [184]:
# Convert the training split to NumPy arrays so bootstrap_auc can index rows
# positionally (X_train[idx]). np.asarray replaces `.values` so that re-running
# this cell on already-converted arrays is a no-op instead of an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [185]:
auc_bootstrap = []

In [186]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and appends
# each AUC to the notebook-level `auc_bootstrap`, so both must be set before the call.
rs = RandomState(seed = 12)
# bootstrap_auc refits the estimator it receives on every resample. Pass an unfitted
# copy with the same hyperparameters so optuna_12 keeps its fit on the training split.
bootstrap_auc(sklearn.clone(optuna_12), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.6920522 , 0.73130599])

In [187]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9974854588508606, pvalue=0.0027222365606576204),
 0.7128425440531125)

In [188]:
t_12 = auc_bootstrap
print(t_12)

[0.7123915637441187, 0.7156255655085054, 0.7067477888617445, 0.7039273151918205, 0.710844191096634, 0.7216842822566052, 0.7062649859753892, 0.7051686063608397, 0.7187959475660514, 0.7372583864911328, 0.7112810464169381, 0.7173432975931958, 0.7137897835233441, 0.7158998371335504, 0.6975222244842562, 0.6996640879478827, 0.7282222674629025, 0.7070270086862107, 0.7083283851339124, 0.7204592494571119, 0.7113835448335143, 0.7111163420647846, 0.714660666621426, 0.7036791983351429, 0.70307056980637, 0.7113679933496198, 0.7188489639884184, 0.7053064490589936, 0.69789546009772, 0.7049402823018458, 0.701867450461455, 0.704369825597177, 0.7321723443720592, 0.7226746290264205, 0.6995877442996744, 0.7284209023253709, 0.7110993768096272, 0.7204175432048499, 0.6866418408432862, 0.7106151601520087, 0.7063271919109664, 0.7126495769996382, 0.7288287753347811, 0.708590639703221, 0.7074405367806733, 0.7091222176981542, 0.6784257091476656, 0.7247959928067318, 0.7117207292797684, 0.7088790490408975, 0.714083

In [189]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [190]:
# 13.
column_to_drop_12 = 'Cat_현재 주택의 유형'

In [191]:
# Elimination step: drop the feature named by column_to_drop_12 from comp_12.
# A name starting with 'Cat_' is treated as a prefix and every column that begins
# with it is removed; any other name is removed as a single column.
if column_to_drop_12.startswith('Cat_'):
    # Literal prefix match. The earlier filter(regex='^' + name) compiled the column
    # name as a regular expression, so a name containing characters such as '(' or
    # '+' could select the wrong columns or fail to compile.
    dropped_cols_13 = [col for col in comp_12.columns if str(col).startswith(column_to_drop_12)]
else:
    dropped_cols_13 = [column_to_drop_12]

comp_13 = comp_12.drop(columns=dropped_cols_13)
# Separate the predictors from the 'target' label column.
X_13 = comp_13.drop(columns='target')
y_13 = comp_13['target']

print(X_13.shape)

(8444, 171)


In [192]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [193]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Reads the notebook-level X_train, y_train, X_val and y_val that are in
    scope when the study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the three tree hyperparameters.

    Returns
    -------
    float
        ROC AUC computed on the validation split from column 1 of predict_proba.
    """
    # Draw order is kept fixed (depth, split, leaf) so a seeded sampler
    # produces the same sequence of suggestions.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in tree.classes_.
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search configuration: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed seed so the TPE sampler proposes a repeatable sequence of hyperparameters.
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks like it has nothing to act on here - confirm it is intended.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience count (trials without improvement) - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials; the callback presumably stops the study sooner.
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [194]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 6}
0.7644425194135924


In [195]:
optuna_13 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [196]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(decimal.Decimal(auc_13).quantize(decimal.Decimal('1.000')))

0.719


In [197]:
# Convert the training split to NumPy arrays so bootstrap_auc can index rows
# positionally (X_train[idx]). np.asarray replaces `.values` so that re-running
# this cell on already-converted arrays is a no-op instead of an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [198]:
auc_bootstrap = []

In [199]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and appends
# each AUC to the notebook-level `auc_bootstrap`, so both must be set before the call.
rs = RandomState(seed = 13)
# bootstrap_auc refits the estimator it receives on every resample. Pass an unfitted
# copy with the same hyperparameters so optuna_13 keeps its fit on the training split.
bootstrap_auc(sklearn.clone(optuna_13), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.68253218, 0.72406497])

In [200]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9972004890441895, pvalue=0.0011343328515067697),
 0.7045312708531261)

In [201]:
t_13 = auc_bootstrap
print(t_13)

[0.708081682048498, 0.7164971554922185, 0.7102928203040173, 0.7123251164947522, 0.7146090639703222, 0.7002416135088672, 0.713480167616721, 0.7070602323108939, 0.698894289495114, 0.7268282889974665, 0.6886578786644951, 0.712760558043793, 0.714474048814694, 0.6968217008233804, 0.7013288036102062, 0.7096120894408252, 0.698272937024973, 0.705609702994933, 0.6973511581614187, 0.7067845469145856, 0.7140654689196526, 0.7216432828899746, 0.6922283579895041, 0.707233419290626, 0.701554300126674, 0.7151300386807817, 0.6771724009229099, 0.7054407573289903, 0.7059058880745567, 0.7110103092200507, 0.6983598839576548, 0.6846653886174447, 0.7103514918114369, 0.7130376572113645, 0.6985754840752805, 0.7212608577633007, 0.7032875836952589, 0.7270905435667752, 0.7066869966974304, 0.7049268514748461, 0.7013796993756785, 0.7163593127940644, 0.6888975128935939, 0.6927076264477019, 0.6953527924809988, 0.6994909009681506, 0.7079940282301845, 0.7067449613192182, 0.6865739798226566, 0.6962851746290264, 0.714582

In [202]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [203]:
#14.
column_to_drop_13 = '부채 중 임대 보증금의 비중'

In [204]:
# Elimination step: drop the feature named by column_to_drop_13 from comp_13.
# A name starting with 'Cat_' is treated as a prefix and every column that begins
# with it is removed; any other name is removed as a single column.
if column_to_drop_13.startswith('Cat_'):
    # Literal prefix match. The earlier filter(regex='^' + name) compiled the column
    # name as a regular expression, so a name containing characters such as '(' or
    # '+' could select the wrong columns or fail to compile.
    dropped_cols_14 = [col for col in comp_13.columns if str(col).startswith(column_to_drop_13)]
else:
    dropped_cols_14 = [column_to_drop_13]

comp_14 = comp_13.drop(columns=dropped_cols_14)
# Separate the predictors from the 'target' label column.
X_14 = comp_14.drop(columns='target')
y_14 = comp_14['target']

print(X_14.shape)

(8444, 170)


In [205]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [206]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Reads the notebook-level X_train, y_train, X_val and y_val that are in
    scope when the study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the three tree hyperparameters.

    Returns
    -------
    float
        ROC AUC computed on the validation split from column 1 of predict_proba.
    """
    # Draw order is kept fixed (depth, split, leaf) so a seeded sampler
    # produces the same sequence of suggestions.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in tree.classes_.
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search configuration: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed seed so the TPE sampler proposes a repeatable sequence of hyperparameters.
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks like it has nothing to act on here - confirm it is intended.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience count (trials without improvement) - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials; the callback presumably stops the study sooner.
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [207]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 3}
0.7678987355199131


In [208]:
optuna_14 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [209]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(decimal.Decimal(auc_14).quantize(decimal.Decimal('1.000')))

0.736


In [210]:
# Convert the training split to NumPy arrays so bootstrap_auc can index rows
# positionally (X_train[idx]). np.asarray replaces `.values` so that re-running
# this cell on already-converted arrays is a no-op instead of an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [211]:
auc_bootstrap = []

In [212]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and appends
# each AUC to the notebook-level `auc_bootstrap`, so both must be set before the call.
rs = RandomState(seed = 14)
# bootstrap_auc refits the estimator it receives on every resample. Pass an unfitted
# copy with the same hyperparameters so optuna_14 keeps its fit on the training split.
bootstrap_auc(sklearn.clone(optuna_14), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.70333217, 0.73737529])

In [213]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9973155856132507, pvalue=0.001610561739653349),
 0.7208916810871335)

In [214]:
t_14 = auc_bootstrap
print(t_14)

[0.7302319433134274, 0.7269470457835686, 0.7125718195801665, 0.7209766897394136, 0.7360474914042707, 0.7267045840119437, 0.7393684401013391, 0.7253876560803474, 0.7244672909880565, 0.7138230071480275, 0.7229093150560985, 0.7256216352243937, 0.7207236246833152, 0.7111050318946797, 0.723057761038726, 0.7293002680510314, 0.7239031962540716, 0.7264988802931596, 0.7180834068494389, 0.7128065056098443, 0.7177023954940281, 0.7184693664042705, 0.7090190123959463, 0.7255445846905537, 0.7132461884726746, 0.7265405865454215, 0.7142485522982266, 0.7144811176710097, 0.7143227752895402, 0.7164010190463265, 0.7348846645403547, 0.7112753913318856, 0.7074822430329352, 0.7140647620340209, 0.7109898095367354, 0.7029899848443721, 0.7261581614187478, 0.7310738441006152, 0.7106568664042706, 0.7186100366449512, 0.7099082745204487, 0.7221310339757508, 0.720455715028954, 0.7294741619163952, 0.7242757249819036, 0.7206218331523707, 0.7344181200235251, 0.7308271410152009, 0.721617128121607, 0.7328092483260948, 0.

In [215]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [216]:
## 15.
column_to_drop_14 = '부채 중 비금융기관 대출금의 비중'

In [217]:
# Elimination step: drop the feature named by column_to_drop_14 from comp_14.
# A name starting with 'Cat_' is treated as a prefix and every column that begins
# with it is removed; any other name is removed as a single column.
if column_to_drop_14.startswith('Cat_'):
    # Literal prefix match. The earlier filter(regex='^' + name) compiled the column
    # name as a regular expression, so a name containing characters such as '(' or
    # '+' could select the wrong columns or fail to compile.
    dropped_cols_15 = [col for col in comp_14.columns if str(col).startswith(column_to_drop_14)]
else:
    dropped_cols_15 = [column_to_drop_14]

comp_15 = comp_14.drop(columns=dropped_cols_15)
# Separate the predictors from the 'target' label column.
X_15 = comp_15.drop(columns='target')
y_15 = comp_15['target']

print(X_15.shape)

(8444, 169)


In [218]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [219]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Reads the notebook-level X_train, y_train, X_val and y_val that are in
    scope when the study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the three tree hyperparameters.

    Returns
    -------
    float
        ROC AUC computed on the validation split from column 1 of predict_proba.
    """
    # Draw order is kept fixed (depth, split, leaf) so a seeded sampler
    # produces the same sequence of suggestions.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in tree.classes_.
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search configuration: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed seed so the TPE sampler proposes a repeatable sequence of hyperparameters.
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks like it has nothing to act on here - confirm it is intended.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience count (trials without improvement) - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials; the callback presumably stops the study sooner.
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [220]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7673573205927718


In [221]:
optuna_15 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [222]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(decimal.Decimal(auc_15).quantize(decimal.Decimal('1.000')))

0.731


In [223]:
# Convert the training split to NumPy arrays so bootstrap_auc can index rows
# positionally (X_train[idx]). np.asarray replaces `.values` so that re-running
# this cell on already-converted arrays is a no-op instead of an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [224]:
auc_bootstrap = []

In [225]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and appends
# each AUC to the notebook-level `auc_bootstrap`, so both must be set before the call.
rs = RandomState(seed = 15)
# bootstrap_auc refits the estimator it receives on every resample. Pass an unfitted
# copy with the same hyperparameters so optuna_15 keeps its fit on the training split.
bootstrap_auc(sklearn.clone(optuna_15), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69219968, 0.7314685 ])

In [226]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9976387619972229, pvalue=0.00440325029194355),
 0.7126874261304516)

In [227]:
t_15 = auc_bootstrap
print(t_15)

[0.7200089633098082, 0.7300036192544336, 0.7049438167300036, 0.7244397224484255, 0.7170294403727833, 0.7215923871245025, 0.7061405741042346, 0.7097280186844009, 0.7134363407075642, 0.7090600117625769, 0.7237681810984439, 0.7129818132464714, 0.7143234821751718, 0.7095993654994571, 0.7031971023344191, 0.7018794675171913, 0.7202910106768006, 0.7268798916485704, 0.6990561663047411, 0.7202047706297503, 0.7131387418566775, 0.6955747545693087, 0.7136349755700324, 0.6869026816413321, 0.7218171767553384, 0.7205490239323198, 0.7104964033659066, 0.7061165399927616, 0.7236140800307637, 0.7076441198425624, 0.7158037006876583, 0.7066523593014838, 0.7120826547231272, 0.7221317408613825, 0.7176599823561345, 0.7144125497647484, 0.7207483656804198, 0.7301237898117987, 0.7351398502533478, 0.7119808631921825, 0.7118833129750272, 0.7209795172819399, 0.7057150289540356, 0.7047960776330077, 0.7151512452497286, 0.7002232344824466, 0.7068517010495837, 0.7276539314151285, 0.7092148197158885, 0.7209413454578358,

In [228]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [229]:
# 16.
column_to_drop_15 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [230]:
# Elimination step: drop the feature named by column_to_drop_15 from comp_15.
# A name starting with 'Cat_' is treated as a prefix and every column that begins
# with it is removed; any other name is removed as a single column.
if column_to_drop_15.startswith('Cat_'):
    # Literal prefix match. The earlier filter(regex='^' + name) compiled the column
    # name as a regular expression, so a name containing characters such as '(' or
    # '+' could select the wrong columns or fail to compile.
    dropped_cols_16 = [col for col in comp_15.columns if str(col).startswith(column_to_drop_15)]
else:
    dropped_cols_16 = [column_to_drop_15]

comp_16 = comp_15.drop(columns=dropped_cols_16)
# Separate the predictors from the 'target' label column.
X_16 = comp_16.drop(columns='target')
y_16 = comp_16['target']

print(X_16.shape)

(8444, 165)


In [231]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [232]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Reads the notebook-level X_train, y_train, X_val and y_val that are in
    scope when the study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the three tree hyperparameters.

    Returns
    -------
    float
        ROC AUC computed on the validation split from column 1 of predict_proba.
    """
    # Draw order is kept fixed (depth, split, leaf) so a seeded sampler
    # produces the same sequence of suggestions.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in tree.classes_.
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search configuration: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed seed so the TPE sampler proposes a repeatable sequence of hyperparameters.
sampler = TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks like it has nothing to act on here - confirm it is intended.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience count (trials without improvement) - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials; the callback presumably stops the study sooner.
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [233]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7563776468560368


In [234]:
optuna_16 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [235]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(decimal.Decimal(auc_16).quantize(decimal.Decimal('1.000')))

0.728


In [236]:
# Convert the training split to NumPy arrays so bootstrap_auc can index rows
# positionally (X_train[idx]). np.asarray replaces `.values` so that re-running
# this cell on already-converted arrays is a no-op instead of an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [237]:
auc_bootstrap = []

In [238]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and appends
# each AUC to the notebook-level `auc_bootstrap`, so both must be set before the call.
rs = RandomState(seed = 16)
# bootstrap_auc refits the estimator it receives on every resample. Pass an unfitted
# copy with the same hyperparameters so optuna_16 keeps its fit on the training split.
bootstrap_auc(sklearn.clone(optuna_16), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69318431, 0.73123084])

In [239]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9977999329566956, pvalue=0.007350616622716188),
 0.7131927853838672)

In [240]:
t_16 = auc_bootstrap
print(t_16)

[0.7316789382012306, 0.7285955030763663, 0.7215973353239231, 0.7259468026149114, 0.7129966578447339, 0.7008629659790083, 0.7215506808722403, 0.726098076140065, 0.70939224800941, 0.7143680159699601, 0.7245641343195802, 0.7090133573108939, 0.7120861891512849, 0.7106031430962721, 0.6969934740318494, 0.7272432308631922, 0.7124042876854868, 0.7337946468964894, 0.712301082383279, 0.7032140675895765, 0.7143418612015925, 0.7082492139431776, 0.7070524565689469, 0.7140477967788634, 0.6850287278320666, 0.7200506695620702, 0.6926941956207022, 0.7164222256152731, 0.717694619752081, 0.708165094553022, 0.7023255123507057, 0.7241810023072748, 0.7137735251538183, 0.6982305238870794, 0.7143043962631199, 0.7190065994842563, 0.7211732039449872, 0.7085524678791169, 0.7062480207202316, 0.7179448572656535, 0.7177356191187115, 0.7230294856134637, 0.7237137509048137, 0.697207660378212, 0.7195940214440825, 0.720910242490047, 0.7186036746742671, 0.71884967087405, 0.7113277008686211, 0.7264147609030037, 0.7040128

In [241]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [242]:
# 17.
column_to_drop_16 = 'Cat_이사 예상 기간'

In [243]:
# Elimination step: drop the feature named by column_to_drop_16 from comp_16.
# A name starting with 'Cat_' is treated as a prefix and every column that begins
# with it is removed; any other name is removed as a single column.
if column_to_drop_16.startswith('Cat_'):
    # Literal prefix match. The earlier filter(regex='^' + name) compiled the column
    # name as a regular expression, so a name containing characters such as '(' or
    # '+' could select the wrong columns or fail to compile.
    dropped_cols_17 = [col for col in comp_16.columns if str(col).startswith(column_to_drop_16)]
else:
    dropped_cols_17 = [column_to_drop_16]

comp_17 = comp_16.drop(columns=dropped_cols_17)
# Separate the predictors from the 'target' label column.
X_17 = comp_17.drop(columns='target')
y_17 = comp_17['target']

print(X_17.shape)

(8444, 161)


In [244]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [245]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree.

    Samples ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and scores it on ``X_val``/``y_val``.

    Returns the ROC AUC computed from column 1 of ``predict_proba``.
    """
    # Keep the suggestion order fixed: it determines the sampler's draw order.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Run the Optuna search for the current feature set: maximise the validation
# AUC returned by objective(), using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# gets no intermediate values to prune on - confirm it is still wanted here.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [246]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7550815658161665


In [247]:
optuna_17 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [248]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(decimal.Decimal(auc_17).quantize(decimal.Decimal('1.000')))

0.727


In [249]:
X_train = X_train.values
y_train = y_train.values

In [250]:
auc_bootstrap = []

In [251]:
rs = RandomState(seed = 17)
bootstrap_auc(optuna_17, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69376258, 0.73159049])

In [252]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.997239887714386, pvalue=0.0012784515274688601),
 0.7135336276098216)

In [253]:
t_17 = auc_bootstrap
print(t_17)

[0.7225523378121606, 0.7103175613011219, 0.7107812782754251, 0.718602260903004, 0.7159606292978646, 0.7107332100524792, 0.7043563947701772, 0.7150642983170468, 0.7254936889250816, 0.7326176823199421, 0.7118451411509229, 0.7119794494209192, 0.7131033975750996, 0.7244326535921102, 0.7199651364006515, 0.7223296688382194, 0.7190546677072023, 0.7181364232718059, 0.7242326049583785, 0.7187521206568945, 0.7120614481541802, 0.7066622557003257, 0.7155909281125589, 0.7223876334600072, 0.7046829759319579, 0.7016370057455664, 0.7227601621878393, 0.713592562432139, 0.7095491766196165, 0.689093320213536, 0.7088267395041621, 0.7252187104144046, 0.714483945213536, 0.7060373688020267, 0.7130334158975751, 0.7201701332338039, 0.7070178191730002, 0.7181774226384364, 0.7056188925081435, 0.7292804752533478, 0.7132341714169381, 0.7071302139884185, 0.7239350061074918, 0.7217747636174447, 0.70264007645675, 0.725099246742671, 0.7206331433224756, 0.7215167503619254, 0.70304794946616, 0.7234189795964532, 0.713787

In [254]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [255]:
## 18.
column_to_drop_17 ='Cat_현재 청소/쓰레기 처리상태'

In [256]:
# Build comp_18 by removing the feature named in column_to_drop_17.
# A 'Cat_' name is treated as a prefix and every column starting with it is
# dropped (presumably the dummy columns of one categorical feature - TODO
# confirm); any other name is dropped as a single column.
# The prefix is matched literally rather than as a regex, so names containing
# regex metacharacters cannot change which columns are selected.
if column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(
        columns=[col for col in comp_17.columns if str(col).startswith(column_to_drop_17)]
    )
else:
    comp_18 = comp_17.drop(columns=column_to_drop_17)

# Feature matrix / label split is the same for both cases.
X_18 = comp_18.drop('target', axis=1)
y_18 = comp_18['target']

print(X_18.shape)

(8444, 157)


In [257]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [258]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree.

    Samples ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and scores it on ``X_val``/``y_val``.

    Returns the ROC AUC computed from column 1 of ``predict_proba``.
    """
    # Keep the suggestion order fixed: it determines the sampler's draw order.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Run the Optuna search for the current feature set: maximise the validation
# AUC returned by objective(), using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# gets no intermediate values to prune on - confirm it is still wanted here.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [259]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7558251826446684


In [260]:
optuna_18 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [261]:
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(decimal.Decimal(auc_18).quantize(decimal.Decimal('1.000')))

0.727


In [262]:
X_train = X_train.values
y_train = y_train.values

In [263]:
auc_bootstrap = []

In [264]:
rs = RandomState(seed = 18)
bootstrap_auc(optuna_18, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69374943, 0.73187936])

In [265]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9983429908752441, pvalue=0.04284561052918434),
 0.7134059612372194)

In [266]:
t_18 = auc_bootstrap
print(t_18)

[0.7168025300850525, 0.7159966804650741, 0.6996732774610931, 0.7161458333333333, 0.7208897428067318, 0.710094185441549, 0.7152134511853059, 0.7125103205302208, 0.7109502239413681, 0.7230711918657255, 0.7157520980365544, 0.7139177298226564, 0.7087086896036916, 0.7201998224303292, 0.720773813563156, 0.7133960482265652, 0.7088578424719507, 0.7031546891965255, 0.7071584894136808, 0.7106681765743756, 0.7102638379931233, 0.7081643876673905, 0.7280137362015926, 0.7132709294697792, 0.7048625248823741, 0.7243534824013753, 0.733012831387984, 0.7177893424267101, 0.7082082145765473, 0.7186178123868983, 0.7113891999185669, 0.7305238870792616, 0.7117652630745566, 0.7249564558450958, 0.7025262678700688, 0.7243252069761128, 0.7170930600796237, 0.7114945258776691, 0.7199389816322838, 0.7089603408885268, 0.7179314264386536, 0.7122395833333333, 0.7149822995837857, 0.6888657030401738, 0.7305832654723128, 0.7236741653094463, 0.700604952723489, 0.7151477108215706, 0.7130984493756786, 0.7097661905085052, 0.7

In [267]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [268]:
# 19
column_to_drop_18 = '자산 중 부동산 자산의 비중'

In [269]:
# Build comp_19 by removing the feature named in column_to_drop_18.
# A 'Cat_' name is treated as a prefix and every column starting with it is
# dropped (presumably the dummy columns of one categorical feature - TODO
# confirm); any other name is dropped as a single column.
# The prefix is matched literally rather than as a regex, so names containing
# regex metacharacters cannot change which columns are selected.
if column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(
        columns=[col for col in comp_18.columns if str(col).startswith(column_to_drop_18)]
    )
else:
    comp_19 = comp_18.drop(columns=column_to_drop_18)

# Feature matrix / label split is the same for both cases.
X_19 = comp_19.drop('target', axis=1)
y_19 = comp_19['target']

print(X_19.shape)

(8444, 156)


In [270]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [271]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree.

    Samples ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and scores it on ``X_val``/``y_val``.

    Returns the ROC AUC computed from column 1 of ``predict_proba``.
    """
    # Keep the suggestion order fixed: it determines the sampler's draw order.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Run the Optuna search for the current feature set: maximise the validation
# AUC returned by objective(), using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# gets no intermediate values to prune on - confirm it is still wanted here.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [272]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7555599998232115


In [273]:
optuna_19 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [274]:
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(decimal.Decimal(auc_19).quantize(decimal.Decimal('1.000')))

0.727


In [275]:
X_train = X_train.values
y_train = y_train.values

In [276]:
auc_bootstrap = []

In [277]:
rs = RandomState(seed = 19)
bootstrap_auc(optuna_19, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69277575, 0.73142941])

In [278]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.995026707649231, pvalue=3.1757278975419467e-06),
 0.7136546128528773)

In [279]:
t_19 = auc_bootstrap
print(t_19)

[0.7114690779949331, 0.7157888560893955, 0.7052908975750993, 0.7153343286283025, 0.7064600864096995, 0.71804311436844, 0.7155881005700326, 0.6968520969055375, 0.7147143899294246, 0.7249790761853059, 0.7105805227560622, 0.7133394973760404, 0.7183279892779587, 0.7001624423181325, 0.6906138312070214, 0.7141206059989142, 0.7110944286102062, 0.7091186832699964, 0.70892994480637, 0.7173277461093014, 0.7127626787006875, 0.7040446582066595, 0.715862372195078, 0.7146175465979009, 0.6891852153456388, 0.7235865114911328, 0.716732548407528, 0.7254823787549763, 0.7088451185305827, 0.7048985760495838, 0.7192950088219328, 0.7294020595819762, 0.7063441571661238, 0.7093286283025697, 0.7150833842290988, 0.7101380123507058, 0.7209689139974664, 0.7018165546959827, 0.7020717404089757, 0.7131535864549403, 0.6960278682591385, 0.6987317057998552, 0.7281240103601159, 0.7154940847810352, 0.7175306222855591, 0.7080385620249727, 0.7026549210550126, 0.7096587438925082, 0.7094982808541441, 0.7200485489051756, 0.717

In [280]:
# 20.
column_to_drop_19 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [281]:
# Build comp_20 by removing the feature named in column_to_drop_19.
# A 'Cat_' name is treated as a prefix and every column starting with it is
# dropped (presumably the dummy columns of one categorical feature - TODO
# confirm); any other name is dropped as a single column.
# The prefix is matched literally rather than as a regex, so names containing
# regex metacharacters cannot change which columns are selected.
if column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(
        columns=[col for col in comp_19.columns if str(col).startswith(column_to_drop_19)]
    )
else:
    comp_20 = comp_19.drop(columns=column_to_drop_19)

# Feature matrix / label split is the same for both cases.
X_20 = comp_20.drop('target', axis=1)
y_20 = comp_20['target']

print(X_20.shape)

(8444, 152)


In [282]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [283]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree.

    Samples ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and scores it on ``X_val``/``y_val``.

    Returns the ROC AUC computed from column 1 of ``predict_proba``.
    """
    # Keep the suggestion order fixed: it determines the sampler's draw order.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Run the Optuna search for the current feature set: maximise the validation
# AUC returned by objective(), using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# gets no intermediate values to prune on - confirm it is still wanted here.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [284]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7545324163900663


In [285]:
optuna_20 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [286]:
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(decimal.Decimal(auc_20).quantize(decimal.Decimal('1.000')))

0.724


In [287]:
X_train = X_train.values
y_train = y_train.values

In [288]:
auc_bootstrap = []

In [289]:
rs = RandomState(seed = 20)
bootstrap_auc(optuna_20, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.69420826, 0.73254478])

In [290]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988347887992859, pvalue=0.20528049767017365),
 0.7136734513549584)

In [291]:
t_20 = auc_bootstrap
print(t_20)

[0.719092132645675, 0.7081622670104959, 0.711688919426348, 0.7262323844100615, 0.7179688913771264, 0.7055269973760405, 0.710187494344915, 0.7078533579895042, 0.7085043996561708, 0.7116867987694534, 0.7240064015562795, 0.7106794867444807, 0.7132355851882013, 0.7087355512576908, 0.7149285762757871, 0.7045324092924357, 0.7361365589938473, 0.7110640325280492, 0.7125753540083244, 0.7190150821118351, 0.708112078130655, 0.7197071231451321, 0.7050229879207383, 0.713303446208831, 0.7228018684401013, 0.7325427524429966, 0.7055439626311979, 0.7096205720684039, 0.7206281951230545, 0.7009329476565328, 0.7316994378845456, 0.705963145810713, 0.7138237140336591, 0.703662939965617, 0.7008601384364821, 0.7137480772710821, 0.6961600558722404, 0.7089942713988417, 0.7173171428248281, 0.7120317589576545, 0.705348155311256, 0.7148812149384727, 0.720389974665219, 0.7277755157437565, 0.716022835233442, 0.727365522077452, 0.7134837020448788, 0.72927694082519, 0.7157697701773436, 0.7119935871335504, 0.7039640732

In [292]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [293]:
# 21
column_to_drop_20 = 'Cat_현재 문화시설 접근용이성'

In [294]:
# Build comp_21 by removing the feature named in column_to_drop_20.
# A 'Cat_' name is treated as a prefix and every column starting with it is
# dropped (presumably the dummy columns of one categorical feature - TODO
# confirm); any other name is dropped as a single column.
# The prefix is matched literally rather than as a regex, so names containing
# regex metacharacters cannot change which columns are selected.
if column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(
        columns=[col for col in comp_20.columns if str(col).startswith(column_to_drop_20)]
    )
else:
    comp_21 = comp_20.drop(columns=column_to_drop_20)

# Feature matrix / label split is the same for both cases.
X_21 = comp_21.drop('target', axis=1)
y_21 = comp_21['target']

print(X_21.shape)

(8444, 148)


In [295]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [296]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree.

    Samples ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and scores it on ``X_val``/``y_val``.

    Returns the ROC AUC computed from column 1 of ``predict_proba``.
    """
    # Keep the suggestion order fixed: it determines the sampler's draw order.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Run the Optuna search for the current feature set: maximise the validation
# AUC returned by objective(), using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# gets no intermediate values to prune on - confirm it is still wanted here.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [297]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7574041253607591


In [298]:
optuna_21 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [299]:
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(decimal.Decimal(auc_21).quantize(decimal.Decimal('1.000')))

0.743


In [300]:
X_train = X_train.values
y_train = y_train.values

In [301]:
auc_bootstrap = []

In [302]:
rs = RandomState(seed = 21)
bootstrap_auc(optuna_21, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72001416, 0.74525261])

In [303]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9862277507781982, pvalue=6.001313270116715e-13),
 0.7342489955155175)

In [304]:
t_21 = auc_bootstrap
print(t_21)

[0.7339310758233804, 0.7398752770991677, 0.7420475366449512, 0.7396639182953312, 0.7290818403908796, 0.7441074013753166, 0.7411971532301845, 0.7426872681415129, 0.7297569161690192, 0.7360665773163229, 0.7413738746380745, 0.728965204261672, 0.7355639816322838, 0.7282173192634817, 0.7385555216250452, 0.7369650289540355, 0.724243208242852, 0.7270551992851971, 0.7326042514929424, 0.7370350106315599, 0.7143567057998552, 0.7333139646670285, 0.7377659303745928, 0.7246991494752081, 0.7418093161871155, 0.7413993225208108, 0.7385979347629388, 0.7327293702497285, 0.7288316028773072, 0.7402704261672095, 0.7300806697882736, 0.7281763198968512, 0.7437673893865364, 0.7347920625226203, 0.7434457564241765, 0.7458746154542164, 0.7366914642146218, 0.7381858204397393, 0.7401954962902642, 0.7410741551302931, 0.7348995091386175, 0.736798910830619, 0.7452822452949692, 0.731059706387984, 0.737898824873326, 0.7298261909609121, 0.7371749739866088, 0.7416453187205936, 0.7361931098443721, 0.7361740239323199, 0.73

In [305]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [306]:
# 22
column_to_drop_21 = 'Cat_현재 공공기관 접근용이성'

In [307]:
# Build comp_22 by removing the feature named in column_to_drop_21.
# A 'Cat_' name is treated as a prefix and every column starting with it is
# dropped (presumably the dummy columns of one categorical feature - TODO
# confirm); any other name is dropped as a single column.
# The prefix is matched literally rather than as a regex, so names containing
# regex metacharacters cannot change which columns are selected.
if column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(
        columns=[col for col in comp_21.columns if str(col).startswith(column_to_drop_21)]
    )
else:
    comp_22 = comp_21.drop(columns=column_to_drop_21)

# Feature matrix / label split is the same for both cases.
X_22 = comp_22.drop('target', axis=1)
y_22 = comp_22['target']

print(X_22.shape)

(8444, 144)


In [308]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [309]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree.

    Samples ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and scores it on ``X_val``/``y_val``.

    Returns the ROC AUC computed from column 1 of ``predict_proba``.
    """
    # Keep the suggestion order fixed: it determines the sampler's draw order.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Run the Optuna search for the current feature set: maximise the validation
# AUC returned by objective(), using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# gets no intermediate values to prune on - confirm it is still wanted here.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [310]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7574041253607591


In [311]:
optuna_22 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [312]:
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(decimal.Decimal(auc_22).quantize(decimal.Decimal('1.000')))

0.743


In [313]:
X_train = X_train.values
y_train = y_train.values

In [314]:
auc_bootstrap = []

In [315]:
rs = RandomState(seed = 22)
bootstrap_auc(optuna_22, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72167704, 0.74543246])

In [316]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9938256144523621, pvalue=2.0460377925246576e-07),
 0.7343516059734664)

In [317]:
t_22 = auc_bootstrap
print(t_22)

[0.7427777495023524, 0.7337161825913862, 0.741562613101701, 0.728562279451683, 0.7393034066232355, 0.7301654960640607, 0.7267943584871516, 0.7358672355682229, 0.7332567069308722, 0.7400908772167935, 0.7355512576909158, 0.72833890359211, 0.7351956942182412, 0.7306426438653638, 0.7452688144679697, 0.7299414133188563, 0.7315778535559174, 0.7396907799493304, 0.7252696061798769, 0.7337543544154904, 0.7369021161328266, 0.738777483713355, 0.7435652200959102, 0.7338950246561708, 0.7388665513029316, 0.7358347188291711, 0.7318302117263844, 0.7340187296416938, 0.7309190361473037, 0.730475111970684, 0.7347927694082519, 0.7418284020991676, 0.7357103069580166, 0.7273358328809265, 0.7295059717698154, 0.7451026963445531, 0.7450885586319218, 0.7249614040445169, 0.734630185712993, 0.7209081218331523, 0.7359803372692726, 0.7392362524882375, 0.7447443053293521, 0.7275464847991313, 0.7249458525606226, 0.7359739752985885, 0.7272750407166124, 0.7255990148841839, 0.7291836319218241, 0.7388651375316685, 0.7396

In [318]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [319]:
# 23.
column_to_drop_22 = 'Cat_이사 계획 첫 번째 이유'

In [320]:
# Build comp_23 by removing the feature named in column_to_drop_22.
# A 'Cat_' name is treated as a prefix and every column starting with it is
# dropped (presumably the dummy columns of one categorical feature - TODO
# confirm); any other name is dropped as a single column.
# The prefix is matched literally rather than as a regex, so names containing
# regex metacharacters cannot change which columns are selected.
if column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(
        columns=[col for col in comp_22.columns if str(col).startswith(column_to_drop_22)]
    )
else:
    comp_23 = comp_22.drop(columns=column_to_drop_22)

# Feature matrix / label split is the same for both cases.
X_23 = comp_23.drop('target', axis=1)
y_23 = comp_23['target']

print(X_23.shape)

(8444, 131)


In [321]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [322]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree.

    Samples ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and scores it on ``X_val``/``y_val``.

    Returns the ROC AUC computed from column 1 of ``predict_proba``.
    """
    # Keep the suggestion order fixed: it determines the sampler's draw order.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Run the Optuna search for the current feature set: maximise the validation
# AUC returned by objective(), using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# gets no intermediate values to prune on - confirm it is still wanted here.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [323]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7553776866334598


In [324]:
optuna_23 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [325]:
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(decimal.Decimal(auc_23).quantize(decimal.Decimal('1.000')))

0.723


In [326]:
X_train = X_train.values
y_train = y_train.values

In [327]:
auc_bootstrap = []

In [328]:
rs = RandomState(seed = 23)
bootstrap_auc(optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.6947934 , 0.73363729])

In [329]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.99867844581604, pvalue=0.12648697197437286),
 0.7140910885755971)

In [330]:
t_23 = auc_bootstrap
print(t_23)

[0.7047091307003257, 0.7058337857401376, 0.7139756944444444, 0.7095315044788274, 0.7162122805827, 0.6960264544878756, 0.7136745611654, 0.7090289087947883, 0.7062649859753891, 0.7087701886536373, 0.7073585380474123, 0.7093604381559898, 0.7150084543521534, 0.7128390223488962, 0.7230888640065146, 0.7114294923995655, 0.7066862898117987, 0.7149215074194716, 0.7185195552841117, 0.6924460787640246, 0.7086592076094824, 0.7274298486699239, 0.7110251538183132, 0.7226449398298952, 0.7204203707473761, 0.7132539642146216, 0.7256407211364458, 0.711078877126312, 0.7115673350977199, 0.701485025334781, 0.7215068539630837, 0.7230563472674629, 0.70619288364097, 0.7143142926619616, 0.7185167277415853, 0.7078561855320304, 0.7252081071299312, 0.7161373507057548, 0.7329789008776693, 0.7167721340028954, 0.7142471385269635, 0.6901911135993486, 0.7173553146489323, 0.7080314931686573, 0.7241753472222222, 0.7253410016286644, 0.7153484663409337, 0.684287911690192, 0.7130284676981542, 0.706282658116178, 0.717156679

In [331]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [332]:
# 24
column_to_drop_23 = 'Cat_현재 주변도로의 보행 안전'

In [333]:
# Build comp_24 by removing the feature named in column_to_drop_23.
# A 'Cat_' name is treated as a prefix and every column starting with it is
# dropped (presumably the dummy columns of one categorical feature - TODO
# confirm); any other name is dropped as a single column.
# The prefix is matched literally rather than as a regex, so names containing
# regex metacharacters cannot change which columns are selected.
if column_to_drop_23.startswith('Cat_'):
    comp_24 = comp_23.drop(
        columns=[col for col in comp_23.columns if str(col).startswith(column_to_drop_23)]
    )
else:
    comp_24 = comp_23.drop(columns=column_to_drop_23)

# Feature matrix / label split is the same for both cases.
X_24 = comp_24.drop('target', axis=1)
y_24 = comp_24['target']

print(X_24.shape)

(8444, 127)


In [334]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [335]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree.

    Samples ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and scores it on ``X_val``/``y_val``.

    Returns the ROC AUC computed from column 1 of ``predict_proba``.
    """
    # Keep the suggestion order fixed: it determines the sampler's draw order.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Run the Optuna search for the current feature set: maximise the validation
# AUC returned by objective(), using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# gets no intermediate values to prune on - confirm it is still wanted here.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [336]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7574041253607591


In [337]:
optuna_24 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [338]:
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(decimal.Decimal(auc_24).quantize(decimal.Decimal('1.000')))

0.743


In [339]:
X_train = X_train.values
y_train = y_train.values

In [340]:
auc_bootstrap = []

In [341]:
rs = RandomState(seed = 24)
bootstrap_auc(optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72251381, 0.74543212])

In [342]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9933760166168213, pvalue=7.881760666350601e-08),
 0.7346991035276421)

In [343]:
t_24 = auc_bootstrap
print(t_24)

[0.7409907426257691, 0.7414756661690193, 0.7346775470503076, 0.7419330211726385, 0.7350218003528773, 0.7340293329261671, 0.7392666485703945, 0.7272835233441912, 0.7355554990047051, 0.7313233747285559, 0.7350953164585596, 0.7276051563065508, 0.7386198482175172, 0.725200331387984, 0.7292076660332972, 0.7358601667119073, 0.7472841454035468, 0.7376090017643866, 0.729773174538545, 0.7366575337043068, 0.7336037877759682, 0.7363556935396308, 0.7411858430600796, 0.7445145674990952, 0.7349878698425624, 0.7405355082790446, 0.733348602062975, 0.7403262701321028, 0.7391387022710821, 0.7311508946344553, 0.7332708446435034, 0.7367557908070937, 0.7407687805374592, 0.7397586409699601, 0.7351999355320304, 0.73974945145675, 0.7378839802750633, 0.7360877838852697, 0.7398392259319579, 0.7246517881378935, 0.7323872376040534, 0.7317779021896489, 0.7337317340752805, 0.7452341770720231, 0.7412324975117627, 0.7342442261581614, 0.741620577723489, 0.7368858577633006, 0.7398201400199059, 0.7380960459645313, 0.737

In [344]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [345]:
column_to_drop_24 = 'Cat_현재 교육환경'

In [346]:
# Build comp_25 by removing the feature named in column_to_drop_24.
# A 'Cat_' name is treated as a prefix and every column starting with it is
# dropped (presumably the dummy columns of one categorical feature - TODO
# confirm); any other name is dropped as a single column.
# The prefix is matched literally rather than as a regex, so names containing
# regex metacharacters cannot change which columns are selected.
if column_to_drop_24.startswith('Cat_'):
    comp_25 = comp_24.drop(
        columns=[col for col in comp_24.columns if str(col).startswith(column_to_drop_24)]
    )
else:
    comp_25 = comp_24.drop(columns=column_to_drop_24)

# Feature matrix / label split is the same for both cases.
X_25 = comp_25.drop('target', axis=1)
y_25 = comp_25['target']

print(X_25.shape)

(8444, 123)


In [347]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [348]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree.

    Samples ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` on the notebook-level ``X_train``/``y_train``
    and scores it on ``X_val``/``y_val``.

    Returns the ROC AUC computed from column 1 of ``predict_proba``.
    """
    # Keep the suggestion order fixed: it determines the sampler's draw order.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Run the Optuna search for the current feature set: maximise the validation
# AUC returned by objective(), using a TPE sampler with a fixed seed.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# gets no intermediate values to prune on - confirm it is still wanted here.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; presumably it stops the
# study after 10 trials without improvement - verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [349]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7574041253607591


In [350]:
optuna_25 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [351]:
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(decimal.Decimal(auc_25).quantize(decimal.Decimal('1.000')))

0.743


In [352]:
X_train = X_train.values
y_train = y_train.values

In [353]:
auc_bootstrap = []

In [354]:
rs = RandomState(seed = 25)
bootstrap_auc(optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72163816, 0.74601681])

In [355]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9954977631568909, pvalue=1.0166957508772612e-05),
 0.7345796165286826)

In [356]:
t_25 = auc_bootstrap
print(t_25)

[0.7298184152189647, 0.7395571785649656, 0.734315621606949, 0.7278030842833876, 0.734917888165038, 0.7390453933677162, 0.7382056132374231, 0.7361542311346363, 0.7312830822475569, 0.7354056392508144, 0.7387329499185668, 0.7258358215707565, 0.739177580980818, 0.7375361925443359, 0.7305465074194715, 0.7369261502442997, 0.731784264160333, 0.7229411249095186, 0.7421330698063698, 0.7422914121878393, 0.7342357435305827, 0.7302665807093738, 0.7343113802931596, 0.7387909145403546, 0.7414509251719146, 0.7413887192363373, 0.7276694828990228, 0.7339784371606949, 0.739914862694535, 0.7252208310712993, 0.7264720186391603, 0.7377411893774883, 0.7307811934491495, 0.7452836590662322, 0.7381589587857402, 0.7383462834781036, 0.7333910152008686, 0.728126837902642, 0.7382883188563156, 0.7259807331252262, 0.7300382566503799, 0.7284845220322114, 0.7411420161509229, 0.733631356315599, 0.7426045625226203, 0.7280936142779587, 0.7236593207111834, 0.7364779847538907, 0.740777263165038, 0.7298113463626493, 0.73588

In [357]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [358]:
# 26
column_to_drop_25 = '총 가구원 수'

In [359]:
# Build comp_26 by removing the feature named in column_to_drop_25.
# A 'Cat_' name is treated as a prefix and every column starting with it is
# dropped (presumably the dummy columns of one categorical feature - TODO
# confirm); any other name is dropped as a single column.
# The prefix is matched literally rather than as a regex, so names containing
# regex metacharacters cannot change which columns are selected.
if column_to_drop_25.startswith('Cat_'):
    comp_26 = comp_25.drop(
        columns=[col for col in comp_25.columns if str(col).startswith(column_to_drop_25)]
    )
else:
    comp_26 = comp_25.drop(columns=column_to_drop_25)

# Feature matrix / label split is the same for both cases.
X_26 = comp_26.drop('target', axis=1)
y_26 = comp_26['target']

print(X_26.shape)

(8444, 122)


In [360]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [361]:
def objective(trial):
    """Score one Optuna trial with the validation ROC AUC of a decision tree.

    Draws max_depth from [1, 10] and min_samples_split / min_samples_leaf from
    [3, 10], fits the tree on the notebook-level X_train / y_train, and returns
    the ROC AUC computed on X_val / y_val from column 1 of predict_proba.
    """
    # Suggestions are requested in the same order as before.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed sampler seed so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so the Hyperband pruner configured here has nothing to act on.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement - confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials=200 is the upper bound on the number of trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [362]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7574041253607591


In [363]:
optuna_26 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [364]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(decimal.Decimal(auc_26).quantize(decimal.Decimal('1.000')))

0.743


In [365]:
# bootstrap_auc indexes its inputs positionally (X_train[idx]), so plain NumPy
# arrays are needed. np.asarray leaves an existing array untouched, so this cell
# can be re-run; `.values` raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [366]:
auc_bootstrap = []

In [367]:
rs = RandomState(seed = 26)
bootstrap_auc(optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72204205, 0.7455743 ])

In [368]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9931541681289673, pvalue=4.986891610769817e-08),
 0.7347808301099349)

In [369]:
t_26 = auc_bootstrap
print(t_26)

[0.7369742184672459, 0.7317962812160694, 0.73384483577633, 0.7357491856677525, 0.7454893627850163, 0.7371968874411872, 0.7479514454397395, 0.7352974857491857, 0.7265632068856315, 0.737632328990228, 0.736157765562794, 0.7324275300850525, 0.7251706421914585, 0.7352388142417662, 0.7403121324194716, 0.7368264793702498, 0.7305387316775244, 0.7297321751719146, 0.7373488678519724, 0.751026397937025, 0.738411316956207, 0.7413342890427072, 0.7430442453854506, 0.7297781227379659, 0.7314117354325009, 0.7414657697701772, 0.7276192940191821, 0.7358439083423814, 0.731022948335143, 0.7373276612830256, 0.730603765155628, 0.721385976520087, 0.7406860749185668, 0.7344647744752082, 0.735396449737604, 0.7128135744661599, 0.7468338592562432, 0.7456201366268549, 0.738040201999638, 0.7440600400380021, 0.7302962699058994, 0.7334419109663409, 0.7343778275425263, 0.7457127386445892, 0.7323328074104235, 0.7375199341748101, 0.7312371346815056, 0.7425798215255158, 0.7316096634093378, 0.728520573199421, 0.731381339

In [370]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [371]:
# 27
column_to_drop_26 = 'Cat_소득 계층'

In [372]:
# Elimination step 27: remove `column_to_drop_26` from comp_26 to build comp_27.
# A name starting with 'Cat_' stands for every column sharing that prefix
# (presumably the dummy columns of one categorical variable - confirm).
if column_to_drop_26.startswith('Cat_'):
    # Literal prefix match. The previous regex ('^' + name) was built from the raw,
    # unescaped name and would misfire on names containing characters like '(' or '/'.
    dropped_cols_27 = [c for c in comp_26.columns if str(c).startswith(column_to_drop_26)]
else:
    dropped_cols_27 = [column_to_drop_26]
comp_27 = comp_26.drop(columns=dropped_cols_27)

# The feature/label split is the same whichever branch selected the columns.
X_27 = comp_27.drop(columns='target')
y_27 = comp_27['target']

print(X_27.shape)

(8444, 120)


In [373]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [374]:
def objective(trial):
    """Optuna objective: fit a DecisionTreeClassifier and return its validation ROC AUC.

    The three integer hyperparameters are sampled from the ranges listed in
    `search_space`; training uses the notebook-level X_train / y_train and the
    score is computed on X_val / y_val from column 1 of predict_proba.
    """
    # (name, low, high) triples, iterated in the original suggestion order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = DecisionTreeClassifier(random_state=0, **sampled).fit(X_train, y_train)
    second_class_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, second_class_proba)

# Hyperparameter search: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed sampler seed so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so the Hyperband pruner configured here has nothing to act on.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement - confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials=200 is the upper bound on the number of trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [375]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7574041253607591


In [376]:
optuna_27 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [377]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(decimal.Decimal(auc_27).quantize(decimal.Decimal('1.000')))

0.743


In [378]:
# bootstrap_auc indexes its inputs positionally (X_train[idx]), so plain NumPy
# arrays are needed. np.asarray leaves an existing array untouched, so this cell
# can be re-run; `.values` raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [379]:
auc_bootstrap = []

In [380]:
rs = RandomState(seed = 27)
bootstrap_auc(optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72143583, 0.74566455])

In [381]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9929548501968384, pvalue=3.328183595385781e-08),
 0.7345382651330076)

In [382]:
t_27 = auc_bootstrap
print(t_27)

[0.7240495215798046, 0.7341919166214259, 0.7353851395674992, 0.7418347640698516, 0.7303754410966341, 0.7354466386174447, 0.7240134704125949, 0.7344400334781035, 0.743751837902642, 0.7410918272710821, 0.7318125395855954, 0.7365317080618893, 0.7347821661237786, 0.7310257758776691, 0.7339529892779587, 0.7275252782301846, 0.736887271534564, 0.727524571344553, 0.7351723669923995, 0.7386050036192545, 0.7296219010133912, 0.7391068924176619, 0.7243273276330077, 0.7343021907799494, 0.7318090051574375, 0.7284194885541078, 0.733104726520087, 0.7295681777053926, 0.7268473749095187, 0.738960567091929, 0.7214934231360839, 0.7349991800126674, 0.7394207496380746, 0.7478743949058994, 0.7289348081795151, 0.7362645052931596, 0.7312010835142959, 0.7405899384726746, 0.7338823007148028, 0.7449139578809265, 0.7261984538997468, 0.7275344677433948, 0.7284725049764749, 0.7324444953402098, 0.7432372251628665, 0.7381264420466884, 0.7347143051031487, 0.7377715854596453, 0.7414883901103873, 0.7344315508505248, 0.72

In [383]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [384]:
# 28
column_to_drop_27  = '소득 중 근로/사업소득의 비중(월평균)'

In [385]:
# Elimination step 28: remove `column_to_drop_27` from comp_27 to build comp_28.
# A name starting with 'Cat_' stands for every column sharing that prefix
# (presumably the dummy columns of one categorical variable - confirm).
if column_to_drop_27.startswith('Cat_'):
    # Literal prefix match. The previous regex ('^' + name) was built from the raw,
    # unescaped name and would misfire on names containing characters like '(' or '/'.
    dropped_cols_28 = [c for c in comp_27.columns if str(c).startswith(column_to_drop_27)]
else:
    dropped_cols_28 = [column_to_drop_27]
comp_28 = comp_27.drop(columns=dropped_cols_28)

# The feature/label split is the same whichever branch selected the columns.
X_28 = comp_28.drop(columns='target')
y_28 = comp_28['target']

print(X_28.shape)

(8444, 119)


In [386]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [387]:
def objective(trial):
    """Return the validation ROC AUC of a decision tree built from one trial's parameters.

    max_depth is sampled from [1, 10]; min_samples_split and min_samples_leaf
    from [3, 10]. The tree is trained on the notebook-level X_train / y_train and
    scored on X_val / y_val using column 1 of predict_proba.
    """
    # Keyword arguments are evaluated left to right, preserving the sampling order.
    hyperparams = dict(
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    candidate = DecisionTreeClassifier(random_state=0, **hyperparams)
    candidate.fit(X_train, y_train)

    proba_matrix = candidate.predict_proba(X_val)
    return roc_auc_score(y_val, proba_matrix[:, 1])

# Hyperparameter search: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed sampler seed so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so the Hyperband pruner configured here has nothing to act on.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement - confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials=200 is the upper bound on the number of trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [388]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7574041253607591


In [389]:
optuna_28 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [390]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(decimal.Decimal(auc_28).quantize(decimal.Decimal('1.000')))

0.743


In [391]:
# bootstrap_auc indexes its inputs positionally (X_train[idx]), so plain NumPy
# arrays are needed. np.asarray leaves an existing array untouched, so this cell
# can be re-run; `.values` raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [392]:
auc_bootstrap = []

In [393]:
rs = RandomState(seed = 28)
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72209378, 0.74644599])

In [394]:
np.mean(auc_bootstrap)

0.7349285027596816

In [395]:
t_28 = auc_bootstrap
print(t_28)

[0.7318521251809627, 0.7335677366087586, 0.73179203990228, 0.7396589700959102, 0.7306723330618892, 0.721162600660514, 0.746806997602244, 0.742560028727832, 0.7378302569670647, 0.7377482582338039, 0.731276720276873, 0.7406245758686211, 0.7265575518005791, 0.7270424753438292, 0.7407560565960912, 0.7378535841929063, 0.7312526861654, 0.7352176076728193, 0.7319843127940644, 0.7340646772077453, 0.7301400481813247, 0.732609906577995, 0.7331556222855592, 0.7410204318222946, 0.7332757928429243, 0.7381745102696345, 0.7466571378483533, 0.7425663906985162, 0.7336080290897575, 0.7313049957021354, 0.7364030548769454, 0.7330460550126674, 0.7274072283297139, 0.7434521183948606, 0.734333293747738, 0.7373552298226566, 0.7250603680329352, 0.7449853533297142, 0.7332383279044516, 0.7340964870611655, 0.7437362864187477, 0.7283438517915309, 0.7379447724393775, 0.7405800420738328, 0.7366949986427795, 0.7297873122511762, 0.7266487400470503, 0.7337600095005429, 0.7389068437839307, 0.7404740092290989, 0.73451142

In [396]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [397]:
# 29
column_to_drop_28 = '소득 중 재산소득의 비중(월평균)'

In [398]:
# Elimination step 29: remove `column_to_drop_28` from comp_28 to build comp_29.
# A name starting with 'Cat_' stands for every column sharing that prefix
# (presumably the dummy columns of one categorical variable - confirm).
if column_to_drop_28.startswith('Cat_'):
    # Literal prefix match. The previous regex ('^' + name) was built from the raw,
    # unescaped name and would misfire on names containing characters like '(' or '/'.
    dropped_cols_29 = [c for c in comp_28.columns if str(c).startswith(column_to_drop_28)]
else:
    dropped_cols_29 = [column_to_drop_28]
comp_29 = comp_28.drop(columns=dropped_cols_29)

# The feature/label split is the same whichever branch selected the columns.
X_29 = comp_29.drop(columns='target')
y_29 = comp_29['target']

print(X_29.shape)

(8444, 118)


In [399]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [400]:
def objective(trial):
    """Score one Optuna trial with the validation ROC AUC of a decision tree.

    Draws max_depth from [1, 10] and min_samples_split / min_samples_leaf from
    [3, 10], fits the tree on the notebook-level X_train / y_train, and returns
    the ROC AUC computed on X_val / y_val from column 1 of predict_proba.
    """
    # Suggestions are requested in the same order as before.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed sampler seed so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so the Hyperband pruner configured here has nothing to act on.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement - confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials=200 is the upper bound on the number of trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [401]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7574041253607591


In [402]:
optuna_29 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [403]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(decimal.Decimal(auc_29).quantize(decimal.Decimal('1.000')))

0.743


In [404]:
# bootstrap_auc indexes its inputs positionally (X_train[idx]), so plain NumPy
# arrays are needed. np.asarray leaves an existing array untouched, so this cell
# can be re-run; `.values` raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [405]:
auc_bootstrap = []

In [406]:
rs = RandomState(seed = 29)
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72246889, 0.74522562])

In [407]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9914819598197937, pvalue=2.0126760258420973e-09),
 0.7347030652681641)

In [408]:
t_29 = auc_bootstrap
print(t_29)

[0.7399085007238508, 0.7304390608034745, 0.7407694874230909, 0.7220575178700688, 0.7356898072747015, 0.743119175262396, 0.7380225298588491, 0.7347432874140427, 0.7331167435758233, 0.7423161531849439, 0.7266515675895765, 0.7273026092562432, 0.7276320179605502, 0.7293066300217156, 0.7332086387079262, 0.7349185950506696, 0.734924250135722, 0.7347439942996743, 0.7243513617444806, 0.7290995125316685, 0.741845367354325, 0.7366221894227288, 0.7274199522710821, 0.7343764137712631, 0.7397756062251177, 0.7354346215617082, 0.7395628336500182, 0.7262705562341658, 0.7280639250814334, 0.7473626097086501, 0.7275754671100253, 0.7301775131197974, 0.7211633075461455, 0.7377878438291712, 0.7397812613101702, 0.7384339372964169, 0.7298502250723851, 0.7355180340662323, 0.7410974823561347, 0.7255509466612379, 0.7345015325280492, 0.7322713083604777, 0.719763673995657, 0.7334093942272892, 0.7310625339305102, 0.7352699172095547, 0.7416170432953313, 0.7377213965798045, 0.7337062861925443, 0.7277331026058631, 0.7

In [409]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [410]:
# 30 (elimination step; the column named here is removed from comp_29 to build comp_30)
column_to_drop_29 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [411]:
# Elimination step 30: remove `column_to_drop_29` from comp_29 to build comp_30.
# A name starting with 'Cat_' stands for every column sharing that prefix
# (presumably the dummy columns of one categorical variable - confirm).
if column_to_drop_29.startswith('Cat_'):
    # Literal prefix match. The previous regex ('^' + name) was built from the raw,
    # unescaped name and would misfire on names containing characters like '(' or '/'.
    dropped_cols_30 = [c for c in comp_29.columns if str(c).startswith(column_to_drop_29)]
else:
    dropped_cols_30 = [column_to_drop_29]
comp_30 = comp_29.drop(columns=dropped_cols_30)

# The feature/label split is the same whichever branch selected the columns.
X_30 = comp_30.drop(columns='target')
y_30 = comp_30['target']

print(X_30.shape)

(8444, 117)


In [412]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [413]:
def objective(trial):
    """Optuna objective: fit a DecisionTreeClassifier and return its validation ROC AUC.

    The three integer hyperparameters are sampled from the ranges listed in
    `search_space`; training uses the notebook-level X_train / y_train and the
    score is computed on X_val / y_val from column 1 of predict_proba.
    """
    # (name, low, high) triples, iterated in the original suggestion order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = DecisionTreeClassifier(random_state=0, **sampled).fit(X_train, y_train)
    second_class_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, second_class_proba)

# Hyperparameter search: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed sampler seed so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so the Hyperband pruner configured here has nothing to act on.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement - confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials=200 is the upper bound on the number of trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [414]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7574041253607591


In [415]:
optuna_30 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [416]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(decimal.Decimal(auc_30).quantize(decimal.Decimal('1.000')))

0.743


In [417]:
# bootstrap_auc indexes its inputs positionally (X_train[idx]), so plain NumPy
# arrays are needed. np.asarray leaves an existing array untouched, so this cell
# can be re-run; `.values` raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [418]:
auc_bootstrap = []

In [419]:
rs = RandomState(seed = 30)
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72228151, 0.74628102])

In [420]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9964823126792908, pvalue=0.0001398184831487015),
 0.7349837815695123)

In [421]:
t_30 = auc_bootstrap
print(t_30)

[0.7330552445258778, 0.7269816831795151, 0.7416841974303292, 0.7333719292888164, 0.7391344609572927, 0.7304079578356858, 0.7293751979279768, 0.7339593512486428, 0.7359697339847991, 0.7317821435034383, 0.7352861755790807, 0.7393231994209193, 0.7471993191277597, 0.73150645810713, 0.7420531917300035, 0.7364249683315238, 0.7433884986880201, 0.7415378721045964, 0.7334397903094463, 0.7220476214712268, 0.7361506967064786, 0.7361429209645314, 0.7312166349981903, 0.7357392892689105, 0.7366398615635179, 0.7339148174538546, 0.7324317713988417, 0.7409582258867173, 0.7340356948968513, 0.7242566390698516, 0.7302241675714802, 0.7320974144951142, 0.7363189354867897, 0.7374619695530221, 0.7296869344914947, 0.7370767168838219, 0.7339473341929064, 0.7360609222312704, 0.7341876753076366, 0.733825749864278, 0.7355752918023887, 0.7266975151556279, 0.7364603126131017, 0.7445279983260948, 0.7231645007690916, 0.7247062183315237, 0.73249892553384, 0.7327428010767281, 0.7362475400380022, 0.7307409009681506, 0.73

In [422]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [423]:
# 31
column_to_drop_30 = 'Cat_현재 대중교통 접근용이성'

In [424]:
# Elimination step 31: remove `column_to_drop_30` from comp_30 to build comp_31.
# A name starting with 'Cat_' stands for every column sharing that prefix
# (presumably the dummy columns of one categorical variable - confirm).
if column_to_drop_30.startswith('Cat_'):
    # Literal prefix match. The previous regex ('^' + name) was built from the raw,
    # unescaped name and would misfire on names containing characters like '(' or '/'.
    dropped_cols_31 = [c for c in comp_30.columns if str(c).startswith(column_to_drop_30)]
else:
    dropped_cols_31 = [column_to_drop_30]
comp_31 = comp_30.drop(columns=dropped_cols_31)

# The feature/label split is the same whichever branch selected the columns.
X_31 = comp_31.drop(columns='target')
y_31 = comp_31['target']

print(X_31.shape)

(8444, 113)


In [425]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [426]:
def objective(trial):
    """Return the validation ROC AUC of a decision tree built from one trial's parameters.

    max_depth is sampled from [1, 10]; min_samples_split and min_samples_leaf
    from [3, 10]. The tree is trained on the notebook-level X_train / y_train and
    scored on X_val / y_val using column 1 of predict_proba.
    """
    # Keyword arguments are evaluated left to right, preserving the sampling order.
    hyperparams = dict(
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    candidate = DecisionTreeClassifier(random_state=0, **hyperparams)
    candidate.fit(X_train, y_train)

    proba_matrix = candidate.predict_proba(X_val)
    return roc_auc_score(y_val, proba_matrix[:, 1])

# Hyperparameter search: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed sampler seed so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so the Hyperband pruner configured here has nothing to act on.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement - confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials=200 is the upper bound on the number of trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [427]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7574041253607591


In [428]:
optuna_31= DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [429]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(decimal.Decimal(auc_31).quantize(decimal.Decimal('1.000')))

0.743


In [430]:
# bootstrap_auc indexes its inputs positionally (X_train[idx]), so plain NumPy
# arrays are needed. np.asarray leaves an existing array untouched, so this cell
# can be re-run; `.values` raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [431]:
auc_bootstrap = []

In [432]:
rs = RandomState(seed = 31)
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7226477 , 0.74603708])

In [433]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9966182708740234, pvalue=0.00020526604203041643),
 0.7349751752369481)

In [434]:
t_31 = auc_bootstrap
print(t_31)

[0.7369565463264567, 0.7179038578990228, 0.7386884161237786, 0.7396625045240681, 0.7338193878935939, 0.7300396704216432, 0.7317531611925443, 0.7381575450144771, 0.7421302422638437, 0.7298580008143323, 0.7320147088762214, 0.734257656985161, 0.7320125882193268, 0.7357378754976474, 0.7344796190734708, 0.7291539427252987, 0.7389492569218241, 0.7334885654180239, 0.7361662481903727, 0.7341176936301121, 0.7380607016829532, 0.7323526002081071, 0.7366539992761492, 0.731903020946435, 0.7263878992490047, 0.7328707473760406, 0.7298311391603329, 0.7423394804107855, 0.7355590334328628, 0.7346308925986247, 0.732180826999638, 0.7361266625950054, 0.7356268944534925, 0.7355710504885993, 0.7364235545602607, 0.7367345842381469, 0.7244771873868983, 0.7343820688563156, 0.7364680883550488, 0.7337338547321751, 0.7438069749819036, 0.739331682048498, 0.7308773298950416, 0.7321461896036916, 0.7453974676529136, 0.7411250508957654, 0.736652585504886, 0.7420595537006875, 0.7298184152189648, 0.7305896274429967, 0.72

In [435]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [436]:
# 32
column_to_drop_31 = 'Cat_현재 의료시설 접근용이성'

In [437]:
# Elimination step 32: remove `column_to_drop_31` from comp_31 to build comp_32.
# A name starting with 'Cat_' stands for every column sharing that prefix
# (presumably the dummy columns of one categorical variable - confirm).
if column_to_drop_31.startswith('Cat_'):
    # Literal prefix match. The previous regex ('^' + name) was built from the raw,
    # unescaped name and would misfire on names containing characters like '(' or '/'.
    dropped_cols_32 = [c for c in comp_31.columns if str(c).startswith(column_to_drop_31)]
else:
    dropped_cols_32 = [column_to_drop_31]
comp_32 = comp_31.drop(columns=dropped_cols_32)

# The feature/label split is the same whichever branch selected the columns.
X_32 = comp_32.drop(columns='target')
y_32 = comp_32['target']

print(X_32.shape)

(8444, 109)


In [438]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [439]:
def objective(trial):
    """Score one Optuna trial with the validation ROC AUC of a decision tree.

    Draws max_depth from [1, 10] and min_samples_split / min_samples_leaf from
    [3, 10], fits the tree on the notebook-level X_train / y_train, and returns
    the ROC AUC computed on X_val / y_val from column 1 of predict_proba.
    """
    # Suggestions are requested in the same order as before.
    depth = trial.suggest_int('max_depth', 1, 10)
    split_min = trial.suggest_int('min_samples_split', 3, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 3, 10)

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=0,
    )
    tree.fit(X_train, y_train)

    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed sampler seed so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so the Hyperband pruner configured here has nothing to act on.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement - confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials=200 is the upper bound on the number of trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [440]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7574041253607591


In [441]:
optuna_32 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [442]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(decimal.Decimal(auc_32).quantize(decimal.Decimal('1.000')))

0.743


In [443]:
# bootstrap_auc indexes its inputs positionally (X_train[idx]), so plain NumPy
# arrays are needed. np.asarray leaves an existing array untouched, so this cell
# can be re-run; `.values` raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [444]:
auc_bootstrap = []

In [445]:
rs = RandomState(seed = 32)
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72259473, 0.74655469])

In [446]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.997129499912262, pvalue=0.0009156885789707303),
 0.7350920743276104)

In [447]:
t_32 = auc_bootstrap
print(t_32)

[0.7231503630564603, 0.7208353126131017, 0.7351794358487151, 0.7307133324285198, 0.7336348907437569, 0.7424490476836771, 0.7411314128664495, 0.7286258991585233, 0.7348259930329353, 0.7386657957835687, 0.7413646851248643, 0.7416389567499095, 0.7246786497918929, 0.7356410321661239, 0.7398074160785378, 0.741385891693811, 0.741238152596815, 0.7317411441368078, 0.7379200314422729, 0.7392942171100253, 0.7356961692453856, 0.735178022077452, 0.7377828956297502, 0.7370095627488237, 0.7248850603963084, 0.7267533591205212, 0.7401735828356859, 0.7226265608034745, 0.7320550013572205, 0.732894074601882, 0.734268967155266, 0.736580483170467, 0.7397388481722765, 0.7412734968783931, 0.7402258923724213, 0.7336094428610207, 0.7440664020086862, 0.7345375836952588, 0.7357668578085413, 0.7358827870521173, 0.7402767881378935, 0.732226067680058, 0.7516364402370612, 0.7358375463716973, 0.7389888425171913, 0.7299081896941729, 0.7387866732265653, 0.7365712936572566, 0.7400795670466884, 0.7393854053564966, 0.7394

In [448]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [449]:
# 33 (elimination step; columns starting with this name are removed from comp_32 to build comp_33)
column_to_drop_32 = 'Cat_이사 계획 중인 주택의 유형'

In [450]:
# Elimination step 33: remove `column_to_drop_32` from comp_32 to build comp_33.
# A name starting with 'Cat_' stands for every column sharing that prefix
# (presumably the dummy columns of one categorical variable - confirm).
if column_to_drop_32.startswith('Cat_'):
    # Literal prefix match. The previous regex ('^' + name) was built from the raw,
    # unescaped name and would misfire on names containing characters like '(' or '/'.
    dropped_cols_33 = [c for c in comp_32.columns if str(c).startswith(column_to_drop_32)]
else:
    dropped_cols_33 = [column_to_drop_32]
comp_33 = comp_32.drop(columns=dropped_cols_33)

# The feature/label split is the same whichever branch selected the columns.
X_33 = comp_33.drop(columns='target')
y_33 = comp_33['target']

print(X_33.shape)

(8444, 90)


In [451]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [452]:
def objective(trial):
    """Optuna objective: fit a DecisionTreeClassifier and return its validation ROC AUC.

    The three integer hyperparameters are sampled from the ranges listed in
    `search_space`; training uses the notebook-level X_train / y_train and the
    score is computed on X_val / y_val from column 1 of predict_proba.
    """
    # (name, low, high) triples, iterated in the original suggestion order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = DecisionTreeClassifier(random_state=0, **sampled).fit(X_train, y_train)
    second_class_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, second_class_proba)

# Hyperparameter search: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed sampler seed so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so the Hyperband pruner configured here has nothing to act on.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement - confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials=200 is the upper bound on the number of trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [453]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7604305243106352


In [454]:
optuna_33 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [455]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(decimal.Decimal(auc_33).quantize(decimal.Decimal('1.000')))

0.743


In [456]:
# bootstrap_auc indexes its inputs positionally (X_train[idx]), so plain NumPy
# arrays are needed. np.asarray leaves an existing array untouched, so this cell
# can be re-run; `.values` raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [457]:
auc_bootstrap = []

In [458]:
rs = RandomState(seed = 33)
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72171878, 0.74621627])

In [459]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9968011379241943, pvalue=0.00034714964567683637),
 0.7348736961494526)

In [460]:
t_33 = auc_bootstrap
print(t_33)

[0.7342117094191096, 0.7444368100796237, 0.7335790467788635, 0.7293497500452407, 0.7232804300126674, 0.731578560441549, 0.7215351293883461, 0.7461510077361564, 0.752502375135722, 0.7322698945892145, 0.7385187635722041, 0.7366405684491495, 0.7367798249185668, 0.7343375350615273, 0.7202238565418024, 0.7312166349981905, 0.7334334283387622, 0.7418241607853783, 0.738959153320666, 0.7392397869163951, 0.732600010179153, 0.7437765788997466, 0.7392129252623959, 0.7261751266739052, 0.7372286972946073, 0.7406061968422005, 0.7340498326094824, 0.7365338287187839, 0.7265179662052117, 0.7355300511219689, 0.7283926269001085, 0.7375573991132827, 0.7350203865816142, 0.7318648491223307, 0.7335832880926528, 0.7250935916576184, 0.7361082835685849, 0.7386191413318856, 0.7330898819218241, 0.734511428926891, 0.7392341318313427, 0.7434683767643866, 0.7387668804288816, 0.734453464305103, 0.7273421948516104, 0.7241025380021715, 0.7477330177795874, 0.737585674538545, 0.7389238090390879, 0.7298975864096996, 0.7348

In [461]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [462]:
# 34
column_to_drop_33 = 'Cat_현재 상업시설 접근용이성'

In [463]:
# Elimination step 34: remove `column_to_drop_33` from comp_33 to build comp_34.
# A name starting with 'Cat_' stands for every column sharing that prefix
# (presumably the dummy columns of one categorical variable - confirm).
if column_to_drop_33.startswith('Cat_'):
    # Literal prefix match. The previous regex ('^' + name) was built from the raw,
    # unescaped name and would misfire on names containing characters like '(' or '/'.
    dropped_cols_34 = [c for c in comp_33.columns if str(c).startswith(column_to_drop_33)]
else:
    dropped_cols_34 = [column_to_drop_33]
comp_34 = comp_33.drop(columns=dropped_cols_34)

# The feature/label split is the same whichever branch selected the columns.
X_34 = comp_34.drop(columns='target')
y_34 = comp_34['target']

print(X_34.shape)

(8444, 86)


In [464]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [465]:
def objective(trial):
    """Return the validation ROC AUC of a decision tree built from one trial's parameters.

    max_depth is sampled from [1, 10]; min_samples_split and min_samples_leaf
    from [3, 10]. The tree is trained on the notebook-level X_train / y_train and
    scored on X_val / y_val using column 1 of predict_proba.
    """
    # Keyword arguments are evaluated left to right, preserving the sampling order.
    hyperparams = dict(
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    candidate = DecisionTreeClassifier(random_state=0, **hyperparams)
    candidate.fit(X_train, y_train)

    proba_matrix = candidate.predict_proba(X_val)
    return roc_auc_score(y_val, proba_matrix[:, 1])

# Hyperparameter search: maximise the validation AUC returned by objective().
direction = "maximize"
# Fixed sampler seed so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so the Hyperband pruner configured here has nothing to act on.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops
# the study after 10 trials without improvement - confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials=200 is the upper bound on the number of trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [466]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 9}
0.7636414463071083


In [467]:
optuna_34 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [468]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(decimal.Decimal(auc_34).quantize(decimal.Decimal('1.000')))

0.736


In [469]:
# bootstrap_auc indexes its inputs positionally (X_train[idx]), so plain NumPy
# arrays are needed. np.asarray leaves an existing array untouched, so this cell
# can be re-run; `.values` raised AttributeError on a second execution.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [470]:
auc_bootstrap = []

In [471]:
rs = RandomState(seed = 34)
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71817977, 0.74524347])

In [472]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9972063899040222, pvalue=0.0011548292823135853),
 0.7318949734069625)

In [473]:
t_34 = auc_bootstrap
print(t_34)

[0.7287686900560982, 0.7226887667390517, 0.7402591159971046, 0.7337345616178068, 0.7310872749276149, 0.7321970853691641, 0.7306398163228374, 0.7260075947792255, 0.7360609222312703, 0.7245931166304741, 0.7253664495114007, 0.7382692329442634, 0.7285318833695259, 0.7266077406804199, 0.7380727187386898, 0.7332227764205573, 0.7395204205121244, 0.7336886140517553, 0.7278165151103874, 0.7404930951411509, 0.7431637090571843, 0.7246553225660515, 0.7330672615816143, 0.7339501617354324, 0.7309211568041982, 0.7278306528230183, 0.731554526330076, 0.7391613226112921, 0.723935006107492, 0.7224774079352153, 0.737108526737242, 0.7217903151013392, 0.7365140359211002, 0.7325017530763662, 0.7321709306007962, 0.7339225931958017, 0.7366370340209917, 0.7455402585504886, 0.7401891343195801, 0.7265780514838944, 0.7280059604596454, 0.7410748620159248, 0.7345488938653638, 0.7355441888346, 0.7338674561165399, 0.7353250542888164, 0.7110520154723128, 0.7336228736880201, 0.7315276646760768, 0.7271258878483532, 0.731

In [474]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [475]:
# 35
column_to_drop_34 = '현재 무주택 기간(총 개월)'

In [476]:
# Elimination step 35: remove `column_to_drop_34` from comp_34 to build comp_35.
# A name starting with 'Cat_' stands for every column sharing that prefix
# (presumably the dummy columns of one categorical variable - confirm).
if column_to_drop_34.startswith('Cat_'):
    # Literal prefix match. The previous regex ('^' + name) was built from the raw,
    # unescaped name and would misfire on names containing characters like '(' or '/'.
    dropped_cols_35 = [c for c in comp_34.columns if str(c).startswith(column_to_drop_34)]
else:
    dropped_cols_35 = [column_to_drop_34]
comp_35 = comp_34.drop(columns=dropped_cols_35)

# The feature/label split is the same whichever branch selected the columns.
X_35 = comp_35.drop(columns='target')
y_35 = comp_35['target']

print(X_35.shape)

(8444, 85)


In [477]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [478]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Samples `max_depth` (1-10), `min_samples_split` (3-10) and
    `min_samples_leaf` (3-10) from `trial`, fits a DecisionTreeClassifier
    (random_state=0) on the notebook-level X_train/y_train and returns the
    ROC AUC of column 1 of its predicted probabilities on X_val/y_val.
    """
    search_space = {
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    # Dict order is insertion order, so parameters are suggested in the listed order.
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by `objective`.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# halts the search after 10 trials without improvement - confirm there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [479]:
# Show the best hyperparameters, then keep and show the best validation AUC.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 9}
0.7627939662068691


In [480]:
# Refit a tree with the study's best hyperparameters on the training split.
optuna_35 = DecisionTreeClassifier(
    random_state=0,
    **study.best_trial.params,
)
optuna_35.fit(X_train, y_train)

In [481]:
# Test-split ROC AUC from column 1 of the predicted class probabilities.
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_true=y_test, y_score=optuna_proba_35)
# Print the AUC quantized to three decimal places.
print(
    decimal.Decimal(auc_35).quantize(decimal.Decimal('1.000'))
)

0.737


In [482]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it
# needs NumPy arrays. np.asarray also accepts an ndarray, so re-running this
# cell no longer fails the way `.values` does on an already-converted array.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [483]:
# Start from an empty list: bootstrap_auc appends each resample's AUC to this global.
auc_bootstrap = list()

In [484]:
# bootstrap_auc draws its resample indices from the global generator `rs`.
rs = RandomState(seed=35)
# bootstrap_auc refits whatever estimator it receives on every resample, so
# hand it an unfitted clone and leave optuna_35 fitted on the training split.
bootstrap_auc(sklearn.clone(optuna_35), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71815031, 0.74504183])

In [485]:
# Shapiro-Wilk normality test of the bootstrap AUCs, displayed with their mean.
(
    shapiro(auc_bootstrap),
    np.mean(auc_bootstrap),
)

(ShapiroResult(statistic=0.9958915710449219, pvalue=2.808445606206078e-05),
 0.7322735862994253)

In [486]:
# Snapshot this iteration's bootstrap AUCs. A copy (not an alias) keeps t_35
# intact if `auc_bootstrap` is appended to again later.
t_35 = list(auc_bootstrap)
print(t_35)

[0.7520308824194715, 0.7258450110839666, 0.7370533896579804, 0.7370357175171915, 0.7314025459192905, 0.7320613633279045, 0.7228909360296779, 0.7297130892598624, 0.7302319433134273, 0.7352847618078175, 0.7364518299855229, 0.7370321830890335, 0.7355611540897576, 0.7309204499185669, 0.723795042752443, 0.7416559220050669, 0.729490420285921, 0.724911215164676, 0.7304079578356859, 0.7379334622692725, 0.7362708672638437, 0.7313862875497648, 0.7236508380836048, 0.7324727707654722, 0.7378224812251177, 0.7394440768639159, 0.7332220695349259, 0.7485346260857763, 0.7277917741132828, 0.7164010190463265, 0.73673599800941, 0.7360623360025335, 0.7217613327904452, 0.7234055487694535, 0.7257382713536011, 0.7318026431867534, 0.7373410921100253, 0.7314145629750272, 0.7278158082247558, 0.7328622647484618, 0.7388898785287732, 0.7329859697339848, 0.7444276205664133, 0.7291574771534564, 0.7414516320575463, 0.7247450970412594, 0.7306892983170468, 0.7419733136536374, 0.7458534088852695, 0.7339607650199059, 0.73

In [487]:
# Clear the current split, study and best score before the next iteration rebinds them.
del (
    X_train, X_test,
    y_train, y_test,
    X_val, y_val,
    study, optuna_auc,
)

In [488]:
# Iteration 36: feature removed from comp_35 to build comp_36
# (name roughly: "share of other assets in total assets").
column_to_drop_35 = '자산 중 기타자산의 비중'

In [489]:
# Build comp_36 by removing `column_to_drop_35` from comp_35.
if column_to_drop_35.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns
    # (presumably the dummies of one categorical variable). Match it
    # literally: the previous regex ('^' + name) would misinterpret
    # characters such as '(' or '.' inside a column name.
    cols_to_drop = [col for col in comp_35.columns if str(col).startswith(column_to_drop_35)]
else:
    cols_to_drop = [column_to_drop_35]
comp_36 = comp_35.drop(columns=cols_to_drop)

# Feature matrix and label for this iteration.
X_36 = comp_36.drop(columns='target')
y_36 = comp_36['target']

print(X_36.shape)

(8444, 84)


In [490]:
# Hold out 20% of the rows as the test split (stratified on the label, seed 0).
X_train, X_test, y_train, y_test = train_test_split(
    X_36, y_36,
    test_size=0.2, shuffle=True, stratify=y_36, random_state=0,
)
# Split another 20% off the remainder as the validation split read by `objective`.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [491]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Samples `max_depth` (1-10), `min_samples_split` (3-10) and
    `min_samples_leaf` (3-10) from `trial`, fits a DecisionTreeClassifier
    (random_state=0) on the notebook-level X_train/y_train and returns the
    ROC AUC of column 1 of its predicted probabilities on X_val/y_val.
    """
    search_space = {
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    # Dict order is insertion order, so parameters are suggested in the listed order.
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by `objective`.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# halts the search after 10 trials without improvement - confirm there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])


In [492]:
# Show the best hyperparameters, then keep and show the best validation AUC.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 9}
0.7627773922805281


In [493]:
# Refit a tree with the study's best hyperparameters on the training split.
optuna_36 = DecisionTreeClassifier(
    random_state=0,
    **study.best_trial.params,
)
optuna_36.fit(X_train, y_train)

In [494]:
# Test-split ROC AUC from column 1 of the predicted class probabilities.
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_true=y_test, y_score=optuna_proba_36)
# Print the AUC quantized to three decimal places.
print(
    decimal.Decimal(auc_36).quantize(decimal.Decimal('1.000'))
)

0.737


In [495]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it
# needs NumPy arrays. np.asarray also accepts an ndarray, so re-running this
# cell no longer fails the way `.values` does on an already-converted array.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [496]:
# Start from an empty list: bootstrap_auc appends each resample's AUC to this global.
auc_bootstrap = list()

In [497]:
# bootstrap_auc draws its resample indices from the global generator `rs`.
rs = RandomState(seed=36)
# bootstrap_auc refits whatever estimator it receives on every resample, so
# hand it an unfitted clone and leave optuna_36 fitted on the training split.
bootstrap_auc(sklearn.clone(optuna_36), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71757608, 0.74591636])

In [498]:
# Shapiro-Wilk normality test of the bootstrap AUCs, displayed with their mean.
(
    shapiro(auc_bootstrap),
    np.mean(auc_bootstrap),
)

(ShapiroResult(statistic=0.9966986775398254, pvalue=0.00025829620426520705),
 0.7322196399690102)

In [499]:
# Snapshot this iteration's bootstrap AUCs. A copy (not an alias) keeps t_36
# intact if `auc_bootstrap` is appended to again later.
t_36 = list(auc_bootstrap)
print(t_36)

[0.7320076400199059, 0.7367218602967789, 0.7437490103601159, 0.7175793973941368, 0.732087518096272, 0.7258768209373869, 0.7180826999638074, 0.7253424153999277, 0.7412501696525515, 0.7330821061798769, 0.7289885314875135, 0.7413660988961274, 0.7215584566141874, 0.7344046891965255, 0.721117359980094, 0.7294034733532392, 0.7241046586590663, 0.72976186436844, 0.7146783387622151, 0.7115369390155628, 0.7387555702587766, 0.7308306754433587, 0.7211838072294607, 0.7312512723941368, 0.7378783251900108, 0.7277076547231269, 0.734688150334781, 0.7384007136717335, 0.7253721045964532, 0.7371417503619254, 0.7406945575461455, 0.7360708186301121, 0.7260330426619617, 0.7283353691639523, 0.7307507973669924, 0.7422843433315237, 0.7378825665038002, 0.7412720831071299, 0.7347864074375681, 0.7342788635541078, 0.7347772179243575, 0.729860121471227, 0.7232634647575101, 0.730669505519363, 0.7464217449330437, 0.7268855467336228, 0.7313848737785015, 0.7274355037549765, 0.731490199737604, 0.7350345242942452, 0.73207

In [500]:
# Clear the current split, study and best score before the next iteration rebinds them.
del (
    X_train, X_test,
    y_train, y_test,
    X_val, y_val,
    study, optuna_auc,
)

In [501]:
# Iteration 37: feature group removed from comp_36 to build comp_37
# (name roughly: "tenure type of the home the household plans to move to").
column_to_drop_36 = 'Cat_이사 계획 중인 주택의 점유형태'

In [502]:
# Build comp_37 by removing `column_to_drop_36` from comp_36.
if column_to_drop_36.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns
    # (presumably the dummies of one categorical variable). Match it
    # literally: the previous regex ('^' + name) would misinterpret
    # characters such as '(' or '.' inside a column name.
    cols_to_drop = [col for col in comp_36.columns if str(col).startswith(column_to_drop_36)]
else:
    cols_to_drop = [column_to_drop_36]
comp_37 = comp_36.drop(columns=cols_to_drop)

# Feature matrix and label for this iteration.
X_37 = comp_37.drop(columns='target')
y_37 = comp_37['target']

print(X_37.shape)

(8444, 60)


In [503]:
# Hold out 20% of the rows as the test split (stratified on the label, seed 0).
X_train, X_test, y_train, y_test = train_test_split(
    X_37, y_37,
    test_size=0.2, shuffle=True, stratify=y_37, random_state=0,
)
# Split another 20% off the remainder as the validation split read by `objective`.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [504]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Samples `max_depth` (1-10), `min_samples_split` (3-10) and
    `min_samples_leaf` (3-10) from `trial`, fits a DecisionTreeClassifier
    (random_state=0) on the notebook-level X_train/y_train and returns the
    ROC AUC of column 1 of its predicted probabilities on X_val/y_val.
    """
    search_space = {
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    # Dict order is insertion order, so parameters are suggested in the listed order.
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by `objective`.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# halts the search after 10 trials without improvement - confirm there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [505]:
# Show the best hyperparameters, then keep and show the best validation AUC.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 7}
0.7626370663708405


In [506]:
# Refit a tree with the study's best hyperparameters on the training split.
optuna_37 = DecisionTreeClassifier(
    random_state=0,
    **study.best_trial.params,
)
optuna_37.fit(X_train, y_train)

In [507]:
# Test-split ROC AUC from column 1 of the predicted class probabilities.
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_true=y_test, y_score=optuna_proba_37)
# Print the AUC quantized to three decimal places.
print(
    decimal.Decimal(auc_37).quantize(decimal.Decimal('1.000'))
)

0.733


In [508]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it
# needs NumPy arrays. np.asarray also accepts an ndarray, so re-running this
# cell no longer fails the way `.values` does on an already-converted array.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [509]:
# Start from an empty list: bootstrap_auc appends each resample's AUC to this global.
auc_bootstrap = list()

In [510]:
# bootstrap_auc draws its resample indices from the global generator `rs`.
rs = RandomState(seed=37)
# bootstrap_auc refits whatever estimator it receives on every resample, so
# hand it an unfitted clone and leave optuna_37 fitted on the training split.
bootstrap_auc(sklearn.clone(optuna_37), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71780664, 0.74560255])

In [511]:
# Shapiro-Wilk normality test of the bootstrap AUCs, displayed with their mean.
(
    shapiro(auc_bootstrap),
    np.mean(auc_bootstrap),
)

(ShapiroResult(statistic=0.9972085356712341, pvalue=0.0011624203762039542),
 0.7321930808620611)

In [512]:
# Snapshot this iteration's bootstrap AUCs. A copy (not an alias) keeps t_37
# intact if `auc_bootstrap` is appended to again later.
t_37 = list(auc_bootstrap)
print(t_37)

[0.728191164495114, 0.739497093286283, 0.726821220141151, 0.7263256933134274, 0.7275061923181325, 0.7335338060984437, 0.7183145584509592, 0.7377277585504884, 0.7405241981089395, 0.73640517553384, 0.7449733362739775, 0.7355279304650741, 0.7411080856406081, 0.745231349529497, 0.7415951298407528, 0.7358728906532753, 0.7246779429062613, 0.7370138040626131, 0.741675007917119, 0.7454130191368079, 0.7296381593829171, 0.740476836771625, 0.7316746968874411, 0.732250101791531, 0.7189274282935215, 0.7366610681324647, 0.7363337800850525, 0.724850423000362, 0.7262945903456387, 0.7263461929967427, 0.733530271670286, 0.735378777596815, 0.7393882328990228, 0.7259171134183857, 0.725594066684763, 0.7327647145313064, 0.7250681437748824, 0.7375835538816503, 0.7223862196887441, 0.7363033840028953, 0.7434422219960187, 0.7209830517100976, 0.7336836658523345, 0.7353300024882374, 0.7372576796055014, 0.739177580980818, 0.7435192725298587, 0.7408069523615635, 0.7347666146398841, 0.7455070349258053, 0.74273321570

In [513]:
# Clear the current split, study and best score before the next iteration rebinds them.
del (
    X_train, X_test,
    y_train, y_test,
    X_val, y_val,
    study, optuna_auc,
)

In [514]:
# Iteration 38: feature removed from comp_37 to build comp_38
# (name roughly: "share of financial-institution loans in total debt").
column_to_drop_37 = '부채 중 금융기관 대출금의 비중'

In [515]:
# Build comp_38 by removing `column_to_drop_37` from comp_37.
if column_to_drop_37.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns
    # (presumably the dummies of one categorical variable). Match it
    # literally: the previous regex ('^' + name) would misinterpret
    # characters such as '(' or '.' inside a column name.
    cols_to_drop = [col for col in comp_37.columns if str(col).startswith(column_to_drop_37)]
else:
    cols_to_drop = [column_to_drop_37]
comp_38 = comp_37.drop(columns=cols_to_drop)

# Feature matrix and label for this iteration.
X_38 = comp_38.drop(columns='target')
y_38 = comp_38['target']

print(X_38.shape)

(8444, 59)


In [516]:
# Hold out 20% of the rows as the test split (stratified on the label, seed 0).
X_train, X_test, y_train, y_test = train_test_split(
    X_38, y_38,
    test_size=0.2, shuffle=True, stratify=y_38, random_state=0,
)
# Split another 20% off the remainder as the validation split read by `objective`.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [517]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Samples `max_depth` (1-10), `min_samples_split` (3-10) and
    `min_samples_leaf` (3-10) from `trial`, fits a DecisionTreeClassifier
    (random_state=0) on the notebook-level X_train/y_train and returns the
    ROC AUC of column 1 of its predicted probabilities on X_val/y_val.
    """
    search_space = {
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    # Dict order is insertion order, so parameters are suggested in the listed order.
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by `objective`.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# halts the search after 10 trials without improvement - confirm there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [518]:
# Show the best hyperparameters, then keep and show the best validation AUC.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 7}
0.7631099757357717


In [519]:
# Refit a tree with the study's best hyperparameters on the training split.
optuna_38 = DecisionTreeClassifier(
    random_state=0,
    **study.best_trial.params,
)
optuna_38.fit(X_train, y_train)

In [520]:
# Test-split ROC AUC from column 1 of the predicted class probabilities.
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_true=y_test, y_score=optuna_proba_38)
# Print the AUC quantized to three decimal places.
print(
    decimal.Decimal(auc_38).quantize(decimal.Decimal('1.000'))
)

0.733


In [521]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it
# needs NumPy arrays. np.asarray also accepts an ndarray, so re-running this
# cell no longer fails the way `.values` does on an already-converted array.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [522]:
# Start from an empty list: bootstrap_auc appends each resample's AUC to this global.
auc_bootstrap = list()

In [523]:
# bootstrap_auc draws its resample indices from the global generator `rs`.
rs = RandomState(seed=38)
# bootstrap_auc refits whatever estimator it receives on every resample, so
# hand it an unfitted clone and leave optuna_38 fitted on the training split.
bootstrap_auc(sklearn.clone(optuna_38), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7174002, 0.7460744])

In [524]:
# Shapiro-Wilk normality test of the bootstrap AUCs, displayed with their mean.
(
    shapiro(auc_bootstrap),
    np.mean(auc_bootstrap),
)

(ShapiroResult(statistic=0.9988351464271545, pvalue=0.2055213302373886),
 0.7318972396822974)

In [525]:
# Snapshot this iteration's bootstrap AUCs. A copy (not an alias) keeps t_38
# intact if `auc_bootstrap` is appended to again later.
t_38 = list(auc_bootstrap)
print(t_38)

[0.7132900153818313, 0.7309119672909881, 0.7247995272348896, 0.7263963818765835, 0.7147228725570033, 0.7311890664585596, 0.7248659744842562, 0.7234359448516106, 0.7369805804379297, 0.722869729460731, 0.7337373891603329, 0.7267208423814695, 0.7521291395222583, 0.7443859143141512, 0.7302064954306914, 0.7394659903184944, 0.7175921213355049, 0.726024560034383, 0.7232408444173001, 0.7365712936572565, 0.7323405831523706, 0.7336398389431779, 0.7305592313608397, 0.7505845944173001, 0.7185117795421644, 0.7390517553384003, 0.7346422027687296, 0.7387046744933043, 0.7266056200235251, 0.7268134443992037, 0.7433142756967065, 0.727359160106768, 0.7391959600072384, 0.7264380881288454, 0.7299364651194353, 0.7312039110568224, 0.7396837110930148, 0.7326395957745204, 0.7292097866901918, 0.7314817171100253, 0.7324480297683676, 0.7413314615001809, 0.73166055917481, 0.729943533975751, 0.7357117207292796, 0.7394716454035469, 0.7216574206026058, 0.7406528512938835, 0.7272799889160332, 0.7308031069037279, 0.735

In [526]:
# Clear the current split, study and best score before the next iteration rebinds them.
del (
    X_train, X_test,
    y_train, y_test,
    X_val, y_val,
    study, optuna_auc,
)

In [527]:
# Iteration 39: feature group removed from comp_38 to build comp_39
# (name roughly: "current level of air pollution").
column_to_drop_38 = 'Cat_현재 대기오염 정도'

In [528]:
# Build comp_39 by removing `column_to_drop_38` from comp_38.
if column_to_drop_38.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns
    # (presumably the dummies of one categorical variable). Match it
    # literally: the previous regex ('^' + name) would misinterpret
    # characters such as '(' or '.' inside a column name.
    cols_to_drop = [col for col in comp_38.columns if str(col).startswith(column_to_drop_38)]
else:
    cols_to_drop = [column_to_drop_38]
comp_39 = comp_38.drop(columns=cols_to_drop)

# Feature matrix and label for this iteration.
X_39 = comp_39.drop(columns='target')
y_39 = comp_39['target']

print(X_39.shape)

(8444, 55)


In [529]:
# Hold out 20% of the rows as the test split (stratified on the label, seed 0).
X_train, X_test, y_train, y_test = train_test_split(
    X_39, y_39,
    test_size=0.2, shuffle=True, stratify=y_39, random_state=0,
)
# Split another 20% off the remainder as the validation split read by `objective`.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [530]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Samples `max_depth` (1-10), `min_samples_split` (3-10) and
    `min_samples_leaf` (3-10) from `trial`, fits a DecisionTreeClassifier
    (random_state=0) on the notebook-level X_train/y_train and returns the
    ROC AUC of column 1 of its predicted probabilities on X_val/y_val.
    """
    search_space = {
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    # Dict order is insertion order, so parameters are suggested in the listed order.
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by `objective`.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# halts the search after 10 trials without improvement - confirm there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [531]:
# Show the best hyperparameters, then keep and show the best validation AUC.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 7}
0.7631099757357717


In [532]:
# Refit a tree with the study's best hyperparameters on the training split.
optuna_39 = DecisionTreeClassifier(
    random_state=0,
    **study.best_trial.params,
)
optuna_39.fit(X_train, y_train)

In [533]:
# Test-split ROC AUC from column 1 of the predicted class probabilities.
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_true=y_test, y_score=optuna_proba_39)
# Print the AUC quantized to three decimal places.
print(
    decimal.Decimal(auc_39).quantize(decimal.Decimal('1.000'))
)

0.733


In [534]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it
# needs NumPy arrays. np.asarray also accepts an ndarray, so re-running this
# cell no longer fails the way `.values` does on an already-converted array.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [535]:
# Start from an empty list: bootstrap_auc appends each resample's AUC to this global.
auc_bootstrap = list()

In [536]:
# bootstrap_auc draws its resample indices from the global generator `rs`.
rs = RandomState(seed=39)
# bootstrap_auc refits whatever estimator it receives on every resample, so
# hand it an unfitted clone and leave optuna_39 fitted on the training split.
bootstrap_auc(sklearn.clone(optuna_39), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71773753, 0.74654303])

In [537]:
# Shapiro-Wilk normality test of the bootstrap AUCs, displayed with their mean.
(
    shapiro(auc_bootstrap),
    np.mean(auc_bootstrap),
)

(ShapiroResult(statistic=0.9987385869026184, pvalue=0.15277153253555298),
 0.7323623976429605)

In [538]:
# Snapshot this iteration's bootstrap AUCs. A copy (not an alias) keeps t_39
# intact if `auc_bootstrap` is appended to again later.
t_39 = list(auc_bootstrap)
print(t_39)

[0.7334793759048135, 0.7266374298769454, 0.730420681777054, 0.7206508154632645, 0.7314788895674992, 0.737833084509591, 0.7251890212178792, 0.7274206591567137, 0.7442699850705755, 0.7271018537368802, 0.7408536068132465, 0.7304376470322114, 0.7345255666395223, 0.7473541270810713, 0.7387605184581975, 0.7272227311798769, 0.731530492218603, 0.7281430962721679, 0.7321603273163229, 0.731040620475932, 0.7273626945349259, 0.7328212653818315, 0.7333104302388708, 0.7381518899294245, 0.7346662368802026, 0.7335528920104958, 0.7293730772710821, 0.7227552139884185, 0.735761202723489, 0.7270368202587767, 0.7322967562432139, 0.7387124502352516, 0.7417718512486426, 0.7353455539721316, 0.7122784620430691, 0.7325597176981543, 0.7333896014296054, 0.7244064988237424, 0.7444000520267824, 0.7468084113735071, 0.7381582519001085, 0.7367239809536735, 0.7423628076366268, 0.7335380474122332, 0.7330135382736156, 0.7195537289630834, 0.725565084373869, 0.7274673136083966, 0.7485346260857764, 0.7272361620068767, 0.735

In [539]:
# Clear the current split, study and best score before the next iteration rebinds them.
del (
    X_train, X_test,
    y_train, y_test,
    X_val, y_val,
    study, optuna_auc,
)

In [540]:
# Iteration 40: feature group removed from comp_39 to build comp_40
# (name roughly: "current convenience of parking facilities").
column_to_drop_39 = 'Cat_현재 주차시설 이용편의성'

In [541]:
# Build comp_40 by removing `column_to_drop_39` from comp_39.
if column_to_drop_39.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns
    # (presumably the dummies of one categorical variable). Match it
    # literally: the previous regex ('^' + name) would misinterpret
    # characters such as '(' or '.' inside a column name.
    cols_to_drop = [col for col in comp_39.columns if str(col).startswith(column_to_drop_39)]
else:
    cols_to_drop = [column_to_drop_39]
comp_40 = comp_39.drop(columns=cols_to_drop)

# Feature matrix and label for this iteration.
X_40 = comp_40.drop(columns='target')
y_40 = comp_40['target']

print(X_40.shape)

(8444, 51)


In [542]:
# Hold out 20% of the rows as the test split (stratified on the label, seed 0).
X_train, X_test, y_train, y_test = train_test_split(
    X_40, y_40,
    test_size=0.2, shuffle=True, stratify=y_40, random_state=0,
)
# Split another 20% off the remainder as the validation split read by `objective`.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [543]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Samples `max_depth` (1-10), `min_samples_split` (3-10) and
    `min_samples_leaf` (3-10) from `trial`, fits a DecisionTreeClassifier
    (random_state=0) on the notebook-level X_train/y_train and returns the
    ROC AUC of column 1 of its predicted probabilities on X_val/y_val.
    """
    search_space = {
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    # Dict order is insertion order, so parameters are suggested in the listed order.
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by `objective`.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# halts the search after 10 trials without improvement - confirm there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [544]:
# Show the best hyperparameters, then keep and show the best validation AUC.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 7}
0.7631099757357717


In [545]:
# Refit a tree with the study's best hyperparameters on the training split.
optuna_40 = DecisionTreeClassifier(
    random_state=0,
    **study.best_trial.params,
)
optuna_40.fit(X_train, y_train)

In [546]:
# Test-split ROC AUC from column 1 of the predicted class probabilities.
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_true=y_test, y_score=optuna_proba_40)
# Print the AUC quantized to three decimal places.
print(
    decimal.Decimal(auc_40).quantize(decimal.Decimal('1.000'))
)

0.733


In [547]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it
# needs NumPy arrays. np.asarray also accepts an ndarray, so re-running this
# cell no longer fails the way `.values` does on an already-converted array.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [548]:
# Start from an empty list: bootstrap_auc appends each resample's AUC to this global.
auc_bootstrap = list()

In [549]:
# bootstrap_auc draws its resample indices from the global generator `rs`.
rs = RandomState(seed=40)
# bootstrap_auc refits whatever estimator it receives on every resample, so
# hand it an unfitted clone and leave optuna_40 fitted on the training split.
bootstrap_auc(sklearn.clone(optuna_40), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71813344, 0.7468817 ])

In [550]:
# Shapiro-Wilk normality test of the bootstrap AUCs, displayed with their mean.
(
    shapiro(auc_bootstrap),
    np.mean(auc_bootstrap),
)

(ShapiroResult(statistic=0.9982697367668152, pvalue=0.03372696787118912),
 0.7329462887090572)

In [551]:
# Snapshot this iteration's bootstrap AUCs. A copy (not an alias) keeps t_40
# intact if `auc_bootstrap` is appended to again later.
t_40 = list(auc_bootstrap)
print(t_40)

[0.7225509240408975, 0.7357103069580166, 0.7325194252171553, 0.7324056166304741, 0.7427579567046688, 0.7354218976203402, 0.7426695960007239, 0.7349383878483532, 0.7206225400380022, 0.7342823979822657, 0.7329562805374592, 0.7348415445168295, 0.7339996437296417, 0.739302699737604, 0.7091017180148389, 0.7302694082519001, 0.734015195213536, 0.7282929560260587, 0.7361408003076366, 0.726851616223308, 0.7184170568675353, 0.726083938427434, 0.7328756955754614, 0.7289482390065147, 0.73290679854325, 0.7341770720231633, 0.7209385179153094, 0.7334624106496561, 0.7350804718602968, 0.7291702010948244, 0.731524130247919, 0.7320726734980094, 0.7367338773525154, 0.7237406125588128, 0.738223285378212, 0.7409363124321389, 0.7397112796326457, 0.7307048498009411, 0.7304920772258415, 0.7337331478465435, 0.728729104460731, 0.725052592290988, 0.745981355184582, 0.7323214972403184, 0.7242502770991676, 0.7325724416395223, 0.7311558428338762, 0.7381589587857401, 0.7465885699420918, 0.7233998936844009, 0.73466765

In [552]:
# Clear the current split, study and best score before the next iteration rebinds them.
del (
    X_train, X_test,
    y_train, y_test,
    X_val, y_val,
    study, optuna_auc,
)

In [553]:
# Iteration 41: feature removed from comp_40 to build comp_41
# (name roughly: "ratio of housing maintenance cost to income").
column_to_drop_40 = '소득 대비 주거관리비의 비율'

In [554]:
# Build comp_41 by removing `column_to_drop_40` from comp_40.
if column_to_drop_40.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns
    # (presumably the dummies of one categorical variable). Match it
    # literally: the previous regex ('^' + name) would misinterpret
    # characters such as '(' or '.' inside a column name.
    cols_to_drop = [col for col in comp_40.columns if str(col).startswith(column_to_drop_40)]
else:
    cols_to_drop = [column_to_drop_40]
comp_41 = comp_40.drop(columns=cols_to_drop)

# Feature matrix and label for this iteration.
X_41 = comp_41.drop(columns='target')
y_41 = comp_41['target']

print(X_41.shape)

(8444, 50)


In [555]:
# Hold out 20% of the rows as the test split (stratified on the label, seed 0).
X_train, X_test, y_train, y_test = train_test_split(
    X_41, y_41,
    test_size=0.2, shuffle=True, stratify=y_41, random_state=0,
)
# Split another 20% off the remainder as the validation split read by `objective`.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [556]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Samples `max_depth` (1-10), `min_samples_split` (3-10) and
    `min_samples_leaf` (3-10) from `trial`, fits a DecisionTreeClassifier
    (random_state=0) on the notebook-level X_train/y_train and returns the
    ROC AUC of column 1 of its predicted probabilities on X_val/y_val.
    """
    search_space = {
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    # Dict order is insertion order, so parameters are suggested in the listed order.
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by `objective`.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# halts the search after 10 trials without improvement - confirm there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [557]:
# Show the best hyperparameters, then keep and show the best validation AUC.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 7}
0.7621520027932589


In [558]:
# Refit a tree with the study's best hyperparameters on the training split.
optuna_41 = DecisionTreeClassifier(
    random_state=0,
    **study.best_trial.params,
)
optuna_41.fit(X_train, y_train)

In [559]:
# Test-split ROC AUC from column 1 of the predicted class probabilities.
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_true=y_test, y_score=optuna_proba_41)
# Print the AUC quantized to three decimal places.
print(
    decimal.Decimal(auc_41).quantize(decimal.Decimal('1.000'))
)

0.735


In [560]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it
# needs NumPy arrays. np.asarray also accepts an ndarray, so re-running this
# cell no longer fails the way `.values` does on an already-converted array.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [561]:
# Start from an empty list: bootstrap_auc appends each resample's AUC to this global.
auc_bootstrap = list()

In [562]:
# bootstrap_auc draws its resample indices from the global generator `rs`.
rs = RandomState(seed=41)
# bootstrap_auc refits whatever estimator it receives on every resample, so
# hand it an unfitted clone and leave optuna_41 fitted on the training split.
bootstrap_auc(sklearn.clone(optuna_41), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71907324, 0.7465581 ])

In [563]:
# Shapiro-Wilk normality test of the bootstrap AUCs, displayed with their mean.
(
    shapiro(auc_bootstrap),
    np.mean(auc_bootstrap),
)

(ShapiroResult(statistic=0.9994019269943237, pvalue=0.808872640132904),
 0.7331138502929334)

In [564]:
# Snapshot this iteration's bootstrap AUCs. A copy (not an alias) keeps t_41
# intact if `auc_bootstrap` is appended to again later.
t_41 = list(auc_bootstrap)
print(t_41)

[0.7362305747828447, 0.7311494808631922, 0.730415733577633, 0.7222872557003257, 0.7328212653818313, 0.72699299334962, 0.7321051902370611, 0.7311091883821933, 0.731848590752805, 0.7252908127488238, 0.7355293442363373, 0.7322076886536373, 0.7260747489142235, 0.7340611427795873, 0.7349751459011943, 0.725259709781035, 0.7272389895494028, 0.7309070190915671, 0.7298261909609121, 0.7433411373507058, 0.7257255474122332, 0.7297689332247558, 0.7399311210640609, 0.7284180747828447, 0.737473279723127, 0.7382685260586319, 0.7278405492218603, 0.7271753698425624, 0.7281522857853782, 0.7364277958740499, 0.7482172344372059, 0.7326501990589938, 0.7116040931505611, 0.7204048192634817, 0.7269350287278321, 0.7311798769453492, 0.7465553463174086, 0.7444580166485705, 0.7345163771263121, 0.7262097640698516, 0.7221593094010135, 0.715057229460731, 0.7325611314694173, 0.7319058484889613, 0.7161104890517552, 0.7272191967517191, 0.7448418555465074, 0.7368872715345638, 0.7462789540354686, 0.7333478951773436, 0.7363

In [565]:
# Clear the current split, study and best score before the next iteration rebinds them.
del (
    X_train, X_test,
    y_train, y_test,
    X_val, y_val,
    study, optuna_auc,
)

In [566]:
# Iteration 42: feature removed from comp_41 to build comp_42
# (name roughly: "ratio of living expenses to income").
column_to_drop_41 = '소득 대비 생활비의 비율'

In [567]:
# Build comp_42 by removing `column_to_drop_41` from comp_41.
if column_to_drop_41.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns
    # (presumably the dummies of one categorical variable). Match it
    # literally: the previous regex ('^' + name) would misinterpret
    # characters such as '(' or '.' inside a column name.
    cols_to_drop = [col for col in comp_41.columns if str(col).startswith(column_to_drop_41)]
else:
    cols_to_drop = [column_to_drop_41]
comp_42 = comp_41.drop(columns=cols_to_drop)

# Feature matrix and label for this iteration.
X_42 = comp_42.drop(columns='target')
y_42 = comp_42['target']

print(X_42.shape)

(8444, 49)


In [568]:
# Hold out 20% of the rows as the test split (stratified on the label, seed 0).
X_train, X_test, y_train, y_test = train_test_split(
    X_42, y_42,
    test_size=0.2, shuffle=True, stratify=y_42, random_state=0,
)
# Split another 20% off the remainder as the validation split read by `objective`.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [569]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Samples `max_depth` (1-10), `min_samples_split` (3-10) and
    `min_samples_leaf` (3-10) from `trial`, fits a DecisionTreeClassifier
    (random_state=0) on the notebook-level X_train/y_train and returns the
    ROC AUC of column 1 of its predicted probabilities on X_val/y_val.
    """
    search_space = {
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    # Dict order is insertion order, so parameters are suggested in the listed order.
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by `objective`.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# halts the search after 10 trials without improvement - confirm there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [570]:
# Show the best hyperparameters, then keep and show the best validation AUC.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 7}
0.7621520027932589


In [571]:
# Refit a tree with the study's best hyperparameters on the training split.
optuna_42 = DecisionTreeClassifier(
    random_state=0,
    **study.best_trial.params,
)
optuna_42.fit(X_train, y_train)

In [572]:
# Test-split ROC AUC from column 1 of the predicted class probabilities.
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_true=y_test, y_score=optuna_proba_42)
# Print the AUC quantized to three decimal places.
print(
    decimal.Decimal(auc_42).quantize(decimal.Decimal('1.000'))
)

0.735


In [573]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it
# needs NumPy arrays. np.asarray also accepts an ndarray, so re-running this
# cell no longer fails the way `.values` does on an already-converted array.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [574]:
# Start from an empty list: bootstrap_auc appends each resample's AUC to this global.
auc_bootstrap = list()

In [575]:
# bootstrap_auc draws its resample indices from the global generator `rs`.
rs = RandomState(seed=42)
# bootstrap_auc refits whatever estimator it receives on every resample, so
# hand it an unfitted clone and leave optuna_42 fitted on the training split.
bootstrap_auc(sklearn.clone(optuna_42), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71937565, 0.74717187])

In [576]:
# Shapiro-Wilk normality test of the bootstrap AUCs, displayed with their mean.
(
    shapiro(auc_bootstrap),
    np.mean(auc_bootstrap),
)

(ShapiroResult(statistic=0.9946050643920898, pvalue=1.1714482752722688e-06),
 0.7340924514510948)

In [577]:
# Snapshot this iteration's bootstrap AUCs. A copy (not an alias) keeps t_42
# intact if `auc_bootstrap` is appended to again later.
t_42 = list(auc_bootstrap)
print(t_42)

[0.722184050398118, 0.7275097267462902, 0.7194095242942453, 0.7415428203040173, 0.7290344790535649, 0.7424292548859935, 0.7345361699239956, 0.7237441469869708, 0.7408776409247195, 0.7297498473127036, 0.7308363305284111, 0.7289913590300399, 0.7406804198335143, 0.7375623473127036, 0.7284307987242129, 0.7357809955211726, 0.7253558462269273, 0.7292521998280855, 0.7438062680962721, 0.7418701083514296, 0.7336645799402822, 0.7378875147032211, 0.7353745362830257, 0.7375545715707563, 0.7272750407166123, 0.7343318799764749, 0.7376825178700688, 0.7222526183043794, 0.7343870170557366, 0.7296713830076004, 0.727553553655447, 0.7342986563517916, 0.739661090752805, 0.7428696446344553, 0.7434521183948606, 0.7252879852062973, 0.7309133810622511, 0.7319956229641694, 0.7210388956749909, 0.720126306324647, 0.7410557761038725, 0.7307550386807817, 0.7340632634364821, 0.7309791214259863, 0.7306497127216793, 0.7385145222584147, 0.7243683269996382, 0.7414657697701773, 0.7157019091567137, 0.7227000769091567, 0.7

In [578]:
# Clear the current split, study and best score before the next iteration rebinds them.
del (
    X_train, X_test,
    y_train, y_test,
    X_val, y_val,
    study, optuna_auc,
)

In [579]:
# Iteration 43: feature removed from comp_42 to build comp_43
# (name roughly: "share of financial assets in total assets").
column_to_drop_42 = '자산 중 금융자산의 비중'

In [580]:
# Build comp_43 by removing `column_to_drop_42` from comp_42.
if column_to_drop_42.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns
    # (presumably the dummies of one categorical variable). Match it
    # literally: the previous regex ('^' + name) would misinterpret
    # characters such as '(' or '.' inside a column name.
    cols_to_drop = [col for col in comp_42.columns if str(col).startswith(column_to_drop_42)]
else:
    cols_to_drop = [column_to_drop_42]
comp_43 = comp_42.drop(columns=cols_to_drop)

# Feature matrix and label for this iteration.
X_43 = comp_43.drop(columns='target')
y_43 = comp_43['target']

print(X_43.shape)

(8444, 48)


In [581]:
# Hold out 20% of the rows as the test split (stratified on the label, seed 0).
X_train, X_test, y_train, y_test = train_test_split(
    X_43, y_43,
    test_size=0.2, shuffle=True, stratify=y_43, random_state=0,
)
# Split another 20% off the remainder as the validation split read by `objective`.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [582]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Samples `max_depth` (1-10), `min_samples_split` (3-10) and
    `min_samples_leaf` (3-10) from `trial`, fits a DecisionTreeClassifier
    (random_state=0) on the notebook-level X_train/y_train and returns the
    ROC AUC of column 1 of its predicted probabilities on X_val/y_val.
    """
    search_space = {
        'max_depth': (1, 10),
        'min_samples_split': (3, 10),
        'min_samples_leaf': (3, 10),
    }
    # Dict order is insertion order, so parameters are suggested in the listed order.
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by `objective`.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it
# halts the search after 10 trials without improvement - confirm there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [583]:
# Show the best hyperparameters, then keep and show the best validation AUC.
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 7}
0.7621133302984632


In [584]:
# Refit a tree with the study's best hyperparameters on the training split.
optuna_43 = DecisionTreeClassifier(
    random_state=0,
    **study.best_trial.params,
)
optuna_43.fit(X_train, y_train)

In [585]:
# Test-split ROC AUC from column 1 of the predicted class probabilities.
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_true=y_test, y_score=optuna_proba_43)
# Print the AUC quantized to three decimal places.
print(
    decimal.Decimal(auc_43).quantize(decimal.Decimal('1.000'))
)

0.736


In [586]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it
# needs NumPy arrays. np.asarray also accepts an ndarray, so re-running this
# cell no longer fails the way `.values` does on an already-converted array.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [587]:
# Start from an empty list: bootstrap_auc appends each resample's AUC to this global.
auc_bootstrap = list()

In [588]:
# bootstrap_auc draws its resample indices from the global generator `rs`.
rs = RandomState(seed=43)
# bootstrap_auc refits whatever estimator it receives on every resample, so
# hand it an unfitted clone and leave optuna_43 fitted on the training split.
bootstrap_auc(sklearn.clone(optuna_43), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72034601, 0.74743286])

In [589]:
# Shapiro-Wilk normality test of the bootstrap AUCs, displayed with their mean.
(
    shapiro(auc_bootstrap),
    np.mean(auc_bootstrap),
)

(ShapiroResult(statistic=0.9978739023208618, pvalue=0.009319519624114037),
 0.7344118163709056)

In [590]:
# Snapshot this iteration's bootstrap AUCs. A copy (not an alias) keeps t_43
# intact if `auc_bootstrap` is appended to again later.
t_43 = list(auc_bootstrap)
print(t_43)

[0.7417866958469055, 0.7312972199601883, 0.7340173158704306, 0.7353031408342381, 0.7277627918023887, 0.7328339893231995, 0.7300608769905899, 0.7433722403184945, 0.7297145030311255, 0.7426653546869344, 0.7379935475479551, 0.7399721204306913, 0.7407072814875135, 0.725871872737966, 0.7379758754071661, 0.731866262893594, 0.7449415264205573, 0.7346471509681505, 0.7272559548045602, 0.7441399181143684, 0.7286004512757872, 0.7442791745837858, 0.7373693675352877, 0.7455953956297502, 0.7411321197520812, 0.7358587529406442, 0.734693098534202, 0.732402789087948, 0.7214637339395584, 0.7164462597267462, 0.7425904248099892, 0.7273259364820845, 0.7315941119254433, 0.7307401940825189, 0.7353660536554469, 0.7192087687748823, 0.7315934050398119, 0.7330297966431415, 0.7273619876492943, 0.7431234165761853, 0.7343071389793703, 0.7374018842743395, 0.7319284688291712, 0.731260461907347, 0.7335705641512849, 0.7412183597991314, 0.7399360692634819, 0.7233270844643502, 0.7244107401375317, 0.7284081783840028, 0.73

In [591]:
# Clear the current split, study and best score before the next iteration rebinds them.
del (
    X_train, X_test,
    y_train, y_test,
    X_val, y_val,
    study, optuna_auc,
)

In [592]:
# Iteration 44: feature removed from comp_43 to build comp_44
# (name roughly: "share of government subsidies in income, monthly average").
column_to_drop_43 = '소득 중 정부 보조금의 비중(월평균)'

In [593]:
# Build comp_44 by removing `column_to_drop_43` from comp_43.
if column_to_drop_43.startswith('Cat_'):
    # A 'Cat_' name is treated as a prefix shared by several columns
    # (presumably the dummies of one categorical variable). Match it
    # literally: the previous regex ('^' + name) would misinterpret
    # characters such as '(' or '.' inside a column name.
    cols_to_drop = [col for col in comp_43.columns if str(col).startswith(column_to_drop_43)]
else:
    cols_to_drop = [column_to_drop_43]
comp_44 = comp_43.drop(columns=cols_to_drop)

# Feature matrix and label for this iteration.
X_44 = comp_44.drop(columns='target')
y_44 = comp_44['target']

print(X_44.shape)

(8444, 47)


In [594]:
# Hold out 20% of the rows as the test split (stratified on the label, seed 0).
X_train, X_test, y_train, y_test = train_test_split(
    X_44, y_44,
    test_size=0.2, shuffle=True, stratify=y_44, random_state=0,
)
# Split another 20% off the remainder as the validation split read by `objective`.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [595]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of a decision tree.

    Samples ``max_depth`` from [1, 10] and ``min_samples_split`` /
    ``min_samples_leaf`` from [3, 10], fits a ``DecisionTreeClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and
    returns ``roc_auc_score`` of column 1 of ``predict_proba(X_val)``
    against ``y_val``.
    """
    # (name, low, high) per tuned hyperparameter; sampled in this order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(**sampled, random_state=0)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this feature subset: maximise the value returned by
# objective() with a seeded TPE sampler (seed=10).
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so
# the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience counted in trials — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [596]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 7}
0.7621133302984632


In [597]:
optuna_44 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [598]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(decimal.Decimal(auc_44).quantize(decimal.Decimal('1.000')))

0.736


In [599]:
X_train = X_train.values
y_train = y_train.values

In [600]:
auc_bootstrap = []

In [601]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and
# appends every AUC to the notebook-level `auc_bootstrap` list.
rs = RandomState(seed = 44)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyperparameters and random_state); optuna_44 itself then
# stays fitted on the training split used for auc_44.
bootstrap_auc(sklearn.base.clone(optuna_44), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72057626, 0.74765146])

In [602]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9973888993263245, pvalue=0.0020179192069917917),
 0.7341986207247556)

In [603]:
t_44 = auc_bootstrap
print(t_44)

[0.7346542198244661, 0.7325151839033658, 0.7389697566051394, 0.7407807975931958, 0.734686736563518, 0.735661531849439, 0.7372442487785016, 0.7236564931686572, 0.737031476203402, 0.7280646319670647, 0.7322592913047412, 0.7459177354777414, 0.7272142485522982, 0.7273344191096635, 0.7320818630112197, 0.7267229630383641, 0.7253070711183496, 0.7334560486789721, 0.7270601474846181, 0.7345969620883097, 0.7265928960821572, 0.7367529632645675, 0.737826722538907, 0.74447144747557, 0.735332123145132, 0.7482462167480998, 0.7322211194806371, 0.7205787131288455, 0.7369056505609844, 0.7384897812613102, 0.7344463954487876, 0.7368893921914584, 0.7217811255881289, 0.742884489232718, 0.7407072814875135, 0.7378005677705392, 0.7319058484889612, 0.7344591193901556, 0.7259220616178066, 0.7304037165218965, 0.756933841159971, 0.7427381639069851, 0.7340491257238508, 0.732505287504524, 0.7331846045964532, 0.7283558688472676, 0.7348818369978285, 0.7421896206568946, 0.7353582779134998, 0.7296537108668115, 0.7360566

In [604]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [605]:
# 45
column_to_drop_44 = '현재 주택의 면적(㎡)'

In [606]:
# Elimination step: remove the feature named by column_to_drop_44 from comp_44.
# A 'Cat_' name is treated as a prefix shared by a group of columns (presumably
# the dummy columns of one categorical feature); any other name is one column.
if column_to_drop_44.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters such as '(' in a column
    # name cannot change which columns are selected.
    cols_to_drop_44 = [c for c in comp_44.columns if str(c).startswith(column_to_drop_44)]
else:
    cols_to_drop_44 = [column_to_drop_44]

# Build the reduced frame once instead of repeating it in both branches.
comp_45 = comp_44.drop(columns=cols_to_drop_44)
X_45 = comp_45.drop(columns='target')
y_45 = comp_45['target']

print(X_45.shape)

(8444, 46)


In [607]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [608]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of a decision tree.

    Samples ``max_depth`` from [1, 10] and ``min_samples_split`` /
    ``min_samples_leaf`` from [3, 10], fits a ``DecisionTreeClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and
    returns ``roc_auc_score`` of column 1 of ``predict_proba(X_val)``
    against ``y_val``.
    """
    # (name, low, high) per tuned hyperparameter; sampled in this order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(**sampled, random_state=0)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this feature subset: maximise the value returned by
# objective() with a seeded TPE sampler (seed=10).
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so
# the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience counted in trials — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [609]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 9}
0.7660932824771611


In [610]:
optuna_45 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [611]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(decimal.Decimal(auc_45).quantize(decimal.Decimal('1.000')))

0.742


In [612]:
X_train = X_train.values
y_train = y_train.values

In [613]:
auc_bootstrap = []

In [614]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and
# appends every AUC to the notebook-level `auc_bootstrap` list.
rs = RandomState(seed = 45)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyperparameters and random_state); optuna_45 itself then
# stays fitted on the training split used for auc_45.
bootstrap_auc(sklearn.base.clone(optuna_45), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71959383, 0.74814114])

In [615]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9964277744293213, pvalue=0.00012003983283648267),
 0.7351467915168068)

In [616]:
t_45 = auc_bootstrap
print(t_45)

[0.7322974631288455, 0.7341869684220051, 0.7221847572837495, 0.7282109572927977, 0.7421097425805285, 0.7447499604144046, 0.7209533625135721, 0.7333012407256606, 0.7331725875407166, 0.7186715356948968, 0.7302771839938473, 0.7415187861925443, 0.7185725717064786, 0.7332369141331884, 0.7483204397394136, 0.7350196796959826, 0.7315509919019182, 0.7345276872964168, 0.742465306053203, 0.7264974665218965, 0.7312943924176619, 0.7335882362920737, 0.7212495475931957, 0.7308363305284111, 0.739916276465798, 0.7452603318403909, 0.7387718286283025, 0.7420475366449512, 0.7428229901827723, 0.729069823335143, 0.731447079714079, 0.7264140540173724, 0.7456781012486429, 0.7267597210912051, 0.7326445439739414, 0.7386756921824105, 0.7078385133912414, 0.7285453141965255, 0.7507351610568223, 0.7435164449873326, 0.7428710584057183, 0.735950648072747, 0.7323193765834238, 0.7250695575461455, 0.7346584611382554, 0.7247655967245746, 0.7222808937296417, 0.7265455347448426, 0.7392298905175535, 0.7260620249728555, 0.75

In [617]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [618]:
# 46.
column_to_drop_45 = '장기부채부담지표'

In [619]:
# Elimination step: remove the feature named by column_to_drop_45 from comp_45.
# A 'Cat_' name is treated as a prefix shared by a group of columns (presumably
# the dummy columns of one categorical feature); any other name is one column.
if column_to_drop_45.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters such as '(' in a column
    # name cannot change which columns are selected.
    cols_to_drop_45 = [c for c in comp_45.columns if str(c).startswith(column_to_drop_45)]
else:
    cols_to_drop_45 = [column_to_drop_45]

# Build the reduced frame once instead of repeating it in both branches.
comp_46 = comp_45.drop(columns=cols_to_drop_45)
X_46 = comp_46.drop(columns='target')
y_46 = comp_46['target']

print(X_46.shape)

(8444, 45)


In [620]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [621]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of a decision tree.

    Samples ``max_depth`` from [1, 10] and ``min_samples_split`` /
    ``min_samples_leaf`` from [3, 10], fits a ``DecisionTreeClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and
    returns ``roc_auc_score`` of column 1 of ``predict_proba(X_val)``
    against ``y_val``.
    """
    # (name, low, high) per tuned hyperparameter; sampled in this order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(**sampled, random_state=0)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this feature subset: maximise the value returned by
# objective() with a seeded TPE sampler (seed=10).
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so
# the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience counted in trials — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [622]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 9}
0.767189371472516


In [623]:
optuna_46 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [624]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(decimal.Decimal(auc_46).quantize(decimal.Decimal('1.000')))

0.744


In [625]:
X_train = X_train.values
y_train = y_train.values

In [626]:
auc_bootstrap = []

In [627]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and
# appends every AUC to the notebook-level `auc_bootstrap` list.
rs = RandomState(seed = 46)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyperparameters and random_state); optuna_46 itself then
# stays fitted on the training split used for auc_46.
bootstrap_auc(sklearn.base.clone(optuna_46), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7218052 , 0.74996667])

In [628]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9975151419639587, pvalue=0.002986395498737693),
 0.7362195434791214)

In [629]:
t_46 = auc_bootstrap
print(t_46)

[0.7444035864549403, 0.7421677072023164, 0.7388898785287731, 0.7301845819761129, 0.7277528954035468, 0.7361994718150561, 0.7380769600524792, 0.7528501628664496, 0.7158885269634455, 0.7357237377850163, 0.7381377522167932, 0.7337373891603329, 0.736907771217879, 0.7341664687386898, 0.7486406589305102, 0.7362022993575823, 0.7298756729551212, 0.7402470989413682, 0.7119511739956568, 0.7293674221860297, 0.7346231168566776, 0.7392737174267101, 0.7498197441639521, 0.7348464927162506, 0.7343403626040536, 0.7484328345548316, 0.7366921711002533, 0.7293744910423453, 0.727744412775968, 0.7409532776872964, 0.7359110624773796, 0.7316216804650743, 0.7319758301664856, 0.7400343263662686, 0.7308080551031488, 0.739593229732175, 0.7306984878302569, 0.7357937194625408, 0.7416856112015924, 0.744171021082157, 0.7376097086500181, 0.7467242919833514, 0.7420772258414767, 0.7429339712269272, 0.7435150312160694, 0.7319263481722765, 0.7425678044697792, 0.750679317091929, 0.7574272473307999, 0.7340519532663772, 0.73

In [630]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [631]:
# 47.
column_to_drop_46 = 'Cat_가구주 종사상 지위'

In [632]:
# Elimination step: remove the feature named by column_to_drop_46 from comp_46.
# A 'Cat_' name is treated as a prefix shared by a group of columns (presumably
# the dummy columns of one categorical feature); any other name is one column.
if column_to_drop_46.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters such as '(' in a column
    # name cannot change which columns are selected.
    cols_to_drop_46 = [c for c in comp_46.columns if str(c).startswith(column_to_drop_46)]
else:
    cols_to_drop_46 = [column_to_drop_46]

# Build the reduced frame once instead of repeating it in both branches.
comp_47 = comp_46.drop(columns=cols_to_drop_46)
X_47 = comp_47.drop(columns='target')
y_47 = comp_47['target']

print(X_47.shape)

(8444, 40)


In [633]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [634]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of a decision tree.

    Samples ``max_depth`` from [1, 10] and ``min_samples_split`` /
    ``min_samples_leaf`` from [3, 10], fits a ``DecisionTreeClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and
    returns ``roc_auc_score`` of column 1 of ``predict_proba(X_val)``
    against ``y_val``.
    """
    # (name, low, high) per tuned hyperparameter; sampled in this order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(**sampled, random_state=0)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this feature subset: maximise the value returned by
# objective() with a seeded TPE sampler (seed=10).
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so
# the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience counted in trials — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [635]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 9}
0.7658667721505001


In [636]:
optuna_47 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [637]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(decimal.Decimal(auc_47).quantize(decimal.Decimal('1.000')))

0.743


In [638]:
X_train = X_train.values
y_train = y_train.values

In [639]:
auc_bootstrap = []

In [640]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and
# appends every AUC to the notebook-level `auc_bootstrap` list.
rs = RandomState(seed = 47)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyperparameters and random_state); optuna_47 itself then
# stays fitted on the training split used for auc_47.
bootstrap_auc(sklearn.base.clone(optuna_47), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72242939, 0.74953211])

In [641]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9979204535484314, pvalue=0.010827870108187199),
 0.7363454871714395)

In [642]:
t_47 = auc_bootstrap
print(t_47)

[0.7447110817046688, 0.7336857865092291, 0.7379009455302208, 0.7326296993756785, 0.7296006944444444, 0.715959922412233, 0.7433771885179152, 0.7391959600072384, 0.7422935328447339, 0.7475188314332248, 0.7360559740318495, 0.7314930272801303, 0.7306892983170467, 0.7455854992309086, 0.747740086635903, 0.7427572498190372, 0.7382854913137893, 0.7289319806369887, 0.7288726022439377, 0.7411731191187114, 0.7208197611292074, 0.7262373326094825, 0.7322840323018458, 0.7495850581342743, 0.7420355195892147, 0.727559915626131, 0.742569925126674, 0.7372400074647123, 0.7517057150289541, 0.7285000735161056, 0.7187931200235251, 0.7253148468602967, 0.7367381186663047, 0.7331125022620342, 0.7300842042164314, 0.7240445733803836, 0.7230337269272529, 0.7333019476112921, 0.7406076106134637, 0.7430810034382918, 0.7462648163228375, 0.7447230987604052, 0.7358092709464351, 0.7388934129569309, 0.7406825404904089, 0.7375001413771264, 0.7322804978736879, 0.7388955336138254, 0.7456349812251177, 0.7409045025787188, 0.7

In [643]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [644]:
# 48
column_to_drop_47 = '가구주 나이'

In [645]:
# Elimination step: remove the feature named by column_to_drop_47 from comp_47.
# A 'Cat_' name is treated as a prefix shared by a group of columns (presumably
# the dummy columns of one categorical feature); any other name is one column.
if column_to_drop_47.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters such as '(' in a column
    # name cannot change which columns are selected.
    cols_to_drop_47 = [c for c in comp_47.columns if str(c).startswith(column_to_drop_47)]
else:
    cols_to_drop_47 = [column_to_drop_47]

# Build the reduced frame once instead of repeating it in both branches.
comp_48 = comp_47.drop(columns=cols_to_drop_47)
X_48 = comp_48.drop(columns='target')
y_48 = comp_48['target']

print(X_48.shape)

(8444, 39)


In [646]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [647]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of a decision tree.

    Samples ``max_depth`` from [1, 10] and ``min_samples_split`` /
    ``min_samples_leaf`` from [3, 10], fits a ``DecisionTreeClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and
    returns ``roc_auc_score`` of column 1 of ``predict_proba(X_val)``
    against ``y_val``.
    """
    # (name, low, high) per tuned hyperparameter; sampled in this order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(**sampled, random_state=0)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this feature subset: maximise the value returned by
# objective() with a seeded TPE sampler (seed=10).
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so
# the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience counted in trials — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [648]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 7}
0.767361740306463


In [649]:
optuna_48 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [650]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(decimal.Decimal(auc_48).quantize(decimal.Decimal('1.000')))

0.741


In [651]:
X_train = X_train.values
y_train = y_train.values

In [652]:
auc_bootstrap = []

In [653]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and
# appends every AUC to the notebook-level `auc_bootstrap` list.
rs = RandomState(seed = 48)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyperparameters and random_state); optuna_48 itself then
# stays fitted on the training split used for auc_48.
bootstrap_auc(sklearn.base.clone(optuna_48), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72166847, 0.74877693])

In [654]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9959511160850525, pvalue=3.286805804236792e-05),
 0.7362721205720684)

In [655]:
t_48 = auc_bootstrap
print(t_48)

[0.7433460855501266, 0.7327823866720955, 0.7339494548498009, 0.7445265845548318, 0.7366038103963083, 0.7276871550398119, 0.7277168442363373, 0.7367063088128845, 0.7363747794516831, 0.7386325721588853, 0.7427219055374593, 0.7400541191639523, 0.7449068890246109, 0.7468783930510314, 0.72283297140789, 0.7323851169471589, 0.7427855252442997, 0.7376238463626493, 0.7260839384274339, 0.7391344609572927, 0.7331577429424537, 0.7305175251085776, 0.7497660208559537, 0.7426879750271445, 0.7412727899927615, 0.7284350400380022, 0.723717992218603, 0.7374612626673905, 0.7430350558722403, 0.7251352979098806, 0.7362539020086862, 0.7447527879569308, 0.7406987988599347, 0.7376005191368078, 0.7287248631469417, 0.7444368100796236, 0.7386276239594644, 0.7433828436029678, 0.7321108453221137, 0.7347510631559897, 0.7380041508324285, 0.732468529451683, 0.7366554130474121, 0.7325957688653637, 0.7476722256152731, 0.7426844405989866, 0.7395211273977561, 0.7348097346634094, 0.7442042447068403, 0.736387503393051, 0.74

In [656]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [657]:
# 49
column_to_drop_48 = '중기부채부담지표'

In [658]:
# Elimination step: remove the feature named by column_to_drop_48 from comp_48.
# A 'Cat_' name is treated as a prefix shared by a group of columns (presumably
# the dummy columns of one categorical feature); any other name is one column.
if column_to_drop_48.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters such as '(' in a column
    # name cannot change which columns are selected.
    cols_to_drop_48 = [c for c in comp_48.columns if str(c).startswith(column_to_drop_48)]
else:
    cols_to_drop_48 = [column_to_drop_48]

# Build the reduced frame once instead of repeating it in both branches.
comp_49 = comp_48.drop(columns=cols_to_drop_48)
X_49 = comp_49.drop(columns='target')
y_49 = comp_49['target']

print(X_49.shape)

(8444, 38)


In [659]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [660]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of a decision tree.

    Samples ``max_depth`` from [1, 10] and ``min_samples_split`` /
    ``min_samples_leaf`` from [3, 10], fits a ``DecisionTreeClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and
    returns ``roc_auc_score`` of column 1 of ``predict_proba(X_val)``
    against ``y_val``.
    """
    # (name, low, high) per tuned hyperparameter; sampled in this order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(**sampled, random_state=0)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this feature subset: maximise the value returned by
# objective() with a seeded TPE sampler (seed=10).
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so
# the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience counted in trials — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [661]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7632967086392144


In [662]:
optuna_49 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)

In [663]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(decimal.Decimal(auc_49).quantize(decimal.Decimal('1.000')))

0.746


In [664]:
X_train = X_train.values
y_train = y_train.values

In [665]:
auc_bootstrap = []

In [666]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and
# appends every AUC to the notebook-level `auc_bootstrap` list.
rs = RandomState(seed = 49)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyperparameters and random_state); optuna_49 itself then
# stays fitted on the training split used for auc_49.
bootstrap_auc(sklearn.base.clone(optuna_49), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72958649, 0.75003382])

In [667]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9956982731819153, pvalue=1.6968320778687485e-05),
 0.7405077198979823)

In [668]:
t_49 = auc_bootstrap
print(t_49)

[0.7375976915942816, 0.7456173090843287, 0.7362468331523706, 0.7384127307274702, 0.7335613746380746, 0.7344626538183134, 0.7263963818765834, 0.7366971192996742, 0.74051076728194, 0.7496507984980094, 0.7418616257238508, 0.7476701049583786, 0.7405256118802027, 0.7403269770177344, 0.7439907652461092, 0.7395557647937026, 0.7439108871697431, 0.7428498518367717, 0.7388361552207745, 0.7377991539992761, 0.7356417390517554, 0.7441172977741586, 0.7331718806550851, 0.744484171416938, 0.7465624151737242, 0.7410981892417661, 0.7383583005338401, 0.7440409541259501, 0.7420277438472674, 0.7432619661599709, 0.7386749852967789, 0.7254986371245022, 0.744717443675353, 0.7268127375135722, 0.7471851814151285, 0.7379143763572205, 0.7348380100886717, 0.7377284654361201, 0.7363351938563156, 0.7480765641965255, 0.7397777268820123, 0.7445838422909881, 0.7509429854325008, 0.7359965956387984, 0.7349334396489323, 0.7399176902370611, 0.7398597256152732, 0.7322062748823743, 0.7484681788364097, 0.7369204951592472, 0.7

In [669]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [670]:
# 50
column_to_drop_49 = 'Cat_현재 거주 지역'

In [671]:
# Elimination step: remove the feature named by column_to_drop_49 from comp_49.
# A 'Cat_' name is treated as a prefix shared by a group of columns (presumably
# the dummy columns of one categorical feature); any other name is one column.
if column_to_drop_49.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters such as '(' in a column
    # name cannot change which columns are selected.
    cols_to_drop_49 = [c for c in comp_49.columns if str(c).startswith(column_to_drop_49)]
else:
    cols_to_drop_49 = [column_to_drop_49]

# Build the reduced frame once instead of repeating it in both branches.
comp_50 = comp_49.drop(columns=cols_to_drop_49)
X_50 = comp_50.drop(columns='target')
y_50 = comp_50['target']

print(X_50.shape)

(8444, 21)


In [672]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [673]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of a decision tree.

    Samples ``max_depth`` from [1, 10] and ``min_samples_split`` /
    ``min_samples_leaf`` from [3, 10], fits a ``DecisionTreeClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and
    returns ``roc_auc_score`` of column 1 of ``predict_proba(X_val)``
    against ``y_val``.
    """
    # (name, low, high) per tuned hyperparameter; sampled in this order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(**sampled, random_state=0)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this feature subset: maximise the value returned by
# objective() with a seeded TPE sampler (seed=10).
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so
# the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience counted in trials — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [674]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7640071776150341


In [675]:
optuna_50 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [676]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(decimal.Decimal(auc_50).quantize(decimal.Decimal('1.000')))

0.746


In [677]:
X_train = X_train.values
y_train = y_train.values

In [678]:
auc_bootstrap = []

In [679]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and
# appends every AUC to the notebook-level `auc_bootstrap` list.
rs = RandomState(seed = 50)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyperparameters and random_state); optuna_50 itself then
# stays fitted on the training split used for auc_50.
bootstrap_auc(sklearn.base.clone(optuna_50), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.73190071, 0.75064714])

In [680]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9942126274108887, pvalue=4.793039920514275e-07),
 0.7418237119130022)

In [681]:
t_50 = auc_bootstrap
print(t_50)

[0.7457494966974303, 0.7392673554560261, 0.743390619344915, 0.7499674832609482, 0.7509337959192907, 0.7439603691639523, 0.7473894713626493, 0.7360291123778501, 0.7414636491132827, 0.7305210595367353, 0.7414777868259139, 0.7400187748823743, 0.7457608068675353, 0.7378267225389071, 0.7450730071480276, 0.7461220254252624, 0.7457028422457475, 0.7452433665852335, 0.7465694840300398, 0.750814332247557, 0.750372528727832, 0.7315439230456026, 0.7460407335776329, 0.744280588355049, 0.7375786056822294, 0.7485134195168295, 0.7324360127126311, 0.7531364515472313, 0.7444191379388346, 0.7394306460369163, 0.7455664133188563, 0.7476305193630113, 0.7341438483984799, 0.7384664540354688, 0.7358453221136445, 0.7349369740770901, 0.7459898378121607, 0.7397367275153819, 0.7486031939920377, 0.7422284993666305, 0.741456580256967, 0.7466048283116177, 0.7529137825732899, 0.7427650255609844, 0.7452292288726022, 0.7408472448425625, 0.7394645765472313, 0.7429763843648208, 0.7394582145765471, 0.7431983464531307, 0.74

In [682]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [683]:
# 51
column_to_drop_50 = '총 이사 횟수'

In [684]:
# Elimination step: remove the feature named by column_to_drop_50 from comp_50.
# A 'Cat_' name is treated as a prefix shared by a group of columns (presumably
# the dummy columns of one categorical feature); any other name is one column.
if column_to_drop_50.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters such as '(' in a column
    # name cannot change which columns are selected.
    cols_to_drop_50 = [c for c in comp_50.columns if str(c).startswith(column_to_drop_50)]
else:
    cols_to_drop_50 = [column_to_drop_50]

# Build the reduced frame once instead of repeating it in both branches.
comp_51 = comp_50.drop(columns=cols_to_drop_50)
X_51 = comp_51.drop(columns='target')
y_51 = comp_51['target']

print(X_51.shape)

(8444, 20)


In [685]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [686]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of a decision tree.

    Samples ``max_depth`` from [1, 10] and ``min_samples_split`` /
    ``min_samples_leaf`` from [3, 10], fits a ``DecisionTreeClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and
    returns ``roc_auc_score`` of column 1 of ``predict_proba(X_val)``
    against ``y_val``.
    """
    # (name, low, high) per tuned hyperparameter; sampled in this order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(**sampled, random_state=0)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this feature subset: maximise the value returned by
# objective() with a seeded TPE sampler (seed=10).
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so
# the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience counted in trials — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [687]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6}
0.7647717880835678


In [688]:
optuna_51 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [689]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(decimal.Decimal(auc_51).quantize(decimal.Decimal('1.000')))

0.748


In [690]:
X_train = X_train.values
y_train = y_train.values

In [691]:
auc_bootstrap = []

In [692]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and
# appends every AUC to the notebook-level `auc_bootstrap` list.
rs = RandomState(seed = 51)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyperparameters and random_state); optuna_51 itself then
# stays fitted on the training split used for auc_51.
bootstrap_auc(sklearn.base.clone(optuna_51), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72975821, 0.74863361])

In [693]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.980751633644104, pvalue=8.818519306170829e-16),
 0.7407182466974302)

In [694]:
t_51 = auc_bootstrap
print(t_51)

[0.7335415818403909, 0.7423062567861021, 0.7396384704125951, 0.745266693811075, 0.7438331297502714, 0.7378542910785378, 0.7428682308631921, 0.7435122036735433, 0.7471201479370249, 0.7436125814332247, 0.7423437217245747, 0.7285431935396308, 0.7417845751900108, 0.7413328752714441, 0.7379532550669561, 0.7392949239956569, 0.7349426291621424, 0.7397346068584871, 0.7387725355139341, 0.7428555069218241, 0.7430230388165037, 0.7434613079080709, 0.7400965323018458, 0.745168436708288, 0.7477733102605864, 0.7430138493032935, 0.739967879116902, 0.7424144102877308, 0.7379702203221136, 0.7375475027144408, 0.7461778693901555, 0.7441922276511039, 0.7461990759591024, 0.7413929605501266, 0.740470474800941, 0.7424469270267825, 0.7351546948516106, 0.7429961771625045, 0.7387449669743033, 0.7438112162956931, 0.7384247477832067, 0.7391987875497648, 0.7343438970322114, 0.73419827859211, 0.7440310577271082, 0.7261701784744842, 0.737349574737604, 0.743165829714079, 0.7333295161509228, 0.7438267677795873, 0.74702

In [695]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [696]:
# 52
column_to_drop_51 = '소득 대비 주택 임대료의 비율'

In [697]:
# Elimination step: remove the feature named by column_to_drop_51 from comp_51.
# A 'Cat_' name is treated as a prefix shared by a group of columns (presumably
# the dummy columns of one categorical feature); any other name is one column.
if column_to_drop_51.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters such as '(' in a column
    # name cannot change which columns are selected.
    cols_to_drop_51 = [c for c in comp_51.columns if str(c).startswith(column_to_drop_51)]
else:
    cols_to_drop_51 = [column_to_drop_51]

# Build the reduced frame once instead of repeating it in both branches.
comp_52 = comp_51.drop(columns=cols_to_drop_51)
X_52 = comp_52.drop(columns='target')
y_52 = comp_52['target']

print(X_52.shape)

(8444, 19)


In [698]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [699]:
def objective(trial):
    """Score one Optuna trial by the validation ROC AUC of a decision tree.

    Samples ``max_depth`` from [1, 10] and ``min_samples_split`` /
    ``min_samples_leaf`` from [3, 10], fits a ``DecisionTreeClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and
    returns ``roc_auc_score`` of column 1 of ``predict_proba(X_val)``
    against ``y_val``.
    """
    # (name, low, high) per tuned hyperparameter; sampled in this order.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(**sampled, random_state=0)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search for this feature subset: maximise the value returned by
# objective() with a seeded TPE sampler (seed=10).
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so
# the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined outside this section; the 10 is presumably a
# patience counted in trials — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [700]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 7}
0.7681849119814017


In [701]:
optuna_52 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [702]:
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(decimal.Decimal(auc_52).quantize(decimal.Decimal('1.000')))

0.752


In [703]:
X_train = X_train.values
y_train = y_train.values

In [704]:
auc_bootstrap = []

In [705]:
# bootstrap_auc draws its resampling indices from the notebook-level `rs` and
# appends every AUC to the notebook-level `auc_bootstrap` list.
rs = RandomState(seed = 52)
# bootstrap_auc refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyperparameters and random_state); optuna_52 itself then
# stays fitted on the training split used for auc_52.
bootstrap_auc(sklearn.base.clone(optuna_52), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.73587453, 0.75298751])

In [706]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9938709735870361, pvalue=2.2573811975235003e-07),
 0.7451594239164858)

In [707]:
t_52 = auc_bootstrap
print(t_52)

[0.7516724914042708, 0.7425791146398841, 0.7472848522891784, 0.7414014431777055, 0.7394560939196526, 0.7418305227560622, 0.7465906905989866, 0.7506588174086137, 0.7399721204306914, 0.7412515834238147, 0.7446453413409337, 0.7509861054560261, 0.7476778807003257, 0.7465256571208829, 0.7413038929605501, 0.7473364549402823, 0.7449068890246109, 0.7485367467426711, 0.7450539212359755, 0.7482419754343106, 0.7463475219417299, 0.7467808428338762, 0.7495235590843288, 0.7479592211816867, 0.7448517519453492, 0.7492507012305464, 0.7461135427976838, 0.7438861461726383, 0.747545693087224, 0.7443074500090483, 0.7432280356496561, 0.7487360884907708, 0.749869933043793, 0.741267134907709, 0.7423119118711544, 0.7472283014386535, 0.75033082247557, 0.7421429662052118, 0.7483317499095186, 0.7417598341929061, 0.7434351531397032, 0.751021449737604, 0.7467596362649294, 0.7511338445530221, 0.7484066797864639, 0.7473802818494389, 0.7407171778863554, 0.749114979189287, 0.733602374004705, 0.7478178440553747, 0.74795

In [708]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [709]:
# 53
column_to_drop_52 = 'Cat_주택 보유 의식'

In [710]:
# Elimination step: remove the feature named by column_to_drop_52 from comp_52.
# A 'Cat_' name is treated as a prefix shared by a group of columns (presumably
# the dummy columns of one categorical feature); any other name is one column.
if column_to_drop_52.startswith('Cat_'):
    # Literal prefix match, so regex metacharacters such as '(' in a column
    # name cannot change which columns are selected.
    cols_to_drop_52 = [c for c in comp_52.columns if str(c).startswith(column_to_drop_52)]
else:
    cols_to_drop_52 = [column_to_drop_52]

# Build the reduced frame once instead of repeating it in both branches.
comp_53 = comp_52.drop(columns=cols_to_drop_52)
X_53 = comp_53.drop(columns='target')
y_53 = comp_53['target']

print(X_53.shape)

(8444, 17)


In [711]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [712]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Draws ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` with ``random_state=0`` on the notebook-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Returns:
        The ROC AUC computed from the second column of ``predict_proba``.
    """
    # (name, low, high) triples, listed in the order they are sampled.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Tune the tree with a seeded TPE sampler, maximizing the validation AUC returned by objective.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never reports intermediate values, so the Hyperband pruner
# presumably has nothing to act on - confirm whether it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the study
# after 10 trials without improvement - TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [713]:
# Report the best hyper-parameters and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7645938946075074


In [714]:
# Fit a tree with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_53 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [715]:
# Test-set ROC AUC of the tuned tree, printed to three decimals. The decimal context was
# switched to ROUND_HALF_UP near the top of the notebook, so quantize rounds half up.
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(decimal.Decimal(auc_53).quantize(decimal.Decimal('1.000')))

0.745


In [716]:
# Convert the training split to NumPy arrays: bootstrap_auc selects resampled rows with an
# integer index array (X_train[idx], y_train[idx]).
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas calling .values on an ndarray raises AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [717]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this module-level list.
auc_bootstrap = []

In [718]:
# Percentile bootstrap interval for the test AUC of the tuned iteration-53 tree.
# bootstrap_auc draws its resampling indices from the module-level `rs` and appends every
# AUC to the module-level `auc_bootstrap` list.
rs = RandomState(seed = 53)
# bootstrap_auc refits the estimator it receives on every resample, so hand it an unfitted
# copy with the same hyper-parameters; optuna_53 itself stays fitted on the training split.
bootstrap_auc(sklearn.base.clone(optuna_53), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.73411458, 0.75011483])

In [719]:
# Normality test on the bootstrap AUC sample, shown together with its mean (the cell displays a tuple).
# NOTE(review): `shapiro` is presumably scipy.stats.shapiro; its import is not among the visible cells.
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9979370832443237, pvalue=0.011425144970417023),
 0.7422644056222855)

In [720]:
# Keep iteration 53's bootstrap AUCs under their own name. This binds the same list object
# (no copy); it stays intact only because auc_bootstrap is rebound to a new list before the
# next bootstrap run.
t_53 = auc_bootstrap
print(t_53)

[0.748796880655085, 0.7413774090662323, 0.7383363870792615, 0.740936312432139, 0.7455091555826999, 0.7415520098172276, 0.7473470582247557, 0.7378267225389071, 0.7375672955121244, 0.7379172038997467, 0.7430817103239231, 0.742685147484618, 0.73919666689287, 0.7341777789087949, 0.7402110477741586, 0.7397162278320666, 0.7411483781216069, 0.7439518865363736, 0.7417541791078537, 0.7454024158523344, 0.7461064739413682, 0.73831305985342, 0.7416509738056462, 0.7453621233713356, 0.7415293894770176, 0.7408458310712993, 0.7415923022982266, 0.7431842087404994, 0.7371544743032935, 0.7412268424267101, 0.7372979720865, 0.7346054447158886, 0.7424144102877307, 0.7396625045240679, 0.7403283907889976, 0.7387979833966701, 0.738907550669562, 0.7447860115816142, 0.7475803304831705, 0.7411950325732899, 0.7406132656985162, 0.7450645245204488, 0.74533243417481, 0.7432697419019181, 0.7372449556641332, 0.7380324262576909, 0.7466762237604053, 0.7407383844553022, 0.7464556754433586, 0.7392715967698155, 0.7396801766

In [721]:
# Remove the per-iteration split and tuning names so the next iteration starts from fresh bindings.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [722]:
# Iteration 54: feature removed from the iteration-53 frame (comp_53) to build comp_54.
# The name does not start with 'Cat_', so the following cell drops it as a single column.
column_to_drop_53 = '소득 중 사적이전소득의 비중(월평균)'

In [723]:
# Build the iteration-54 frame by removing the selected feature from comp_53.
# A name starting with 'Cat_' is treated as a prefix shared by a group of columns, all of
# which are dropped; any other name is dropped as a single column.
if column_to_drop_53.startswith('Cat_'):
    # Match the prefix literally rather than as a regular expression, so characters such as
    # parentheses or dots in a column name cannot alter which columns are selected.
    cols_to_drop_53 = [c for c in comp_53.columns if str(c).startswith(column_to_drop_53)]
else:
    cols_to_drop_53 = [column_to_drop_53]

# Shared by both cases: drop the column(s), then separate features from the label.
comp_54 = comp_53.drop(columns=cols_to_drop_53)
X_54 = comp_54.drop('target', axis=1)
y_54 = comp_54['target']

print(X_54.shape)

(8444, 16)


In [724]:
# Stratified, seeded split: 20% of the rows are held out as the test set, then 20% of the
# remaining rows become the validation set used for tuning (64% / 16% / 20% overall).
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [725]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Draws ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` with ``random_state=0`` on the notebook-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Returns:
        The ROC AUC computed from the second column of ``predict_proba``.
    """
    # (name, low, high) triples, listed in the order they are sampled.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Tune the tree with a seeded TPE sampler, maximizing the validation AUC returned by objective.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never reports intermediate values, so the Hyperband pruner
# presumably has nothing to act on - confirm whether it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the study
# after 10 trials without improvement - TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [726]:
# Report the best hyper-parameters and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7562273765905445


In [727]:
# Fit a tree with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_54 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [728]:
# Test-set ROC AUC of the tuned tree, printed to three decimals. The decimal context was
# switched to ROUND_HALF_UP near the top of the notebook, so quantize rounds half up.
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(decimal.Decimal(auc_54).quantize(decimal.Decimal('1.000')))

0.748


In [729]:
# Convert the training split to NumPy arrays: bootstrap_auc selects resampled rows with an
# integer index array (X_train[idx], y_train[idx]).
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas calling .values on an ndarray raises AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [730]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this module-level list.
auc_bootstrap = []

In [731]:
# Percentile bootstrap interval for the test AUC of the tuned iteration-54 tree.
# bootstrap_auc draws its resampling indices from the module-level `rs` and appends every
# AUC to the module-level `auc_bootstrap` list.
rs = RandomState(seed = 54)
# bootstrap_auc refits the estimator it receives on every resample, so hand it an unfitted
# copy with the same hyper-parameters; optuna_54 itself stays fitted on the training split.
bootstrap_auc(sklearn.base.clone(optuna_54), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.73612758, 0.74901273])

In [732]:
# Normality test on the bootstrap AUC sample, shown together with its mean (the cell displays a tuple).
# NOTE(review): `shapiro` is presumably scipy.stats.shapiro; its import is not among the visible cells.
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9468793272972107, pvalue=2.8489668451226087e-26),
 0.7442600925596046)

In [733]:
# Keep iteration 54's bootstrap AUCs under their own name. This binds the same list object
# (no copy); it stays intact only because auc_bootstrap is rebound to a new list before the
# next bootstrap run.
t_54 = auc_bootstrap
print(t_54)

[0.7418474880112197, 0.7461538352786825, 0.7451359199692363, 0.7488329318222946, 0.7419634172547955, 0.740370803926891, 0.7403177875045239, 0.7431919844824466, 0.7474191605591749, 0.7431043306641332, 0.7395755575913862, 0.7470925793973943, 0.7433390166938111, 0.7393458197611291, 0.7444113621968874, 0.7401354110115816, 0.7347687352967788, 0.7444452927072023, 0.743160881514658, 0.7387817250271443, 0.7430491935848715, 0.7451097652008687, 0.742865403320666, 0.74642457247557, 0.7415350445620702, 0.7483402325370974, 0.7490711522801303, 0.7457770652370612, 0.7440480229822657, 0.7460852673724212, 0.744500429786464, 0.7479443765834238, 0.7361082835685848, 0.7478786362196886, 0.738387282844734, 0.7468925307636627, 0.7464662787278321, 0.7388170693087224, 0.7454681562160695, 0.7408041248190372, 0.7475577101429607, 0.7454370532482808, 0.746069715888527, 0.7460796122873689, 0.7393839915852335, 0.7473166621425986, 0.7465949319127759, 0.7442869503257329, 0.7404358374049946, 0.7460944568856316, 0.74809

In [734]:
# Remove the per-iteration split and tuning names so the next iteration starts from fresh bindings.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [735]:
# Iteration 55: feature removed from the iteration-54 frame (comp_54) to build comp_55.
# Because the name starts with 'Cat_', the following cell drops every column whose name
# starts with this string rather than a single column.
column_to_drop_54 = 'Cat_이사 계획 중인 거주 지역'

In [736]:
# Build the iteration-55 frame by removing the selected feature from comp_54.
# A name starting with 'Cat_' is treated as a prefix shared by a group of columns, all of
# which are dropped; any other name is dropped as a single column.
if column_to_drop_54.startswith('Cat_'):
    # Match the prefix literally rather than as a regular expression, so characters such as
    # parentheses or dots in a column name cannot alter which columns are selected.
    cols_to_drop_54 = [c for c in comp_54.columns if str(c).startswith(column_to_drop_54)]
else:
    cols_to_drop_54 = [column_to_drop_54]

# Shared by both cases: drop the column(s), then separate features from the label.
comp_55 = comp_54.drop(columns=cols_to_drop_54)
X_55 = comp_55.drop('target', axis=1)
y_55 = comp_55['target']

print(X_55.shape)

(8444, 9)


In [737]:
# Stratified, seeded split: 20% of the rows are held out as the test set, then 20% of the
# remaining rows become the validation set used for tuning (64% / 16% / 20% overall).
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [738]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a decision tree for one trial.

    Draws ``max_depth`` (1-10), ``min_samples_split`` (3-10) and
    ``min_samples_leaf`` (3-10) from ``trial``, fits a
    ``DecisionTreeClassifier`` with ``random_state=0`` on the notebook-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Returns:
        The ROC AUC computed from the second column of ``predict_proba``.
    """
    # (name, low, high) triples, listed in the order they are sampled.
    search_space = (
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    tree = DecisionTreeClassifier(random_state=0, **sampled)
    tree.fit(X_train, y_train)
    val_scores = tree.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Tune the tree with a seeded TPE sampler, maximizing the validation AUC returned by objective.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never reports intermediate values, so the Hyperband pruner
# presumably has nothing to act on - confirm whether it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined elsewhere in the notebook; presumably it stops the study
# after 10 trials without improvement - TODO confirm.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [739]:
# Report the best hyper-parameters and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7249413283007526


In [740]:
# Fit a tree with the best hyper-parameters on the training split only
# (the validation rows are not added back).
optuna_55 = DecisionTreeClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [741]:
# Test-set ROC AUC of the tuned tree, printed to three decimals. The decimal context was
# switched to ROUND_HALF_UP near the top of the notebook, so quantize rounds half up.
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(decimal.Decimal(auc_55).quantize(decimal.Decimal('1.000')))

0.719


In [742]:
# Convert the training split to NumPy arrays: bootstrap_auc selects resampled rows with an
# integer index array (X_train[idx], y_train[idx]).
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas calling .values on an ndarray raises AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [743]:
# Fresh accumulator: bootstrap_auc appends each resample's AUC to this module-level list.
auc_bootstrap = []

In [744]:
# Percentile bootstrap interval for the test AUC of the tuned iteration-55 tree.
# bootstrap_auc draws its resampling indices from the module-level `rs` and appends every
# AUC to the module-level `auc_bootstrap` list.
rs = RandomState(seed = 55)
# bootstrap_auc refits the estimator it receives on every resample, so hand it an unfitted
# copy with the same hyper-parameters; optuna_55 itself stays fitted on the training split.
bootstrap_auc(sklearn.base.clone(optuna_55), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71471156, 0.72112019])

In [745]:
# Normality test on the bootstrap AUC sample, shown together with its mean (the cell displays a tuple).
# NOTE(review): `shapiro` is presumably scipy.stats.shapiro; its import is not among the visible cells.
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9250584840774536, pvalue=1.7672572030149363e-30),
 0.7182611041129433)

In [746]:
# Keep iteration 55's bootstrap AUCs under their own name. This binds the same list object
# (no copy), so appending to auc_bootstrap afterwards would also change t_55.
t_55 = auc_bootstrap
print(t_55)

[0.7182007498642778, 0.7186715356948967, 0.7142407765562794, 0.7208275368711544, 0.7204302671462178, 0.718767672140789, 0.7203567510405355, 0.7186715356948967, 0.7208275368711544, 0.718767672140789, 0.719535349936663, 0.7186941560351067, 0.7161055408523344, 0.7181781295240679, 0.7151682105048859, 0.7208275368711544, 0.7182007498642778, 0.718745051800579, 0.7163275029406442, 0.7181781295240679, 0.7132582055284111, 0.714780837178791, 0.7186715356948967, 0.7154552060712992, 0.717352487106406, 0.7144683937296417, 0.7175023468602967, 0.7158524757962359, 0.7206663669471588, 0.7208275368711544, 0.7182007498642778, 0.7186771907799492, 0.7158680272801302, 0.7208275368711544, 0.7172789710007238, 0.719535349936663, 0.7195523151918204, 0.718767672140789, 0.7190914257600434, 0.7158524757962359, 0.7182007498642778, 0.7203567510405355, 0.7147978024339485, 0.7158680272801302, 0.7182742659699601, 0.7203567510405355, 0.7186771907799492, 0.7208275368711544, 0.718767672140789, 0.7182007498642778, 0.718677

In [747]:
# Remove the per-iteration split and tuning names so the next iteration starts from fresh bindings.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc