In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000,
                  random_state=None, scores=None):
    """Bootstrap a 95% percentile interval for the test ROC AUC of ``clf``.

    For each of ``nsamples`` rounds the training rows are resampled with
    replacement, ``clf`` is refit on the resample and scored on the fixed
    test set. One AUC per round is appended to ``scores``.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict_proba``.
        Refit in place on every round, so after the call it holds the fit
        from the last resample.
    X_train, y_train : array-like
        Training data. Converted with ``np.asarray`` so DataFrames, Series
        and lists can be passed as well as NumPy arrays.
    X_test, y_test : array-like
        Fixed evaluation data.
    nsamples : int
        Number of bootstrap rounds.
    random_state : None, int or numpy RandomState, optional
        Source of the resampling indices. ``None`` keeps the legacy behaviour
        of reading the notebook-level ``rs`` object.
    scores : list, optional
        List that receives the per-round AUCs. ``None`` keeps the legacy
        behaviour of appending to the notebook-level ``auc_bootstrap`` list.

    Returns
    -------
    numpy.ndarray
        The 2.5th and 97.5th percentiles of everything in ``scores``
        (including values that were already in the list before the call).
    """
    if random_state is None:
        rng = rs  # legacy: notebook-level RandomState
    elif isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    if scores is None:
        scores = auc_bootstrap  # legacy: notebook-level accumulator

    # Positional row indexing below needs plain arrays; asarray is a no-op for
    # inputs that already are ndarrays.
    X_arr = np.asarray(X_train)
    y_arr = np.asarray(y_train)
    # np.asarray(...).ravel() avoids calling the deprecated Series.ravel().
    y_true = np.asarray(y_test).ravel()
    n_rows = X_arr.shape[0]

    for _ in range(nsamples):
        idx = rng.randint(n_rows, size=n_rows)
        clf.fit(X_arr[idx], y_arr[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_true, np.asarray(pred).ravel()))
    return np.percentile(scores, (2.5, 97.5))

In [4]:
class EarlyStoppingCallback(object):
    """Optuna callback that stops a study once its best value stops improving.

    After every trial the study's best value is compared with the best value
    this callback has seen so far. When ``early_stopping_rounds`` consecutive
    trials pass without an improvement, ``study.stop()`` is called.

    Parameters
    ----------
    early_stopping_rounds : int
        Number of consecutive non-improving trials tolerated before stopping.
    direction : str
        ``"minimize"`` or ``"maximize"``; must match the study's direction.

    Raises
    ------
    ValueError
        If ``direction`` is neither ``"minimize"`` nor ``"maximize"``.
    """

    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        # Consecutive trials seen since the best value last improved.
        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            # Previously the exception was constructed but never raised, which
            # left the instance without _operator/_score and deferred the
            # failure to an AttributeError on the first call.
            raise ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        """Update the patience counter and stop ``study`` when it runs out."""
        try:
            best_value = study.best_value
        except ValueError:
            # Optuna raises ValueError from best_value while no trial has
            # completed (e.g. the first trials failed or were pruned); there
            # is nothing to compare against yet.
            return

        if self._operator(best_value, self._score):
            self._iter = 0
            self._score = best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [5]:
# Only emit Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [6]:
# Load the survey table (CP949-encoded CSV) and rename the question on
# willingness to move into public rental housing (Q41) to 'target'.
# NOTE(review): a later cell reassigns every column name positionally, which
# appears to make this rename redundant — confirm before relying on it.
청년가구 = pd.read_csv('청년가구_변수추가.csv', encoding='cp949')
청년가구.rename(columns = {'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?':'target'}, inplace=True)

In [7]:
# Replace all column names, by position, with short analysis names.
# The 'Cat_' prefix is what the later column-dropping cells test to decide
# whether a feature has been expanded into one-hot columns.
# NOTE(review): this is purely positional — it assumes the CSV has exactly
# these columns in exactly this order, with the target last. A reordered file
# would be mislabelled silently.
청년가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [8]:
# Split the columns by dtype: object columns are treated as categorical,
# everything else as numeric. The target is removed from the numeric feature
# frame and kept separately.
cat = 청년가구.select_dtypes(include = 'object')
num = 청년가구.select_dtypes(exclude = 'object')
num_청년 = num.drop('target',axis=1)
target = 청년가구.target

In [9]:
# Scale the numeric features with RobustScaler and wrap the result back into a
# DataFrame that reuses the original column names.
# NOTE(review): the scaler is fit on all rows before any train/test split, so
# held-out rows contribute to the scaling statistics.
scaler=RobustScaler()
scaler.fit(num_청년)
num_scaled_청년=scaler.transform(num_청년)
num_df_scaled_청년=pd.DataFrame(data=num_scaled_청년, columns=num_청년.columns)

In [10]:
# One-hot encode the categorical columns. The encoder output is converted to a
# dense array and labelled with the encoder's generated feature names, which
# are derived from the original column names.
enc = OneHotEncoder()
enc.fit(cat)

X_cat = enc.transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(X_cat, columns= new_feature_names)

In [11]:
# Assemble the full modelling table column-wise: scaled numeric features,
# the target, then the one-hot encoded categoricals.
comp = pd.concat([num_df_scaled_청년, target,cat2],axis=1)

In [12]:
# Baseline feature matrix (all features) and label vector; the last line
# displays the feature matrix shape.
X=comp.drop('target', axis = 1)
y=comp.target
X.shape

(8444, 213)

In [13]:
# Stratified, seeded splits: 20% of all rows are held out as the test set,
# then 20% of the remaining rows become the validation set used for tuning.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [154]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Hyperparameters are drawn from ``trial``; the model is fit on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Draw in this fixed order so the seeded sampler stays reproducible.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)


# Maximise validation AUC with a seeded TPE sampler, for at most 200 trials,
# stopping early after 10 consecutive trials without a new best value.
# NOTE(review): objective never calls trial.report(), so the Hyperband pruner
# appears to have nothing to act on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [16]:
# Best hyperparameters found and the validation AUC they achieved
# (the value returned by the objective for that trial).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 162, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 3}
0.7860107222254142


In [156]:
# Refit an ExtraTrees model with the best hyperparameters on the training split.
optuna_0 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_0.fit(X_train, y_train)

In [157]:
# ROC AUC of the refit model on the held-out test split (positive-class probability).
optuna_proba_0 = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_proba_0)
print(auc_0)

0.7763795579985523


In [158]:
# The bootstrap step indexes training rows positionally, so it needs plain
# NumPy arrays. np.asarray is used instead of `.values` so that re-running
# this cell after the conversion is a no-op rather than an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [159]:
# Fresh accumulator: bootstrap_auc appends one AUC per resample to this global list.
auc_bootstrap = []

In [160]:
# 2.5th-97.5th percentile interval of test AUC over 2000 bootstrap resamples
# of the training data. bootstrap_auc draws its indices from the global `rs`
# and refits optuna_0 in place, so afterwards optuna_0 holds the fit from the
# last resample rather than the fit on the full training split.
rs = RandomState(seed = 0)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76255973, 0.77444481])

In [161]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy), so it stays intact only as long as auc_bootstrap is
# rebound to a new list, not mutated, before the next run.
# NOTE(review): printing every value floods the output; a summary would do.
t_0 = auc_bootstrap
print(t_0)

[0.7705746131921825, 0.7694322860115818, 0.770204205121245, 0.7709266422366992, 0.7600603963083605, 0.7728140268729642, 0.7708333333333333, 0.7663078515200868, 0.7652390404451683, 0.7702890313970322, 0.7695581116539993, 0.7675830731994208, 0.768928983441911, 0.7715006333695259, 0.7684341634998191, 0.7671744933043794, 0.7670486676619617, 0.7653846588852696, 0.7644204668838219, 0.7661947498190371, 0.7644982243032935, 0.7755680532935215, 0.7604251492942453, 0.7667899475208106, 0.7677555532935215, 0.7666683631921825, 0.7660788205754614, 0.7689996720050669, 0.7664845729279768, 0.7635792729822657, 0.7697489707745204, 0.7696683858125226, 0.7682729935758233, 0.7697659360296779, 0.7661763707926168, 0.7723898954940283, 0.7685189897756063, 0.7684751628664495, 0.7736184627216793, 0.772020901194354, 0.7631381763481723, 0.7708361608758596, 0.768835674538545, 0.7690039133188564, 0.773956354053565, 0.7672282166123778, 0.7722287255700326, 0.7690901533659067, 0.7673879727651104, 0.7687975027144408, 0.76

In [162]:
# Remove this round's split data, study and best score from the namespace
# before the next round redefines them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [14]:
# Feature removed from the modelling table in the next cell.
column_to_drop = 'Cat_가구주 동거 여부'

In [15]:
# Remove `column_to_drop` from the previous feature table. A numeric feature
# is a single column; a one-hot encoded categorical ('Cat_' prefix) is spread
# over several '<name>_<category>' columns that must all go together.
if column_to_drop.startswith('Cat_'):
    # Literal prefix match (including the '_' separator) instead of an
    # unescaped regex: characters such as '(' or '.' in a feature name are no
    # longer read as regex syntax, and a feature whose name merely starts with
    # this one is not dropped by accident.
    cols_to_drop = [c for c in comp.columns if c.startswith(column_to_drop + '_')]
    if not cols_to_drop:
        # Mirror the KeyError a missing numeric column would raise.
        raise KeyError(f"no one-hot columns found for {column_to_drop!r}")
else:
    cols_to_drop = [column_to_drop]

comp_1 = comp.drop(columns=cols_to_drop)
X_1 = comp_1.drop('target', axis=1)
y_1 = comp_1['target']

print(X_1.shape)

(8444, 212)


In [165]:
# Stratified, seeded splits on the reduced feature set: 20% of all rows are
# held out as the test set, then 20% of the remainder becomes the validation set.
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [166]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Hyperparameters are drawn from ``trial``; the model is fit on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Draw in this fixed order so the seeded sampler stays reproducible.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)


# Maximise validation AUC with a seeded TPE sampler, for at most 200 trials,
# stopping early after 10 consecutive trials without a new best value.
# NOTE(review): objective never calls trial.report(), so the Hyperband pruner
# appears to have nothing to act on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [167]:
# Best hyperparameters found and the validation AUC they achieved
# (the value returned by the objective for that trial).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 174, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7871134407913056


In [168]:
# Refit an ExtraTrees model with the best hyperparameters on the training split.
optuna_1= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [169]:
# ROC AUC of the refit model on the held-out test split (positive-class probability).
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(auc_1)

0.7733837766920015


In [170]:
# The bootstrap step indexes training rows positionally, so it needs plain
# NumPy arrays. np.asarray is used instead of `.values` so that re-running
# this cell after the conversion is a no-op rather than an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [171]:
# Fresh accumulator: bootstrap_auc appends one AUC per resample to this global list.
auc_bootstrap = []

In [172]:
# 2.5th-97.5th percentile interval of test AUC over 2000 bootstrap resamples
# of the training data. bootstrap_auc draws its indices from the global `rs`
# and refits optuna_1 in place, so afterwards optuna_1 holds the fit from the
# last resample rather than the fit on the full training split.
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76464335, 0.77548075])

In [173]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy), so it stays intact only as long as auc_bootstrap is
# rebound to a new list, not mutated, before the next run.
# NOTE(review): printing every value floods the output; a summary would do.
t_1 = auc_bootstrap
print(t_1)

[0.771183948606587, 0.7668592223127035, 0.7735237400470503, 0.7712404994571118, 0.7666584667933405, 0.7692937364277959, 0.7746024475208106, 0.7691792209554832, 0.7711047774158524, 0.772836647213174, 0.7685628166847629, 0.7679209645313065, 0.7704459600072384, 0.7708022303655447, 0.7700416214259862, 0.7737739775606225, 0.7736170489504163, 0.7709407799493305, 0.7764389363916034, 0.7678290693992037, 0.766864877397756, 0.7744384500542888, 0.7734219485161057, 0.7704586839486066, 0.7707654723127035, 0.7680171009771987, 0.7685585753709736, 0.7686547118168657, 0.7708856428700688, 0.7724846181686573, 0.7696401103872603, 0.7631593829171192, 0.7675929695982627, 0.769625972674629, 0.7670571502895404, 0.77107650199059, 0.77023813563156, 0.7730275063336952, 0.7692173927795873, 0.7747466521896489, 0.76953125, 0.7709167458378574, 0.7704855456026058, 0.7690830845095911, 0.7699709328628302, 0.7683549923090843, 0.7634265856858486, 0.7654044516829532, 0.7736594620883099, 0.7674176619616359, 0.7684044743032

In [174]:
# Remove this round's split data, study and best score from the namespace
# before the next round redefines them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [16]:
# Feature removed from the modelling table in the next cell.
column_to_drop_1 = '부채 중 임대 보증금의 비중'

In [17]:
# Remove `column_to_drop_1` from the previous feature table. A numeric feature
# is a single column; a one-hot encoded categorical ('Cat_' prefix) is spread
# over several '<name>_<category>' columns that must all go together.
if column_to_drop_1.startswith('Cat_'):
    # Literal prefix match (including the '_' separator) instead of an
    # unescaped regex: characters such as '(' or '.' in a feature name are no
    # longer read as regex syntax, and a feature whose name merely starts with
    # this one is not dropped by accident.
    cols_to_drop = [c for c in comp_1.columns if c.startswith(column_to_drop_1 + '_')]
    if not cols_to_drop:
        # Mirror the KeyError a missing numeric column would raise.
        raise KeyError(f"no one-hot columns found for {column_to_drop_1!r}")
else:
    cols_to_drop = [column_to_drop_1]

comp_2 = comp_1.drop(columns=cols_to_drop)
X_2 = comp_2.drop('target', axis=1)
y_2 = comp_2['target']

print(X_2.shape)

(8444, 211)


In [177]:
# Stratified, seeded splits on the reduced feature set: 20% of all rows are
# held out as the test set, then 20% of the remainder becomes the validation set.
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [178]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Hyperparameters are drawn from ``trial``; the model is fit on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Draw in this fixed order so the seeded sampler stays reproducible.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)


# Maximise validation AUC with a seeded TPE sampler, for at most 200 trials,
# stopping early after 10 consecutive trials without a new best value.
# NOTE(review): objective never calls trial.report(), so the Hyperband pruner
# appears to have nothing to act on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [179]:
# Best hyperparameters found and the validation AUC they achieved
# (the value returned by the objective for that trial).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 105, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7856063184226926


In [180]:
# Refit an ExtraTrees model with the best hyperparameters on the training split.
optuna_2= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

# Held-out test AUC. Every other elimination round computes auc_<k> in its own
# cell; this round had none, which left auc_2 undefined. It is evaluated here,
# before the bootstrap refits the model.
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(auc_2)

In [181]:
# The bootstrap step indexes training rows positionally, so it needs plain
# NumPy arrays. np.asarray is used instead of `.values` so that re-running
# this cell after the conversion is a no-op rather than an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [182]:
# Fresh accumulator: bootstrap_auc appends one AUC per resample to this global list.
auc_bootstrap = []

In [183]:
# 2.5th-97.5th percentile interval of test AUC over 2000 bootstrap resamples
# of the training data. bootstrap_auc draws its indices from the global `rs`
# and refits optuna_2 in place, so afterwards optuna_2 holds the fit from the
# last resample rather than the fit on the full training split.
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76284507, 0.77529449])

In [184]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy), so it stays intact only as long as auc_bootstrap is
# rebound to a new list, not mutated, before the next run.
# NOTE(review): printing every value floods the output; a summary would do.
t_2 = auc_bootstrap
print(t_2)

[0.7666966386174447, 0.7693531148208469, 0.7674289721317409, 0.7665481926348172, 0.7669426348172277, 0.767229630383641, 0.7731109188382193, 0.7705322000542888, 0.7713804628121608, 0.768928983441911, 0.7750845435215346, 0.766782878664495, 0.7672989051755339, 0.7690548090843286, 0.7684822317227651, 0.7666796733622874, 0.7683832677343467, 0.7754422276511038, 0.7660816481179877, 0.7720406939920376, 0.7664336771625045, 0.7705548203944987, 0.7651400764567499, 0.768917673271806, 0.7709139182953313, 0.768705607582338, 0.773389431777054, 0.7681118236518277, 0.7734276036011583, 0.7628780424357583, 0.7691156012486429, 0.7666259500542888, 0.7650877669200145, 0.7684355772710821, 0.7727871652189648, 0.7680863757690916, 0.7730784020991676, 0.7703059966521897, 0.7689530175533841, 0.7633050013572203, 0.7666188811979733, 0.7715713219326819, 0.7660802343467246, 0.7687451931777054, 0.7662074737604052, 0.7694478374954761, 0.7691566006152732, 0.7669793928700687, 0.7708474710459645, 0.7653125565508505, 0.768

In [185]:
# Remove this round's split data, study and best score from the namespace
# before the next round redefines them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [18]:
# Feature removed from the modelling table in the next cell.
column_to_drop_2 = '소득 중 재산소득의 비중(월평균)'

In [19]:
# Remove `column_to_drop_2` from the previous feature table. A numeric feature
# is a single column; a one-hot encoded categorical ('Cat_' prefix) is spread
# over several '<name>_<category>' columns that must all go together.
if column_to_drop_2.startswith('Cat_'):
    # Literal prefix match (including the '_' separator) instead of an
    # unescaped regex: characters such as '(' or '.' in a feature name are no
    # longer read as regex syntax, and a feature whose name merely starts with
    # this one is not dropped by accident.
    cols_to_drop = [c for c in comp_2.columns if c.startswith(column_to_drop_2 + '_')]
    if not cols_to_drop:
        # Mirror the KeyError a missing numeric column would raise.
        raise KeyError(f"no one-hot columns found for {column_to_drop_2!r}")
else:
    cols_to_drop = [column_to_drop_2]

comp_3 = comp_2.drop(columns=cols_to_drop)
X_3 = comp_3.drop('target', axis=1)
y_3 = comp_3['target']

print(X_3.shape)

(8444, 210)


In [188]:
# Stratified, seeded splits on the reduced feature set: 20% of all rows are
# held out as the test set, then 20% of the remainder becomes the validation set.
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [189]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Hyperparameters are drawn from ``trial``; the model is fit on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Draw in this fixed order so the seeded sampler stays reproducible.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)


# Maximise validation AUC with a seeded TPE sampler, for at most 200 trials,
# stopping early after 10 consecutive trials without a new best value.
# NOTE(review): objective never calls trial.report(), so the Hyperband pruner
# appears to have nothing to act on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [190]:
# Best hyperparameters found and the validation AUC they achieved
# (the value returned by the objective for that trial).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.786196350200434


In [191]:
# Refit an ExtraTrees model with the best hyperparameters on the training split.
optuna_3= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [192]:
# ROC AUC of the refit model on the held-out test split (positive-class probability).
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(auc_3)

0.7723517236699239


In [193]:
# The bootstrap step indexes training rows positionally, so it needs plain
# NumPy arrays. np.asarray is used instead of `.values` so that re-running
# this cell after the conversion is a no-op rather than an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [194]:
# Fresh accumulator: bootstrap_auc appends one AUC per resample to this global list.
auc_bootstrap = []

In [195]:
# 2.5th-97.5th percentile interval of test AUC over 2000 bootstrap resamples
# of the training data. bootstrap_auc draws its indices from the global `rs`
# and refits optuna_3 in place, so afterwards optuna_3 holds the fit from the
# last resample rather than the fit on the full training split.
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76085137, 0.77310919])

In [196]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy), so it stays intact only as long as auc_bootstrap is
# rebound to a new list, not mutated, before the next run.
# NOTE(review): printing every value floods the output; a summary would do.
t_3 = auc_bootstrap
print(t_3)

[0.7669581863011219, 0.7627295964531307, 0.7650750429786464, 0.7653153840933768, 0.7639383708830982, 0.7589887576909157, 0.7726570982627579, 0.7687861925443359, 0.7645491200687657, 0.7709096769815418, 0.7605043204849801, 0.7685302999457111, 0.7645519476112921, 0.7692145652370611, 0.7674233170466884, 0.7656773095367355, 0.7689968444625407, 0.7662781623235614, 0.7720025221679334, 0.7695171122873687, 0.7642621245023524, 0.7645590164676077, 0.7678389657980456, 0.768268752262034, 0.7680736518277235, 0.774134489232718, 0.7664209532211366, 0.7650481813246471, 0.7701886536373507, 0.7696485930148389, 0.765478674674267, 0.7684822317227651, 0.7668959803655447, 0.7707273004885993, 0.7602130836047776, 0.7643526058631922, 0.7679591363554107, 0.7666513979370251, 0.7654921055012667, 0.7678785513934129, 0.7710001583423814, 0.7668337744299674, 0.7629967992218604, 0.7737358057365183, 0.766283817408614, 0.7699483125226203, 0.7673865589938472, 0.7706283365001809, 0.7616141309265291, 0.769812590481361, 0.76

In [197]:
# Remove this round's split data, study and best score from the namespace
# before the next round redefines them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [20]:
# Feature removed from the modelling table in the next cell.
column_to_drop_3 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [21]:
# Remove `column_to_drop_3` from the previous feature table. A numeric feature
# is a single column; a one-hot encoded categorical ('Cat_' prefix) is spread
# over several '<name>_<category>' columns that must all go together.
if column_to_drop_3.startswith('Cat_'):
    # Literal prefix match (including the '_' separator) instead of an
    # unescaped regex: characters such as '(' or '.' in a feature name are no
    # longer read as regex syntax, and a feature whose name merely starts with
    # this one is not dropped by accident.
    cols_to_drop = [c for c in comp_3.columns if c.startswith(column_to_drop_3 + '_')]
    if not cols_to_drop:
        # Mirror the KeyError a missing numeric column would raise.
        raise KeyError(f"no one-hot columns found for {column_to_drop_3!r}")
else:
    cols_to_drop = [column_to_drop_3]

comp_4 = comp_3.drop(columns=cols_to_drop)
X_4 = comp_4.drop('target', axis=1)
y_4 = comp_4['target']

print(X_4.shape)

(8444, 209)


In [200]:
# Stratified, seeded splits on the reduced feature set: 20% of all rows are
# held out as the test set, then 20% of the remainder becomes the validation set.
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [201]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Hyperparameters are drawn from ``trial``; the model is fit on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Draw in this fixed order so the seeded sampler stays reproducible.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)


# Maximise validation AUC with a seeded TPE sampler, for at most 200 trials,
# stopping early after 10 consecutive trials without a new best value.
# NOTE(review): objective never calls trial.report(), so the Hyperband pruner
# appears to have nothing to act on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [202]:
# Best hyperparameters found and the validation AUC they achieved
# (the value returned by the objective for that trial).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7855554917152465


In [203]:
# Refit an ExtraTrees model with the best hyperparameters on the training split.
optuna_4= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [204]:
# ROC AUC of the refit model on the held-out test split (positive-class probability).
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(auc_4)

0.7695976972493666


In [205]:
# The bootstrap step indexes training rows positionally, so it needs plain
# NumPy arrays. np.asarray is used instead of `.values` so that re-running
# this cell after the conversion is a no-op rather than an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [206]:
# Fresh accumulator: bootstrap_auc appends one AUC per resample to this global list.
auc_bootstrap = []

In [207]:
# 2.5th-97.5th percentile interval of test AUC over 2000 bootstrap resamples
# of the training data. bootstrap_auc draws its indices from the global `rs`
# and refits optuna_4 in place, so afterwards optuna_4 holds the fit from the
# last resample rather than the fit on the full training split.
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76224665, 0.77462415])

In [208]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy), so it stays intact only as long as auc_bootstrap is
# rebound to a new list, not mutated, before the next run.
# NOTE(review): printing every value floods the output; a summary would do.
t_4 = auc_bootstrap
print(t_4)

[0.7638125452406804, 0.7700656555374593, 0.766607571027868, 0.7746561708288091, 0.7684864730365544, 0.7709054356677525, 0.7715854596453131, 0.7662258527868259, 0.7649534586500182, 0.7641051958921463, 0.769754625859573, 0.7692739436301123, 0.7721071412414042, 0.7695156985161058, 0.7744327949692362, 0.7715232537097357, 0.7669214282482809, 0.7667885337495477, 0.766320575461455, 0.7686122986789721, 0.7703540648751357, 0.7686179537640246, 0.7700359663409337, 0.7709181596091206, 0.7684016467607673, 0.772033625135722, 0.764611326004343, 0.7697730048859934, 0.767913895674991, 0.767819173000362, 0.7715967698154181, 0.7654284857944265, 0.7712574647122692, 0.7665538477198697, 0.7694732853782121, 0.7746547570575462, 0.7717296643141512, 0.7695764906804197, 0.7669977718964893, 0.768681573470865, 0.7746999977379659, 0.764797943811075, 0.7657041711907346, 0.7661495091386175, 0.7694534925805284, 0.7758197045783568, 0.7635877556098443, 0.7692668747737965, 0.7685741268548679, 0.7727094077994933, 0.768326

In [209]:
# Remove this round's split data, study and best score from the namespace
# before the next round redefines them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [22]:
# Feature removed from the modelling table in the next cell.
column_to_drop_4 = '자산 중 부동산 자산의 비중'

In [23]:
# Remove `column_to_drop_4` from the previous feature table. A numeric feature
# is a single column; a one-hot encoded categorical ('Cat_' prefix) is spread
# over several '<name>_<category>' columns that must all go together.
if column_to_drop_4.startswith('Cat_'):
    # Literal prefix match (including the '_' separator) instead of an
    # unescaped regex: characters such as '(' or '.' in a feature name are no
    # longer read as regex syntax, and a feature whose name merely starts with
    # this one is not dropped by accident.
    cols_to_drop = [c for c in comp_4.columns if c.startswith(column_to_drop_4 + '_')]
    if not cols_to_drop:
        # Mirror the KeyError a missing numeric column would raise.
        raise KeyError(f"no one-hot columns found for {column_to_drop_4!r}")
else:
    cols_to_drop = [column_to_drop_4]

comp_5 = comp_4.drop(columns=cols_to_drop)
X_5 = comp_5.drop('target', axis=1)
y_5 = comp_5['target']

print(X_5.shape)

(8444, 208)


In [212]:
# Stratified, seeded splits on the reduced feature set: 20% of all rows are
# held out as the test set, then 20% of the remainder becomes the validation set.
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [213]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Hyperparameters are drawn from ``trial``; the model is fit on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Draw in this fixed order so the seeded sampler stays reproducible.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)


# Maximise validation AUC with a seeded TPE sampler, for at most 200 trials,
# stopping early after 10 consecutive trials without a new best value.
# NOTE(review): objective never calls trial.report(), so the Hyperband pruner
# appears to have nothing to act on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [214]:
# Best hyperparameters found and the validation AUC they achieved
# (the value returned by the objective for that trial).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7853720735970724


In [215]:
# Refit an ExtraTrees model with the best hyperparameters on the training split.
optuna_5 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [216]:
# ROC AUC of the refit model on the held-out test split (positive-class probability).
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(auc_5)

0.7723997918928701


In [217]:
# The bootstrap step indexes training rows positionally, so it needs plain
# NumPy arrays. np.asarray is used instead of `.values` so that re-running
# this cell after the conversion is a no-op rather than an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [218]:
# Fresh accumulator: bootstrap_auc appends one AUC per resample to this global list.
auc_bootstrap = []

In [219]:
# 2.5th-97.5th percentile interval of test AUC over 2000 bootstrap resamples
# of the training data. bootstrap_auc draws its indices from the global `rs`
# and refits optuna_5 in place, so afterwards optuna_5 holds the fit from the
# last resample rather than the fit on the full training split.
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76109821, 0.77395084])

In [220]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy), so it stays intact only as long as auc_bootstrap is
# rebound to a new list, not mutated, before the next run.
# NOTE(review): printing every value floods the output; a summary would do.
t_5 = auc_bootstrap
print(t_5)

[0.7637715458740499, 0.7662414042707202, 0.7757249819037279, 0.772459170285921, 0.7685712993123416, 0.7667623789811797, 0.7683507509952949, 0.7709676416033298, 0.7694860093195801, 0.7691254976474847, 0.7733187432138979, 0.773956354053565, 0.7659869254433587, 0.7689275696706479, 0.7648050126673906, 0.7740072498190372, 0.7637715458740499, 0.7689685690372784, 0.7679902393231994, 0.7667333966702858, 0.7671617693630113, 0.7651584554831705, 0.7619152642055737, 0.7652404542164314, 0.7680255836047774, 0.7631848307998552, 0.7734078108034746, 0.7620622964169381, 0.7693375633369526, 0.7713578424719507, 0.7665114345819761, 0.7656674131378937, 0.7697122127216794, 0.7652602470141151, 0.7672989051755339, 0.7694195620702136, 0.7650863531487514, 0.7730543679876946, 0.7736368417480999, 0.7685246448606587, 0.7699553813789359, 0.7706863011219688, 0.7759921846724575, 0.767714553926891, 0.7689318109844372, 0.7708022303655448, 0.7671080460550126, 0.7682277528954036, 0.7739506989685125, 0.7662654383821932, 0.

In [221]:
# Remove this round's split data, study and best score from the namespace
# before the next round redefines them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [24]:
# Feature removed from the modelling table in the next cell.
column_to_drop_5 = 'Cat_가구주 장애 여부'

In [25]:
# Remove `column_to_drop_5` from the previous feature table. A numeric feature
# is a single column; a one-hot encoded categorical ('Cat_' prefix) is spread
# over several '<name>_<category>' columns that must all go together.
if column_to_drop_5.startswith('Cat_'):
    # Literal prefix match (including the '_' separator) instead of an
    # unescaped regex: characters such as '(' or '.' in a feature name are no
    # longer read as regex syntax, and a feature whose name merely starts with
    # this one is not dropped by accident.
    cols_to_drop = [c for c in comp_5.columns if c.startswith(column_to_drop_5 + '_')]
    if not cols_to_drop:
        # Mirror the KeyError a missing numeric column would raise.
        raise KeyError(f"no one-hot columns found for {column_to_drop_5!r}")
else:
    cols_to_drop = [column_to_drop_5]

comp_6 = comp_5.drop(columns=cols_to_drop)
X_6 = comp_6.drop('target', axis=1)
y_6 = comp_6['target']

print(X_6.shape)

(8444, 206)


In [224]:
# Stratified, seeded splits on the reduced feature set: 20% of all rows are
# held out as the test set, then 20% of the remainder becomes the validation set.
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [225]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Hyperparameters are drawn from ``trial``; the model is fit on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Draw in this fixed order so the seeded sampler stays reproducible.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)


# Maximise validation AUC with a seeded TPE sampler, for at most 200 trials,
# stopping early after 10 consecutive trials without a new best value.
# NOTE(review): objective never calls trial.report(), so the Hyperband pruner
# appears to have nothing to act on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [226]:
# Best hyperparameters found and the validation AUC they achieved
# (the value returned by the objective for that trial).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7851731864809798


In [227]:
# Refit an ExtraTrees model with the best hyperparameters on the training split.
optuna_6 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [228]:
# ROC AUC of the refit model on the held-out test split (positive-class probability).
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(auc_6)

0.7774427139884185


In [229]:
# The bootstrap step indexes training rows positionally, so it needs plain
# NumPy arrays. np.asarray is used instead of `.values` so that re-running
# this cell after the conversion is a no-op rather than an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [230]:
# Fresh accumulator: bootstrap_auc appends one AUC per resample to this global list.
auc_bootstrap = []

In [231]:
# 2.5th-97.5th percentile interval of test AUC over 2000 bootstrap resamples
# of the training data. bootstrap_auc draws its indices from the global `rs`
# and refits optuna_6 in place, so afterwards optuna_6 holds the fit from the
# last resample rather than the fit on the full training split.
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76236184, 0.77466914])

In [232]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy), so it stays intact only as long as auc_bootstrap is
# rebound to a new list, not mutated, before the next run.
# NOTE(review): printing every value floods the output; a summary would do.
t_6 = auc_bootstrap
print(t_6)

[0.7632936911871154, 0.7625443924176619, 0.7674346272167933, 0.7669044629931234, 0.7637121674809989, 0.7731349529496924, 0.7673483871697431, 0.7644119842562431, 0.771442668747738, 0.7676269001085776, 0.768387509048136, 0.7688512260224394, 0.7714949782844733, 0.7716745272348895, 0.769142462902642, 0.7726316503800217, 0.7673045602605864, 0.773532222674629, 0.7675095570937387, 0.7692923226565327, 0.7641560916576186, 0.7674134206478465, 0.7686914698697067, 0.7743423136083967, 0.7709619865182772, 0.7735972561527327, 0.7676028659971045, 0.7711387079261671, 0.7645024656170829, 0.7693898728736879, 0.7692937364277959, 0.7699638640065147, 0.7684836454940283, 0.7646834283387622, 0.7697291779768367, 0.7670712880021715, 0.7688144679695983, 0.7692527370611655, 0.7683549923090843, 0.7720703831885632, 0.7624609799131379, 0.7711443630112197, 0.7630420399022801, 0.7721552094643505, 0.7711330528411147, 0.7724789630836049, 0.7695255949149475, 0.7688780876764387, 0.7635792729822657, 0.7669864617263843, 0.7

In [233]:
# Remove this round's split data, study and best score from the namespace
# before the next round redefines them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [26]:
# Feature removed from the modelling table in the next cell.
column_to_drop_6 = '부채 중 비금융기관 대출금의 비중'

In [27]:
# Remove `column_to_drop_6` from the previous feature table. A numeric feature
# is a single column; a one-hot encoded categorical ('Cat_' prefix) is spread
# over several '<name>_<category>' columns that must all go together.
if column_to_drop_6.startswith('Cat_'):
    # Literal prefix match (including the '_' separator) instead of an
    # unescaped regex: characters such as '(' or '.' in a feature name are no
    # longer read as regex syntax, and a feature whose name merely starts with
    # this one is not dropped by accident.
    cols_to_drop = [c for c in comp_6.columns if c.startswith(column_to_drop_6 + '_')]
    if not cols_to_drop:
        # Mirror the KeyError a missing numeric column would raise.
        raise KeyError(f"no one-hot columns found for {column_to_drop_6!r}")
else:
    cols_to_drop = [column_to_drop_6]

comp_7 = comp_6.drop(columns=cols_to_drop)
X_7 = comp_7.drop('target', axis=1)
y_7 = comp_7['target']

print(X_7.shape)

(8444, 205)


In [236]:
# Stratified, seeded splits on the reduced feature set: 20% of all rows are
# held out as the test set, then 20% of the remainder becomes the validation set.
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [237]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTrees configuration.

    Hyperparameters are drawn from ``trial``; the model is fit on the
    notebook-level X_train / y_train and scored on X_val / y_val.
    """
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    # Draw in this fixed order so the seeded sampler stays reproducible.
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)


# Maximise validation AUC with a seeded TPE sampler, for at most 200 trials,
# stopping early after 10 consecutive trials without a new best value.
# NOTE(review): objective never calls trial.report(), so the Hyperband pruner
# appears to have nothing to act on.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [238]:
# Best hyperparameters found and the validation AUC they achieved
# (the value returned by the objective for that trial).
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.789336556777852


In [239]:
# Refit an ExtraTrees model with the best hyperparameters on the training split.
optuna_7 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [240]:
# ROC AUC of the refit model on the held-out test split (positive-class probability).
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(auc_7)

0.7712164653456388


In [241]:
# The bootstrap step indexes training rows positionally, so it needs plain
# NumPy arrays. np.asarray is used instead of `.values` so that re-running
# this cell after the conversion is a no-op rather than an AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [242]:
# Fresh accumulator: bootstrap_auc appends one AUC per resample to this global list.
auc_bootstrap = []

In [243]:
# 2.5th-97.5th percentile interval of test AUC over 2000 bootstrap resamples
# of the training data. bootstrap_auc draws its indices from the global `rs`
# and refits optuna_7 in place, so afterwards optuna_7 holds the fit from the
# last resample rather than the fit on the full training split.
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76190491, 0.77466868])

In [244]:
# Keep this round's bootstrap AUCs under their own name. This binds the same
# list object (no copy), so it stays intact only as long as auc_bootstrap is
# rebound to a new list, not mutated, before the next run.
# NOTE(review): printing every value floods the output; a summary would do.
t_7 = auc_bootstrap
print(t_7)

[0.7724422050307636, 0.77189224800941, 0.7658243417480999, 0.764987389160333, 0.7719516264024611, 0.7710100547412233, 0.7652560057003257, 0.7648926664857041, 0.7693418046507419, 0.7687649859753891, 0.7691071186210641, 0.7696273864458922, 0.7659162368802026, 0.7643794675171913, 0.7657437567861021, 0.7736142214078899, 0.7730854709554832, 0.7696669720412594, 0.7722287255700326, 0.7691975999819038, 0.7652998326094824, 0.7701349303293521, 0.7724195846905537, 0.7708955392689106, 0.7609920715707565, 0.7588784835323923, 0.7688300194534926, 0.766497296869345, 0.7691848760405356, 0.7699893118892508, 0.7642267802207745, 0.7670755293159608, 0.7709492625769092, 0.7687296416938112, 0.7688455709373869, 0.7703879953854507, 0.7659812703583062, 0.7653931415128483, 0.7671914585595369, 0.7690378438291712, 0.7601112920738329, 0.7695114572023164, 0.7669299108758596, 0.7702904451682953, 0.7678644136807817, 0.7645462925262395, 0.7622658794788273, 0.7653054876945349, 0.7675166259500542, 0.7713126017915309, 0.7

In [245]:
# Remove this round's split data, study and best score from the namespace
# before the next round redefines them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [28]:
# Feature removed from the modelling table in the next cell.
column_to_drop_7 = '중기부채부담지표'

In [29]:
# Remove `column_to_drop_7` from the previous feature table. A numeric feature
# is a single column; a one-hot encoded categorical ('Cat_' prefix) is spread
# over several '<name>_<category>' columns that must all go together.
if column_to_drop_7.startswith('Cat_'):
    # Literal prefix match (including the '_' separator) instead of an
    # unescaped regex: characters such as '(' or '.' in a feature name are no
    # longer read as regex syntax, and a feature whose name merely starts with
    # this one is not dropped by accident.
    cols_to_drop = [c for c in comp_7.columns if c.startswith(column_to_drop_7 + '_')]
    if not cols_to_drop:
        # Mirror the KeyError a missing numeric column would raise.
        raise KeyError(f"no one-hot columns found for {column_to_drop_7!r}")
else:
    cols_to_drop = [column_to_drop_7]

comp_8 = comp_7.drop(columns=cols_to_drop)
X_8 = comp_8.drop('target', axis=1)
y_8 = comp_8['target']

print(X_8.shape)

(8444, 204)


In [248]:
# Stratified, seeded splits on the reduced feature set: 20% of all rows are
# held out as the test set, then 20% of the remainder becomes the validation set.
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [249]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from `trial`, fits the classifier on the
    notebook-level X_train / y_train and scores column 1 of predict_proba on
    X_val against y_val. No intermediate values are reported through `trial`;
    only the final score is returned.
    """
    # (name, low, high) bounds, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [250]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7859621053748138


In [251]:
optuna_8 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [252]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(auc_8)

0.7747452384183858


In [253]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so they
# must be plain arrays. np.asarray, unlike `.values`, is a no-op on an ndarray,
# which keeps this cell safe to re-run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [254]:
auc_bootstrap = []

In [255]:
# bootstrap_auc reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`; both are (re)initialised here so re-running this cell alone
# cannot mix samples from an earlier run into the percentile interval.
rs = RandomState(seed=8)
auc_bootstrap = []
# Pass an unfitted clone: bootstrap_auc calls clf.fit on every resample, which
# would otherwise leave optuna_8 fitted on the last bootstrap draw instead of
# the training split it was tuned on.
bootstrap_auc(sklearn.base.clone(optuna_8), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76157688, 0.77429368])

In [256]:
t_8 = auc_bootstrap
print(t_8)

[0.7698168317951501, 0.7650029406442272, 0.7616494752081072, 0.7683337857401376, 0.7650029406442272, 0.7655953108034744, 0.7664605388165039, 0.7667093625588128, 0.7670755293159609, 0.7688893978465436, 0.7679351022439378, 0.7641674018277235, 0.7670628053745929, 0.7682008912414042, 0.7612635156532754, 0.7686547118168658, 0.7648007713536014, 0.7725934785559174, 0.7694351135541078, 0.7691297389612739, 0.7660137870973579, 0.7726457880926529, 0.7657805148389432, 0.7650792842924358, 0.7714539789178427, 0.7695920421643141, 0.7674629026420559, 0.7613709622692726, 0.7664591250452406, 0.7651089734889613, 0.7662258527868259, 0.7671094598262758, 0.7743903818313428, 0.772457756514658, 0.7640712653818315, 0.7705307862830257, 0.7654313133369526, 0.7610316571661239, 0.7646325325732899, 0.7673356632283749, 0.7713041191639521, 0.7662541282120883, 0.7735689807274702, 0.7684468874411872, 0.7746038612920738, 0.7623818087224032, 0.7655938970322114, 0.7743366585233442, 0.7635566526420557, 0.7732028139703222, 

In [257]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [30]:
column_to_drop_8 = '장기부채부담지표'

In [31]:
# Build comp_9 by removing the feature named in column_to_drop_8 from comp_8.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical variable — confirm against
# the encoding step); any other name is a single column.
if column_to_drop_8.startswith('Cat_'):
    # Literal prefix match instead of regex='^' + name: feature names in this
    # notebook contain characters such as '(' and ')' that a regex would read
    # as pattern syntax rather than literal text.
    cols_to_drop_8 = [col for col in comp_8.columns if str(col).startswith(column_to_drop_8)]
else:
    cols_to_drop_8 = [column_to_drop_8]

comp_9 = comp_8.drop(columns=cols_to_drop_8)
X_9 = comp_9.drop(columns='target')
y_9 = comp_9['target']

print(X_9.shape)

(8444, 203)


In [260]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [261]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from `trial`, fits the classifier on the
    notebook-level X_train / y_train and scores column 1 of predict_proba on
    X_val against y_val. No intermediate values are reported through `trial`;
    only the final score is returned.
    """
    # (name, low, high) bounds, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [262]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7864416443102816


In [263]:
optuna_9 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [264]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(auc_9)

0.7730458853601156


In [265]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so they
# must be plain arrays. np.asarray, unlike `.values`, is a no-op on an ndarray,
# which keeps this cell safe to re-run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [266]:
auc_bootstrap = []

In [267]:
# bootstrap_auc reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`; both are (re)initialised here so re-running this cell alone
# cannot mix samples from an earlier run into the percentile interval.
rs = RandomState(seed=9)
auc_bootstrap = []
# Pass an unfitted clone: bootstrap_auc calls clf.fit on every resample, which
# would otherwise leave optuna_9 fitted on the last bootstrap draw instead of
# the training split it was tuned on.
bootstrap_auc(sklearn.base.clone(optuna_9), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76145243, 0.77407147])

In [268]:
t_9 = auc_bootstrap
print(t_9)

[0.7717126990589938, 0.7712136378031125, 0.7701943087224032, 0.7694577338943177, 0.7672678022077453, 0.7709902619435396, 0.7657819286102063, 0.7580203243756787, 0.7699992082880927, 0.7643695711183497, 0.7730840571842201, 0.764812081523706, 0.7638818200325732, 0.7602979098805646, 0.7636344100615273, 0.767345559627217, 0.7663856089395584, 0.7630208333333333, 0.7722004501447702, 0.7732466408794789, 0.7647597719869708, 0.7701462404994571, 0.7668705324828086, 0.7653309355772712, 0.7693036328266377, 0.75984267553384, 0.7668973941368078, 0.7691198425624323, 0.7733583288092651, 0.7673964553926891, 0.7654426235070575, 0.7646141535468692, 0.7712503958559537, 0.7690675330256967, 0.7687805374592834, 0.7670670466883822, 0.7696669720412596, 0.7677711047774158, 0.7706000610749186, 0.7682461319218241, 0.7682970276872965, 0.764775323470865, 0.7678375520267825, 0.7629232831161781, 0.7675901420557364, 0.7703017553384003, 0.767608521082157, 0.7691580143865364, 0.7662343354144047, 0.7631410038906985, 0.768

In [269]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [32]:
column_to_drop_9 = 'Cat_현재 주택의 위치'

In [33]:
# Build comp_10 by removing the feature named in column_to_drop_9 from comp_9.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical variable — confirm against
# the encoding step); any other name is a single column.
if column_to_drop_9.startswith('Cat_'):
    # Literal prefix match instead of regex='^' + name: feature names in this
    # notebook contain characters such as '(' and ')' that a regex would read
    # as pattern syntax rather than literal text.
    cols_to_drop_9 = [col for col in comp_9.columns if str(col).startswith(column_to_drop_9)]
else:
    cols_to_drop_9 = [column_to_drop_9]

comp_10 = comp_9.drop(columns=cols_to_drop_9)
X_10 = comp_10.drop(columns='target')
y_10 = comp_10['target']

print(X_10.shape)

(8444, 199)


In [272]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [273]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from `trial`, fits the classifier on the
    notebook-level X_train / y_train and scores column 1 of predict_proba on
    X_val against y_val. No intermediate values are reported through `trial`;
    only the final score is returned.
    """
    # (name, low, high) bounds, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [274]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7881012468012323


In [275]:
optuna_10 = ExtraTreesClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [276]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(auc_10)

0.774817340752805


In [277]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so they
# must be plain arrays. np.asarray, unlike `.values`, is a no-op on an ndarray,
# which keeps this cell safe to re-run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [278]:
auc_bootstrap = []

In [279]:
# bootstrap_auc reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`; both are (re)initialised here so re-running this cell alone
# cannot mix samples from an earlier run into the percentile interval.
rs = RandomState(seed=10)
auc_bootstrap = []
# Pass an unfitted clone: bootstrap_auc calls clf.fit on every resample, which
# would otherwise leave optuna_10 fitted on the last bootstrap draw instead of
# the training split it was tuned on.
bootstrap_auc(sklearn.base.clone(optuna_10), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76303373, 0.77504786])

In [280]:
t_10 = auc_bootstrap
print(t_10)

[0.763676823199421, 0.7710029858849077, 0.7630872805827, 0.7706396466702858, 0.770817781849439, 0.765909168023887, 0.7683196480275063, 0.7705067521715527, 0.7718540761853058, 0.7662286803293523, 0.765825755519363, 0.7719657641150923, 0.7677597946073109, 0.7697390743756786, 0.7721891399746652, 0.7699002442996742, 0.7709464350343829, 0.7659317883640969, 0.7725963060984438, 0.7688088128845457, 0.7740765246109302, 0.769237185577271, 0.7647838060984438, 0.7701250339305103, 0.7682885450597178, 0.7704120294969236, 0.7664930555555556, 0.7700854483351429, 0.772008177252986, 0.7640345073289903, 0.7712291892870069, 0.7680637554288816, 0.7665835369163951, 0.7723262757871879, 0.769638696615997, 0.7661636468512486, 0.7669186007057547, 0.7696090074194716, 0.7701009998190372, 0.7664874004705031, 0.7679421711002533, 0.7714158070937386, 0.7681358577633008, 0.764539223669924, 0.7720110047955121, 0.7723192069308722, 0.7671433903365907, 0.7717480433405719, 0.7719657641150923, 0.7705519928519725, 0.77098460

In [281]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [34]:
column_to_drop_10 = '소득 대비 생활비의 비율'

In [35]:
# Build comp_11 by removing the feature named in column_to_drop_10 from comp_10.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical variable — confirm against
# the encoding step); any other name is a single column.
if column_to_drop_10.startswith('Cat_'):
    # Literal prefix match instead of regex='^' + name: feature names in this
    # notebook contain characters such as '(' and ')' that a regex would read
    # as pattern syntax rather than literal text.
    cols_to_drop_10 = [col for col in comp_10.columns if str(col).startswith(column_to_drop_10)]
else:
    cols_to_drop_10 = [column_to_drop_10]

comp_11 = comp_10.drop(columns=cols_to_drop_10)
X_11 = comp_11.drop(columns='target')
y_11 = comp_11['target']

print(X_11.shape)

(8444, 198)


In [284]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [285]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from `trial`, fits the classifier on the
    notebook-level X_train / y_train and scores column 1 of predict_proba on
    X_val against y_val. No intermediate values are reported through `trial`;
    only the final score is returned.
    """
    # (name, low, high) bounds, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [286]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 112, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7870405155154049


In [287]:
optuna_11 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [288]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(auc_11)

0.7740355252442996


In [289]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so they
# must be plain arrays. np.asarray, unlike `.values`, is a no-op on an ndarray,
# which keeps this cell safe to re-run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [290]:
auc_bootstrap = []

In [291]:
# bootstrap_auc reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`; both are (re)initialised here so re-running this cell alone
# cannot mix samples from an earlier run into the percentile interval.
rs = RandomState(seed=11)
auc_bootstrap = []
# Pass an unfitted clone: bootstrap_auc calls clf.fit on every resample, which
# would otherwise leave optuna_11 fitted on the last bootstrap draw instead of
# the training split it was tuned on.
bootstrap_auc(sklearn.base.clone(optuna_11), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76283669, 0.77501279])

In [292]:
t_11 = auc_bootstrap
print(t_11)

[0.7720378664495114, 0.7659869254433587, 0.7713748077271083, 0.76922304786464, 0.7763357310893956, 0.7643978465436121, 0.7666810871335504, 0.7664690214440825, 0.7678417933405719, 0.7661085097719871, 0.7698125904813609, 0.7698521760767281, 0.7714228759500542, 0.7708729189287007, 0.7740072498190372, 0.7694619752081073, 0.7704459600072385, 0.7693107016829532, 0.7677979664314152, 0.7673639386536373, 0.767573176800579, 0.7644430872240319, 0.7746844462540717, 0.7721566232356134, 0.7647625995294969, 0.7665213309808179, 0.7670331161780672, 0.7680637554288818, 0.7681089961093014, 0.76953125, 0.7672607333514296, 0.7719459713174086, 0.769789970141151, 0.7740553180419834, 0.7667885337495475, 0.7656108622873687, 0.7705845095910242, 0.7640670240680421, 0.7692428406623235, 0.7697362468331523, 0.7723573787549765, 0.7715529429062612, 0.7660887169743034, 0.7694322860115816, 0.7629812477379659, 0.7705420964531307, 0.7675774181143683, 0.7687692272891784, 0.7764530741042346, 0.7699172095548317, 0.768897880

In [293]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [36]:
column_to_drop_11 = '현재 주택 거주 기간(총 개월)'

In [37]:
# Build comp_12 by removing the feature named in column_to_drop_11 from comp_11.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical variable — confirm against
# the encoding step); any other name is a single column.
if column_to_drop_11.startswith('Cat_'):
    # Literal prefix match instead of regex='^' + name: feature names in this
    # notebook contain characters such as '(' and ')' that a regex would read
    # as pattern syntax rather than literal text.
    cols_to_drop_11 = [col for col in comp_11.columns if str(col).startswith(column_to_drop_11)]
else:
    cols_to_drop_11 = [column_to_drop_11]

comp_12 = comp_11.drop(columns=cols_to_drop_11)
X_12 = comp_12.drop(columns='target')
y_12 = comp_12['target']

print(X_12.shape)

(8444, 197)


In [296]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [297]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from `trial`, fits the classifier on the
    notebook-level X_train / y_train and scores column 1 of predict_proba on
    X_val against y_val. No intermediate values are reported through `trial`;
    only the final score is returned.
    """
    # (name, low, high) bounds, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [298]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.783522423417411


In [299]:
optuna_12 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [300]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(auc_12)

0.7703738576728194


In [301]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so they
# must be plain arrays. np.asarray, unlike `.values`, is a no-op on an ndarray,
# which keeps this cell safe to re-run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [302]:
auc_bootstrap = []

In [303]:
# bootstrap_auc reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`; both are (re)initialised here so re-running this cell alone
# cannot mix samples from an earlier run into the percentile interval.
rs = RandomState(seed=12)
auc_bootstrap = []
# Pass an unfitted clone: bootstrap_auc calls clf.fit on every resample, which
# would otherwise leave optuna_12 fitted on the last bootstrap draw instead of
# the training split it was tuned on.
bootstrap_auc(sklearn.base.clone(optuna_12), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76160508, 0.77353441])

In [304]:
t_12 = auc_bootstrap
print(t_12)

[0.7673059740318493, 0.7706212676438653, 0.7692767711726385, 0.7664082292797685, 0.7712772575099529, 0.7748767191458561, 0.7711867761491134, 0.7730557817589576, 0.7654171756243214, 0.7701858260948244, 0.7706919562070214, 0.7646155673181325, 0.7667941888346002, 0.7724662391422368, 0.7670628053745929, 0.7677810011762577, 0.7696980750090481, 0.7619704012848354, 0.7690859120521173, 0.7669624276149114, 0.7670387712631198, 0.7683337857401374, 0.7669935305826999, 0.7757221543612016, 0.7708305057908071, 0.7689487762395947, 0.7646339463445531, 0.7705010970865002, 0.767653761762577, 0.7699511400651466, 0.7634986880202679, 0.7652305578175896, 0.7691198425624322, 0.7695439739413682, 0.7657805148389432, 0.7733413635541079, 0.7577036396127398, 0.7703993055555556, 0.7665863644589215, 0.7683776126492942, 0.7686221950778139, 0.769765936029678, 0.7670981496561708, 0.762698493485342, 0.7672084238146941, 0.770286203854506, 0.768528886174448, 0.7681146511943538, 0.7687352967788637, 0.7697206953492579, 0.76

In [305]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [38]:
column_to_drop_12 = '소득 대비 주거관리비의 비율'

In [39]:
# Build comp_13 by removing the feature named in column_to_drop_12 from comp_12.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical variable — confirm against
# the encoding step); any other name is a single column.
if column_to_drop_12.startswith('Cat_'):
    # Literal prefix match instead of regex='^' + name: feature names in this
    # notebook contain characters such as '(' and ')' that a regex would read
    # as pattern syntax rather than literal text.
    cols_to_drop_12 = [col for col in comp_12.columns if str(col).startswith(column_to_drop_12)]
else:
    cols_to_drop_12 = [column_to_drop_12]

comp_13 = comp_12.drop(columns=cols_to_drop_12)
X_13 = comp_13.drop(columns='target')
y_13 = comp_13['target']

print(X_13.shape)

(8444, 196)


In [308]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [309]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from `trial`, fits the classifier on the
    notebook-level X_train / y_train and scores column 1 of predict_proba on
    X_val against y_val. No intermediate values are reported through `trial`;
    only the final score is returned.
    """
    # (name, low, high) bounds, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [310]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7848019305309403


In [311]:
optuna_13 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [312]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(auc_13)

0.773048712902642


In [313]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so they
# must be plain arrays. np.asarray, unlike `.values`, is a no-op on an ndarray,
# which keeps this cell safe to re-run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [314]:
auc_bootstrap = []

In [315]:
# bootstrap_auc reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`; both are (re)initialised here so re-running this cell alone
# cannot mix samples from an earlier run into the percentile interval.
rs = RandomState(seed=13)
auc_bootstrap = []
# Pass an unfitted clone: bootstrap_auc calls clf.fit on every resample, which
# would otherwise leave optuna_13 fitted on the last bootstrap draw instead of
# the training split it was tuned on.
bootstrap_auc(sklearn.base.clone(optuna_13), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76216942, 0.77475358])

In [316]:
t_13 = auc_bootstrap
print(t_13)

[0.7665227447520812, 0.7717438020267824, 0.7685727130836049, 0.7677187952406805, 0.7674459373868984, 0.7693983555012667, 0.769330494480637, 0.766532641150923, 0.7696994887803112, 0.769483181777054, 0.7745444828990228, 0.7720675556460369, 0.7693177705392689, 0.7704487875497648, 0.7721566232356134, 0.7707371968874411, 0.7669977718964893, 0.7662088875316685, 0.7748229958378574, 0.770923814694173, 0.7670825981722764, 0.7695001470322114, 0.760584905446978, 0.7672741641784292, 0.7675123846362649, 0.7669892892689106, 0.7644939829895041, 0.7741019724936662, 0.769401183043793, 0.7636018933224755, 0.7682843037459284, 0.770122206387984, 0.7684171982446615, 0.7631975547412233, 0.7668012576909157, 0.7645038793883461, 0.7686250226203402, 0.7685147484618169, 0.7665722267462902, 0.7631805894860659, 0.7629600411690192, 0.7681697882736156, 0.7678474484256244, 0.7692569783749548, 0.7671589418204849, 0.7667376379840752, 0.7683903365906624, 0.7679633776692001, 0.7671250113101702, 0.7679874117806731, 0.7700

In [317]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [40]:
column_to_drop_13 = '소득 대비 주택 임대료의 비율'

In [41]:
# Build comp_14 by removing the feature named in column_to_drop_13 from comp_13.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical variable — confirm against
# the encoding step); any other name is a single column.
if column_to_drop_13.startswith('Cat_'):
    # Literal prefix match instead of regex='^' + name: feature names in this
    # notebook contain characters such as '(' and ')' that a regex would read
    # as pattern syntax rather than literal text.
    cols_to_drop_13 = [col for col in comp_13.columns if str(col).startswith(column_to_drop_13)]
else:
    cols_to_drop_13 = [column_to_drop_13]

comp_14 = comp_13.drop(columns=cols_to_drop_13)
X_14 = comp_14.drop(columns='target')
y_14 = comp_14['target']

print(X_14.shape)

(8444, 195)


In [320]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [321]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from `trial`, fits the classifier on the
    notebook-level X_train / y_train and scores column 1 of predict_proba on
    X_val against y_val. No intermediate values are reported through `trial`;
    only the final score is returned.
    """
    # (name, low, high) bounds, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [322]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 90, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7858405632483129


In [323]:
optuna_14 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [324]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(auc_14)

0.7725072385088672


In [325]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so they
# must be plain arrays. np.asarray, unlike `.values`, is a no-op on an ndarray,
# which keeps this cell safe to re-run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [326]:
auc_bootstrap = []

In [327]:
# bootstrap_auc reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`; both are (re)initialised here so re-running this cell alone
# cannot mix samples from an earlier run into the percentile interval.
rs = RandomState(seed=14)
auc_bootstrap = []
# Pass an unfitted clone: bootstrap_auc calls clf.fit on every resample, which
# would otherwise leave optuna_14 fitted on the last bootstrap draw instead of
# the training split it was tuned on.
bootstrap_auc(sklearn.base.clone(optuna_14), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7616212 , 0.77435984])

In [328]:
t_14 = auc_bootstrap
print(t_14)

[0.7662894724936663, 0.7708715051574376, 0.7684539562975028, 0.7647159450778139, 0.7644614662504524, 0.7706580256967066, 0.7640712653818313, 0.7702805487694535, 0.7660434762938835, 0.7683648887079262, 0.76407974800941, 0.7712772575099529, 0.767004840752805, 0.76904632645675, 0.7720703831885631, 0.7681881673000363, 0.7710270199963809, 0.7683210617987695, 0.7679435848715165, 0.7662866449511401, 0.7625415648751357, 0.7637687183315237, 0.7617809559355773, 0.7696273864458922, 0.7687381243213898, 0.7714341861201592, 0.7678771376221498, 0.7743423136083967, 0.7684073018458197, 0.7725468241042346, 0.7636513753166847, 0.7715585979913138, 0.7667574307817591, 0.7718229732175172, 0.7657168951321027, 0.769944071208831, 0.7755115024429968, 0.7663149203764025, 0.7719092132645675, 0.7656674131378935, 0.7654397959645314, 0.7719346611473037, 0.7616211997828448, 0.7654525199058994, 0.770157550669562, 0.7674982469236338, 0.7730571955302208, 0.7684779904089758, 0.76666270810713, 0.7692060826094825, 0.762518

In [329]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [42]:
column_to_drop_14 = '현재 주택의 면적(㎡)'

In [43]:
# Build comp_15 by removing the feature named in column_to_drop_14 from comp_14.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical variable — confirm against
# the encoding step); any other name is a single column.
if column_to_drop_14.startswith('Cat_'):
    # Literal prefix match instead of regex='^' + name: feature names in this
    # notebook contain characters such as '(' and ')' that a regex would read
    # as pattern syntax rather than literal text.
    cols_to_drop_14 = [col for col in comp_14.columns if str(col).startswith(column_to_drop_14)]
else:
    cols_to_drop_14 = [column_to_drop_14]

comp_15 = comp_14.drop(columns=cols_to_drop_14)
X_15 = comp_15.drop(columns='target')
y_15 = comp_15['target']

print(X_15.shape)

(8444, 194)


In [332]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [333]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from `trial`, fits the classifier on the
    notebook-level X_train / y_train and scores column 1 of predict_proba on
    X_val against y_val. No intermediate values are reported through `trial`;
    only the final score is returned.
    """
    # (name, low, high) bounds, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [334]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7858560322462311


In [335]:
optuna_15 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [336]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(auc_15)

0.7746646534563881


In [337]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so they
# must be plain arrays. np.asarray, unlike `.values`, is a no-op on an ndarray,
# which keeps this cell safe to re-run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [338]:
auc_bootstrap = []

In [339]:
# bootstrap_auc reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`; both are (re)initialised here so re-running this cell alone
# cannot mix samples from an earlier run into the percentile interval.
rs = RandomState(seed=15)
auc_bootstrap = []
# Pass an unfitted clone: bootstrap_auc calls clf.fit on every resample, which
# would otherwise leave optuna_15 fitted on the last bootstrap draw instead of
# the training split it was tuned on.
bootstrap_auc(sklearn.base.clone(optuna_15), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76271253, 0.7748617 ])

In [340]:
t_15 = auc_bootstrap
print(t_15)

[0.7760982175171915, 0.7717522846543613, 0.7717904564784654, 0.7737867015019906, 0.7716151488418386, 0.7692244616359031, 0.7735350502171552, 0.7698012803112559, 0.7667800511219689, 0.7669129456207021, 0.7681132374230908, 0.7689162595005429, 0.7671038047412233, 0.7679930668657255, 0.7693234256243213, 0.7670727017734347, 0.7642126425081432, 0.7640627827542528, 0.7694223896127398, 0.774240522077452, 0.7644713626492943, 0.770404960640608, 0.7679704465255157, 0.7737541847629388, 0.7681330302207745, 0.7691113599348534, 0.7707159903184945, 0.770096758505248, 0.7721043136988781, 0.7704982695439739, 0.7727235455121245, 0.7665368824647123, 0.767112287368802, 0.7684157844733985, 0.7677583808360479, 0.7647654270720232, 0.7699412436663047, 0.7685373688020267, 0.7713705664133188, 0.7700812070213536, 0.7668464983713356, 0.7638351655808903, 0.7662442318132465, 0.7660250972674628, 0.7667022937024973, 0.7706679220955484, 0.7701914811798769, 0.7712320168295331, 0.7685218173181325, 0.7660222697249367, 0.7

In [341]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [44]:
column_to_drop_15 = '자산 중 기타자산의 비중'

In [45]:
# Build comp_16 by removing the feature named in column_to_drop_15 from comp_15.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical variable — confirm against
# the encoding step); any other name is a single column.
if column_to_drop_15.startswith('Cat_'):
    # Literal prefix match instead of regex='^' + name: feature names in this
    # notebook contain characters such as '(' and ')' that a regex would read
    # as pattern syntax rather than literal text.
    cols_to_drop_15 = [col for col in comp_15.columns if str(col).startswith(column_to_drop_15)]
else:
    cols_to_drop_15 = [column_to_drop_15]

comp_16 = comp_15.drop(columns=cols_to_drop_15)
X_16 = comp_16.drop(columns='target')
y_16 = comp_16['target']

print(X_16.shape)

(8444, 193)


In [344]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [345]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from `trial`, fits the classifier on the
    notebook-level X_train / y_train and scores column 1 of predict_proba on
    X_val against y_val. No intermediate values are reported through `trial`;
    only the final score is returned.
    """
    # (name, low, high) bounds, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [346]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7866272722853014


In [347]:
optuna_16 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [348]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(auc_16)

0.7748526850343829


In [349]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so they
# must be plain arrays. np.asarray, unlike `.values`, is a no-op on an ndarray,
# which keeps this cell safe to re-run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [350]:
auc_bootstrap = []

In [351]:
# bootstrap_auc reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`; both are (re)initialised here so re-running this cell alone
# cannot mix samples from an earlier run into the percentile interval.
rs = RandomState(seed=16)
auc_bootstrap = []
# Pass an unfitted clone: bootstrap_auc calls clf.fit on every resample, which
# would otherwise leave optuna_16 fitted on the last bootstrap draw instead of
# the training split it was tuned on.
bootstrap_auc(sklearn.base.clone(optuna_16), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7632903 , 0.77554554])

In [352]:
t_16 = auc_bootstrap
print(t_16)

[0.7685458514296056, 0.7716264590119435, 0.7697051438653637, 0.7714525651465799, 0.7708686776149113, 0.7747438246471228, 0.7667489481541803, 0.7694478374954759, 0.7715769770177344, 0.7628186640427073, 0.7672239752985884, 0.7689070699873326, 0.7676353827361564, 0.7699214508686211, 0.7692739436301121, 0.7660053044697792, 0.7672140788997467, 0.7703413409337677, 0.7699850705754614, 0.7666513979370249, 0.7738828379478826, 0.7622800171914585, 0.7664167119073471, 0.767950653727832, 0.7735506017010496, 0.7731038499819037, 0.7676848647303656, 0.7668719462540717, 0.7681287889069852, 0.7685543340571841, 0.7690222923452769, 0.7682970276872965, 0.7690859120521174, 0.7732819851610568, 0.7641914359391966, 0.7739902845638797, 0.772774441277597, 0.7703173068222946, 0.7715798045602605, 0.7691481179876944, 0.7686122986789722, 0.7696811097538907, 0.7647795647846544, 0.7697065576366269, 0.7741655922005066, 0.7667546032392327, 0.7678997579623597, 0.7724775493123416, 0.7711797072927976, 0.7697263504343106, 0

In [353]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [46]:
column_to_drop_16 = '자산 중 금융자산의 비중'

In [47]:
# Build comp_17 by removing the feature named in column_to_drop_16 from comp_16.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical variable — confirm against
# the encoding step); any other name is a single column.
if column_to_drop_16.startswith('Cat_'):
    # Literal prefix match instead of regex='^' + name: feature names in this
    # notebook contain characters such as '(' and ')' that a regex would read
    # as pattern syntax rather than literal text.
    cols_to_drop_16 = [col for col in comp_16.columns if str(col).startswith(column_to_drop_16)]
else:
    cols_to_drop_16 = [column_to_drop_16]

comp_17 = comp_16.drop(columns=cols_to_drop_16)
X_17 = comp_17.drop(columns='target')
y_17 = comp_17['target']

print(X_17.shape)

(8444, 192)


In [356]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [357]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from `trial`, fits the classifier on the
    notebook-level X_train / y_train and scores column 1 of predict_proba on
    X_val against y_val. No intermediate values are reported through `trial`;
    only the final score is returned.
    """
    # (name, low, high) bounds, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, val_proba)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [358]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7882382579256515


In [359]:
optuna_17 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [360]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(auc_17)

0.7694549063517916


In [361]:
# bootstrap_auc indexes its training inputs positionally (X_train[idx]), so they
# must be plain arrays. np.asarray, unlike `.values`, is a no-op on an ndarray,
# which keeps this cell safe to re-run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [362]:
auc_bootstrap = []

In [363]:
# bootstrap_auc reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`; both are (re)initialised here so re-running this cell alone
# cannot mix samples from an earlier run into the percentile interval.
rs = RandomState(seed=17)
auc_bootstrap = []
# Pass an unfitted clone: bootstrap_auc calls clf.fit on every resample, which
# would otherwise leave optuna_17 fitted on the last bootstrap draw instead of
# the training split it was tuned on.
bootstrap_auc(sklearn.base.clone(optuna_17), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76124574, 0.77414456])

In [364]:
t_17 = auc_bootstrap
print(t_17)

[0.7625769091567138, 0.7659091680238871, 0.7679039992761492, 0.7622842585052478, 0.7684638526963445, 0.7682701660332971, 0.769319184310532, 0.7679336884726746, 0.761577372873688, 0.7689120181867535, 0.7695029745747375, 0.7635255496742671, 0.7623987739775606, 0.769637282844734, 0.7690173441458559, 0.7657366879297864, 0.7645901194353963, 0.7677937251176257, 0.772788578990228, 0.7654284857944265, 0.7677456568946797, 0.7750152687296417, 0.770428994752081, 0.770427580980818, 0.7700727243937748, 0.7607602130836046, 0.7760968037459283, 0.7671575280492218, 0.7789512079261672, 0.764160332971408, 0.7724535152008685, 0.7684073018458197, 0.7694139069851611, 0.7681005134817228, 0.7644487423090843, 0.7677301054107855, 0.7676735545602607, 0.7656561029677886, 0.7616834057184221, 0.7701504818132465, 0.7652333853601159, 0.772936318087224, 0.7705873371335505, 0.766381367625769, 0.7694520788092654, 0.7671363214802751, 0.7659614775606225, 0.7662809898660875, 0.77153880519363, 0.7694888368621065, 0.77097612

In [365]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [48]:
column_to_drop_17 = '소득 중 정부 보조금의 비중(월평균)'

In [49]:
# Build comp_18 by removing the feature named in column_to_drop_17 from comp_17.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably the dummy columns of one categorical variable — confirm against
# the encoding step); any other name is a single column.
if column_to_drop_17.startswith('Cat_'):
    # Literal prefix match instead of regex='^' + name: feature names in this
    # notebook contain characters such as '(' and ')' that a regex would read
    # as pattern syntax rather than literal text.
    cols_to_drop_17 = [col for col in comp_17.columns if str(col).startswith(column_to_drop_17)]
else:
    cols_to_drop_17 = [column_to_drop_17]

comp_18 = comp_17.drop(columns=cols_to_drop_17)
X_18 = comp_18.drop(columns='target')
y_18 = comp_18['target']

print(X_18.shape)

(8444, 191)


In [368]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [369]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an ExtraTrees classifier.

    Draws one hyper-parameter set from ``trial``, fits on the notebook-level
    ``X_train``/``y_train`` and scores the column-1 class probabilities on
    ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC-AUC on the validation split.
    """
    # Suggestions are drawn in a fixed order (call order may matter for the
    # seeded sampler).
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search: maximise the validation AUC returned by objective()
# using a seeded TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# objective() never reports intermediate values, so no trial is pruned early.
# TODO confirm the Hyperband pruner is still wanted here.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# Presumably ends the study after 10 trials without improvement -- confirm
# against the EarlyStoppingCallback definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [370]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 98, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7865698160073189


In [371]:
optuna_18 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [372]:
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(auc_18)

0.7732508821932682


In [373]:
# bootstrap_auc selects resampled rows with an integer index array
# (X_train[idx], y_train[idx]), so the training data must be NumPy arrays.
# np.asarray leaves an existing array untouched, so this cell is safe to
# re-run (a second `.values` call would raise AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [374]:
auc_bootstrap = []

In [375]:
# Seed the RNG that bootstrap_auc reads from the notebook namespace.
rs = RandomState(seed = 18)
# bootstrap_auc calls clf.fit() on every resample, so hand it an unfitted
# clone: optuna_18 then stays fitted on the training split instead of ending
# up fitted on the last bootstrap sample. The clone carries the same
# hyper-parameters and random_state.
bootstrap_auc(sklearn.base.clone(optuna_18), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76183701, 0.77341089])

In [376]:
t_18 = auc_bootstrap
print(t_18)

[0.7679761016105682, 0.7712928089938474, 0.7644360183677162, 0.7664449873326095, 0.7665679854325009, 0.7625910468693449, 0.7671561142779587, 0.7699370023525154, 0.765000113101701, 0.7676481066775245, 0.7654822091024249, 0.7658879614549403, 0.7655090707564243, 0.7685486789721319, 0.7674954193811074, 0.764136298859935, 0.770156136898299, 0.7637263051936302, 0.765506243213898, 0.769165083242852, 0.7652814535830619, 0.7667489481541803, 0.7664068155085052, 0.7717833876221499, 0.7704650459192905, 0.7682150289540355, 0.7691707383279045, 0.7691679107853782, 0.7691198425624322, 0.7653592110025336, 0.7678707756514658, 0.7672423543250091, 0.7617724733079985, 0.7715190123959464, 0.7656136898298952, 0.7700953447339848, 0.7706198538726023, 0.7755680532935215, 0.7719191096634094, 0.7615533387622151, 0.7644487423090844, 0.7678064490589938, 0.7714935645132103, 0.7738149769272529, 0.7747127216793341, 0.7714412549764748, 0.7740072498190373, 0.7707075076909158, 0.7712263617444807, 0.7628469394679696, 0.76

In [377]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [50]:
column_to_drop_18 = '총 가구원 수'

In [51]:
# Remove the feature chosen for elimination in this round and rebuild X / y.
# A 'Cat_' name stands for a group of columns sharing that prefix (presumably
# the dummy columns of one categorical variable), so the whole group is
# dropped; any other name is dropped as a single column.
if column_to_drop_18.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the name: column
    # names may contain regex metacharacters such as '(' or ')'.
    cols_to_drop = [c for c in comp_18.columns if str(c).startswith(column_to_drop_18)]
else:
    cols_to_drop = [column_to_drop_18]

comp_19 = comp_18.drop(columns=cols_to_drop)
X_19 = comp_19.drop(columns='target')
y_19 = comp_19['target']

print(X_19.shape)

(8444, 190)


In [380]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [381]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an ExtraTrees classifier.

    Draws one hyper-parameter set from ``trial``, fits on the notebook-level
    ``X_train``/``y_train`` and scores the column-1 class probabilities on
    ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC-AUC on the validation split.
    """
    # Suggestions are drawn in a fixed order (call order may matter for the
    # seeded sampler).
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search: maximise the validation AUC returned by objective()
# using a seeded TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# objective() never reports intermediate values, so no trial is pruned early.
# TODO confirm the Hyperband pruner is still wanted here.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# Presumably ends the study after 10 trials without improvement -- confirm
# against the EarlyStoppingCallback definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [382]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.7867885918350209


In [383]:
optuna_19 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [384]:
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(auc_19)

0.7693729076185306


In [385]:
# bootstrap_auc selects resampled rows with an integer index array
# (X_train[idx], y_train[idx]), so the training data must be NumPy arrays.
# np.asarray leaves an existing array untouched, so this cell is safe to
# re-run (a second `.values` call would raise AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [386]:
auc_bootstrap = []

In [387]:
# Seed the RNG that bootstrap_auc reads from the notebook namespace.
rs = RandomState(seed = 19)
# bootstrap_auc calls clf.fit() on every resample, so hand it an unfitted
# clone: optuna_19 then stays fitted on the training split instead of ending
# up fitted on the last bootstrap sample. The clone carries the same
# hyper-parameters and random_state.
bootstrap_auc(sklearn.base.clone(optuna_19), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7602679 , 0.77341659])

In [388]:
t_19 = auc_bootstrap
print(t_19)

[0.7730402302750634, 0.770947848805646, 0.7583030786283026, 0.7696316277596815, 0.7668387226293882, 0.7603502194173, 0.7751778524249004, 0.7664888142417662, 0.7658625135722041, 0.7603063925081432, 0.760751730456026, 0.7648502533478103, 0.7657296190734708, 0.7680043770358306, 0.7666330189106043, 0.7647654270720232, 0.7666584667933405, 0.7649930442453854, 0.7687960889431777, 0.7693912866449512, 0.771135880383641, 0.7674431098443721, 0.7741825574556642, 0.7727758550488599, 0.7658936165399928, 0.7665764680600796, 0.7681966499276149, 0.7684695077813971, 0.766060441549041, 0.7660321661237784, 0.7671250113101702, 0.7661424402823018, 0.7625429786463989, 0.7660971996018819, 0.7671405627940644, 0.7667772235794427, 0.7692484957473761, 0.7728931980636988, 0.7668464983713357, 0.7694266309265291, 0.7748753053745928, 0.7761448719688744, 0.7696881786102063, 0.7689233283568585, 0.7714454962902643, 0.7654708989323199, 0.7669892892689105, 0.7709662278320666, 0.7726797186029677, 0.765764963355049, 0.76934

In [389]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [52]:
column_to_drop_19 = 'Cat_현재 대중교통 접근용이성'

In [53]:
# Remove the feature chosen for elimination in this round and rebuild X / y.
# A 'Cat_' name stands for a group of columns sharing that prefix (presumably
# the dummy columns of one categorical variable), so the whole group is
# dropped; any other name is dropped as a single column.
if column_to_drop_19.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the name: column
    # names may contain regex metacharacters such as '(' or ')'.
    cols_to_drop = [c for c in comp_19.columns if str(c).startswith(column_to_drop_19)]
else:
    cols_to_drop = [column_to_drop_19]

comp_20 = comp_19.drop(columns=cols_to_drop)
X_20 = comp_20.drop(columns='target')
y_20 = comp_20['target']

print(X_20.shape)

(8444, 186)


In [392]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [393]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an ExtraTrees classifier.

    Draws one hyper-parameter set from ``trial``, fits on the notebook-level
    ``X_train``/``y_train`` and scores the column-1 class probabilities on
    ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC-AUC on the validation split.
    """
    # Suggestions are drawn in a fixed order (call order may matter for the
    # seeded sampler).
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search: maximise the validation AUC returned by objective()
# using a seeded TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# objective() never reports intermediate values, so no trial is pruned early.
# TODO confirm the Hyperband pruner is still wanted here.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# Presumably ends the study after 10 trials without improvement -- confirm
# against the EarlyStoppingCallback definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [394]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 98, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7855853247826605


In [395]:
optuna_20 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [396]:
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(auc_20)

0.7707640585414405


In [397]:
# bootstrap_auc selects resampled rows with an integer index array
# (X_train[idx], y_train[idx]), so the training data must be NumPy arrays.
# np.asarray leaves an existing array untouched, so this cell is safe to
# re-run (a second `.values` call would raise AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [398]:
auc_bootstrap = []

In [399]:
# Seed the RNG that bootstrap_auc reads from the notebook namespace.
rs = RandomState(seed = 20)
# bootstrap_auc calls clf.fit() on every resample, so hand it an unfitted
# clone: optuna_20 then stays fitted on the training split instead of ending
# up fitted on the last bootstrap sample. The clone carries the same
# hyper-parameters and random_state.
bootstrap_auc(sklearn.base.clone(optuna_20), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7624701 , 0.77404867])

In [400]:
t_20 = auc_bootstrap
print(t_20)

[0.7727631311074918, 0.7688908116178067, 0.7655882419471589, 0.7735110161056822, 0.7711330528411148, 0.7681174787368802, 0.7689685690372783, 0.768186753528773, 0.7632258301664857, 0.7679520674990953, 0.7699864843467246, 0.7702975140246109, 0.7664138843648208, 0.7648078402099168, 0.7716222176981542, 0.7672564920376401, 0.7698352108215707, 0.7656066209735795, 0.7686363327904452, 0.7672805261491132, 0.7596617128121608, 0.7676184174809989, 0.7643851226022438, 0.7652390404451683, 0.7722654836228737, 0.7747537210459646, 0.7679930668657257, 0.7697829012848353, 0.7706509568403909, 0.7668521534563881, 0.7684808179515021, 0.7697206953492579, 0.771736733170467, 0.7631664517734347, 0.767879965164676, 0.7666372602243938, 0.7679973081795151, 0.7695298362287368, 0.7676367965074194, 0.7628313879840753, 0.7673455596272167, 0.7702565146579805, 0.77394504388346, 0.7675519702316322, 0.7660802343467246, 0.7663757125407166, 0.7686306777053926, 0.7718851791530944, 0.7710807433043793, 0.7667729822656533, 0.77

In [401]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [54]:
column_to_drop_20 = 'Cat_현재 상업시설 접근용이성'

In [55]:
# Remove the feature chosen for elimination in this round and rebuild X / y.
# A 'Cat_' name stands for a group of columns sharing that prefix (presumably
# the dummy columns of one categorical variable), so the whole group is
# dropped; any other name is dropped as a single column.
if column_to_drop_20.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the name: column
    # names may contain regex metacharacters such as '(' or ')'.
    cols_to_drop = [c for c in comp_20.columns if str(c).startswith(column_to_drop_20)]
else:
    cols_to_drop = [column_to_drop_20]

comp_21 = comp_20.drop(columns=cols_to_drop)
X_21 = comp_21.drop(columns='target')
y_21 = comp_21['target']

print(X_21.shape)

(8444, 182)


In [404]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [405]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an ExtraTrees classifier.

    Draws one hyper-parameter set from ``trial``, fits on the notebook-level
    ``X_train``/``y_train`` and scores the column-1 class probabilities on
    ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC-AUC on the validation split.
    """
    # Suggestions are drawn in a fixed order (call order may matter for the
    # seeded sampler).
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search: maximise the validation AUC returned by objective()
# using a seeded TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# objective() never reports intermediate values, so no trial is pruned early.
# TODO confirm the Hyperband pruner is still wanted here.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# Presumably ends the study after 10 trials without improvement -- confirm
# against the EarlyStoppingCallback definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [406]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7871355393597603


In [407]:
optuna_21 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [408]:
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(auc_21)

0.7695991110206297


In [409]:
# bootstrap_auc selects resampled rows with an integer index array
# (X_train[idx], y_train[idx]), so the training data must be NumPy arrays.
# np.asarray leaves an existing array untouched, so this cell is safe to
# re-run (a second `.values` call would raise AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [410]:
auc_bootstrap = []

In [411]:
# Seed the RNG that bootstrap_auc reads from the notebook namespace.
rs = RandomState(seed = 21)
# bootstrap_auc calls clf.fit() on every resample, so hand it an unfitted
# clone: optuna_21 then stays fitted on the training split instead of ending
# up fitted on the last bootstrap sample. The clone carries the same
# hyper-parameters and random_state.
bootstrap_auc(sklearn.base.clone(optuna_21), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76073158, 0.77276288])

In [412]:
t_21 = auc_bootstrap
print(t_21)

[0.7598144001085776, 0.7703526511038726, 0.7657381017010496, 0.7742758663590301, 0.7703752714440825, 0.7715416327361564, 0.7622927411328266, 0.7675293498914223, 0.7621032957835685, 0.7622220525696707, 0.7694294584690553, 0.7630787979551212, 0.7703272032211366, 0.7646000158342381, 0.7725015834238146, 0.7634576886536374, 0.7652503506152732, 0.7625443924176619, 0.7607785921100254, 0.7727164766558089, 0.764516603329714, 0.7679676189829894, 0.7697376606044155, 0.762828560441549, 0.7674374547593196, 0.7666598805646037, 0.766874773796598, 0.7671320801664857, 0.768291372602244, 0.7663332994028229, 0.7682489594643505, 0.7681061685667754, 0.76633188563156, 0.76273383776692, 0.7709068494390156, 0.7753262984075281, 0.7686179537640245, 0.7699313472674629, 0.7690618779406442, 0.7706905424357583, 0.7629953854505971, 0.7740878347810352, 0.7717098715164676, 0.7637673045602605, 0.7689940169200145, 0.7643596747195078, 0.7677187952406805, 0.76187285106768, 0.7647314965617082, 0.7628950076909156, 0.7659176

In [413]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [56]:
column_to_drop_21 = 'Cat_현재 교육환경'

In [57]:
# Remove the feature chosen for elimination in this round and rebuild X / y.
# A 'Cat_' name stands for a group of columns sharing that prefix (presumably
# the dummy columns of one categorical variable), so the whole group is
# dropped; any other name is dropped as a single column.
if column_to_drop_21.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the name: column
    # names may contain regex metacharacters such as '(' or ')'.
    cols_to_drop = [c for c in comp_21.columns if str(c).startswith(column_to_drop_21)]
else:
    cols_to_drop = [column_to_drop_21]

comp_22 = comp_21.drop(columns=cols_to_drop)
X_22 = comp_22.drop(columns='target')
y_22 = comp_22['target']

print(X_22.shape)

(8444, 178)


In [416]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [417]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an ExtraTrees classifier.

    Draws one hyper-parameter set from ``trial``, fits on the notebook-level
    ``X_train``/``y_train`` and scores the column-1 class probabilities on
    ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC-AUC on the validation split.
    """
    # Suggestions are drawn in a fixed order (call order may matter for the
    # seeded sampler).
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search: maximise the validation AUC returned by objective()
# using a seeded TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# objective() never reports intermediate values, so no trial is pruned early.
# TODO confirm the Hyperband pruner is still wanted here.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# Presumably ends the study after 10 trials without improvement -- confirm
# against the EarlyStoppingCallback definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [418]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7861079559266151


In [419]:
optuna_22 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [420]:
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(auc_22)

0.7714765992580528


In [421]:
# bootstrap_auc selects resampled rows with an integer index array
# (X_train[idx], y_train[idx]), so the training data must be NumPy arrays.
# np.asarray leaves an existing array untouched, so this cell is safe to
# re-run (a second `.values` call would raise AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [422]:
auc_bootstrap = []

In [423]:
# Seed the RNG that bootstrap_auc reads from the notebook namespace.
rs = RandomState(seed = 22)
# bootstrap_auc calls clf.fit() on every resample, so hand it an unfitted
# clone: optuna_22 then stays fitted on the training split instead of ending
# up fitted on the last bootstrap sample. The clone carries the same
# hyper-parameters and random_state.
bootstrap_auc(sklearn.base.clone(optuna_22), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76170436, 0.77386319])

In [424]:
t_22 = auc_bootstrap
print(t_22)

[0.7641419539449874, 0.7677767598624683, 0.7711811210640609, 0.7663827813970321, 0.7685783681686572, 0.7707364900018097, 0.7667659134093376, 0.7693686663047412, 0.7669016354505971, 0.7689459486970684, 0.7641334713174086, 0.7675152121787912, 0.7650269747557004, 0.7695270086862105, 0.7704205121245022, 0.7663502646579805, 0.7690689467969599, 0.7677767598624683, 0.7636923746833153, 0.7692032550669561, 0.770769713626493, 0.7669101180781759, 0.7618657822113645, 0.7718611450416215, 0.7641716431415129, 0.7713875316684762, 0.7693234256243215, 0.7637545806188926, 0.766239990499457, 0.7672041825009048, 0.7696288002171552, 0.7683832677343467, 0.7658271692906261, 0.7677682772348896, 0.7630830392689105, 0.7658243417480999, 0.7743267621245024, 0.7655175533840028, 0.7712263617444806, 0.7617046122873687, 0.7651853171371696, 0.7730939535830619, 0.7631325212631197, 0.7644020878574014, 0.7677753460912052, 0.7667517756967065, 0.7717975253347811, 0.7644035016286644, 0.765907754252624, 0.7687946751719146, 0.

In [425]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [58]:
column_to_drop_22 = 'Cat_현재 청소/쓰레기 처리상태'

In [59]:
# Remove the feature chosen for elimination in this round and rebuild X / y.
# A 'Cat_' name stands for a group of columns sharing that prefix (presumably
# the dummy columns of one categorical variable), so the whole group is
# dropped; any other name is dropped as a single column.
if column_to_drop_22.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the name: column
    # names may contain regex metacharacters such as '(' or ')'.
    cols_to_drop = [c for c in comp_22.columns if str(c).startswith(column_to_drop_22)]
else:
    cols_to_drop = [column_to_drop_22]

comp_23 = comp_22.drop(columns=cols_to_drop)
X_23 = comp_23.drop(columns='target')
y_23 = comp_23['target']

print(X_23.shape)

(8444, 174)


In [429]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [430]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an ExtraTrees classifier.

    Draws one hyper-parameter set from ``trial``, fits on the notebook-level
    ``X_train``/``y_train`` and scores the column-1 class probabilities on
    ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC-AUC on the validation split.
    """
    # Suggestions are drawn in a fixed order (call order may matter for the
    # seeded sampler).
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search: maximise the validation AUC returned by objective()
# using a seeded TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# objective() never reports intermediate values, so no trial is pruned early.
# TODO confirm the Hyperband pruner is still wanted here.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# Presumably ends the study after 10 trials without improvement -- confirm
# against the EarlyStoppingCallback definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [431]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7861698319182883


In [432]:
optuna_23 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [433]:
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(auc_23)

0.7731462631197973


In [434]:
# bootstrap_auc selects resampled rows with an integer index array
# (X_train[idx], y_train[idx]), so the training data must be NumPy arrays.
# np.asarray leaves an existing array untouched, so this cell is safe to
# re-run (a second `.values` call would raise AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [435]:
auc_bootstrap = []

In [436]:
# Seed the RNG that bootstrap_auc reads from the notebook namespace.
rs = RandomState(seed = 23)
# bootstrap_auc calls clf.fit() on every resample, so hand it an unfitted
# clone: optuna_23 then stays fitted on the training split instead of ending
# up fitted on the last bootstrap sample. The clone carries the same
# hyper-parameters and random_state.
bootstrap_auc(sklearn.base.clone(optuna_23), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76193668, 0.77458432])

In [437]:
t_23 = auc_bootstrap
print(t_23)

[0.7685076796055013, 0.7680072045783568, 0.767984584238147, 0.7674671439558451, 0.7711175013572203, 0.7734544652551574, 0.760337495475932, 0.7677371742671011, 0.7702932727108216, 0.7716688721498371, 0.7705901646760767, 0.7648530808903367, 0.7609326931777053, 0.7699525538364097, 0.7728988531487514, 0.7664859866992398, 0.7721608645494027, 0.7705915784473398, 0.7693008052841115, 0.7666909835323923, 0.7704685803474485, 0.7652630745566413, 0.7713691526420557, 0.7704982695439739, 0.771678768548679, 0.7705435102243937, 0.7667291553564966, 0.767157528049222, 0.7664732627578719, 0.7711358803836409, 0.7689586726384363, 0.7698125904813609, 0.7723870679515019, 0.7666839146760767, 0.7698846928157799, 0.7684581976112921, 0.7698578311617807, 0.7719134545783568, 0.7707273004885993, 0.7660406487513572, 0.7697899701411508, 0.7707668860839667, 0.7696782822113644, 0.7698465209916756, 0.7708191956207021, 0.7681994774701411, 0.7667093625588128, 0.7682263391241404, 0.7642889861563518, 0.7729440938291712, 0.7

In [438]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [60]:
column_to_drop_23 = 'Cat_가구주 성별'

In [61]:
# Remove the feature chosen for elimination in this round and rebuild X / y.
# A 'Cat_' name stands for a group of columns sharing that prefix (presumably
# the dummy columns of one categorical variable), so the whole group is
# dropped; any other name is dropped as a single column.
if column_to_drop_23.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the name: column
    # names may contain regex metacharacters such as '(' or ')'.
    cols_to_drop = [c for c in comp_23.columns if str(c).startswith(column_to_drop_23)]
else:
    cols_to_drop = [column_to_drop_23]

comp_24 = comp_23.drop(columns=cols_to_drop)
X_24 = comp_24.drop(columns='target')
y_24 = comp_24['target']

print(X_24.shape)

(8444, 172)


In [441]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [442]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an ExtraTrees classifier.

    Draws one hyper-parameter set from ``trial``, fits on the notebook-level
    ``X_train``/``y_train`` and scores the column-1 class probabilities on
    ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC-AUC on the validation split.
    """
    # Suggestions are drawn in a fixed order (call order may matter for the
    # seeded sampler).
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search: maximise the validation AUC returned by objective()
# using a seeded TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# objective() never reports intermediate values, so no trial is pruned early.
# TODO confirm the Hyperband pruner is still wanted here.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# Presumably ends the study after 10 trials without improvement -- confirm
# against the EarlyStoppingCallback definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [443]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.78519749490628


In [444]:
optuna_24 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [445]:
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(auc_24)

0.7730190237061165


In [446]:
# bootstrap_auc selects resampled rows with an integer index array
# (X_train[idx], y_train[idx]), so the training data must be NumPy arrays.
# np.asarray leaves an existing array untouched, so this cell is safe to
# re-run (a second `.values` call would raise AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [447]:
auc_bootstrap = []

In [448]:
# Seed the RNG that bootstrap_auc reads from the notebook namespace.
rs = RandomState(seed = 24)
# bootstrap_auc calls clf.fit() on every resample, so hand it an unfitted
# clone: optuna_24 then stays fitted on the training split instead of ending
# up fitted on the last bootstrap sample. The clone carries the same
# hyper-parameters and random_state.
bootstrap_auc(sklearn.base.clone(optuna_24), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76133099, 0.77408402])

In [449]:
t_24 = auc_bootstrap
print(t_24)

[0.7697051438653637, 0.7700628279949331, 0.7652178338762214, 0.761800748733261, 0.7711061911871154, 0.7679110681324648, 0.7701985500361925, 0.7674077655627941, 0.7680948583966704, 0.7624595661418748, 0.7631410038906986, 0.7729200597176982, 0.7675321774339486, 0.7649039766558089, 0.7650255609844373, 0.765991166757148, 0.76257973669924, 0.7632922774158524, 0.7654921055012668, 0.7656009658885269, 0.7708898841838582, 0.767443109844372, 0.7718781102967789, 0.7650693878935939, 0.7654496923633731, 0.7682616834057184, 0.7673349563427435, 0.7682440112649295, 0.7682150289540355, 0.7646198086319218, 0.7636322894046326, 0.7739860432500905, 0.7682645109482447, 0.7646000158342381, 0.7725595480456027, 0.7724054469779225, 0.7735449466159972, 0.7672805261491132, 0.7703130655085053, 0.770263583514296, 0.7763244209192905, 0.7636287549764749, 0.7699299334961999, 0.7635170670466883, 0.7691495317589577, 0.76849354189287, 0.7680990997104598, 0.7730741607853782, 0.7689091906442274, 0.7679386366720956, 0.76352

In [450]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [62]:
column_to_drop_24 = 'Cat_현재 주변도로의 보행 안전'

In [63]:
# Remove the feature chosen for elimination in this round and rebuild X / y.
# A 'Cat_' name stands for a group of columns sharing that prefix (presumably
# the dummy columns of one categorical variable), so the whole group is
# dropped; any other name is dropped as a single column.
if column_to_drop_24.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the name: column
    # names may contain regex metacharacters such as '(' or ')'.
    cols_to_drop = [c for c in comp_24.columns if str(c).startswith(column_to_drop_24)]
else:
    cols_to_drop = [column_to_drop_24]

comp_25 = comp_24.drop(columns=cols_to_drop)
X_25 = comp_25.drop(columns='target')
y_25 = comp_25['target']

print(X_25.shape)

(8444, 168)


In [453]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [454]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an ExtraTrees classifier.

    Draws one hyper-parameter set from ``trial``, fits on the notebook-level
    ``X_train``/``y_train`` and scores the column-1 class probabilities on
    ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC-AUC on the validation split.
    """
    # Suggestions are drawn in a fixed order (call order may matter for the
    # seeded sampler).
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search: maximise the validation AUC returned by objective()
# using a seeded TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# objective() never reports intermediate values, so no trial is pruned early.
# TODO confirm the Hyperband pruner is still wanted here.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# Presumably ends the study after 10 trials without improvement -- confirm
# against the EarlyStoppingCallback definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [455]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 133, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7849544106532779


In [456]:
optuna_25 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [457]:
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(auc_25)

0.7751128189467968


In [458]:
# bootstrap_auc selects resampled rows with an integer index array
# (X_train[idx], y_train[idx]), so the training data must be NumPy arrays.
# np.asarray leaves an existing array untouched, so this cell is safe to
# re-run (a second `.values` call would raise AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [459]:
auc_bootstrap = []

In [460]:
# Seed the RNG that bootstrap_auc reads from the notebook namespace.
rs = RandomState(seed = 25)
# bootstrap_auc calls clf.fit() on every resample, so hand it an unfitted
# clone: optuna_25 then stays fitted on the training split instead of ending
# up fitted on the last bootstrap sample. The clone carries the same
# hyper-parameters and random_state.
bootstrap_auc(sklearn.base.clone(optuna_25), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76295258, 0.77465681])

In [461]:
t_25 = auc_bootstrap
print(t_25)

[0.7749092358849078, 0.7667729822656533, 0.7718045941910966, 0.7721523819218241, 0.77429848669924, 0.7663064377488238, 0.772043521534564, 0.7662336285287732, 0.7673639386536373, 0.7714030831523706, 0.7667079487875498, 0.7681528230184581, 0.7633445869525878, 0.771018537368802, 0.7698210731089395, 0.7673936278501629, 0.7730628506152732, 0.7714638753166848, 0.7709054356677525, 0.7680156872059356, 0.7756345005428882, 0.7654270720231632, 0.766757430781759, 0.7707174040897574, 0.7648516671190735, 0.7693983555012668, 0.7699582089214622, 0.763830924267101, 0.7623860500361926, 0.7701214995023525, 0.7694195620702136, 0.76412498868983, 0.7716038386717337, 0.7681415128483531, 0.7726217539811799, 0.775715085504886, 0.7651683518820124, 0.768397405446978, 0.7682970276872965, 0.7728543193539631, 0.7664464011038725, 0.7709125045240681, 0.7700034496018819, 0.764208401194354, 0.7720053497104596, 0.767560452859211, 0.7703116517372421, 0.7687211590662324, 0.7690392576004343, 0.7681188925081432, 0.766740465

In [462]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [64]:
column_to_drop_25 = 'Cat_현재 주택의 구조'

In [65]:
# Remove the feature chosen for elimination in this round and rebuild X / y.
# A 'Cat_' name stands for a group of columns sharing that prefix (presumably
# the dummy columns of one categorical variable), so the whole group is
# dropped; any other name is dropped as a single column.
if column_to_drop_25.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the name: column
    # names may contain regex metacharacters such as '(' or ')'.
    cols_to_drop = [c for c in comp_25.columns if str(c).startswith(column_to_drop_25)]
else:
    cols_to_drop = [column_to_drop_25]

comp_26 = comp_25.drop(columns=cols_to_drop)
X_26 = comp_26.drop(columns='target')
y_26 = comp_26['target']

print(X_26.shape)

(8444, 166)


In [465]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [466]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an ExtraTrees classifier.

    Draws one hyper-parameter set from ``trial``, fits on the notebook-level
    ``X_train``/``y_train`` and scores the column-1 class probabilities on
    ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC-AUC on the validation split.
    """
    # Suggestions are drawn in a fixed order (call order may matter for the
    # seeded sampler).
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search: maximise the validation AUC returned by objective()
# using a seeded TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# objective() never reports intermediate values, so no trial is pruned early.
# TODO confirm the Hyperband pruner is still wanted here.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# Presumably ends the study after 10 trials without improvement -- confirm
# against the EarlyStoppingCallback definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [467]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7856604599154067


In [468]:
optuna_26 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [469]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(auc_26)

0.772894611834962


In [470]:
# bootstrap_auc selects resampled rows with an integer index array
# (X_train[idx], y_train[idx]), so the training data must be NumPy arrays.
# np.asarray leaves an existing array untouched, so this cell is safe to
# re-run (a second `.values` call would raise AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [471]:
auc_bootstrap = []

In [472]:
# Seed the RNG that bootstrap_auc reads from the notebook namespace.
rs = RandomState(seed = 26)
# bootstrap_auc calls clf.fit() on every resample, so hand it an unfitted
# clone: optuna_26 then stays fitted on the training split instead of ending
# up fitted on the last bootstrap sample. The clone carries the same
# hyper-parameters and random_state.
bootstrap_auc(sklearn.base.clone(optuna_26), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76255144, 0.77448958])

In [473]:
t_26 = auc_bootstrap
print(t_26)

[0.771270895539269, 0.7669836341838582, 0.7668521534563879, 0.7677159676981542, 0.768397405446978, 0.7719191096634095, 0.7649675963626493, 0.7755115024429968, 0.7686660219869706, 0.7675024882374231, 0.7680312386898298, 0.76306466024249, 0.7690512746561708, 0.7649852685034383, 0.7689954306912776, 0.7712602922547955, 0.7736820824285198, 0.7673611111111112, 0.7717324918566775, 0.7621301574375678, 0.7649315451954398, 0.7676452791349981, 0.770936538635541, 0.7739351474846182, 0.7688377951954397, 0.7706806460369164, 0.7640288522439378, 0.7774483690734708, 0.7601621878393051, 0.7693361495656896, 0.7689699828085415, 0.7714511513753168, 0.7722852764205573, 0.7727249592833876, 0.7656462065689468, 0.7649562861925443, 0.762998919878755, 0.7647314965617084, 0.7664181256786102, 0.7694181482989504, 0.7617866110206297, 0.769447837495476, 0.7708460572747015, 0.767112287368802, 0.7700953447339848, 0.768528886174448, 0.7719671778863555, 0.7696796959826276, 0.7678460346543612, 0.7714695304017372, 0.768633

In [474]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [72]:
column_to_drop_26 = 'Cat_현재 공공기관 접근용이성'

In [73]:
# Remove the feature chosen for elimination in this round and rebuild X / y.
# A 'Cat_' name stands for a group of columns sharing that prefix (presumably
# the dummy columns of one categorical variable), so the whole group is
# dropped; any other name is dropped as a single column.
if column_to_drop_26.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the name: column
    # names may contain regex metacharacters such as '(' or ')'.
    cols_to_drop = [c for c in comp_26.columns if str(c).startswith(column_to_drop_26)]
else:
    cols_to_drop = [column_to_drop_26]

comp_27 = comp_26.drop(columns=cols_to_drop)
X_27 = comp_27.drop(columns='target')
y_27 = comp_27['target']

print(X_27.shape)

(8444, 162)


In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [75]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of an ExtraTrees classifier.

    Draws one hyper-parameter set from ``trial``, fits on the notebook-level
    ``X_train``/``y_train`` and scores the column-1 class probabilities on
    ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC-AUC on the validation split.
    """
    # Suggestions are drawn in a fixed order (call order may matter for the
    # seeded sampler).
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyper-parameter search: maximise the validation AUC returned by objective()
# using a seeded TPE sampler, for at most 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
# objective() never reports intermediate values, so no trial is pruned early.
# TODO confirm the Hyperband pruner is still wanted here.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# Presumably ends the study after 10 trials without improvement -- confirm
# against the EarlyStoppingCallback definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [76]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7847301101834623


In [77]:
optuna_27 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [78]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(auc_27)

0.7710241924538546


In [79]:
# bootstrap_auc selects resampled rows with an integer index array
# (X_train[idx], y_train[idx]), so the training data must be NumPy arrays.
# np.asarray leaves an existing array untouched, so this cell is safe to
# re-run (a second `.values` call would raise AttributeError).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [80]:
auc_bootstrap = []

In [81]:
# Seed the RNG that bootstrap_auc reads from the notebook namespace.
rs = RandomState(seed = 27)
# bootstrap_auc calls clf.fit() on every resample, so hand it an unfitted
# clone: optuna_27 then stays fitted on the training split instead of ending
# up fitted on the last bootstrap sample. The clone carries the same
# hyper-parameters and random_state.
bootstrap_auc(sklearn.base.clone(optuna_27), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76181923, 0.77460167])

In [82]:
t_27 = auc_bootstrap
print(t_27)

[0.7652277302750634, 0.7686490567318134, 0.7699172095548317, 0.7711047774158524, 0.77105388165038, 0.7672126651284834, 0.7670620984889613, 0.7668917390517552, 0.766249886898299, 0.7739181822294607, 0.7667277415852334, 0.7635665490408976, 0.7740835934672456, 0.767513798407528, 0.7685960403094462, 0.7662640246109302, 0.7607008346905538, 0.7663106790626131, 0.7726259952949692, 0.7668691187115453, 0.7706424742128122, 0.7701872398660875, 0.7659091680238872, 0.7736481519182048, 0.7650269747557005, 0.7655854144046327, 0.7730416440463265, 0.7664251945349257, 0.7698705551031488, 0.7708997805826999, 0.7677400018096272, 0.7669242557908071, 0.7682998552298227, 0.7707965752804923, 0.7651909722222222, 0.7684907143503438, 0.7652856948968513, 0.7711457767824829, 0.7687155039811799, 0.771712699058994, 0.7699610364639884, 0.7687960889431777, 0.7692082032663772, 0.7658413070032573, 0.7631607966883822, 0.7722004501447702, 0.7738291146398842, 0.7663460233441911, 0.7699186233260948, 0.7698026940825189, 0.76

In [83]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [68]:
column_to_drop_27 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [69]:
# Remove the feature chosen for elimination in this round and rebuild X / y.
# A 'Cat_' name stands for a group of columns sharing that prefix (presumably
# the dummy columns of one categorical variable), so the whole group is
# dropped; any other name is dropped as a single column.
if column_to_drop_27.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the name: column
    # names may contain regex metacharacters such as '(' or ')'.
    cols_to_drop = [c for c in comp_27.columns if str(c).startswith(column_to_drop_27)]
else:
    cols_to_drop = [column_to_drop_27]

comp_28 = comp_27.drop(columns=cols_to_drop)
X_28 = comp_28.drop(columns='target')
y_28 = comp_28['target']

print(X_28.shape)

(8444, 158)


In [488]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [489]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTreesClassifier configuration.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and returns ``roc_auc_score`` computed
    from column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.
    """
    # (name, low, high) bounds; the order is kept so the sampler receives the
    # suggestions in the same sequence.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {}
    for name, low, high in search_space:
        sampled[name] = trial.suggest_int(name, low, high)

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: Optuna maximizes the value returned by objective.
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks inert here — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; 10 is its
# early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; presumably the callback can end the study sooner — verify
# against the callback's definition.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [490]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 74, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7873432659032348


In [491]:
optuna_28 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [492]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(auc_28)

0.7717664223669924


In [493]:
# bootstrap_auc indexes the training data positionally (X_train[idx], y_train[idx]),
# so it needs NumPy arrays. np.asarray leaves an existing array unchanged, which
# makes this cell safe to re-run; `.values` raised AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [494]:
auc_bootstrap = []

In [495]:
rs = RandomState(seed = 28)
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76188268, 0.77464645])

In [496]:
t_28 = auc_bootstrap
print(t_28)

[0.7654956399294246, 0.7662611970684039, 0.767513798407528, 0.7661820258776693, 0.7681203062794064, 0.7668719462540717, 0.7655069500995295, 0.7662597832971407, 0.7666938110749186, 0.7689247421281217, 0.7734473963988419, 0.765388900199059, 0.761932229460731, 0.7652150063336953, 0.7687649859753891, 0.7645208446435034, 0.7639072679153094, 0.7646735319399204, 0.7714398412052118, 0.7684850592652914, 0.7653394182048499, 0.7681895810712993, 0.760963796145494, 0.7700614142236699, 0.7701278614730365, 0.7618799199239956, 0.7679103612468331, 0.7672176133279045, 0.7724973421100254, 0.763830924267101, 0.7652036961635903, 0.7698196593376765, 0.767843207111835, 0.7778852243937748, 0.7689996720050669, 0.7758833242851972, 0.7687720548317047, 0.772387067951502, 0.765706998733261, 0.7632463298498009, 0.7673229392870069, 0.7607503166847629, 0.7716745272348896, 0.7727772688201231, 0.7620156419652551, 0.7685826094824466, 0.7704092019543974, 0.7651273525153819, 0.7715699081614188, 0.7697065576366269, 0.77224

In [497]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [70]:
column_to_drop_28 = 'Cat_현재 의료시설 접근용이성'

In [71]:
# Build the next candidate feature set (comp_29) by removing one feature from comp_28.
# A name starting with 'Cat_' is treated as a prefix: every column whose name starts
# with it is removed. The prefix is matched literally with str.startswith instead of
# being spliced into a regex, so characters such as '(' or '/' in a column name
# cannot be misread as regex syntax.
if column_to_drop_28.startswith('Cat_'):
    cols_to_drop_28 = [col for col in comp_28.columns if str(col).startswith(column_to_drop_28)]
else:
    cols_to_drop_28 = [column_to_drop_28]

comp_29 = comp_28.drop(columns=cols_to_drop_28)
X_29 = comp_29.drop('target', axis=1)
y_29 = comp_29['target']

print(X_29.shape)

(8444, 154)


In [500]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [501]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTreesClassifier configuration.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and returns ``roc_auc_score`` computed
    from column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.
    """
    # (name, low, high) bounds; the order is kept so the sampler receives the
    # suggestions in the same sequence.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {}
    for name, low, high in search_space:
        sampled[name] = trial.suggest_int(name, low, high)

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: Optuna maximizes the value returned by objective.
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks inert here — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; 10 is its
# early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; presumably the callback can end the study sooner — verify
# against the callback's definition.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [502]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 92, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7859510560905865


In [503]:
optuna_29 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [504]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(auc_29)

0.771750870883098


In [505]:
# bootstrap_auc indexes the training data positionally (X_train[idx], y_train[idx]),
# so it needs NumPy arrays. np.asarray leaves an existing array unchanged, which
# makes this cell safe to re-run; `.values` raised AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [506]:
auc_bootstrap = []

In [507]:
rs = RandomState(seed = 29)
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76350254, 0.77509038])

In [508]:
t_29 = auc_bootstrap
print(t_29)

[0.77288471543612, 0.7693672525334782, 0.7727320281397033, 0.7718512486427795, 0.7671165286825914, 0.7724874457111837, 0.7705322000542888, 0.7739026307455665, 0.7638521308360479, 0.7707753687115455, 0.7639892666485704, 0.7730267994480637, 0.7704572701773436, 0.7667857062070214, 0.7709280560079623, 0.7697927976836771, 0.7671660106768006, 0.7734544652551575, 0.7685882645674992, 0.7681323233351429, 0.7735491879297864, 0.7705717856496561, 0.76664008776692, 0.7689247421281216, 0.7664138843648208, 0.768127375135722, 0.7732063483984799, 0.769165083242852, 0.7711273977560622, 0.7698592449330438, 0.7659473398479913, 0.7633163115273253, 0.7668973941368079, 0.7700663624230908, 0.7688045715707563, 0.7662110081885631, 0.7670613916033298, 0.7690378438291713, 0.7658653411147305, 0.7704205121245024, 0.7696719202406803, 0.7703144792797683, 0.7665510201773433, 0.7679605501266739, 0.7662626108396671, 0.7664350909337677, 0.7711606213807456, 0.7715274950235251, 0.7723983781216069, 0.764120040490409, 0.7670

In [509]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [13]:
comp_30 = comp[[
 '현재 무주택 기간(총 개월)',
 '총 이사 횟수',
 '가구주 나이',
 '소득 중 근로/사업소득의 비중(월평균)',
 '소득 중 사적이전소득의 비중(월평균)',
 '부채 중 금융기관 대출금의 비중',
 'target',
 'Cat_현재 거주 지역_강원도',
 'Cat_현재 거주 지역_경기도',
 'Cat_현재 거주 지역_경상남도',
 'Cat_현재 거주 지역_경상북도',
 'Cat_현재 거주 지역_광주광역시',
 'Cat_현재 거주 지역_대구광역시',
 'Cat_현재 거주 지역_대전광역시',
 'Cat_현재 거주 지역_부산광역시',
 'Cat_현재 거주 지역_서울특별시',
 'Cat_현재 거주 지역_세종특별자치시',
 'Cat_현재 거주 지역_울산광역시',
 'Cat_현재 거주 지역_인천광역시',
 'Cat_현재 거주 지역_전라남도',
 'Cat_현재 거주 지역_전라북도',
 'Cat_현재 거주 지역_제주특별자치도',
 'Cat_현재 거주 지역_충청남도',
 'Cat_현재 거주 지역_충청북도',
 'Cat_현재 주택의 유형_고시원',
 'Cat_현재 주택의 유형_기타',
 'Cat_현재 주택의 유형_다가구 단독주택',
 'Cat_현재 주택의 유형_다세대주택',
 'Cat_현재 주택의 유형_비거주용 건물 내 주택',
 'Cat_현재 주택의 유형_아파트',
 'Cat_현재 주택의 유형_연립주택',
 'Cat_현재 주택의 유형_영업겸용 단독주택',
 'Cat_현재 주택의 유형_오피스텔',
 'Cat_현재 주택의 유형_일반 단독주택',
 'Cat_현재 주택의 유형_판잣집 비닐하우스 컨테이너 움막',                
 'Cat_현재 주택의 점유형태_무상',
 'Cat_현재 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_현재 주택의 점유형태_보증금 있는 월세',
 'Cat_현재 주택의 점유형태_전세',
 'Cat_현재 문화시설 접근용이성_대체로 만족',
 'Cat_현재 문화시설 접근용이성_매우 만족',
 'Cat_현재 문화시설 접근용이성_매우 불만족',
 'Cat_현재 문화시설 접근용이성_약간 불만족',                
 'Cat_현재 도시공원 및 녹지 접근용이성_대체로 만족',
 'Cat_현재 도시공원 및 녹지 접근용이성_매우 만족',
 'Cat_현재 도시공원 및 녹지 접근용이성_매우 불만족',
 'Cat_현재 도시공원 및 녹지 접근용이성_약간 불만족',
 'Cat_현재 주차시설 이용편의성_대체로 만족',
 'Cat_현재 주차시설 이용편의성_매우 만족',
 'Cat_현재 주차시설 이용편의성_매우 불만족',
 'Cat_현재 주차시설 이용편의성_약간 불만족',
 'Cat_현재 자동차 경적/집주변의 소음 정도_대체로 만족',
 'Cat_현재 자동차 경적/집주변의 소음 정도_매우 만족',
 'Cat_현재 자동차 경적/집주변의 소음 정도_매우 불만족',
 'Cat_현재 자동차 경적/집주변의 소음 정도_약간 불만족',
 'Cat_현재 대기오염 정도_대체로 만족',
 'Cat_현재 대기오염 정도_매우 만족',
 'Cat_현재 대기오염 정도_매우 불만족',
 'Cat_현재 대기오염 정도_약간 불만족',                
 'Cat_현재 주택에 대한 전반적인 만족도_대체로 만족',
 'Cat_현재 주택에 대한 전반적인 만족도_매우 만족',
 'Cat_현재 주택에 대한 전반적인 만족도_매우 불만족',
 'Cat_현재 주택에 대한 전반적인 만족도_약간 불만족',                
 'Cat_이사 예상 기간_2~5년',
 'Cat_이사 예상 기간_2년 미만',
 'Cat_이사 예상 기간_5년 초과',
 'Cat_이사 예상 기간_이사 계획 없음 및 모름',                
 'Cat_이사 계획 첫 번째 이유_가구상황에 적합한 주택규모로 이사',
 'Cat_이사 계획 첫 번째 이유_결혼이나 세대독립을 위해',
 'Cat_이사 계획 첫 번째 이유_계약 만기로 인해',
 'Cat_이사 계획 첫 번째 이유_교통과 편의 및 문화시설 및 공원과 녹지 등이 좋은 지역으로 가기 위해',
 'Cat_이사 계획 첫 번째 이유_높은 집값 혹은 집세 부담',
 'Cat_이사 계획 첫 번째 이유_부모 혹은 자녀 등과 가까이 살기 위해',
 'Cat_이사 계획 첫 번째 이유_시설이나 설비가 더 양호한 집으로 이사',
 'Cat_이사 계획 첫 번째 이유_이사 계획 없음 및 모름',
 'Cat_이사 계획 첫 번째 이유_자가로 이사 또는 자가 마련을 위해',
 'Cat_이사 계획 첫 번째 이유_자녀 양육 및 교육환경',
 'Cat_이사 계획 첫 번째 이유_재개발이나 재건축으로 인해',
 'Cat_이사 계획 첫 번째 이유_직주근접 혹은 직장변동',
 'Cat_이사 계획 첫 번째 이유_집주인이 나가라고 해서',
 'Cat_이사 계획 중인 거주 지역_국내 to 국외',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 비수도권',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 수도권',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 거주 지역_수도권 to 비수도권',
 'Cat_이사 계획 중인 거주 지역_수도권 to 수도권',
 'Cat_이사 계획 중인 거주 지역_수도권 to 이사 계획 없음 및 모름',                
 'Cat_이사 계획 중인 주택의 유형_공동주택 to 공동주택',
 'Cat_이사 계획 중인 주택의 유형_공동주택 to 기타',
 'Cat_이사 계획 중인 주택의 유형_공동주택 to 단독주택',
 'Cat_이사 계획 중인 주택의 유형_공동주택 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 유형_공동주택 to 준주택',
 'Cat_이사 계획 중인 주택의 유형_기타 to 공동주택',
 'Cat_이사 계획 중인 주택의 유형_기타 to 단독주택',
 'Cat_이사 계획 중인 주택의 유형_기타 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 유형_기타 to 준주택',
 'Cat_이사 계획 중인 주택의 유형_단독주택 to 공동주택',
 'Cat_이사 계획 중인 주택의 유형_단독주택 to 기타',
 'Cat_이사 계획 중인 주택의 유형_단독주택 to 단독주택',
 'Cat_이사 계획 중인 주택의 유형_단독주택 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 유형_단독주택 to 준주택',
 'Cat_이사 계획 중인 주택의 유형_준주택 to 공동주택',
 'Cat_이사 계획 중인 주택의 유형_준주택 to 기타',
 'Cat_이사 계획 중인 주택의 유형_준주택 to 단독주택',
 'Cat_이사 계획 중인 주택의 유형_준주택 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 유형_준주택 to 준주택',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 전세',
 'Cat_주택 보유 의식_아니오',
 'Cat_주택 보유 의식_예',
 'Cat_현재 가장 필요한 주거지원 1순위_공공분양주택공급',
 'Cat_현재 가장 필요한 주거지원 1순위_없음',
 'Cat_현재 가장 필요한 주거지원 1순위_월세보조금 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_임대 후 분양전환 공공임대주택 공급',
 'Cat_현재 가장 필요한 주거지원 1순위_장기공공임대주택 공급',
 'Cat_현재 가장 필요한 주거지원 1순위_전세자금 대출 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_주거상담과 정보제공 등',
 'Cat_현재 가장 필요한 주거지원 1순위_주택 구입자금 대출 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_주택개량 및 개보수 지원',
 'Cat_기초생활보장 수급가구 여부_아니오',
 'Cat_기초생활보장 수급가구 여부_예',
 'Cat_소득 계층_서민층',
 'Cat_소득 계층_중산층',
 'Cat_가구주 최종 학력_고등학교 졸업',
 'Cat_가구주 최종 학력_대학 졸업 이상',
 'Cat_가구주 최종 학력_중학교 졸업 이하',
 'Cat_가구주 종사상 지위_무급가족종사자',
 'Cat_가구주 종사상 지위_무직 및 기타',
 'Cat_가구주 종사상 지위_사업자 및 자영자',
 'Cat_가구주 종사상 지위_상용근로자',
 'Cat_가구주 종사상 지위_임시일용근로자']]

In [14]:
X_30 = comp_30.drop('target', axis=1)
y_30 = comp_30['target']
X_30.shape

(8444, 152)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [16]:
import decimal
context = decimal.getcontext()
from scipy.stats import shapiro
context.rounding = decimal.ROUND_HALF_UP

In [17]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTreesClassifier configuration.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and returns ``roc_auc_score`` computed
    from column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.
    """
    # (name, low, high) bounds; the order is kept so the sampler receives the
    # suggestions in the same sequence.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {}
    for name, low, high in search_space:
        sampled[name] = trial.suggest_int(name, low, high)

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: Optuna maximizes the value returned by objective.
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks inert here — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; 10 is its
# early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; presumably the callback can end the study sooner — verify
# against the callback's definition.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [18]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7862781149037166


In [19]:
optuna_30 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [20]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(decimal.Decimal(auc_30).quantize(decimal.Decimal('1.000')))

0.774


In [21]:
# bootstrap_auc indexes the training data positionally (X_train[idx], y_train[idx]),
# so it needs NumPy arrays. np.asarray leaves an existing array unchanged, which
# makes this cell safe to re-run; `.values` raised AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [22]:
auc_bootstrap = []

In [23]:
rs = RandomState(seed = 30)
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76230511, 0.77474888])

In [24]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9981680512428284, pvalue=0.024196382611989975),
 0.7684508530695803)

In [25]:
t_30 = auc_bootstrap
print(t_30)

[0.7641313506605139, 0.7715826321027869, 0.773755598534202, 0.765229850931958, 0.7655960176891061, 0.765412934310532, 0.7644515698516107, 0.7668274124592835, 0.7708312126764387, 0.7624687556550851, 0.7593902687296418, 0.7667560170104959, 0.7613066356768006, 0.7698726757600435, 0.7740390596724573, 0.7607821265381831, 0.7685472652008686, 0.7711747590933767, 0.7762417153003982, 0.766574347403185, 0.7718788171824105, 0.7639334226836771, 0.7698967098715166, 0.7662244390155628, 0.7677237434401013, 0.7640924719507781, 0.7684596113825553, 0.7696005247918929, 0.7698231937658342, 0.7722570009952949, 0.767366059310532, 0.7716504931234165, 0.767219733984799, 0.7670804775153818, 0.7675696423724214, 0.7672522507238508, 0.7645441718693451, 0.7664895211273977, 0.7684044743032935, 0.7648142021806008, 0.7703173068222946, 0.7648064264386536, 0.7737513572204127, 0.7709711760314876, 0.7660399418657255, 0.7629600411690193, 0.7730882984980093, 0.7745020697611292, 0.7671554073923272, 0.767017564694173, 0.7671

In [26]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [27]:
# 31
column_to_drop_30 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [28]:
# Build the next candidate feature set (comp_31) by removing one feature from comp_30.
# A name starting with 'Cat_' is treated as a prefix: every column whose name starts
# with it is removed. The prefix is matched literally with str.startswith instead of
# being spliced into a regex, so characters such as '(' or '/' in a column name
# cannot be misread as regex syntax.
if column_to_drop_30.startswith('Cat_'):
    cols_to_drop_30 = [col for col in comp_30.columns if str(col).startswith(column_to_drop_30)]
else:
    cols_to_drop_30 = [column_to_drop_30]

comp_31 = comp_30.drop(columns=cols_to_drop_30)
X_31 = comp_31.drop('target', axis=1)
y_31 = comp_31['target']

print(X_31.shape)

(8444, 148)


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [30]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTreesClassifier configuration.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and returns ``roc_auc_score`` computed
    from column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.
    """
    # (name, low, high) bounds; the order is kept so the sampler receives the
    # suggestions in the same sequence.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {}
    for name, low, high in search_space:
        sampled[name] = trial.suggest_int(name, low, high)

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: Optuna maximizes the value returned by objective.
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks inert here — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; 10 is its
# early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; presumably the callback can end the study sooner — verify
# against the callback's definition.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [31]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.785764323187144


In [32]:
optuna_31= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [33]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(decimal.Decimal(auc_31).quantize(decimal.Decimal('1.000')))

0.775


In [34]:
# bootstrap_auc indexes the training data positionally (X_train[idx], y_train[idx]),
# so it needs NumPy arrays. np.asarray leaves an existing array unchanged, which
# makes this cell safe to re-run; `.values` raised AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [35]:
auc_bootstrap = []

In [36]:
rs = RandomState(seed = 31)
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7621693 , 0.77433164])

In [37]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9985588192939758, pvalue=0.0863521471619606),
 0.7684216452621697)

In [38]:
t_31 = auc_bootstrap
print(t_31)

[0.767802914630836, 0.7640260247014115, 0.7727044596000724, 0.769778659971046, 0.7671850965888527, 0.7690053270901194, 0.7736813755428881, 0.7684716284382918, 0.7727730275063337, 0.7690173441458559, 0.7683436821389794, 0.7655550183224755, 0.7676763821027868, 0.7701116031035107, 0.7682327010948244, 0.7664471079895041, 0.7701158444173, 0.7659932874140427, 0.7596885744661599, 0.7591562895855952, 0.7708248507057547, 0.7737520641060442, 0.7669426348172277, 0.7716462518096272, 0.767272750407166, 0.7726995114006514, 0.7677979664314151, 0.766150216024249, 0.7688017440282302, 0.7611313280401737, 0.769090860251538, 0.7719700054288816, 0.766728448470865, 0.766350971543612, 0.7688922253890699, 0.7650262678700688, 0.771826507645675, 0.7642812104144046, 0.76996174334962, 0.762449669743033, 0.7628695598081794, 0.7712546371697431, 0.7686759183858125, 0.7627465617082881, 0.7710312613101702, 0.768875967019544, 0.7691785140698516, 0.7648834769724937, 0.7672911294335867, 0.764727255247919, 0.7667800511219

In [39]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [40]:
# 32
column_to_drop_31 = 'Cat_현재 대기오염 정도'

In [41]:
# Build the next candidate feature set (comp_32) by removing one feature from comp_31.
# A name starting with 'Cat_' is treated as a prefix: every column whose name starts
# with it is removed. The prefix is matched literally with str.startswith instead of
# being spliced into a regex, so characters such as '(' or '/' in a column name
# cannot be misread as regex syntax.
if column_to_drop_31.startswith('Cat_'):
    cols_to_drop_31 = [col for col in comp_31.columns if str(col).startswith(column_to_drop_31)]
else:
    cols_to_drop_31 = [column_to_drop_31]

comp_32 = comp_31.drop(columns=cols_to_drop_31)
X_32 = comp_32.drop('target', axis=1)
y_32 = comp_32['target']

print(X_32.shape)

(8444, 144)


In [42]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [43]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTreesClassifier configuration.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and returns ``roc_auc_score`` computed
    from column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.
    """
    # (name, low, high) bounds; the order is kept so the sampler receives the
    # suggestions in the same sequence.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {}
    for name, low, high in search_space:
        sampled[name] = trial.suggest_int(name, low, high)

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: Optuna maximizes the value returned by objective.
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks inert here — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; 10 is its
# early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; presumably the callback can end the study sooner — verify
# against the callback's definition.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [44]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7863201021837807


In [45]:
optuna_32 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [46]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(decimal.Decimal(auc_32).quantize(decimal.Decimal('1.000')))

0.769


In [47]:
# bootstrap_auc indexes the training data positionally (X_train[idx], y_train[idx]),
# so it needs NumPy arrays. np.asarray leaves an existing array unchanged, which
# makes this cell safe to re-run; `.values` raised AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [48]:
auc_bootstrap = []

In [49]:
rs = RandomState(seed = 32)
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76292314, 0.77464426])

In [50]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9980713129043579, pvalue=0.017655950039625168),
 0.7689087707541621)

In [51]:
t_32 = auc_bootstrap
print(t_32)

[0.7684207326728195, 0.7714355998914223, 0.7714476169471588, 0.7705512859663409, 0.7662088875316685, 0.7722124672005066, 0.7682355286373508, 0.7744327949692364, 0.7742779870159247, 0.763660564829895, 0.7650071819580166, 0.7708029372511763, 0.7724118089486066, 0.7612981530492218, 0.7694541994661599, 0.7636322894046327, 0.7688342607672819, 0.7696344553022079, 0.7730176099348535, 0.764934372737966, 0.7666026228284473, 0.7685175760043431, 0.7724528083152371, 0.7676028659971047, 0.7696888854958379, 0.7665913126583423, 0.7656483272258414, 0.7697920907980456, 0.7648693392598624, 0.7696606100705754, 0.7688130541983351, 0.7722541734527687, 0.7610712427614911, 0.7694690440644227, 0.7653740556007963, 0.7742935384998191, 0.7707350762305464, 0.7678820858215707, 0.7664746765291349, 0.7698952961002532, 0.7678417933405718, 0.7660116664404634, 0.7677286916395223, 0.7664054017372421, 0.7674586613282663, 0.7688667775063336, 0.767892689106044, 0.7698274350796236, 0.7734848613373145, 0.7669306177614912, 0.

In [52]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [53]:
# 33.
column_to_drop_32 = 'Cat_소득 계층'

In [54]:
# Build the next candidate feature set (comp_33) by removing one feature from comp_32.
# A name starting with 'Cat_' is treated as a prefix: every column whose name starts
# with it is removed. The prefix is matched literally with str.startswith instead of
# being spliced into a regex, so characters such as '(' or '/' in a column name
# cannot be misread as regex syntax.
if column_to_drop_32.startswith('Cat_'):
    cols_to_drop_32 = [col for col in comp_32.columns if str(col).startswith(column_to_drop_32)]
else:
    cols_to_drop_32 = [column_to_drop_32]

comp_33 = comp_32.drop(columns=cols_to_drop_32)
X_33 = comp_33.drop('target', axis=1)
y_33 = comp_33['target']

print(X_33.shape)

(8444, 142)


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [56]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTreesClassifier configuration.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and returns ``roc_auc_score`` computed
    from column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.
    """
    # (name, low, high) bounds; the order is kept so the sampler receives the
    # suggestions in the same sequence.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {}
    for name, low, high in search_space:
        sampled[name] = trial.suggest_int(name, low, high)

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: Optuna maximizes the value returned by objective.
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks inert here — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; 10 is its
# early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; presumably the callback can end the study sooner — verify
# against the callback's definition.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [57]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7850151817165284


In [58]:
optuna_33 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [59]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(decimal.Decimal(auc_33).quantize(decimal.Decimal('1.000')))

0.770


In [60]:
# bootstrap_auc indexes the training data positionally (X_train[idx], y_train[idx]),
# so it needs NumPy arrays. np.asarray leaves an existing array unchanged, which
# makes this cell safe to re-run; `.values` raised AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [61]:
auc_bootstrap = []

In [62]:
rs = RandomState(seed = 33)
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76185157, 0.77388994])

In [63]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9990701675415039, pvalue=0.40175002813339233),
 0.7679601365985795)

In [64]:
t_33 = auc_bootstrap
print(t_33)

[0.7632767259319579, 0.769172858984799, 0.7676092279677886, 0.7652793329261672, 0.7684440598986608, 0.7689127250723851, 0.7735364639884184, 0.7705456308812884, 0.7710722606768006, 0.7655486563517916, 0.7672444749819037, 0.7646014296055013, 0.770379512757872, 0.7692704092019544, 0.7672967845186391, 0.7673816107944265, 0.773137073606587, 0.7694266309265292, 0.7697758324285197, 0.7692329442634818, 0.7673073878031127, 0.7674855229822656, 0.7641589192001448, 0.7710927603601159, 0.765949460504886, 0.7679541881559898, 0.7699419505519363, 0.7720463490770901, 0.7673752488237423, 0.7677810011762577, 0.7649591137350706, 0.7757949635812522, 0.7663537990861382, 0.7682645109482447, 0.7641525572294607, 0.7720640212178791, 0.7649273038816504, 0.7621032957835686, 0.7689933100343829, 0.7678036215164676, 0.7617816628212088, 0.7645378098986608, 0.7703208412504525, 0.7717614741675715, 0.7679400504433587, 0.7660081320123056, 0.7644084498280854, 0.7672048893865364, 0.7653471939467971, 0.7655352255247918, 0.7

In [65]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [66]:
# 34
column_to_drop_33 = 'Cat_현재 주택의 유형'

In [67]:
# Build the next candidate feature set (comp_34) by removing one feature from comp_33.
# A name starting with 'Cat_' is treated as a prefix: every column whose name starts
# with it is removed. The prefix is matched literally with str.startswith instead of
# being spliced into a regex, so characters such as '(' or '/' in a column name
# cannot be misread as regex syntax.
if column_to_drop_33.startswith('Cat_'):
    cols_to_drop_33 = [col for col in comp_33.columns if str(col).startswith(column_to_drop_33)]
else:
    cols_to_drop_33 = [column_to_drop_33]

comp_34 = comp_33.drop(columns=cols_to_drop_33)
X_34 = comp_34.drop('target', axis=1)
y_34 = comp_34['target']

print(X_34.shape)

(8444, 131)


In [68]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [69]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTreesClassifier configuration.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and returns ``roc_auc_score`` computed
    from column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.
    """
    # (name, low, high) bounds; the order is kept so the sampler receives the
    # suggestions in the same sequence.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {}
    for name, low, high in search_space:
        sampled[name] = trial.suggest_int(name, low, high)

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: Optuna maximizes the value returned by objective.
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks inert here — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; 10 is its
# early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; presumably the callback can end the study sooner — verify
# against the callback's definition.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [70]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7861698319182883


In [71]:
optuna_34 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [72]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(decimal.Decimal(auc_34).quantize(decimal.Decimal('1.000')))

0.771


In [73]:
# bootstrap_auc indexes the training data positionally (X_train[idx], y_train[idx]),
# so it needs NumPy arrays. np.asarray leaves an existing array unchanged, which
# makes this cell safe to re-run; `.values` raised AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [74]:
auc_bootstrap = []

In [75]:
rs = RandomState(seed = 34)
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76212275, 0.77378693])

In [76]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988362193107605, pvalue=0.20618949830532074),
 0.7681752539133188)

In [77]:
t_34 = auc_bootstrap
print(t_34)

[0.7593457349348535, 0.7710955879026421, 0.7706841804650741, 0.7642889861563518, 0.7679181369887803, 0.7585052479189287, 0.7649612343919652, 0.7702763074556642, 0.7644635869073471, 0.7685444376583425, 0.7710453990228013, 0.7696867648389432, 0.7685903852243936, 0.7616261479822657, 0.7679640845548317, 0.7654320202225842, 0.7657119469326819, 0.7670069614096997, 0.7631106078085415, 0.7737867015019906, 0.7676099348534202, 0.770462925262396, 0.7688116404270721, 0.7747212043069127, 0.7677428293521535, 0.7726111506967064, 0.7721933812884545, 0.7652666089847991, 0.7717614741675713, 0.7693842177886356, 0.7621308643231994, 0.7683344926257691, 0.7716052524429967, 0.770440304922186, 0.7702473251447701, 0.764330692408614, 0.7706940768639159, 0.7671052185124864, 0.7674565406713716, 0.7782506842652913, 0.7693114085685848, 0.7713875316684764, 0.7686151262214984, 0.7673151635450598, 0.7659819772439378, 0.7668097403184944, 0.7665778818313427, 0.7708396953040173, 0.7687572102334419, 0.7661516297955121, 0.

In [78]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [79]:
# 35
column_to_drop_34 = '총 이사 횟수'

In [80]:
# Build the next candidate feature set (comp_35) by removing one feature from comp_34.
# A name starting with 'Cat_' is treated as a prefix: every column whose name starts
# with it is removed. The prefix is matched literally with str.startswith instead of
# being spliced into a regex, so characters such as '(' or '/' in a column name
# cannot be misread as regex syntax.
if column_to_drop_34.startswith('Cat_'):
    cols_to_drop_34 = [col for col in comp_34.columns if str(col).startswith(column_to_drop_34)]
else:
    cols_to_drop_34 = [column_to_drop_34]

comp_35 = comp_34.drop(columns=cols_to_drop_34)
X_35 = comp_35.drop('target', axis=1)
y_35 = comp_35['target']

print(X_35.shape)

(8444, 130)


In [81]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [82]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one ExtraTreesClassifier configuration.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and returns ``roc_auc_score`` computed
    from column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.
    """
    # (name, low, high) bounds; the order is kept so the sampler receives the
    # suggestions in the same sequence.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {}
    for name, low, high in search_space:
        sampled[name] = trial.suggest_int(name, low, high)

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Hyperparameter search: Optuna maximizes the value returned by objective.
direction = "maximize"
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective never calls trial.report()/trial.should_prune(), so the
# Hyperband pruner looks inert here — confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; 10 is its
# early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; presumably the callback can end the study sooner — verify
# against the callback's definition.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [83]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.785537812860483


In [84]:
optuna_35 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [85]:
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(decimal.Decimal(auc_35).quantize(decimal.Decimal('1.000')))

0.772


In [86]:
# bootstrap_auc indexes the training data positionally (X_train[idx], y_train[idx]),
# so it needs NumPy arrays. np.asarray leaves an existing array unchanged, which
# makes this cell safe to re-run; `.values` raised AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [87]:
auc_bootstrap = []

In [88]:
rs = RandomState(seed = 35)
bootstrap_auc(optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7622272, 0.7735567])

In [89]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.999141275882721, pvalue=0.48145291209220886),
 0.7679806302733668)

In [90]:
t_35 = auc_bootstrap
print(t_35)

[0.7631155560079624, 0.7659883392146218, 0.7697539189739413, 0.7703660819308724, 0.7675364187477379, 0.7699935532030402, 0.7694124932138979, 0.7673780763662685, 0.7639164574285198, 0.7673978691639523, 0.7707456795150199, 0.7668952734799132, 0.769838038364097, 0.7715642530763663, 0.7680171009771988, 0.766321989232718, 0.7690979291078538, 0.7670430125769091, 0.7664817453854507, 0.7688476915942815, 0.7657571876131017, 0.7643773468602968, 0.7639595774520448, 0.7691961862106407, 0.7673455596272167, 0.7679619638979371, 0.7708545399022801, 0.7681796846724575, 0.7657578944987333, 0.7660766999185668, 0.7656751888798407, 0.7632025029406442, 0.7741182308631922, 0.765412934310532, 0.7657769804107853, 0.7694923712902643, 0.7725963060984437, 0.769500853917843, 0.7643278648660876, 0.7633424662956931, 0.7643893639160332, 0.7699299334961998, 0.7690180510314876, 0.763365086635903, 0.7670847188291713, 0.7696825235251538, 0.7679789291530945, 0.7674593682138979, 0.7686589531306551, 0.7624694625407167, 0.77

In [91]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [92]:
# 36
column_to_drop_35 = 'Cat_이사 계획 첫 번째 이유'

In [93]:
# Build the next candidate feature set (comp_36) by removing one feature from comp_35.
# A name starting with 'Cat_' is treated as a prefix: every column whose name starts
# with it is removed. The prefix is matched literally with str.startswith instead of
# being spliced into a regex, so characters such as '(' or '/' in a column name
# cannot be misread as regex syntax.
if column_to_drop_35.startswith('Cat_'):
    cols_to_drop_35 = [col for col in comp_35.columns if str(col).startswith(column_to_drop_35)]
else:
    cols_to_drop_35 = [column_to_drop_35]

comp_36 = comp_35.drop(columns=cols_to_drop_35)
X_36 = comp_36.drop('target', axis=1)
y_36 = comp_36['target']

print(X_36.shape)

(8444, 117)


In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [95]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC computed from the second ``predict_proba`` column.
    """
    # (name, low, high) bounds, sampled in this order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [96]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7845853645600838


In [97]:
optuna_36 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [98]:
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(decimal.Decimal(auc_36).quantize(decimal.Decimal('1.000')))

0.771


In [99]:
X_train = X_train.values
y_train = y_train.values

In [100]:
auc_bootstrap = []

In [101]:
# Seed the notebook-level RNG that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 36)
# bootstrap_auc refits the estimator it is given on every resample, so hand it an
# unfitted copy (same hyperparameters and random_state); optuna_36 then stays the
# model fitted on the training split that produced auc_36.
bootstrap_auc(sklearn.base.clone(optuna_36), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76147406, 0.77279213])

In [102]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9960440397262573, pvalue=4.208913378533907e-05),
 0.7674172399509139)

In [103]:
t_36 = auc_bootstrap
print(t_36)

[0.7615773728736881, 0.7673476802841115, 0.7703540648751357, 0.7658003076366269, 0.7666280707111834, 0.7692074963807456, 0.7631388832338039, 0.7642416248190372, 0.7682927863735071, 0.769963157120883, 0.769732005519363, 0.7621647948335142, 0.769683937296417, 0.7682765280039812, 0.7714384274339486, 0.7701957224936663, 0.7679259127307274, 0.7688865703040173, 0.7675484358034744, 0.7665050726112921, 0.7709499694625407, 0.7679725671824105, 0.7665898988870794, 0.7668245849167572, 0.7592736326004343, 0.7658427207745204, 0.766603329714079, 0.7688505191368078, 0.7658745306279409, 0.7662993688925082, 0.7676544686482085, 0.7660986133731451, 0.7683231824556641, 0.7647739096996019, 0.7660922514024611, 0.7655741042345277, 0.7618382136717335, 0.7688017440282302, 0.771938902461093, 0.7642366766196163, 0.7622998099891423, 0.7703505304469779, 0.7661544573380384, 0.7721135032120883, 0.7671610624773797, 0.7663410751447701, 0.7631812963716973, 0.766727034699602, 0.7662117150741947, 0.7693135292254796, 0.771

In [104]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [105]:
# 37
column_to_drop_36 = '현재 무주택 기간(총 개월)'

In [106]:
# Build this round's frame by removing the selected feature from the previous one.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable — confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_36.startswith('Cat_'):
    # Literal prefix test instead of a '^' + name regex, so characters such as
    # '(' or '+' in a column name cannot change which columns match.
    cols_to_drop = [c for c in comp_36.columns if str(c).startswith(column_to_drop_36)]
else:
    cols_to_drop = [column_to_drop_36]

comp_37 = comp_36.drop(columns=cols_to_drop)
X_37 = comp_37.drop(columns='target')
y_37 = comp_37['target']

print(X_37.shape)

(8444, 116)


In [107]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [108]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC computed from the second ``predict_proba`` column.
    """
    # (name, low, high) bounds, sampled in this order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [109]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7861267397098015


In [110]:
optuna_37 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [111]:
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(decimal.Decimal(auc_37).quantize(decimal.Decimal('1.000')))

0.773


In [112]:
X_train = X_train.values
y_train = y_train.values

In [113]:
auc_bootstrap = []

In [114]:
# Seed the notebook-level RNG that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 37)
# bootstrap_auc refits the estimator it is given on every resample, so hand it an
# unfitted copy (same hyperparameters and random_state); optuna_37 then stays the
# model fitted on the training split that produced auc_37.
bootstrap_auc(sklearn.base.clone(optuna_37), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76162099, 0.77345104])

In [115]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988572597503662, pvalue=0.21964401006698608),
 0.7675233632770087)

In [116]:
t_37 = auc_bootstrap
print(t_37)

[0.7689918962631198, 0.7722145878574013, 0.7676594168476294, 0.7712701886536374, 0.7653648660875859, 0.767814224800941, 0.7665870713445531, 0.7716653377216793, 0.7643745193177705, 0.7636994435396308, 0.7670861326004343, 0.7646311188020267, 0.7694845955483169, 0.7665368824647123, 0.7717296643141512, 0.7624503766286646, 0.7683288375407167, 0.7693425115363736, 0.7659360296778863, 0.7662696796959827, 0.7631417107763301, 0.7724747217698155, 0.7674706783840028, 0.7705668374502352, 0.7663234030039813, 0.7703936504705031, 0.7660625622059356, 0.7651980410785377, 0.7676353827361564, 0.7664902280130295, 0.7671730795331161, 0.7709782448878031, 0.7699143820123054, 0.7678976373054651, 0.7687529689196526, 0.7650368711545422, 0.7654793815598987, 0.7675809525425261, 0.7685260586319218, 0.7678651205664133, 0.7635771523253709, 0.7662315078718784, 0.7643702780039812, 0.7665460719779226, 0.7657628426981542, 0.7685988678519725, 0.771895782437568, 0.7680489108306189, 0.7685698855410785, 0.7671858034744843, 0

In [117]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [118]:
# 38
column_to_drop_37 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [119]:
# Build this round's frame by removing the selected feature from the previous one.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable — confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_37.startswith('Cat_'):
    # Literal prefix test instead of a '^' + name regex, so characters such as
    # '(' or '+' in a column name cannot change which columns match.
    cols_to_drop = [c for c in comp_37.columns if str(c).startswith(column_to_drop_37)]
else:
    cols_to_drop = [column_to_drop_37]

comp_38 = comp_37.drop(columns=cols_to_drop)
X_38 = comp_38.drop(columns='target')
y_38 = comp_38['target']

print(X_38.shape)

(8444, 112)


In [120]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [121]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC computed from the second ``predict_proba`` column.
    """
    # (name, low, high) bounds, sampled in this order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [122]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7833445299413505


In [123]:
optuna_38 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [124]:
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(decimal.Decimal(auc_38).quantize(decimal.Decimal('1.000')))

0.773


In [125]:
X_train = X_train.values
y_train = y_train.values

In [126]:
auc_bootstrap = []

In [127]:
# Seed the notebook-level RNG that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 38)
# bootstrap_auc refits the estimator it is given on every resample, so hand it an
# unfitted copy (same hyperparameters and random_state); optuna_38 then stays the
# model fitted on the training split that produced auc_38.
bootstrap_auc(sklearn.base.clone(optuna_38), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76133514, 0.77293826])

In [128]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9977302551269531, pvalue=0.0058847349137067795),
 0.7673841987027236)

In [129]:
t_38 = auc_bootstrap
print(t_38)

[0.7678820858215708, 0.7677527257509953, 0.7616381650380022, 0.7662350423000364, 0.7706764047231269, 0.7657791010676802, 0.7697708842290988, 0.7702664110568224, 0.7678778445077813, 0.7686108849077089, 0.7723192069308722, 0.7672784054922185, 0.767276284835324, 0.7686561255881289, 0.7701116031035107, 0.770316599936663, 0.7688031577994934, 0.7696888854958379, 0.7648481326909157, 0.7660152008686211, 0.7688194161690193, 0.764905390427072, 0.7703738576728194, 0.7672960776330076, 0.7699850705754615, 0.7732162447973219, 0.7702558077723488, 0.7675965040264205, 0.7672338716974303, 0.7740602662414042, 0.764558309581976, 0.7674968331523706, 0.7693863384455302, 0.7697588671733623, 0.7682532007781397, 0.7630978838671734, 0.7655104845276873, 0.7634074997737965, 0.7675173328356859, 0.7701101893322475, 0.7657282053022078, 0.7650121301574375, 0.7716801823199421, 0.7682454250361925, 0.7646664630836048, 0.7683585267372421, 0.7719000237513572, 0.7624291700597177, 0.7661848534201954, 0.7669574794154904, 0.7

In [130]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [131]:
# 39
column_to_drop_38 = 'Cat_현재 문화시설 접근용이성'

In [132]:
# Build this round's frame by removing the selected feature from the previous one.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable — confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_38.startswith('Cat_'):
    # Literal prefix test instead of a '^' + name regex, so characters such as
    # '(' or '+' in a column name cannot change which columns match.
    cols_to_drop = [c for c in comp_38.columns if str(c).startswith(column_to_drop_38)]
else:
    cols_to_drop = [column_to_drop_38]

comp_39 = comp_38.drop(columns=cols_to_drop)
X_39 = comp_39.drop(columns='target')
y_39 = comp_39['target']

print(X_39.shape)

(8444, 108)


In [133]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [134]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC computed from the second ``predict_proba`` column.
    """
    # (name, low, high) bounds, sampled in this order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [135]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.78480193053094


In [136]:
optuna_39 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [137]:
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(decimal.Decimal(auc_39).quantize(decimal.Decimal('1.000')))

0.771


In [138]:
X_train = X_train.values
y_train = y_train.values

In [139]:
auc_bootstrap = []

In [140]:
# Seed the notebook-level RNG that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 39)
# bootstrap_auc refits the estimator it is given on every resample, so hand it an
# unfitted copy (same hyperparameters and random_state); optuna_39 then stays the
# model fitted on the training split that produced auc_39.
bootstrap_auc(sklearn.base.clone(optuna_39), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76125251, 0.77306999])

In [141]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.999397873878479, pvalue=0.8041389584541321),
 0.767264679187025)

In [142]:
t_39 = auc_bootstrap
print(t_39)

[0.7674134206478465, 0.7597663318856315, 0.768964327723489, 0.7684822317227651, 0.7672204408704306, 0.771619390155628, 0.7663898502533478, 0.7640967132645675, 0.7655267428972132, 0.7682256322385088, 0.7675654010586318, 0.7700296043702498, 0.7599656736337315, 0.7659883392146218, 0.7647194795059717, 0.7699744672909881, 0.7701738090390879, 0.7696761615544697, 0.7731285909790083, 0.765848375859573, 0.7636725818856316, 0.7680764793702497, 0.7751418012576909, 0.7707513346000725, 0.7639440259681507, 0.7730699194715889, 0.7683733713355049, 0.7680425488599348, 0.7723531374411872, 0.7718017666485704, 0.7728246301574375, 0.7668076196615998, 0.7687303485794426, 0.7724549289721317, 0.7690279474303293, 0.7682129082971407, 0.7706799391512849, 0.7633212597267462, 0.7691438766739052, 0.7673088015743756, 0.7714631684310532, 0.7699504331795151, 0.7670783568584871, 0.7673427320846906, 0.768027704261672, 0.7687020731541803, 0.7712221204306913, 0.7710312613101702, 0.7714801336862106, 0.7668429639431777, 0.7

In [143]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [144]:
# 40
column_to_drop_39 = 'Cat_기초생활보장 수급가구 여부'

In [145]:
# Build this round's frame by removing the selected feature from the previous one.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable — confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_39.startswith('Cat_'):
    # Literal prefix test instead of a '^' + name regex, so characters such as
    # '(' or '+' in a column name cannot change which columns match.
    cols_to_drop = [c for c in comp_39.columns if str(c).startswith(column_to_drop_39)]
else:
    cols_to_drop = [column_to_drop_39]

comp_40 = comp_39.drop(columns=cols_to_drop)
X_40 = comp_40.drop(columns='target')
y_40 = comp_40['target']

print(X_40.shape)

(8444, 106)


In [146]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [147]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC computed from the second ``predict_proba`` column.
    """
    # (name, low, high) bounds, sampled in this order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [148]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7822175029501589


In [149]:
optuna_40 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [150]:
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(decimal.Decimal(auc_40).quantize(decimal.Decimal('1.000')))

0.767


In [151]:
X_train = X_train.values
y_train = y_train.values

In [152]:
auc_bootstrap = []

In [153]:
# Seed the notebook-level RNG that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 40)
# bootstrap_auc refits the estimator it is given on every resample, so hand it an
# unfitted copy (same hyperparameters and random_state); optuna_40 then stays the
# model fitted on the training split that produced auc_40.
bootstrap_auc(sklearn.base.clone(optuna_40), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75856931, 0.77039008])

In [154]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9980441927909851, pvalue=0.01616581156849861),
 0.7646352820049539)

In [155]:
t_40 = auc_bootstrap
print(t_40)

[0.7639404915399928, 0.7732275549674266, 0.7676085210821572, 0.7626490114911328, 0.7671207699963807, 0.7615929243575824, 0.7652757984980094, 0.7649131661690192, 0.767465730184582, 0.7608697803564966, 0.7631233317499095, 0.7666485703944987, 0.7670684604596454, 0.7656638787097358, 0.7619760563698876, 0.7633933620611654, 0.7666796733622874, 0.76443319082519, 0.7660130802117264, 0.7634894985070576, 0.7657974800941006, 0.7643434163499818, 0.7669631345005428, 0.7640740929243577, 0.7634767745656895, 0.7670818912866448, 0.7649803203040174, 0.7635382736156353, 0.7635234290173725, 0.7612592743394861, 0.7673865589938473, 0.7666266569399204, 0.7652998326094824, 0.7637892180148389, 0.7667948957202315, 0.7710058134274339, 0.7607863678519724, 0.7662824036373507, 0.762091278727832, 0.7644303632826639, 0.7643731055465075, 0.7629600411690192, 0.7621386400651466, 0.767417661961636, 0.7684836454940283, 0.762828560441549, 0.7623902913499818, 0.7620502793612016, 0.76932271873869, 0.76778665626131, 0.7734481

In [156]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [157]:
# 41.
column_to_drop_40 = 'Cat_현재 주차시설 이용편의성'

In [158]:
# Build this round's frame by removing the selected feature from the previous one.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable — confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_40.startswith('Cat_'):
    # Literal prefix test instead of a '^' + name regex, so characters such as
    # '(' or '+' in a column name cannot change which columns match.
    cols_to_drop = [c for c in comp_40.columns if str(c).startswith(column_to_drop_40)]
else:
    cols_to_drop = [column_to_drop_40]

comp_41 = comp_40.drop(columns=cols_to_drop)
X_41 = comp_41.drop(columns='target')
y_41 = comp_41['target']

print(X_41.shape)

(8444, 102)


In [159]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [160]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC computed from the second ``predict_proba`` column.
    """
    # (name, low, high) bounds, sampled in this order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [161]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7841544424752165


In [162]:
optuna_41 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [163]:
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(decimal.Decimal(auc_41).quantize(decimal.Decimal('1.000')))

0.771


In [164]:
X_train = X_train.values
y_train = y_train.values

In [165]:
auc_bootstrap = []

In [166]:
# Seed the notebook-level RNG that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 41)
# bootstrap_auc refits the estimator it is given on every resample, so hand it an
# unfitted copy (same hyperparameters and random_state); optuna_41 then stays the
# model fitted on the training split that produced auc_41.
bootstrap_auc(sklearn.base.clone(optuna_41), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76058149, 0.77229685])

In [167]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9990143179893494, pvalue=0.34560489654541016),
 0.7665261448719689)

In [168]:
t_41 = auc_bootstrap
print(t_41)

[0.7691191356768006, 0.7632958118440101, 0.7676883991585233, 0.7701794641241404, 0.7667277415852335, 0.7639645256514658, 0.7648269261219689, 0.771005813427434, 0.7683033896579805, 0.7647088762214983, 0.764505293159609, 0.7720972448425624, 0.7691657901284835, 0.766291593150561, 0.7669235489051756, 0.761394289495114, 0.7672048893865364, 0.7694089587857402, 0.7685260586319218, 0.7698210731089395, 0.7661756639069852, 0.7602936685667752, 0.7604817001447703, 0.7669567725298588, 0.7673589904542164, 0.7663806607401376, 0.7671129942544336, 0.7660802343467246, 0.7653380044335868, 0.768585437024973, 0.7669306177614912, 0.7643830019453492, 0.7661933360477742, 0.7645420512124502, 0.7711542594100615, 0.7697058507509953, 0.7675463151465798, 0.7637793216159972, 0.7640719722674629, 0.7673943347357945, 0.7681089961093014, 0.7665835369163952, 0.7651160423452769, 0.7651556279406442, 0.7712143446887441, 0.7718243869887803, 0.7687600377759681, 0.7639383708830981, 0.7696634376131017, 0.7624687556550851, 0.77

In [169]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [170]:
# 42.
column_to_drop_41 = 'Cat_이사 계획 중인 주택의 유형'

In [171]:
# Build this round's frame by removing the selected feature from the previous one.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable — confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_41.startswith('Cat_'):
    # Literal prefix test instead of a '^' + name regex, so characters such as
    # '(' or '+' in a column name cannot change which columns match.
    cols_to_drop = [c for c in comp_41.columns if str(c).startswith(column_to_drop_41)]
else:
    cols_to_drop = [column_to_drop_41]

comp_42 = comp_41.drop(columns=cols_to_drop)
X_42 = comp_42.drop(columns='target')
y_42 = comp_42['target']

print(X_42.shape)

(8444, 83)


In [172]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [173]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC computed from the second ``predict_proba`` column.
    """
    # (name, low, high) bounds, sampled in this order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [174]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.779870634980266


In [175]:
optuna_42 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [176]:
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(decimal.Decimal(auc_42).quantize(decimal.Decimal('1.000')))

0.772


In [177]:
X_train = X_train.values
y_train = y_train.values

In [178]:
auc_bootstrap = []

In [179]:
# Seed the notebook-level RNG that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 42)
# bootstrap_auc refits the estimator it is given on every resample, so hand it an
# unfitted copy (same hyperparameters and random_state); optuna_42 then stays the
# model fitted on the training split that produced auc_42.
bootstrap_auc(sklearn.base.clone(optuna_42), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75984587, 0.77197393])

In [180]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989087581634521, pvalue=0.25591379404067993),
 0.7659034171558314)

In [181]:
t_42 = auc_bootstrap
print(t_42)

[0.7659119955664133, 0.7672734572927977, 0.7662696796959826, 0.7641362988599348, 0.762025538364097, 0.7663644023706118, 0.7678135179153094, 0.7608485737875498, 0.7706460086409699, 0.7639270607129931, 0.7631155560079623, 0.7639334226836771, 0.7669504105591749, 0.7660873032030402, 0.7595797140788998, 0.772888249864278, 0.7656115691730004, 0.7652319715888527, 0.7688597086500181, 0.7680227560622511, 0.7675328843195801, 0.7629819546235974, 0.7688462778230184, 0.7632258301664858, 0.7645929469779226, 0.7602759964259862, 0.7670599778320666, 0.7652475230727469, 0.7623782742942453, 0.7609319862920739, 0.766355919743033, 0.7619322294607311, 0.7669553587585958, 0.7686582462450235, 0.7694902506333695, 0.764409156713717, 0.7670882532573291, 0.7645519476112921, 0.7693891659880565, 0.7649803203040173, 0.7664909348986608, 0.7667475343829171, 0.7695913352786825, 0.7660865963174086, 0.7643830019453492, 0.7691177219055374, 0.7645604302388708, 0.7690003788906986, 0.7680814275696707, 0.7688427433948607, 0.7

In [182]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [183]:
# 43.
column_to_drop_42 = 'Cat_이사 예상 기간'

In [184]:
# Build this round's frame by removing the selected feature from the previous one.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable — confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_42.startswith('Cat_'):
    # Literal prefix test instead of a '^' + name regex, so characters such as
    # '(' or '+' in a column name cannot change which columns match.
    cols_to_drop = [c for c in comp_42.columns if str(c).startswith(column_to_drop_42)]
else:
    cols_to_drop = [column_to_drop_42]

comp_43 = comp_42.drop(columns=cols_to_drop)
X_43 = comp_43.drop(columns='target')
y_43 = comp_43['target']

print(X_43.shape)

(8444, 79)


In [185]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [186]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC computed from the second ``predict_proba`` column.
    """
    # (name, low, high) bounds, sampled in this order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [187]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7799512947551257


In [188]:
optuna_43 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [189]:
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(decimal.Decimal(auc_43).quantize(decimal.Decimal('1.000')))

0.773


In [190]:
X_train = X_train.values
y_train = y_train.values

In [191]:
auc_bootstrap = []

In [192]:
# Seed the notebook-level RNG that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 43)
# bootstrap_auc refits the estimator it is given on every resample, so hand it an
# unfitted copy (same hyperparameters and random_state); optuna_43 then stays the
# model fitted on the training split that produced auc_43.
bootstrap_auc(sklearn.base.clone(optuna_43), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76163954, 0.77255462])

In [193]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9993579983711243, pvalue=0.7559360861778259),
 0.7671692383165942)

In [194]:
t_43 = auc_bootstrap
print(t_43)

[0.7655352255247919, 0.7683203549131379, 0.7654298995656894, 0.7654765540173725, 0.7691997206387984, 0.7661622330799854, 0.7607729370249728, 0.772139657980456, 0.7628009919019182, 0.7646947385088672, 0.7694365273253709, 0.7663757125407167, 0.7667623789811799, 0.7633276216974303, 0.7681945292707203, 0.7654136411961635, 0.7659522880474121, 0.7714801336862105, 0.7665672785468693, 0.7684179051302932, 0.7690936877940644, 0.7701596713264567, 0.7625147032211363, 0.7636153241494751, 0.766002476927253, 0.770197843150561, 0.7682404768367717, 0.7649110455121245, 0.7610203469960188, 0.7716950269182048, 0.7712362581433225, 0.7609206761219689, 0.7667355173271806, 0.768291372602244, 0.7662435249276149, 0.7640104732175172, 0.7672260959554832, 0.7663368338309807, 0.7642246595638797, 0.7694598545512125, 0.7704176845819761, 0.7649972855591749, 0.7644996380745566, 0.7666471566232356, 0.7720060565960911, 0.7653535559174809, 0.7667814648932321, 0.7675067295512126, 0.7621435882645675, 0.7637164087947883, 0.7

In [195]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [196]:
## 44
column_to_drop_43 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [197]:
# Build this round's frame by removing the selected feature from the previous one.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable — confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_43.startswith('Cat_'):
    # Literal prefix test instead of a '^' + name regex, so characters such as
    # '(' or '+' in a column name cannot change which columns match.
    cols_to_drop = [c for c in comp_43.columns if str(c).startswith(column_to_drop_43)]
else:
    cols_to_drop = [column_to_drop_43]

comp_44 = comp_43.drop(columns=cols_to_drop)
X_44 = comp_44.drop(columns='target')
y_44 = comp_44['target']

print(X_44.shape)

(8444, 75)


In [198]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [199]:
def objective(trial):
    """Score one Optuna trial: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC computed from the second ``predict_proba`` column.
    """
    # (name, low, high) bounds, sampled in this order.
    int_ranges = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in int_ranges}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [200]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 88, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7807236397226187


In [201]:
optuna_44 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [202]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(decimal.Decimal(auc_44).quantize(decimal.Decimal('1.000')))

0.772


In [203]:
X_train = X_train.values
y_train = y_train.values

In [204]:
auc_bootstrap = []

In [205]:
# Seed the notebook-level RNG that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 44)
# bootstrap_auc refits the estimator it is given on every resample, so hand it an
# unfitted copy (same hyperparameters and random_state); optuna_44 then stays the
# model fitted on the training split that produced auc_44.
bootstrap_auc(sklearn.base.clone(optuna_44), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76137937, 0.77284732])

In [206]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.998693585395813, pvalue=0.13266333937644958),
 0.7672567850417344)

In [207]:
t_44 = auc_bootstrap
print(t_44)

[0.7661721294788273, 0.7660258041530945, 0.7710708469055375, 0.7658745306279406, 0.7669348590752805, 0.7697122127216793, 0.7716808892055737, 0.7706976112920738, 0.7689424142689105, 0.7661996980184581, 0.7683535785378212, 0.7674629026420557, 0.7612677569670648, 0.7697404881469417, 0.7629480241132827, 0.7674424029587404, 0.7649817340752805, 0.7672628540083244, 0.7614020652370612, 0.7716823029768367, 0.7679464124140427, 0.7690152234889613, 0.7619499016015201, 0.7668344813155991, 0.7661869740770901, 0.767228216612378, 0.771301291621426, 0.7653931415128483, 0.768639867218603, 0.7694351135541079, 0.7658208073199421, 0.7700649486518277, 0.7696415241585233, 0.7654030379116902, 0.7665701060893956, 0.7608916938110749, 0.7645321548136085, 0.7702854969688745, 0.7640408692996743, 0.7627069761129207, 0.7728529055826999, 0.7674353341024249, 0.7643483645494028, 0.7689056562160695, 0.7653224529496923, 0.7651315938291713, 0.7643674504614549, 0.7679237920738327, 0.767766156577995, 0.7681280820213536, 0.7

In [208]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [209]:
# 45
column_to_drop_44 = '가구주 나이'

In [210]:
# Build this round's frame by removing the selected feature from the previous one.
# A name starting with 'Cat_' is treated as the shared prefix of a column group
# (presumably the dummy columns of one categorical variable — confirm against the
# encoding step); any other name is dropped as a single column.
if column_to_drop_44.startswith('Cat_'):
    # Literal prefix test instead of a '^' + name regex, so characters such as
    # '(' or '+' in a column name cannot change which columns match.
    cols_to_drop = [c for c in comp_44.columns if str(c).startswith(column_to_drop_44)]
else:
    cols_to_drop = [column_to_drop_44]

comp_45 = comp_44.drop(columns=cols_to_drop)
X_45 = comp_45.drop(columns='target')
y_45 = comp_45['target']

print(X_45.shape)

(8444, 74)


In [211]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [212]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_int(name, low, high)`` (an Optuna trial).

    Returns
    -------
    The ROC AUC on the validation split.
    """
    # (name, low, high) ranges; suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# objective() returns a validation ROC AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed seed for the TPE sampler.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# HyperbandPruner passed here looks like it is never consulted -- confirm it is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is `early_stopping_rounds`; presumably the search stops after 10 trials without
# improvement -- the callback body is defined earlier in the notebook, verify there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials are requested.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [213]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.778611016578346


In [214]:
optuna_45 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [215]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(decimal.Decimal(auc_45).quantize(decimal.Decimal('1.000')))

0.771


In [216]:
# bootstrap_auc() indexes positionally (X_train[idx]), which needs plain NumPy arrays.
# np.asarray() leaves an existing array untouched, so re-running this cell is safe;
# `.values` raised AttributeError on a second run because the names already held ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [217]:
auc_bootstrap = []

In [218]:
# bootstrap_auc() reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`, so both are (re)initialised here: re-running this cell on its own
# can no longer pile a second batch of AUCs onto the previous one.
rs = RandomState(seed=45)
auc_bootstrap = []
# NOTE(review): bootstrap_auc() calls clf.fit() on every resample, so `optuna_45` is left
# fitted on the last bootstrap sample rather than on the full training split.
# The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76002184, 0.77108924])

In [219]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9983018636703491, pvalue=0.037464383989572525),
 0.7656325693313428)

In [220]:
t_45 = auc_bootstrap
print(t_45)

[0.7649004422276511, 0.7654723127035831, 0.761418323606587, 0.7674530062432139, 0.763424465028954, 0.7652461093014838, 0.765461002533478, 0.7689113113011219, 0.7645604302388708, 0.7644586387079262, 0.7694648027506334, 0.7658957371968875, 0.7636280480908433, 0.7583172163409337, 0.7662279734437205, 0.7679478261853059, 0.7635651352696344, 0.7682588558631922, 0.7661346645403547, 0.7667885337495476, 0.7701434129569309, 0.7613292560170105, 0.7631077802660151, 0.7698041078537821, 0.7665524339486066, 0.7671320801664857, 0.7681634263029317, 0.7629847821661238, 0.762555702587767, 0.7654475717064786, 0.769659903184944, 0.763678236970684, 0.7639235262848353, 0.77001617354325, 0.7661806121064061, 0.7642430385903004, 0.7641278162323561, 0.7618947645222585, 0.762563478329714, 0.7636018933224755, 0.7677590877216792, 0.7684313359572927, 0.7639744220503076, 0.7628059401013392, 0.7626143740951865, 0.7641907290535649, 0.7664032810803474, 0.7610203469960188, 0.7636527890879479, 0.7661078028863554, 0.764168

In [221]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [222]:
# 46.
column_to_drop_45 = 'Cat_이사 계획 중인 주택의 점유형태'

In [223]:
# Elimination step 46: remove the feature named in `column_to_drop_45` from `comp_45`.
# A 'Cat_' name is treated as a prefix shared by several columns (presumably the
# one-hot dummies of one categorical variable -- confirm against the encoding step).
if column_to_drop_45.startswith('Cat_'):
    # Plain prefix comparison instead of a regex built from the raw name, so regex
    # metacharacters in a feature name (e.g. parentheses) cannot alter the match.
    cols_to_drop_46 = [c for c in comp_45.columns if str(c).startswith(column_to_drop_45)]
else:
    cols_to_drop_46 = [column_to_drop_45]

# Built once for both cases (the original repeated these lines in each branch).
comp_46 = comp_45.drop(columns=cols_to_drop_46)
X_46 = comp_46.drop(columns='target')
y_46 = comp_46['target']

print(X_46.shape)

(8444, 50)


In [224]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [225]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_int(name, low, high)`` (an Optuna trial).

    Returns
    -------
    The ROC AUC on the validation split.
    """
    # (name, low, high) ranges; suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# objective() returns a validation ROC AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed seed for the TPE sampler.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# HyperbandPruner passed here looks like it is never consulted -- confirm it is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is `early_stopping_rounds`; presumably the search stops after 10 trials without
# improvement -- the callback body is defined earlier in the notebook, verify there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials are requested.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [226]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 90, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7806949115836276


In [227]:
optuna_46 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [228]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(decimal.Decimal(auc_46).quantize(decimal.Decimal('1.000')))

0.772


In [229]:
# bootstrap_auc() indexes positionally (X_train[idx]), which needs plain NumPy arrays.
# np.asarray() leaves an existing array untouched, so re-running this cell is safe;
# `.values` raised AttributeError on a second run because the names already held ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [230]:
auc_bootstrap = []

In [231]:
# bootstrap_auc() reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`, so both are (re)initialised here: re-running this cell on its own
# can no longer pile a second batch of AUCs onto the previous one.
rs = RandomState(seed=46)
auc_bootstrap = []
# NOTE(review): bootstrap_auc() calls clf.fit() on every resample, so `optuna_46` is left
# fitted on the last bootstrap sample rather than on the full training split.
# The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75986862, 0.77204142])

In [232]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9987510442733765, pvalue=0.1587863266468048),
 0.7661246568070259)

In [233]:
t_46 = auc_bootstrap
print(t_46)

[0.7624793589395584, 0.7713472391874774, 0.76978502194173, 0.7637383222493666, 0.7686412809898661, 0.7634746539087949, 0.7650814049493304, 0.7679570156985162, 0.7667616720955484, 0.7635128257328989, 0.761440943946797, 0.7622764827633008, 0.7652022823923272, 0.7694351135541078, 0.7685105071480275, 0.7699511400651466, 0.7681867535287731, 0.7683012690010858, 0.7684963694353963, 0.76460991223308, 0.7700465696254072, 0.7664322633912415, 0.7686547118168657, 0.7656829646217879, 0.7599805182319942, 0.7649336658523345, 0.7692400131197974, 0.7642875723850887, 0.7634605161961636, 0.7684702146670286, 0.7661396127397756, 0.7672883018910606, 0.7651937997647484, 0.76608730320304, 0.7642925205845097, 0.7740228013029317, 0.7698613655899385, 0.7672543713807456, 0.7662152495023526, 0.7704537357491857, 0.7650495950959103, 0.768599574737604, 0.7681273751357219, 0.7679393435577272, 0.766733396670286, 0.7655154327271082, 0.7637340809355773, 0.7657112400470504, 0.76953125, 0.7661360783116179, 0.76481632283749

In [234]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [235]:
# 47.
column_to_drop_46 = '소득 중 근로/사업소득의 비중(월평균)'

In [236]:
# Elimination step 47: remove the feature named in `column_to_drop_46` from `comp_46`.
# A 'Cat_' name is treated as a prefix shared by several columns (presumably the
# one-hot dummies of one categorical variable -- confirm against the encoding step).
if column_to_drop_46.startswith('Cat_'):
    # Plain prefix comparison instead of a regex built from the raw name, so regex
    # metacharacters in a feature name (e.g. parentheses) cannot alter the match.
    cols_to_drop_47 = [c for c in comp_46.columns if str(c).startswith(column_to_drop_46)]
else:
    cols_to_drop_47 = [column_to_drop_46]

# Built once for both cases (the original repeated these lines in each branch).
comp_47 = comp_46.drop(columns=cols_to_drop_47)
X_47 = comp_47.drop(columns='target')
y_47 = comp_47['target']

print(X_47.shape)

(8444, 49)


In [237]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [238]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_int(name, low, high)`` (an Optuna trial).

    Returns
    -------
    The ROC AUC on the validation split.
    """
    # (name, low, high) ranges; suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# objective() returns a validation ROC AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed seed for the TPE sampler.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# HyperbandPruner passed here looks like it is never consulted -- confirm it is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is `early_stopping_rounds`; presumably the search stops after 10 trials without
# improvement -- the callback body is defined earlier in the notebook, verify there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials are requested.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [239]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7800827812374315


In [240]:
optuna_47 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [241]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(decimal.Decimal(auc_47).quantize(decimal.Decimal('1.000')))

0.773


In [242]:
# bootstrap_auc() indexes positionally (X_train[idx]), which needs plain NumPy arrays.
# np.asarray() leaves an existing array untouched, so re-running this cell is safe;
# `.values` raised AttributeError on a second run because the names already held ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [243]:
auc_bootstrap = []

In [244]:
# bootstrap_auc() reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`, so both are (re)initialised here: re-running this cell on its own
# can no longer pile a second batch of AUCs onto the previous one.
rs = RandomState(seed=47)
auc_bootstrap = []
# NOTE(review): bootstrap_auc() calls clf.fit() on every resample, so `optuna_47` is left
# fitted on the last bootstrap sample rather than on the full training split.
# The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(optuna_47, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76019608, 0.77250013])

In [245]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9990491271018982, pvalue=0.37988534569740295),
 0.7664415334894136)

In [246]:
t_47 = auc_bootstrap
print(t_47)

[0.768533834373869, 0.765289229325009, 0.7693750282754253, 0.7699631571208831, 0.7693142361111112, 0.7680333593467246, 0.7679867048950415, 0.7639376639974665, 0.7629805408523344, 0.7692520301755339, 0.7642911068132464, 0.7661756639069852, 0.7620997613554108, 0.7665135552388709, 0.7707859719960188, 0.7735880666395222, 0.7687911407437568, 0.7657656702406803, 0.7668090334328628, 0.7695149916304741, 0.766079527461093, 0.767578125, 0.7704014262124502, 0.7676643650470503, 0.7680079114639885, 0.7671667175624322, 0.7678241211997828, 0.7604470627488238, 0.7700847414495113, 0.7642416248190373, 0.7660653897484618, 0.7687586240047051, 0.7686582462450235, 0.7703491166757148, 0.7684051811889251, 0.7659635982175172, 0.7659833910152009, 0.7679909462088309, 0.7667977232627579, 0.7697129196073109, 0.760674679922186, 0.766622415626131, 0.7649923373597539, 0.7680658760857763, 0.766211008188563, 0.7730564886445892, 0.7688165886264929, 0.7639291813698879, 0.7697821943992038, 0.7687798305736518, 0.7703363927

In [247]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [248]:
# 48
column_to_drop_47 = '부채 중 금융기관 대출금의 비중'

In [249]:
# Elimination step 48: remove the feature named in `column_to_drop_47` from `comp_47`.
# A 'Cat_' name is treated as a prefix shared by several columns (presumably the
# one-hot dummies of one categorical variable -- confirm against the encoding step).
if column_to_drop_47.startswith('Cat_'):
    # Plain prefix comparison instead of a regex built from the raw name, so regex
    # metacharacters in a feature name (e.g. parentheses) cannot alter the match.
    cols_to_drop_48 = [c for c in comp_47.columns if str(c).startswith(column_to_drop_47)]
else:
    cols_to_drop_48 = [column_to_drop_47]

# Built once for both cases (the original repeated these lines in each branch).
comp_48 = comp_47.drop(columns=cols_to_drop_48)
X_48 = comp_48.drop(columns='target')
y_48 = comp_48['target']

print(X_48.shape)

(8444, 48)


In [250]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [251]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_int(name, low, high)`` (an Optuna trial).

    Returns
    -------
    The ROC AUC on the validation split.
    """
    # (name, low, high) ranges; suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# objective() returns a validation ROC AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed seed for the TPE sampler.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# HyperbandPruner passed here looks like it is never consulted -- confirm it is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is `early_stopping_rounds`; presumably the search stops after 10 trials without
# improvement -- the callback body is defined earlier in the notebook, verify there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials are requested.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [252]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7785347765171772


In [253]:
optuna_48 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [254]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(decimal.Decimal(auc_48).quantize(decimal.Decimal('1.000')))

0.773


In [255]:
# bootstrap_auc() indexes positionally (X_train[idx]), which needs plain NumPy arrays.
# np.asarray() leaves an existing array untouched, so re-running this cell is safe;
# `.values` raised AttributeError on a second run because the names already held ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [256]:
auc_bootstrap = []

In [257]:
# bootstrap_auc() reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`, so both are (re)initialised here: re-running this cell on its own
# can no longer pile a second batch of AUCs onto the previous one.
rs = RandomState(seed=48)
auc_bootstrap = []
# NOTE(review): bootstrap_auc() calls clf.fit() on every resample, so `optuna_48` is left
# fitted on the last bootstrap sample rather than on the full training split.
# The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(optuna_48, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76151372, 0.7731293 ])

In [258]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9981371164321899, pvalue=0.021877408027648926),
 0.7674669538036102)

In [259]:
t_48 = auc_bootstrap
print(t_48)

[0.7637305465074194, 0.7618459894136808, 0.769926399068042, 0.767051495204488, 0.7625048068222945, 0.7695595254252624, 0.7710849846181687, 0.7637545806188926, 0.768397405446978, 0.7703978917842924, 0.7643964327723489, 0.7663347131740862, 0.7697277642055738, 0.7661820258776691, 0.7673215255157438, 0.7671971136445892, 0.7697546258595731, 0.7642889861563518, 0.7673158704306913, 0.7649336658523345, 0.7755341227832067, 0.7647343241042345, 0.7646268774882374, 0.7672451818675352, 0.767825534971046, 0.766935565960912, 0.766850739685125, 0.7689233283568585, 0.7694959057184221, 0.7671561142779587, 0.7660689241766195, 0.7713804628121607, 0.7735166711907348, 0.7702791349981903, 0.7672748710640609, 0.7631268661780674, 0.767030288635541, 0.7671575280492219, 0.7719332473760405, 0.7637390291349982, 0.7710270199963807, 0.7660279248099892, 0.7651598692544336, 0.7647823923271806, 0.7693969417300037, 0.769732005519363, 0.7718710414404634, 0.7679676189829895, 0.7638804062613102, 0.7678135179153094, 0.77140

In [260]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [261]:
# 49
column_to_drop_48 = 'Cat_가구주 최종 학력'

In [262]:
# Elimination step 49: remove the feature named in `column_to_drop_48` from `comp_48`.
# A 'Cat_' name is treated as a prefix shared by several columns (presumably the
# one-hot dummies of one categorical variable -- confirm against the encoding step).
if column_to_drop_48.startswith('Cat_'):
    # Plain prefix comparison instead of a regex built from the raw name, so regex
    # metacharacters in a feature name (e.g. parentheses) cannot alter the match.
    cols_to_drop_49 = [c for c in comp_48.columns if str(c).startswith(column_to_drop_48)]
else:
    cols_to_drop_49 = [column_to_drop_48]

# Built once for both cases (the original repeated these lines in each branch).
comp_49 = comp_48.drop(columns=cols_to_drop_49)
X_49 = comp_49.drop(columns='target')
y_49 = comp_49['target']

print(X_49.shape)

(8444, 45)


In [263]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [264]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_int(name, low, high)`` (an Optuna trial).

    Returns
    -------
    The ROC AUC on the validation split.
    """
    # (name, low, high) ranges; suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# objective() returns a validation ROC AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed seed for the TPE sampler.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# HyperbandPruner passed here looks like it is never consulted -- confirm it is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is `early_stopping_rounds`; presumably the search stops after 10 trials without
# improvement -- the callback body is defined earlier in the notebook, verify there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials are requested.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [265]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 5}
0.7796209211567273


In [266]:
optuna_49 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)

In [267]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(decimal.Decimal(auc_49).quantize(decimal.Decimal('1.000')))

0.772


In [268]:
# bootstrap_auc() indexes positionally (X_train[idx]), which needs plain NumPy arrays.
# np.asarray() leaves an existing array untouched, so re-running this cell is safe;
# `.values` raised AttributeError on a second run because the names already held ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [269]:
auc_bootstrap = []

In [270]:
# bootstrap_auc() reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`, so both are (re)initialised here: re-running this cell on its own
# can no longer pile a second batch of AUCs onto the previous one.
rs = RandomState(seed=49)
auc_bootstrap = []
# NOTE(review): bootstrap_auc() calls clf.fit() on every resample, so `optuna_49` is left
# fitted on the last bootstrap sample rather than on the full training split.
# The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(optuna_49, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76168305, 0.77381784])

In [271]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9994615912437439, pvalue=0.8730489611625671),
 0.7678032882198922)

In [272]:
t_49 = auc_bootstrap
print(t_49)

[0.7664789178429243, 0.7683846815056099, 0.7677795874049946, 0.7718851791530945, 0.7666316051393414, 0.7743140381831342, 0.772669822204126, 0.7731575732899022, 0.7652899362106406, 0.7714002556098443, 0.7685133346905537, 0.7665043657256605, 0.7657013436482085, 0.7737244955664133, 0.7610740703040174, 0.7706679220955484, 0.7704968557727108, 0.7702480320304017, 0.7687027800398119, 0.7622432591386175, 0.769190531125588, 0.767892689106044, 0.7667206727289178, 0.7635863418385812, 0.7743903818313428, 0.7679449986427795, 0.7670133233803836, 0.7685599891422367, 0.7641787119978285, 0.7718993168657255, 0.768774882374231, 0.7636428926891059, 0.7697065576366269, 0.76734697339848, 0.7725341001628664, 0.7705279587404994, 0.7663714712269272, 0.7661579917661961, 0.7702494458016649, 0.7648106677524429, 0.7674544200144771, 0.7709365386355411, 0.7683606473941367, 0.7701914811798769, 0.7661735432500905, 0.7714978058269998, 0.7623690847810352, 0.7667418792978646, 0.7717946977922548, 0.7741627646579804, 0.767

In [273]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [274]:
# 50
column_to_drop_49 = 'Cat_현재 주택의 점유형태'

In [275]:
# Elimination step 50: remove the feature named in `column_to_drop_49` from `comp_49`.
# A 'Cat_' name is treated as a prefix shared by several columns (presumably the
# one-hot dummies of one categorical variable -- confirm against the encoding step).
if column_to_drop_49.startswith('Cat_'):
    # Plain prefix comparison instead of a regex built from the raw name, so regex
    # metacharacters in a feature name (e.g. parentheses) cannot alter the match.
    cols_to_drop_50 = [c for c in comp_49.columns if str(c).startswith(column_to_drop_49)]
else:
    cols_to_drop_50 = [column_to_drop_49]

# Built once for both cases (the original repeated these lines in each branch).
comp_50 = comp_49.drop(columns=cols_to_drop_50)
X_50 = comp_50.drop(columns='target')
y_50 = comp_50['target']

print(X_50.shape)

(8444, 41)


In [276]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [277]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_int(name, low, high)`` (an Optuna trial).

    Returns
    -------
    The ROC AUC on the validation split.
    """
    # (name, low, high) ranges; suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# objective() returns a validation ROC AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed seed for the TPE sampler.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# HyperbandPruner passed here looks like it is never consulted -- confirm it is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is `early_stopping_rounds`; presumably the search stops after 10 trials without
# improvement -- the callback body is defined earlier in the notebook, verify there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials are requested.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [278]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.780517018107567


In [279]:
optuna_50 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [280]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(decimal.Decimal(auc_50).quantize(decimal.Decimal('1.000')))

0.770


In [281]:
# bootstrap_auc() indexes positionally (X_train[idx]), which needs plain NumPy arrays.
# np.asarray() leaves an existing array untouched, so re-running this cell is safe;
# `.values` raised AttributeError on a second run because the names already held ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [282]:
auc_bootstrap = []

In [283]:
# bootstrap_auc() reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`, so both are (re)initialised here: re-running this cell on its own
# can no longer pile a second batch of AUCs onto the previous one.
rs = RandomState(seed=50)
auc_bootstrap = []
# NOTE(review): bootstrap_auc() calls clf.fit() on every resample, so `optuna_50` is left
# fitted on the last bootstrap sample rather than on the full training split.
# The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(optuna_50, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75755823, 0.77078191])

In [284]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9985970854759216, pvalue=0.09763243049383163),
 0.7644229837501131)

In [285]:
t_50 = auc_bootstrap
print(t_50)

[0.765949460504886, 0.7612670500814331, 0.7671582349348535, 0.7639913873054651, 0.76947681980637, 0.7664117637079262, 0.762857542752443, 0.7610761909609121, 0.7652369197882736, 0.7616784575190012, 0.7594489402370611, 0.7642458661328266, 0.7649032697701773, 0.7664259014205574, 0.7680800137984075, 0.7683627680510314, 0.7636761163137894, 0.7632039167119075, 0.7691629625859573, 0.7636916677976837, 0.7590403603420195, 0.7638358724665218, 0.7661473884817228, 0.7618990058360477, 0.7554408987061165, 0.7717303711997828, 0.7646254637169743, 0.7682722866901918, 0.7629437827994933, 0.7649329589667029, 0.7649499242218604, 0.7713550149294245, 0.7619060746923634, 0.7679966012938835, 0.7609249174357582, 0.7575756933134273, 0.7669419279315962, 0.7650460606677524, 0.7698274350796235, 0.7636761163137894, 0.7690314818584871, 0.7608556426438653, 0.7654673645041621, 0.7569621165852334, 0.766198284247195, 0.7708538330166487, 0.7657812217245747, 0.7615144600524791, 0.7700069840300398, 0.7629126798317046, 0.76

In [286]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [287]:
# 51
column_to_drop_50 = 'Cat_가구주 종사상 지위'

In [288]:
# Elimination step 51: remove the feature named in `column_to_drop_50` from `comp_50`.
# A 'Cat_' name is treated as a prefix shared by several columns (presumably the
# one-hot dummies of one categorical variable -- confirm against the encoding step).
if column_to_drop_50.startswith('Cat_'):
    # Plain prefix comparison instead of a regex built from the raw name, so regex
    # metacharacters in a feature name (e.g. parentheses) cannot alter the match.
    cols_to_drop_51 = [c for c in comp_50.columns if str(c).startswith(column_to_drop_50)]
else:
    cols_to_drop_51 = [column_to_drop_50]

# Built once for both cases (the original repeated these lines in each branch).
comp_51 = comp_50.drop(columns=cols_to_drop_51)
X_51 = comp_51.drop(columns='target')
y_51 = comp_51['target']

print(X_51.shape)

(8444, 36)


In [289]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [290]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_int(name, low, high)`` (an Optuna trial).

    Returns
    -------
    The ROC AUC on the validation split.
    """
    # (name, low, high) ranges; suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# objective() returns a validation ROC AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed seed for the TPE sampler.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# HyperbandPruner passed here looks like it is never consulted -- confirm it is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is `early_stopping_rounds`; presumably the search stops after 10 trials without
# improvement -- the callback body is defined earlier in the notebook, verify there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials are requested.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [291]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 72, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 5}
0.7825014695548022


In [292]:
optuna_51 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [293]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(decimal.Decimal(auc_51).quantize(decimal.Decimal('1.000')))

0.767


In [294]:
# bootstrap_auc() indexes positionally (X_train[idx]), which needs plain NumPy arrays.
# np.asarray() leaves an existing array untouched, so re-running this cell is safe;
# `.values` raised AttributeError on a second run because the names already held ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [295]:
auc_bootstrap = []

In [296]:
# bootstrap_auc() reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`, so both are (re)initialised here: re-running this cell on its own
# can no longer pile a second batch of AUCs onto the previous one.
rs = RandomState(seed=51)
auc_bootstrap = []
# NOTE(review): bootstrap_auc() calls clf.fit() on every resample, so `optuna_51` is left
# fitted on the last bootstrap sample rather than on the full training split.
# The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(optuna_51, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75796349, 0.76898497])

In [297]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9979827404022217, pvalue=0.013243099674582481),
 0.7636182506559899)

In [298]:
t_51 = auc_bootstrap
print(t_51)

[0.7666591736789722, 0.758185028727832, 0.7659579431324646, 0.7573367659699601, 0.7619301088038363, 0.766222318358668, 0.7629381277144409, 0.7639489741675716, 0.7641539710007239, 0.7658363588038363, 0.76148759839848, 0.7653662798588491, 0.761145465752805, 0.7643596747195078, 0.7676488135631561, 0.7616883539178428, 0.7628674391512847, 0.761558286961636, 0.7672345785830619, 0.7583659914495113, 0.7677152608125226, 0.7649131661690192, 0.7605658195349259, 0.7672557851520087, 0.7615455630202678, 0.7596058688472674, 0.7690788431958018, 0.7607114379750272, 0.7622778965345639, 0.7627239413680782, 0.7673483871697431, 0.7658406001176258, 0.764296761898299, 0.7663304718602967, 0.7637849767010496, 0.7627953368168657, 0.7675229879207384, 0.7642861586138255, 0.7610549843919653, 0.7644409665671372, 0.7677096057274703, 0.762555702587767, 0.7667326897846545, 0.7645257928429243, 0.7635856349529497, 0.7585384715436121, 0.7569465651013391, 0.7606676110658703, 0.7619385914314152, 0.7611822238056462, 0.76367

In [299]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [300]:
# 52
column_to_drop_51 = 'Cat_현재 거주 지역'

In [301]:
# Elimination step 52: remove the feature named in `column_to_drop_51` from `comp_51`.
# A 'Cat_' name is treated as a prefix shared by several columns (presumably the
# one-hot dummies of one categorical variable -- confirm against the encoding step).
if column_to_drop_51.startswith('Cat_'):
    # Plain prefix comparison instead of a regex built from the raw name, so regex
    # metacharacters in a feature name (e.g. parentheses) cannot alter the match.
    cols_to_drop_52 = [c for c in comp_51.columns if str(c).startswith(column_to_drop_51)]
else:
    cols_to_drop_52 = [column_to_drop_51]

# Built once for both cases (the original repeated these lines in each branch).
comp_52 = comp_51.drop(columns=cols_to_drop_52)
X_52 = comp_52.drop(columns='target')
y_52 = comp_52['target']

print(X_52.shape)

(8444, 19)


In [302]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [303]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_int(name, low, high)`` (an Optuna trial).

    Returns
    -------
    The ROC AUC on the validation split.
    """
    # (name, low, high) ranges; suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# objective() returns a validation ROC AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed seed for the TPE sampler.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# HyperbandPruner passed here looks like it is never consulted -- confirm it is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is `early_stopping_rounds`; presumably the search stops after 10 trials without
# improvement -- the callback body is defined earlier in the notebook, verify there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials are requested.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [304]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 175, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 7}
0.7694379008127854


In [305]:
optuna_52 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [306]:
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(decimal.Decimal(auc_52).quantize(decimal.Decimal('1.000')))

0.759


In [307]:
# bootstrap_auc() indexes positionally (X_train[idx]), which needs plain NumPy arrays.
# np.asarray() leaves an existing array untouched, so re-running this cell is safe;
# `.values` raised AttributeError on a second run because the names already held ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [308]:
auc_bootstrap = []

In [309]:
# bootstrap_auc() reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`, so both are (re)initialised here: re-running this cell on its own
# can no longer pile a second batch of AUCs onto the previous one.
rs = RandomState(seed=52)
auc_bootstrap = []
# NOTE(review): bootstrap_auc() calls clf.fit() on every resample, so `optuna_52` is left
# fitted on the last bootstrap sample rather than on the full training split.
# The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74914188, 0.75884989])

In [310]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9952903985977173, pvalue=6.051204309187597e-06),
 0.7542389464293792)

In [311]:
t_52 = auc_bootstrap
print(t_52)

[0.7566694659337676, 0.7527547333061889, 0.7538122342110025, 0.7519630213988417, 0.7484653512938835, 0.7545205336138257, 0.7539507837947883, 0.7549234584238147, 0.7539507837947883, 0.7527462506786101, 0.7550931109753891, 0.75515248936844, 0.7583716465345638, 0.7577411045512124, 0.7562905752352516, 0.7553433484889612, 0.753994610703945, 0.7551369378845458, 0.7550634217788635, 0.7576463818765833, 0.7510058982537096, 0.7576039687386898, 0.7569720129840752, 0.7555200698968514, 0.7534022405446978, 0.7548032878664495, 0.7515148559084328, 0.7529144894589215, 0.7542632272439377, 0.7508645211273979, 0.7533881028320666, 0.7553094179786465, 0.7544781204759319, 0.7544555001357219, 0.755788686436844, 0.7549757679605502, 0.7527943189015561, 0.7553588999728555, 0.7576265890788998, 0.755186419878755, 0.754502154587405, 0.7531887610839667, 0.7553221419200143, 0.7551949025063337, 0.7502481168566776, 0.7559088569942092, 0.7507570745114006, 0.7539111981994209, 0.7504008041530944, 0.7549743541892872, 0.756

In [312]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [313]:
# 53
column_to_drop_52 = 'Cat_주택 보유 의식'

In [314]:
# Elimination step 53: remove the feature named in `column_to_drop_52` from `comp_52`.
# A 'Cat_' name is treated as a prefix shared by several columns (presumably the
# one-hot dummies of one categorical variable -- confirm against the encoding step).
if column_to_drop_52.startswith('Cat_'):
    # Plain prefix comparison instead of a regex built from the raw name, so regex
    # metacharacters in a feature name (e.g. parentheses) cannot alter the match.
    cols_to_drop_53 = [c for c in comp_52.columns if str(c).startswith(column_to_drop_52)]
else:
    cols_to_drop_53 = [column_to_drop_52]

# Built once for both cases (the original repeated these lines in each branch).
comp_53 = comp_52.drop(columns=cols_to_drop_53)
X_53 = comp_53.drop(columns='target')
y_53 = comp_53['target']

print(X_53.shape)

(8444, 17)


In [315]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [316]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier.

    Draws four integer hyperparameters from ``trial``, fits the classifier on the
    notebook-level ``X_train``/``y_train`` and scores the second column of
    ``predict_proba`` on ``X_val`` against ``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_int(name, low, high)`` (an Optuna trial).

    Returns
    -------
    The ROC AUC on the validation split.
    """
    # (name, low, high) ranges; suggested in exactly this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {name: trial.suggest_int(name, low, high) for name, low, high in search_space}

    model = ExtraTreesClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# objective() returns a validation ROC AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed seed for the TPE sampler.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so the
# HyperbandPruner passed here looks like it is never consulted -- confirm it is intended.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is `early_stopping_rounds`; presumably the search stops after 10 trials without
# improvement -- the callback body is defined earlier in the notebook, verify there.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials are requested.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [317]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 105, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 6}
0.7648204049341684


In [318]:
optuna_53 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [319]:
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(decimal.Decimal(auc_53).quantize(decimal.Decimal('1.000')))

0.754


In [320]:
# bootstrap_auc() indexes positionally (X_train[idx]), which needs plain NumPy arrays.
# np.asarray() leaves an existing array untouched, so re-running this cell is safe;
# `.values` raised AttributeError on a second run because the names already held ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [321]:
auc_bootstrap = []

In [322]:
# bootstrap_auc() reads the notebook-level `rs` and appends to the notebook-level
# `auc_bootstrap`, so both are (re)initialised here: re-running this cell on its own
# can no longer pile a second batch of AUCs onto the previous one.
rs = RandomState(seed=53)
auc_bootstrap = []
# NOTE(review): bootstrap_auc() calls clf.fit() on every resample, so `optuna_53` is left
# fitted on the last bootstrap sample rather than on the full training split.
# The call returns the 2.5th/97.5th percentiles of the bootstrap AUCs.
bootstrap_auc(optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74512701, 0.75621625])

In [323]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9857506155967712, pvalue=3.192810599239593e-13),
 0.7512463185396308)

In [324]:
t_53 = auc_bootstrap
print(t_53)

[0.752604873552298, 0.7491072034473399, 0.7522415343376764, 0.7459961997828447, 0.7538355614368439, 0.7517976101610567, 0.7495878856768006, 0.7513254105591749, 0.7512617908523344, 0.7488371731360839, 0.7501081535016286, 0.7526826309717698, 0.7498791225570033, 0.7491397201863916, 0.7488498970774521, 0.7512950144770177, 0.751055380247919, 0.751670370747376, 0.7508178666757148, 0.7541508324285197, 0.7532750011310169, 0.7490633765381831, 0.7463418668566774, 0.754147298000362, 0.7487848635993486, 0.7509606575732899, 0.7570469428610205, 0.752396342290988, 0.7540087484165762, 0.7517516625950054, 0.7505817668747737, 0.7508009014205572, 0.7557816175805284, 0.7490471181686572, 0.7506870928338762, 0.7492796835414405, 0.7522669822204126, 0.7555624830347448, 0.754940423678972, 0.7495030594010132, 0.7528720763210277, 0.7546463592562431, 0.7577114153546869, 0.7557335493575823, 0.7503477877307275, 0.7452617456116539, 0.7524458242851971, 0.7537542695892147, 0.755141179198335, 0.7540101621878392, 0.7485

In [325]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [326]:
# 54
column_to_drop_53 = '소득 중 사적이전소득의 비중(월평균)'

In [327]:
# Elimination step 54: remove the feature named in `column_to_drop_53` from `comp_53`.
# A 'Cat_' name is treated as a prefix shared by several columns (presumably the
# one-hot dummies of one categorical variable -- confirm against the encoding step).
if column_to_drop_53.startswith('Cat_'):
    # Plain prefix comparison instead of a regex built from the raw name, so regex
    # metacharacters in a feature name (e.g. parentheses) cannot alter the match.
    cols_to_drop_54 = [c for c in comp_53.columns if str(c).startswith(column_to_drop_53)]
else:
    cols_to_drop_54 = [column_to_drop_53]

# Built once for both cases (the original repeated these lines in each branch).
comp_54 = comp_53.drop(columns=cols_to_drop_54)
X_54 = comp_54.drop(columns='target')
y_54 = comp_54['target']

print(X_54.shape)

(8444, 16)


In [328]:
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [329]:
def objective(trial):
    """Score one Optuna trial with the validation ROC AUC of an ExtraTrees model.

    Four integer hyperparameters are drawn from ``trial``; a classifier built
    from them (random_state=0) is fitted on the notebook-level ``X_train`` /
    ``y_train``; the ROC AUC of its second ``predict_proba`` column on
    ``X_val`` / ``y_val`` is returned.
    """
    # (name, low, high) for each integer hyperparameter, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {}
    for name, low, high in search_space:
        sampled[name] = trial.suggest_int(name, low, high)

    model = ExtraTreesClassifier(**sampled, random_state=0)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score handed to roc_auc_score.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise the value returned by `objective`.
direction = "maximize"
# Fixed sampler seed so the sequence of suggested hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report() / trial.should_prune(), so
# the Hyperband pruner presumably has no effect on this study -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook with the
# signature (early_stopping_rounds, direction); presumably it ends the search
# after 10 trials without improvement -- its body is not visible here.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [330]:
# Show the best trial's hyperparameters, then keep and show its objective value
# (what `objective` returned for that trial) as `optuna_auc`.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 65, 'max_depth': 9, 'min_samples_split': 4, 'min_samples_leaf': 10}
0.7566063670395431


In [331]:
# Rebuild the classifier from the best trial's hyperparameters (random_state=0,
# as in `objective`) and fit it on the current training split.
optuna_54 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [332]:
# Score the tuned model on the held-out test split: ROC AUC computed from the
# second predict_proba column.
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
# Print the AUC quantized to three decimal places.
# NOTE(review): `decimal` is not imported in the visible part of the notebook --
# confirm the import exists.
print(decimal.Decimal(auc_54).quantize(decimal.Decimal('1.000')))

0.749


In [333]:
# `bootstrap_auc` resamples rows with positional integer indices
# (X_train[idx], y_train[idx]), so the training split has to be plain NumPy
# arrays rather than pandas objects.
# np.asarray keeps this cell safe to re-run: `.values` raises AttributeError
# once the names already hold ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [334]:
# Start from an empty list: `bootstrap_auc` appends each resample's AUC to this
# notebook-level name instead of creating a list of its own.
auc_bootstrap = []

In [335]:
# `bootstrap_auc` draws its row indices from the notebook-level `rs`, so it must
# be (re)seeded here before the call.
rs = RandomState(seed = 54)
# Refits `optuna_54` on 2000 resamples of the training rows, appends each test
# AUC to `auc_bootstrap`, and returns the 2.5th / 97.5th percentiles of those AUCs.
# Note that `optuna_54` is refitted in place, so afterwards it holds the fit from
# the last resample rather than the fit on the full training split.
bootstrap_auc(optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.73995226, 0.74992931])

In [336]:
# Normality check on the AUC values collected in `auc_bootstrap`, shown together
# with their mean.
# NOTE(review): `shapiro` is not imported in the visible part of the notebook --
# presumably scipy.stats.shapiro (Shapiro-Wilk); confirm the import exists.
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9411337375640869, pvalue=1.7125442250829188e-27),
 0.7463482097414494)

In [337]:
# Keep step 54's bootstrap AUCs under their own name.
# `bootstrap_auc` appends to the shared notebook-level `auc_bootstrap` list, so
# store a copy: a plain alias would silently grow if the bootstrap cell were
# ever re-run without first resetting `auc_bootstrap`.
t_54 = list(auc_bootstrap)
print(t_54)

[0.7453783817408615, 0.7462549199239956, 0.7458562364277959, 0.7486993304379298, 0.7475061074918568, 0.7434104121425986, 0.746861427795874, 0.7440027823018458, 0.7470197701773436, 0.7450631107491856, 0.7427388707926168, 0.7472784903184945, 0.7440550918385813, 0.7423783591205212, 0.7485650221679334, 0.7416460256062252, 0.7384650402642057, 0.7474241087585958, 0.7459467177886356, 0.7430442453854507, 0.7446460482265653, 0.7486159179334058, 0.7442897778682591, 0.7464302275606225, 0.7459566141874774, 0.7498628641874774, 0.7495900063336953, 0.745987717155266, 0.744335018548679, 0.7470621833152371, 0.7467737739775606, 0.7489637056641332, 0.7390149972855592, 0.7487162956930873, 0.7439957134455302, 0.7472784903184944, 0.7489637056641332, 0.7396045399022801, 0.7457798927795874, 0.7465334328628302, 0.7504580618892509, 0.7456144815418024, 0.7464853646398842, 0.74830064694173, 0.74593964893232, 0.748972188291712, 0.7456667910785378, 0.7485296778863555, 0.7446559446254071, 0.7470678384002896, 0.74814

In [338]:
# Remove this step's train/validation/test splits, the Optuna study and its best
# score from the kernel namespace; the next elimination step defines them again.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [339]:
# Step 55: name of the feature that is removed from `comp_54` to build `comp_55`.
# Because it starts with 'Cat_', the following cell removes every column whose
# name begins with this string rather than a single column.
# (Korean label, roughly: "residential area of the planned move".)
column_to_drop_54 = 'Cat_이사 계획 중인 거주 지역'

In [340]:
# Build the step-55 dataset by removing the feature named in `column_to_drop_54`.
#   * a name starting with 'Cat_' is treated as the shared prefix of a group of
#     columns, and every column whose name starts with it is removed;
#   * any other name is removed as one exact column (KeyError if it is missing).
if column_to_drop_54.startswith('Cat_'):
    # Match the prefix literally. Interpolating the name into a regex would let
    # characters such as '(' or ')' -- which occur in this dataset's column
    # names -- be read as regex syntax and select the wrong columns (or none).
    cols_to_drop_54 = [col for col in comp_54.columns
                       if str(col).startswith(column_to_drop_54)]
else:
    cols_to_drop_54 = [column_to_drop_54]

comp_55 = comp_54.drop(columns=cols_to_drop_54)
# Separate the reduced frame into the feature matrix and the label column.
X_55 = comp_55.drop(columns='target')
y_55 = comp_55['target']

print(X_55.shape)

(8444, 9)


In [341]:
# Hold out 20% of the rows as the test set. The split is shuffled, stratified on
# the target and seeded (random_state=0) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
# Carve a further 20% out of the remaining rows as the validation set that
# `objective` scores on; X_train / y_train are overwritten with the reduced part.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [342]:
def objective(trial):
    """Score one Optuna trial with the validation ROC AUC of an ExtraTrees model.

    Four integer hyperparameters are drawn from ``trial``; a classifier built
    from them (random_state=0) is fitted on the notebook-level ``X_train`` /
    ``y_train``; the ROC AUC of its second ``predict_proba`` column on
    ``X_val`` / ``y_val`` is returned.
    """
    # (name, low, high) for each integer hyperparameter, sampled in this order.
    search_space = (
        ('n_estimators', 50, 200),
        ('max_depth', 1, 10),
        ('min_samples_split', 3, 10),
        ('min_samples_leaf', 3, 10),
    )
    sampled = {}
    for name, low, high in search_space:
        sampled[name] = trial.suggest_int(name, low, high)

    model = ExtraTreesClassifier(**sampled, random_state=0)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score handed to roc_auc_score.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise the value returned by `objective`.
direction = "maximize"
# Fixed sampler seed so the sequence of suggested hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report() / trial.should_prune(), so
# the Hyperband pruner presumably has no effect on this study -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook with the
# signature (early_stopping_rounds, direction); presumably it ends the search
# after 10 trials without improvement -- its body is not visible here.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Run at most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [343]:
# Show the best trial's hyperparameters, then keep and show its objective value
# (what `objective` returned for that trial) as `optuna_auc`.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 7}
0.7246617814098002


In [344]:
# Rebuild the classifier from the best trial's hyperparameters (random_state=0,
# as in `objective`) and fit it on the current training split.
optuna_55 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [345]:
# Score the tuned model on the held-out test split: ROC AUC computed from the
# second predict_proba column.
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
# Print the AUC quantized to three decimal places.
# NOTE(review): `decimal` is not imported in the visible part of the notebook --
# confirm the import exists.
print(decimal.Decimal(auc_55).quantize(decimal.Decimal('1.000')))

0.721


In [346]:
# `bootstrap_auc` resamples rows with positional integer indices
# (X_train[idx], y_train[idx]), so the training split has to be plain NumPy
# arrays rather than pandas objects.
# np.asarray keeps this cell safe to re-run: `.values` raises AttributeError
# once the names already hold ndarrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [347]:
# Start from an empty list: `bootstrap_auc` appends each resample's AUC to this
# notebook-level name instead of creating a list of its own.
auc_bootstrap = []

In [348]:
# `bootstrap_auc` draws its row indices from the notebook-level `rs`, so it must
# be (re)seeded here before the call.
rs = RandomState(seed = 55)
# Refits `optuna_55` on 2000 resamples of the training rows, appends each test
# AUC to `auc_bootstrap`, and returns the 2.5th / 97.5th percentiles of those AUCs.
# Note that `optuna_55` is refitted in place, so afterwards it holds the fit from
# the last resample rather than the fit on the full training split.
bootstrap_auc(optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71000144, 0.72133225])

In [349]:
# Normality check on the AUC values collected in `auc_bootstrap`, shown together
# with their mean.
# NOTE(review): `shapiro` is not imported in the visible part of the notebook --
# presumably scipy.stats.shapiro (Shapiro-Wilk); confirm the import exists.
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9030101895332336, pvalue=7.546771077905013e-34),
 0.7175620709204216)

In [350]:
# Keep step 55's bootstrap AUCs under their own name.
# `bootstrap_auc` appends to the shared notebook-level `auc_bootstrap` list, so
# store a copy: a plain alias would silently grow if the bootstrap cell were
# ever re-run without first resetting `auc_bootstrap`.
t_55 = list(auc_bootstrap)
print(t_55)

[0.7158524757962359, 0.7213152879569308, 0.7091427173814694, 0.7206960561436844, 0.7201955811165399, 0.7213322532120883, 0.7206960561436844, 0.7212417718512486, 0.7169354245837857, 0.7186347776420556, 0.7180918894770176, 0.7155131706930872, 0.7110824115544697, 0.7151597278773072, 0.7110541361292073, 0.7206055747828446, 0.7162794347176981, 0.7163529508233804, 0.7112831670738327, 0.7094085063789358, 0.7094085063789358, 0.7105338683043793, 0.7157789596905537, 0.7158524757962359, 0.7187648445982626, 0.7119023988870792, 0.7125527336681143, 0.7162850898027505, 0.7206055747828446, 0.7212417718512486, 0.7131832756514658, 0.7162794347176981, 0.7136667854234526, 0.716952389838943, 0.7201814434039087, 0.7180918894770176, 0.7181088547321751, 0.7150890393141512, 0.714803457519001, 0.7149844202406803, 0.7201220650108576, 0.7187789823108939, 0.7136894057636626, 0.7154962054379297, 0.7213152879569308, 0.7201220650108576, 0.7162624694625406, 0.7206225400380021, 0.7213152879569308, 0.7151597278773072, 0

In [351]:
# Remove this step's train/validation/test splits, the Optuna study and its best
# score from the kernel namespace so they cannot be picked up by later cells.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc