In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [5]:
from sklearn.utils import resample
from numpy.random import RandomState

In [6]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000):
    for b in range(nsamples):
        idx = rs.randint(X_train.shape[0], size=X_train.shape[0])
        clf.fit(X_train[idx], y_train[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test.ravel(), pred.ravel())
        auc_bootstrap.append(roc_auc)
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [7]:
class EarlyStoppingCallback(object):
    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [8]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [9]:
중장년가구 = pd.read_csv('중장년가구_변수추가.csv', encoding='cp949')
중장년가구.rename(columns = {'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?':'target'}, inplace=True)

In [10]:
중장년가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [11]:
cat = 중장년가구.select_dtypes(include = 'object')
num = 중장년가구.select_dtypes(exclude = 'object')
num_중장년 = num.drop('target',axis=1)
target = 중장년가구.target

In [12]:
scaler=RobustScaler()
scaler.fit(num_중장년)
num_scaled_중장년=scaler.transform(num_중장년)
num_df_scaled_중장년=pd.DataFrame(data=num_scaled_중장년, columns=num_중장년.columns)

In [13]:
enc = OneHotEncoder()
enc.fit(cat)

X_cat = enc.transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(X_cat, columns= new_feature_names)

In [14]:
comp =pd.concat([num_df_scaled_중장년, target,cat2],axis=1)

In [15]:
X=comp.drop('target', axis = 1)
y=comp.target
X.shape

(19949, 214)

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [14]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [15]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.783286352264317


In [16]:
optuna_0 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_0.fit(X_train, y_train)

In [17]:
optuna_proba_0 = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_proba_0)
print(auc_0)

0.7883899909761978


In [18]:
X_train = X_train.values
y_train = y_train.values

In [19]:
auc_bootstrap = []

In [20]:
rs = RandomState(seed = 0)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78101339, 0.78931196])

In [21]:
t_0 = auc_bootstrap
print(t_0)

[0.7830834881327492, 0.7842471253801302, 0.7865057175402003, 0.7870480438214429, 0.7863023121643811, 0.7862111759895012, 0.7852720771439984, 0.7854826149161124, 0.7908934620757282, 0.7846993721377957, 0.7870150234682254, 0.7814691891046571, 0.7854939739176192, 0.7834464478553149, 0.7863865801057919, 0.7884388610989597, 0.7861374745611199, 0.7870587424158854, 0.7837763872246631, 0.7846008394037951, 0.7844872493887272, 0.7834414287616259, 0.7853964978349216, 0.7845900087279398, 0.7829123106216702, 0.78571402155146, 0.7875356883977574, 0.7808674261876233, 0.7849474210311649, 0.7866985564029898, 0.7819987034888513, 0.7849252313538027, 0.7857930062363561, 0.7827868332794441, 0.7858064785404686, 0.7853658549471357, 0.7868834703810073, 0.7863997882470788, 0.7869727574161072, 0.7873926442276196, 0.7825551624812709, 0.7836466832772252, 0.7831017153677252, 0.7854218574661924, 0.7868805645899242, 0.7825776163214587, 0.7845213263932476, 0.783495978385141, 0.7847794134739947, 0.7860981143000848, 0.

In [90]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

NameError: name 'study' is not defined

In [16]:
column_to_drop = '부채 중 임대 보증금의 비중'

In [17]:
if not column_to_drop.startswith('Cat_'):
    comp_1 = comp.drop(column_to_drop, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']
else:
    comp_1 = comp.drop(comp.filter(regex='^' + column_to_drop).columns, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']

print(X_1.shape)

(19949, 213)


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [26]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [27]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 162, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 5}
0.7834460975304702


In [28]:
optuna_1= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [29]:
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(auc_1)

0.78960936657981


In [30]:
X_train = X_train.values
y_train = y_train.values

In [31]:
auc_bootstrap = []

In [32]:
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78222996, 0.78964028])

In [33]:
t_1 = auc_bootstrap
print(t_1)

[0.7874765159247918, 0.7876199563391681, 0.7859684103526469, 0.7842233507258137, 0.7850832007235948, 0.7820218177361034, 0.7864901319334817, 0.7852559632116283, 0.7894241884389668, 0.7826093158605474, 0.7835008653974171, 0.782419382788841, 0.7854443113063803, 0.783010843355671, 0.7836462870329866, 0.786671083469113, 0.785808063517423, 0.7853732515062564, 0.7879134412385643, 0.7856712271736902, 0.7861247947454844, 0.7851664120137027, 0.7848248494800218, 0.7846782391117366, 0.785232981045789, 0.7846729558552219, 0.7834984879319855, 0.7897205791294462, 0.7844687579909254, 0.7863020480015555, 0.7861615133782622, 0.784749827237512, 0.7865075666799806, 0.7835650569640717, 0.7853341554080471, 0.787722451515555, 0.7817498621070049, 0.7852797378659447, 0.7839544329692114, 0.7871595205339048, 0.7847170710471203, 0.7828198536326615, 0.7888554458751503, 0.7868203354656557, 0.7860761887855484, 0.786724708522738, 0.7861594000756562, 0.78586406603648, 0.7879905767836801, 0.7864079772946767, 0.7828991

In [34]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [18]:
column_to_drop_1 = 'Cat_가구주 동거 여부'

In [19]:
if not column_to_drop_1.startswith('Cat_'):
    comp_2 = comp_1.drop(column_to_drop_1, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']
else:
    comp_2 = comp_1.drop(comp_1.filter(regex='^' + column_to_drop_1).columns, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']

print(X_2.shape)

(19949, 211)


In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [38]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [39]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.783714816311519


In [40]:
optuna_2= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [41]:
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(auc_2)

0.7890865883476721


In [42]:
X_train = X_train.values
y_train = y_train.values

In [43]:
auc_bootstrap = []

In [44]:
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78133889, 0.78937144])

In [45]:
t_2 = auc_bootstrap
print(t_2)

[0.7855840534411963, 0.7847503555631634, 0.7841110815248746, 0.7887183453685916, 0.788828369185512, 0.7888792205294668, 0.7858035727493856, 0.7861123790926746, 0.7827868332794441, 0.7858785949918956, 0.7852842286339823, 0.785796968678742, 0.7839829625543911, 0.7807625535458047, 0.7843224117854659, 0.7843041845504899, 0.7841163647813894, 0.7852563594558668, 0.7876548258321657, 0.7863554088923548, 0.7836287202050749, 0.7907648147795932, 0.783415012479052, 0.7851149002626836, 0.7812318388057304, 0.7885550927422849, 0.7879528014995995, 0.7863901463039396, 0.7857697599076909, 0.7838033318328885, 0.7859689386782982, 0.7859165023573891, 0.7827823425114065, 0.7868509783534413, 0.7869529452041768, 0.7851812051319441, 0.7853947807765542, 0.7838075584381003, 0.785642169262859, 0.78712478312232, 0.7837466689067675, 0.7884628999161021, 0.7866890465412633, 0.7868662997973342, 0.782669544984816, 0.7848507374369442, 0.7870644219166387, 0.7864481300441891, 0.7866800650051882, 0.7853330987567441, 0.7868

In [46]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [20]:
column_to_drop_2 = '소득 중 재산소득의 비중(월평균)'

In [21]:
if not column_to_drop_2.startswith('Cat_'):
    comp_3 = comp_2.drop(column_to_drop_2, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']
else:
    comp_3 = comp_2.drop(comp_2.filter(regex='^' + column_to_drop_2).columns, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']

print(X_3.shape)

(19949, 210)


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [50]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [51]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7817000446213521


In [52]:
optuna_3 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [53]:
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(auc_3)

0.7888456718505978


In [54]:
X_train = X_train.values
y_train = y_train.values

In [55]:
auc_bootstrap = []

In [56]:
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78150607, 0.78974503])

In [57]:
t_3 = auc_bootstrap
print(t_3)

[0.7877689441728851, 0.7872812995965706, 0.7888401244312574, 0.7860038081712959, 0.7810312071395815, 0.7828269860289565, 0.7857922137478788, 0.7851452789876434, 0.7836902701434721, 0.7842562389976183, 0.7851241459615844, 0.7850105559465166, 0.7882573812376767, 0.7873190748806512, 0.7855740152538182, 0.7820387241569509, 0.7868508462720286, 0.7858474237784583, 0.7848708138117005, 0.7855890725348853, 0.7846795599258654, 0.7893602610351379, 0.7851746010613007, 0.786750068154009, 0.789869566963163, 0.7850871631659809, 0.782037403342822, 0.7876231262930771, 0.7851901866680192, 0.7817513150025465, 0.7876886386738603, 0.7889756399608615, 0.7846526153176401, 0.7854863131956727, 0.783511167747621, 0.7866100618563673, 0.782996578563081, 0.7867690878774622, 0.7846713708782673, 0.7853737798319078, 0.7834876572561302, 0.7854501228885464, 0.784295995502892, 0.7858291965434823, 0.784647596223951, 0.7895494016183671, 0.7845591016773283, 0.7845539505022263, 0.786359239253328, 0.7854023094170877, 0.78737

In [58]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [22]:
column_to_drop_3 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [23]:
if not column_to_drop_3.startswith('Cat_'):
    comp_4 = comp_3.drop(column_to_drop_3, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']
else:
    comp_4 = comp_3.drop(comp_3.filter(regex='^' + column_to_drop_3).columns, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']

print(X_4.shape)

(19949, 209)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [22]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [23]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7833544607111264


In [24]:
optuna_4= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [25]:
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(auc_4)

0.7886808342473368


In [26]:
X_train = X_train.values
y_train = y_train.values

In [27]:
auc_bootstrap = []

In [28]:
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78107024, 0.78957909])

In [29]:
t_4 = auc_bootstrap
print(t_4)

[0.7875483682133928, 0.7848805878362528, 0.7875161403486526, 0.789261992463963, 0.7857346262518676, 0.7854865773584985, 0.7853822330423316, 0.7858402913821634, 0.7899675713715122, 0.7816263659859719, 0.7834184465957865, 0.7844983442274082, 0.7885667159066173, 0.788197416276234, 0.785587487557931, 0.7856849636406287, 0.7875536514699075, 0.7875787469383528, 0.7878650994414542, 0.7865749282005441, 0.7865791548057557, 0.7846449545956935, 0.7857547026266238, 0.7841681406952343, 0.7891618747530078, 0.7831548120956988, 0.7820453282275943, 0.7851793559921639, 0.7854112909531629, 0.7841175535141052, 0.7847754510316086, 0.7851798843178154, 0.7866824424706197, 0.7850694642566562, 0.7861496260511038, 0.7870469871701399, 0.7871457840669663, 0.7838355596976286, 0.7856487733335024, 0.7838577493749908, 0.7853848746705889, 0.78412191220073, 0.787608861500487, 0.7885329030649229, 0.78803337116145, 0.7851150323440965, 0.787424740010947, 0.7885381863214376, 0.7859913925184863, 0.7872258254031652, 0.786978

In [30]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [24]:
column_to_drop_4 = '소득 중 사적이전소득의 비중(월평균)'

In [25]:
if not column_to_drop_4.startswith('Cat_'):
    comp_5 = comp_4.drop(column_to_drop_4, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']
else:
    comp_5 = comp_4.drop(comp_4.filter(regex='^' + column_to_drop_4).columns, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']

print(X_5.shape)

(19949, 208)


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [34]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [35]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 60, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 5}
0.7836945901667087


In [36]:
optuna_5 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [37]:
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(auc_5)

0.7893095417725959


In [38]:
X_train = X_train.values
y_train = y_train.values

In [39]:
auc_bootstrap = []

In [40]:
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78120962, 0.78954059])

In [41]:
t_5 = auc_bootstrap
print(t_5)

[0.7880933361228928, 0.7843712819082276, 0.7882119452316496, 0.7841787072082639, 0.7867474265257517, 0.7836268710652947, 0.7894388494757953, 0.7860302244538698, 0.7831709260280688, 0.7863337475406442, 0.7845311004177999, 0.788119488242641, 0.7834620334620335, 0.7850335381123559, 0.7847555067382654, 0.7876854687199515, 0.7869349821320265, 0.7877129416538283, 0.783810860473422, 0.7852465854313145, 0.7881667733884483, 0.7836523627779783, 0.7858900860748153, 0.785602544838998, 0.7873811531446999, 0.7864840561884897, 0.7888879379027162, 0.7869279818171444, 0.786980021893815, 0.7864769237921947, 0.7860378851758162, 0.789828093399522, 0.786847280073881, 0.7835993981314178, 0.7890926640926641, 0.7841197988981241, 0.7860096197534621, 0.7824624413294364, 0.7882000579044914, 0.7865165482160557, 0.7857039833640819, 0.7868599598895165, 0.7866499504430539, 0.7822434503468986, 0.7832487219802491, 0.7853729873434306, 0.7852477741640304, 0.7847846967305097, 0.7866586678163032, 0.7858184979490398, 0.784

In [42]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [26]:
column_to_drop_5 = 'Cat_가구주 주민등록상 등재 여부'

In [27]:
if not column_to_drop_5.startswith('Cat_'):
    comp_6 = comp_5.drop(column_to_drop_5, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']
else:
    comp_6 = comp_5.drop(comp_5.filter(regex='^' + column_to_drop_5).columns, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']

print(X_6.shape)

(19949, 206)


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [46]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [47]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7846460445297154


In [48]:
optuna_6 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [49]:
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(auc_6)

0.7911104718370728


In [50]:
X_train = X_train.values
y_train = y_train.values

In [51]:
auc_bootstrap = []

In [52]:
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78345372, 0.79109607])

In [53]:
t_6 = auc_bootstrap
print(t_6)

[0.7885516586255502, 0.7887547998385437, 0.7881540935728129, 0.7857248522273153, 0.7860373568501646, 0.7874147018235689, 0.78809597775115, 0.7867355391985935, 0.7875486323762186, 0.7880703539570535, 0.7874030786592363, 0.7852300752547057, 0.7872522416857393, 0.7884642207302306, 0.7855304283875713, 0.787602785755495, 0.7881860572747272, 0.7844032456101421, 0.78588017996885, 0.7853909504155809, 0.7864211854359637, 0.7898520001352514, 0.7852242636725395, 0.7887131941934897, 0.7874461371998317, 0.7881496028047753, 0.7865847022250964, 0.7893747899905535, 0.7888308787323566, 0.7832472690847075, 0.7885104492247349, 0.7920150974338166, 0.7872963568776377, 0.7886544179647628, 0.7890205476412373, 0.7879121204244357, 0.7857733261058384, 0.7855016346395657, 0.7882557962607223, 0.7902187902187902, 0.7877414712390081, 0.7850620676975357, 0.786022827894749, 0.7869656250198123, 0.7862653293687776, 0.7836371734154985, 0.7878101535737004, 0.787330433882158, 0.7875961816848516, 0.789604215404708, 0.78719

In [54]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [28]:
column_to_drop_6 = '소득 대비 주거관리비의 비율'

In [29]:
if not column_to_drop_6.startswith('Cat_'):
    comp_7 = comp_6.drop(column_to_drop_6, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']
else:
    comp_7 = comp_6.drop(comp_6.filter(regex='^' + column_to_drop_6).columns, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']

print(X_7.shape)

(19949, 205)


In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [58]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [59]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 167, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 3}
0.7845432626918025


In [60]:
optuna_7 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [61]:
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(auc_7)

0.7908070808317115


In [62]:
X_train = X_train.values
y_train = y_train.values

In [63]:
auc_bootstrap = []

In [64]:
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78327398, 0.79068528])

In [65]:
t_7 = auc_bootstrap
print(t_7)

[0.7876698831132327, 0.7863218602134858, 0.7858151959137181, 0.7895018523097341, 0.7850530861614605, 0.7916864788785971, 0.7860672072494732, 0.7885564135564136, 0.7851532038724157, 0.7844066797268767, 0.7866728005274803, 0.7866195717180938, 0.7875829735435648, 0.7866396480928501, 0.7872258254031653, 0.7870966497813789, 0.7865664749901202, 0.7899583256726114, 0.7876313153406749, 0.788302817243704, 0.788481919639555, 0.788540827949695, 0.7876772796723536, 0.7832340609434205, 0.7859773918887221, 0.7874427030830973, 0.7856487733335025, 0.7874714968311027, 0.7858656510134343, 0.7882518338183363, 0.7873468119773539, 0.7903160021386622, 0.7863900142225265, 0.7885661875809659, 0.7844637388972364, 0.786267970997035, 0.7895187587305814, 0.7864512999980982, 0.7870340431916787, 0.7872054848655834, 0.7884721456150028, 0.7921511412890723, 0.7880180497175572, 0.786147512748498, 0.7908768198177065, 0.7873853797499117, 0.7883572347858061, 0.7890630778561813, 0.7857174556681945, 0.7882502488413818, 0.78

In [66]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [30]:
column_to_drop_7 = '소득 대비 생활비의 비율'

In [31]:
if not column_to_drop_7.startswith('Cat_'):
    comp_8 = comp_7.drop(column_to_drop_7, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']
else:
    comp_8 = comp_7.drop(comp_7.filter(regex='^' + column_to_drop_7).columns, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']

print(X_8.shape)

(19949, 204)


In [69]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [70]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [71]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.782320037876552


In [72]:
optuna_8 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [73]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(auc_8)

0.7890659836472645


In [74]:
X_train = X_train.values
y_train = y_train.values

In [75]:
auc_bootstrap = []

In [76]:
rs = RandomState(seed = 8)
bootstrap_auc(optuna_8, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78119365, 0.78924673])

In [77]:
t_8 = auc_bootstrap
print(t_8)

[0.7822419974513571, 0.786178948124761, 0.7823355110916688, 0.7828755919888926, 0.7803022498219542, 0.7824597997011791, 0.7842806740589991, 0.7878286449715021, 0.7880457868142596, 0.7865083591684576, 0.7855951482798774, 0.782056819310514, 0.7825181796856674, 0.7879203094720335, 0.7856334518896095, 0.7861166056978866, 0.7915250753920705, 0.7851891300167163, 0.7840762120318772, 0.7861152848837578, 0.7872070698425377, 0.7863038971413356, 0.7859406732559442, 0.7867305201049044, 0.7853669115984387, 0.7827715118355513, 0.7824238735568785, 0.7860489800144972, 0.7793474332883201, 0.7831521704674413, 0.7902629054106887, 0.7856374143319956, 0.7869106791520586, 0.7838418996054465, 0.7838979021245032, 0.7832896672182387, 0.7820857451399323, 0.7872015224231973, 0.786397674944473, 0.7880286162305866, 0.7842167466551704, 0.7816887084128463, 0.7835684910808063, 0.7892244813427081, 0.7874643644348077, 0.7841831979763014, 0.7855011063139141, 0.7863989957586016, 0.7877396220992281, 0.7853352120593499, 0.

In [78]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [32]:
column_to_drop_8 = '중기부채부담지표'

In [33]:
if not column_to_drop_8.startswith('Cat_'):
    comp_9 = comp_8.drop(column_to_drop_8, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']
else:
    comp_9 = comp_8.drop(comp_8.filter(regex='^' + column_to_drop_8).columns, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']

print(X_9.shape)

(19949, 203)


In [81]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [82]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [83]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7819105616387634


In [84]:
optuna_9 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [85]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(auc_9)

0.7889809232173763


In [86]:
X_train = X_train.values
y_train = y_train.values

In [87]:
auc_bootstrap = []

In [88]:
rs = RandomState(seed = 9)
bootstrap_auc(optuna_9, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78274568, 0.79040096])

In [89]:
t_9 = auc_bootstrap
print(t_9)

[0.7894548313267524, 0.7857240597388381, 0.7875547081212106, 0.7854234424431469, 0.7832430424794957, 0.7847883950100698, 0.7859013129949091, 0.7842867498039912, 0.7850295756699698, 0.7849383074136769, 0.784383169235386, 0.7869349821320264, 0.7885746407913896, 0.78721499472731, 0.7865931554355201, 0.7815373431136978, 0.7873737565855792, 0.786290160674397, 0.7862743109048528, 0.7894860025401897, 0.7842904480835515, 0.7851180702165923, 0.7880222763227689, 0.7886451722658618, 0.7888673332023086, 0.7850604827205813, 0.7870559687062151, 0.7862774808587617, 0.7869175473855277, 0.7858426688475949, 0.7894801909580235, 0.7835069411424092, 0.7874355706868021, 0.7860495083401488, 0.78765456166934, 0.7863305775867352, 0.7853587225508407, 0.7900967469932987, 0.7880496171752329, 0.7871072162944084, 0.784852982820963, 0.7881560747940057, 0.7868348644210714, 0.787589841777034, 0.791511338925132, 0.7864143172024946, 0.7896585008653974, 0.7890634741004199, 0.7852592652469501, 0.7863033688156841, 0.783213

In [90]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [34]:
column_to_drop_9 = '장기부채부담지표'

In [35]:
if not column_to_drop_9.startswith('Cat_'):
    comp_10 = comp_9.drop(column_to_drop_9, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']
else:
    comp_10 = comp_9.drop(comp_9.filter(regex='^' + column_to_drop_9).columns, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']

print(X_10.shape)

(19949, 202)


In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [94]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [95]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7842477133104957


In [96]:
optuna_10 = ExtraTreesClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [97]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(auc_10)

0.789841301540809


In [98]:
X_train = X_train.values
y_train = y_train.values

In [99]:
auc_bootstrap = []

In [100]:
rs = RandomState(seed = 10)
bootstrap_auc(optuna_10, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78243822, 0.79016948])

In [101]:
t_10 = auc_bootstrap
print(t_10)

[0.7858838782484103, 0.7885973587944032, 0.783073582026784, 0.7814686607790057, 0.786271140950944, 0.7900333479151214, 0.7841343278535396, 0.7830647325721218, 0.7847788851483433, 0.7845646490966688, 0.7847316000025361, 0.7863946370719769, 0.7858254982639219, 0.7884887878730242, 0.7878402681358346, 0.7843047128761415, 0.7865947404124743, 0.7865218314725703, 0.7855505047623275, 0.7866819141449684, 0.7854316314907448, 0.7886324924502265, 0.7895277402666565, 0.789053303831629, 0.7895253628012249, 0.785319890615457, 0.7833283670722095, 0.7868484688065969, 0.7869275855729058, 0.7864211854359637, 0.7888504267814613, 0.7868275999433635, 0.7857356829031706, 0.7832197961508307, 0.7886446439402105, 0.7883891984877207, 0.7900711231992019, 0.7826048250925098, 0.7890974190235274, 0.7881210732195955, 0.7836836660728286, 0.786832222792814, 0.7848618322756253, 0.7881453761995635, 0.78563899930895, 0.7895124188227637, 0.7883626501237337, 0.7848462466689067, 0.7872033715629774, 0.7865165482160557, 0.7816

In [102]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [36]:
column_to_drop_10 = '소득 대비 주택 임대료의 비율'

In [37]:
if not column_to_drop_10.startswith('Cat_'):
    comp_11 = comp_10.drop(column_to_drop_10, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']
else:
    comp_11 = comp_10.drop(comp_10.filter(regex='^' + column_to_drop_10).columns, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']

print(X_11.shape)

(19949, 201)


In [105]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [106]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [107]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7804208441567286


In [108]:
optuna_11 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [109]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(auc_11)

0.7870134384912709


In [110]:
X_train = X_train.values
y_train = y_train.values

In [111]:
auc_bootstrap = []

In [112]:
rs = RandomState(seed = 11)
bootstrap_auc(optuna_11, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78048036, 0.78920051])

In [113]:
t_11 = auc_bootstrap
print(t_11)

[0.7833447451674052, 0.7848845502786389, 0.7848758329053895, 0.7842648242894548, 0.7845078540891349, 0.7869381520859353, 0.7878291732971536, 0.7835264891915138, 0.7838594664333581, 0.7872897528069942, 0.7849476851939906, 0.7856384709832985, 0.7842146333525645, 0.7823826641560632, 0.7863741644529822, 0.7890733802063852, 0.7810856246816837, 0.7862537062044452, 0.7815254557865395, 0.785230867743183, 0.783809935903532, 0.7853848746705889, 0.7828740070119381, 0.7877612834509387, 0.7869595492748203, 0.7859800335169794, 0.7870105327001878, 0.7861963828712597, 0.7824164769977577, 0.7853391745017361, 0.783316743907877, 0.7846695217384873, 0.7858114976341578, 0.7867204819175262, 0.7830332971958587, 0.7799468187399222, 0.7843135623308037, 0.7838337105578485, 0.7856883977573632, 0.7858738400610322, 0.7870443455418825, 0.7842362947042749, 0.7874586849340544, 0.7858915389703567, 0.7905133317694895, 0.787515876185827, 0.7849254955166285, 0.7871965033295083, 0.7824901784261391, 0.7781951550424455, 0.7

In [114]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [38]:
column_to_drop_11 = '현재 무주택 기간(총 개월)'

In [39]:
if not column_to_drop_11.startswith('Cat_'):
    comp_12 = comp_11.drop(column_to_drop_11, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']
else:
    comp_12 = comp_11.drop(comp_11.filter(regex='^' + column_to_drop_11).columns, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']

print(X_12.shape)

(19949, 200)


In [117]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [118]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [119]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7843129323080467


In [120]:
optuna_12 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [121]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(auc_12)

0.7897730154503554


In [122]:
X_train = X_train.values
y_train = y_train.values

In [123]:
auc_bootstrap = []

In [124]:
rs = RandomState(seed = 12)
bootstrap_auc(optuna_12, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78264977, 0.79016384])

In [125]:
t_12 = auc_bootstrap
print(t_12)

[0.7867262934996925, 0.7830996020651192, 0.7883905193018492, 0.7858109693085062, 0.7836699296058902, 0.783323876304172, 0.787788228059164, 0.7864877544680501, 0.7866515354200082, 0.7879060446794437, 0.7875735957632509, 0.784864209741057, 0.7891369113659754, 0.7863192185852284, 0.7845720456557895, 0.7877309047259786, 0.7842368230299264, 0.786050300828626, 0.7881667733884482, 0.7850919180968442, 0.7858429330104207, 0.7850430479740824, 0.7871315192743765, 0.7832813460892278, 0.7879950675517179, 0.7877089792114422, 0.7835021862115459, 0.7860032798456444, 0.7855296358990941, 0.7879171395181248, 0.7878547970912504, 0.7844476249648663, 0.7877956246182847, 0.7852321885573118, 0.7856672647313041, 0.7895816294831073, 0.7860542632710121, 0.7859747502604645, 0.7891349301447824, 0.7868528274932217, 0.785621828725277, 0.790259471293954, 0.7862399697375066, 0.7871504069164168, 0.7840763441132899, 0.7871847480837629, 0.7857980253300451, 0.785522503502799, 0.7892818046758934, 0.7832451557821015, 0.7833

In [126]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [40]:
column_to_drop_12 = '부채 중 비금융기관 대출금의 비중'

In [41]:
if not column_to_drop_12.startswith('Cat_'):
    comp_13 = comp_12.drop(column_to_drop_12, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']
else:
    comp_13 = comp_12.drop(comp_12.filter(regex='^' + column_to_drop_12).columns, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']

print(X_13.shape)

(19949, 199)


In [129]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [130]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [131]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7828422026354254


In [132]:
optuna_13 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [133]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(auc_13)

0.790615034457399


In [134]:
X_train = X_train.values
y_train = y_train.values

In [135]:
auc_bootstrap = []

In [136]:
rs = RandomState(seed = 13)
bootstrap_auc(optuna_13, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78310389, 0.79096221])

In [137]:
t_13 = auc_bootstrap
print(t_13)

[0.7872510529530232, 0.7900325554266442, 0.7889370721883038, 0.7882577774819155, 0.7891647805440909, 0.7865907779700883, 0.7883897268133722, 0.7879911051093318, 0.7890245100836233, 0.7896714448438585, 0.7857725336173611, 0.7845923861933715, 0.7888723522959975, 0.7890758897532297, 0.7845115523686952, 0.7910326758848927, 0.7871716720238888, 0.7874638361091562, 0.7854268765598815, 0.7874622511322018, 0.7832264002214742, 0.7889737908210814, 0.7879081579820496, 0.7847342416307934, 0.7818654333432659, 0.7833473867956626, 0.7868488650508354, 0.7862324410969731, 0.7889574127258857, 0.7867387091525023, 0.7897110692677196, 0.7808365191370117, 0.7857092666205966, 0.7870414397507993, 0.7879958600401951, 0.7853978186490502, 0.7889336380715691, 0.7846724275295703, 0.7888188593237855, 0.787099687653875, 0.7857893079567957, 0.7864708480472027, 0.7878730243262262, 0.7852879269135427, 0.7843144869006939, 0.7876813741961525, 0.7859562588626627, 0.783937526548364, 0.7847923574524559, 0.7873386229297559, 0

In [39]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

NameError: name 'X_train' is not defined

In [42]:
column_to_drop_13 = 'Cat_가구주 장애 여부'

In [43]:
if not column_to_drop_13.startswith('Cat_'):
    comp_14 = comp_13.drop(column_to_drop_13, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']
else:
    comp_14 = comp_13.drop(comp_13.filter(regex='^' + column_to_drop_13).columns, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']

print(X_14.shape)

(19949, 197)


In [141]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [142]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [143]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 68, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7828591265525113


In [144]:
optuna_14 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [145]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(auc_14)

0.7902883971233725


In [146]:
X_train = X_train.values
y_train = y_train.values

In [147]:
auc_bootstrap = []

In [148]:
rs = RandomState(seed = 14)
bootstrap_auc(optuna_14, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78213392, 0.78994863])

In [149]:
t_14 = auc_bootstrap
print(t_14)

[0.7856162813059364, 0.7881039026359224, 0.7861547772262057, 0.7864944906201063, 0.7859591646537459, 0.7877108283512224, 0.788936808025478, 0.7853235888950175, 0.7860257336858322, 0.7847125802790829, 0.7850371043105033, 0.787832871576714, 0.7849289296333632, 0.7868637902504898, 0.7890921357670125, 0.7861319271417793, 0.7877182249103432, 0.7882011145557943, 0.7835275458428168, 0.7871867293049559, 0.7853003425663523, 0.7861720798912917, 0.7876142768384148, 0.784039757561925, 0.7879277060311544, 0.7869566434837372, 0.7878829304321916, 0.7805584877629212, 0.7861267759666776, 0.7861887221493132, 0.7880988835422333, 0.7856675288941299, 0.7872477509177016, 0.7873700583060188, 0.7823217746247302, 0.7843555642200962, 0.7835323007736801, 0.7864122038998886, 0.7869665495897022, 0.7884761080573889, 0.7851431656850376, 0.7853328345939183, 0.7838206344979743, 0.787303093029694, 0.784568479457642, 0.7851825259460726, 0.7862304598757801, 0.7876273528982889, 0.7849812338728595, 0.7863688811964673, 0.78

In [150]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [44]:
column_to_drop_14 = '자산 중 부동산 자산의 비중'

In [45]:
if not column_to_drop_14.startswith('Cat_'):
    comp_15 = comp_14.drop(column_to_drop_14, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']
else:
    comp_15 = comp_14.drop(comp_14.filter(regex='^' + column_to_drop_14).columns, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']

print(X_15.shape)

(19949, 196)


In [153]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [154]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [155]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7833053400737305


In [156]:
optuna_15 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [157]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(auc_15)

0.7887125337864254


In [158]:
X_train = X_train.values
y_train = y_train.values

In [159]:
auc_bootstrap = []

In [160]:
rs = RandomState(seed = 15)
bootstrap_auc(optuna_15, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78100196, 0.78919777])

In [161]:
t_15 = auc_bootstrap
print(t_15)

[0.7877171682590403, 0.7856300177728749, 0.7841684048580599, 0.7858109693085062, 0.7810821905649492, 0.7843115811096106, 0.7822659041870865, 0.7874038711477135, 0.7861559659589217, 0.7867770127622344, 0.789847377285801, 0.782282546445108, 0.7868795079386213, 0.7855796947545716, 0.7852203012301534, 0.7817624098412275, 0.778580832768025, 0.7849144006779475, 0.7862286107359997, 0.7853074749626474, 0.7829360852759867, 0.7853708740408248, 0.7843971698651502, 0.7845250246728079, 0.7837687265027167, 0.7875961816848516, 0.7848215474447, 0.7880463151399112, 0.7846135192194306, 0.7831339432324653, 0.7847300150255816, 0.7856022806761723, 0.7836627972095953, 0.7844238503105498, 0.7818960762310516, 0.7870665352192445, 0.7862626877405203, 0.7850792382812086, 0.7874210417313865, 0.782149936706587, 0.7870026078154156, 0.7841221763635557, 0.785120447682024, 0.7857466456604387, 0.7842805419775862, 0.7878138518532607, 0.785089012305761, 0.7888777676339254, 0.7840927222084857, 0.7816078745881702, 0.784730

In [162]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [46]:
column_to_drop_15 = '소득 중 정부 보조금의 비중(월평균)'

In [47]:
if not column_to_drop_15.startswith('Cat_'):
    comp_16 = comp_15.drop(column_to_drop_15, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']
else:
    comp_16 = comp_15.drop(comp_15.filter(regex='^' + column_to_drop_15).columns, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']

print(X_16.shape)

(19949, 195)


In [165]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [166]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [167]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 101, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7822833005931213


In [168]:
optuna_16 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [169]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(auc_16)

0.7890044337088673


In [170]:
X_train = X_train.values
y_train = y_train.values

In [171]:
auc_bootstrap = []

In [172]:
rs = RandomState(seed = 16)
bootstrap_auc(optuna_16, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78026065, 0.78845636])

In [173]:
t_16 = auc_bootstrap
print(t_16)

[0.781603119657307, 0.7811823082759043, 0.7839335641059779, 0.7819647585657438, 0.7844946459478479, 0.7824769702848522, 0.7831622086548193, 0.7826093158605474, 0.7865289638688653, 0.7863040292227486, 0.7839865287525386, 0.7854997854997854, 0.7862362714579463, 0.7847337133051419, 0.784644426270042, 0.7832160978712703, 0.7827196038402935, 0.7833922944760383, 0.7849048908162208, 0.785877142096354, 0.7891697996377799, 0.7834570143683445, 0.7833222913272174, 0.7815862132364595, 0.7840595697738555, 0.7788423539655067, 0.7837882745518213, 0.78381812495113, 0.783797784413548, 0.7854054793709966, 0.7827979281181252, 0.7852654730733548, 0.7843652061632357, 0.7838986946129803, 0.7872680914552836, 0.7816968974604444, 0.7852818511685508, 0.78387888240105, 0.7827347932027734, 0.7845200055791189, 0.7826254297929175, 0.7842130483756099, 0.7847017496032275, 0.7828446849382811, 0.787591690916814, 0.7838952604962458, 0.7863329550521669, 0.7861350970956882, 0.7866304023939491, 0.7827918523731332, 0.784271

In [174]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [48]:
column_to_drop_16 = '소득 중 근로/사업소득의 비중(월평균)'

In [49]:
if not column_to_drop_16.startswith('Cat_'):
    comp_17 = comp_16.drop(column_to_drop_16, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']
else:
    comp_17 = comp_16.drop(comp_16.filter(regex='^' + column_to_drop_16).columns, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']

print(X_17.shape)

(19949, 194)


In [177]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [178]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [179]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7848644043379713


In [180]:
optuna_17 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [181]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(auc_17)

0.7876615619842221


In [182]:
X_train = X_train.values
y_train = y_train.values

In [183]:
auc_bootstrap = []

In [184]:
rs = RandomState(seed = 17)
bootstrap_auc(optuna_17, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78165913, 0.78965302])

In [185]:
t_17 = auc_bootstrap
print(t_17)

[0.7832440991307985, 0.7864161663422747, 0.7836336072173511, 0.7838759766099668, 0.7894516613728436, 0.7852060364375637, 0.7834464478553149, 0.788029276637651, 0.7873980595655472, 0.7878392114845316, 0.7868331473627039, 0.7857195689708005, 0.785263359770749, 0.7829278962283888, 0.7824384025122941, 0.7887804236326403, 0.7876807137890882, 0.7907732679900168, 0.7846734841808733, 0.7880420885346994, 0.7869880788600001, 0.7829329153220779, 0.7838371446745831, 0.7850776533042543, 0.7856120547007246, 0.7853249097091461, 0.7850367080662648, 0.7856062431185584, 0.7830454486858428, 0.7867067454505878, 0.7848224720145902, 0.7875887851257309, 0.7847540538427238, 0.7832364384088523, 0.7838883922627765, 0.7904060816622392, 0.7838337105578486, 0.7864188079705321, 0.7877943038041559, 0.7856273761446175, 0.787201786586023, 0.7882048128353547, 0.7847994898487509, 0.7856329235639581, 0.7855153711065042, 0.7851706386189145, 0.7846851073452059, 0.7868486008880098, 0.7843868675149464, 0.786172872379769, 0.7

In [186]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [50]:
column_to_drop_17 = '자산 중 기타자산의 비중'

In [51]:
if not column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(column_to_drop_17, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']
else:
    comp_18 = comp_17.drop(comp_17.filter(regex='^' + column_to_drop_17).columns, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']

print(X_18.shape)

(19949, 193)


In [189]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [190]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [191]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 74, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7840062379081708


In [192]:
optuna_18 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [193]:
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(auc_18)

0.7908873863307362


In [194]:
X_train = X_train.values
y_train = y_train.values

In [195]:
auc_bootstrap = []

In [196]:
rs = RandomState(seed = 18)
bootstrap_auc(optuna_18, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78177873, 0.78975706])

In [197]:
t_18 = auc_bootstrap
print(t_18)

[0.7908839522140014, 0.7854955588945736, 0.7890591154137951, 0.7823283786953737, 0.7829577466276973, 0.7852543782346737, 0.7885405637868692, 0.7884959202693193, 0.7884438801926485, 0.7854421980037745, 0.7878019645261024, 0.78574532484631, 0.7873203956947799, 0.7869677383224182, 0.7851501659999197, 0.7855917141631428, 0.7836231727857345, 0.789757033599398, 0.7884507484261178, 0.7873940971231611, 0.7829276320655632, 0.7825932019281773, 0.7834860722791759, 0.7875491607018701, 0.7831220559053071, 0.7896658974245181, 0.7879628396869776, 0.7863131428402365, 0.7855399382492978, 0.7851444864991663, 0.782800041420731, 0.7873045459252356, 0.7804732952516205, 0.7846684650871842, 0.7875399150029692, 0.7814924354333223, 0.7859684103526468, 0.7870650823237029, 0.7791102150708062, 0.7887857068891553, 0.7854902756380588, 0.7856728121506446, 0.7874639681905692, 0.7872123530990527, 0.7861763064965035, 0.7856529999387142, 0.7829062348766782, 0.7813555990895893, 0.7874569678756871, 0.7821084631429459, 0.7

In [198]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [52]:
column_to_drop_18 = '총 이사 횟수'

In [53]:
if not column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(column_to_drop_18, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']
else:
    comp_19 = comp_18.drop(comp_18.filter(regex='^' + column_to_drop_18).columns, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']

print(X_19.shape)

(19949, 192)


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [55]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [56]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7821214914346404


In [203]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7821214914346404


In [57]:
optuna_19 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [58]:
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(auc_19)

0.7886644561521409


In [59]:
X_train = X_train.values
y_train = y_train.values

In [60]:
auc_bootstrap = []

In [61]:
rs = RandomState(seed = 19)
bootstrap_auc(optuna_19, X_train, y_train, X_test, y_test, nsamples=2000)
#array([0.781057  , 0.78893338])

array([0.781057  , 0.78893338])

In [62]:
t_19 = auc_bootstrap
print(t_19)

[0.7864169588307519, 0.7844957025991509, 0.7844431341968288, 0.7875753128216182, 0.7833500284239201, 0.7858968222268715, 0.7859161061131505, 0.7822252231119227, 0.7863637300213655, 0.7848404350867404, 0.7853682324125675, 0.786203251104729, 0.7861971753597369, 0.7846471999797123, 0.7849534967761569, 0.7859839959593654, 0.783489770558736, 0.7848528507395502, 0.7812030450577249, 0.7882505130042077, 0.7846726916923961, 0.7850166316915086, 0.7851658836880511, 0.7858062143776429, 0.7804957490918082, 0.7844375867774882, 0.787185672653653, 0.785226905300797, 0.7856271119817918, 0.7854865773584985, 0.7856337160524353, 0.7841184780839954, 0.7849622141494063, 0.7862423472029384, 0.7790880253934442, 0.7853060220671058, 0.7866359498132898, 0.7841792355339153, 0.7854773316595975, 0.7874059844503194, 0.7883094213143473, 0.785518541060413, 0.7834445987155345, 0.7881741699475688, 0.7848620964384511, 0.7904663107865079, 0.7850797666068603, 0.7842862214783397, 0.7869326046665948, 0.7843961132138473, 0.78

In [63]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [128]:
column_to_drop_19 = '자산 중 금융자산의 비중'

In [129]:
if not column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(column_to_drop_19, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']
else:
    comp_20 = comp_19.drop(comp_19.filter(regex='^' + column_to_drop_19).columns, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']

print(X_20.shape)

(19949, 191)


In [130]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [131]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [132]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7831460075860426


In [133]:
optuna_20 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [134]:
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(auc_20)

0.7866776875397565


In [135]:
X_train = X_train.values
y_train = y_train.values

In [136]:
auc_bootstrap = []

In [137]:
rs = RandomState(seed = 20)
bootstrap_auc(optuna_20, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78004066, 0.78834421])

In [138]:
t_20 = auc_bootstrap
print(t_20)

[0.7842743341511813, 0.7811708171929846, 0.7824769702848521, 0.7836617405582923, 0.7829393873113084, 0.7857560234407526, 0.7821644656620025, 0.7856355651922154, 0.782623976897376, 0.7853618925047496, 0.7880473717912141, 0.7833827846143117, 0.784267730080538, 0.7820722728358196, 0.7849214009928295, 0.7811627602267996, 0.785819158356104, 0.7871817102112667, 0.7771916004674624, 0.7856403201230787, 0.7846396713391787, 0.7867424074320626, 0.7850936351552115, 0.7868536199816987, 0.7836292485307264, 0.7853540997013904, 0.7826748282413307, 0.7844941176221963, 0.7862177800601446, 0.781910869349293, 0.7827937015129134, 0.7837117994137699, 0.7861541168191415, 0.787528027675811, 0.783276062832713, 0.7877884922219898, 0.7787575576984443, 0.7851344483117882, 0.7845797063777359, 0.7842022176997546, 0.7830945829714303, 0.7851275800783191, 0.7828784977799756, 0.7844729845961372, 0.785113315285729, 0.7844640030600623, 0.7838326539065457, 0.7828359675650316, 0.7829997485169898, 0.7822714516064271, 0.7853

In [223]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [53]:
column_to_drop_20 = 'Cat_현재 공공기관 접근용이성'

In [54]:
if not column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(column_to_drop_20, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']
else:
    comp_21 = comp_20.drop(comp_20.filter(regex='^' + column_to_drop_20).columns, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']

print(X_21.shape)

(19949, 187)


In [226]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [227]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [228]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.781512230419544


In [229]:
optuna_21 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [230]:
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(auc_21)

0.7882886845325268


In [231]:
X_train = X_train.values
y_train = y_train.values

In [232]:
auc_bootstrap = []

In [233]:
rs = RandomState(seed = 21)
bootstrap_auc(optuna_21, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78230563, 0.78967835])

In [234]:
t_21 = auc_bootstrap
print(t_21)

[0.7844515874072523, 0.7843909620387454, 0.7881939821594994, 0.7862642727174749, 0.7875084796267062, 0.7890398315275162, 0.7872555437210611, 0.7862435359356541, 0.7842407854723126, 0.791060677144421, 0.7883988404308602, 0.784711259464954, 0.787143142438709, 0.7882732310072211, 0.787331358452048, 0.7839227334301225, 0.7905899389889538, 0.7869035467557635, 0.7881678300397512, 0.7862526495531421, 0.7875338392579772, 0.7847140331746243, 0.7851909791564964, 0.7883903872204364, 0.7870950648044244, 0.7871737853264946, 0.787436891500931, 0.7862604423565015, 0.7873687374918902, 0.7879403858467898, 0.7847971123833192, 0.7856355651922154, 0.7859089737168554, 0.7863823535005802, 0.7853949128579671, 0.785331249616964, 0.7874269853949657, 0.790162259374082, 0.7859917887627248, 0.7845976694498862, 0.7818049400561716, 0.7877698687427752, 0.7850438404625597, 0.7861953262199568, 0.7880653348633644, 0.7845658378293847, 0.7850662943027474, 0.7835535658811521, 0.7841337995278882, 0.7844699467236413, 0.7848

In [235]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [55]:
column_to_drop_21 = '현재 주택 거주 기간(총 개월)'

In [56]:
if not column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(column_to_drop_21, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']
else:
    comp_22 = comp_21.drop(comp_21.filter(regex='^' + column_to_drop_21).columns, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']

print(X_22.shape)

(19949, 186)


In [58]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [59]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [60]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 111, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7837668263981736


In [61]:
optuna_22 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [62]:
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(auc_22)

0.7887688004683078


In [63]:
X_train = X_train.values
y_train = y_train.values

In [64]:
auc_bootstrap = []

In [65]:
rs = RandomState(seed = 22)
bootstrap_auc(optuna_22, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78165055, 0.78953185])

In [66]:
t_22 = auc_bootstrap
print(t_22)

[0.7846077076372642, 0.7851685253163084, 0.7886854570967872, 0.783846390373484, 0.7889233357213652, 0.7847492989118604, 0.7861126432555003, 0.7852609823053173, 0.7900246305418719, 0.7837858970863897, 0.7820527247867151, 0.7880164647406027, 0.7853892333572137, 0.7822590359536172, 0.7853880446244978, 0.7870810641746602, 0.7841903303725963, 0.7826983387328215, 0.7845181564393386, 0.7864114114114114, 0.7870776300579255, 0.7873449628375737, 0.7833344428172014, 0.7854933135105548, 0.7882750801470013, 0.784453964872684, 0.787922951100291, 0.7842922972233318, 0.7871441990900119, 0.7877729066152712, 0.7856219608066899, 0.7860368285245133, 0.7858603677569194, 0.7856733404762961, 0.7861229456057042, 0.7849939136884949, 0.787171407861063, 0.7846510303406854, 0.7840027747663215, 0.7872995268315465, 0.7874157584748718, 0.7881846043791857, 0.7865810039455359, 0.7870538554036091, 0.7878685335581888, 0.7893489020336311, 0.7864757350594789, 0.7872145984830714, 0.7842278414938514, 0.7837008366565017, 0.7

In [67]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [57]:
column_to_drop_22 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [58]:
if not column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(column_to_drop_22, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']
else:
    comp_23 = comp_22.drop(comp_22.filter(regex='^' + column_to_drop_22).columns, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']

print(X_23.shape)

(19949, 182)


In [70]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [71]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200)

In [72]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 141, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.7837404085763808


In [73]:
optuna_23 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [74]:
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(auc_23)

0.7878801567225213


In [75]:
X_train = X_train.values
y_train = y_train.values

In [76]:
auc_bootstrap = []

In [77]:
rs = RandomState(seed = 23)
bootstrap_auc(optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78076078, 0.78865131])

In [78]:
t_23 = auc_bootstrap
print(t_23)

[0.7855877517207567, 0.7852216220442821, 0.787893629026634, 0.7866327798593807, 0.7848002823372282, 0.7855000496626113, 0.7849162498177276, 0.7844037739357936, 0.7845531580137491, 0.7840674946586277, 0.7867265576625182, 0.78510829619204, 0.7852842286339823, 0.78616877785597, 0.7862951797680862, 0.783754197547301, 0.7839517913409539, 0.7858896898305765, 0.7865360962651603, 0.7847712244263968, 0.7854068001851253, 0.7836324184846353, 0.7867508606424862, 0.7848126979900379, 0.7854971438715281, 0.7834755057661462, 0.7847239392805895, 0.7881575276895474, 0.7853307212913124, 0.7840643247047188, 0.7864839241070767, 0.7867321050818588, 0.784890626023631, 0.7863234451904403, 0.7835653211268974, 0.7862732542535498, 0.7839013362412377, 0.7859155777874991, 0.7877266781207669, 0.7827592282641544, 0.7859578438396173, 0.78380478472843, 0.7842384080068809, 0.7876310511778493, 0.7842804098961734, 0.785924559323574, 0.7859659008058024, 0.7849194197716365, 0.7863583146834379, 0.783494789652425, 0.78664889

In [79]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [59]:
column_to_drop_23 = 'Cat_이사 예상 기간'

In [60]:
if not column_to_drop_23.startswith('Cat_'):
    comp_24 = comp_23.drop(column_to_drop_23, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']
else:
    comp_24 = comp_23.drop(comp_23.filter(regex='^' + column_to_drop_23).columns, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']

print(X_24.shape)

(19949, 178)


In [82]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [83]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [84]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 190, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 3}
0.784177953749824


In [85]:
optuna_24 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [86]:
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(auc_24)

0.7909124817991813


In [87]:
X_train = X_train.values
y_train = y_train.values

In [88]:
auc_bootstrap = []

In [89]:
rs = RandomState(seed = 24)
bootstrap_auc(optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7838124 , 0.79067947])

In [90]:
t_24 = auc_bootstrap
print(t_24)

[0.7857066249923392, 0.7893100700982475, 0.7825187080113187, 0.7891692713121284, 0.7872097114707952, 0.7847009571147503, 0.7873536802108231, 0.7850660301399218, 0.7892588225100541, 0.7885418846009978, 0.7872546191511709, 0.7889212224187594, 0.7904129498957085, 0.787319339043477, 0.7864927735617392, 0.7887590264437554, 0.7874030786592363, 0.7883559139716775, 0.7870485721470943, 0.7868303736530338, 0.7915007724121024, 0.7896563875627914, 0.7888969194387914, 0.7879271777055028, 0.7881046951243996, 0.7871148770163547, 0.7887389500689993, 0.7875124420690924, 0.7890046978716929, 0.7893441471027678, 0.7882634569826688, 0.7868586390753879, 0.7868834703810075, 0.7878223050636843, 0.78594278655855, 0.7860666789238218, 0.7864660931163394, 0.7893253915421403, 0.7872480150805273, 0.7826740357528535, 0.7903405692814559, 0.7896854454736228, 0.7870958572929017, 0.7860336585706045, 0.7847321283281874, 0.7883574989486319, 0.7852229428584109, 0.7843722064781178, 0.784741770271327, 0.7870514779381774, 0.7

In [91]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [61]:
column_to_drop_24 = 'Cat_현재 상업시설 접근용이성'

In [62]:
if not column_to_drop_24.startswith('Cat_'):
    comp_25 = comp_24.drop(column_to_drop_24, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']
else:
    comp_25 = comp_24.drop(comp_24.filter(regex='^' + column_to_drop_24).columns, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']

print(X_25.shape)

(19949, 174)


In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [95]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [96]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.783883229925448


In [97]:
optuna_25 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [98]:
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(auc_25)

0.791188267789253


In [99]:
X_train = X_train.values
y_train = y_train.values

In [100]:
auc_bootstrap = []

In [101]:
rs = RandomState(seed = 25)
bootstrap_auc(optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7838901, 0.7910033])

In [102]:
t_25 = auc_bootstrap
print(t_25)

[0.7864082414575025, 0.7876527125295597, 0.7855534105534105, 0.78862483172828, 0.7832359100832008, 0.7873856439127375, 0.7870601953114269, 0.7885524511140276, 0.7886055478420011, 0.7855462781571156, 0.788614793540902, 0.7891087780250341, 0.7840035672547988, 0.7879638963382806, 0.792362075305425, 0.7853585904694279, 0.7885993400155962, 0.785587487557931, 0.7890081319884276, 0.7891394209128199, 0.7885524511140275, 0.7873421891279034, 0.7884969769206223, 0.7890939849067928, 0.7866651398055339, 0.7873655675379814, 0.7848203587119844, 0.7869184719554176, 0.7861563622031602, 0.7904951045345133, 0.785581411812939, 0.7833125173026652, 0.7893174666573681, 0.7877958887811105, 0.785754438463798, 0.7889711491928241, 0.7874305515931131, 0.7884549750313298, 0.7877505848564963, 0.7877808315000434, 0.7842661451035836, 0.7884063690713936, 0.7884541825428524, 0.7874714968311027, 0.7866256474630858, 0.786269027648338, 0.7862307240386057, 0.7890509263661973, 0.7832681379479408, 0.789029000851661, 0.787115

In [103]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [63]:
column_to_drop_25 = 'Cat_현재 교육환경'

In [64]:
if not column_to_drop_25.startswith('Cat_'):
    comp_26 = comp_25.drop(column_to_drop_25, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']
else:
    comp_26 = comp_25.drop(comp_25.filter(regex='^' + column_to_drop_25).columns, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']

print(X_26.shape)

(19949, 170)


In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [66]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [67]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 51, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.785055314378271


In [68]:
optuna_26 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [69]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(auc_26)

0.7885693575348748


In [70]:
X_train = X_train.values
y_train = y_train.values

In [71]:
auc_bootstrap = []

In [72]:
rs = RandomState(seed = 26)
bootstrap_auc(optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78278608, 0.79083156])

In [73]:
t_26 = auc_bootstrap
print(t_26)

[0.7858272153222893, 0.7821211429585813, 0.7868398835147603, 0.7870090798046464, 0.78865521045324, 0.7892096882244665, 0.7837383477777566, 0.7895730441912707, 0.786271933439421, 0.7875850868461706, 0.7867718615871324, 0.7877410749947696, 0.7856742650461862, 0.785556976751558, 0.7861308704904764, 0.7859625987704805, 0.7856633022889181, 0.7869178115483533, 0.7879063088422693, 0.7898620383226296, 0.7855322775273513, 0.7869336613178978, 0.7862486871107561, 0.7860009023802128, 0.7865559084770909, 0.7855615996010084, 0.7897694492522078, 0.7880712785269435, 0.7876022574298437, 0.7867759561109314, 0.7850985221674878, 0.7897944126392402, 0.7907014157014158, 0.7824352325583853, 0.7858620848152869, 0.7872748276073399, 0.7857058325038621, 0.7887124017050126, 0.7860203183479045, 0.7860940197762858, 0.7893467887310252, 0.7866385914415471, 0.785064841407206, 0.7875071588125775, 0.7876166543038464, 0.7819209075366711, 0.786738048745438, 0.7828160232716883, 0.7841592912405722, 0.7852543782346738, 0.785

In [74]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [65]:
column_to_drop_26 = 'Cat_현재 대중교통 접근용이성'

In [66]:
if not column_to_drop_26.startswith('Cat_'):
    comp_27 = comp_26.drop(column_to_drop_26, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']
else:
    comp_27 = comp_26.drop(comp_26.filter(regex='^' + column_to_drop_26).columns, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']

print(X_27.shape)

(19949, 166)


In [67]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [68]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [69]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 111, 'max_depth': 9, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.783109270302612


In [70]:
optuna_27 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [81]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(auc_27)

0.7887616680720129


In [71]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(auc_27)

0.7887616680720129


In [72]:
X_train = X_train.values
y_train = y_train.values

In [73]:
auc_bootstrap = []

In [74]:
rs = RandomState(seed = 27)
bootstrap_auc(optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78373715, 0.79055085])

In [75]:
t_27 = auc_bootstrap
print(t_27)

[0.7874670060630653, 0.7890506622033715, 0.7846594835511091, 0.7844082647038313, 0.7908757631664035, 0.7854090455691441, 0.787437948152234, 0.787149218183701, 0.7861152848837578, 0.7863559372180061, 0.7886068686561296, 0.7855874875579308, 0.788280495484929, 0.7847424306783913, 0.7846227649183315, 0.7884045199316135, 0.785486049032847, 0.7872340144507631, 0.7879575564304628, 0.786561587977844, 0.7899373247279651, 0.7811849499041617, 0.7854947664060965, 0.7897406555042024, 0.785945956512459, 0.7861350970956882, 0.7874339857098478, 0.7863081237465473, 0.7847582804479356, 0.7879216302861622, 0.7863583146834379, 0.7863662395682101, 0.7873153766010909, 0.7869377558416967, 0.7897946768020658, 0.7860946801833502, 0.7891317601908736, 0.788359612251238, 0.7878199275982527, 0.790333436885161, 0.7874128526837886, 0.7858509899766057, 0.7849430623445401, 0.7891531573797584, 0.786965889182638, 0.7827560583102455, 0.7845313645806257, 0.7846028206249881, 0.7863910708738295, 0.7872472225920502, 0.784887

In [76]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [67]:
column_to_drop_27 = 'Cat_현재 의료시설 접근용이성'

In [68]:
if not column_to_drop_27.startswith('Cat_'):
    comp_28 = comp_27.drop(column_to_drop_27, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']
else:
    comp_28 = comp_27.drop(comp_27.filter(regex='^' + column_to_drop_27).columns, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']

print(X_28.shape)

(19949, 162)


In [69]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [70]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [71]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 111, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.784953770875755


In [72]:
optuna_28 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [73]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(auc_28)

0.7895480808042383


In [74]:
X_train = X_train.values
y_train = y_train.values

In [75]:
auc_bootstrap = []

In [76]:
rs = RandomState(seed = 28)
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78319788, 0.79044818])

In [77]:
t_28 = auc_bootstrap
print(t_28)

[0.7864877544680501, 0.7860196579408402, 0.7895389671867504, 0.7845786497264331, 0.7893716200366447, 0.7860679997379507, 0.7871039142590867, 0.7901073135063283, 0.7856586794394677, 0.7867352750357677, 0.7883214407229187, 0.7873194711248899, 0.7893364863808213, 0.7862317806899087, 0.7842815986288894, 0.7839388473624926, 0.785102352528461, 0.7884920899083461, 0.7880228046484203, 0.7874757234363146, 0.7862164592460158, 0.786000638217387, 0.7891507799143267, 0.7869582284606914, 0.7890641345074844, 0.7868883573932834, 0.7831725110050234, 0.7896962761494781, 0.7877076583973135, 0.7890423410743608, 0.7859302388243274, 0.7851598079430591, 0.787904723865315, 0.7863016517573167, 0.7898486980999296, 0.7865236806123506, 0.7880344278127529, 0.784753789679898, 0.7890534359130419, 0.7874543262474297, 0.785192432052038, 0.7830042392850274, 0.788099147705059, 0.7859005205064319, 0.7870768375694485, 0.7895277402666565, 0.7891358547146725, 0.7895336839302357, 0.7874646285976334, 0.7829915594693919, 0.788

In [78]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [79]:
column_to_drop_28 = 'Cat_현재 주변도로의 보행 안전'

In [80]:
if not column_to_drop_28.startswith('Cat_'):
    comp_29 = comp_28.drop(column_to_drop_28, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']
else:
    comp_29 = comp_28.drop(comp_28.filter(regex='^' + column_to_drop_28).columns, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']

print(X_29.shape)

(19949, 158)


In [81]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [82]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [83]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 68, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.7837825119798632


In [84]:
optuna_29 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [85]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(auc_29)

0.7903873261016117


In [86]:
X_train = X_train.values
y_train = y_train.values

In [87]:
auc_bootstrap = []

In [88]:
rs = RandomState(seed = 29)
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7826148 , 0.79082121])

In [89]:
t_29 = auc_bootstrap
print(t_29)

[0.7858754250379867, 0.7902456027456027, 0.789061492879227, 0.7884920899083461, 0.7908525168377384, 0.7874950073225936, 0.786317633608274, 0.7885476961831641, 0.783797652332135, 0.7864213175173765, 0.788757441466801, 0.7861210964659241, 0.7855141823737883, 0.7868673564486373, 0.7837485180465477, 0.7885358088560059, 0.7878983839574971, 0.788012106053978, 0.7881016572519036, 0.7883733487181763, 0.7867498039911833, 0.7907138313542255, 0.789509116787442, 0.7866245908117828, 0.7840398896433378, 0.7895188908119943, 0.7882342669904245, 0.7868508462720285, 0.784337601147946, 0.783717743077349, 0.7812885017318514, 0.7866208925322227, 0.7879245360772454, 0.7822847918291267, 0.7856548490784945, 0.787075252592494, 0.7850627281046, 0.7888158214512895, 0.7856935489324652, 0.786321728132073, 0.7867717295057197, 0.7895908751820082, 0.786954662262544, 0.7887801594698146, 0.7795721037716111, 0.788684532526897, 0.784129969166915, 0.7867104437301481, 0.7857084741321194, 0.7866981601587513, 0.7900795764096

In [90]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [91]:
column_to_drop_29 = '가구주 나이'

In [92]:
if not column_to_drop_29.startswith('Cat_'):
    comp_30 = comp_29.drop(column_to_drop_29, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']
else:
    comp_30 = comp_29.drop(comp_29.filter(regex='^' + column_to_drop_29).columns, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']

print(X_30.shape)

(19949, 157)


In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [94]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [95]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7832667452872051


In [96]:
optuna_30 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [97]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(auc_30)

0.7911055848247966


In [20]:
X_train = X_train.values
y_train = y_train.values

In [21]:
auc_bootstrap = []

In [22]:
rs = RandomState(seed = 30)
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78343724, 0.79082359])

In [23]:
np.mean(auc_bootstrap)

0.7872559729856528

In [24]:
t_30 = auc_bootstrap
print(t_30)

[0.7867004055427701, 0.787035760250046, 0.787026646632558, 0.7857060966666878, 0.7876206167462325, 0.7876067481978811, 0.789546495827284, 0.7878689298024275, 0.7886397569279342, 0.7886777963748407, 0.7892190660047803, 0.7882678156692935, 0.785827479485115, 0.7911226233270569, 0.788312987512495, 0.7856404522044915, 0.7890609645535754, 0.782833458018187, 0.7896064607887268, 0.7877870393264482, 0.7861360216655782, 0.7879946713074791, 0.7877363200639063, 0.7878578349637462, 0.7871736532450818, 0.7865736073864154, 0.7859042187859921, 0.7908288742648348, 0.7867503323168348, 0.7840117563023967, 0.7883647634263398, 0.7921245929250855, 0.7878331357395396, 0.784584593390012, 0.7868820174854658, 0.7894788701438948, 0.7849729127438487, 0.7882622682499529, 0.7875975024989804, 0.7849021171065506, 0.7847745264617185, 0.7915059235872044, 0.7866733288531318, 0.7894653978397821, 0.7877907376060085, 0.7864910565033718, 0.7835764159655785, 0.7848471712387969, 0.788925581105384, 0.787453005433301, 0.789631

In [18]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [19]:
# 31
column_to_drop_30 = 'Cat_현재 문화시설 접근용이성'

In [20]:
if not column_to_drop_30.startswith('Cat_'):
    comp_31 = comp_30.drop(column_to_drop_30, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']
else:
    comp_31 = comp_30.drop(comp_30.filter(regex='^' + column_to_drop_30).columns, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']

print(X_31.shape)

(19949, 153)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [22]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [30]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 68, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7830869802654743


In [31]:
optuna_31= ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [32]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(auc_31)

0.7901028227382907


In [33]:
X_train = X_train.values
y_train = y_train.values

In [34]:
auc_bootstrap = []

In [35]:
rs = RandomState(seed = 31)
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78277573, 0.79081759])

In [36]:
np.mean(auc_bootstrap)

0.7869205215287419

In [37]:
t_31 = auc_bootstrap
print(t_31)

[0.7890266233862293, 0.7883099496399989, 0.7870217596202818, 0.786075528378484, 0.7864334690073607, 0.7793973600623847, 0.7856770387558565, 0.7865703053510935, 0.7872876395043884, 0.7912924800240071, 0.7844362659633596, 0.7837629149205504, 0.7881530369215098, 0.7851023525284608, 0.7871091975156015, 0.786742935757714, 0.7881015251704907, 0.7860476592003687, 0.7857959120274391, 0.7858702738628847, 0.7854441792249675, 0.785504408349236, 0.7867083304275423, 0.7854749541941659, 0.7862107797452624, 0.7864280536694329, 0.7878583632893978, 0.7817794483434877, 0.7883914438717394, 0.7881676979583383, 0.7886912686789534, 0.7851862242256331, 0.7853613641790983, 0.7909258220218812, 0.7895884977165766, 0.7862136855363455, 0.7863972787002343, 0.7867351429543548, 0.7870547799734992, 0.7858470275342195, 0.7854486699930051, 0.784141064005596, 0.7888587479104721, 0.7839369982227126, 0.7861704949143373, 0.7864668856048165, 0.785583260952719, 0.7882892128581784, 0.7905820141041817, 0.7913964280959355, 0.78

In [38]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [17]:
# 32
column_to_drop_31 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [18]:
if not column_to_drop_31.startswith('Cat_'):
    comp_32 = comp_31.drop(column_to_drop_31, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']
else:
    comp_32 = comp_31.drop(comp_31.filter(regex='^' + column_to_drop_31).columns, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']

print(X_32.shape)

(19949, 149)


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [42]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [43]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7821394472978903


In [44]:
optuna_32 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [45]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(auc_32)

0.7906360354020453


In [46]:
X_train = X_train.values
y_train = y_train.values

In [47]:
auc_bootstrap = []

In [48]:
rs = RandomState(seed = 32)
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78346257, 0.7909263 ])

In [49]:
np.mean(auc_bootstrap)

0.7872039574100843

In [50]:
t_32 = auc_bootstrap
print(t_32)

[0.7816151390658781, 0.787297149366115, 0.7862382526791393, 0.7874170792890005, 0.7879977091799752, 0.7890374540620846, 0.7876157297339562, 0.7876828270916941, 0.7840686833913435, 0.7845630641197143, 0.7882140585342556, 0.7865182652744229, 0.7849348732969423, 0.7848700213232234, 0.7877035638735147, 0.7872929227609031, 0.7862843490922309, 0.7874662135745879, 0.7838821844363717, 0.7892180093534773, 0.7897322022937787, 0.7889467141314432, 0.7857153423655887, 0.7895284006737209, 0.7866650077241211, 0.786643742616649, 0.7862502720877105, 0.7867529739450922, 0.7890102452910335, 0.786290160674397, 0.7858391026494476, 0.7867973532998164, 0.7904947082902748, 0.7867908813105857, 0.7856924922811622, 0.7905446350643396, 0.7903482300034024, 0.7894850779702997, 0.7904159877682044, 0.7864874903052244, 0.7871921446428835, 0.7855527501463462, 0.7855524859835206, 0.7845172318694487, 0.787568444588149, 0.782886158501922, 0.7888447472807079, 0.783707572808558, 0.7869228306420424, 0.7902824534597933, 0.788

In [51]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [19]:
# 33.
column_to_drop_32 = '부채 중 금융기관 대출금의 비중'

In [20]:
if not column_to_drop_32.startswith('Cat_'):
    comp_33 = comp_32.drop(column_to_drop_32, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']
else:
    comp_33 = comp_32.drop(comp_32.filter(regex='^' + column_to_drop_32).columns, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']

print(X_33.shape)

(19949, 148)


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [55]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [56]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7824467608654644


In [57]:
optuna_33 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [58]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(auc_33)

0.7904306488050331


In [59]:
X_train = X_train.values
y_train = y_train.values

In [60]:
auc_bootstrap = []

In [61]:
rs = RandomState(seed = 33)
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78268913, 0.79028418])

In [62]:
np.mean(auc_bootstrap)

0.7864974662162163

In [63]:
t_33 = auc_bootstrap
print(t_33)

[0.7880694293871633, 0.7872042961328676, 0.7826202786178156, 0.7877154512006729, 0.7858017236096053, 0.7844352093120566, 0.7891513082399783, 0.7892222359586891, 0.7880370694410104, 0.7840202095128204, 0.7845822159245806, 0.7852772283191002, 0.7817542207936297, 0.7844436625224802, 0.7839356774085838, 0.7861225493614656, 0.7851966586572499, 0.7860629806442614, 0.7869015655345705, 0.7892079711660993, 0.7860678676565377, 0.7879856897714042, 0.7847849608933353, 0.7850689359310049, 0.7871502748350039, 0.7831850587392459, 0.7910733569600565, 0.7864437713575645, 0.7842732774998784, 0.7872629402801816, 0.7870683843590247, 0.7873773227837266, 0.7879085542262881, 0.783779425097159, 0.7868812249969886, 0.7861012842539936, 0.7858217999843616, 0.7864030902824005, 0.7855149748622656, 0.7875467832364385, 0.7861535884934899, 0.7853630812374655, 0.7865095479011734, 0.7871317834372022, 0.7898571513103534, 0.783185983309136, 0.7866956506119067, 0.7866607811189092, 0.7860216391620333, 0.788864823655464, 0.

In [64]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [21]:
# 34
column_to_drop_33 = 'Cat_현재 주택의 위치'

In [22]:
if not column_to_drop_33.startswith('Cat_'):
    comp_34 = comp_33.drop(column_to_drop_33, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']
else:
    comp_34 = comp_33.drop(comp_33.filter(regex='^' + column_to_drop_33).columns, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']

print(X_34.shape)

(19949, 144)


In [67]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [68]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [69]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 112, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7840499924255151


In [70]:
optuna_34 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [71]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(auc_34)

0.7904457060861002


In [72]:
X_train = X_train.values
y_train = y_train.values

In [73]:
auc_bootstrap = []

In [74]:
rs = RandomState(seed = 34)
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78392007, 0.79100315])

In [75]:
np.mean(auc_bootstrap)

0.7876735412400437

In [76]:
t_34 = auc_bootstrap
print(t_34)

[0.7902961899267318, 0.7911420392947487, 0.7874795537972878, 0.7859130682406544, 0.7876609015771577, 0.7852699638413924, 0.7854569911220157, 0.7877435845416141, 0.7905608810781225, 0.7881728491334403, 0.7865675316414233, 0.7916121170431516, 0.7892178772720645, 0.7896808226241724, 0.7852877948321297, 0.7896532176088825, 0.7849384394950898, 0.7887501769890933, 0.7842450120775244, 0.7885483565902285, 0.7892424444148581, 0.7862057606515733, 0.788926637756687, 0.7884646169744691, 0.7870125139213809, 0.7871242547966686, 0.7888451435249465, 0.7881658488185582, 0.7853546280270418, 0.7828779694543241, 0.7869473977848362, 0.7887386859061736, 0.7837803496670491, 0.7904372528756765, 0.7892622566267886, 0.7871850122465887, 0.7897812044979532, 0.7868196750585914, 0.7893566948369904, 0.7877394900178152, 0.7881851327048373, 0.7874634398649177, 0.7865366245908119, 0.7902376778608307, 0.7869662854268765, 0.7859256159748771, 0.7877616796951772, 0.7860911139852026, 0.7908605738039236, 0.7893235424023601, 

In [77]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [23]:
# 35
column_to_drop_34 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [24]:
if not column_to_drop_34.startswith('Cat_'):
    comp_35 = comp_34.drop(column_to_drop_34, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']
else:
    comp_35 = comp_34.drop(comp_34.filter(regex='^' + column_to_drop_34).columns, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']

print(X_35.shape)

(19949, 140)


In [80]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [81]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [82]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7845696805135954


In [83]:
optuna_35 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [84]:
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(auc_35)

0.7899358718324234


In [85]:
X_train = X_train.values
y_train = y_train.values

In [86]:
auc_bootstrap = []

In [87]:
rs = RandomState(seed = 35)
bootstrap_auc(optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78328937, 0.79055127])

In [88]:
np.mean(auc_bootstrap)

0.7868798484445035

In [89]:
t_35 = auc_bootstrap
print(t_35)

[0.7856116584564861, 0.7867833526700522, 0.7860945481019372, 0.7833988985466818, 0.7880729955853109, 0.7856548490784945, 0.788415218526056, 0.7849520438806152, 0.785645207135355, 0.7862673105899707, 0.7881276772902389, 0.7846428412930875, 0.7865699091068549, 0.7876968277214582, 0.7903837599034642, 0.783875844528554, 0.7882567208306125, 0.7855034837793459, 0.7844020568774263, 0.7850063293413047, 0.7888979760900943, 0.7861681174489057, 0.7856663401614141, 0.7852920214373417, 0.7896374999207512, 0.7859260122191156, 0.7872872432601496, 0.7857631558370475, 0.7881626788646493, 0.7862243841307881, 0.7882001899859044, 0.7905783158246212, 0.787446005118419, 0.7899776095588904, 0.7866161376013593, 0.7896960119866524, 0.7884511446703565, 0.784565441585146, 0.7859504472804966, 0.7870714222315207, 0.7896219143140326, 0.7902014875537043, 0.7856745292090119, 0.7862996705361237, 0.7852275657078613, 0.7855851100924992, 0.7863881650827464, 0.7884203697011579, 0.7912447986339612, 0.7873063950650158, 0.78

In [90]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [25]:
# 36
column_to_drop_35 = 'Cat_이사 계획 첫 번째 이유'

In [26]:
if not column_to_drop_35.startswith('Cat_'):
    comp_36 = comp_35.drop(column_to_drop_35, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']
else:
    comp_36 = comp_35.drop(comp_35.filter(regex='^' + column_to_drop_35).columns, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']

print(X_36.shape)

(19949, 127)


In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [94]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [95]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7831334178428445


In [96]:
optuna_36 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [97]:
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(auc_36)

0.7874971206251995


In [98]:
X_train = X_train.values
y_train = y_train.values

In [99]:
auc_bootstrap = []

In [100]:
rs = RandomState(seed = 36)
bootstrap_auc(optuna_36, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78300867, 0.79025218])

In [101]:
np.mean(auc_bootstrap)

0.7867391778433959

In [102]:
t_36 = auc_bootstrap
print(t_36)

[0.7864009769797946, 0.7857517968355408, 0.7866801970866011, 0.7858525749535602, 0.7869296988755117, 0.7868562616099561, 0.7849180989575079, 0.7873337359174797, 0.7900091770165663, 0.7888164818583537, 0.7846923718229136, 0.7857298713210042, 0.7863464273562796, 0.7847553746568525, 0.7880856754009464, 0.7881375833962041, 0.7856648872658726, 0.7851245422058231, 0.7860892648454224, 0.7837281775089657, 0.7864577719873286, 0.7847129765233212, 0.7866658002125982, 0.7857593254760742, 0.787991897597809, 0.7859139928105445, 0.786255819507051, 0.7861032654751867, 0.7859431828027889, 0.788441502727217, 0.7889617714125103, 0.7861929487545252, 0.7863819572563415, 0.7860887365197711, 0.7861846276255143, 0.7851341841489625, 0.784309203644179, 0.7865154915647526, 0.784369961094099, 0.7867186327777461, 0.7855464102385284, 0.7838809957036558, 0.7893903755972721, 0.7874729497266442, 0.7882571170748511, 0.7813365793661361, 0.7868106935225161, 0.7854642555997235, 0.7860249411973551, 0.7894479630932832, 0.78

In [103]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [27]:
# 37
column_to_drop_36 = 'Cat_현재 주차시설 이용편의성'

In [28]:
if not column_to_drop_36.startswith('Cat_'):
    comp_37 = comp_36.drop(column_to_drop_36, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']
else:
    comp_37 = comp_36.drop(comp_36.filter(regex='^' + column_to_drop_36).columns, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']

print(X_37.shape)

(19949, 123)


In [106]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [107]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [108]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7818701093491434


In [109]:
optuna_37 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [110]:
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(auc_37)

0.7892403311122524


In [111]:
X_train = X_train.values
y_train = y_train.values

In [112]:
auc_bootstrap = []

In [113]:
rs = RandomState(seed = 37)
bootstrap_auc(optuna_37, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78406368, 0.79108146])

In [114]:
np.mean(auc_bootstrap)

0.7875593427259068

In [115]:
t_37 = auc_bootstrap
print(t_37)

[0.7846358409782055, 0.7840957600809818, 0.7862708767881181, 0.7924117379166641, 0.785738324531428, 0.7869438315866887, 0.7894842854818225, 0.7906109399336, 0.7863242376789175, 0.7869310196896404, 0.7876639394496536, 0.7873653033751555, 0.7851237497173458, 0.7888509551071128, 0.7898052433150956, 0.7863509181243171, 0.7890877770803879, 0.7867435961647784, 0.7885791315594272, 0.7869777765097962, 0.7880044453320315, 0.7864588286386316, 0.7908144773908321, 0.7875601234591383, 0.7871901634216907, 0.7885667159066173, 0.7869979849659652, 0.7872072019239507, 0.7856939451767038, 0.7869991736986812, 0.7861946658128924, 0.7900732365018079, 0.7905109543040577, 0.7896590291910489, 0.7866706872248745, 0.7894927386922461, 0.7873552651877775, 0.7880143514379967, 0.7878254750175933, 0.7879645567453448, 0.7856434900769875, 0.7837338570097191, 0.7859302388243274, 0.7871962391666826, 0.7858482162669355, 0.7846403317462431, 0.7861465881786078, 0.7853270230117521, 0.7924426449672755, 0.7908255722295131, 0.7

In [116]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [29]:
# 38
column_to_drop_37 = 'Cat_현재 대기오염 정도'

In [30]:
if not column_to_drop_37.startswith('Cat_'):
    comp_38 = comp_37.drop(column_to_drop_37, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']
else:
    comp_38 = comp_37.drop(comp_37.filter(regex='^' + column_to_drop_37).columns, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']

print(X_38.shape)

(19949, 119)


In [119]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [120]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [121]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7834617831121597


In [122]:
optuna_38 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [123]:
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(auc_38)

0.7905763346034282


In [124]:
X_train = X_train.values
y_train = y_train.values

In [125]:
auc_bootstrap = []

In [126]:
rs = RandomState(seed = 38)
bootstrap_auc(optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78401542, 0.79108782])

In [127]:
np.mean(auc_bootstrap)

0.787656125843472

In [128]:
t_38 = auc_bootstrap
print(t_38)

[0.7901699200960284, 0.785070917152198, 0.7926723345442557, 0.7879166111924734, 0.7906950757935979, 0.7884002933264017, 0.7868522991675702, 0.7906174119228306, 0.7869895317555416, 0.7874475900953735, 0.7866791404352982, 0.787054912054912, 0.7884062369899808, 0.7879213661233366, 0.7886788530261437, 0.7867385770710893, 0.7848103205246062, 0.7889982258824624, 0.7861942695686537, 0.7883643671821012, 0.7892675398833036, 0.7879730099557686, 0.7881584522594376, 0.7888287654297508, 0.7875948608707228, 0.789582950297236, 0.7872193534139347, 0.787989784295203, 0.7861699665886858, 0.7890949094766828, 0.7882040203468775, 0.7846037451948782, 0.7880955815069115, 0.7862578007282441, 0.789360128953725, 0.788824274661713, 0.7864049394221808, 0.7872859224460209, 0.7868205996284814, 0.7874487788280893, 0.7876116352101574, 0.7876418818537045, 0.7860588861204625, 0.7876435989120718, 0.7872103718778596, 0.7835258287844495, 0.7886278696007761, 0.7885169212139655, 0.787842909764092, 0.7881028459846193, 0.7875

In [129]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [31]:
# 38
column_to_drop_38 = 'Cat_이사 계획 중인 주택의 유형'

In [32]:
if not column_to_drop_38.startswith('Cat_'):
    comp_39 = comp_38.drop(column_to_drop_38, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']
else:
    comp_39 = comp_38.drop(comp_38.filter(regex='^' + column_to_drop_38).columns, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']

print(X_39.shape)

(19949, 100)


In [132]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [133]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [134]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.783444652805841


In [135]:
optuna_39 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [136]:
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(auc_39)

0.7910877538340592


In [137]:
X_train = X_train.values
y_train = y_train.values

In [138]:
auc_bootstrap = []

In [139]:
rs = RandomState(seed = 39)
bootstrap_auc(optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78427198, 0.79089972])

In [140]:
np.mean(auc_bootstrap)

0.7876705727763301

In [141]:
t_39 = auc_bootstrap
print(t_39)

[0.7890346803524144, 0.7855897329419497, 0.7889074859518209, 0.7890493413892429, 0.7883615934724308, 0.7889181845462634, 0.7867338221402261, 0.7872066735982992, 0.7904597067158644, 0.7902383382678948, 0.7884605224506702, 0.7863628054514754, 0.7853883087873236, 0.7883267239794334, 0.7870846303728076, 0.7902269792663881, 0.7896010454507991, 0.7867130853584056, 0.7879973129357365, 0.7875241973148377, 0.7893951305281355, 0.7866084768794128, 0.7865955329009515, 0.7862138176177585, 0.7906825280593753, 0.7879843689572754, 0.7870640256724001, 0.7903399088743917, 0.7876537691808627, 0.7888484455602682, 0.7900340083221857, 0.7882322857692317, 0.7862515929018392, 0.7860716980175109, 0.7845861783669665, 0.7882234363145694, 0.7853703457151733, 0.7881271489645874, 0.7846025564621623, 0.7887213832410876, 0.7881649242486681, 0.787338622929756, 0.7898558304962245, 0.7886471534870549, 0.7876206167462325, 0.790649375624745, 0.7898419619478734, 0.7869953433377078, 0.7875064984055132, 0.7897905822782669, 0

In [142]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [37]:
# 40
column_to_drop_39 = 'Cat_현재 청소/쓰레기 처리상태'

In [38]:
if not column_to_drop_39.startswith('Cat_'):
    comp_40 = comp_39.drop(column_to_drop_39, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']
else:
    comp_40 = comp_39.drop(comp_39.filter(regex='^' + column_to_drop_39).columns, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']

print(X_40.shape)

(19949, 96)


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [41]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [42]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 155, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 5}
0.7825557343803597


In [43]:
optuna_40 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [44]:
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(auc_40)

0.7909287278129642


In [45]:
X_train = X_train.values
y_train = y_train.values

In [46]:
auc_bootstrap = []

In [47]:
rs = RandomState(seed = 40)
bootstrap_auc(optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78449522, 0.7909625 ])

In [48]:
np.mean(auc_bootstrap)

0.7877139077633228

In [49]:
t_40 = auc_bootstrap
print(t_40)

[0.7880045774134445, 0.7889718095998883, 0.7850013102476157, 0.7879353667531007, 0.7903190400111582, 0.7841824054878243, 0.7883938213371711, 0.7851436940106891, 0.7884327853539675, 0.7879537260694897, 0.7884928823968232, 0.7894648695141305, 0.7875477078063284, 0.7894457177092644, 0.7909609556777044, 0.7883018926738139, 0.7901090305646956, 0.7888039341241312, 0.7877223194341421, 0.7888640311669868, 0.7911094151857698, 0.7874152301492203, 0.7845159110553198, 0.7879324609620176, 0.7885223365518932, 0.7880757692949811, 0.7876620903098736, 0.7849821584427497, 0.7882770613681943, 0.7848566811005234, 0.7872847337133051, 0.7871473690439207, 0.7853387782574975, 0.7897963938604333, 0.7855081066287963, 0.7855954124427031, 0.78592667262618, 0.7883914438717394, 0.7885898301538695, 0.7881167145329707, 0.7898283575623477, 0.7900476127077113, 0.7873419249650777, 0.7872287311942484, 0.7874318724072419, 0.7869117358033615, 0.7847292225371043, 0.7868192788143527, 0.7871052350732153, 0.7904334225147033, 0

In [50]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [51]:
# 41.
column_to_drop_40 = 'Cat_소득 계층'

In [52]:
if not column_to_drop_40.startswith('Cat_'):
    comp_41 = comp_40.drop(column_to_drop_40, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']
else:
    comp_41 = comp_40.drop(comp_40.filter(regex='^' + column_to_drop_40).columns, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']

print(X_41.shape)

(19949, 94)


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [54]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [55]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 111, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7818554557136176


In [56]:
optuna_41 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [57]:
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(auc_41)

0.7894945878320263


In [58]:
X_train = X_train.values
y_train = y_train.values

In [59]:
auc_bootstrap = []

In [60]:
rs = RandomState(seed = 41)
bootstrap_auc(optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78347264, 0.79040561])

In [61]:
np.mean(auc_bootstrap)

0.7871217197581113

In [62]:
t_41 = auc_bootstrap
print(t_41)

[0.7851756577126034, 0.790957389479557, 0.7881885668215718, 0.7847313358397102, 0.7842578239745728, 0.7905361818539158, 0.7880579383042438, 0.7855837892783705, 0.7887109488094709, 0.784076212031877, 0.7870999518167006, 0.7903125680219276, 0.7887802915512274, 0.7852163387877673, 0.7882293799781486, 0.7904723865314998, 0.7878064552941401, 0.7868092406269745, 0.782528746198697, 0.7866277607656917, 0.7857613066972673, 0.7854589723432088, 0.7863943729091512, 0.7888307466509437, 0.7869720970090428, 0.7887312893470529, 0.786543492824281, 0.790149183314208, 0.7885943209219071, 0.7865728148979381, 0.7884153506074688, 0.784747846016319, 0.7864143172024944, 0.7857407019968596, 0.7881930575896093, 0.7869575680536272, 0.7896813509498238, 0.7861988924181043, 0.7887577056296269, 0.7886118877498187, 0.786207213547115, 0.7875833697878034, 0.7830689591773337, 0.7864214495987895, 0.7853415519671677, 0.785847687941284, 0.7869046034070665, 0.7876638073682407, 0.787392247983381, 0.7843574133598764, 0.781533

In [63]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [64]:
# 42.
column_to_drop_41 = 'Cat_현재 주택의 구조'

In [65]:
if not column_to_drop_41.startswith('Cat_'):
    comp_42 = comp_41.drop(column_to_drop_41, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']
else:
    comp_42 = comp_41.drop(comp_41.filter(regex='^' + column_to_drop_41).columns, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']

print(X_42.shape)

(19949, 92)


In [66]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [67]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [68]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7826184767071176


In [69]:
optuna_42 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [70]:
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(auc_42)

0.7893197120413868


In [71]:
X_train = X_train.values
y_train = y_train.values

In [72]:
auc_bootstrap = []

In [73]:
rs = RandomState(seed = 42)
bootstrap_auc(optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7837561 , 0.79071565])

In [74]:
np.mean(auc_bootstrap)

0.7873130887528977

In [75]:
t_42 = auc_bootstrap
print(t_42)

[0.7902796797501233, 0.7915823987252558, 0.7873028288668683, 0.7846051980904197, 0.7905147846650309, 0.7884305399699489, 0.7883885380806563, 0.7873827381216544, 0.7840528336217991, 0.7881684904468156, 0.787892572375331, 0.7866223454277642, 0.7875168007557171, 0.7873876251339306, 0.7887990471118549, 0.7899390417863323, 0.7880349561384044, 0.7895297214878496, 0.7891643842998523, 0.7868427893058435, 0.790002176701684, 0.7857429473808785, 0.7870987630839847, 0.7898007525470578, 0.788963224308052, 0.7855503726809145, 0.7855362399697374, 0.7850324814610528, 0.7881876422516817, 0.7887382896619349, 0.7863964862117572, 0.788675550990822, 0.7889683754831538, 0.7854819545090481, 0.7865732111421766, 0.7913863899085574, 0.7863864480243791, 0.7872651856642005, 0.7854764070897076, 0.7877676233587564, 0.7879925580048733, 0.7871839555952856, 0.787764981730499, 0.7873985878911987, 0.7879030068069477, 0.7845268738125881, 0.7878149085045636, 0.7893877339690148, 0.7880266350093936, 0.7876175788737365, 0.78

In [76]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [77]:
# 43.
column_to_drop_42 = '총 가구원 수'

In [78]:
if not column_to_drop_42.startswith('Cat_'):
    comp_43 = comp_42.drop(column_to_drop_42, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']
else:
    comp_43 = comp_42.drop(comp_42.filter(regex='^' + column_to_drop_42).columns, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']

print(X_43.shape)

(19949, 91)


In [79]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [80]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [81]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 170, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}
0.7828463304200806


In [82]:
optuna_43 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [83]:
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(auc_43)

0.7899282111104771


In [84]:
X_train = X_train.values
y_train = y_train.values

In [85]:
auc_bootstrap = []

In [86]:
rs = RandomState(seed = 43)
bootstrap_auc(optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78432269, 0.79060769])

In [87]:
np.mean(auc_bootstrap)

0.7876499856427503

In [88]:
t_43 = auc_bootstrap
print(t_43)

[0.7888138402300964, 0.7867866547053739, 0.7887731591549326, 0.7877482073910647, 0.7870815925003116, 0.7861671928790156, 0.7880176534733185, 0.789736957224642, 0.7859454281868075, 0.7913273495170047, 0.7862267615962197, 0.7853723269363663, 0.7869452844822303, 0.7898395844824416, 0.7877462261698714, 0.7877890205476412, 0.7840104354882681, 0.7888612574573166, 0.7865672674785975, 0.7839140160568732, 0.7874129847652016, 0.7868771304731895, 0.7882519658997491, 0.7900008558875553, 0.7864470733928861, 0.7859429186399629, 0.7871221414940627, 0.7877003939196058, 0.785609677235293, 0.7877073942344879, 0.7866368743831798, 0.787330301800745, 0.7888879379027163, 0.7873017722155654, 0.7870660068935931, 0.7888193876494369, 0.7877021109779732, 0.7876939219303751, 0.7882341349090117, 0.7878421172756148, 0.7871453878227277, 0.788977357019229, 0.7859979965891296, 0.7877501886122575, 0.7884724097778286, 0.7866261757887374, 0.7867811072860335, 0.7896117440452416, 0.7884281625045173, 0.7855778456147915, 0.7

In [89]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [90]:
## 44
column_to_drop_43 = 'Cat_기초생활보장 수급가구 여부'

In [91]:
if not column_to_drop_43.startswith('Cat_'):
    comp_44 = comp_43.drop(column_to_drop_43, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']
else:
    comp_44 = comp_43.drop(comp_43.filter(regex='^' + column_to_drop_43).columns, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']

print(X_44.shape)

(19949, 89)


In [92]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [93]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [94]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 78, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 4}
0.7823889718802925


In [95]:
optuna_44 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [96]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(auc_44)

0.7895236457428576


In [97]:
X_train = X_train.values
y_train = y_train.values

In [98]:
auc_bootstrap = []

In [99]:
rs = RandomState(seed = 44)
bootstrap_auc(optuna_44, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78331305, 0.79058928])

In [100]:
np.mean(auc_bootstrap)

0.7870097123425326

In [101]:
t_44 = auc_bootstrap
print(t_44)

[0.7882690044020094, 0.7882102281732823, 0.7848689646719204, 0.7861103978714816, 0.7885498094857701, 0.7885686971278104, 0.7863407478555261, 0.7850603506391685, 0.7861946658128924, 0.7875698974836906, 0.7855335983414801, 0.7882707214603766, 0.7892497088925658, 0.7874168151261747, 0.7856053185486683, 0.7886877024808061, 0.786535567939509, 0.7838800711337657, 0.7864919810732618, 0.7840921938828342, 0.7890039053832159, 0.7900865767245078, 0.7854897473124074, 0.7856243382721215, 0.7884156147702946, 0.7867536343521566, 0.787547179480677, 0.785063784755903, 0.7866922164951721, 0.7873787756792683, 0.7852623031194459, 0.787691940709182, 0.7882729668443954, 0.7866874615643089, 0.787032590296137, 0.7885406958682821, 0.7886227184256741, 0.7886178314133979, 0.7883074400931543, 0.7829802004678852, 0.7856806049540039, 0.7860433005137439, 0.7878278524830249, 0.783633739298764, 0.7862712730323567, 0.7883398000393074, 0.7872066735982992, 0.7894681715494524, 0.7900283288214324, 0.7886464930799906, 0.783

In [102]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [103]:
# 45
column_to_drop_44 = '현재 주택의 면적(㎡)'

In [104]:
if not column_to_drop_44.startswith('Cat_'):
    comp_45 = comp_44.drop(column_to_drop_44, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']
else:
    comp_45 = comp_44.drop(comp_44.filter(regex='^' + column_to_drop_44).columns, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']

print(X_45.shape)

(19949, 88)


In [105]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [106]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [107]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7811797373655734


In [108]:
optuna_45 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [109]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(auc_45)

0.7877180928289302


In [110]:
X_train = X_train.values
y_train = y_train.values

In [111]:
auc_bootstrap = []

In [112]:
rs = RandomState(seed = 45)
bootstrap_auc(optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78235818, 0.78962923])

In [113]:
np.mean(auc_bootstrap)

0.786040365400587

In [114]:
t_45 = auc_bootstrap
print(t_45)

[0.7843101282140691, 0.7869707761949141, 0.7850315568911629, 0.7823695880961892, 0.7854639914368979, 0.7831655106901412, 0.7860602069345912, 0.7845410065237651, 0.783751159674805, 0.7870434209719923, 0.7826189578036868, 0.7863912029552423, 0.7852555669673896, 0.7844643993043008, 0.7880770901091099, 0.7872811675151576, 0.786988210941413, 0.7874938185898779, 0.7857442681950072, 0.7857572121734684, 0.7864633194066692, 0.7847776964156274, 0.7863576542763735, 0.7835321686922672, 0.7820227423059936, 0.7855685999158907, 0.7894507368029535, 0.7876663169150854, 0.7899257015636327, 0.7847930178595202, 0.7829311982637106, 0.7870230804344105, 0.7839315828847849, 0.7880200309387502, 0.7832603451445816, 0.7881967558691697, 0.7849924607929535, 0.7823790979579157, 0.7860913781480283, 0.7877997191420837, 0.7860285073955025, 0.7829203675878552, 0.7839318470476105, 0.7890727197993208, 0.7871955787596182, 0.7842631072310875, 0.7860750000528326, 0.7881587164222632, 0.7861774952292194, 0.7866764988070407, 0

In [115]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [116]:
# 46.
column_to_drop_45 = 'Cat_주택 보유 의식'

In [117]:
if not column_to_drop_45.startswith('Cat_'):
    comp_46 = comp_45.drop(column_to_drop_45, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']
else:
    comp_46 = comp_45.drop(comp_45.filter(regex='^' + column_to_drop_45).columns, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']

print(X_46.shape)

(19949, 86)


In [118]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [119]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [120]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}
0.7801034175167496


In [121]:
optuna_46 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [122]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(auc_46)

0.7861391916194872


In [123]:
X_train = X_train.values
y_train = y_train.values

In [124]:
auc_bootstrap = []

In [125]:
rs = RandomState(seed = 46)
bootstrap_auc(optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78101807, 0.78777485])

In [126]:
np.mean(auc_bootstrap)

0.7844508260899886

In [127]:
t_46 = auc_bootstrap
print(t_46)

[0.7831142631019478, 0.7810759827385443, 0.7842498990898005, 0.7846115379982375, 0.7808966161798674, 0.785677435000095, 0.7820584042874683, 0.7835017899673071, 0.7819556449482558, 0.7857305317280686, 0.7823030190641029, 0.7878647031972155, 0.7837464047439416, 0.7836901380620593, 0.7822832068521723, 0.7843080149114632, 0.7845161752181455, 0.7852428871517542, 0.7841672161253442, 0.7857115120046154, 0.7846564456786131, 0.7825167267901258, 0.782870969139442, 0.7876034461625594, 0.7824393270821842, 0.7853781385185326, 0.7826916025807651, 0.7858958976569813, 0.7812505943663578, 0.7853327025125054, 0.7807180421096677, 0.7821233883426002, 0.7853702136337604, 0.7822620738261132, 0.7853369291177172, 0.7847129765233214, 0.7839749055882062, 0.7882078507078506, 0.7872597703262728, 0.7832413254211283, 0.7836629292910081, 0.7838842977389775, 0.7878507025674513, 0.7853422123742321, 0.7862813112197349, 0.7861368141540555, 0.784324393006659, 0.7844464362321506, 0.7825941264980674, 0.7851646949553353, 0.

In [128]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [129]:
# 47.
column_to_drop_46 = 'Cat_현재 거주 지역'

In [130]:
if not column_to_drop_46.startswith('Cat_'):
    comp_47 = comp_46.drop(column_to_drop_46, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']
else:
    comp_47 = comp_46.drop(comp_46.filter(regex='^' + column_to_drop_46).columns, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']

print(X_47.shape)

(19949, 69)


In [131]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [132]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [133]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7774711292421759


In [134]:
optuna_47 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [135]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(auc_47)

0.7835651890454847


In [136]:
X_train = X_train.values
y_train = y_train.values

In [137]:
auc_bootstrap = []

In [138]:
rs = RandomState(seed = 47)
bootstrap_auc(optuna_47, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77725425, 0.78381062])

In [139]:
np.mean(auc_bootstrap)

0.7806166927395375

In [140]:
t_47 = auc_bootstrap
print(t_47)

[0.7778814616868804, 0.779221823864681, 0.7796215022200244, 0.7820708199402782, 0.7791592172749808, 0.7827903994775915, 0.7829716151760486, 0.7789447170604805, 0.7788549016997294, 0.7828543268814205, 0.7794159835415992, 0.78084748189428, 0.7787909742959004, 0.7783746536825356, 0.7816048367156742, 0.7799527624035014, 0.7798584562747124, 0.7789278106396333, 0.7819329269452422, 0.7808696715716419, 0.7816074783439315, 0.7830783369576474, 0.7818206577443031, 0.7794529663372028, 0.78388165611072, 0.7792329187033621, 0.7808089141217219, 0.780023029715148, 0.7790844591952966, 0.7831496609205969, 0.7784663181830669, 0.7778465921938829, 0.7796119923582978, 0.7804221797448397, 0.7794405506843931, 0.7801083543078616, 0.7819374177132798, 0.7806818518025415, 0.7822657721056736, 0.7814772460708421, 0.7800124632021184, 0.7807777429082847, 0.7821181050860854, 0.7830054280177432, 0.7810514155957506, 0.7818314884201584, 0.780455464260883, 0.7797057701614353, 0.78247181910975, 0.7816972937046829, 0.778947

In [141]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [142]:
# 48
column_to_drop_47 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [143]:
if not column_to_drop_47.startswith('Cat_'):
    comp_48 = comp_47.drop(column_to_drop_47, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']
else:
    comp_48 = comp_47.drop(comp_47.filter(regex='^' + column_to_drop_47).columns, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']

print(X_48.shape)

(19949, 65)


In [144]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [145]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [146]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 61, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 9}
0.7748687674063519


In [147]:
optuna_48 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [148]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(auc_48)

0.7833539908663062


In [149]:
X_train = X_train.values
y_train = y_train.values

In [150]:
auc_bootstrap = []

In [151]:
rs = RandomState(seed = 48)
bootstrap_auc(optuna_48, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77771462, 0.7835482 ])

In [152]:
np.mean(auc_bootstrap)

0.7806143071510989

In [153]:
t_48 = auc_bootstrap
print(t_48)

[0.7826145991170622, 0.7802651349449379, 0.7792304091565175, 0.7798115673731437, 0.7817695422375225, 0.7783370104798677, 0.7812533680760282, 0.7808069329005289, 0.7779917496666264, 0.7797796036712294, 0.7824563655844443, 0.7812821618240339, 0.7831080552755428, 0.781145061317475, 0.7811004177999252, 0.7799137983867048, 0.7805359018413206, 0.7831540196072215, 0.780475672717052, 0.7809392784762244, 0.7771472211127384, 0.7789855302170574, 0.7805139763267843, 0.7795080442863693, 0.7803414780015766, 0.779819492257916, 0.7792652786495151, 0.7800506347304377, 0.7798876462669566, 0.7794385694631999, 0.7831083194383687, 0.7811939314402369, 0.7794285312758219, 0.7810042625313561, 0.7789451133047192, 0.780659001718115, 0.7803245715807292, 0.7813587690434982, 0.7786267970997034, 0.7782049290669979, 0.7802754372951418, 0.7806888521174237, 0.7826904138480494, 0.781200799673706, 0.7791318764225169, 0.7791403296329406, 0.7826301847237808, 0.7813154463400769, 0.7790291170833041, 0.7827786442318462, 0.78

In [154]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [155]:
# 49
column_to_drop_48 = 'Cat_가구주 성별'

In [156]:
if not column_to_drop_48.startswith('Cat_'):
    comp_49 = comp_48.drop(column_to_drop_48, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']
else:
    comp_49 = comp_48.drop(comp_48.filter(regex='^' + column_to_drop_48).columns, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']

print(X_49.shape)

(19949, 63)


In [157]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [158]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [159]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 68, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 6}
0.7755964958410505


In [160]:
optuna_49 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)

In [161]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(auc_49)

0.7823772488181355


In [162]:
X_train = X_train.values
y_train = y_train.values

In [163]:
auc_bootstrap = []

In [164]:
rs = RandomState(seed = 49)
bootstrap_auc(optuna_49, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77731843, 0.78346851])

In [165]:
np.mean(auc_bootstrap)

0.7805188493384305

In [166]:
t_49 = auc_bootstrap
print(t_49)

[0.7808083857960704, 0.78015062035998, 0.7780914711333431, 0.7817078602177122, 0.779929780237662, 0.7821070102474044, 0.7804319537693922, 0.7783389917010606, 0.7801374122186929, 0.78024175653486, 0.7828955362822357, 0.7820235347944707, 0.7797910947541491, 0.780135298916087, 0.7827444351459131, 0.7818478665153541, 0.7824918954845064, 0.780544487133157, 0.7808421986377652, 0.7815802695728804, 0.7801173358439368, 0.7812537643202668, 0.7828374204605733, 0.7806298117258708, 0.7806152827704552, 0.7803059481015147, 0.7791605380891096, 0.7792054457694852, 0.7793945863527145, 0.7812640666704707, 0.7807912152123975, 0.7805930930930931, 0.7803040989617344, 0.7814981149340756, 0.781138061002593, 0.7815554382672609, 0.7812947095582563, 0.7801249965658833, 0.7805265240610068, 0.7793842840025105, 0.7825130285105655, 0.7780087881688866, 0.7817049544266292, 0.7819323986195906, 0.7778310065871643, 0.7827616057295861, 0.7819228887578642, 0.7804816163806312, 0.7779842210260929, 0.7820261764227282, 0.78077

In [167]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [168]:
# 50
column_to_drop_49 = 'Cat_가구주 종사상 지위'

In [169]:
if not column_to_drop_49.startswith('Cat_'):
    comp_50 = comp_49.drop(column_to_drop_49, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']
else:
    comp_50 = comp_49.drop(comp_49.filter(regex='^' + column_to_drop_49).columns, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']

print(X_50.shape)

(19949, 58)


In [170]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [171]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [172]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 153, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 6}
0.7747218182726293


In [173]:
optuna_50 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [174]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(auc_50)

0.7794837413064014


In [175]:
X_train = X_train.values
y_train = y_train.values

In [176]:
auc_bootstrap = []

In [177]:
rs = RandomState(seed = 50)
bootstrap_auc(optuna_50, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77466904, 0.78040554])

In [178]:
np.mean(auc_bootstrap)

0.7775969501345117

In [179]:
t_50 = auc_bootstrap
print(t_50)

[0.778766010908868, 0.7787496328136722, 0.7761957066144258, 0.77746765062036, 0.7782865553801515, 0.7779378604501757, 0.7799230440856059, 0.7764633035568995, 0.7768674726802806, 0.7762131413609246, 0.7762091789185385, 0.7805039381394061, 0.7793416217061536, 0.7786637798953071, 0.7760422280126714, 0.77801446766964, 0.7769345700380184, 0.7768450188400928, 0.7783058392664304, 0.7770267628642014, 0.7793791328274088, 0.7761381191184147, 0.7781587005724937, 0.7769327208982381, 0.7778015524320943, 0.7775107091609554, 0.775323440963835, 0.7752655893049982, 0.7770560849378584, 0.7779579368249319, 0.7774721413883976, 0.7802867962966485, 0.7754697871692946, 0.7777706453814828, 0.7791606701705224, 0.7773482490231259, 0.7773080962736134, 0.7790565900171811, 0.778921866976054, 0.777778570266255, 0.7797846227649182, 0.7780218642287609, 0.7790423252245913, 0.778631287867741, 0.7746128429625966, 0.7775741082391329, 0.7762572565528232, 0.7755876037895743, 0.778247459281942, 0.7771501269038215, 0.7777061

In [180]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [181]:
# 51
column_to_drop_50 = 'Cat_이사 계획 중인 주택의 점유형태'

In [182]:
if not column_to_drop_50.startswith('Cat_'):
    comp_51 = comp_50.drop(column_to_drop_50, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']
else:
    comp_51 = comp_50.drop(comp_50.filter(regex='^' + column_to_drop_50).columns, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']

print(X_51.shape)

(19949, 34)


In [183]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [184]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [185]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 96, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 3}
0.7733893693859547


In [186]:
optuna_51 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [187]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(auc_51)

0.7791886714300509


In [188]:
X_train = X_train.values
y_train = y_train.values

In [189]:
auc_bootstrap = []

In [190]:
rs = RandomState(seed = 51)
bootstrap_auc(optuna_51, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7748763 , 0.77952557])

In [191]:
np.mean(auc_bootstrap)

0.777172985045214

In [192]:
t_51 = auc_bootstrap
print(t_51)

[0.7766133480419195, 0.7774977651824942, 0.7780482805113347, 0.7788772234585042, 0.7767187490093892, 0.7791548585883562, 0.7757450448337149, 0.7772283191002403, 0.7763277880272955, 0.7780300532763588, 0.7768056585790577, 0.7767176923580865, 0.7765121736796613, 0.7760039244029392, 0.7782408552112985, 0.7792644861610378, 0.7780569978845842, 0.7765404391020155, 0.7777223035843724, 0.7763378262146735, 0.775944487767148, 0.7789815677746712, 0.7782862912173256, 0.7760350956163764, 0.7801621114428997, 0.7772523579173826, 0.7766138763675711, 0.777268207686927, 0.7780696777002195, 0.7768532078876906, 0.7765454581957045, 0.7767242964287301, 0.7765414957533185, 0.7758779187350617, 0.7762873711149573, 0.7754061239282914, 0.7786344578216499, 0.7765076829116239, 0.7788008804018656, 0.7775434653513471, 0.7776718484846564, 0.7775482202822104, 0.7783885222308868, 0.7774581407586334, 0.7766453117438339, 0.7794393619516773, 0.7758161046338385, 0.7779080100508673, 0.7779138216330334, 0.7761975557542061, 0

In [193]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [194]:
# 52
column_to_drop_51 = 'Cat_현재 주택의 유형'

In [195]:
if not column_to_drop_51.startswith('Cat_'):
    comp_52 = comp_51.drop(column_to_drop_51, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']
else:
    comp_52 = comp_51.drop(comp_51.filter(regex='^' + column_to_drop_51).columns, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']

print(X_52.shape)

(19949, 23)


In [196]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [197]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [198]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 95, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}
0.7712326019036516


In [199]:
optuna_52 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [200]:
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(auc_52)

0.7736218361218361


In [201]:
X_train = X_train.values
y_train = y_train.values

In [202]:
auc_bootstrap = []

In [203]:
rs = RandomState(seed = 52)
bootstrap_auc(optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76687984, 0.77340631])

In [204]:
np.mean(auc_bootstrap)

0.7702172965100921

In [205]:
t_52 = auc_bootstrap
print(t_52)

[0.7700054470374668, 0.7721961493513217, 0.7744618739076867, 0.7685311543070164, 0.7688724526778714, 0.7659024700280859, 0.768671424767484, 0.7695727483289058, 0.7672322656928569, 0.7698094382207682, 0.7693088496659924, 0.7676105468593153, 0.7705519999239211, 0.7669749711005869, 0.7698239671761838, 0.768392732986329, 0.7712459556671378, 0.768713954982428, 0.7680059986094468, 0.7711445171420542, 0.7711630085398558, 0.7695899189125791, 0.7683208806977281, 0.7677727428343193, 0.7701153387729742, 0.7690674048432669, 0.7658332593677422, 0.7674586532345151, 0.7690946136143182, 0.7689960808803172, 0.7733745797169445, 0.7707897464670863, 0.7663779631144164, 0.7728586697182757, 0.7694549317086263, 0.7711048927181934, 0.7698487984818035, 0.7703113475896727, 0.7691891839059327, 0.7700672611386897, 0.7682981626947143, 0.7687786748747342, 0.767329477612729, 0.7691926180226674, 0.7672507570906586, 0.7727733451255617, 0.7705282252696045, 0.7698118156861999, 0.7702727798171148, 0.7676417180727526, 0.7

In [206]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [207]:
# 53
column_to_drop_52 = 'Cat_가구주 최종 학력'

In [208]:
if not column_to_drop_52.startswith('Cat_'):
    comp_53 = comp_52.drop(column_to_drop_52, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']
else:
    comp_53 = comp_52.drop(comp_52.filter(regex='^' + column_to_drop_52).columns, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']

print(X_53.shape)

(19949, 20)


In [209]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [210]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [211]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 68, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 6}
0.7640052637509922


In [212]:
optuna_53 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [213]:
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(auc_53)

0.7664311919238027


In [214]:
X_train = X_train.values
y_train = y_train.values

In [215]:
auc_bootstrap = []

In [216]:
rs = RandomState(seed = 53)
bootstrap_auc(optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76241319, 0.76782185])

In [217]:
np.mean(auc_bootstrap)

0.7654338109661385

In [218]:
t_53 = auc_bootstrap
print(t_53)

[0.7665410836593103, 0.7647240396624634, 0.7665252338897659, 0.766854380770637, 0.7654297506514254, 0.764104577836105, 0.7655150752441393, 0.7666822786896677, 0.765835372670348, 0.7659860775624322, 0.7650455258213877, 0.7658473920789192, 0.7616178810760091, 0.7649318037249071, 0.7675925837871651, 0.766333715841105, 0.764133899909762, 0.7645142943788265, 0.7659895116791668, 0.7638622084434893, 0.7641792038343764, 0.7641372019450837, 0.7675583747012318, 0.7669229310239162, 0.7648538756913141, 0.7656284010963814, 0.7660444575469206, 0.7649353699230547, 0.7639216450792803, 0.7665194223075997, 0.7659839642598263, 0.7665568013474418, 0.7642330930508271, 0.7656050226863034, 0.7643118135728972, 0.7644897272360327, 0.7661522359798222, 0.7656532324020007, 0.763798545202486, 0.7658859598514771, 0.7680148480641091, 0.766328564666003, 0.7663696419854056, 0.7626475877707405, 0.7674904848550168, 0.76562972191051, 0.7614453827508015, 0.7633801112865151, 0.7673557618138899, 0.7657575767181677, 0.766617

In [219]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [220]:
# 54
column_to_drop_53 = 'Cat_이사 계획 중인 거주 지역'

In [221]:
if not column_to_drop_53.startswith('Cat_'):
    comp_54 = comp_53.drop(column_to_drop_53, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']
else:
    comp_54 = comp_53.drop(comp_53.filter(regex='^' + column_to_drop_53).columns, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']

print(X_54.shape)

(19949, 13)


In [222]:
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [223]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [224]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 6}
0.7589076560911449


In [225]:
optuna_54 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [226]:
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(auc_54)

0.7598144414646878


In [227]:
X_train = X_train.values
y_train = y_train.values

In [228]:
auc_bootstrap = []

In [229]:
rs = RandomState(seed = 54)
bootstrap_auc(optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75359442, 0.76028423])

In [230]:
np.mean(auc_bootstrap)

0.7580074178242282

In [231]:
t_54 = auc_bootstrap
print(t_54)

[0.7557830525810821, 0.7580421730668037, 0.7568349489531756, 0.7544104625385414, 0.7579946237581706, 0.7599047851510906, 0.7556575752388561, 0.758562309670684, 0.7594203105286849, 0.7582165205317916, 0.759736249268269, 0.7592221884093805, 0.758526911852035, 0.7588822108526542, 0.7599396546440882, 0.7584759284266674, 0.7587776023736615, 0.7598086298825215, 0.7597856477166822, 0.7597370417567463, 0.7560826132254705, 0.7578102381058047, 0.7591762240777019, 0.7599253898514983, 0.7582017274135501, 0.7571284338525719, 0.7582498050478346, 0.7582482200708802, 0.7587142032954841, 0.7543116656417149, 0.7586037832343252, 0.75753339546443, 0.7599127100358627, 0.7573101778766804, 0.7582693530969393, 0.7549482980517463, 0.7559143415054747, 0.7576570236668759, 0.7598191963955511, 0.7577346875376433, 0.7545898290972183, 0.7584888724051286, 0.7569189527317607, 0.7595339005437528, 0.7586621632188134, 0.755598931091542, 0.7579518293804007, 0.7594443493458272, 0.758435775677155, 0.7573075362484228, 0.7599

In [232]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [233]:
# 55
column_to_drop_54 = 'Cat_현재 주택의 점유형태'

In [234]:
if not column_to_drop_54.startswith('Cat_'):
    comp_55 = comp_54.drop(column_to_drop_54, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']
else:
    comp_55 = comp_54.drop(comp_54.filter(regex='^' + column_to_drop_54).columns, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']

print(X_55.shape)

(19949, 9)


In [235]:
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [236]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'min_samples_split' :trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf' :trial.suggest_int('min_samples_leaf', 3, 10),
        }
        
    clf = ExtraTreesClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [237]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 67, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7352387324894216


In [238]:
optuna_55 = ExtraTreesClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [239]:
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(auc_55)

0.7423720342439556


In [240]:
X_train = X_train.values
y_train = y_train.values

In [241]:
auc_bootstrap = []

In [242]:
rs = RandomState(seed = 55)
bootstrap_auc(optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74119704, 0.74239105])

In [243]:
np.mean(auc_bootstrap)

0.7419566807306957

In [244]:
t_55 = auc_bootstrap
print(t_55)

[0.7423720342439556, 0.7423720342439556, 0.74155524278677, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7414199914199915, 0.7423720342439556, 0.74155524278677, 0.7422367828771771, 0.7414199914199915, 0.7422367828771771, 0.7422367828771771, 0.7423720342439556, 0.7422367828771771, 0.74155524278677, 0.7423720342439556, 0.7422558026006303, 0.7422367828771771, 0.7414199914199915, 0.7422367828771771, 0.7422367828771771, 0.7414199914199915, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7409777828497042, 0.74155524278677, 0.7422066683150428, 0.74155524278677, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7423910539674088, 0.7423720342439556, 0.74155524278677, 0.74155524278677, 0.74155524278677, 0.7423820724313337, 0.7415742625102232, 0.74155524278677, 0.7423720342439556, 0.74222172559611, 0.7415742625102232, 0.7423720342439556, 0.74155524278677, 0.7422367828771771, 0

In [245]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc