In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [3]:
from sklearn.utils import resample
from numpy.random import RandomState

In [4]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000):
    """Bootstrap the test ROC AUC of ``clf`` and return a percentile interval.

    For each of ``nsamples`` rounds, training rows are drawn with replacement
    (as many as there are training rows), ``clf`` is refitted in place on that
    resample, and the ROC AUC of its positive-class probabilities on the test
    data is appended to the module-level list ``auc_bootstrap``.

    Depends on two module-level names that must be set before the call:
    ``rs`` (provides ``randint`` for drawing row indices) and
    ``auc_bootstrap`` (list receiving the scores).

    Args:
        clf: classifier exposing ``fit`` and ``predict_proba``.
        X_train, y_train: training data; indexed positionally with an index
            array, so they must support ``X_train[idx]``.
        X_test, y_test: evaluation data; ``y_test`` must provide ``ravel()``.
        nsamples: number of bootstrap rounds.

    Returns:
        The 2.5th and 97.5th percentiles of everything currently stored in
        ``auc_bootstrap`` (including values left there by earlier calls).
    """
    n_rows = X_train.shape[0]
    for _ in range(nsamples):
        sample_idx = rs.randint(n_rows, size=n_rows)
        clf.fit(X_train[sample_idx], y_train[sample_idx])
        test_scores = clf.predict_proba(X_test)[:, 1]
        auc_bootstrap.append(roc_auc_score(y_test.ravel(), test_scores.ravel()))
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [5]:
class EarlyStoppingCallback(object):
    """Study callback that stops a study once its best value stops improving.

    After every trial the callback compares ``study.best_value`` with the best
    value it has seen so far. An improvement resets the patience counter;
    otherwise the counter grows by one. When the counter reaches
    ``early_stopping_rounds`` the callback calls ``study.stop()``.

    Args:
        early_stopping_rounds: number of consecutive non-improving trials
            tolerated before the study is stopped.
        direction: ``"minimize"`` or ``"maximize"``; selects the comparison
            used to decide whether the best value improved.

    Raises:
        ValueError: if ``direction`` is neither ``"minimize"`` nor
            ``"maximize"``.
    """

    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        # Consecutive trials seen without an improvement of the best value.
        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            # Previously the exception was constructed but never raised, which
            # left the instance half-initialised and deferred the failure to
            # the first callback invocation.
            raise ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        """Update the patience counter and stop ``study`` when it runs out."""
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [6]:
# Limit Optuna's log output to messages at WARNING level.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [7]:
# Load the survey table (cp949-encoded CSV) and label the survey question
# column as 'target'.
# NOTE(review): a later cell assigns the complete column list, which also
# names the last column 'target' -- confirm this rename is still needed.
신혼가구 = pd.read_csv('신혼가구_변수추가.csv', encoding='cp949')
신혼가구 = 신혼가구.rename(
    columns={'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?': 'target'}
)

In [8]:
# Replace every column label positionally. Names prefixed with 'Cat_' mark
# categorical features (later cells branch on this prefix); the last column is
# the label 'target'. The list length must equal the number of columns.
신혼가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'Cat_주택 마련 예상 소요연수','Cat_남편/아내의 부모님과 동거 의향','Cat_가족계획 시 중요 고려 사항 1순위',
    'target'    
]

In [9]:
# Separate object-typed (categorical) columns from the remaining columns and
# pull the label out of the non-object part.
# NOTE(review): assumes 'target' is not object-typed, otherwise the drop fails.
cat = 신혼가구.select_dtypes(include=['object'])
num = 신혼가구.select_dtypes(exclude=['object'])
target = 신혼가구['target']
num_신혼 = num.drop(columns=['target'])

In [10]:
# Scale the numeric features with RobustScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling statistics -- confirm this is acceptable.
scaler = RobustScaler()
num_scaled_신혼 = scaler.fit_transform(num_신혼)
# Carry over the source row index so the later pd.concat with `target` pairs
# rows correctly even when the frame does not have a default 0..n-1 index.
num_df_scaled_신혼 = pd.DataFrame(
    data=num_scaled_신혼,
    columns=num_신혼.columns,
    index=num_신혼.index,
)

In [11]:
# One-hot encode the categorical columns into a dense frame whose column names
# come from the encoder (derived from the original column names).
enc = OneHotEncoder()
X_cat = enc.fit_transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
# Carry over the source row index so the later pd.concat with `target` pairs
# rows correctly even when the frame does not have a default 0..n-1 index.
cat2 = pd.DataFrame(X_cat, columns=new_feature_names, index=cat.index)

In [12]:
# Modelling table: scaled numeric features, the label and the one-hot columns
# placed side by side (rows are matched on the index).
comp =pd.concat([num_df_scaled_신혼, target,cat2],axis=1)

In [13]:
# Feature matrix (every column except 'target') and label vector; the last
# expression displays the feature matrix shape.
X=comp.drop('target', axis = 1)
y=comp.target
X.shape

(6119, 221)

In [13]:
# Stratified 80/20 train/test split, then a second stratified 80/20 split of
# the training part into train/validation (fixed seed for both).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [14]:
def objective(trial):
    """Score one trial: fit a gradient-boosting model, return validation ROC AUC.

    Hyperparameters are requested from ``trial``; the model is fitted on the
    module-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.
    """
    # Keyword order fixes the order in which the trial is asked for values.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials; the
# callback stops the study after 10 consecutive trials without a new best value.
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on -- confirm whether pruning was intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [15]:
# Hyperparameters of the best trial found by the study.
print(study.best_trial.params)

{'learning_rate': 0.04, 'n_estimators': 174, 'subsample': 0.8, 'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 6}


In [16]:
# Objective value (validation ROC AUC) of the best trial.
optuna_auc = study.best_trial.value
print(optuna_auc)

0.7986585864656632


In [17]:
# Rebuild the classifier from the best trial's hyperparameters, using the same
# fixed seed as inside the objective.
optuna_0 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)

In [18]:
# Fit the tuned model on the training split.
optuna_0.fit(X_train, y_train)

In [19]:
# ROC AUC of the positive-class probabilities on the held-out test split.
optuna_0_proba = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_0_proba)
print(auc_0)

0.7752584586466167


In [20]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), which
# needs NumPy arrays. np.asarray gives the same arrays as `.values` for pandas
# inputs and passes an existing ndarray through, so re-running this cell no
# longer fails with AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [21]:
# Fresh module-level accumulator; bootstrap_auc appends one AUC per resample.
auc_bootstrap = []

In [22]:
# Seeded generator that bootstrap_auc reads as the module-level `rs`. The call
# refits optuna_0 on every resample and displays the (2.5, 97.5) percentiles.
rs = RandomState(seed = 2024)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74840539, 0.77393873])

In [23]:
# Keep a handle on this round's bootstrap AUCs before `auc_bootstrap` is
# rebound to a new list later on.
# NOTE(review): this prints every stored value; a summary may be preferable.
t_0 = auc_bootstrap
print(t_0)

[0.7639268626110732, 0.7570862525632263, 0.7553507347915243, 0.7738139738550923, 0.7631044941900207, 0.7666636406356802, 0.7559781912166781, 0.772604451469583, 0.7533135039302803, 0.7590006621667806, 0.7596334586466165, 0.760049982911825, 0.757414665926179, 0.7634302375256322, 0.7669760338345865, 0.7622687756322626, 0.7583571855775804, 0.7550196514012304, 0.7710745258031443, 0.7556818181818182, 0.7661830357142858, 0.773074376281613, 0.7590246924128503, 0.7621913448393711, 0.7591742139439507, 0.7507903280929596, 0.7557939593301436, 0.7505526956596036, 0.7714590097402598, 0.7770153366370472, 0.7644314977785374, 0.7592062542720437, 0.7704390592959671, 0.7641751751537936, 0.7532467532467533, 0.7561277127477786, 0.7637853511619959, 0.7603703861927547, 0.7629336124401914, 0.7556524478810662, 0.7709864149008887, 0.7541972829801777, 0.7537513884142173, 0.7601674641148326, 0.7600553229665072, 0.7516233766233766, 0.7667143711551606, 0.7536899777853725, 0.761951042378674, 0.7644902383800412, 0.75

In [24]:
# Unbind this round's splits, study and best score before the next round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [14]:
# 1. Feature removed from the full table in the first elimination round.
column_to_drop = 'Cat_기초생활보장 수급가구 여부'

In [15]:
# Remove the selected feature from the previous table. A 'Cat_' feature is
# spread over its one-hot columns ("<name>_<category>"), so all of them are
# dropped; any other feature is a single column.
if column_to_drop.startswith('Cat_'):
    # Literal prefix match on "<name>_" rather than an unescaped regex, so
    # regex metacharacters in a name, or another feature whose name merely
    # begins with the same text, cannot select the wrong columns.
    cols_to_drop = [c for c in comp.columns if c.startswith(column_to_drop + '_')]
else:
    cols_to_drop = [column_to_drop]

comp_1 = comp.drop(columns=cols_to_drop)
X_1 = comp_1.drop('target', axis=1)
y_1 = comp_1['target']

print(X_1.shape)

(6119, 219)


In [27]:
# Stratified 80/20 train/test split, then a second stratified 80/20 split of
# the training part into train/validation (fixed seed for both).
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [28]:
def objective(trial):
    """Score one trial: fit a gradient-boosting model, return validation ROC AUC.

    Hyperparameters are requested from ``trial``; the model is fitted on the
    module-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.
    """
    # Keyword order fixes the order in which the trial is asked for values.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials; the
# callback stops the study after 10 consecutive trials without a new best value.
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on -- confirm whether pruning was intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [29]:
# Best trial of this round: its hyperparameters and its validation ROC AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.04, 'n_estimators': 104, 'subsample': 0.6, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 8}
0.7982036577934708


In [30]:
# Rebuild the classifier from the best trial's hyperparameters (fixed seed)
# and fit it on the training split.
optuna_1 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [31]:
# ROC AUC of the positive-class probabilities on the held-out test split.
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(auc_1)

0.7771061175666439


In [32]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), which
# needs NumPy arrays. np.asarray gives the same arrays as `.values` for pandas
# inputs and passes an existing ndarray through, so re-running this cell no
# longer fails with AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [33]:
# Fresh module-level accumulator; bootstrap_auc appends one AUC per resample.
auc_bootstrap = []

In [34]:
# Seeded generator that bootstrap_auc reads as the module-level `rs`. The call
# refits optuna_1 on every resample and displays the (2.5, 97.5) percentiles.
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75469377, 0.77699184])

In [35]:
# Keep a handle on this round's bootstrap AUCs before `auc_bootstrap` is
# rebound to a new list later on.
# NOTE(review): this prints every stored value; a summary may be preferable.
t_1 = auc_bootstrap
print(t_1)

[0.7605839883800409, 0.7631231843814081, 0.7659774436090225, 0.7606640892002734, 0.7647652511961723, 0.7614010167464115, 0.7721532168489406, 0.7664527084757349, 0.7687462619617225, 0.7641351247436774, 0.7741824376281613, 0.762789430963773, 0.7599191515721122, 0.7698917036910458, 0.7759366455912509, 0.7647091806220094, 0.7694591592617908, 0.7728207236842105, 0.7505713858509911, 0.7667864618933699, 0.7662818267259056, 0.7660014738550922, 0.7599031314080655, 0.7664847488038277, 0.7719129143882435, 0.764188525290499, 0.7676702409432672, 0.7706286312371838, 0.7589552717019822, 0.7602555750170882, 0.7637346206425153, 0.7638547718728639, 0.772099816302119, 0.7668291823308271, 0.7669386534518112, 0.7695232399179768, 0.7665034389952153, 0.7668452024948735, 0.7638707920369104, 0.7668425324675325, 0.7645409688995215, 0.7713068181818181, 0.7752637987012987, 0.77286878417635, 0.7645009184894054, 0.7627787508544087, 0.7630804639439508, 0.7620444933356119, 0.7610459031100478, 0.7564107356459331, 0.76

In [36]:
# Unbind this round's splits, study and best score before the next round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [16]:
#### 2. Feature removed in the second round (on top of round 1).
column_to_drop_1 = 'Cat_가구주 장애 여부'

In [17]:
# Remove the selected feature from the previous table. A 'Cat_' feature is
# spread over its one-hot columns ("<name>_<category>"), so all of them are
# dropped; any other feature is a single column.
if column_to_drop_1.startswith('Cat_'):
    # Literal prefix match on "<name>_" rather than an unescaped regex, so
    # regex metacharacters in a name, or another feature whose name merely
    # begins with the same text, cannot select the wrong columns.
    cols_to_drop = [c for c in comp_1.columns if c.startswith(column_to_drop_1 + '_')]
else:
    cols_to_drop = [column_to_drop_1]

comp_2 = comp_1.drop(columns=cols_to_drop)
X_2 = comp_2.drop('target', axis=1)
y_2 = comp_2['target']

print(X_2.shape)

(6119, 217)


In [39]:
# Stratified 80/20 train/test split, then a second stratified 80/20 split of
# the training part into train/validation (fixed seed for both).
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [40]:
def objective(trial):
    """Score one trial: fit a gradient-boosting model, return validation ROC AUC.

    Hyperparameters are requested from ``trial``; the model is fitted on the
    module-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.
    """
    # Keyword order fixes the order in which the trial is asked for values.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials; the
# callback stops the study after 10 consecutive trials without a new best value.
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on -- confirm whether pruning was intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [41]:
# Best trial of this round: its hyperparameters and its validation ROC AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7963004699538394


In [42]:
# Rebuild the classifier from the best trial's hyperparameters (fixed seed)
# and fit it on the training split.
optuna_2 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [43]:
# ROC AUC of the positive-class probabilities on the held-out test split.
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(auc_2)

0.7769752862269309


In [44]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), which
# needs NumPy arrays. np.asarray gives the same arrays as `.values` for pandas
# inputs and passes an existing ndarray through, so re-running this cell no
# longer fails with AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [45]:
# Fresh module-level accumulator; bootstrap_auc appends one AUC per resample.
auc_bootstrap = []

In [46]:
# Seeded generator that bootstrap_auc reads as the module-level `rs`. The call
# refits optuna_2 on every resample and displays the (2.5, 97.5) percentiles.
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75228454, 0.77511194])

In [47]:
# Keep a handle on this round's bootstrap AUCs before `auc_bootstrap` is
# rebound to a new list later on.
# NOTE(review): this prints every stored value; a summary may be preferable.
t_2 = auc_bootstrap
print(t_2)

[0.7639589029391661, 0.7542319933356118, 0.7651123547505128, 0.7708662636705399, 0.7605839883800409, 0.7585787978468901, 0.7610939636021874, 0.7712801179084073, 0.751938439849624, 0.7616092788790156, 0.764890742481203, 0.7592356245727957, 0.7715871710526316, 0.7649441430280247, 0.7705325102529049, 0.7615318480861244, 0.7651977956254272, 0.7707461124401915, 0.76382006151743, 0.7566964285714286, 0.761251495215311, 0.7648907424812029, 0.7585494275461382, 0.7580821727614491, 0.775739063568011, 0.7640710440874915, 0.7710237952836637, 0.7585520975734792, 0.7583545155502394, 0.7710558356117567, 0.7655235389610389, 0.7738647043745728, 0.7609631322624744, 0.7671415755297335, 0.7592997052289816, 0.7607121496924129, 0.76886374316473, 0.7615772385509227, 0.7663912978468901, 0.7634996582365003, 0.7654808185235816, 0.7683591079972658, 0.7583144651401231, 0.7570915926179084, 0.7630724538619276, 0.7501735517771703, 0.7524404049897472, 0.7502563226247436, 0.7726525119617225, 0.7649521531100478, 0.76294

In [48]:
# Unbind this round's splits, study and best score before the next round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [18]:
#### 3. Feature removed in the third round (on top of rounds 1-2).
column_to_drop_2 = '부채 중 임대 보증금의 비중'

In [19]:
# Remove the selected feature from the previous table. A 'Cat_' feature is
# spread over its one-hot columns ("<name>_<category>"), so all of them are
# dropped; any other feature is a single column.
if column_to_drop_2.startswith('Cat_'):
    # Literal prefix match on "<name>_" rather than an unescaped regex, so
    # regex metacharacters in a name, or another feature whose name merely
    # begins with the same text, cannot select the wrong columns.
    cols_to_drop = [c for c in comp_2.columns if c.startswith(column_to_drop_2 + '_')]
else:
    cols_to_drop = [column_to_drop_2]

comp_3 = comp_2.drop(columns=cols_to_drop)
X_3 = comp_3.drop('target', axis=1)
y_3 = comp_3['target']

print(X_3.shape)

(6119, 216)


In [51]:
# Stratified 80/20 train/test split, then a second stratified 80/20 split of
# the training part into train/validation (fixed seed for both).
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [52]:
def objective(trial):
    """Score one trial: fit a gradient-boosting model, return validation ROC AUC.

    Hyperparameters are requested from ``trial``; the model is fitted on the
    module-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.
    """
    # Keyword order fixes the order in which the trial is asked for values.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials; the
# callback stops the study after 10 consecutive trials without a new best value.
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on -- confirm whether pruning was intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [53]:
# Best trial of this round: its hyperparameters and its validation ROC AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 144, 'subsample': 0.8, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 9}
0.8009833137171428


In [54]:
# Rebuild the classifier from the best trial's hyperparameters (fixed seed)
# and fit it on the training split.
optuna_3 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [55]:
# ROC AUC of the positive-class probabilities on the held-out test split.
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(auc_3)

0.7733146787423104


In [56]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), which
# needs NumPy arrays. np.asarray gives the same arrays as `.values` for pandas
# inputs and passes an existing ndarray through, so re-running this cell no
# longer fails with AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [57]:
# Fresh module-level accumulator; bootstrap_auc appends one AUC per resample.
auc_bootstrap = []

In [58]:
# Seeded generator that bootstrap_auc reads as the module-level `rs`. The call
# refits optuna_3 on every resample and displays the (2.5, 97.5) percentiles.
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75122254, 0.77525305])

In [59]:
# Keep a handle on this round's bootstrap AUCs before `auc_bootstrap` is
# rebound to a new list later on.
# NOTE(review): this prints every stored value; a summary may be preferable.
t_3 = auc_bootstrap
print(t_3)

[0.7594492267600821, 0.7659988038277512, 0.7705565404989747, 0.7700118549213945, 0.7587550196514012, 0.7696193609022556, 0.7667437414559125, 0.7644234876965139, 0.768001324333561, 0.7642526059466848, 0.7741183569719754, 0.7570381920710869, 0.7610592532467533, 0.765993463773069, 0.765592959671907, 0.7596361286739576, 0.7670427845181134, 0.7737552332535885, 0.7658679724880384, 0.7618041908749145, 0.7679719540328094, 0.7575027768284348, 0.7576309381408066, 0.7699504442925496, 0.7644662081339714, 0.7718194634313056, 0.7632406655844156, 0.7516927973342447, 0.7452073009227616, 0.755972851161996, 0.7645970394736842, 0.7686020804853041, 0.7671736158578264, 0.7593077153110048, 0.7621219241285031, 0.762554468557758, 0.7644047975051266, 0.7446919856459331, 0.784990708304853, 0.7568245898838004, 0.7689037935748462, 0.7610779434381408, 0.7676382006151743, 0.7604531570403281, 0.7750582065960356, 0.7650402640123034, 0.76438877734108, 0.7682603169856459, 0.7659347231715652, 0.7610378930280247, 0.76525

In [60]:
# Unbind this round's splits, study and best score before the next round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [20]:
### 4. Feature removed in the fourth round (on top of rounds 1-3).
column_to_drop_3 = 'Cat_가구주 동거 여부'

In [21]:
# Remove the selected feature from the previous table. A 'Cat_' feature is
# spread over its one-hot columns ("<name>_<category>"), so all of them are
# dropped; any other feature is a single column.
if column_to_drop_3.startswith('Cat_'):
    # Literal prefix match on "<name>_" rather than an unescaped regex, so
    # regex metacharacters in a name, or another feature whose name merely
    # begins with the same text, cannot select the wrong columns.
    cols_to_drop = [c for c in comp_3.columns if c.startswith(column_to_drop_3 + '_')]
else:
    cols_to_drop = [column_to_drop_3]

comp_4 = comp_3.drop(columns=cols_to_drop)
X_4 = comp_4.drop('target', axis=1)
y_4 = comp_4['target']

print(X_4.shape)

(6119, 214)


In [63]:
# Stratified 80/20 train/test split, then a second stratified 80/20 split of
# the training part into train/validation (fixed seed for both).
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [64]:
def objective(trial):
    """Score one trial: fit a gradient-boosting model, return validation ROC AUC.

    Hyperparameters are requested from ``trial``; the model is fitted on the
    module-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.
    """
    # Keyword order fixes the order in which the trial is asked for values.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials; the
# callback stops the study after 10 consecutive trials without a new best value.
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on -- confirm whether pruning was intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])


In [65]:
# Best trial of this round: its hyperparameters and its validation ROC AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7977487291212781


In [66]:
# Rebuild the classifier from the best trial's hyperparameters (fixed seed)
# and fit it on the training split.
optuna_4 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [67]:
# ROC AUC of the positive-class probabilities on the held-out test split.
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(auc_4)

0.7743800196514011


In [68]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), which
# needs NumPy arrays. np.asarray gives the same arrays as `.values` for pandas
# inputs and passes an existing ndarray through, so re-running this cell no
# longer fails with AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [69]:
# Fresh module-level accumulator; bootstrap_auc appends one AUC per resample.
auc_bootstrap = []

In [70]:
# Seeded generator that bootstrap_auc reads as the module-level `rs`. The call
# refits optuna_4 on every resample and displays the (2.5, 97.5) percentiles.
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75172744, 0.77506635])

In [71]:
# Keep a handle on this round's bootstrap AUCs before `auc_bootstrap` is
# rebound to a new list later on.
# NOTE(review): this prints every stored value; a summary may be preferable.
t_4 = auc_bootstrap
print(t_4)

[0.77353629101162, 0.7573051948051949, 0.7599859022556391, 0.7608990516062885, 0.7598016703691045, 0.7650856544771019, 0.7580661525974026, 0.7621085739917977, 0.7653366370471633, 0.7642606160287082, 0.7676889311346549, 0.7690399649692413, 0.7710905459671907, 0.7591288234791524, 0.766191045796309, 0.7673925580997949, 0.7570835825358851, 0.7519731502050581, 0.7567524991455913, 0.7623355263157894, 0.7631018241626795, 0.7683964883800409, 0.765192455570745, 0.7758725649350648, 0.7609631322624744, 0.7613209159261791, 0.7652965866370472, 0.7590060022214627, 0.7629069121667806, 0.7623515464798358, 0.7602555750170881, 0.7705965909090909, 0.7604558270676692, 0.7626292293233083, 0.7621459543745728, 0.7643807672590568, 0.7622073650034177, 0.7604745172590567, 0.7666129101161996, 0.7721852571770335, 0.7734962406015038, 0.7502216122693097, 0.7699451042378673, 0.7572598043403965, 0.7621993549213943, 0.7587603597060832, 0.7625678186944634, 0.7623595565618593, 0.7651016746411483, 0.7729862653793576, 0.7

In [22]:
## 5. Feature removed in the fifth round: the share of social-insurance
## benefits in income (monthly average), as assigned below.
column_to_drop_4 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [23]:
# Remove the selected feature from the previous table. A 'Cat_' feature is
# spread over its one-hot columns ("<name>_<category>"), so all of them are
# dropped; any other feature is a single column.
if column_to_drop_4.startswith('Cat_'):
    # Literal prefix match on "<name>_" rather than an unescaped regex, so
    # regex metacharacters in a name, or another feature whose name merely
    # begins with the same text, cannot select the wrong columns.
    cols_to_drop = [c for c in comp_4.columns if c.startswith(column_to_drop_4 + '_')]
else:
    cols_to_drop = [column_to_drop_4]

comp_5 = comp_4.drop(columns=cols_to_drop)
X_5 = comp_5.drop('target', axis=1)
y_5 = comp_5['target']

print(X_5.shape)

(6119, 213)


In [74]:
# Stratified 80/20 train/test split, then a second stratified 80/20 split of
# the training part into train/validation (fixed seed for both).
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [75]:
def objective(trial):
    """Score one trial: fit a gradient-boosting model, return validation ROC AUC.

    Hyperparameters are requested from ``trial``; the model is fitted on the
    module-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.
    """
    # Keyword order fixes the order in which the trial is asked for values.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials; the
# callback stops the study after 10 consecutive trials without a new best value.
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on -- confirm whether pruning was intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [76]:
# Best trial of this round: its hyperparameters and its validation ROC AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7981452265878681


In [77]:
# Rebuild the classifier from the best trial's hyperparameters (fixed seed)
# and fit it on the training split.
optuna_5 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [78]:
# ROC AUC of the positive-class probabilities on the held-out test split.
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(auc_5)

0.7740142259056733


In [79]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), which
# needs NumPy arrays. np.asarray gives the same arrays as `.values` for pandas
# inputs and passes an existing ndarray through, so re-running this cell no
# longer fails with AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [80]:
# Fresh module-level accumulator; bootstrap_auc appends one AUC per resample.
auc_bootstrap = []

In [81]:
# Seeded generator that bootstrap_auc reads as the module-level `rs`. The call
# refits optuna_5 on every resample and displays the (2.5, 97.5) percentiles.
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75135163, 0.77484994])

In [82]:
# Keep a handle on this round's bootstrap AUCs before `auc_bootstrap` is
# rebound to a new list later on.
# NOTE(review): this prints every stored value; a summary may be preferable.
t_5 = auc_bootstrap
print(t_5)

[0.7618068609022556, 0.7640123034859877, 0.7608323009227614, 0.7636064593301435, 0.7684765892002734, 0.7634782980177717, 0.7676942711893371, 0.762052503417635, 0.7691494360902256, 0.7696460611756665, 0.7653740174299385, 0.7707995129870129, 0.7618575914217361, 0.7650749743677376, 0.7709143241626795, 0.7525151657552973, 0.7568833304853041, 0.7569207108680793, 0.7624877178742311, 0.7697582023239918, 0.761852251367054, 0.7584613166438824, 0.7605279178058784, 0.777178208304853, 0.7630057031784006, 0.7574253460355433, 0.769098705570745, 0.7549208603896105, 0.7612621753246753, 0.7597242395762133, 0.7639428827751196, 0.7591555237525632, 0.7569767814422421, 0.7722493378332194, 0.7567231288448393, 0.7697742224880383, 0.7607468600478469, 0.7677316515721122, 0.7632460056390976, 0.7640149735133288, 0.76191633202324, 0.7736511021872865, 0.7682336167122352, 0.755275974025974, 0.7676622308612441, 0.7602235346889952, 0.7731945275119617, 0.7679746240601504, 0.768732911825017, 0.7620097829801777, 0.75821

In [83]:
# Unbind this round's splits, study and best score before the next round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [24]:
## 6. Feature removed in the sixth round (on top of rounds 1-5).
column_to_drop_5 = 'Cat_가구주 주민등록상 등재 여부'

In [25]:
# Remove the selected feature from the previous table. A 'Cat_' feature is
# spread over its one-hot columns ("<name>_<category>"), so all of them are
# dropped; any other feature is a single column.
if column_to_drop_5.startswith('Cat_'):
    # Literal prefix match on "<name>_" rather than an unescaped regex, so
    # regex metacharacters in a name, or another feature whose name merely
    # begins with the same text, cannot select the wrong columns.
    cols_to_drop = [c for c in comp_5.columns if c.startswith(column_to_drop_5 + '_')]
else:
    cols_to_drop = [column_to_drop_5]

comp_6 = comp_5.drop(columns=cols_to_drop)
X_6 = comp_6.drop('target', axis=1)
y_6 = comp_6['target']

print(X_6.shape)

(6119, 211)


In [86]:
# Stratified 80/20 train/test split, then a second stratified 80/20 split of
# the training part into train/validation (fixed seed for both).
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [87]:
def objective(trial):
    """Score one trial: fit a gradient-boosting model, return validation ROC AUC.

    Hyperparameters are requested from ``trial``; the model is fitted on the
    module-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.
    """
    # Keyword order fixes the order in which the trial is asked for values.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials; the
# callback stops the study after 10 consecutive trials without a new best value.
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on -- confirm whether pruning was intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [88]:
# Best trial of this round: its hyperparameters and its validation ROC AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 106, 'subsample': 0.6, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7981827895057555


In [89]:
# Rebuild the classifier from the best trial's hyperparameters (fixed seed)
# and fit it on the training split.
optuna_6 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [90]:
# ROC AUC of the positive-class probabilities on the held-out test split.
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(auc_6)

0.7788629955570745


In [91]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), which
# needs NumPy arrays. np.asarray gives the same arrays as `.values` for pandas
# inputs and passes an existing ndarray through, so re-running this cell no
# longer fails with AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [92]:
# Fresh module-level accumulator; bootstrap_auc appends one AUC per resample.
auc_bootstrap = []

In [93]:
# Seeded generator that bootstrap_auc reads as the module-level `rs`. The call
# refits optuna_6 on every resample and displays the (2.5, 97.5) percentiles.
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75665331, 0.77815097])

In [94]:
# Keep a handle on this round's bootstrap AUCs before `auc_bootstrap` is
# rebound to a new list later on.
# NOTE(review): this prints every stored value; a summary may be preferable.
t_6 = auc_bootstrap
print(t_6)

[0.7625704887218046, 0.7637186004784688, 0.7622020249487356, 0.7695739704374573, 0.7596681690020505, 0.7709089841079972, 0.7651016746411483, 0.7674779989747096, 0.7720918062200958, 0.7678785030758715, 0.7626772898154477, 0.7757283834586467, 0.7726445018796992, 0.7644662081339713, 0.7741156869446344, 0.7698943737183869, 0.7669546736158578, 0.7607628802118934, 0.7643006664388244, 0.7663779477101846, 0.7657077708475735, 0.7713415285372524, 0.7606347188995215, 0.7575294771018455, 0.7618682715311004, 0.7700145249487353, 0.769464499316473, 0.7698890336637048, 0.7685673701298701, 0.7626452494873547, 0.7558980903964456, 0.7809376068010936, 0.7661189550580998, 0.7670774948735475, 0.7648693822624743, 0.7582210141831853, 0.7790899478810663, 0.7674486286739577, 0.7740996667805877, 0.7673311474709501, 0.7724816302118933, 0.7593611158578264, 0.7631765849282297, 0.7582557245386193, 0.7537994489063569, 0.7702334671907041, 0.7764973513328777, 0.7660174940191388, 0.7748739747095011, 0.7682790071770336, 

In [95]:
# Unbind this round's splits, study and best score before the next round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [26]:
## 7. Feature removed in the seventh round (on top of rounds 1-6).
column_to_drop_6 = '자산 중 부동산 자산의 비중'

In [27]:
# Remove the selected feature from the previous table. A 'Cat_' feature is
# spread over its one-hot columns ("<name>_<category>"), so all of them are
# dropped; any other feature is a single column.
if column_to_drop_6.startswith('Cat_'):
    # Literal prefix match on "<name>_" rather than an unescaped regex, so
    # regex metacharacters in a name, or another feature whose name merely
    # begins with the same text, cannot select the wrong columns.
    cols_to_drop = [c for c in comp_6.columns if c.startswith(column_to_drop_6 + '_')]
else:
    cols_to_drop = [column_to_drop_6]

comp_7 = comp_6.drop(columns=cols_to_drop)
X_7 = comp_7.drop('target', axis=1)
y_7 = comp_7['target']

print(X_7.shape)

(6119, 210)


In [98]:
# Stratified 80/20 train/test split, then a second stratified 80/20 split of
# the training part into train/validation (fixed seed for both).
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [99]:
def objective(trial):
    """Score one trial: fit a gradient-boosting model, return validation ROC AUC.

    Hyperparameters are requested from ``trial``; the model is fitted on the
    module-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.
    """
    # Keyword order fixes the order in which the trial is asked for values.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)


# Maximise validation AUC with a seeded TPE sampler for at most 200 trials; the
# callback stops the study after 10 consecutive trials without a new best value.
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# presumably has nothing to act on -- confirm whether pruning was intended.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [100]:
# Best trial of this round: its hyperparameters and its validation ROC AUC.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 144, 'subsample': 0.8, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 9}
0.8025025250628135


In [101]:
# Rebuild the classifier from the best trial's hyperparameters (fixed seed)
# and fit it on the training split.
optuna_7 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [102]:
# ROC AUC of the positive-class probabilities on the held-out test split.
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(auc_7)

0.7751596676349968


In [103]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), which
# needs NumPy arrays. np.asarray gives the same arrays as `.values` for pandas
# inputs and passes an existing ndarray through, so re-running this cell no
# longer fails with AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [104]:
# Fresh module-level accumulator; bootstrap_auc appends one AUC per resample.
auc_bootstrap = []

In [105]:
# Seeded generator that bootstrap_auc reads as the module-level `rs`. The call
# refits optuna_7 on every resample and displays the (2.5, 97.5) percentiles.
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75184085, 0.77564074])

In [106]:
# Keep a handle on this round's bootstrap AUCs before `auc_bootstrap` is
# rebound to a new list later on.
# NOTE(review): this prints every stored value; a summary may be preferable.
t_7 = auc_bootstrap
print(t_7)

[0.7620338132262474, 0.7701747265892003, 0.7583598556049214, 0.7556738080997949, 0.7726818822624745, 0.7633261064593302, 0.7622954759056733, 0.7579994019138756, 0.7616760295625427, 0.7577617694805194, 0.7669733638072453, 0.7714456596035544, 0.7669413234791523, 0.7722840481886535, 0.764890742481203, 0.7670588046821599, 0.7593317455570745, 0.7673818779904307, 0.759483937115516, 0.7596494788106629, 0.7663192071086808, 0.7677530117908408, 0.7679425837320574, 0.7673151273069037, 0.759347765721121, 0.7629549726589201, 0.7792875299043063, 0.7605706382433357, 0.7550783920027341, 0.7585974880382775, 0.7677236414900888, 0.7613048957621327, 0.7634382476076556, 0.7519998504784691, 0.7554521958304854, 0.7709784048188654, 0.7671415755297334, 0.7624369873547505, 0.7627894309637732, 0.757580207621326, 0.7562345138414217, 0.7640096334586466, 0.7618736115857827, 0.7601754741968558, 0.7648106416609706, 0.7645115985987696, 0.7635316985645934, 0.7631632347915243, 0.7620898838004101, 0.7617294301093642, 0.7

In [107]:
# Unbind this round's splits, study and best score before the next round.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [28]:
## 8. Feature removed in the eighth round (on top of rounds 1-7).
column_to_drop_7 = 'Cat_현재 주택의 구조'

In [29]:
# Remove the selected feature from the previous table. A 'Cat_' feature is
# spread over its one-hot columns ("<name>_<category>"), so all of them are
# dropped; any other feature is a single column.
if column_to_drop_7.startswith('Cat_'):
    # Literal prefix match on "<name>_" rather than an unescaped regex, so
    # regex metacharacters in a name, or another feature whose name merely
    # begins with the same text, cannot select the wrong columns.
    cols_to_drop = [c for c in comp_7.columns if c.startswith(column_to_drop_7 + '_')]
else:
    cols_to_drop = [column_to_drop_7]

comp_8 = comp_7.drop(columns=cols_to_drop)
X_8 = comp_8.drop('target', axis=1)
y_8 = comp_8['target']

print(X_8.shape)

(6119, 208)


In [110]:
# Stratified 80/20 train/test split, then a second stratified 80/20 split of
# the training part into train/validation (fixed seed for both).
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [111]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a gradient-boosting classifier.

    Draws one hyper-parameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on the global ``X_train``/``y_train`` and
    returns the ROC AUC of column 1 of ``predict_proba`` on the global
    ``X_val``/``y_val``.
    """
    # Keep the suggestion order as-is: changing it could alter what the
    # seeded sampler draws.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns an AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed sampler seed so the hyper-parameter search can be repeated.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() above never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner presumably has no effect here -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10
# trials without improvement -- confirm against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; the callback is invoked after each one.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [112]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.06999999999999999, 'n_estimators': 160, 'subsample': 0.7000000000000001, 'max_depth': 3, 'min_samples_split': 6, 'min_samples_leaf': 5}
0.8004741274968906


In [113]:
optuna_8 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [115]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(auc_8)

0.7730636961722488


In [116]:
# bootstrap_auc indexes rows with an integer array (X_train[idx]), so both
# objects are converted to NumPy arrays. np.asarray leaves an ndarray as it
# is, so re-running this cell no longer fails the way `.values` does once the
# names already hold arrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [117]:
auc_bootstrap = []

In [118]:
# Seeded generator that bootstrap_auc reads as a global when resampling rows.
rs = RandomState(seed = 8)
# bootstrap_auc calls fit() on the estimator it receives for every resample.
# Pass an unfitted clone (same hyper-parameters and random_state) so optuna_8
# remains the model fitted on the training split -- the one auc_8 was
# computed from -- instead of ending up trained on the last bootstrap sample.
bootstrap_auc(sklearn.base.clone(optuna_8), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75500016, 0.77604131])

In [119]:
t_8 = auc_bootstrap
print(t_8)

[0.7713174982911823, 0.7658679724880384, 0.7695739704374572, 0.7599405117908407, 0.7638761320915927, 0.7587763798701299, 0.7650482740943267, 0.7627013200615174, 0.7714509996582364, 0.7743613294600136, 0.7666422804169515, 0.7705031399521531, 0.7667330613465482, 0.7663779477101846, 0.7748953349282296, 0.765293916609706, 0.7763024393369787, 0.7655128588516746, 0.7583918959330143, 0.7615425281954887, 0.7655128588516746, 0.756544237012987, 0.7636358296308954, 0.7692615772385509, 0.7636465097402598, 0.7518983894395079, 0.7586108381749829, 0.7506514866712235, 0.7655662593984963, 0.7688130126452495, 0.7680146744702665, 0.7565922975051265, 0.7646718002392344, 0.7641137645249488, 0.7644742182159945, 0.7586028280929596, 0.7654808185235816, 0.7627306903622693, 0.7555296266233766, 0.7628668617566643, 0.7636358296308954, 0.7624156271360218, 0.7634168873889269, 0.7613796565276828, 0.7726445018796992, 0.7643460569036227, 0.7533802546138073, 0.7650375939849624, 0.7602555750170882, 0.7668638926862611, 0

In [120]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [30]:
#9.
column_to_drop_8 = '소득 중 재산소득의 비중(월평균)'

In [31]:
# Remove the feature eliminated in this round.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably one-hot dummies), so every column beginning with it is removed.
# The prefix is compared literally rather than compiled into a regex, so
# metacharacters in a feature name cannot change which columns match.
if column_to_drop_8.startswith('Cat_'):
    cols_removed_9 = [col for col in comp_8.columns if str(col).startswith(column_to_drop_8)]
else:
    cols_removed_9 = [column_to_drop_8]

comp_9 = comp_8.drop(columns=cols_removed_9)
X_9 = comp_9.drop(columns='target')
y_9 = comp_9['target']

print(X_9.shape)

(6119, 207)


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [32]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a gradient-boosting classifier.

    Draws one hyper-parameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on the global ``X_train``/``y_train`` and
    returns the ROC AUC of column 1 of ``predict_proba`` on the global
    ``X_val``/``y_val``.
    """
    # Keep the suggestion order as-is: changing it could alter what the
    # seeded sampler draws.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns an AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed sampler seed so the hyper-parameter search can be repeated.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() above never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner presumably has no effect here -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10
# trials without improvement -- confirm against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; the callback is invoked after each one.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [33]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.797577609162013


In [34]:
optuna_9 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [35]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(auc_9)

0.7710077751196172


In [36]:
# bootstrap_auc indexes rows with an integer array (X_train[idx]), so both
# objects are converted to NumPy arrays. np.asarray leaves an ndarray as it
# is, so re-running this cell no longer fails the way `.values` does once the
# names already hold arrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [37]:
auc_bootstrap = []

In [38]:
# Seeded generator that bootstrap_auc reads as a global when resampling rows.
rs = RandomState(seed = 9)
# bootstrap_auc calls fit() on the estimator it receives for every resample.
# Pass an unfitted clone (same hyper-parameters and random_state) so optuna_9
# remains the model fitted on the training split -- the one auc_9 was
# computed from -- instead of ending up trained on the last bootstrap sample.
bootstrap_auc(sklearn.base.clone(optuna_9), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75170281, 0.77547219])

In [39]:
t_9 = auc_bootstrap
print(t_9)

[0.7678571428571428, 0.7601060534859876, 0.7639455528024607, 0.765093664559125, 0.7630751238892686, 0.7698356331168831, 0.7672457065960356, 0.7635824290840738, 0.762650589542037, 0.7671442455570745, 0.7660068139097744, 0.7618682715311005, 0.760850991114149, 0.7666262602529049, 0.7529263499658236, 0.7655956296992482, 0.7604531570403281, 0.7735576512303486, 0.7585948180109365, 0.7555776871155161, 0.7619537124060151, 0.7624236372180451, 0.7665888798701299, 0.774137047163363, 0.7661776956596036, 0.77667624316473, 0.7620364832535884, 0.7700625854408749, 0.772599111414901, 0.7683564379699248, 0.7568459501025291, 0.7521306818181818, 0.7650776443950785, 0.764391447368421, 0.7660735645933014, 0.7561757732399179, 0.7641725051264524, 0.7827585654477102, 0.7659507433356118, 0.7609898325358853, 0.761350286226931, 0.7683858082706767, 0.7613930066643882, 0.7627360304169515, 0.7653793574846206, 0.7587763798701298, 0.765088324504443, 0.7648667122351334, 0.7492070018796992, 0.7484086637047163, 0.7583384

In [40]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [32]:
# 10.
column_to_drop_9 = 'Cat_현재 주택의 위치'

In [33]:
# Remove the feature eliminated in this round.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably one-hot dummies), so every column beginning with it is removed.
# The prefix is compared literally rather than compiled into a regex, so
# metacharacters in a feature name cannot change which columns match.
if column_to_drop_9.startswith('Cat_'):
    cols_removed_10 = [col for col in comp_9.columns if str(col).startswith(column_to_drop_9)]
else:
    cols_removed_10 = [column_to_drop_9]

comp_10 = comp_9.drop(columns=cols_removed_10)
X_10 = comp_10.drop(columns='target')
y_10 = comp_10['target']

print(X_10.shape)

(6119, 204)


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [44]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a gradient-boosting classifier.

    Draws one hyper-parameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on the global ``X_train``/``y_train`` and
    returns the ROC AUC of column 1 of ``predict_proba`` on the global
    ``X_val``/``y_val``.
    """
    # Keep the suggestion order as-is: changing it could alter what the
    # seeded sampler draws.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns an AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed sampler seed so the hyper-parameter search can be repeated.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() above never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner presumably has no effect here -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10
# trials without improvement -- confirm against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; the callback is invoked after each one.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [45]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7978864598201988


In [46]:
optuna_10 = GradientBoostingClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [47]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(auc_10)

0.7765534219070404


In [48]:
# bootstrap_auc indexes rows with an integer array (X_train[idx]), so both
# objects are converted to NumPy arrays. np.asarray leaves an ndarray as it
# is, so re-running this cell no longer fails the way `.values` does once the
# names already hold arrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [49]:
auc_bootstrap = []

In [50]:
# Seeded generator that bootstrap_auc reads as a global when resampling rows.
rs = RandomState(seed = 10)
# bootstrap_auc calls fit() on the estimator it receives for every resample.
# Pass an unfitted clone (same hyper-parameters and random_state) so optuna_10
# remains the model fitted on the training split -- the one auc_10 was
# computed from -- instead of ending up trained on the last bootstrap sample.
bootstrap_auc(sklearn.base.clone(optuna_10), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75547262, 0.77577417])

In [51]:
t_10 = auc_bootstrap
print(t_10)

[0.771272107826384, 0.7656970907382092, 0.7671629357484621, 0.7564935064935066, 0.7659400632262474, 0.7621192541011619, 0.761083283492823, 0.7609818224538619, 0.7629362824675325, 0.769969134483937, 0.7661643455228982, 0.7669466635338347, 0.7635477187286398, 0.7651043446684894, 0.7684071684894053, 0.7703856587491456, 0.7641110944976076, 0.7648560321257689, 0.7709570446001368, 0.7655128588516746, 0.7643380468215994, 0.7672029861585783, 0.7691521061175667, 0.7723214285714286, 0.758613508202324, 0.768028024606972, 0.7574520463089542, 0.7656730604921396, 0.7606240387901573, 0.7579246411483254, 0.7587389994873548, 0.7568032296650719, 0.7658733125427204, 0.7639028323650034, 0.7648053016062883, 0.7592035842447027, 0.7673418275803143, 0.7718888841421737, 0.7679425837320575, 0.7718595138414217, 0.7525819164388243, 0.7625651486671224, 0.7755521616541353, 0.7667677717019823, 0.7672857570061516, 0.764551649008886, 0.75751612696514, 0.7631792549555707, 0.7635530587833219, 0.7628374914559125, 0.76455

In [52]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [34]:
# 11.
column_to_drop_10 = '소득 중 사적이전소득의 비중(월평균)'

In [35]:
# Remove the feature eliminated in this round.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably one-hot dummies), so every column beginning with it is removed.
# The prefix is compared literally rather than compiled into a regex, so
# metacharacters in a feature name cannot change which columns match.
if column_to_drop_10.startswith('Cat_'):
    cols_removed_11 = [col for col in comp_10.columns if str(col).startswith(column_to_drop_10)]
else:
    cols_removed_11 = [column_to_drop_10]

comp_11 = comp_10.drop(columns=cols_removed_11)
X_11 = comp_11.drop(columns='target')
y_11 = comp_11['target']

print(X_11.shape)

(6119, 203)


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [56]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a gradient-boosting classifier.

    Draws one hyper-parameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on the global ``X_train``/``y_train`` and
    returns the ROC AUC of column 1 of ``predict_proba`` on the global
    ``X_val``/``y_val``.
    """
    # Keep the suggestion order as-is: changing it could alter what the
    # seeded sampler draws.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns an AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed sampler seed so the hyper-parameter search can be repeated.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() above never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner presumably has no effect here -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10
# trials without improvement -- confirm against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; the callback is invoked after each one.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [57]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.060000000000000005, 'n_estimators': 144, 'subsample': 0.7000000000000001, 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7982036577934707


In [58]:
optuna_11 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [59]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(auc_11)

0.7698730134996582


In [60]:
# bootstrap_auc indexes rows with an integer array (X_train[idx]), so both
# objects are converted to NumPy arrays. np.asarray leaves an ndarray as it
# is, so re-running this cell no longer fails the way `.values` does once the
# names already hold arrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [61]:
auc_bootstrap = []

In [62]:
# Seeded generator that bootstrap_auc reads as a global when resampling rows.
rs = RandomState(seed = 11)
# bootstrap_auc calls fit() on the estimator it receives for every resample.
# Pass an unfitted clone (same hyper-parameters and random_state) so optuna_11
# remains the model fitted on the training split -- the one auc_11 was
# computed from -- instead of ending up trained on the last bootstrap sample.
bootstrap_auc(sklearn.base.clone(optuna_11), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74942307, 0.77361232])

In [63]:
t_11 = auc_bootstrap
print(t_11)

[0.7626559295967191, 0.7701560363978127, 0.772270698051948, 0.7609391020164046, 0.7619136619958987, 0.757678998632946, 0.7614517472658919, 0.7670321044087491, 0.7634809680451128, 0.7617347701640464, 0.7516313867053999, 0.7525872564935066, 0.7762757390635681, 0.7624797077922078, 0.7598710910799726, 0.7617668104921395, 0.7629683227956254, 0.7678464627477786, 0.757809829972659, 0.7562078135680108, 0.7653312969924811, 0.7499118890977443, 0.7668078221120984, 0.7616466592617908, 0.7585414174641147, 0.758215674128503, 0.7570702323991798, 0.7594679169514695, 0.756506856630212, 0.7522161226930965, 0.7612595052973343, 0.7607548701298701, 0.7525258458646616, 0.7519010594668489, 0.7716806220095693, 0.7580848427887901, 0.7665862098427889, 0.75477133885851, 0.7609017216336296, 0.7540557715311006, 0.7611687243677376, 0.7517595480177717, 0.7599912423103212, 0.7628134612098427, 0.7550436816473, 0.7526326469583049, 0.7527608082706767, 0.7661857057416267, 0.7564187457279562, 0.7669573436431989, 0.7656303

In [64]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [36]:
# 12
column_to_drop_11 = 'Cat_소득 계층'

In [37]:
# Remove the feature eliminated in this round.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably one-hot dummies), so every column beginning with it is removed.
# The prefix is compared literally rather than compiled into a regex, so
# metacharacters in a feature name cannot change which columns match.
if column_to_drop_11.startswith('Cat_'):
    cols_removed_12 = [col for col in comp_11.columns if str(col).startswith(column_to_drop_11)]
else:
    cols_removed_12 = [column_to_drop_11]

comp_12 = comp_11.drop(columns=cols_removed_12)
X_12 = comp_12.drop(columns='target')
y_12 = comp_12['target']

print(X_12.shape)

(6119, 201)


In [67]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [68]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a gradient-boosting classifier.

    Draws one hyper-parameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on the global ``X_train``/``y_train`` and
    returns the ROC AUC of column 1 of ``predict_proba`` on the global
    ``X_val``/``y_val``.
    """
    # Keep the suggestion order as-is: changing it could alter what the
    # seeded sampler draws.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns an AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed sampler seed so the hyper-parameter search can be repeated.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() above never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner presumably has no effect here -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10
# trials without improvement -- confirm against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; the callback is invoked after each one.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [69]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7980408851492918


In [70]:
optuna_12 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [71]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(auc_12)

0.7750608766233766


In [72]:
# bootstrap_auc indexes rows with an integer array (X_train[idx]), so both
# objects are converted to NumPy arrays. np.asarray leaves an ndarray as it
# is, so re-running this cell no longer fails the way `.values` does once the
# names already hold arrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [73]:
auc_bootstrap = []

In [74]:
# Seeded generator that bootstrap_auc reads as a global when resampling rows.
rs = RandomState(seed = 12)
# bootstrap_auc calls fit() on the estimator it receives for every resample.
# Pass an unfitted clone (same hyper-parameters and random_state) so optuna_12
# remains the model fitted on the training split -- the one auc_12 was
# computed from -- instead of ending up trained on the last bootstrap sample.
bootstrap_auc(sklearn.base.clone(optuna_12), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75132246, 0.77500794])

In [75]:
t_12 = auc_bootstrap
print(t_12)

[0.7663031869446343, 0.7597242395762132, 0.7606133586807929, 0.7537914388243336, 0.7646450999658236, 0.7704176990772384, 0.7638147214627478, 0.7686501409774437, 0.7556204075529732, 0.7594358766233765, 0.7686367908407383, 0.7713068181818181, 0.7579086209842789, 0.7563920454545454, 0.7711599666780589, 0.7614517472658919, 0.7639188525290499, 0.7724175495557074, 0.7761849581339714, 0.7651470651059467, 0.7690720052973342, 0.7668959330143541, 0.7630724538619276, 0.759777640123035, 0.7648453520164046, 0.7672590567327409, 0.7651123547505128, 0.764757241114149, 0.7706847018113465, 0.7668024820574162, 0.7627253503075871, 0.761686709671907, 0.7674566387559809, 0.7639508928571429, 0.7627440404989747, 0.7694938696172249, 0.7548514396787422, 0.7571636833561176, 0.7611553742310321, 0.7664340182843472, 0.7579166310663021, 0.7628695317840055, 0.7592276144907724, 0.7798455656185919, 0.7683057074504444, 0.7534550153793576, 0.7610058526999317, 0.7649468130553657, 0.7632406655844156, 0.7679692840054682, 0.

In [76]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [38]:
# 13.
column_to_drop_12 = '부채 중 금융기관 대출금의 비중'

In [39]:
# Remove the feature eliminated in this round.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably one-hot dummies), so every column beginning with it is removed.
# The prefix is compared literally rather than compiled into a regex, so
# metacharacters in a feature name cannot change which columns match.
if column_to_drop_12.startswith('Cat_'):
    cols_removed_13 = [col for col in comp_12.columns if str(col).startswith(column_to_drop_12)]
else:
    cols_removed_13 = [column_to_drop_12]

comp_13 = comp_12.drop(columns=cols_removed_13)
X_13 = comp_13.drop(columns='target')
y_13 = comp_13['target']

print(X_13.shape)

(6119, 200)


In [79]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [80]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a gradient-boosting classifier.

    Draws one hyper-parameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on the global ``X_train``/``y_train`` and
    returns the ROC AUC of column 1 of ``predict_proba`` on the global
    ``X_val``/``y_val``.
    """
    # Keep the suggestion order as-is: changing it could alter what the
    # seeded sampler draws.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns an AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed sampler seed so the hyper-parameter search can be repeated.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() above never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner presumably has no effect here -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10
# trials without improvement -- confirm against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; the callback is invoked after each one.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [81]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7971769380378801


In [82]:
optuna_13 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [83]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(auc_13)

0.7764893412508544


In [84]:
# bootstrap_auc indexes rows with an integer array (X_train[idx]), so both
# objects are converted to NumPy arrays. np.asarray leaves an ndarray as it
# is, so re-running this cell no longer fails the way `.values` does once the
# names already hold arrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [85]:
auc_bootstrap = []

In [86]:
# Seeded generator that bootstrap_auc reads as a global when resampling rows.
rs = RandomState(seed = 13)
# bootstrap_auc calls fit() on the estimator it receives for every resample.
# Pass an unfitted clone (same hyper-parameters and random_state) so optuna_13
# remains the model fitted on the training split -- the one auc_13 was
# computed from -- instead of ending up trained on the last bootstrap sample.
bootstrap_auc(sklearn.base.clone(optuna_13), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75135911, 0.77511308])

In [87]:
t_13 = auc_bootstrap
print(t_13)

[0.7728100435748462, 0.7688637431647299, 0.7637666609706084, 0.7420166182501708, 0.7640523538961039, 0.7789858168147642, 0.76352368848257, 0.7615745685235817, 0.7691467660628846, 0.7608323009227614, 0.7677636919002051, 0.7602609150717703, 0.7636304895762132, 0.7677610218728641, 0.7649628332194123, 0.7586588986671223, 0.7630964841079972, 0.7625384483937114, 0.7711813268967874, 0.7643300367395762, 0.7701640464798359, 0.764220565618592, 0.7634515977443609, 0.7705298402255639, 0.7694671693438141, 0.7523549641148326, 0.7553694249829118, 0.7709517045454546, 0.7613903366370471, 0.7701506963431306, 0.7702681775461381, 0.7754240003417634, 0.7589472616199591, 0.7686634911141491, 0.7711412764866712, 0.764757241114149, 0.7638360816814764, 0.7671068651742994, 0.7701266660970607, 0.7627707407723856, 0.7594946172248804, 0.7611927546138072, 0.7534416652426521, 0.7563573350991114, 0.756942071086808, 0.76028494531784, 0.7588538106630212, 0.7602635850991115, 0.7670107441900205, 0.7609177417976759, 0.7593

In [88]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [40]:
#14.
column_to_drop_13 = 'Cat_가구주 성별'

In [41]:
# Remove the feature eliminated in this round.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably one-hot dummies), so every column beginning with it is removed.
# The prefix is compared literally rather than compiled into a regex, so
# metacharacters in a feature name cannot change which columns match.
if column_to_drop_13.startswith('Cat_'):
    cols_removed_14 = [col for col in comp_13.columns if str(col).startswith(column_to_drop_13)]
else:
    cols_removed_14 = [column_to_drop_13]

comp_14 = comp_13.drop(columns=cols_removed_14)
X_14 = comp_14.drop(columns='target')
y_14 = comp_14['target']

print(X_14.shape)

(6119, 198)


In [42]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [43]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a gradient-boosting classifier.

    Draws one hyper-parameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on the global ``X_train``/``y_train`` and
    returns the ROC AUC of column 1 of ``predict_proba`` on the global
    ``X_val``/``y_val``.
    """
    # Keep the suggestion order as-is: changing it could alter what the
    # seeded sampler draws.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns an AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed sampler seed so the hyper-parameter search can be repeated.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() above never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner presumably has no effect here -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10
# trials without improvement -- confirm against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; the callback is invoked after each one.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [44]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.8023314051035484


In [45]:
optuna_14 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [46]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(auc_14)

0.7756910030758716


In [47]:
# bootstrap_auc indexes rows with an integer array (X_train[idx]), so both
# objects are converted to NumPy arrays. np.asarray leaves an ndarray as it
# is, so re-running this cell no longer fails the way `.values` does once the
# names already hold arrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [48]:
auc_bootstrap = []

In [49]:
# Seeded generator that bootstrap_auc reads as a global when resampling rows.
rs = RandomState(seed = 14)
# bootstrap_auc calls fit() on the estimator it receives for every resample.
# Pass an unfitted clone (same hyper-parameters and random_state) so optuna_14
# remains the model fitted on the training split -- the one auc_14 was
# computed from -- instead of ending up trained on the last bootstrap sample.
bootstrap_auc(sklearn.base.clone(optuna_14), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75154294, 0.77524244])

In [50]:
t_14 = auc_bootstrap
print(t_14)

[0.7619056519138756, 0.7526833774777854, 0.7716405715994532, 0.7651417250512645, 0.7574520463089542, 0.7538181390977444, 0.7576389482228297, 0.7519838303144224, 0.7659160329801777, 0.7551131023581682, 0.7690239448051949, 0.7515913362952836, 0.7628775418660286, 0.7630003631237183, 0.7635076683185236, 0.7723855092276145, 0.7603303357826384, 0.7572197539302803, 0.7547900290498974, 0.7606053485987696, 0.7676221804511277, 0.7723561389268627, 0.7653579972658919, 0.7635236884825701, 0.776075487012987, 0.7617988508202324, 0.7609043916609706, 0.7590380425495558, 0.7619457023239917, 0.7654327580314422, 0.7592009142173616, 0.7582904348940532, 0.7731598171565277, 0.7563226247436773, 0.7566777383800409, 0.7605546180792891, 0.765689080656186, 0.762818801264525, 0.7687596120984279, 0.7569634313055367, 0.7621593045112781, 0.7658626324333561, 0.7691734663362952, 0.7589152212918661, 0.7611260039302803, 0.7602101845522897, 0.7678037423103212, 0.7589392515379358, 0.7746416823308271, 0.758346505468216, 0.7

In [51]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [42]:
## 15.
column_to_drop_14 = '부채 중 비금융기관 대출금의 비중'

In [43]:
# Remove the feature eliminated in this round.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably one-hot dummies), so every column beginning with it is removed.
# The prefix is compared literally rather than compiled into a regex, so
# metacharacters in a feature name cannot change which columns match.
if column_to_drop_14.startswith('Cat_'):
    cols_removed_15 = [col for col in comp_14.columns if str(col).startswith(column_to_drop_14)]
else:
    cols_removed_15 = [column_to_drop_14]

comp_15 = comp_14.drop(columns=cols_removed_15)
X_15 = comp_15.drop(columns='target')
y_15 = comp_15['target']

print(X_15.shape)

(6119, 197)


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [55]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a gradient-boosting classifier.

    Draws one hyper-parameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on the global ``X_train``/``y_train`` and
    returns the ROC AUC of column 1 of ``predict_proba`` on the global
    ``X_val``/``y_val``.
    """
    # Keep the suggestion order as-is: changing it could alter what the
    # seeded sampler draws.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns an AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed sampler seed so the hyper-parameter search can be repeated.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() above never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner presumably has no effect here -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10
# trials without improvement -- confirm against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; the callback is invoked after each one.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [56]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 144, 'subsample': 0.8, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 9}
0.7997938213173731


In [57]:
optuna_15 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [58]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(auc_15)

0.7744040498974709


In [59]:
# bootstrap_auc indexes rows with an integer array (X_train[idx]), so both
# objects are converted to NumPy arrays. np.asarray leaves an ndarray as it
# is, so re-running this cell no longer fails the way `.values` does once the
# names already hold arrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [60]:
auc_bootstrap = []

In [61]:
# Seeded generator that bootstrap_auc reads as a global when resampling rows.
rs = RandomState(seed = 15)
# bootstrap_auc calls fit() on the estimator it receives for every resample.
# Pass an unfitted clone (same hyper-parameters and random_state) so optuna_15
# remains the model fitted on the training split -- the one auc_15 was
# computed from -- instead of ending up trained on the last bootstrap sample.
bootstrap_auc(sklearn.base.clone(optuna_15), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75107522, 0.77514925])

In [62]:
t_15 = auc_bootstrap
print(t_15)

[0.7640523538961039, 0.767563439849624, 0.7619590524606972, 0.7514204545454546, 0.7614971377306903, 0.769595330656186, 0.7664099880382775, 0.7711599666780589, 0.7578685705741627, 0.7581248931989064, 0.7664340182843472, 0.7606293788448394, 0.7637319506151744, 0.748331232911825, 0.7585280673274095, 0.767192306049214, 0.7656143198906356, 0.7778991156869446, 0.7642632860560491, 0.757078242481203, 0.7691334159261791, 0.7580901828434723, 0.7689999145591251, 0.7686074205399862, 0.7708929639439508, 0.7581142130895421, 0.7639322026657553, 0.7674886790840739, 0.7662017259056733, 0.7587016191045797, 0.7526940575871498, 0.7592196044087491, 0.759753609876965, 0.7530117908407381, 0.7633741669514695, 0.7609551221804511, 0.7590006621667806, 0.7607068096377307, 0.7642846462747778, 0.7723134184894054, 0.773739213089542, 0.7676782510252905, 0.752168062200957, 0.7713628887559809, 0.7705058099794941, 0.7691894865003418, 0.7731411269651401, 0.7673685278537252, 0.7569500811688312, 0.7674165883458647, 0.76749

In [63]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [44]:
# 16.
column_to_drop_15 = 'Cat_현재 공공기관 접근용이성'

In [45]:
# Remove the feature eliminated in this round.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably one-hot dummies), so every column beginning with it is removed.
# The prefix is compared literally rather than compiled into a regex, so
# metacharacters in a feature name cannot change which columns match.
if column_to_drop_15.startswith('Cat_'):
    cols_removed_16 = [col for col in comp_15.columns if str(col).startswith(column_to_drop_15)]
else:
    cols_removed_16 = [column_to_drop_15]

comp_16 = comp_15.drop(columns=cols_removed_16)
X_16 = comp_16.drop(columns='target')
y_16 = comp_16['target']

print(X_16.shape)

(6119, 193)


In [66]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [67]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a gradient-boosting classifier.

    Draws one hyper-parameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on the global ``X_train``/``y_train`` and
    returns the ROC AUC of column 1 of ``predict_proba`` on the global
    ``X_val``/``y_val``.
    """
    # Keep the suggestion order as-is: changing it could alter what the
    # seeded sampler draws.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns an AUC, so the study searches for the maximum.
direction = "maximize"
# Fixed sampler seed so the hyper-parameter search can be repeated.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() above never calls trial.report()/trial.should_prune(),
# so the HyperbandPruner presumably has no effect here -- confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10
# trials without improvement -- confirm against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials; the callback is invoked after each one.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [68]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7998689471531482


In [69]:
optuna_16 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [70]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(auc_16)

0.7709303443267259


In [71]:
# bootstrap_auc indexes rows with an integer array (X_train[idx]), so both
# objects are converted to NumPy arrays. np.asarray leaves an ndarray as it
# is, so re-running this cell no longer fails the way `.values` does once the
# names already hold arrays.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [72]:
auc_bootstrap = []

In [73]:
# Seeded generator that bootstrap_auc reads as a global when resampling rows.
rs = RandomState(seed = 16)
# bootstrap_auc calls fit() on the estimator it receives for every resample.
# Pass an unfitted clone (same hyper-parameters and random_state) so optuna_16
# remains the model fitted on the training split -- the one auc_16 was
# computed from -- instead of ending up trained on the last bootstrap sample.
bootstrap_auc(sklearn.base.clone(optuna_16), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75040978, 0.77504873])

In [74]:
t_16 = auc_bootstrap
print(t_16)

[0.7620685235816814, 0.7599218215994532, 0.7558927503417635, 0.7533161739576213, 0.7530598513328777, 0.7703002178742311, 0.7494900247778538, 0.765625, 0.764583689336979, 0.7628294813738893, 0.750202922077922, 0.7500240302460699, 0.7598470608339029, 0.7644208176691728, 0.7636732100136705, 0.7671469155844156, 0.7727513029733425, 0.767560769822283, 0.7716112012987012, 0.7648720522898154, 0.7640123034859876, 0.7523336038961039, 0.7725483808954203, 0.7729889354066986, 0.7511828221120984, 0.759115473342447, 0.7741236970266575, 0.7667437414559125, 0.7686501409774436, 0.7649388029733424, 0.7759580058099795, 0.7621005639097745, 0.7719155844155845, 0.768233616712235, 0.7655902896445659, 0.751834308783322, 0.7663752776828434, 0.7741343771360217, 0.7591234834244702, 0.7632086252563226, 0.7627814208817498, 0.7686154306220095, 0.7715524606971975, 0.7685032894736842, 0.7600526529391661, 0.7639001623376623, 0.7687676221804512, 0.771768732911825, 0.7625678186944633, 0.754173252734108, 0.768570040157211

In [75]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [46]:
# 17.
column_to_drop_16 = 'Cat_현재 주변도로의 보행 안전'

In [47]:
# Remove the feature eliminated in this round.
# A name starting with 'Cat_' is treated as a prefix shared by several columns
# (presumably one-hot dummies), so every column beginning with it is removed.
# The prefix is compared literally rather than compiled into a regex, so
# metacharacters in a feature name cannot change which columns match.
if column_to_drop_16.startswith('Cat_'):
    cols_removed_17 = [col for col in comp_16.columns if str(col).startswith(column_to_drop_16)]
else:
    cols_removed_17 = [column_to_drop_16]

comp_17 = comp_16.drop(columns=cols_removed_17)
X_17 = comp_17.drop(columns='target')
y_17 = comp_17['target']

print(X_17.shape)

(6119, 189)


In [78]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [79]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    second ``predict_proba`` column on ``X_val``/``y_val``.
    """
    # The suggestions are requested in the same order as before.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise validation AUC, using a fixed sampler seed.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to prune — confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# 10 is the callback's early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [80]:
# Best trial of the search: keep its validation AUC in optuna_auc, then show params and score.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7997896476598302


In [81]:
# Refit on the training split with the best hyperparameters from the study.
optuna_17 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_17.fit(X_train, y_train)

In [82]:
# ROC AUC of the refit model on the held-out test split (second predict_proba column).
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_true=y_test, y_score=optuna_proba_17)
print(auc_17)

0.7753946300410115


In [83]:
# bootstrap_auc() indexes its training inputs positionally (X_train[idx]), so hand it plain arrays.
# np.asarray keeps this cell re-runnable: it leaves existing ndarrays untouched,
# whereas `.values` raises AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [84]:
# Reset the global list that bootstrap_auc() appends its per-resample AUCs to.
auc_bootstrap = []

In [85]:
# bootstrap_auc() draws its resampling indices from the global `rs`.
rs = RandomState(seed=17)
# Pass an unfitted clone: bootstrap_auc() refits its classifier on every resample,
# which would otherwise replace the fitted optuna_17 with the last bootstrap fit.
bootstrap_auc(sklearn.base.clone(optuna_17), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75182323, 0.77524411])

In [87]:
# Snapshot this iteration's bootstrap AUCs. Copy instead of aliasing the global list,
# so a later bootstrap_auc() run that appends to auc_bootstrap cannot also grow t_17.
t_17 = list(auc_bootstrap)
print(t_17)

[0.761651999316473, 0.7708048530416951, 0.7577350692071086, 0.7593183954203692, 0.7618362312030075, 0.7681802161654137, 0.7631205143540669, 0.768631450786056, 0.7621165840738209, 0.7690906954887218, 0.7650990046138073, 0.7706259612098428, 0.7555643369788108, 0.760818950786056, 0.7659373931989063, 0.7665167891319208, 0.7683350777511961, 0.7655448991797678, 0.7721104964114833, 0.7661429853041695, 0.7577911397812714, 0.7650963345864661, 0.7587550196514012, 0.7611927546138073, 0.7786520633971291, 0.7634248974709501, 0.7591555237525633, 0.7641618250170883, 0.7559461508885852, 0.7624343173274094, 0.7706660116199591, 0.7660308441558442, 0.7680680750170881, 0.7598630809979494, 0.7804783620984279, 0.773138456937799, 0.7606720992822966, 0.7712373974709501, 0.7566109876965139, 0.764057693950786, 0.7618976418318524, 0.7536819677033494, 0.7694057587149692, 0.7629442925495558, 0.7654781484962406, 0.774406719924812, 0.7635824290840738, 0.7699985047846889, 0.7526993976418319, 0.7645115985987697, 0.757

In [88]:
# Unbind this iteration's splits, study and validation score before the next iteration rebuilds them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [48]:
# Iteration 18: feature removed from comp_17 in the next cell
# ('Cat_' names are matched there as a column-name prefix).
column_to_drop_17 ='Cat_이사 계획 중인 주택의 유형'

In [49]:
# Build comp_18 by removing the selected feature from comp_17, then split into X/y.
if column_to_drop_17.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns. Match it literally
    # (not as a regex) so special characters in a name cannot change the selection.
    drop_cols_18 = [col for col in comp_17.columns if str(col).startswith(column_to_drop_17)]
else:
    drop_cols_18 = [column_to_drop_17]

comp_18 = comp_17.drop(columns=drop_cols_18)
X_18 = comp_18.drop(columns='target')
y_18 = comp_18['target']

print(X_18.shape)

(6119, 175)


In [91]:
# Hold out 20% of the iteration-18 data as the test split (stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X_18, y_18,
    test_size=0.2, shuffle=True, stratify=y_18, random_state=0,
)
# Split a further 20% off the remaining training rows as the validation split used for tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [92]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    second ``predict_proba`` column on ``X_val``/``y_val``.
    """
    # The suggestions are requested in the same order as before.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise validation AUC, using a fixed sampler seed.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to prune — confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# 10 is the callback's early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [93]:
# Best trial of the search: keep its validation AUC in optuna_auc, then show params and score.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 90, 'subsample': 0.6, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 8}
0.7983455621499345


In [94]:
# Refit on the training split with the best hyperparameters from the study.
optuna_18 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_18.fit(X_train, y_train)

In [95]:
# ROC AUC of the refit model on the held-out test split (second predict_proba column).
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_true=y_test, y_score=optuna_proba_18)
print(auc_18)

0.7799870770676691


In [96]:
# bootstrap_auc() indexes its training inputs positionally (X_train[idx]), so hand it plain arrays.
# np.asarray keeps this cell re-runnable: it leaves existing ndarrays untouched,
# whereas `.values` raises AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [97]:
# Reset the global list that bootstrap_auc() appends its per-resample AUCs to.
auc_bootstrap = []

In [98]:
# bootstrap_auc() draws its resampling indices from the global `rs`.
rs = RandomState(seed=18)
# Pass an unfitted clone: bootstrap_auc() refits its classifier on every resample,
# which would otherwise replace the fitted optuna_18 with the last bootstrap fit.
bootstrap_auc(sklearn.base.clone(optuna_18), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75965001, 0.77935448])

In [99]:
# Snapshot this iteration's bootstrap AUCs. Copy instead of aliasing the global list,
# so a later bootstrap_auc() run that appends to auc_bootstrap cannot also grow t_18.
t_18 = list(auc_bootstrap)
print(t_18)

[0.7663966379015722, 0.7729782552973343, 0.7748953349282296, 0.7701773966165414, 0.7686928614149008, 0.7758004741968558, 0.7704470693779903, 0.76866616114149, 0.7756456126110731, 0.774008885850991, 0.7683430878332194, 0.7694324589883801, 0.7679586038961039, 0.769600670710868, 0.7541198521872865, 0.7729542250512645, 0.7799310064935066, 0.7685807202665755, 0.7714136192754613, 0.7672403665413534, 0.7737498931989064, 0.7718061132946001, 0.7681401657552973, 0.7659454032809296, 0.7698543233082707, 0.7715711508885852, 0.7701426862611074, 0.7747831937799043, 0.7740569463431305, 0.7628321514012304, 0.7685059595010253, 0.768893113465482, 0.7777495941558442, 0.7804756920710868, 0.7740195659603554, 0.7674165883458646, 0.7698543233082707, 0.7675207194121668, 0.7705965909090909, 0.7754320104237868, 0.7694538192071088, 0.7651257048872181, 0.7725644010594668, 0.7635290285372522, 0.7704977998974709, 0.7674406185919344, 0.7665862098427888, 0.7650082236842106, 0.7640016233766235, 0.7755041011619959, 0.76

In [100]:
# Unbind this iteration's splits, study and validation score before the next iteration rebuilds them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [50]:
# Iteration 19: feature removed from comp_18 in the next cell
# ('Cat_' names are matched there as a column-name prefix).
column_to_drop_18 = 'Cat_현재 대기오염 정도'

In [51]:
# Build comp_19 by removing the selected feature from comp_18, then split into X/y.
if column_to_drop_18.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns. Match it literally
    # (not as a regex) so special characters in a name cannot change the selection.
    drop_cols_19 = [col for col in comp_18.columns if str(col).startswith(column_to_drop_18)]
else:
    drop_cols_19 = [column_to_drop_18]

comp_19 = comp_18.drop(columns=drop_cols_19)
X_19 = comp_19.drop(columns='target')
y_19 = comp_19['target']

print(X_19.shape)

(6119, 171)


In [103]:
# Hold out 20% of the iteration-19 data as the test split (stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X_19, y_19,
    test_size=0.2, shuffle=True, stratify=y_19, random_state=0,
)
# Split a further 20% off the remaining training rows as the validation split used for tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [104]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    second ``predict_proba`` column on ``X_val``/``y_val``.
    """
    # The suggestions are requested in the same order as before.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise validation AUC, using a fixed sampler seed.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to prune — confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# 10 is the callback's early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [105]:
# Best trial of the search: keep its validation AUC in optuna_auc, then show params and score.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'learning_rate': 0.04, 'n_estimators': 192, 'subsample': 1.0, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 5}
0.7961502182822895


In [106]:
# Refit on the training split with the best hyperparameters from the study.
optuna_19 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_19.fit(X_train, y_train)

In [107]:
# ROC AUC of the refit model on the held-out test split (second predict_proba column).
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_true=y_test, y_score=optuna_proba_19)
print(auc_19)

0.7767216336295284


In [108]:
# bootstrap_auc() indexes its training inputs positionally (X_train[idx]), so hand it plain arrays.
# np.asarray keeps this cell re-runnable: it leaves existing ndarrays untouched,
# whereas `.values` raises AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [109]:
# Reset the global list that bootstrap_auc() appends its per-resample AUCs to.
auc_bootstrap = []

In [110]:
# bootstrap_auc() draws its resampling indices from the global `rs`.
rs = RandomState(seed=19)
# Pass an unfitted clone: bootstrap_auc() refits its classifier on every resample,
# which would otherwise replace the fitted optuna_19 with the last bootstrap fit.
bootstrap_auc(sklearn.base.clone(optuna_19), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74664064, 0.77277947])

In [111]:
# Snapshot this iteration's bootstrap AUCs. Copy instead of aliasing the global list,
# so a later bootstrap_auc() run that appends to auc_bootstrap cannot also grow t_19.
t_19 = list(auc_bootstrap)
print(t_19)

[0.7644688781613124, 0.7484353639781272, 0.7563786953178401, 0.7646504400205058, 0.7565175367395762, 0.7701186560150377, 0.7571930536568694, 0.7724602699931647, 0.7637826811346548, 0.7551317925495556, 0.7622928058783321, 0.7563840353725222, 0.7667971420027341, 0.7584212662337663, 0.7536819677033492, 0.7561250427204375, 0.7623435363978127, 0.7552118933697882, 0.7645089285714286, 0.7687382518796992, 0.7589659518113465, 0.7622954759056733, 0.76866616114149, 0.7593237354750514, 0.7550837320574162, 0.7601754741968558, 0.7553614149008886, 0.7521280117908408, 0.7632273154477102, 0.7644021274777855, 0.7602982954545455, 0.7593477657211211, 0.766727721291866, 0.7671469155844155, 0.7579994019138756, 0.7676675709159263, 0.7661670155502392, 0.7704070189678742, 0.7652191558441559, 0.7544429254955571, 0.7607388499658236, 0.7612968856801094, 0.7626078691045797, 0.7693683783321941, 0.753398944805195, 0.7593798060492141, 0.7555509868421053, 0.7631418745727957, 0.7735977016404648, 0.7631765849282296, 0.7

In [52]:
# Iteration 20: feature removed from comp_19 in the next cell
# ('Cat_' names are matched there as a column-name prefix).
column_to_drop_19 = 'Cat_주택 보유 의식'

In [53]:
# Build comp_20 by removing the selected feature from comp_19, then split into X/y.
if column_to_drop_19.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns. Match it literally
    # (not as a regex) so special characters in a name cannot change the selection.
    drop_cols_20 = [col for col in comp_19.columns if str(col).startswith(column_to_drop_19)]
else:
    drop_cols_20 = [column_to_drop_19]

comp_20 = comp_19.drop(columns=drop_cols_20)
X_20 = comp_20.drop(columns='target')
y_20 = comp_20['target']

print(X_20.shape)

(6119, 169)


In [114]:
# Hold out 20% of the iteration-20 data as the test split (stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X_20, y_20,
    test_size=0.2, shuffle=True, stratify=y_20, random_state=0,
)
# Split a further 20% off the remaining training rows as the validation split used for tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [115]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    second ``predict_proba`` column on ``X_val``/``y_val``.
    """
    # The suggestions are requested in the same order as before.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise validation AUC, using a fixed sampler seed.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to prune — confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# 10 is the callback's early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [116]:
# Best trial of the search: keep its validation AUC in optuna_auc, then show params and score.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7992554194943197


In [117]:
# Refit on the training split with the best hyperparameters from the study.
optuna_20 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_20.fit(X_train, y_train)

In [118]:
# ROC AUC of the refit model on the held-out test split (second predict_proba column).
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_true=y_test, y_score=optuna_proba_20)
print(auc_20)

0.7724789601845523


In [119]:
# bootstrap_auc() indexes its training inputs positionally (X_train[idx]), so hand it plain arrays.
# np.asarray keeps this cell re-runnable: it leaves existing ndarrays untouched,
# whereas `.values` raises AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [120]:
# Reset the global list that bootstrap_auc() appends its per-resample AUCs to.
auc_bootstrap = []

In [121]:
# bootstrap_auc() draws its resampling indices from the global `rs`.
rs = RandomState(seed=20)
# Pass an unfitted clone: bootstrap_auc() refits its classifier on every resample,
# which would otherwise replace the fitted optuna_20 with the last bootstrap fit.
bootstrap_auc(sklearn.base.clone(optuna_20), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75057713, 0.77403325])

In [122]:
# Snapshot this iteration's bootstrap AUCs. Copy instead of aliasing the global list,
# so a later bootstrap_auc() run that appends to auc_bootstrap cannot also grow t_20.
t_20 = list(auc_bootstrap)
print(t_20)

[0.7703723086124401, 0.7731384569377989, 0.7693149777853725, 0.7651443950786057, 0.7562718942241968, 0.7749887858851675, 0.773101076555024, 0.7681107954545454, 0.7546111372180452, 0.773234577922078, 0.7569847915242651, 0.7682736671223513, 0.7622447453861929, 0.7617748205741628, 0.7580180921052632, 0.7656463602187287, 0.7521947624743677, 0.7572784945317839, 0.7647412209501024, 0.755575017088175, 0.7621406143198906, 0.752835569036227, 0.7687489319890635, 0.7531372821257689, 0.7675394096035543, 0.7602502349624061, 0.7687489319890635, 0.7579700316131237, 0.754410885167464, 0.7676862611073139, 0.761286205570745, 0.7597749700956937, 0.759318395420369, 0.7545230263157894, 0.7577884697539302, 0.7626452494873548, 0.7581382433356116, 0.7673818779904306, 0.7609444420710868, 0.7652992566643881, 0.7634809680451127, 0.7628989020847574, 0.7665114490772386, 0.7634943181818182, 0.7474474538619276, 0.7586081681476419, 0.7592783450102529, 0.7544455955228981, 0.7582770847573479, 0.7637292805878333, 0.7663

In [123]:
# Unbind this iteration's splits, study and validation score before the next iteration rebuilds them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [54]:
# Iteration 21: feature removed from comp_20 in the next cell
# (no 'Cat_' prefix, so it is dropped there as a single column).
column_to_drop_20 = '총 가구원 수'

In [55]:
# Build comp_21 by removing the selected feature from comp_20, then split into X/y.
if column_to_drop_20.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns. Match it literally
    # (not as a regex) so special characters in a name cannot change the selection.
    drop_cols_21 = [col for col in comp_20.columns if str(col).startswith(column_to_drop_20)]
else:
    drop_cols_21 = [column_to_drop_20]

comp_21 = comp_20.drop(columns=drop_cols_21)
X_21 = comp_21.drop(columns='target')
y_21 = comp_21['target']

print(X_21.shape)

(6119, 168)


In [55]:
# Hold out 20% of the iteration-21 data as the test split (stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X_21, y_21,
    test_size=0.2, shuffle=True, stratify=y_21, random_state=0,
)
# Split a further 20% off the remaining training rows as the validation split used for tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [56]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    second ``predict_proba`` column on ``X_val``/``y_val``.
    """
    # The suggestions are requested in the same order as before.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise validation AUC, using a fixed sampler seed.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to prune — confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# 10 is the callback's early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [57]:
# Best trial of the search: keep its validation AUC in optuna_auc, then show params and score.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7972270219283967


In [58]:
# Refit on the training split with the best hyperparameters from the study.
optuna_21 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_21.fit(X_train, y_train)

In [59]:
# ROC AUC of the refit model on the held-out test split (second predict_proba column).
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_true=y_test, y_score=optuna_proba_21)
print(auc_21)

0.7779712064251538


In [60]:
# bootstrap_auc() indexes its training inputs positionally (X_train[idx]), so hand it plain arrays.
# np.asarray keeps this cell re-runnable: it leaves existing ndarrays untouched,
# whereas `.values` raises AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [61]:
# Reset the global list that bootstrap_auc() appends its per-resample AUCs to.
auc_bootstrap = []

In [62]:
# bootstrap_auc() draws its resampling indices from the global `rs`.
rs = RandomState(seed=21)
# Pass an unfitted clone: bootstrap_auc() refits its classifier on every resample,
# which would otherwise replace the fitted optuna_21 with the last bootstrap fit.
bootstrap_auc(sklearn.base.clone(optuna_21), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75385499, 0.77521367])

In [63]:
# Snapshot this iteration's bootstrap AUCs. Copy instead of aliasing the global list,
# so a later bootstrap_auc() run that appends to auc_bootstrap cannot also grow t_21.
t_21 = list(auc_bootstrap)
print(t_21)

[0.7676862611073136, 0.7611046437115515, 0.7630057031784006, 0.7686287807587149, 0.7597429297676008, 0.7666075700615174, 0.7667384014012304, 0.7647679212235133, 0.7745509014012303, 0.7653152768284348, 0.7672430365686944, 0.767093515037594, 0.7640042934039645, 0.7675046992481203, 0.7662471163704717, 0.7738540242652084, 0.7671095352016404, 0.7732292378673957, 0.7676568908065617, 0.766359257518797, 0.7656677204374572, 0.7719583048530416, 0.7789644565960356, 0.7723908492822966, 0.7644635381066303, 0.7597375897129187, 0.766062884483937, 0.7654327580314422, 0.7548701298701299, 0.7677556818181819, 0.7620284731715653, 0.7642339157552973, 0.7663085269993164, 0.7560956724196857, 0.7692962875939848, 0.7648827323991798, 0.7568219198564594, 0.7695339200273411, 0.7597295796308953, 0.768895783492823, 0.764458198051948, 0.7679719540328093, 0.751767558099795, 0.7651630852699931, 0.7662978468899522, 0.7582530545112782, 0.7630404135338346, 0.7704363892686261, 0.7660041438824332, 0.7624530075187971, 0.762

In [64]:
# Unbind this iteration's splits, study and validation score before the next iteration rebuilds them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [56]:
# Iteration 22: feature removed from comp_21 in the next cell
# (no 'Cat_' prefix, so it is dropped there as a single column).
column_to_drop_21 = '총 이사 횟수'

In [57]:
# Build comp_22 by removing the selected feature from comp_21, then split into X/y.
if column_to_drop_21.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns. Match it literally
    # (not as a regex) so special characters in a name cannot change the selection.
    drop_cols_22 = [col for col in comp_21.columns if str(col).startswith(column_to_drop_21)]
else:
    drop_cols_22 = [column_to_drop_21]

comp_22 = comp_21.drop(columns=drop_cols_22)
X_22 = comp_22.drop(columns='target')
y_22 = comp_22['target']

print(X_22.shape)

(6119, 167)


In [67]:
# Hold out 20% of the iteration-22 data as the test split (stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X_22, y_22,
    test_size=0.2, shuffle=True, stratify=y_22, random_state=0,
)
# Split a further 20% off the remaining training rows as the validation split used for tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [68]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    second ``predict_proba`` column on ``X_val``/``y_val``.
    """
    # The suggestions are requested in the same order as before.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise validation AUC, using a fixed sampler seed.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to prune — confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# 10 is the callback's early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [69]:
# Best trial of the search: keep its validation AUC in optuna_auc, then show params and score.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'learning_rate': 0.060000000000000005, 'n_estimators': 147, 'subsample': 0.7000000000000001, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7985918079449744


In [70]:
# Refit on the training split with the best hyperparameters from the study.
optuna_22 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_22.fit(X_train, y_train)

In [71]:
# ROC AUC of the refit model on the held-out test split (second predict_proba column).
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_true=y_test, y_score=optuna_proba_22)
print(auc_22)

0.7752691387559808


In [72]:
# bootstrap_auc() indexes its training inputs positionally (X_train[idx]), so hand it plain arrays.
# np.asarray keeps this cell re-runnable: it leaves existing ndarrays untouched,
# whereas `.values` raises AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [73]:
# Reset the global list that bootstrap_auc() appends its per-resample AUCs to.
auc_bootstrap = []

In [74]:
# bootstrap_auc() draws its resampling indices from the global `rs`.
rs = RandomState(seed=22)
# Pass an unfitted clone: bootstrap_auc() refits its classifier on every resample,
# which would otherwise replace the fitted optuna_22 with the last bootstrap fit.
bootstrap_auc(sklearn.base.clone(optuna_22), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75798805, 0.77787555])

In [75]:
# Snapshot this iteration's bootstrap AUCs. Copy instead of aliasing the global list,
# so a later bootstrap_auc() run that appends to auc_bootstrap cannot also grow t_22.
t_22 = list(auc_bootstrap)
print(t_22)

[0.7654781484962405, 0.7665995599794941, 0.7660815746753247, 0.7587096291866029, 0.7711119061859194, 0.7736030416951469, 0.7668532125768968, 0.7694404690704033, 0.7698756835269993, 0.7724602699931646, 0.7706206211551605, 0.7691067156527684, 0.7677823820915926, 0.7691387559808613, 0.7702014268626112, 0.7660148239917977, 0.7667117011278195, 0.7763237995557074, 0.7620952238550922, 0.7660335141831852, 0.7712427375256322, 0.7779818865345182, 0.7724442498291183, 0.7726845522898155, 0.7704470693779905, 0.7704577494873549, 0.761434392088175, 0.7585601076555024, 0.7714056091934383, 0.7644368378332194, 0.7677102913533834, 0.7728474239576213, 0.7673792079630896, 0.7619830827067671, 0.768730241797676, 0.7674139183185236, 0.7609364319890635, 0.7675901401230348, 0.7678144224196856, 0.7611019736842106, 0.7834020420369106, 0.7678811731032125, 0.7676542207792207, 0.7704257091592617, 0.7696674213943951, 0.7687649521531101, 0.7648747223171566, 0.7630430835611757, 0.777546672077922, 0.7630804639439508, 0.

In [76]:
# Unbind this iteration's splits, study and validation score before the next iteration rebuilds them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [58]:
# Iteration 23: feature removed from comp_22 in the next cell
# ('Cat_' names are matched there as a column-name prefix).
column_to_drop_22 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [59]:
# Build comp_23 by removing the selected feature from comp_22, then split into X/y.
if column_to_drop_22.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns. Match it literally
    # (not as a regex) so special characters in a name cannot change the selection.
    drop_cols_23 = [col for col in comp_22.columns if str(col).startswith(column_to_drop_22)]
else:
    drop_cols_23 = [column_to_drop_22]

comp_23 = comp_22.drop(columns=drop_cols_23)
X_23 = comp_23.drop(columns='target')
y_23 = comp_23['target']

print(X_23.shape)

(6119, 163)


In [81]:
# Hold out 20% of the iteration-23 data as the test split (stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X_23, y_23,
    test_size=0.2, shuffle=True, stratify=y_23, random_state=0,
)
# Split a further 20% off the remaining training rows as the validation split used for tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [82]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    second ``predict_proba`` column on ``X_val``/``y_val``.
    """
    # The suggestions are requested in the same order as before.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise validation AUC, using a fixed sampler seed.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to prune — confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# 10 is the callback's early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [83]:
# Best trial of the search: keep its validation AUC in optuna_auc, then show params and score.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7972812794764563


In [84]:
# Refit on the training split with the best hyperparameters from the study.
optuna_23 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_23.fit(X_train, y_train)

In [85]:
# ROC AUC of the refit model on the held-out test split (second predict_proba column).
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_true=y_test, y_score=optuna_proba_23)
print(auc_23)

0.774043596206425


In [86]:
# bootstrap_auc() indexes its training inputs positionally (X_train[idx]), so hand it plain arrays.
# np.asarray keeps this cell re-runnable: it leaves existing ndarrays untouched,
# whereas `.values` raises AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [89]:
# Reset the global list that bootstrap_auc() appends its per-resample AUCs to.
auc_bootstrap = []

In [90]:
# bootstrap_auc() draws its resampling indices from the global `rs`.
rs = RandomState(seed=23)
# Pass an unfitted clone: bootstrap_auc() refits its classifier on every resample,
# which would otherwise replace the fitted optuna_23 with the last bootstrap fit.
bootstrap_auc(sklearn.base.clone(optuna_23), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75517698, 0.77552887])

In [91]:
# Snapshot this iteration's bootstrap AUCs. Copy instead of aliasing the global list,
# so a later bootstrap_auc() run that appends to auc_bootstrap cannot also grow t_23.
t_23 = list(auc_bootstrap)
print(t_23)

[0.7599218215994531, 0.7590407125768968, 0.7753786098769652, 0.7608750213602187, 0.7672857570061518, 0.7676248504784688, 0.7649815234107997, 0.7667544215652767, 0.7577537593984962, 0.7668905929596719, 0.7703562884483938, 0.7662524564251538, 0.7628935620300752, 0.763123184381408, 0.7706847018113465, 0.7664073180109364, 0.7744868207450444, 0.7581943139097744, 0.769098705570745, 0.7717954331852358, 0.7672350264866713, 0.7632193053656869, 0.7664767387218046, 0.766727721291866, 0.755273303998633, 0.7628321514012304, 0.7704417293233082, 0.7567978896103896, 0.7583224752221462, 0.770802183014354, 0.7663111970266576, 0.7671682758031443, 0.7688503930280245, 0.7660735645933014, 0.7731518070745045, 0.758146253417635, 0.759881771189337, 0.7637613209159262, 0.7652992566643881, 0.7622554254955571, 0.7568379400205059, 0.7545844369446343, 0.7610192028366372, 0.7663592575187971, 0.7652351760082023, 0.7600820232399179, 0.7678811731032126, 0.7674486286739577, 0.7612942156527683, 0.7677423316814764, 0.7619

In [92]:
# Unbind this iteration's splits, study and validation score before the next iteration rebuilds them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [60]:
# Iteration 24: feature removed from comp_23 in the next cell
# ('Cat_' names are matched there as a column-name prefix).
column_to_drop_23 = 'Cat_가족계획 시 중요 고려 사항 1순위'

In [61]:
# Build comp_24 by removing the selected feature from comp_23, then split into X/y.
if column_to_drop_23.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns. Match it literally
    # (not as a regex) so special characters in a name cannot change the selection.
    drop_cols_24 = [col for col in comp_23.columns if str(col).startswith(column_to_drop_23)]
else:
    drop_cols_24 = [column_to_drop_23]

comp_24 = comp_23.drop(columns=drop_cols_24)
X_24 = comp_24.drop(columns='target')
y_24 = comp_24['target']

print(X_24.shape)

(6119, 156)


In [95]:
# Hold out 20% of the iteration-24 data as the test split (stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X_24, y_24,
    test_size=0.2, shuffle=True, stratify=y_24, random_state=0,
)
# Split a further 20% off the remaining training rows as the validation split used for tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [96]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    second ``predict_proba`` column on ``X_val``/``y_val``.
    """
    # The suggestions are requested in the same order as before.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise validation AUC, using a fixed sampler seed.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to prune — confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# 10 is the callback's early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [97]:
# Best trial of the search: keep its validation AUC in optuna_auc, then show params and score.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.8038297481615038


In [98]:
# Refit on the training split with the best hyperparameters from the study.
optuna_24 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_24.fit(X_train, y_train)

In [99]:
# ROC AUC of the refit model on the held-out test split (second predict_proba column).
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_true=y_test, y_score=optuna_proba_24)
print(auc_24)

0.7781687884483937


In [100]:
# bootstrap_auc() indexes its training inputs positionally (X_train[idx]), so hand it plain arrays.
# np.asarray keeps this cell re-runnable: it leaves existing ndarrays untouched,
# whereas `.values` raises AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [101]:
# Reset the global list that bootstrap_auc() appends its per-resample AUCs to.
auc_bootstrap = []

In [102]:
# bootstrap_auc() draws its resampling indices from the global `rs`.
rs = RandomState(seed=24)
# Pass an unfitted clone: bootstrap_auc() refits its classifier on every resample,
# which would otherwise replace the fitted optuna_24 with the last bootstrap fit.
bootstrap_auc(sklearn.base.clone(optuna_24), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75497827, 0.77625758])

In [103]:
# Snapshot this iteration's bootstrap AUCs. Copy instead of aliasing the global list,
# so a later bootstrap_auc() run that appends to auc_bootstrap cannot also grow t_24.
t_24 = list(auc_bootstrap)
print(t_24)

[0.7735336209842789, 0.7758752349624061, 0.7812927204374572, 0.7600152725563911, 0.7634702879357484, 0.7606907894736842, 0.7667677717019823, 0.7678785030758715, 0.7604611671223513, 0.7520719412166781, 0.7686020804853041, 0.7666956809637732, 0.7630270633971292, 0.7632379955570745, 0.7649628332194122, 0.759248974709501, 0.7634515977443609, 0.7642392558099794, 0.7641030844155845, 0.7680867652084757, 0.765726461038961, 0.7630163832877648, 0.7658439422419686, 0.7727940234107997, 0.7652244958988381, 0.7663993079289132, 0.7691894865003417, 0.7641511449077237, 0.7677530117908407, 0.7728527640123035, 0.7699504442925496, 0.7724575999658236, 0.7680093344155845, 0.7662230861244019, 0.7757951341421736, 0.7582370343472318, 0.762156634483937, 0.7774906015037595, 0.7695446001367054, 0.7652351760082023, 0.7609524521531101, 0.7736217318865346, 0.7623381963431305, 0.7723027383800409, 0.7663939678742311, 0.7606293788448394, 0.7665648496240601, 0.7616600093984962, 0.7736884825700616, 0.7605359278879016, 0.

In [104]:
# Unbind this iteration's splits, study and validation score before the next iteration rebuilds them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [62]:
# Iteration 25: feature removed from comp_24 in the next cell
# ('Cat_' names are matched there as a column-name prefix).
column_to_drop_24 = 'Cat_현재 의료시설 접근용이성'

In [63]:
# Build comp_25 by removing the selected feature from comp_24, then split into X/y.
if column_to_drop_24.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns. Match it literally
    # (not as a regex) so special characters in a name cannot change the selection.
    drop_cols_25 = [col for col in comp_24.columns if str(col).startswith(column_to_drop_24)]
else:
    drop_cols_25 = [column_to_drop_24]

comp_25 = comp_24.drop(columns=drop_cols_25)
X_25 = comp_25.drop(columns='target')
y_25 = comp_25['target']

print(X_25.shape)

(6119, 152)


In [107]:
# Hold out 20% of the iteration-25 data as the test split (stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X_25, y_25,
    test_size=0.2, shuffle=True, stratify=y_25, random_state=0,
)
# Split a further 20% off the remaining training rows as the validation split used for tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [108]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    second ``predict_proba`` column on ``X_val``/``y_val``.
    """
    # The suggestions are requested in the same order as before.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise validation AUC, using a fixed sampler seed.
direction = "maximize"
sampler = TPESampler(seed=10)
# NOTE(review): objective() never reports intermediate values, so the Hyperband
# pruner presumably has nothing to prune — confirm.
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# 10 is the callback's early_stopping_rounds argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [109]:
# Best trial of the search: keep its validation AUC in optuna_auc, then show params and score.
optuna_auc = study.best_trial.value
print(study.best_trial.params)
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7989799580964783


In [110]:
# Refit on the training split with the best hyperparameters from the study.
optuna_25 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_25.fit(X_train, y_train)

In [111]:
# ROC AUC of the refit model on the held-out test split (second predict_proba column).
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_true=y_test, y_score=optuna_proba_25)
print(auc_25)

0.7766255126452495


In [112]:
# bootstrap_auc() indexes its training inputs positionally (X_train[idx]), so hand it plain arrays.
# np.asarray keeps this cell re-runnable: it leaves existing ndarrays untouched,
# whereas `.values` raises AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [113]:
# Reset the global list that bootstrap_auc() appends its per-resample AUCs to.
auc_bootstrap = []

In [114]:
# bootstrap_auc() draws its resampling indices from the global `rs`.
rs = RandomState(seed=25)
# Pass an unfitted clone: bootstrap_auc() refits its classifier on every resample,
# which would otherwise replace the fitted optuna_25 with the last bootstrap fit.
bootstrap_auc(sklearn.base.clone(optuna_25), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75505329, 0.77628402])

In [115]:
# Snapshot this iteration's bootstrap AUCs. Copy instead of aliasing the global list,
# so a later bootstrap_auc() run that appends to auc_bootstrap cannot also grow t_25.
t_25 = list(auc_bootstrap)
print(t_25)

[0.7635850991114149, 0.7671415755297334, 0.7667731117566644, 0.7648800623718387, 0.7654060577580314, 0.7584559765892002, 0.7590380425495558, 0.7707114020847574, 0.7690239448051948, 0.7755842019822283, 0.7639802631578947, 0.7682816772043746, 0.7673071172248804, 0.7682442968215994, 0.7695846505468217, 0.7655048487696514, 0.7678731630211894, 0.7696540712576897, 0.7657398111756664, 0.7589846420027342, 0.7643487269309637, 0.7651363849965824, 0.7678811731032127, 0.7641004143882434, 0.7729088345864662, 0.7725644010594669, 0.7677156314080655, 0.7732933185235816, 0.7656143198906357, 0.762450337491456, 0.7674245984278879, 0.761582578605605, 0.7695899906015037, 0.7753252093301436, 0.7580020719412166, 0.7672857570061518, 0.772268028024607, 0.7690292848598769, 0.7750715567327409, 0.7627360304169515, 0.7679212235133288, 0.7647065105946684, 0.7692402170198224, 0.7679158834586466, 0.765224495898838, 0.7587069591592619, 0.7656143198906357, 0.7606987995557073, 0.7652458561175667, 0.764690490430622, 0.76

In [116]:
# Unbind this iteration's splits, study and validation score before the next iteration rebuilds them.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [64]:
# Iteration 26: feature removed from comp_25 in the next cell
# ('Cat_' names are matched there as a column-name prefix).
column_to_drop_25 = 'Cat_이사 예상 기간'

In [65]:
# Build comp_26 by removing the selected feature from comp_25, then split into X/y.
if column_to_drop_25.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns. Match it literally
    # (not as a regex) so special characters in a name cannot change the selection.
    drop_cols_26 = [col for col in comp_25.columns if str(col).startswith(column_to_drop_25)]
else:
    drop_cols_26 = [column_to_drop_25]

comp_26 = comp_25.drop(columns=drop_cols_26)
X_26 = comp_26.drop(columns='target')
y_26 = comp_26['target']

print(X_26.shape)

(6119, 148)


In [119]:
# Hold out 20% of the iteration-26 data as the test split (stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X_26, y_26,
    test_size=0.2, shuffle=True, stratify=y_26, random_state=0,
)
# Split a further 20% off the remaining training rows as the validation split used for tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, shuffle=True, stratify=y_train, random_state=0,
)

In [120]:
def objective(trial):
    """Return the validation ROC AUC for one Optuna trial.

    Draws a hyperparameter set from ``trial``, fits a GradientBoostingClassifier
    on the notebook-level X_train/y_train and scores column 1 of its
    predict_proba output against y_val.
    """
    # Parameters are suggested in a fixed order.
    hp = {}
    hp['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    hp['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    hp['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    hp['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    hp['min_samples_split'] = trial.suggest_int('min_samples_split', 3, 10)
    hp['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(random_state=0, **hp)
    booster.fit(X_train, y_train)
    val_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise objective() with a seeded TPE sampler,
# for at most 200 trials. EarlyStoppingCallback(10, ...) presumably ends the study after
# 10 trials without improvement - confirm against its definition.
# NOTE(review): objective() never calls trial.report()/should_prune(), so the pruner
# cannot stop a trial early - confirm it is still wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [121]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7982662626566165


In [122]:
optuna_26 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [124]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(auc_26)

0.7756295924470266


In [125]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it needs
# NumPy arrays. np.asarray leaves an existing array untouched, which keeps this cell
# safe to re-run (`.values` raises AttributeError once the names already hold arrays).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [126]:
auc_bootstrap = []

In [127]:
# bootstrap_auc draws its resampling indices from the global `rs`; seed it for this step.
rs = RandomState(seed=26)
# bootstrap_auc refits the estimator it receives on every resample, so pass an unfitted
# clone (same hyperparameters and random_state); otherwise optuna_26 would be left
# holding the fit from the last bootstrap draw instead of the fit on X_train.
bootstrap_auc(sklearn.base.clone(optuna_26), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75554685, 0.77636665])

In [128]:
t_26 = auc_bootstrap
print(t_26)

[0.7611046437115516, 0.7554361756664387, 0.767427268455229, 0.7643380468215994, 0.7571316430280246, 0.7664740686944633, 0.756608317669173, 0.766393967874231, 0.7622127050580998, 0.7708288832877649, 0.7673872180451127, 0.7596921992481204, 0.7610031826725906, 0.7595987482911825, 0.7686901913875598, 0.7585681177375257, 0.777346420027341, 0.7611607142857144, 0.7679479237867396, 0.7565202067669173, 0.765622329972659, 0.7654514482228298, 0.7594545668147642, 0.7658866626794261, 0.7751276273069037, 0.757711038961039, 0.7710077751196174, 0.766992053998633, 0.7688183526999317, 0.7645703392002734, 0.769200166609706, 0.7718942241968558, 0.7673177973342447, 0.7635850991114149, 0.7671015251196173, 0.7572170839029392, 0.7665862098427887, 0.765558249316473, 0.766562179596719, 0.765622329972659, 0.7648053016062885, 0.7628214712918662, 0.7650215738209158, 0.765558249316473, 0.7729782552973343, 0.7767483339029391, 0.7775333219412166, 0.7670160842447027, 0.7629256023581682, 0.761451747265892, 0.7673605177

In [129]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [66]:
# 27
column_to_drop_26 = '소득 중 근로/사업소득의 비중(월평균)'

In [67]:
# Remove the feature chosen for this step. A plain column is dropped by exact name;
# a 'Cat_' name is treated as a prefix and removes every column starting with it.
if column_to_drop_26.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex=...), characters such as '(' or '/'
    # inside a column name are never interpreted as regex syntax.
    drop_cols_27 = [c for c in comp_26.columns if str(c).startswith(column_to_drop_26)]
else:
    drop_cols_27 = [column_to_drop_26]

comp_27 = comp_26.drop(columns=drop_cols_27)
X_27 = comp_27.drop(columns='target')
y_27 = comp_27['target']

print(X_27.shape)

(6119, 147)


In [132]:
# Hold out 20% as the test set, then take 20% of the remainder for validation
# (64/16/20 overall). Both splits are stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=0
)

In [133]:
def objective(trial):
    """Return the validation ROC AUC for one Optuna trial.

    Draws a hyperparameter set from ``trial``, fits a GradientBoostingClassifier
    on the notebook-level X_train/y_train and scores column 1 of its
    predict_proba output against y_val.
    """
    # Parameters are suggested in a fixed order.
    hp = {}
    hp['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    hp['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    hp['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    hp['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    hp['min_samples_split'] = trial.suggest_int('min_samples_split', 3, 10)
    hp['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(random_state=0, **hp)
    booster.fit(X_train, y_train)
    val_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise objective() with a seeded TPE sampler,
# for at most 200 trials. EarlyStoppingCallback(10, ...) presumably ends the study after
# 10 trials without improvement - confirm against its definition.
# NOTE(review): objective() never calls trial.report()/should_prune(), so the pruner
# cannot stop a trial early - confirm it is still wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [134]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 146, 'subsample': 0.7000000000000001, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8003823070309435


In [135]:
optuna_27 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [136]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(auc_27)

0.7792581596035543


In [137]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it needs
# NumPy arrays. np.asarray leaves an existing array untouched, which keeps this cell
# safe to re-run (`.values` raises AttributeError once the names already hold arrays).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [138]:
auc_bootstrap = []

In [139]:
# bootstrap_auc draws its resampling indices from the global `rs`; seed it for this step.
rs = RandomState(seed=27)
# bootstrap_auc refits the estimator it receives on every resample, so pass an unfitted
# clone (same hyperparameters and random_state); otherwise optuna_27 would be left
# holding the fit from the last bootstrap draw instead of the fit on X_train.
bootstrap_auc(sklearn.base.clone(optuna_27), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76055061, 0.77964541])

In [140]:
t_27 = auc_bootstrap
print(t_27)

[0.7687756322624744, 0.7705939208817498, 0.7614010167464114, 0.769734172077922, 0.7688276977956254, 0.7750528665413534, 0.7724442498291183, 0.7719502947710185, 0.7764599709501026, 0.7692428870471633, 0.7675313995215312, 0.7732265678400545, 0.7751997180451129, 0.7660922547846889, 0.7680760850991114, 0.7711572966507176, 0.7727646531100479, 0.7720597658920028, 0.777114127648667, 0.7756215823650034, 0.775277148838004, 0.7826984898325359, 0.7713255083732058, 0.7673418275803144, 0.7660735645933013, 0.7727219326725905, 0.7639028323650036, 0.7697288320232399, 0.7805504528366369, 0.772268028024607, 0.7689011235475051, 0.7800778579972658, 0.7750688867053999, 0.7744147300068353, 0.7635103383458647, 0.7747111030416952, 0.7731758373205742, 0.7762063183526999, 0.78364768455229, 0.7624049470266575, 0.7600259526657552, 0.770762132604238, 0.7765614319890636, 0.7702521573820915, 0.7680306946343131, 0.7687809723171566, 0.7669359834244702, 0.7687195616883118, 0.7686608210868079, 0.7732479280587834, 0.7639

In [141]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [68]:
# 28
column_to_drop_27  = 'Cat_현재 교육환경'

In [69]:
# Remove the feature chosen for this step. A plain column is dropped by exact name;
# a 'Cat_' name is treated as a prefix and removes every column starting with it.
if column_to_drop_27.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex=...), characters such as '(' or '/'
    # inside a column name are never interpreted as regex syntax.
    drop_cols_28 = [c for c in comp_27.columns if str(c).startswith(column_to_drop_27)]
else:
    drop_cols_28 = [column_to_drop_27]

comp_28 = comp_27.drop(columns=drop_cols_28)
X_28 = comp_28.drop(columns='target')
y_28 = comp_28['target']

print(X_28.shape)

(6119, 143)


In [144]:
# Hold out 20% as the test set, then take 20% of the remainder for validation
# (64/16/20 overall). Both splits are stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=0
)

In [145]:
def objective(trial):
    """Return the validation ROC AUC for one Optuna trial.

    Draws a hyperparameter set from ``trial``, fits a GradientBoostingClassifier
    on the notebook-level X_train/y_train and scores column 1 of its
    predict_proba output against y_val.
    """
    # Parameters are suggested in a fixed order.
    hp = {}
    hp['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    hp['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    hp['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    hp['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    hp['min_samples_split'] = trial.suggest_int('min_samples_split', 3, 10)
    hp['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(random_state=0, **hp)
    booster.fit(X_train, y_train)
    val_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise objective() with a seeded TPE sampler,
# for at most 200 trials. EarlyStoppingCallback(10, ...) presumably ends the study after
# 10 trials without improvement - confirm against its definition.
# NOTE(review): objective() never calls trial.report()/should_prune(), so the pruner
# cannot stop a trial early - confirm it is still wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [146]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7986711074382925


In [147]:
optuna_28 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [148]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(auc_28)

0.7756936731032125


In [149]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it needs
# NumPy arrays. np.asarray leaves an existing array untouched, which keeps this cell
# safe to re-run (`.values` raises AttributeError once the names already hold arrays).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [150]:
auc_bootstrap = []

In [151]:
# bootstrap_auc draws its resampling indices from the global `rs`; seed it for this step.
rs = RandomState(seed=28)
# bootstrap_auc refits the estimator it receives on every resample, so pass an unfitted
# clone (same hyperparameters and random_state); otherwise optuna_28 would be left
# holding the fit from the last bootstrap draw instead of the fit on X_train.
bootstrap_auc(sklearn.base.clone(optuna_28), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75479123, 0.77592723])

In [152]:
t_28 = auc_bootstrap
print(t_28)

[0.7677797120642517, 0.7663085269993165, 0.7745856117566643, 0.764725200786056, 0.7619537124060151, 0.759182224025974, 0.7692508971291866, 0.7740195659603555, 0.7618949718045112, 0.7712427375256321, 0.7619804126794258, 0.7570195018796991, 0.7635717489747096, 0.7689465140123035, 0.7671389055023923, 0.7769993164730007, 0.7620231331168832, 0.7707220821941216, 0.7603063055365686, 0.7718808740601504, 0.7654995087149692, 0.7516420668147643, 0.7670240943267259, 0.7636304895762132, 0.7742011278195489, 0.7668986030416951, 0.7696460611756665, 0.7652218258714969, 0.7684979494190021, 0.7691681262816131, 0.7605759782980177, 0.7612595052973342, 0.7676435406698565, 0.7653286269651401, 0.7633074162679425, 0.7715551307245386, 0.7706126110731373, 0.7626425794600138, 0.7674032382091593, 0.7613315960355433, 0.7620898838004102, 0.7571289730006834, 0.7609284219070404, 0.7661136150034176, 0.7626399094326727, 0.7528996496924127, 0.7733707493164731, 0.7676088303144224, 0.7648960825358853, 0.7676195104237868, 0

In [153]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [70]:
# 29
column_to_drop_28 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [71]:
# Remove the feature chosen for this step. A plain column is dropped by exact name;
# a 'Cat_' name is treated as a prefix and removes every column starting with it.
if column_to_drop_28.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex=...), characters such as '(' or '/'
    # inside a column name are never interpreted as regex syntax.
    drop_cols_29 = [c for c in comp_28.columns if str(c).startswith(column_to_drop_28)]
else:
    drop_cols_29 = [column_to_drop_28]

comp_29 = comp_28.drop(columns=drop_cols_29)
X_29 = comp_29.drop(columns='target')
y_29 = comp_29['target']

print(X_29.shape)

(6119, 139)


In [156]:
# Hold out 20% as the test set, then take 20% of the remainder for validation
# (64/16/20 overall). Both splits are stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=0
)

In [157]:
def objective(trial):
    """Return the validation ROC AUC for one Optuna trial.

    Draws a hyperparameter set from ``trial``, fits a GradientBoostingClassifier
    on the notebook-level X_train/y_train and scores column 1 of its
    predict_proba output against y_val.
    """
    # Parameters are suggested in a fixed order.
    hp = {}
    hp['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    hp['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    hp['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    hp['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    hp['min_samples_split'] = trial.suggest_int('min_samples_split', 3, 10)
    hp['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(random_state=0, **hp)
    booster.fit(X_train, y_train)
    val_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise objective() with a seeded TPE sampler,
# for at most 200 trials. EarlyStoppingCallback(10, ...) presumably ends the study after
# 10 trials without improvement - confirm against its definition.
# NOTE(review): objective() never calls trial.report()/should_prune(), so the pruner
# cannot stop a trial early - confirm it is still wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [158]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 146, 'subsample': 0.6, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 8}
0.8001652768387049


In [159]:
optuna_29 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [160]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(auc_29)

0.7789724666780589


In [161]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it needs
# NumPy arrays. np.asarray leaves an existing array untouched, which keeps this cell
# safe to re-run (`.values` raises AttributeError once the names already hold arrays).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [162]:
auc_bootstrap = []

In [163]:
# bootstrap_auc draws its resampling indices from the global `rs`; seed it for this step.
rs = RandomState(seed=29)
# bootstrap_auc refits the estimator it receives on every resample, so pass an unfitted
# clone (same hyperparameters and random_state); otherwise optuna_29 would be left
# holding the fit from the last bootstrap draw instead of the fit on X_train.
bootstrap_auc(sklearn.base.clone(optuna_29), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75551895, 0.77710124])

In [164]:
t_29 = auc_bootstrap
print(t_29)

[0.769063995215311, 0.7724709501025291, 0.773173167293233, 0.7667384014012304, 0.7643166866028708, 0.7623034859876967, 0.7640870642515379, 0.772866114149009, 0.760645399008886, 0.7697234919685578, 0.7659721035543403, 0.7673284774436091, 0.7603703861927545, 0.7702014268626112, 0.7719796650717703, 0.7758058142515379, 0.7729408749145592, 0.7753385594668489, 0.7639375427204376, 0.7610405630553657, 0.7646103896103896, 0.7652138157894737, 0.7620738636363636, 0.7677850521189337, 0.7620658535543403, 0.7713602187286397, 0.7646050495557075, 0.7590033321941216, 0.7594091763499659, 0.7693790584415585, 0.7611767344497609, 0.7674245984278879, 0.7548007091592618, 0.765256536226931, 0.7627600606630213, 0.7677209714627478, 0.7580394523239917, 0.7743586594326726, 0.7654834885509227, 0.767659560833903, 0.7658733125427204, 0.7627333603896104, 0.762989683014354, 0.7551878631237183, 0.7667250512645248, 0.7603543660287081, 0.7706126110731374, 0.7581462534176351, 0.7678998632946, 0.7756723128844839, 0.7673177

In [165]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [72]:
# 30: feature to eliminate when building comp_30
column_to_drop_29 = '현재 주택 거주 기간(총 개월)'

In [73]:
# Remove the feature chosen for this step. A plain column is dropped by exact name;
# a 'Cat_' name is treated as a prefix and removes every column starting with it.
if column_to_drop_29.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex=...), characters such as '(' or '/'
    # inside a column name are never interpreted as regex syntax.
    drop_cols_30 = [c for c in comp_29.columns if str(c).startswith(column_to_drop_29)]
else:
    drop_cols_30 = [column_to_drop_29]

comp_30 = comp_29.drop(columns=drop_cols_30)
X_30 = comp_30.drop(columns='target')
y_30 = comp_30['target']

print(X_30.shape)

(6119, 138)


In [168]:
# Hold out 20% as the test set, then take 20% of the remainder for validation
# (64/16/20 overall). Both splits are stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=0
)

In [169]:
def objective(trial):
    """Return the validation ROC AUC for one Optuna trial.

    Draws a hyperparameter set from ``trial``, fits a GradientBoostingClassifier
    on the notebook-level X_train/y_train and scores column 1 of its
    predict_proba output against y_val.
    """
    # Parameters are suggested in a fixed order.
    hp = {}
    hp['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    hp['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    hp['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    hp['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    hp['min_samples_split'] = trial.suggest_int('min_samples_split', 3, 10)
    hp['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(random_state=0, **hp)
    booster.fit(X_train, y_train)
    val_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise objective() with a seeded TPE sampler,
# for at most 200 trials. EarlyStoppingCallback(10, ...) presumably ends the study after
# 10 trials without improvement - confirm against its definition.
# NOTE(review): objective() never calls trial.report()/should_prune(), so the pruner
# cannot stop a trial early - confirm it is still wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [170]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.060000000000000005, 'n_estimators': 141, 'subsample': 0.7000000000000001, 'max_depth': 3, 'min_samples_split': 4, 'min_samples_leaf': 7}
0.8018305661983824


In [171]:
optuna_30 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [172]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(auc_30)

0.778280929596719


In [173]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it needs
# NumPy arrays. np.asarray leaves an existing array untouched, which keeps this cell
# safe to re-run (`.values` raises AttributeError once the names already hold arrays).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [174]:
auc_bootstrap = []

In [175]:
# bootstrap_auc draws its resampling indices from the global `rs`; seed it for this step.
rs = RandomState(seed=30)
# bootstrap_auc refits the estimator it receives on every resample, so pass an unfitted
# clone (same hyperparameters and random_state); otherwise optuna_30 would be left
# holding the fit from the last bootstrap draw instead of the fit on X_train.
bootstrap_auc(sklearn.base.clone(optuna_30), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75897837, 0.77848419])

In [176]:
t_30 = auc_bootstrap
print(t_30)

[0.7620952238550924, 0.7688210227272727, 0.7638173914900889, 0.7755868720095694, 0.7683030374231031, 0.7660041438824332, 0.7682336167122351, 0.7564748163021189, 0.7676355305878331, 0.7712480775803143, 0.7710958860218728, 0.7737552332535885, 0.7724202195830486, 0.7563386449077238, 0.7709650546821599, 0.7681001153451812, 0.762850841592618, 0.7694030886876282, 0.7698516532809296, 0.7711813268967873, 0.7681935663021189, 0.7642232356459331, 0.7642365857826384, 0.769897043745728, 0.7647999615516062, 0.7733333689336979, 0.7696460611756665, 0.7709330143540671, 0.7696033407382092, 0.7720570958646618, 0.7808428208304854, 0.768028024606972, 0.765659710355434, 0.7700599154135339, 0.7626639396787424, 0.7686501409774436, 0.7633848470608339, 0.7648987525632263, 0.7691948265550238, 0.7625277682843473, 0.7729622351332877, 0.7733333689336978, 0.7757123632946001, 0.7767696941216677, 0.7701827366712235, 0.7654354280587833, 0.7704070189678742, 0.7758779049897471, 0.7680734150717703, 0.7675100393028025, 0.7

In [177]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [74]:
# 31
column_to_drop_30 = 'Cat_현재 상업시설 접근용이성'

In [75]:
# Remove the feature chosen for this step. A plain column is dropped by exact name;
# a 'Cat_' name is treated as a prefix and removes every column starting with it.
if column_to_drop_30.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex=...), characters such as '(' or '/'
    # inside a column name are never interpreted as regex syntax.
    drop_cols_31 = [c for c in comp_30.columns if str(c).startswith(column_to_drop_30)]
else:
    drop_cols_31 = [column_to_drop_30]

comp_31 = comp_30.drop(columns=drop_cols_31)
X_31 = comp_31.drop(columns='target')
y_31 = comp_31['target']

print(X_31.shape)

(6119, 134)


In [180]:
# Hold out 20% as the test set, then take 20% of the remainder for validation
# (64/16/20 overall). Both splits are stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=0
)

In [181]:
def objective(trial):
    """Return the validation ROC AUC for one Optuna trial.

    Draws a hyperparameter set from ``trial``, fits a GradientBoostingClassifier
    on the notebook-level X_train/y_train and scores column 1 of its
    predict_proba output against y_val.
    """
    # Parameters are suggested in a fixed order.
    hp = {}
    hp['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    hp['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    hp['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    hp['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    hp['min_samples_split'] = trial.suggest_int('min_samples_split', 3, 10)
    hp['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(random_state=0, **hp)
    booster.fit(X_train, y_train)
    val_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise objective() with a seeded TPE sampler,
# for at most 200 trials. EarlyStoppingCallback(10, ...) presumably ends the study after
# 10 trials without improvement - confirm against its definition.
# NOTE(review): objective() never calls trial.report()/should_prune(), so the pruner
# cannot stop a trial early - confirm it is still wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [182]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.8025985191863036


In [183]:
optuna_31= GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [184]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(auc_31)

0.7748873248462064


In [185]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it needs
# NumPy arrays. np.asarray leaves an existing array untouched, which keeps this cell
# safe to re-run (`.values` raises AttributeError once the names already hold arrays).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [186]:
auc_bootstrap = []

In [187]:
# bootstrap_auc draws its resampling indices from the global `rs`; seed it for this step.
rs = RandomState(seed=31)
# bootstrap_auc refits the estimator it receives on every resample, so pass an unfitted
# clone (same hyperparameters and random_state); otherwise optuna_31 would be left
# holding the fit from the last bootstrap draw instead of the fit on X_train.
bootstrap_auc(sklearn.base.clone(optuna_31), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75273765, 0.77575962])

In [188]:
t_31 = auc_bootstrap
print(t_31)

[0.7585974880382775, 0.7581409133629529, 0.7555536568694463, 0.7558153195488722, 0.7635316985645932, 0.7601060534859877, 0.7719823350991115, 0.7586268583390294, 0.7654354280587834, 0.7632139653110048, 0.7733440490430623, 0.7608056006493507, 0.7676248504784687, 0.771469689849624, 0.7553534048188654, 0.7731624871838687, 0.7611073137388926, 0.7613903366370471, 0.777341079972659, 0.7657745215311005, 0.7606667592276144, 0.7570221719070404, 0.7587737098427888, 0.7686688311688312, 0.7667757817840054, 0.7695926606288448, 0.7560235816814764, 0.7653179468557758, 0.7652458561175667, 0.7591047932330827, 0.7665034389952153, 0.7596921992481203, 0.7783183099794941, 0.7720384056732741, 0.7672697368421051, 0.760850991114149, 0.7612061047505125, 0.7572064037935748, 0.7676221804511278, 0.7655262089883801, 0.7603036355092275, 0.7708502435064934, 0.7636011192754614, 0.7583651956596036, 0.7559808612440192, 0.757446706254272, 0.7594065063226246, 0.7640496838687628, 0.7602555750170882, 0.7557939593301435, 0.7

In [189]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [76]:
# 32
column_to_drop_31 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [77]:
# Remove the feature chosen for this step. A plain column is dropped by exact name;
# a 'Cat_' name is treated as a prefix and removes every column starting with it.
if column_to_drop_31.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex=...), characters such as '(' or '/'
    # inside a column name are never interpreted as regex syntax.
    drop_cols_32 = [c for c in comp_31.columns if str(c).startswith(column_to_drop_31)]
else:
    drop_cols_32 = [column_to_drop_31]

comp_32 = comp_31.drop(columns=drop_cols_32)
X_32 = comp_32.drop(columns='target')
y_32 = comp_32['target']

print(X_32.shape)

(6119, 130)


In [192]:
# Hold out 20% as the test set, then take 20% of the remainder for validation
# (64/16/20 overall). Both splits are stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=0
)

In [193]:
def objective(trial):
    """Return the validation ROC AUC for one Optuna trial.

    Draws a hyperparameter set from ``trial``, fits a GradientBoostingClassifier
    on the notebook-level X_train/y_train and scores column 1 of its
    predict_proba output against y_val.
    """
    # Parameters are suggested in a fixed order.
    hp = {}
    hp['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    hp['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    hp['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    hp['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    hp['min_samples_split'] = trial.suggest_int('min_samples_split', 3, 10)
    hp['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(random_state=0, **hp)
    booster.fit(X_train, y_train)
    val_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise objective() with a seeded TPE sampler,
# for at most 200 trials. EarlyStoppingCallback(10, ...) presumably ends the study after
# 10 trials without improvement - confirm against its definition.
# NOTE(review): objective() never calls trial.report()/should_prune(), so the pruner
# cannot stop a trial early - confirm it is still wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [194]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7986544128081202


In [195]:
optuna_32 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [196]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(auc_32)

0.7792955399863293


In [197]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it needs
# NumPy arrays. np.asarray leaves an existing array untouched, which keeps this cell
# safe to re-run (`.values` raises AttributeError once the names already hold arrays).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [198]:
auc_bootstrap = []

In [199]:
# bootstrap_auc draws its resampling indices from the global `rs`; seed it for this step.
rs = RandomState(seed=32)
# bootstrap_auc refits the estimator it receives on every resample, so pass an unfitted
# clone (same hyperparameters and random_state); otherwise optuna_32 would be left
# holding the fit from the last bootstrap draw instead of the fit on X_train.
bootstrap_auc(sklearn.base.clone(optuna_32), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75664536, 0.77826264])

In [200]:
t_32 = auc_bootstrap
print(t_32)

[0.7708635936431989, 0.7746630425495558, 0.7724843002392344, 0.7602181946343131, 0.7646904904306219, 0.7595560278537252, 0.7699584543745728, 0.7668986030416952, 0.7649601631920712, 0.7802968002392343, 0.7611713943950786, 0.7729088345864661, 0.7671335654477102, 0.7691601161995899, 0.7645943694463431, 0.7730770463089541, 0.7672136662679426, 0.7640843942241969, 0.7650669642857142, 0.7590380425495556, 0.7677476717361587, 0.772535030758715, 0.7725403708133972, 0.781479622351333, 0.7620471633629529, 0.7764065704032809, 0.7715578007518796, 0.7719476247436774, 0.7673444976076556, 0.7602315447710184, 0.7694965396445659, 0.7823073308270677, 0.7681134654818866, 0.7681134654818866, 0.7573025247778536, 0.7588217703349283, 0.7585093771360218, 0.7592062542720438, 0.7671549256664388, 0.7651737653793576, 0.7733974495898839, 0.7647786013328777, 0.7703856587491456, 0.7709463644907724, 0.7670721548188654, 0.7683083774777854, 0.7725750811688312, 0.7666636406356802, 0.7697742224880383, 0.7778403750854408, 0

In [201]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [78]:
# 33: feature to eliminate when building comp_33
column_to_drop_32 = 'Cat_현재 문화시설 접근용이성'

In [79]:
# Remove the feature chosen for this step. A plain column is dropped by exact name;
# a 'Cat_' name is treated as a prefix and removes every column starting with it.
if column_to_drop_32.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex=...), characters such as '(' or '/'
    # inside a column name are never interpreted as regex syntax.
    drop_cols_33 = [c for c in comp_32.columns if str(c).startswith(column_to_drop_32)]
else:
    drop_cols_33 = [column_to_drop_32]

comp_33 = comp_32.drop(columns=drop_cols_33)
X_33 = comp_33.drop(columns='target')
y_33 = comp_33['target']

print(X_33.shape)

(6119, 126)


In [204]:
# Hold out 20% as the test set, then take 20% of the remainder for validation
# (64/16/20 overall). Both splits are stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=0
)

In [205]:
def objective(trial):
    """Return the validation ROC AUC for one Optuna trial.

    Draws a hyperparameter set from ``trial``, fits a GradientBoostingClassifier
    on the notebook-level X_train/y_train and scores column 1 of its
    predict_proba output against y_val.
    """
    # Parameters are suggested in a fixed order.
    hp = {}
    hp['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    hp['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    hp['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    hp['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    hp['min_samples_split'] = trial.suggest_int('min_samples_split', 3, 10)
    hp['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(random_state=0, **hp)
    booster.fit(X_train, y_train)
    val_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise objective() with a seeded TPE sampler,
# for at most 200 trials. EarlyStoppingCallback(10, ...) presumably ends the study after
# 10 trials without improvement - confirm against its definition.
# NOTE(review): objective() never calls trial.report()/should_prune(), so the pruner
# cannot stop a trial early - confirm it is still wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [206]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.060000000000000005, 'n_estimators': 147, 'subsample': 0.7000000000000001, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8012838170602427


In [207]:
optuna_33 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [208]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(auc_33)

0.7783049598427888


In [209]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it needs
# NumPy arrays. np.asarray leaves an existing array untouched, which keeps this cell
# safe to re-run (`.values` raises AttributeError once the names already hold arrays).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [210]:
auc_bootstrap = []

In [211]:
# bootstrap_auc draws its resampling indices from the global `rs`; seed it for this step.
rs = RandomState(seed=33)
# bootstrap_auc refits the estimator it receives on every resample, so pass an unfitted
# clone (same hyperparameters and random_state); otherwise optuna_33 would be left
# holding the fit from the last bootstrap draw instead of the fit on X_train.
bootstrap_auc(sklearn.base.clone(optuna_33), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75979847, 0.77905777])

In [212]:
t_33 = auc_bootstrap
print(t_33)

[0.7745242011278196, 0.7666342703349283, 0.7713762388926864, 0.7734041246582365, 0.764324696684894, 0.7744174000341764, 0.7679185534859877, 0.772198607313739, 0.7652618762816131, 0.762220715140123, 0.7676248504784688, 0.7696620813397129, 0.7669787038619275, 0.780280780075188, 0.7694271189336979, 0.7745749316473001, 0.7668478725222148, 0.7747484834244704, 0.7604825273410799, 0.7720303955912509, 0.7679425837320574, 0.7643727571770333, 0.769133415926179, 0.7748846548188653, 0.768895783492823, 0.7693069677033493, 0.7668371924128503, 0.7687115516062886, 0.7698730134996583, 0.7728954844497608, 0.7648106416609706, 0.7695873205741627, 0.7724495898838004, 0.7664046479835953, 0.7699918297163363, 0.7674886790840738, 0.7654541182501708, 0.7679906442241968, 0.7729248547505125, 0.7721051563568011, 0.7746523624401914, 0.7663752776828434, 0.7773197197539302, 0.7647305408407382, 0.7671362354750513, 0.7648693822624744, 0.7686234407040329, 0.7582316942925496, 0.7691147257347914, 0.7644742182159945, 0.772

In [213]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [80]:
# 34
column_to_drop_33 = '자산 중 금융자산의 비중'

In [81]:
# Remove the feature chosen for this step. A plain column is dropped by exact name;
# a 'Cat_' name is treated as a prefix and removes every column starting with it.
if column_to_drop_33.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex=...), characters such as '(' or '/'
    # inside a column name are never interpreted as regex syntax.
    drop_cols_34 = [c for c in comp_33.columns if str(c).startswith(column_to_drop_33)]
else:
    drop_cols_34 = [column_to_drop_33]

comp_34 = comp_33.drop(columns=drop_cols_34)
X_34 = comp_34.drop(columns='target')
y_34 = comp_34['target']

print(X_34.shape)

(6119, 125)


In [216]:
# Hold out 20% as the test set, then take 20% of the remainder for validation
# (64/16/20 overall). Both splits are stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=0
)

In [217]:
def objective(trial):
    """Return the validation ROC AUC for one Optuna trial.

    Draws a hyperparameter set from ``trial``, fits a GradientBoostingClassifier
    on the notebook-level X_train/y_train and scores column 1 of its
    predict_proba output against y_val.
    """
    # Parameters are suggested in a fixed order.
    hp = {}
    hp['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    hp['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    hp['subsample'] = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    hp['max_depth'] = trial.suggest_int('max_depth', 1, 10)
    hp['min_samples_split'] = trial.suggest_int('min_samples_split', 3, 10)
    hp['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(random_state=0, **hp)
    booster.fit(X_train, y_train)
    val_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search for the hyperparameters that maximise objective() with a seeded TPE sampler,
# for at most 200 trials. EarlyStoppingCallback(10, ...) presumably ends the study after
# 10 trials without improvement - confirm against its definition.
# NOTE(review): objective() never calls trial.report()/should_prune(), so the pruner
# cannot stop a trial early - confirm it is still wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [218]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7995559228374194


In [219]:
optuna_34 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [220]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(auc_34)

0.7757390635680109


In [221]:
# bootstrap_auc indexes the training data positionally (X_train[idx]), so it needs
# NumPy arrays. np.asarray leaves an existing array untouched, which keeps this cell
# safe to re-run (`.values` raises AttributeError once the names already hold arrays).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [222]:
auc_bootstrap = []

In [223]:
# bootstrap_auc draws its resampling indices from the global `rs`; seed it for this step.
rs = RandomState(seed=34)
# bootstrap_auc refits the estimator it receives on every resample, so pass an unfitted
# clone (same hyperparameters and random_state); otherwise optuna_34 would be left
# holding the fit from the last bootstrap draw instead of the fit on X_train.
bootstrap_auc(sklearn.base.clone(optuna_34), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75559224, 0.77766562])

In [224]:
t_34 = auc_bootstrap
print(t_34)

[0.764220565618592, 0.7571663533834587, 0.766495428913192, 0.7731170967190705, 0.760015272556391, 0.7741156869446344, 0.7721051563568011, 0.7741236970266575, 0.7677903921736159, 0.7596921992481203, 0.7648266618250171, 0.7675901401230348, 0.7581035329801779, 0.7667250512645248, 0.7727352828092959, 0.7733840994531783, 0.7594438867054, 0.7699451042378673, 0.763590439166097, 0.7761956382433356, 0.7744147300068353, 0.7646584501025291, 0.7719930152084757, 0.7667170411825016, 0.759112803315106, 0.7651951255980862, 0.7602235346889952, 0.7589873120300752, 0.7721959372863978, 0.7679319036226931, 0.7743586594326726, 0.7733760893711551, 0.7644635381066301, 0.7745295411825017, 0.7651791054340396, 0.767496689166097, 0.7650189037935748, 0.7708956339712919, 0.7653740174299386, 0.7699771445659603, 0.757278494531784, 0.7559701811346549, 0.7633314465140123, 0.7639695830485304, 0.7700412252221464, 0.7594999572795624, 0.7666316003075871, 0.7636171394395078, 0.7802140293916608, 0.768057394907724, 0.77731971

In [225]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [82]:
# 35
column_to_drop_34 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [83]:
# Remove the feature chosen for this step. A plain column is dropped by exact name;
# a 'Cat_' name is treated as a prefix and removes every column starting with it.
if column_to_drop_34.startswith('Cat_'):
    # Literal prefix match: unlike filter(regex=...), characters such as '(' or '/'
    # inside a column name are never interpreted as regex syntax.
    drop_cols_35 = [c for c in comp_34.columns if str(c).startswith(column_to_drop_34)]
else:
    drop_cols_35 = [column_to_drop_34]

comp_35 = comp_34.drop(columns=drop_cols_35)
X_35 = comp_35.drop(columns='target')
y_35 = comp_35['target']

print(X_35.shape)

(6119, 121)


In [228]:
# Hold out 20% as the test set, then take 20% of the remainder for validation
# (64/16/20 overall). Both splits are stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=0
)

In [229]:
def objective(trial):
    """Score one Optuna trial: fit on the training split, return validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the data comes from the
    notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val``.
    """
    # One suggestion per tuned hyperparameter.
    sampled_params = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled_params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score handed to the AUC metric.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns a validation AUC, so the study maximizes it.
direction = "maximize"
# TPE sampler seeded with 10.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() does not call trial.report()/trial.should_prune(),
# so this pruner likely has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; the callback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [230]:
# Best hyperparameters found and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.8027571181729398


In [231]:
# Fit a fresh model with the best hyperparameters on the training split.
optuna_35 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [232]:
# ROC AUC on the held-out test split, scored with column 1 of predict_proba.
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(auc_35)

0.7755628417634997


In [233]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas `.values` would raise AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [234]:
# Empty accumulator; bootstrap_auc appends one test AUC per resample to this global list.
auc_bootstrap = []

In [235]:
# bootstrap_auc draws its resampling indices from the global `rs`.
rs = RandomState(seed = 35)
# Pass a clone: bootstrap_auc refits the estimator it receives on every resample,
# which would otherwise leave optuna_35 trained on the last bootstrap sample.
# The call returns the 2.5th/97.5th percentiles of the bootstrap test AUCs.
bootstrap_auc(sklearn.base.clone(optuna_35), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75605275, 0.77724603])

In [236]:
# Keep this step's bootstrap AUCs under their own name.
# t_35 is the same list object as auc_bootstrap (no copy is made).
t_35 = auc_bootstrap
print(t_35)

[0.7704283791866029, 0.7648854024265208, 0.772270698051948, 0.771838153622693, 0.7633795070061518, 0.7657825316131237, 0.7698329630895421, 0.7685433398838004, 0.7589285714285714, 0.7668825828776487, 0.7592810150375939, 0.7698035927887901, 0.7569313909774436, 0.761686709671907, 0.772396189336979, 0.7597295796308955, 0.7668291823308271, 0.7702014268626112, 0.7700572453861927, 0.7676168403964455, 0.771998355263158, 0.7705832407723855, 0.7680360346889952, 0.7690933655160629, 0.7604531570403281, 0.7738219839371155, 0.766895933014354, 0.7660228340738209, 0.7767990644224196, 0.7651043446684893, 0.7772449589883801, 0.7798295454545454, 0.7675100393028025, 0.7652752264183185, 0.7692135167464115, 0.762818801264525, 0.7742198180109364, 0.7649735133287765, 0.7521520420369106, 0.768297697368421, 0.7721852571770335, 0.762885551948052, 0.7673151273069037, 0.7640016233766235, 0.769760872351333, 0.7673177973342448, 0.7570488721804512, 0.7673685278537252, 0.766992053998633, 0.7570088217703349, 0.75982036

In [237]:
# Remove this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [84]:
# Step 36: feature removed from comp_35 to produce comp_36.
column_to_drop_35 = 'Cat_현재 주차시설 이용편의성'

In [85]:
# A 'Cat_' name is treated as a prefix: every column whose name starts with it is dropped.
# Any other name is dropped as a single column.
# NOTE(review): the name is inserted into the regex unescaped.
comp_36 = comp_35.drop(
    comp_35.filter(regex='^' + column_to_drop_35).columns
    if column_to_drop_35.startswith('Cat_')
    else column_to_drop_35,
    axis=1,
)
# Features and label for this step.
X_36 = comp_36.drop('target', axis=1)
y_36 = comp_36['target']

print(X_36.shape)

(6119, 117)


In [240]:
# Hold out 20% of the rows as the test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
# Split a further 20% of the remaining rows off as the validation split (X_val / y_val).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [241]:
def objective(trial):
    """Score one Optuna trial: fit on the training split, return validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the data comes from the
    notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val``.
    """
    # One suggestion per tuned hyperparameter.
    sampled_params = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled_params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score handed to the AUC metric.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns a validation AUC, so the study maximizes it.
direction = "maximize"
# TPE sampler seeded with 10.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() does not call trial.report()/trial.should_prune(),
# so this pruner likely has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; the callback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [242]:
# Best hyperparameters found and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.799768779372115


In [243]:
# Fit a fresh model with the best hyperparameters on the training split.
optuna_36 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [244]:
# ROC AUC on the held-out test split, scored with column 1 of predict_proba.
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(auc_36)

0.7760274265208476


In [245]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas `.values` would raise AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [246]:
# Empty accumulator; bootstrap_auc appends one test AUC per resample to this global list.
auc_bootstrap = []

In [247]:
# bootstrap_auc draws its resampling indices from the global `rs`.
rs = RandomState(seed = 36)
# Pass a clone: bootstrap_auc refits the estimator it receives on every resample,
# which would otherwise leave optuna_36 trained on the last bootstrap sample.
# The call returns the 2.5th/97.5th percentiles of the bootstrap test AUCs.
bootstrap_auc(sklearn.base.clone(optuna_36), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75696243, 0.77696788])

In [248]:
# Keep this step's bootstrap AUCs under their own name.
# t_36 is the same list object as auc_bootstrap (no copy is made).
t_36 = auc_bootstrap
print(t_36)

[0.7658706425153793, 0.7711012260765551, 0.771266767771702, 0.7677209714627479, 0.7715578007518797, 0.7608109407040329, 0.7663592575187971, 0.7642793062200957, 0.7764199205399863, 0.7634035372522214, 0.7641324547163362, 0.7656784005468216, 0.7671709458304853, 0.7633501367053998, 0.7705832407723854, 0.7649307928913192, 0.7660522043745729, 0.7567177887901573, 0.7582076640464798, 0.7671415755297335, 0.7679452537593985, 0.7691414260082023, 0.7663432373547505, 0.7646264097744362, 0.7606106886534518, 0.7704150290498974, 0.7666876708817498, 0.7750154861585783, 0.7654033877306903, 0.7620978938824334, 0.7625411184210527, 0.7653820275119617, 0.7722733680792891, 0.7586642387218046, 0.7567044386534518, 0.7577991498632946, 0.7659507433356118, 0.7700198650034176, 0.7710932159945318, 0.7685914003759399, 0.7540557715311004, 0.7631018241626794, 0.7686074205399863, 0.7636384996582365, 0.7687142216336296, 0.7696353810663021, 0.7680814251537936, 0.7708769437799043, 0.7704417293233083, 0.7739634953861929, 

In [249]:
# Remove this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [86]:
# Step 37: feature removed from comp_36 to produce comp_37.
column_to_drop_36 = '현재 무주택 기간(총 개월)'

In [87]:
# A 'Cat_' name is treated as a prefix: every column whose name starts with it is dropped.
# Any other name is dropped as a single column.
# NOTE(review): the name is inserted into the regex unescaped.
comp_37 = comp_36.drop(
    comp_36.filter(regex='^' + column_to_drop_36).columns
    if column_to_drop_36.startswith('Cat_')
    else column_to_drop_36,
    axis=1,
)
# Features and label for this step.
X_37 = comp_37.drop('target', axis=1)
y_37 = comp_37['target']

print(X_37.shape)

(6119, 116)


In [252]:
# Hold out 20% of the rows as the test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
# Split a further 20% of the remaining rows off as the validation split (X_val / y_val).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [253]:
def objective(trial):
    """Score one Optuna trial: fit on the training split, return validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the data comes from the
    notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val``.
    """
    # One suggestion per tuned hyperparameter.
    sampled_params = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled_params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score handed to the AUC metric.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns a validation AUC, so the study maximizes it.
direction = "maximize"
# TPE sampler seeded with 10.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() does not call trial.report()/trial.should_prune(),
# so this pruner likely has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; the callback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [254]:
# Best hyperparameters found and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7995016652893597


In [255]:
# Fit a fresh model with the best hyperparameters on the training split.
optuna_37 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [256]:
# ROC AUC on the held-out test split, scored with column 1 of predict_proba.
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(auc_37)

0.7763825401572112


In [257]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas `.values` would raise AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [258]:
# Empty accumulator; bootstrap_auc appends one test AUC per resample to this global list.
auc_bootstrap = []

In [259]:
# bootstrap_auc draws its resampling indices from the global `rs`.
rs = RandomState(seed = 37)
# Pass a clone: bootstrap_auc refits the estimator it receives on every resample,
# which would otherwise leave optuna_37 trained on the last bootstrap sample.
# The call returns the 2.5th/97.5th percentiles of the bootstrap test AUCs.
bootstrap_auc(sklearn.base.clone(optuna_37), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7583411 , 0.77835128])

In [260]:
# Keep this step's bootstrap AUCs under their own name.
# t_37 is the same list object as auc_bootstrap (no copy is made).
t_37 = auc_bootstrap
print(t_37)

[0.7768084095181135, 0.7650189037935748, 0.761582578605605, 0.7643327067669174, 0.7603009654818865, 0.7749754357484622, 0.771701982228298, 0.7629256023581681, 0.7725617310321258, 0.7744280801435407, 0.7662551264524948, 0.7743960398154477, 0.764913437713602, 0.7651310449419002, 0.770332258202324, 0.7657665114490773, 0.7643407168489404, 0.7696487312030075, 0.7644742182159945, 0.7626318993506493, 0.7610325529733424, 0.7603009654818864, 0.7707260872351333, 0.7716646018455229, 0.7734481801093642, 0.7783423402255638, 0.7664019779562543, 0.7690680002563226, 0.7653846975393028, 0.7688263627819548, 0.7663192071086808, 0.7745348812371838, 0.7817439550580998, 0.7740102208646616, 0.7751383074162679, 0.7795705528024608, 0.7666956809637732, 0.7644021274777854, 0.7670774948735475, 0.7614357271018455, 0.7649655032467533, 0.770762132604238, 0.7686154306220097, 0.77630510936432, 0.7717607228298018, 0.7696460611756665, 0.76532862696514, 0.7769686111585783, 0.7706046009911142, 0.7646103896103895, 0.778566

In [261]:
# Remove this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [88]:
# Step 38: feature removed from comp_37 to produce comp_38.
column_to_drop_37 = 'Cat_현재 청소/쓰레기 처리상태'

In [89]:
# A 'Cat_' name is treated as a prefix: every column whose name starts with it is dropped.
# Any other name is dropped as a single column.
# NOTE(review): the name is inserted into the regex unescaped.
comp_38 = comp_37.drop(
    comp_37.filter(regex='^' + column_to_drop_37).columns
    if column_to_drop_37.startswith('Cat_')
    else column_to_drop_37,
    axis=1,
)
# Features and label for this step.
X_38 = comp_38.drop('target', axis=1)
y_38 = comp_38['target']

print(X_38.shape)

(6119, 112)


In [264]:
# Hold out 20% of the rows as the test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
# Split a further 20% of the remaining rows off as the validation split (X_val / y_val).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [265]:
def objective(trial):
    """Score one Optuna trial: fit on the training split, return validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the data comes from the
    notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val``.
    """
    # One suggestion per tuned hyperparameter.
    sampled_params = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled_params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score handed to the AUC metric.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns a validation AUC, so the study maximizes it.
direction = "maximize"
# TPE sampler seeded with 10.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() does not call trial.report()/trial.should_prune(),
# so this pruner likely has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; the callback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [266]:
# Best hyperparameters found and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7994056711658695


In [267]:
# Fit a fresh model with the best hyperparameters on the training split.
optuna_38 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [268]:
# ROC AUC on the held-out test split, scored with column 1 of predict_proba.
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(auc_38)

0.7744961658407381


In [269]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas `.values` would raise AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [270]:
# Empty accumulator; bootstrap_auc appends one test AUC per resample to this global list.
auc_bootstrap = []

In [271]:
# bootstrap_auc draws its resampling indices from the global `rs`.
rs = RandomState(seed = 38)
# Pass a clone: bootstrap_auc refits the estimator it receives on every resample,
# which would otherwise leave optuna_38 trained on the last bootstrap sample.
# The call returns the 2.5th/97.5th percentiles of the bootstrap test AUCs.
bootstrap_auc(sklearn.base.clone(optuna_38), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7590341, 0.7785251])

In [272]:
# Keep this step's bootstrap AUCs under their own name.
# t_38 is the same list object as auc_bootstrap (no copy is made).
t_38 = auc_bootstrap
print(t_38)

[0.7773036995898838, 0.7745081809637732, 0.7768497949419002, 0.7647091806220095, 0.7725056604579632, 0.7647385509227614, 0.7633287764866712, 0.7826811346548189, 0.7698022577751196, 0.7630270633971292, 0.7667651016746412, 0.7665087790498974, 0.760349025974026, 0.7652992566643882, 0.7664580485304169, 0.7635850991114148, 0.765021573820916, 0.7623141660970607, 0.7735309509569379, 0.7632059552289815, 0.7743933697881066, 0.7641698350991114, 0.7655635893711552, 0.7637346206425154, 0.7627734107997266, 0.7612381450786057, 0.7704043489405331, 0.7693336679767601, 0.7742358381749829, 0.7685486799384825, 0.7702788576555023, 0.7738460141831852, 0.7615265080314422, 0.7630350734791524, 0.7668318523581682, 0.7674326085099111, 0.7707541225222145, 0.769595330656186, 0.772502990430622, 0.7694030886876282, 0.7669653537252222, 0.766722381237184, 0.7748205741626795, 0.7677102913533835, 0.769241552033493, 0.7737899436090226, 0.7750208262132604, 0.761434392088175, 0.7735202708475736, 0.7675954801777171, 0.7739

In [273]:
# Remove this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [90]:
# Step 39: feature removed from comp_38 to produce comp_39.
column_to_drop_38 = 'Cat_현재 대중교통 접근용이성'

In [91]:
# A 'Cat_' name is treated as a prefix: every column whose name starts with it is dropped.
# Any other name is dropped as a single column.
# NOTE(review): the name is inserted into the regex unescaped.
comp_39 = comp_38.drop(
    comp_38.filter(regex='^' + column_to_drop_38).columns
    if column_to_drop_38.startswith('Cat_')
    else column_to_drop_38,
    axis=1,
)
# Features and label for this step.
X_39 = comp_39.drop('target', axis=1)
y_39 = comp_39['target']

print(X_39.shape)

(6119, 108)


In [276]:
# Hold out 20% of the rows as the test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
# Split a further 20% of the remaining rows off as the validation split (X_val / y_val).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [277]:
def objective(trial):
    """Score one Optuna trial: fit on the training split, return validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the data comes from the
    notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val``.
    """
    # One suggestion per tuned hyperparameter.
    sampled_params = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled_params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score handed to the AUC metric.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns a validation AUC, so the study maximizes it.
direction = "maximize"
# TPE sampler seeded with 10.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() does not call trial.report()/trial.should_prune(),
# so this pruner likely has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; the callback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [278]:
# Best hyperparameters found and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 103, 'subsample': 0.4, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 3}
0.8050776717668762


In [279]:
# Fit a fresh model with the best hyperparameters on the training split.
optuna_39 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [280]:
# ROC AUC on the held-out test split, scored with column 1 of predict_proba.
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(auc_39)

0.7725029904306221


In [281]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas `.values` would raise AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [282]:
# Empty accumulator; bootstrap_auc appends one test AUC per resample to this global list.
auc_bootstrap = []

In [283]:
# bootstrap_auc draws its resampling indices from the global `rs`.
rs = RandomState(seed = 39)
# Pass a clone: bootstrap_auc refits the estimator it receives on every resample,
# which would otherwise leave optuna_39 trained on the last bootstrap sample.
# The call returns the 2.5th/97.5th percentiles of the bootstrap test AUCs.
bootstrap_auc(sklearn.base.clone(optuna_39), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75538278, 0.77693804])

In [284]:
# Keep this step's bootstrap AUCs under their own name.
# t_39 is the same list object as auc_bootstrap (no copy is made).
t_39 = auc_bootstrap
print(t_39)

[0.7674219284005468, 0.7738166438824332, 0.7682122564935064, 0.7595213174982912, 0.7524350649350648, 0.7638013713260424, 0.7662791566985645, 0.7637773410799726, 0.768460569036227, 0.7621139140464798, 0.7631125042720437, 0.7639722530758715, 0.7742785586124402, 0.7612942156527682, 0.7685112995557075, 0.7618923017771703, 0.762920262303486, 0.766591549897471, 0.7671976461038962, 0.77096505468216, 0.7660468643198908, 0.7626772898154478, 0.7723534688995215, 0.7751810278537253, 0.7645676691729324, 0.7739982057416268, 0.774876644736842, 0.7674432886192756, 0.7569180408407382, 0.7725056604579632, 0.7687168916609706, 0.7626559295967191, 0.7594278665413534, 0.7700519053315106, 0.7580928528708133, 0.7638334116541352, 0.7719129143882433, 0.7726738721804511, 0.7633100862952837, 0.7661216250854408, 0.7578071599453178, 0.7705779007177034, 0.7611553742310322, 0.7621085739917977, 0.763189935064935, 0.7658332621326043, 0.7643807672590567, 0.7525632262474369, 0.7618095309295967, 0.7747324632604238, 0.7721

In [285]:
# Remove this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [92]:
# Step 40: feature removed from comp_39 to produce comp_40.
column_to_drop_39 = '자산 중 기타자산의 비중'

In [93]:
# A 'Cat_' name is treated as a prefix: every column whose name starts with it is dropped.
# Any other name is dropped as a single column.
# NOTE(review): the name is inserted into the regex unescaped.
comp_40 = comp_39.drop(
    comp_39.filter(regex='^' + column_to_drop_39).columns
    if column_to_drop_39.startswith('Cat_')
    else column_to_drop_39,
    axis=1,
)
# Features and label for this step.
X_40 = comp_40.drop('target', axis=1)
y_40 = comp_40['target']

print(X_40.shape)

(6119, 107)


In [288]:
# Hold out 20% of the rows as the test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
# Split a further 20% of the remaining rows off as the validation split (X_val / y_val).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [289]:
def objective(trial):
    """Score one Optuna trial: fit on the training split, return validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the data comes from the
    notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val``.
    """
    # One suggestion per tuned hyperparameter.
    sampled_params = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled_params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score handed to the AUC metric.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns a validation AUC, so the study maximizes it.
direction = "maximize"
# TPE sampler seeded with 10.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() does not call trial.report()/trial.should_prune(),
# so this pruner likely has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; the callback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [290]:
# Best hyperparameters found and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.060000000000000005, 'n_estimators': 147, 'subsample': 0.7000000000000001, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.8006243791684404


In [291]:
# Fit a fresh model with the best hyperparameters on the training split.
optuna_40 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [292]:
# ROC AUC on the held-out test split, scored with column 1 of predict_proba.
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(auc_40)

0.779647983595352


In [293]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas `.values` would raise AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [294]:
# Empty accumulator; bootstrap_auc appends one test AUC per resample to this global list.
auc_bootstrap = []

In [295]:
# bootstrap_auc draws its resampling indices from the global `rs`.
rs = RandomState(seed = 40)
# Pass a clone: bootstrap_auc refits the estimator it receives on every resample,
# which would otherwise leave optuna_40 trained on the last bootstrap sample.
# The call returns the 2.5th/97.5th percentiles of the bootstrap test AUCs.
bootstrap_auc(sklearn.base.clone(optuna_40), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76009744, 0.77917966])

In [296]:
# Keep this step's bootstrap AUCs under their own name.
# t_40 is the same list object as auc_bootstrap (no copy is made).
t_40 = auc_bootstrap
print(t_40)

[0.7729889354066986, 0.7653099367737525, 0.7645463089542037, 0.7646531100478469, 0.7794584116541354, 0.7782221889952153, 0.7717046522556391, 0.7740676264524948, 0.7699210739917977, 0.7703028879015721, 0.7588084201982229, 0.7738566942925497, 0.7732105476760082, 0.7676729109706084, 0.7628321514012304, 0.7770767472658919, 0.7660735645933014, 0.7704176990772386, 0.7711412764866713, 0.7653099367737526, 0.7716939721462748, 0.769461829289132, 0.763091144053315, 0.7745802717019822, 0.7614330570745044, 0.7689385039302803, 0.758044792378674, 0.7673311474709501, 0.7658439422419685, 0.7734188098086124, 0.7622087000170882, 0.7652538661995899, 0.7681188055365686, 0.7650295839029391, 0.7754079801777172, 0.7746256621667806, 0.765224495898838, 0.7742892387218044, 0.7767563439849624, 0.7712721078263841, 0.7712026871155161, 0.7779631963431304, 0.7668211722488039, 0.7682549769309637, 0.7643647470950102, 0.7659427332535885, 0.7694084287423103, 0.762725350307587, 0.7724682800751879, 0.7690666652426521, 0.77

In [297]:
# Remove this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [94]:
# Step 41: feature removed from comp_40 to produce comp_41.
column_to_drop_40 = '소득 대비 생활비의 비율'

In [95]:
# A 'Cat_' name is treated as a prefix: every column whose name starts with it is dropped.
# Any other name is dropped as a single column.
# NOTE(review): the name is inserted into the regex unescaped.
comp_41 = comp_40.drop(
    comp_40.filter(regex='^' + column_to_drop_40).columns
    if column_to_drop_40.startswith('Cat_')
    else column_to_drop_40,
    axis=1,
)
# Features and label for this step.
X_41 = comp_41.drop('target', axis=1)
y_41 = comp_41['target']

print(X_41.shape)

(6119, 106)


In [300]:
# Hold out 20% of the rows as the test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
# Split a further 20% of the remaining rows off as the validation split (X_val / y_val).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [301]:
def objective(trial):
    """Score one Optuna trial: fit on the training split, return validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the data comes from the
    notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val``.
    """
    # One suggestion per tuned hyperparameter.
    sampled_params = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled_params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score handed to the AUC metric.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns a validation AUC, so the study maximizes it.
direction = "maximize"
# TPE sampler seeded with 10.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() does not call trial.report()/trial.should_prune(),
# so this pruner likely has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; the callback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [302]:
# Best hyperparameters found and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.800269618277281


In [303]:
# Fit a fresh model with the best hyperparameters on the training split.
optuna_41 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [304]:
# ROC AUC on the held-out test split, scored with column 1 of predict_proba.
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(auc_41)

0.7786226930963773


In [305]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas `.values` would raise AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [306]:
# Empty accumulator; bootstrap_auc appends one test AUC per resample to this global list.
auc_bootstrap = []

In [307]:
# bootstrap_auc draws its resampling indices from the global `rs`.
rs = RandomState(seed = 41)
# Pass a clone: bootstrap_auc refits the estimator it receives on every resample,
# which would otherwise leave optuna_41 trained on the last bootstrap sample.
# The call returns the 2.5th/97.5th percentiles of the bootstrap test AUCs.
bootstrap_auc(sklearn.base.clone(optuna_41), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75881616, 0.77889303])

In [308]:
# Keep this step's bootstrap AUCs under their own name.
# t_41 is the same list object as auc_bootstrap (no copy is made).
t_41 = auc_bootstrap
print(t_41)

[0.7711973470608339, 0.7772209287423103, 0.7707541225222146, 0.7625491285030759, 0.7759580058099795, 0.7670561346548189, 0.7825049128503077, 0.7682175965481887, 0.7689198137388927, 0.7676675709159261, 0.7705565404989748, 0.7718595138414218, 0.7648693822624744, 0.7705084800068351, 0.7690853554340397, 0.762685299897471, 0.7698169429254956, 0.7637266105604922, 0.7746443523581681, 0.7622687756322626, 0.7620298081852359, 0.7718888841421736, 0.7661696855775805, 0.7642926563568011, 0.7583705357142856, 0.7686448009227616, 0.766626260252905, 0.7714590097402598, 0.75934509569378, 0.7649307928913192, 0.7710451555023923, 0.7732265678400547, 0.7806599239576214, 0.7618348961893369, 0.7706620065789473, 0.7711813268967873, 0.7732639482228298, 0.7779338260423787, 0.7661803656869448, 0.7708288832877648, 0.7702895377648666, 0.7689705442583733, 0.7633421266233766, 0.7602902853725221, 0.7618122009569379, 0.7717113273239917, 0.7720757860560492, 0.7617948457792207, 0.7627947710184553, 0.768927823820916, 0.76

In [309]:
# Remove this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [96]:
# Step 42: feature removed from comp_41 to produce comp_42.
column_to_drop_41 = 'Cat_가구주 종사상 지위'

In [97]:
# A 'Cat_' name is treated as a prefix: every column whose name starts with it is dropped.
# Any other name is dropped as a single column.
# NOTE(review): the name is inserted into the regex unescaped.
comp_42 = comp_41.drop(
    comp_41.filter(regex='^' + column_to_drop_41).columns
    if column_to_drop_41.startswith('Cat_')
    else column_to_drop_41,
    axis=1,
)
# Features and label for this step.
X_42 = comp_42.drop('target', axis=1)
y_42 = comp_42['target']

print(X_42.shape)

(6119, 101)


In [312]:
# Hold out 20% of the rows as the test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
# Split a further 20% of the remaining rows off as the validation split (X_val / y_val).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [313]:
def objective(trial):
    """Score one Optuna trial: fit on the training split, return validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the data comes from the
    notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val``.
    """
    # One suggestion per tuned hyperparameter.
    sampled_params = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled_params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score handed to the AUC metric.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns a validation AUC, so the study maximizes it.
direction = "maximize"
# TPE sampler seeded with 10.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() does not call trial.report()/trial.should_prune(),
# so this pruner likely has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; the callback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [314]:
# Best hyperparameters found and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.801563452115627


In [315]:
# Fit a fresh model with the best hyperparameters on the training split.
optuna_42 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [316]:
# ROC AUC on the held-out test split, scored with column 1 of predict_proba.
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(auc_42)

0.7777015336637048


In [317]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas `.values` would raise AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [318]:
# Empty accumulator; bootstrap_auc appends one test AUC per resample to this global list.
auc_bootstrap = []

In [319]:
# bootstrap_auc draws its resampling indices from the global `rs`.
rs = RandomState(seed = 42)
# Pass a clone: bootstrap_auc refits the estimator it receives on every resample,
# which would otherwise leave optuna_42 trained on the last bootstrap sample.
# The call returns the 2.5th/97.5th percentiles of the bootstrap test AUCs.
bootstrap_auc(sklearn.base.clone(optuna_42), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75431663, 0.77580374])

In [321]:
# Keep this step's bootstrap AUCs under their own name.
# t_42 is the same list object as auc_bootstrap (no copy is made).
t_42 = auc_bootstrap
print(t_42)

[0.7602876153451812, 0.7669920539986329, 0.7684552289815447, 0.7661109449760766, 0.7607789003759398, 0.7617454502734109, 0.76866616114149, 0.7645382988721804, 0.7615398581681477, 0.775570851845523, 0.767793062200957, 0.7645863593643198, 0.7637720010252905, 0.7601701341421736, 0.7719689849624061, 0.7593637858851675, 0.7639936132946001, 0.7783877306903623, 0.7598417207792207, 0.7631979451469584, 0.7682389567669172, 0.763288726076555, 0.7606827793916611, 0.7676595608339029, 0.7688744232740943, 0.7665514994873548, 0.7720303955912508, 0.765291246582365, 0.7626666097060835, 0.7601380938140807, 0.7629843429596719, 0.7645169386534517, 0.7636785500683526, 0.7679639439507862, 0.7636064593301435, 0.7669413234791523, 0.7821925196514012, 0.7644154776144907, 0.7677797120642516, 0.7644822282980178, 0.7726071214969241, 0.7614944677033494, 0.770663341592618, 0.7718141233766234, 0.7721051563568011, 0.7686047505126452, 0.7585974880382774, 0.7656169899179768, 0.761617288961039, 0.7608910415242652, 0.76157

In [322]:
# Remove this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [98]:
# Step 43: feature removed from comp_42 to produce comp_43.
column_to_drop_42 = 'Cat_남편/아내의 부모님과 동거 의향'

In [99]:
# A 'Cat_' name is treated as a prefix: every column whose name starts with it is dropped.
# Any other name is dropped as a single column.
# NOTE(review): the name is inserted into the regex unescaped.
comp_43 = comp_42.drop(
    comp_42.filter(regex='^' + column_to_drop_42).columns
    if column_to_drop_42.startswith('Cat_')
    else column_to_drop_42,
    axis=1,
)
# Features and label for this step.
X_43 = comp_43.drop('target', axis=1)
y_43 = comp_43['target']

print(X_43.shape)

(6119, 96)


In [326]:
# Hold out 20% of the rows as the test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
# Split a further 20% of the remaining rows off as the validation split (X_val / y_val).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [327]:
def objective(trial):
    """Score one Optuna trial: fit on the training split, return validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the data comes from the
    notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val``.
    """
    # One suggestion per tuned hyperparameter.
    sampled_params = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled_params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score handed to the AUC metric.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# The objective returns a validation AUC, so the study maximizes it.
direction = "maximize"
# TPE sampler seeded with 10.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() does not call trial.report()/trial.should_prune(),
# so this pruner likely has no effect — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; the callback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [328]:
# Best hyperparameters found and the validation AUC they achieved.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 146, 'subsample': 0.6, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 8}
0.7997604320570288


In [329]:
# Fit a fresh model with the best hyperparameters on the training split.
optuna_43 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [330]:
# ROC AUC on the held-out test split, scored with column 1 of predict_proba.
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(auc_43)

0.7780966977101846


In [331]:
# bootstrap_auc indexes rows positionally (X_train[idx]), so it needs NumPy arrays.
# np.asarray keeps this cell safe to re-run: an input that is already an ndarray is
# returned as-is, whereas `.values` would raise AttributeError on a second run.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [332]:
# Empty accumulator; bootstrap_auc appends one test AUC per resample to this global list.
auc_bootstrap = []

In [333]:
# bootstrap_auc draws its resampling indices from the global `rs`.
rs = RandomState(seed = 43)
# Pass a clone: bootstrap_auc refits the estimator it receives on every resample,
# which would otherwise leave optuna_43 trained on the last bootstrap sample.
# The call returns the 2.5th/97.5th percentiles of the bootstrap test AUCs.
bootstrap_auc(sklearn.base.clone(optuna_43), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75538231, 0.77848666])

In [334]:
# Keep this step's bootstrap AUCs under their own name.
# t_43 is the same list object as auc_bootstrap (no copy is made).
t_43 = auc_bootstrap
print(t_43)

[0.7659988038277511, 0.7612061047505125, 0.7632807159945317, 0.7588110902255639, 0.7724843002392345, 0.7606961295283663, 0.7625144181476419, 0.7612942156527682, 0.772067775974026, 0.7716379015721122, 0.7686047505126452, 0.7649441430280246, 0.7597215695488722, 0.7669493335611757, 0.7585494275461381, 0.7758245044429255, 0.7722840481886535, 0.7633154263499657, 0.7667090311004785, 0.763157894736842, 0.7689251537935748, 0.7776534731715653, 0.7693817284688994, 0.771133266404648, 0.7628561816473001, 0.7681722060833903, 0.7656116498632946, 0.765558249316473, 0.7743880297334245, 0.7695819805194806, 0.7641164345522898, 0.7732185577580315, 0.7717580528024608, 0.7774959415584415, 0.7644822282980178, 0.7707808227956254, 0.762917592276145, 0.7751516575529733, 0.7745295411825017, 0.7682416267942584, 0.7616252990430621, 0.7612995557074504, 0.7527527981886535, 0.7599752221462749, 0.7667891319207109, 0.7794771018455229, 0.7656436901913877, 0.7737151828434723, 0.7639268626110732, 0.7726872223171566, 0.75

In [335]:
# Remove this step's splits, study and validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [100]:
# Step 44: feature removed from comp_43 to produce comp_44.
column_to_drop_43 = '소득 중 정부 보조금의 비중(월평균)'

In [101]:
# A 'Cat_' name is treated as a prefix: every column whose name starts with it is dropped.
# Any other name is dropped as a single column.
# NOTE(review): the name is inserted into the regex unescaped.
comp_44 = comp_43.drop(
    comp_43.filter(regex='^' + column_to_drop_43).columns
    if column_to_drop_43.startswith('Cat_')
    else column_to_drop_43,
    axis=1,
)
# Features and label for this step.
X_44 = comp_44.drop('target', axis=1)
y_44 = comp_44['target']

print(X_44.shape)

(6119, 95)


In [338]:
# Hold out 20% of the rows as the test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
# Split a further 20% of the remaining rows off as the validation split (X_val / y_val).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [339]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from ``trial``, fits a ``GradientBoostingClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and scores
    column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns the ROC-AUC computed on the validation split.
    """
    # The suggestion order is kept exactly as before.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize objective(); the TPE sampler is seeded.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(), so the
# Hyperband pruner configured here presumably never prunes a trial — confirm or drop it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook; presumably it stops
# the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials caps the search at 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [340]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.8020392490755348


In [341]:
optuna_44 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [342]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(auc_44)

0.7721398667122352


In [343]:
X_train = X_train.values
y_train = y_train.values

In [344]:
auc_bootstrap = []

In [345]:
# bootstrap_auc reads `rs` (and appends to `auc_bootstrap`) from notebook scope.
rs = RandomState(seed = 44)
# bootstrap_auc refits whatever estimator it receives on every resample, so pass an
# unfitted clone: optuna_44 then stays the model fitted in the earlier cell.
# np.asarray keeps the positional indexing inside bootstrap_auc valid even if
# X_train / y_train are still pandas objects.
bootstrap_auc(sklearn.base.clone(optuna_44), np.asarray(X_train), np.asarray(y_train), X_test, y_test, nsamples=2000)

array([0.75312573, 0.7746806 ])

In [346]:
t_44 = auc_bootstrap
print(t_44)

[0.7721772470950103, 0.7576015678400546, 0.7594091763499659, 0.7665701896787422, 0.7595346676349966, 0.7634435876623377, 0.7702521573820916, 0.7602368848257006, 0.7614944677033493, 0.7674059082365002, 0.7625144181476419, 0.7562612141148325, 0.7730369958988379, 0.7560155715994532, 0.7694057587149692, 0.7550650418660287, 0.7621886748120301, 0.7665488294600137, 0.7619590524606972, 0.7589579417293233, 0.7764973513328777, 0.7543708347573479, 0.7541599025974026, 0.7606026785714286, 0.7719930152084757, 0.7587443395420368, 0.7660415242652086, 0.7648987525632263, 0.7675260594668489, 0.7698730134996583, 0.7721959372863979, 0.7640069634313056, 0.7618095309295967, 0.7648960825358851, 0.7610378930280246, 0.7548220693779905, 0.7659480733082706, 0.7678224325017088, 0.7667223812371838, 0.7654220779220778, 0.7677503417634997, 0.7613422761449077, 0.7678624829118251, 0.7542987440191389, 0.7573479152426521, 0.7731811773752564, 0.7681161355092276, 0.7575027768284347, 0.764319356630212, 0.7625758287764867, 

In [347]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [102]:
# 45
column_to_drop_44 = '가구주 나이'

In [103]:
# Build comp_45 by removing `column_to_drop_44` from comp_44.
# A 'Cat_'-prefixed name stands for a group of columns sharing that prefix, so every
# column starting with it is removed; any other name is dropped as a single column.
if column_to_drop_44.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the raw name, so characters
    # such as '(' or '.' in a column name cannot change which columns are selected.
    drop_cols_45 = [col for col in comp_44.columns if str(col).startswith(column_to_drop_44)]
else:
    drop_cols_45 = [column_to_drop_44]
comp_45 = comp_44.drop(columns=drop_cols_45)
# Separate the remaining predictors from the label column.
X_45 = comp_45.drop(columns='target')
y_45 = comp_45['target']

print(X_45.shape)

(6119, 94)


In [350]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [351]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from ``trial``, fits a ``GradientBoostingClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and scores
    column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns the ROC-AUC computed on the validation split.
    """
    # The suggestion order is kept exactly as before.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize objective(); the TPE sampler is seeded.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(), so the
# Hyperband pruner configured here presumably never prunes a trial — confirm or drop it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook; presumably it stops
# the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials caps the search at 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [352]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7952570555680765


In [131]:
# Show the best trial's hyperparameters and keep its validation AUC in optuna_auc.
# NOTE(review): this cell repeats the cell directly above it (same statements, same
# printed output); one of the two can be removed.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7952570555680765


In [353]:
optuna_45 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [133]:
# ROC-AUC of optuna_45 on the test split, using column 1 of predict_proba as the score.
# NOTE(review): the next cell repeats these statements and prints the same value;
# one of the two can be removed.
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(auc_45)

0.774740473342447


In [354]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(auc_45)

0.774740473342447


In [134]:
X_train = X_train.values
y_train = y_train.values

In [135]:
auc_bootstrap = []

In [136]:
# bootstrap_auc reads `rs` (and appends to `auc_bootstrap`) from notebook scope.
rs = RandomState(seed = 45)
# bootstrap_auc refits whatever estimator it receives on every resample, so pass an
# unfitted clone: optuna_45 then stays the model fitted in the earlier cell.
# np.asarray keeps the positional indexing inside bootstrap_auc valid even if
# X_train / y_train are still pandas objects.
bootstrap_auc(sklearn.base.clone(optuna_45), np.asarray(X_train), np.asarray(y_train), X_test, y_test, nsamples=2000)

array([0.75999104, 0.77954485])

In [137]:
np.mean(auc_bootstrap)

0.770099394437799

In [138]:
t_45 = auc_bootstrap
print(t_45)

[0.7817573051948052, 0.7686234407040327, 0.7718167934039645, 0.7686007454716336, 0.7764306006493508, 0.7665354793233082, 0.7773851354237867, 0.7688971185064934, 0.7738727144565961, 0.7688944484791524, 0.7669520035885168, 0.7712587576896788, 0.7693924085782637, 0.7616933847402596, 0.7658038918318524, 0.7696407211209843, 0.7671028601332878, 0.7688557330827067, 0.7747030929596719, 0.7639455528024606, 0.7639989533492824, 0.7665461594326727, 0.7680814251537935, 0.7711492865686944, 0.7638681220095693, 0.7601728041695147, 0.7685900653622693, 0.7773117096719071, 0.7755922120642516, 0.7737979536910459, 0.7740155609193439, 0.7734615302460698, 0.7747645035885167, 0.7727886833561176, 0.7727045774948735, 0.763091144053315, 0.7677343215994532, 0.7715578007518795, 0.7677076213260423, 0.7673031121838689, 0.7758164943609023, 0.7770206766917294, 0.7713535436602871, 0.7553801050922762, 0.7594599068694463, 0.7668879229323309, 0.7791006279904307, 0.7759433206596036, 0.7759540007689678, 0.7672403665413534, 

In [139]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [104]:
# 46.
column_to_drop_45 = 'Cat_이사 계획 첫 번째 이유'

In [105]:
# Build comp_46 by removing `column_to_drop_45` from comp_45.
# A 'Cat_'-prefixed name stands for a group of columns sharing that prefix, so every
# column starting with it is removed; any other name is dropped as a single column.
if column_to_drop_45.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the raw name, so characters
    # such as '(' or '.' in a column name cannot change which columns are selected.
    drop_cols_46 = [col for col in comp_45.columns if str(col).startswith(column_to_drop_45)]
else:
    drop_cols_46 = [column_to_drop_45]
comp_46 = comp_45.drop(columns=drop_cols_46)
# Separate the remaining predictors from the label column.
X_46 = comp_46.drop(columns='target')
y_46 = comp_46['target']

print(X_46.shape)

(6119, 82)


In [142]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [143]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from ``trial``, fits a ``GradientBoostingClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and scores
    column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns the ROC-AUC computed on the validation split.
    """
    # The suggestion order is kept exactly as before.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize objective(); the TPE sampler is seeded.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(), so the
# Hyperband pruner configured here presumably never prunes a trial — confirm or drop it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook; presumably it stops
# the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials caps the search at 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [144]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7941092997437375


In [145]:
optuna_46 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [146]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(auc_46)

0.7766655630553657


In [147]:
X_train = X_train.values
y_train = y_train.values

In [148]:
auc_bootstrap = []

In [149]:
# bootstrap_auc reads `rs` (and appends to `auc_bootstrap`) from notebook scope.
rs = RandomState(seed = 46)
# bootstrap_auc refits whatever estimator it receives on every resample, so pass an
# unfitted clone: optuna_46 then stays the model fitted in the earlier cell.
# np.asarray keeps the positional indexing inside bootstrap_auc valid even if
# X_train / y_train are still pandas objects.
bootstrap_auc(sklearn.base.clone(optuna_46), np.asarray(X_train), np.asarray(y_train), X_test, y_test, nsamples=2000)

array([0.75995914, 0.78028732])

In [150]:
np.mean(auc_bootstrap)

0.7701807361532381

In [151]:
t_46 = auc_bootstrap
print(t_46)

[0.7698863636363635, 0.7707447774265208, 0.7769045305023923, 0.7721825871496923, 0.7674406185919344, 0.7679359086637048, 0.7663485774094326, 0.7719089093472317, 0.7702321321770335, 0.774406719924812, 0.7716192113807245, 0.7623008159603555, 0.7692201918147642, 0.7705111500341764, 0.7664740686944633, 0.7716218814080656, 0.7736284069548871, 0.7632433356117566, 0.7724041994190021, 0.7660335141831852, 0.767142910543404, 0.7654247479494192, 0.7657905416951469, 0.7669733638072453, 0.7697715524606972, 0.7795278323650034, 0.7734962406015037, 0.7720063653451812, 0.767348502648667, 0.7804089413875599, 0.7702695125598086, 0.7744627904989747, 0.7719049043062202, 0.77163256151743, 0.7801392686261107, 0.7716245514354066, 0.7728327388072453, 0.7736030416951469, 0.7695232399179768, 0.7678077473513328, 0.7728127136021874, 0.7684018284347232, 0.7659080228981545, 0.7761849581339713, 0.766188375768968, 0.7692976226076556, 0.763439582621326, 0.7707354323308271, 0.7803341806220095, 0.7741076768626111, 0.7656

In [152]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [106]:
# 47.
column_to_drop_46 = 'Cat_가구주 최종 학력'

In [107]:
# Build comp_47 by removing `column_to_drop_46` from comp_46.
# A 'Cat_'-prefixed name stands for a group of columns sharing that prefix, so every
# column starting with it is removed; any other name is dropped as a single column.
if column_to_drop_46.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the raw name, so characters
    # such as '(' or '.' in a column name cannot change which columns are selected.
    drop_cols_47 = [col for col in comp_46.columns if str(col).startswith(column_to_drop_46)]
else:
    drop_cols_47 = [column_to_drop_46]
comp_47 = comp_46.drop(columns=drop_cols_47)
# Separate the remaining predictors from the label column.
X_47 = comp_47.drop(columns='target')
y_47 = comp_47['target']

print(X_47.shape)

(6119, 79)


In [155]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [156]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from ``trial``, fits a ``GradientBoostingClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and scores
    column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns the ROC-AUC computed on the validation split.
    """
    # The suggestion order is kept exactly as before.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize objective(); the TPE sampler is seeded.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(), so the
# Hyperband pruner configured here presumably never prunes a trial — confirm or drop it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook; presumably it stops
# the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials caps the search at 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [157]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 149, 'subsample': 0.6, 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 8}
0.7919223031911786


In [158]:
optuna_47 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [159]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(auc_47)

0.7707941729323308


In [160]:
X_train = X_train.values
y_train = y_train.values

In [161]:
auc_bootstrap = []

In [162]:
# bootstrap_auc reads `rs` (and appends to `auc_bootstrap`) from notebook scope.
rs = RandomState(seed = 47)
# bootstrap_auc refits whatever estimator it receives on every resample, so pass an
# unfitted clone: optuna_47 then stays the model fitted in the earlier cell.
# np.asarray keeps the positional indexing inside bootstrap_auc valid even if
# X_train / y_train are still pandas objects.
bootstrap_auc(sklearn.base.clone(optuna_47), np.asarray(X_train), np.asarray(y_train), X_test, y_test, nsamples=2000)

array([0.75097603, 0.7750506 ])

In [163]:
np.mean(auc_bootstrap)

0.7632087360624573

In [164]:
t_47 = auc_bootstrap
print(t_47)

[0.7650990046138072, 0.7530198009227616, 0.7624690276828435, 0.7584666566985646, 0.7560823222829802, 0.7706473214285715, 0.7751783578263842, 0.7637399606971975, 0.7538421693438142, 0.756944741114149, 0.7579540114490773, 0.756059627050581, 0.7539382903280929, 0.7571289730006836, 0.7442140507518797, 0.7588698308270676, 0.7704817797334245, 0.766759761619959, 0.7527474581339713, 0.7659347231715652, 0.7585894779562543, 0.772535030758715, 0.7548327494873548, 0.7631418745727956, 0.770030545112782, 0.7701987568352702, 0.7585574376281614, 0.7587843899521531, 0.7596548188653451, 0.7623034859876965, 0.7611179938482571, 0.7648667122351333, 0.7605199077238551, 0.7664820787764867, 0.758546757518797, 0.7683484278879016, 0.7679052033492823, 0.7695339200273411, 0.7665328092959672, 0.7522054425837321, 0.7627173402255638, 0.7657531613123718, 0.7518903793574847, 0.7731624871838687, 0.7645649991455912, 0.7550089712918661, 0.7685807202665754, 0.7618575914217361, 0.7606080186261107, 0.7650949995727956, 0.767

In [165]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [108]:
# 48
column_to_drop_47 = '소득 대비 주거관리비의 비율'

In [109]:
# Build comp_48 by removing `column_to_drop_47` from comp_47.
# A 'Cat_'-prefixed name stands for a group of columns sharing that prefix, so every
# column starting with it is removed; any other name is dropped as a single column.
if column_to_drop_47.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the raw name, so characters
    # such as '(' or '.' in a column name cannot change which columns are selected.
    drop_cols_48 = [col for col in comp_47.columns if str(col).startswith(column_to_drop_47)]
else:
    drop_cols_48 = [column_to_drop_47]
comp_48 = comp_47.drop(columns=drop_cols_48)
# Separate the remaining predictors from the label column.
X_48 = comp_48.drop(columns='target')
y_48 = comp_48['target']

print(X_48.shape)

(6119, 78)


In [168]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [169]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from ``trial``, fits a ``GradientBoostingClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and scores
    column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns the ROC-AUC computed on the validation split.
    """
    # The suggestion order is kept exactly as before.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize objective(); the TPE sampler is seeded.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(), so the
# Hyperband pruner configured here presumably never prunes a trial — confirm or drop it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook; presumably it stops
# the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials caps the search at 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [170]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7893701116036027


In [171]:
optuna_48 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [172]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(auc_48)

0.7781914836807929


In [173]:
X_train = X_train.values
y_train = y_train.values

In [174]:
auc_bootstrap = []

In [175]:
# bootstrap_auc reads `rs` (and appends to `auc_bootstrap`) from notebook scope.
rs = RandomState(seed = 48)
# bootstrap_auc refits whatever estimator it receives on every resample, so pass an
# unfitted clone: optuna_48 then stays the model fitted in the earlier cell.
# np.asarray keeps the positional indexing inside bootstrap_auc valid even if
# X_train / y_train are still pandas objects.
bootstrap_auc(sklearn.base.clone(optuna_48), np.asarray(X_train), np.asarray(y_train), X_test, y_test, nsamples=2000)

array([0.76068391, 0.77959365])

In [176]:
np.mean(auc_bootstrap)

0.7707228945499403

In [177]:
t_48 = auc_bootstrap
print(t_48)

[0.7700091848940533, 0.7735082557245386, 0.7750555365686944, 0.7682496368762817, 0.7738820595522898, 0.7717914281442242, 0.7685553550068354, 0.7762356886534518, 0.7728634441216677, 0.7763451597744361, 0.7690880254613807, 0.7690039196001368, 0.7639335376794257, 0.7719529647983595, 0.7732732933185236, 0.7640616989917977, 0.7659413982399179, 0.7656476952323992, 0.7735082557245387, 0.7724789601845523, 0.7666729857313739, 0.7720384056732741, 0.7735416310663021, 0.7693576982228298, 0.7732265678400546, 0.774157072368421, 0.77039633885851, 0.7655128588516746, 0.7669306433697881, 0.7686674961551607, 0.7718234684723171, 0.7653059317327409, 0.7740075508373205, 0.7720303955912509, 0.7603036355092275, 0.7692348769651401, 0.76705880468216, 0.7623835868079291, 0.7751302973342448, 0.7665635146103896, 0.7719916801948052, 0.7754360154647983, 0.7729074995727956, 0.7765480818523582, 0.7721011513157894, 0.7728567690533151, 0.7721679019993164, 0.7759967212064252, 0.7665595095693778, 0.7811765742481203, 0.77

In [178]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [110]:
# 49
column_to_drop_48 = '중기부채부담지표'

In [111]:
# Build comp_49 by removing `column_to_drop_48` from comp_48.
# A 'Cat_'-prefixed name stands for a group of columns sharing that prefix, so every
# column starting with it is removed; any other name is dropped as a single column.
if column_to_drop_48.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the raw name, so characters
    # such as '(' or '.' in a column name cannot change which columns are selected.
    drop_cols_49 = [col for col in comp_48.columns if str(col).startswith(column_to_drop_48)]
else:
    drop_cols_49 = [column_to_drop_48]
comp_49 = comp_48.drop(columns=drop_cols_49)
# Separate the remaining predictors from the label column.
X_49 = comp_49.drop(columns='target')
y_49 = comp_49['target']

print(X_49.shape)

(6119, 77)


In [181]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [182]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from ``trial``, fits a ``GradientBoostingClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and scores
    column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns the ROC-AUC computed on the validation split.
    """
    # The suggestion order is kept exactly as before.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize objective(); the TPE sampler is seeded.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(), so the
# Hyperband pruner configured here presumably never prunes a trial — confirm or drop it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook; presumably it stops
# the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials caps the search at 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [183]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7917678778620858


In [184]:
optuna_49 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)

In [185]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(auc_49)

0.7795545326384141


In [186]:
X_train = X_train.values
y_train = y_train.values

In [187]:
auc_bootstrap = []

In [188]:
# bootstrap_auc reads `rs` (and appends to `auc_bootstrap`) from notebook scope.
rs = RandomState(seed = 49)
# bootstrap_auc refits whatever estimator it receives on every resample, so pass an
# unfitted clone: optuna_49 then stays the model fitted in the earlier cell.
# np.asarray keeps the positional indexing inside bootstrap_auc valid even if
# X_train / y_train are still pandas objects.
bootstrap_auc(sklearn.base.clone(optuna_49), np.asarray(X_train), np.asarray(y_train), X_test, y_test, nsamples=2000)

array([0.76099384, 0.78004695])

In [189]:
np.mean(auc_bootstrap)

0.7706954613540242

In [190]:
t_49 = auc_bootstrap
print(t_49)

[0.7748352593130553, 0.7684071684894054, 0.7678024072966507, 0.7714803699589883, 0.7716432416267942, 0.7815824184039645, 0.7730316558441558, 0.7799857420539987, 0.7762290135850992, 0.7697074718045113, 0.7725644010594669, 0.7706606715652768, 0.7679719540328094, 0.7647665862098428, 0.7664740686944634, 0.7696607463260424, 0.77796186132946, 0.7704470693779903, 0.7773958155331508, 0.7678397876794258, 0.7715698158749147, 0.7684325337491457, 0.7789604515550239, 0.7689905694634314, 0.7734468450956938, 0.7602715951811345, 0.7692455570745045, 0.7694671693438141, 0.7679399137047163, 0.7710932159945318, 0.7731424619788108, 0.7775386619958989, 0.7744721355946684, 0.7652285009398496, 0.7716766169685578, 0.774424075102529, 0.7787615345181134, 0.763659859876965, 0.7670013990943266, 0.7742625384483937, 0.7721024863294601, 0.7673631877990432, 0.7728594390806562, 0.7703389332706767, 0.7685032894736843, 0.7663979729152426, 0.7666142451298702, 0.7697141468728639, 0.7660348491968558, 0.77487130468216, 0.772

In [191]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [112]:
# 50
column_to_drop_49 = 'Cat_이사 계획 중인 거주 지역'

In [113]:
# Build comp_50 by removing `column_to_drop_49` from comp_49.
# A 'Cat_'-prefixed name stands for a group of columns sharing that prefix, so every
# column starting with it is removed; any other name is dropped as a single column.
if column_to_drop_49.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the raw name, so characters
    # such as '(' or '.' in a column name cannot change which columns are selected.
    drop_cols_50 = [col for col in comp_49.columns if str(col).startswith(column_to_drop_49)]
else:
    drop_cols_50 = [column_to_drop_49]
comp_50 = comp_49.drop(columns=drop_cols_50)
# Separate the remaining predictors from the label column.
X_50 = comp_50.drop(columns='target')
y_50 = comp_50['target']

print(X_50.shape)

(6119, 70)


In [194]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [195]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from ``trial``, fits a ``GradientBoostingClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and scores
    column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns the ROC-AUC computed on the validation split.
    """
    # The suggestion order is kept exactly as before.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize objective(); the TPE sampler is seeded.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(), so the
# Hyperband pruner configured here presumably never prunes a trial — confirm or drop it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook; presumably it stops
# the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials caps the search at 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [196]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7895266237614671


In [197]:
optuna_50 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [198]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(auc_50)

0.7787922398325358


In [199]:
X_train = X_train.values
y_train = y_train.values

In [200]:
auc_bootstrap = []

In [201]:
# bootstrap_auc reads `rs` (and appends to `auc_bootstrap`) from notebook scope.
rs = RandomState(seed = 50)
# bootstrap_auc refits whatever estimator it receives on every resample, so pass an
# unfitted clone: optuna_50 then stays the model fitted in the earlier cell.
# np.asarray keeps the positional indexing inside bootstrap_auc valid even if
# X_train / y_train are still pandas objects.
bootstrap_auc(sklearn.base.clone(optuna_50), np.asarray(X_train), np.asarray(y_train), X_test, y_test, nsamples=2000)

array([0.76040376, 0.77922585])

In [202]:
np.mean(auc_bootstrap)

0.7701033367331681

In [203]:
t_50 = auc_bootstrap
print(t_50)

[0.7735977016404647, 0.78592254784689, 0.7687168916609706, 0.7679198884996582, 0.7713428635509227, 0.7724135445146958, 0.7604825273410799, 0.7681001153451812, 0.7750234962406014, 0.7731758373205743, 0.7742772235987697, 0.7765026913875598, 0.7791994190020506, 0.7689999145591251, 0.7675821300410117, 0.7654447731544771, 0.7732546031271359, 0.7677943972146275, 0.7670361094497608, 0.7666182501708817, 0.7707541225222145, 0.7694377990430622, 0.7735242758885852, 0.7705658855946684, 0.7680360346889953, 0.768927823820916, 0.760650739063568, 0.7767723641490089, 0.7684178485987696, 0.7666956809637732, 0.7736497671736158, 0.770233467190704, 0.7751302973342448, 0.7656543703007519, 0.773942135167464, 0.770782157809296, 0.7656864106288448, 0.7672483766233766, 0.768082760167464, 0.7713375234962405, 0.7628174662508544, 0.771266767771702, 0.7707768177546138, 0.7672256813909775, 0.7716138713260423, 0.7726738721804511, 0.7665288042549556, 0.7684832642686262, 0.7706286312371837, 0.7739154348940533, 0.774538

In [204]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [114]:
# 51
column_to_drop_50 = '소득 대비 주택 임대료의 비율'

In [115]:
# Build comp_51 by removing `column_to_drop_50` from comp_50.
# A 'Cat_'-prefixed name stands for a group of columns sharing that prefix, so every
# column starting with it is removed; any other name is dropped as a single column.
if column_to_drop_50.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the raw name, so characters
    # such as '(' or '.' in a column name cannot change which columns are selected.
    drop_cols_51 = [col for col in comp_50.columns if str(col).startswith(column_to_drop_50)]
else:
    drop_cols_51 = [column_to_drop_50]
comp_51 = comp_50.drop(columns=drop_cols_51)
# Separate the remaining predictors from the label column.
X_51 = comp_51.drop(columns='target')
y_51 = comp_51['target']

print(X_51.shape)

(6119, 69)


In [116]:
alll = list(X_51)
alll

['현재 주택의 면적(㎡)',
 '장기부채부담지표',
 'Cat_현재 거주 지역_강원도',
 'Cat_현재 거주 지역_경기도',
 'Cat_현재 거주 지역_경상남도',
 'Cat_현재 거주 지역_경상북도',
 'Cat_현재 거주 지역_광주광역시',
 'Cat_현재 거주 지역_대구광역시',
 'Cat_현재 거주 지역_대전광역시',
 'Cat_현재 거주 지역_부산광역시',
 'Cat_현재 거주 지역_서울특별시',
 'Cat_현재 거주 지역_세종특별자치시',
 'Cat_현재 거주 지역_울산광역시',
 'Cat_현재 거주 지역_인천광역시',
 'Cat_현재 거주 지역_전라남도',
 'Cat_현재 거주 지역_전라북도',
 'Cat_현재 거주 지역_제주특별자치도',
 'Cat_현재 거주 지역_충청남도',
 'Cat_현재 거주 지역_충청북도',
 'Cat_현재 주택의 유형_고시원',
 'Cat_현재 주택의 유형_기타',
 'Cat_현재 주택의 유형_다가구 단독주택',
 'Cat_현재 주택의 유형_다세대주택',
 'Cat_현재 주택의 유형_비거주용 건물 내 주택',
 'Cat_현재 주택의 유형_아파트',
 'Cat_현재 주택의 유형_연립주택',
 'Cat_현재 주택의 유형_영업겸용 단독주택',
 'Cat_현재 주택의 유형_오피스텔',
 'Cat_현재 주택의 유형_일반 단독주택',
 'Cat_현재 주택의 유형_판잣집 비닐하우스 컨테이너 움막',
 'Cat_현재 주택의 점유형태_무상',
 'Cat_현재 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_현재 주택의 점유형태_보증금 있는 월세',
 'Cat_현재 주택의 점유형태_전세',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 자가',
 'Cat_이사 계획 중인 주택

In [207]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [208]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from ``trial``, fits a ``GradientBoostingClassifier``
    (``random_state=0``) on the notebook-level ``X_train`` / ``y_train`` and scores
    column 1 of ``predict_proba`` on ``X_val`` against ``y_val``.

    Returns the ROC-AUC computed on the validation split.
    """
    # The suggestion order is kept exactly as before.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize objective(); the TPE sampler is seeded.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report() / trial.should_prune(), so the
# Hyperband pruner configured here presumably never prunes a trial — confirm or drop it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined near the top of the notebook; presumably it stops
# the study after 10 trials without improvement — verify against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# n_trials caps the search at 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [209]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7880929723954291


In [210]:
optuna_51 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [211]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(auc_51)

0.7785826426862611


In [212]:
X_train = X_train.values
y_train = y_train.values

In [213]:
auc_bootstrap = []

In [214]:
# bootstrap_auc reads `rs` (and appends to `auc_bootstrap`) from notebook scope.
rs = RandomState(seed = 51)
# bootstrap_auc refits whatever estimator it receives on every resample, so pass an
# unfitted clone: optuna_51 then stays the model fitted in the earlier cell.
# np.asarray keeps the positional indexing inside bootstrap_auc valid even if
# X_train / y_train are still pandas objects.
bootstrap_auc(sklearn.base.clone(optuna_51), np.asarray(X_train), np.asarray(y_train), X_test, y_test, nsamples=2000)

array([0.76275529, 0.78030625])

In [215]:
np.mean(auc_bootstrap)

0.7720277669493336

In [216]:
t_51 = auc_bootstrap
print(t_51)

[0.7687769672761449, 0.7797587897300067, 0.7730783813226247, 0.775808484278879, 0.7682469668489404, 0.7641484748803827, 0.7724989853896104, 0.7750448564593302, 0.782320680963773, 0.7717033172419685, 0.7732799683868763, 0.777944506151743, 0.7756629677887903, 0.7745602464969241, 0.7733373739747094, 0.7710972210355433, 0.7747885338345865, 0.7719449547163364, 0.7729715802289814, 0.7688704182330828, 0.7681388307416268, 0.7720544258373205, 0.7671415755297334, 0.7611860795454546, 0.7780219369446342, 0.7725964413875599, 0.7732412529904307, 0.7670280993677375, 0.7735202708475735, 0.7703242481203008, 0.7702561624231032, 0.77801125683527, 0.7696180258885852, 0.7678397876794258, 0.768117470522898, 0.7715578007518796, 0.7731117566643882, 0.7761462427375256, 0.7673992331681476, 0.7741971227785373, 0.7772663192071086, 0.7787802247095009, 0.7714403195488722, 0.770982409859877, 0.7753692647812713, 0.7758645548530416, 0.7725110005126453, 0.7768324397641831, 0.7756829929938482, 0.7741383821770336, 0.7729

In [217]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [218]:
# 52
column_to_drop_51 = 'Cat_현재 주택의 유형'

In [219]:
# Build comp_52 by removing `column_to_drop_51` from comp_51.
# A 'Cat_'-prefixed name stands for a group of columns sharing that prefix, so every
# column starting with it is removed; any other name is dropped as a single column.
if column_to_drop_51.startswith('Cat_'):
    # Literal prefix match instead of a regex built from the raw name, so characters
    # such as '(' or '.' in a column name cannot change which columns are selected.
    drop_cols_52 = [col for col in comp_51.columns if str(col).startswith(column_to_drop_51)]
else:
    drop_cols_52 = [column_to_drop_51]
comp_52 = comp_51.drop(columns=drop_cols_52)
# Separate the remaining predictors from the label column.
X_52 = comp_52.drop(columns='target')
y_52 = comp_52['target']

print(X_52.shape)

(6119, 58)


In [220]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [221]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one GradientBoostingClassifier configuration.

    Draws six hyper-parameters from ``trial``, fits the model on the module-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ``roc_auc_score`` of the second ``predict_proba`` column on the validation split.
    """
    # Keep the suggestion order stable; it determines what the seeded sampler draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by objective.
# NOTE(review): objective never reports intermediate values, so the Hyperband pruner
# cannot stop a trial early -- confirm it is wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback(10, ...) may end the search sooner.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [222]:
# Report the best trial: its hyper-parameters, then its objective value (validation AUC).
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7862816050217448


In [223]:
# Refit the best configuration on the training split (validation rows are not added back).
optuna_52 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_52.fit(X_train, y_train)

In [224]:
# Held-out test AUC, scored on the second predict_proba column.
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_true=y_test, y_score=optuna_proba_52)
print(auc_52)

0.7782275290498974


In [225]:
# bootstrap_auc indexes its training data positionally (X_train[idx]), so hand it NumPy
# arrays. The isinstance guards keep the cell re-runnable: calling .values on an
# already-converted ndarray raises AttributeError.
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.values
if isinstance(y_train, pd.Series):
    y_train = y_train.values

In [226]:
# Fresh module-level list; bootstrap_auc appends one AUC per resample to it.
auc_bootstrap = []

In [227]:
# Start from an empty accumulator: bootstrap_auc appends every resample's AUC to the
# module-level auc_bootstrap, so re-running this cell would otherwise stack results.
auc_bootstrap = []
rs = RandomState(seed = 52)
# bootstrap_auc calls clf.fit on the estimator it receives; pass an unfitted clone with
# the same hyper-parameters so optuna_52 keeps the fit that produced auc_52.
bootstrap_auc(sklearn.base.clone(optuna_52), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76250945, 0.7802666 ])

In [228]:
# Mean AUC over the bootstrap refits currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7714282543628247

In [229]:
# Snapshot this model's bootstrap AUCs. Copy instead of aliasing: bootstrap_auc appends to
# auc_bootstrap in place, so an alias would change if the bootstrap were run again.
t_52 = list(auc_bootstrap)
print(t_52)

[0.7719035692925496, 0.7751556625939849, 0.7756976781442242, 0.7637853511619959, 0.7779605263157895, 0.7687689571941216, 0.7710438204887218, 0.7760194164388243, 0.7622367353041695, 0.7708622586295284, 0.7752584586466165, 0.7708729387388926, 0.7717220074333562, 0.7685780502392343, 0.7698636684039645, 0.7592503097231715, 0.7692535671565277, 0.7670975200786057, 0.7751449824846207, 0.7754547056561859, 0.7757217083902939, 0.768866413192071, 0.7704056839542037, 0.7758485346889952, 0.769162786226931, 0.7716405715994532, 0.7721038213431306, 0.765938728212577, 0.7699050538277512, 0.7734949055878333, 0.7666556305536569, 0.7774732463260424, 0.7710237952836636, 0.7779511812200958, 0.7738273239917978, 0.7749420604066986, 0.7770366968557757, 0.7760648069036227, 0.770354953434723, 0.7774064956425154, 0.7702588324504442, 0.7795798978981545, 0.776678913192071, 0.7631218493677374, 0.779413021189337, 0.7811979344668489, 0.771051830570745, 0.7722413277511961, 0.7683684530929596, 0.7712881279904307, 0.7738

In [230]:
# Drop this step's splits, study and validation score from the notebook namespace.
del X_train, X_test, y_train, y_test
del X_val, y_val
del study, optuna_auc

In [231]:
# 53
# Feature removed to build comp_53; the 'Cat_' prefix makes the next cell drop every
# column whose name starts with this string.
column_to_drop_52 = 'Cat_이사 계획 중인 주택의 점유형태'

In [232]:
# Build feature set 53 by removing column_to_drop_52 from comp_52.
if column_to_drop_52.startswith('Cat_'):
    # A 'Cat_' name stands for a group of columns sharing that prefix. Match it literally:
    # feeding the raw name to filter(regex=...) would treat characters such as '(' or '.'
    # in a column name as regex syntax.
    comp_53 = comp_52.drop(columns=[col for col in comp_52.columns if str(col).startswith(column_to_drop_52)])
else:
    # Any other name is a single column, dropped by exact label.
    comp_53 = comp_52.drop(columns=column_to_drop_52)
X_53 = comp_53.drop(columns='target')
y_53 = comp_53['target']

print(X_53.shape)

(6119, 38)


In [233]:
# Stratified 80/20 hold-out, then 20% of the remaining rows become the validation split
# scored by the Optuna objective. Both splits use the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_53, y_53, test_size=0.2, random_state=0, shuffle=True, stratify=y_53
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, shuffle=True, stratify=y_train
)

In [234]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one GradientBoostingClassifier configuration.

    Draws six hyper-parameters from ``trial``, fits the model on the module-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ``roc_auc_score`` of the second ``predict_proba`` column on the validation split.
    """
    # Keep the suggestion order stable; it determines what the seeded sampler draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by objective.
# NOTE(review): objective never reports intermediate values, so the Hyperband pruner
# cannot stop a trial early -- confirm it is wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback(10, ...) may end the search sooner.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [235]:
# Report the best trial: its hyper-parameters, then its objective value (validation AUC).
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7868680039065435


In [236]:
# Refit the best configuration on the training split (validation rows are not added back).
optuna_53 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_53.fit(X_train, y_train)

In [237]:
# Held-out test AUC, scored on the second predict_proba column.
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_true=y_test, y_score=optuna_proba_53)
print(auc_53)

0.7755441515721121


In [238]:
# bootstrap_auc indexes its training data positionally (X_train[idx]), so hand it NumPy
# arrays. The isinstance guards keep the cell re-runnable: calling .values on an
# already-converted ndarray raises AttributeError.
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.values
if isinstance(y_train, pd.Series):
    y_train = y_train.values

In [239]:
# Fresh module-level list; bootstrap_auc appends one AUC per resample to it.
auc_bootstrap = []

In [240]:
# Start from an empty accumulator: bootstrap_auc appends every resample's AUC to the
# module-level auc_bootstrap, so re-running this cell would otherwise stack results.
auc_bootstrap = []
rs = RandomState(seed = 53)
# bootstrap_auc calls clf.fit on the estimator it receives; pass an unfitted clone with
# the same hyper-parameters so optuna_53 keeps the fit that produced auc_53.
bootstrap_auc(sklearn.base.clone(optuna_53), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75853885, 0.7765923 ])

In [241]:
# Mean AUC over the bootstrap refits currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7674011395676691

In [242]:
# Snapshot this model's bootstrap AUCs. Copy instead of aliasing: bootstrap_auc appends to
# auc_bootstrap in place, so an alias would change if the bootstrap were run again.
t_53 = list(auc_bootstrap)
print(t_53)

[0.7631458796138073, 0.7742985838174982, 0.7672136662679426, 0.7681281506322625, 0.7694110987696515, 0.7623181711380725, 0.762037818267259, 0.7696487312030076, 0.7701920817669173, 0.7699397641831853, 0.7637279455741628, 0.7676475457108681, 0.7579393262987013, 0.7717473726930963, 0.7726992374401913, 0.7683390827922078, 0.7727432928913192, 0.7668892579460013, 0.7685833902939165, 0.7654848235645934, 0.7638013713260424, 0.7631765849282296, 0.7614971377306903, 0.7611740644224197, 0.7663899628332195, 0.7692508971291865, 0.765924043062201, 0.7661296351674641, 0.7691908215140124, 0.7647812713602187, 0.7650229088345865, 0.7612020997095009, 0.7681308206596036, 0.7681682010423787, 0.7746229921394395, 0.7672150012816131, 0.7633327815276828, 0.7686674961551606, 0.7771047825529733, 0.7694458091250853, 0.7600940383629529, 0.7659614234449761, 0.7606293788448394, 0.7681575209330145, 0.7658906677204375, 0.771704652255639, 0.7624329823137389, 0.7710918809808612, 0.7685286547334245, 0.7658799876110731, 0.

In [243]:
# Drop this step's splits, study and validation score from the notebook namespace.
del X_train, X_test, y_train, y_test
del X_val, y_val
del study, optuna_auc

In [244]:
# 54
# Feature removed to build comp_54; the 'Cat_' prefix makes the next cell drop every
# column whose name starts with this string.
column_to_drop_53 = 'Cat_주택 마련 예상 소요연수'

In [245]:
# Build feature set 54 by removing column_to_drop_53 from comp_53.
if column_to_drop_53.startswith('Cat_'):
    # A 'Cat_' name stands for a group of columns sharing that prefix. Match it literally:
    # feeding the raw name to filter(regex=...) would treat characters such as '(' or '.'
    # in a column name as regex syntax.
    comp_54 = comp_53.drop(columns=[col for col in comp_53.columns if str(col).startswith(column_to_drop_53)])
else:
    # Any other name is a single column, dropped by exact label.
    comp_54 = comp_53.drop(columns=column_to_drop_53)
X_54 = comp_54.drop(columns='target')
y_54 = comp_54['target']

print(X_54.shape)

(6119, 32)


In [246]:
# Stratified 80/20 hold-out, then 20% of the remaining rows become the validation split
# scored by the Optuna objective. Both splits use the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_54, y_54, test_size=0.2, random_state=0, shuffle=True, stratify=y_54
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, shuffle=True, stratify=y_train
)

In [247]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one GradientBoostingClassifier configuration.

    Draws six hyper-parameters from ``trial``, fits the model on the module-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ``roc_auc_score`` of the second ``predict_proba`` column on the validation split.
    """
    # Keep the suggestion order stable; it determines what the seeded sampler draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by objective.
# NOTE(review): objective never reports intermediate values, so the Hyperband pruner
# cannot stop a trial early -- confirm it is wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback(10, ...) may end the search sooner.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [248]:
# Report the best trial: its hyper-parameters, then its objective value (validation AUC).
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7834706466664996


In [249]:
# Refit the best configuration on the training split (validation rows are not added back).
optuna_54 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_54.fit(X_train, y_train)

In [250]:
# Held-out test AUC, scored on the second predict_proba column.
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_true=y_test, y_score=optuna_proba_54)
print(auc_54)

0.7644982484620643


In [251]:
# bootstrap_auc indexes its training data positionally (X_train[idx]), so hand it NumPy
# arrays. The isinstance guards keep the cell re-runnable: calling .values on an
# already-converted ndarray raises AttributeError.
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.values
if isinstance(y_train, pd.Series):
    y_train = y_train.values

In [252]:
# Fresh module-level list; bootstrap_auc appends one AUC per resample to it.
auc_bootstrap = []

In [253]:
# Start from an empty accumulator: bootstrap_auc appends every resample's AUC to the
# module-level auc_bootstrap, so re-running this cell would otherwise stack results.
auc_bootstrap = []
rs = RandomState(seed = 54)
# bootstrap_auc calls clf.fit on the estimator it receives; pass an unfitted clone with
# the same hyper-parameters so optuna_54 keeps the fit that produced auc_54.
bootstrap_auc(sklearn.base.clone(optuna_54), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74820598, 0.76734587])

In [254]:
# Mean AUC over the bootstrap refits currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7581450819431391

In [255]:
# Snapshot this model's bootstrap AUCs. Copy instead of aliasing: bootstrap_auc appends to
# auc_bootstrap in place, so an alias would change if the bootstrap were run again.
t_54 = list(auc_bootstrap)
print(t_54)

[0.7514751901059467, 0.7471444057587149, 0.7573439102016404, 0.7526006066302119, 0.7652084757347914, 0.755871390123035, 0.755055696770335, 0.756956756237184, 0.7677463367224882, 0.7624423274094326, 0.7581596035543404, 0.7537700786056049, 0.7559328007518797, 0.7546618677375256, 0.7556965033321942, 0.7596067583732056, 0.7546471825871497, 0.7589606117566644, 0.7615144929084074, 0.7549115152939166, 0.7636198094668488, 0.7503911590054682, 0.7613342660628845, 0.7575374871838687, 0.7624289772727273, 0.7537660735645932, 0.7651937905844155, 0.7540290712576897, 0.7543588196343131, 0.7495554404477103, 0.7558940853554339, 0.7606320488721805, 0.7588004101161996, 0.7567351439678742, 0.7576496283321942, 0.7493084629186603, 0.7599805622009569, 0.760200839456596, 0.7565522470950102, 0.7604625021360218, 0.7536579374572795, 0.7588150952665755, 0.7610151977956253, 0.7619563824333562, 0.7511134014012302, 0.7618108659432673, 0.7568766554169515, 0.7517795732228298, 0.7539476354237867, 0.7544789708646618, 0.7

In [256]:
# Drop this step's splits, study and validation score from the notebook namespace.
del X_train, X_test, y_train, y_test
del X_val, y_val
del study, optuna_auc

In [257]:
# 55
# Feature removed to build comp_55; without a 'Cat_' prefix the next cell drops it as a
# single column by exact name.
column_to_drop_54 = '장기부채부담지표'

In [258]:
# Build feature set 55 by removing column_to_drop_54 from comp_54.
if column_to_drop_54.startswith('Cat_'):
    # A 'Cat_' name stands for a group of columns sharing that prefix. Match it literally:
    # feeding the raw name to filter(regex=...) would treat characters such as '(' or '.'
    # in a column name as regex syntax.
    comp_55 = comp_54.drop(columns=[col for col in comp_54.columns if str(col).startswith(column_to_drop_54)])
else:
    # Any other name is a single column, dropped by exact label.
    comp_55 = comp_54.drop(columns=column_to_drop_54)
X_55 = comp_55.drop(columns='target')
y_55 = comp_55['target']

print(X_55.shape)

(6119, 31)


In [259]:
# Stratified 80/20 hold-out, then 20% of the remaining rows become the validation split
# scored by the Optuna objective. Both splits use the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_55, y_55, test_size=0.2, random_state=0, shuffle=True, stratify=y_55
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, shuffle=True, stratify=y_train
)

In [260]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one GradientBoostingClassifier configuration.

    Draws six hyper-parameters from ``trial``, fits the model on the module-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ``roc_auc_score`` of the second ``predict_proba`` column on the validation split.
    """
    # Keep the suggestion order stable; it determines what the seeded sampler draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by objective.
# NOTE(review): objective never reports intermediate values, so the Hyperband pruner
# cannot stop a trial early -- confirm it is wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback(10, ...) may end the search sooner.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [261]:
# Report the best trial: its hyper-parameters, then its objective value (validation AUC).
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7794952378567435


In [262]:
# Refit the best configuration on the training split (validation rows are not added back).
optuna_55 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_55.fit(X_train, y_train)

In [263]:
# Held-out test AUC, scored on the second predict_proba column.
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_true=y_test, y_score=optuna_proba_55)
print(auc_55)

0.7592075892857143


In [264]:
# bootstrap_auc indexes its training data positionally (X_train[idx]), so hand it NumPy
# arrays. The isinstance guards keep the cell re-runnable: calling .values on an
# already-converted ndarray raises AttributeError.
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.values
if isinstance(y_train, pd.Series):
    y_train = y_train.values

In [265]:
# Fresh module-level list; bootstrap_auc appends one AUC per resample to it.
auc_bootstrap = []

In [266]:
# Start from an empty accumulator: bootstrap_auc appends every resample's AUC to the
# module-level auc_bootstrap, so re-running this cell would otherwise stack results.
auc_bootstrap = []
rs = RandomState(seed = 55)
# bootstrap_auc calls clf.fit on the estimator it receives; pass an unfitted clone with
# the same hyper-parameters so optuna_55 keeps the fit that produced auc_55.
bootstrap_auc(sklearn.base.clone(optuna_55), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74422643, 0.76077953])

In [267]:
# Mean AUC over the bootstrap refits currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7527308045059381

In [268]:
# Snapshot this model's bootstrap AUCs. Copy instead of aliasing: bootstrap_auc appends to
# auc_bootstrap in place, so an alias would change if the bootstrap were run again.
t_55 = list(auc_bootstrap)
print(t_55)

[0.7481470010252904, 0.7540664516404647, 0.7494032488892686, 0.7552893241626795, 0.7514244595864661, 0.7581729536910458, 0.7498798487696514, 0.7539449653964457, 0.7567685193096376, 0.7639281976247437, 0.7610325529733424, 0.7554174854750513, 0.7515900012816131, 0.7568713153622694, 0.7510226204716337, 0.7589606117566644, 0.7594105113636365, 0.7474007283834586, 0.7555189465140122, 0.7561290477614491, 0.7544669557416267, 0.7476650610902256, 0.7467986372180451, 0.7497717126623377, 0.7527487931476419, 0.753656602443609, 0.748778462491456, 0.7587376644736843, 0.7544736308099795, 0.7556684680451129, 0.7570875875768968, 0.7417189102016405, 0.7479387388926863, 0.7565696022727273, 0.7569167058270677, 0.7475168745727957, 0.7534136299555707, 0.7473526678913192, 0.7538702046308955, 0.756663053229665, 0.7530304810321258, 0.7462526166267942, 0.7495247351332877, 0.7567952195830485, 0.7534336551606289, 0.7532440832194122, 0.7506982121496923, 0.75848000683527, 0.7569313909774437, 0.7522334778708133, 0.75

In [269]:
# Drop this step's splits, study and validation score from the notebook namespace.
del X_train, X_test, y_train, y_test
del X_val, y_val
del study, optuna_auc

In [270]:
# 56
# Feature removed to build comp_56; the 'Cat_' prefix makes the next cell drop every
# column whose name starts with this string.
column_to_drop_55 = 'Cat_현재 거주 지역'

In [271]:
# Build feature set 56 by removing column_to_drop_55 from comp_55.
if column_to_drop_55.startswith('Cat_'):
    # A 'Cat_' name stands for a group of columns sharing that prefix. Match it literally:
    # feeding the raw name to filter(regex=...) would treat characters such as '(' or '.'
    # in a column name as regex syntax.
    comp_56 = comp_55.drop(columns=[col for col in comp_55.columns if str(col).startswith(column_to_drop_55)])
else:
    # Any other name is a single column, dropped by exact label.
    comp_56 = comp_55.drop(columns=column_to_drop_55)
X_56 = comp_56.drop(columns='target')
y_56 = comp_56['target']

print(X_56.shape)

(6119, 14)


In [272]:
# Stratified 80/20 hold-out, then 20% of the remaining rows become the validation split
# scored by the Optuna objective. Both splits use the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_56, y_56, test_size=0.2, random_state=0, shuffle=True, stratify=y_56
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, shuffle=True, stratify=y_train
)

In [273]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one GradientBoostingClassifier configuration.

    Draws six hyper-parameters from ``trial``, fits the model on the module-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ``roc_auc_score`` of the second ``predict_proba`` column on the validation split.
    """
    # Keep the suggestion order stable; it determines what the seeded sampler draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by objective.
# NOTE(review): objective never reports intermediate values, so the Hyperband pruner
# cannot stop a trial early -- confirm it is wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback(10, ...) may end the search sooner.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [274]:
# Report the best trial: its hyper-parameters, then its objective value (validation AUC).
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7635205636107145


In [275]:
# Refit the best configuration on the training split (validation rows are not added back).
optuna_56 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_56.fit(X_train, y_train)

In [276]:
# Held-out test AUC, scored on the second predict_proba column.
optuna_proba_56 = optuna_56.predict_proba(X_test)[:, 1]
auc_56 = roc_auc_score(y_true=y_test, y_score=optuna_proba_56)
print(auc_56)

0.7463153622693096


In [277]:
# bootstrap_auc indexes its training data positionally (X_train[idx]), so hand it NumPy
# arrays. The isinstance guards keep the cell re-runnable: calling .values on an
# already-converted ndarray raises AttributeError.
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.values
if isinstance(y_train, pd.Series):
    y_train = y_train.values

In [278]:
# Fresh module-level list; bootstrap_auc appends one AUC per resample to it.
auc_bootstrap = []

In [279]:
# Start from an empty accumulator: bootstrap_auc appends every resample's AUC to the
# module-level auc_bootstrap, so re-running this cell would otherwise stack results.
auc_bootstrap = []
rs = RandomState(seed = 56)
# bootstrap_auc calls clf.fit on the estimator it receives; pass an unfitted clone with
# the same hyper-parameters so optuna_56 keeps the fit that produced auc_56.
bootstrap_auc(sklearn.base.clone(optuna_56), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.72990557, 0.74734956])

In [280]:
# Mean AUC over the bootstrap refits currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7387879858114748

In [281]:
# Snapshot this model's bootstrap AUCs. Copy instead of aliasing: bootstrap_auc appends to
# auc_bootstrap in place, so an alias would change if the bootstrap were run again.
t_56 = list(auc_bootstrap)
print(t_56)

[0.7444249829118252, 0.7451031698564593, 0.7312417229152426, 0.7439684082365003, 0.745643850393028, 0.7404052567498292, 0.7392584800068354, 0.7469361436261108, 0.7357513990943269, 0.7445624893198906, 0.7344070403280929, 0.7413704716336295, 0.7417562905844155, 0.7380062371838688, 0.7404720074333561, 0.7407283300580998, 0.7319439401059467, 0.7454809787252222, 0.7388739960697196, 0.742605359278879, 0.7455717596548188, 0.7338303144224196, 0.7457546565276829, 0.7361932886192755, 0.7459589136192755, 0.7380850029904307, 0.7446479301948051, 0.7368888307416268, 0.7268188226247437, 0.7387658599624061, 0.7436600200786057, 0.7378740708304853, 0.738847295796309, 0.7408271210697198, 0.7364976717361585, 0.7406282040328093, 0.7298239383971293, 0.7371544984620644, 0.743223470608339, 0.744219390806562, 0.745211305963773, 0.7416748547505128, 0.7375950529733426, 0.7400915285372522, 0.7341106672932332, 0.7371838687628162, 0.740298455656186, 0.7400501431134654, 0.7440164687286397, 0.7427562158236501, 0.7319

In [282]:
# Drop this step's splits, study and validation score from the notebook namespace.
del X_train, X_test, y_train, y_test
del X_val, y_val
del study, optuna_auc

In [283]:
# 57
# Feature removed to build comp_57; the 'Cat_' prefix makes the next cell drop every
# column whose name starts with this string.
column_to_drop_56 = 'Cat_현재 주택의 점유형태'

In [284]:
# Build feature set 57 by removing column_to_drop_56 from comp_56.
if column_to_drop_56.startswith('Cat_'):
    # A 'Cat_' name stands for a group of columns sharing that prefix. Match it literally:
    # feeding the raw name to filter(regex=...) would treat characters such as '(' or '.'
    # in a column name as regex syntax.
    comp_57 = comp_56.drop(columns=[col for col in comp_56.columns if str(col).startswith(column_to_drop_56)])
else:
    # Any other name is a single column, dropped by exact label.
    comp_57 = comp_56.drop(columns=column_to_drop_56)
X_57 = comp_57.drop(columns='target')
y_57 = comp_57['target']

print(X_57.shape)

(6119, 10)


In [285]:
# Stratified 80/20 hold-out, then 20% of the remaining rows become the validation split
# scored by the Optuna objective. Both splits use the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_57, y_57, test_size=0.2, random_state=0, shuffle=True, stratify=y_57
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, shuffle=True, stratify=y_train
)

In [286]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one GradientBoostingClassifier configuration.

    Draws six hyper-parameters from ``trial``, fits the model on the module-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ``roc_auc_score`` of the second ``predict_proba`` column on the validation split.
    """
    # Keep the suggestion order stable; it determines what the seeded sampler draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by objective.
# NOTE(review): objective never reports intermediate values, so the Hyperband pruner
# cannot stop a trial early -- confirm it is wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback(10, ...) may end the search sooner.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [287]:
# Report the best trial: its hyper-parameters, then its objective value (validation AUC).
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'learning_rate': 0.02, 'n_estimators': 175, 'subsample': 0.1, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 9}
0.7544407716258066


In [288]:
# Refit the best configuration on the training split (validation rows are not added back).
optuna_57 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_57.fit(X_train, y_train)

In [289]:
# Held-out test AUC, scored on the second predict_proba column.
optuna_proba_57 = optuna_57.predict_proba(X_test)[:, 1]
auc_57 = roc_auc_score(y_true=y_test, y_score=optuna_proba_57)
print(auc_57)

0.7289081190191387


In [290]:
# bootstrap_auc indexes its training data positionally (X_train[idx]), so hand it NumPy
# arrays. The isinstance guards keep the cell re-runnable: calling .values on an
# already-converted ndarray raises AttributeError.
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.values
if isinstance(y_train, pd.Series):
    y_train = y_train.values

In [291]:
# Fresh module-level list; bootstrap_auc appends one AUC per resample to it.
auc_bootstrap = []

In [292]:
# Start from an empty accumulator: bootstrap_auc appends every resample's AUC to the
# module-level auc_bootstrap, so re-running this cell would otherwise stack results.
auc_bootstrap = []
rs = RandomState(seed = 57)
# bootstrap_auc calls clf.fit on the estimator it receives; pass an unfitted clone with
# the same hyper-parameters so optuna_57 keeps the fit that produced auc_57.
bootstrap_auc(sklearn.base.clone(optuna_57), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71026135, 0.73178694])

In [293]:
# Mean AUC over the bootstrap refits currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.7216364477422248

In [294]:
# Snapshot this model's bootstrap AUCs. Copy instead of aliasing: bootstrap_auc appends to
# auc_bootstrap in place, so an alias would change if the bootstrap were run again.
t_57 = list(auc_bootstrap)
print(t_57)

[0.7201103789302802, 0.7289067840054682, 0.7130428165584416, 0.7173388905502392, 0.7136876281613124, 0.7246360752734108, 0.7234999786397812, 0.7125995920198223, 0.7290416203861928, 0.7222717660628845, 0.7284996048359534, 0.7307050474196856, 0.724240911226931, 0.7183134505297334, 0.7137076533663705, 0.7197806305536568, 0.71823067968216, 0.7115649564251539, 0.7168235752734107, 0.722453327922078, 0.7241447902426521, 0.7213052161654137, 0.7236975606630212, 0.7219460227272727, 0.7175124423274095, 0.7087534176349967, 0.7182587149692412, 0.7150520121326043, 0.715229568950786, 0.723489298530417, 0.7152188888414218, 0.7222717660628845, 0.7111831425153794, 0.7211223192925496, 0.7228111115857827, 0.7154805515208476, 0.7337622287252221, 0.720888691900205, 0.7223291716507176, 0.7233397769993164, 0.7287639375427204, 0.7254557736671223, 0.7285476653280929, 0.7265932053144225, 0.717894256237184, 0.724023303998633, 0.7163629955570745, 0.7165191921565277, 0.7229125726247436, 0.7304060043574846, 0.723329

In [295]:
# Drop this step's splits, study and validation score from the notebook namespace.
del X_train, X_test, y_train, y_test
del X_val, y_val
del study, optuna_auc

In [296]:
# 58
# Feature removed to build comp_58; without a 'Cat_' prefix the next cell drops it as a
# single column by exact name.
column_to_drop_57 = '현재 주택의 면적(㎡)'

In [297]:
# Build feature set 58 by removing column_to_drop_57 from comp_57.
if column_to_drop_57.startswith('Cat_'):
    # A 'Cat_' name stands for a group of columns sharing that prefix. Match it literally:
    # feeding the raw name to filter(regex=...) would treat characters such as '(' or '.'
    # in a column name as regex syntax.
    comp_58 = comp_57.drop(columns=[col for col in comp_57.columns if str(col).startswith(column_to_drop_57)])
else:
    # Any other name is a single column, dropped by exact label.
    comp_58 = comp_57.drop(columns=column_to_drop_57)
X_58 = comp_58.drop(columns='target')
y_58 = comp_58['target']

print(X_58.shape)

(6119, 9)


In [298]:
# Stratified 80/20 hold-out, then 20% of the remaining rows become the validation split
# scored by the Optuna objective. Both splits use the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_58, y_58, test_size=0.2, random_state=0, shuffle=True, stratify=y_58
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, shuffle=True, stratify=y_train
)

In [299]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one GradientBoostingClassifier configuration.

    Draws six hyper-parameters from ``trial``, fits the model on the module-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ``roc_auc_score`` of the second ``predict_proba`` column on the validation split.
    """
    # Keep the suggestion order stable; it determines what the seeded sampler draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Seeded TPE search that maximises the value returned by objective.
# NOTE(review): objective never reports intermediate values, so the Hyperband pruner
# cannot stop a trial early -- confirm it is wanted.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback(10, ...) may end the search sooner.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [300]:
# Report the best trial: its hyper-parameters, then its objective value (validation AUC).
optuna_auc = study.best_trial.value
print(study.best_trial.params, optuna_auc, sep='\n')

{'learning_rate': 0.01, 'n_estimators': 127, 'subsample': 0.9, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7172639170610773


In [301]:
# Refit the best configuration on the training split (validation rows are not added back).
optuna_58 = GradientBoostingClassifier(random_state=0, **study.best_trial.params)
optuna_58.fit(X_train, y_train)

In [302]:
# Held-out test AUC, scored on the second predict_proba column.
optuna_proba_58 = optuna_58.predict_proba(X_test)[:, 1]
auc_58 = roc_auc_score(y_true=y_test, y_score=optuna_proba_58)
print(auc_58)

0.7064251537935747


In [303]:
# bootstrap_auc indexes its training data positionally (X_train[idx]), so hand it NumPy
# arrays. The isinstance guards keep the cell re-runnable: calling .values on an
# already-converted ndarray raises AttributeError.
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.values
if isinstance(y_train, pd.Series):
    y_train = y_train.values

In [304]:
# Fresh module-level list; bootstrap_auc appends one AUC per resample to it.
auc_bootstrap = []

In [305]:
# Start from an empty accumulator: bootstrap_auc appends every resample's AUC to the
# module-level auc_bootstrap, so re-running this cell would otherwise stack results.
auc_bootstrap = []
rs = RandomState(seed = 58)
# bootstrap_auc calls clf.fit on the estimator it receives; pass an unfitted clone with
# the same hyper-parameters so optuna_58 keeps the fit that produced auc_58.
bootstrap_auc(sklearn.base.clone(optuna_58), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.70263638, 0.7079444 ])

In [306]:
# Mean AUC over the bootstrap refits currently stored in auc_bootstrap.
np.mean(auc_bootstrap)

0.705342830175581

In [307]:
# Snapshot this model's bootstrap AUCs. Copy instead of aliasing: bootstrap_auc appends to
# auc_bootstrap in place, so an alias would change if the bootstrap were run again.
t_58 = list(auc_bootstrap)
print(t_58)

[0.7057362867395761, 0.7057362867395761, 0.7064812243677374, 0.7064251537935747, 0.7064251537935747, 0.7043585526315788, 0.7057229366028708, 0.7031249999999999, 0.7058243976418317, 0.7043585526315788, 0.7079443993506492, 0.7029995087149692, 0.7057362867395761, 0.7064251537935747, 0.7057362867395761, 0.7035148239917975, 0.7053384526657552, 0.7057229366028708, 0.7057229366028708, 0.7057229366028708, 0.7036696855775801, 0.7043585526315788, 0.7032237910116198, 0.7072875726247436, 0.7043879229323309, 0.702625704887218, 0.7043585526315788, 0.7043585526315788, 0.7058964883800409, 0.7072421821599453, 0.7044359834244702, 0.7064251537935747, 0.7071994617224879, 0.7023960825358851, 0.7055814251537935, 0.7064812243677374, 0.7064812243677374, 0.7058243976418317, 0.7059525589542036, 0.7028820275119616, 0.7057229366028708, 0.7064812243677374, 0.7079443993506492, 0.7057229366028708, 0.7057362867395761, 0.7057229366028708, 0.7071861115857826, 0.7079443993506492, 0.7032665114490771, 0.705066109876965, 0

In [308]:
# Drop this step's splits, study and validation score from the notebook namespace.
del X_train, X_test, y_train, y_test
del X_val, y_val
del study, optuna_auc