In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000,
                  random_state=None, scores=None):
    """Bootstrap a 95% percentile interval for the test-set ROC AUC of ``clf``.

    Each round draws ``X_train.shape[0]`` training rows with replacement, refits a
    private copy of ``clf`` on that resample and scores its class-1 probabilities
    on the fixed test set.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``. It is deep-copied once,
        so the caller's fitted model is left untouched.
    X_train, y_train : training data; NumPy arrays or pandas objects.
    X_test, y_test : evaluation data, never resampled.
    nsamples : number of bootstrap rounds (must be >= 1).
    random_state : ``numpy.random.RandomState`` or int seed. When omitted, the
        notebook-level global ``rs`` is used (legacy behaviour).
    scores : list that receives every per-round AUC. When omitted, the
        notebook-level global ``auc_bootstrap`` is used if it exists.

    Returns
    -------
    numpy.ndarray with the 2.5th and 97.5th percentiles of this call's AUCs.

    Raises
    ------
    ValueError
        If ``nsamples`` < 1, or no random state is passed and no global ``rs`` exists.
    """
    import copy  # local import keeps this cell runnable on its own

    if nsamples < 1:
        raise ValueError("nsamples must be at least 1")

    legacy = globals()
    rng = random_state if random_state is not None else legacy.get("rs")
    if rng is None:
        raise ValueError("pass random_state=... or define a global RandomState named `rs`")
    if isinstance(rng, (int, np.integer)):
        rng = np.random.RandomState(rng)
    sink = scores if scores is not None else legacy.get("auc_bootstrap")

    def _rows(data, idx):
        # Positional row selection for both pandas objects and array-likes.
        return data.iloc[idx] if hasattr(data, "iloc") else data[idx]

    model = copy.deepcopy(clf)
    # np.ravel accepts pandas Series as well as arrays.
    y_true = np.ravel(y_test)
    n_rows = X_train.shape[0]

    # The interval is computed from this call's scores only, so leftovers in the
    # shared accumulator cannot leak into it.
    run_scores = []
    for _ in range(nsamples):
        idx = rng.randint(n_rows, size=n_rows)
        model.fit(_rows(X_train, idx), _rows(y_train, idx))
        pred = model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_true, np.ravel(pred))
        run_scores.append(score)
        if sink is not None:
            sink.append(score)
    return np.percentile(run_scores, (2.5, 97.5))

In [4]:
class EarlyStoppingCallback(object):
    """Optuna callback that stops a study once its best value stops improving.

    After every trial the study's best value is compared with the best value
    this callback has seen so far. When ``early_stopping_rounds`` consecutive
    trials bring no improvement, ``study.stop()`` is called.

    Parameters
    ----------
    early_stopping_rounds : number of consecutive non-improving trials tolerated.
    direction : ``"minimize"`` or ``"maximize"``; must match the study's direction.

    Raises
    ------
    ValueError
        If ``direction`` is neither ``"minimize"`` nor ``"maximize"``.
    """

    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        # Consecutive trials seen without a new best value.
        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            # The exception must be raised; merely constructing it would leave
            # the instance without `_operator` and fail later in __call__.
            raise ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        """Update the patience counter after a trial and stop the study if exhausted."""
        try:
            best_value = study.best_value
        except ValueError:
            # Optuna raises ValueError while no trial has completed yet
            # (e.g. the first trials failed or were pruned): nothing to compare.
            return

        if self._operator(best_value, self._score):
            self._iter = 0
            self._score = best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [5]:
# Raise Optuna's log threshold to WARNING so only warnings and errors are printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [6]:
# Load the CP949-encoded survey extract and label the outcome question as `target`.
중장년가구 = pd.read_csv('중장년가구_변수추가.csv', encoding='cp949').rename(
    columns={'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?': 'target'}
)

In [7]:
# Replace every column label positionally with a short name.
# The "Cat_" prefix marks the features that later cells test with startswith('Cat_');
# the last label is the outcome column `target`.
# NOTE(review): assumes the CSV column order matches this list exactly - confirm against the file.
중장년가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [8]:
# Partition the columns by dtype: object columns are categorical, the rest numeric.
# The outcome is pulled out of the numeric part so it is not scaled.
cat = 중장년가구.select_dtypes(include=['object'])
num = 중장년가구.select_dtypes(exclude=['object'])
num_중장년 = num.drop(columns='target')
target = 중장년가구['target']

In [9]:
# Robust-scale every numeric feature.
# NOTE: the scaler is fitted on the full dataset, before the train/validation/test
# split made later in the notebook, so its statistics include held-out rows.
scaler = RobustScaler()
num_scaled_중장년 = scaler.fit_transform(num_중장년)
# Carry the source index over so the later column-wise concat aligns row-for-row
# even when the frame's index is not the default 0..n-1 range.
num_df_scaled_중장년 = pd.DataFrame(
    num_scaled_중장년, columns=num_중장년.columns, index=num_중장년.index
)

In [10]:
# One-hot encode every categorical column into dense indicator columns.
enc = OneHotEncoder()
X_cat = enc.fit_transform(cat).toarray()

new_feature_names = enc.get_feature_names_out(cat.columns)
# Keep the source index so the later column-wise concat aligns row-for-row
# even when the frame's index is not the default 0..n-1 range.
cat2 = pd.DataFrame(X_cat, columns=new_feature_names, index=cat.index)

In [11]:
# Assemble the modelling frame: scaled numerics, the outcome, then the dummy columns.
comp = pd.concat(
    [num_df_scaled_중장년, target, cat2],
    axis='columns',
)

In [12]:
# Separate predictors from the outcome; the cell displays the predictor shape.
X = comp.drop(columns='target')
y = comp['target']
X.shape

(19949, 214)

In [13]:
# Hold out 20% of the rows as the test set, stratified on the outcome.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y,
)

In [14]:
# Carve a validation set (20% of the remaining rows) out of the training portion.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_train,
)

In [15]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one gradient-boosting configuration.

    Samples six GradientBoostingClassifier hyper-parameters from ``trial``, fits on
    the notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    class-1 probabilities on ``X_val``/``y_val``.
    """
    # The suggest_* calls are kept in their original order.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Tune on the validation split: seeded TPE search for the highest AUC, at most 200
# trials, halted after 10 consecutive trials without a new best value.
# NOTE: `objective` never calls trial.report()/should_prune(), so no trial is pruned here.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    sampler=sampler,
    pruner=HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [16]:
# Hyper-parameters of the best trial.
print(study.best_params)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}


In [17]:
# Validation AUC reached by the best trial.
optuna_auc = study.best_value
print(optuna_auc)

0.7957493724735377


In [18]:
# Rebuild the classifier with the best hyper-parameters found by the study.
optuna_0 = GradientBoostingClassifier(random_state=0, **study.best_params)

In [19]:
# Fit the tuned model on the training split.
optuna_0.fit(X=X_train, y=y_train)

In [20]:
# Held-out test AUC of the tuned model (class-1 probabilities).
optuna_0_proba = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_true=y_test, y_score=optuna_0_proba)
print(auc_0)

0.798389346418903


In [21]:
# bootstrap_auc indexes rows positionally, so hand it plain ndarrays.
X_train, y_train = X_train.values, y_train.values

In [22]:
# Fresh accumulator for this round's bootstrap AUCs.
auc_bootstrap = list()

In [23]:
# Seed the resampling RNG, then bootstrap the test AUC interval (displayed as the cell output).
rs = RandomState(2024)
bootstrap_auc(
    clf=optuna_0,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    nsamples=2000,
)

array([0.789207  , 0.79726572])

In [24]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
t_0 = auc_bootstrap
print(auc_bootstrap)

[0.7901778449808007, 0.7964857890966266, 0.7914648462678019, 0.7922903550982369, 0.7906749994188418, 0.7928583051735761, 0.7952521487004246, 0.7922504665115502, 0.7915153013675181, 0.7951657674564079, 0.7952735458893094, 0.790602882967415, 0.7927238462952749, 0.7915797570969985, 0.7932902113936597, 0.7931126939747628, 0.7956116743062556, 0.7923492634083767, 0.7934679929753821, 0.7927864528849751, 0.7933533463090113, 0.7893856206664089, 0.7899142104807129, 0.7925080252666459, 0.792342923500559, 0.793204886800946, 0.7961563780529297, 0.7920700433015704, 0.7955326896213596, 0.7947586925419436, 0.794828431527939, 0.7951451627560001, 0.7949919483170715, 0.7929924998890516, 0.7919889453140685, 0.7971879338874412, 0.7971826506309265, 0.7959159898815071, 0.794161948718599, 0.7957984374240532, 0.7890630778561815, 0.7938365001172882, 0.7970289078663464, 0.7927188272015858, 0.7909827491108279, 0.7935094665390232, 0.7938486516072725, 0.7973564697702629, 0.790673678604713, 0.7913285382497206, 0.794

In [25]:
# Release this round's splits and study before the next round.
del X_train, y_train
del X_val, y_val
del X_test, y_test
del study, optuna_auc

In [13]:
# Elimination step 1: feature to remove.
column_to_drop = "부채 중 임대 보증금의 비중"

In [14]:
# Remove the feature chosen for this step from the working frame.
# A one-hot encoded categorical ("Cat_" prefix) lives in several "<name>_<category>"
# dummy columns; any other feature is a single column.
if column_to_drop.startswith('Cat_'):
    # Match the name literally (exact, or followed by "_") instead of as a regex
    # prefix, so pattern metacharacters and look-alike prefixes cannot select
    # extra columns.
    dropped_cols_1 = [
        col for col in comp.columns
        if col == column_to_drop or col.startswith(column_to_drop + '_')
    ]
    if not dropped_cols_1:
        # Mirror the numeric branch, which raises when the column is missing.
        raise KeyError(f"no columns found for {column_to_drop!r}")
else:
    dropped_cols_1 = [column_to_drop]

comp_1 = comp.drop(columns=dropped_cols_1)
X_1 = comp_1.drop(columns='target')
y_1 = comp_1['target']

print(X_1.shape)

(19949, 213)


In [28]:
# Same 64/16/20 stratified train/validation/test partition, on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X_1, y_1,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_1,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_train,
)

In [29]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one gradient-boosting configuration.

    Samples six GradientBoostingClassifier hyper-parameters from ``trial``, fits on
    the notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    class-1 probabilities on ``X_val``/``y_val``.
    """
    # The suggest_* calls are kept in their original order.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Tune on the validation split: seeded TPE search for the highest AUC, at most 200
# trials, halted after 10 consecutive trials without a new best value.
# NOTE: `objective` never calls trial.report()/should_prune(), so no trial is pruned here.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    sampler=sampler,
    pruner=HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [30]:
# Hyper-parameters of the best trial.
print(study.best_params)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}


In [31]:
# Validation AUC reached by the best trial.
optuna_auc = study.best_value
print(optuna_auc)

0.7966244628204244


In [32]:
# Refit the best configuration on the training split.
optuna_1 = GradientBoostingClassifier(random_state=0, **study.best_params)
optuna_1.fit(X=X_train, y=y_train)

In [33]:
# Held-out test AUC of the tuned model (class-1 probabilities).
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_true=y_test, y_score=optuna_1_proba)
print(auc_1)

0.7977912817814296


In [34]:
# bootstrap_auc indexes rows positionally, so hand it plain ndarrays.
X_train, y_train = X_train.values, y_train.values

In [35]:
# Fresh accumulator for this round's bootstrap AUCs.
auc_bootstrap = list()

In [36]:
# Seed the resampling RNG, then bootstrap the test AUC interval (displayed as the cell output).
rs = RandomState(1)
bootstrap_auc(
    clf=optuna_1,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    nsamples=2000,
)

array([0.78912446, 0.79719195])

In [37]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
t_1 = auc_bootstrap
print(auc_bootstrap)

[0.7922676370952233, 0.7936315097645147, 0.7894080745065967, 0.793984695462528, 0.7912503460533017, 0.7916109283104357, 0.7950442525565679, 0.7920219656672859, 0.794057604402432, 0.7915121314136091, 0.7923252245912344, 0.7920135124568622, 0.7898914924776994, 0.7937659686428159, 0.7918389008290486, 0.7926298043293116, 0.7955181606659439, 0.7934418408556339, 0.7916627042242804, 0.7937036262159416, 0.7899097197126753, 0.7944039218669758, 0.7913398972512273, 0.7946493291320877, 0.7923685472946556, 0.7916529301997283, 0.7907859478056521, 0.8021243446120294, 0.7924890055431928, 0.7931499409331921, 0.7934119904563254, 0.7905548053331306, 0.791656100153637, 0.7882671552622291, 0.7908356104168912, 0.7915293019972823, 0.7908382520451487, 0.7952471296067355, 0.7915646998159314, 0.7950833486547771, 0.7932688142047748, 0.794148740577312, 0.7944216207763007, 0.7904208747804806, 0.7909890890186456, 0.7923405460351274, 0.7921064977715223, 0.7919868320114625, 0.7942266686109052, 0.7922351450676575, 0.7

In [38]:
# Release this round's splits and study before the next round.
del X_train, y_train
del X_val, y_val
del X_test, y_test
del study, optuna_auc

In [15]:
# Elimination step 2: feature to remove.
column_to_drop_1 = "Cat_가구주 동거 여부"

In [16]:
# Remove the feature chosen for this step from the working frame.
# A one-hot encoded categorical ("Cat_" prefix) lives in several "<name>_<category>"
# dummy columns; any other feature is a single column.
if column_to_drop_1.startswith('Cat_'):
    # Match the name literally (exact, or followed by "_") instead of as a regex
    # prefix, so pattern metacharacters and look-alike prefixes cannot select
    # extra columns.
    dropped_cols_2 = [
        col for col in comp_1.columns
        if col == column_to_drop_1 or col.startswith(column_to_drop_1 + '_')
    ]
    if not dropped_cols_2:
        # Mirror the numeric branch, which raises when the column is missing.
        raise KeyError(f"no columns found for {column_to_drop_1!r}")
else:
    dropped_cols_2 = [column_to_drop_1]

comp_2 = comp_1.drop(columns=dropped_cols_2)
X_2 = comp_2.drop(columns='target')
y_2 = comp_2['target']

print(X_2.shape)

(19949, 211)


In [41]:
# Same 64/16/20 stratified train/validation/test partition, on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X_2, y_2,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_2,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_train,
)

In [42]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one gradient-boosting configuration.

    Samples six GradientBoostingClassifier hyper-parameters from ``trial``, fits on
    the notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    class-1 probabilities on ``X_val``/``y_val``.
    """
    # The suggest_* calls are kept in their original order.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Tune on the validation split: seeded TPE search for the highest AUC, at most 200
# trials, halted after 10 consecutive trials without a new best value.
# NOTE: `objective` never calls trial.report()/should_prune(), so no trial is pruned here.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    sampler=sampler,
    pruner=HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [43]:
# Report the winning hyper-parameters and the validation AUC they achieved.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7966938096026306


In [44]:
# Refit the best configuration on the training split.
optuna_2 = GradientBoostingClassifier(random_state=0, **study.best_params)
optuna_2.fit(X=X_train, y=y_train)

In [45]:
# Held-out test AUC of the tuned model (class-1 probabilities).
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_true=y_test, y_score=optuna_2_proba)
print(auc_2)

0.7986656607346261


In [46]:
# bootstrap_auc indexes rows positionally, so hand it plain ndarrays.
X_train, y_train = X_train.values, y_train.values

In [47]:
# Fresh accumulator for this round's bootstrap AUCs.
auc_bootstrap = list()

In [48]:
# Seed the resampling RNG, then bootstrap the test AUC interval (displayed as the cell output).
rs = RandomState(2)
bootstrap_auc(
    clf=optuna_2,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    nsamples=2000,
)

array([0.78899971, 0.79736267])

In [49]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
t_2 = auc_bootstrap
print(auc_bootstrap)

[0.7919699255906153, 0.7920267205981492, 0.7924504377706347, 0.7919926435936288, 0.7920872138852434, 0.790510690141232, 0.7907090764233622, 0.7979590251757739, 0.7907265111698609, 0.7972272941484765, 0.796736479618253, 0.7962263812017507, 0.7941899499781273, 0.79212710247193, 0.7931060899041196, 0.7918132770349519, 0.7930342376155184, 0.7909418038728384, 0.794602572311932, 0.7956925081309317, 0.7907492291728745, 0.792084572256986, 0.7907859478056523, 0.7922118987389923, 0.7915369627192287, 0.7949840234322992, 0.7928915896896193, 0.7974597574351269, 0.7939514109464849, 0.7910625262842012, 0.7919049415354834, 0.7964715243040366, 0.7938468024674921, 0.7928947596435281, 0.7930712204111219, 0.796152151447718, 0.7905740892194093, 0.7936066784588952, 0.7927819621169374, 0.79458223177435, 0.796775047390811, 0.7947700515434506, 0.7968693535195999, 0.7936362646953778, 0.7938119329744945, 0.79520407106614, 0.7927671689986961, 0.7977556197999547, 0.7945967607297657, 0.7919638498456233, 0.795970671

In [50]:
# Release this round's splits and study before the next round.
del X_train, y_train
del X_val, y_val
del X_test, y_test
del study, optuna_auc

In [17]:
# Elimination step 3: feature to remove.
column_to_drop_2 = "Cat_가구주 주민등록상 등재 여부"

In [18]:
# Remove the feature chosen for this step from the working frame.
# A one-hot encoded categorical ("Cat_" prefix) lives in several "<name>_<category>"
# dummy columns; any other feature is a single column.
if column_to_drop_2.startswith('Cat_'):
    # Match the name literally (exact, or followed by "_") instead of as a regex
    # prefix, so pattern metacharacters and look-alike prefixes cannot select
    # extra columns.
    dropped_cols_3 = [
        col for col in comp_2.columns
        if col == column_to_drop_2 or col.startswith(column_to_drop_2 + '_')
    ]
    if not dropped_cols_3:
        # Mirror the numeric branch, which raises when the column is missing.
        raise KeyError(f"no columns found for {column_to_drop_2!r}")
else:
    dropped_cols_3 = [column_to_drop_2]

comp_3 = comp_2.drop(columns=dropped_cols_3)
X_3 = comp_3.drop(columns='target')
y_3 = comp_3['target']

print(X_3.shape)

(19949, 209)


In [53]:
# Same 64/16/20 stratified train/validation/test partition, on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X_3, y_3,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_3,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_train,
)

In [54]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one gradient-boosting configuration.

    Samples six GradientBoostingClassifier hyper-parameters from ``trial``, fits on
    the notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    class-1 probabilities on ``X_val``/``y_val``.
    """
    # The suggest_* calls are kept in their original order.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Tune on the validation split: seeded TPE search for the highest AUC, at most 200
# trials, halted after 10 consecutive trials without a new best value.
# NOTE: `objective` never calls trial.report()/should_prune(), so no trial is pruned here.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    sampler=sampler,
    pruner=HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [55]:
# Report the winning hyper-parameters and the validation AUC they achieved.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7969319827772313


In [56]:
# Refit the best configuration on the training split.
optuna_3 = GradientBoostingClassifier(random_state=0, **study.best_params)
optuna_3.fit(X=X_train, y=y_train)

In [57]:
# Held-out test AUC of the tuned model (class-1 probabilities).
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_true=y_test, y_score=optuna_3_proba)
print(auc_3)

0.7975796873580125


In [58]:
# bootstrap_auc indexes rows positionally, so hand it plain ndarrays.
X_train, y_train = X_train.values, y_train.values

In [59]:
# Fresh accumulator for this round's bootstrap AUCs.
auc_bootstrap = list()

In [60]:
# Seed the resampling RNG, then bootstrap the test AUC interval (displayed as the cell output).
rs = RandomState(3)
bootstrap_auc(
    clf=optuna_3,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    nsamples=2000,
)

array([0.78934802, 0.79733378])

In [61]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
t_3 = auc_bootstrap
print(auc_bootstrap)

[0.7950104397148733, 0.7955482752280781, 0.79740190577629, 0.7900624058259526, 0.7917287449307153, 0.7916354954532294, 0.7913121601545248, 0.7918632038090165, 0.7925003645446996, 0.7918724495079176, 0.7939310704089031, 0.7928522294285841, 0.7924670800286564, 0.7949000196537144, 0.7937622703632555, 0.7920726849298279, 0.7953224160120712, 0.7933649694733438, 0.791663232549932, 0.7917794641932574, 0.7934249344347868, 0.7953681161809241, 0.7928054726084284, 0.7924953454510104, 0.7931013349732562, 0.7938375567685914, 0.7929943490288317, 0.7985732037456176, 0.7962303436441368, 0.791990530291023, 0.791877996927258, 0.7955826163954243, 0.7974975327192076, 0.7914392224737052, 0.7893066359815127, 0.7948342431101053, 0.792659918891446, 0.7955192173172468, 0.7912801964526103, 0.7933023628836438, 0.7933480630524965, 0.7914598271741129, 0.7913501996014312, 0.7937334766152501, 0.7948456021116119, 0.7974867020433523, 0.7947098224191821, 0.7933802909172367, 0.7950513849528628, 0.7914563930573784, 0.793

In [62]:
# Release this round's splits and study before the next round.
del X_train, y_train
del X_val, y_val
del X_test, y_test
del study, optuna_auc

In [19]:
# Elimination step 4: feature to remove.
column_to_drop_3 = "소득 중 사회보험 수혜금의 비중(월평균)"

In [20]:
# Remove the feature chosen for this step from the working frame.
# A one-hot encoded categorical ("Cat_" prefix) lives in several "<name>_<category>"
# dummy columns; any other feature is a single column.
if column_to_drop_3.startswith('Cat_'):
    # Match the name literally (exact, or followed by "_") instead of as a regex
    # prefix, so pattern metacharacters and look-alike prefixes cannot select
    # extra columns.
    dropped_cols_4 = [
        col for col in comp_3.columns
        if col == column_to_drop_3 or col.startswith(column_to_drop_3 + '_')
    ]
    if not dropped_cols_4:
        # Mirror the numeric branch, which raises when the column is missing.
        raise KeyError(f"no columns found for {column_to_drop_3!r}")
else:
    dropped_cols_4 = [column_to_drop_3]

comp_4 = comp_3.drop(columns=dropped_cols_4)
X_4 = comp_4.drop(columns='target')
y_4 = comp_4['target']

print(X_4.shape)

(19949, 208)


In [21]:
# Same 64/16/20 stratified train/validation/test partition, on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X_4, y_4,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_4,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_train,
)

In [22]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one gradient-boosting configuration.

    Samples six GradientBoostingClassifier hyper-parameters from ``trial``, fits on
    the notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    class-1 probabilities on ``X_val``/``y_val``.
    """
    # The suggest_* calls are kept in their original order.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Tune on the validation split: seeded TPE search for the highest AUC, at most 200
# trials, halted after 10 consecutive trials without a new best value.
# NOTE: `objective` never calls trial.report()/should_prune(), so no trial is pruned here.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    sampler=sampler,
    pruner=HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])


In [23]:
# Report the winning hyper-parameters and the validation AUC they achieved.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7969336338910934


In [24]:
# Refit the best configuration on the training split.
optuna_4 = GradientBoostingClassifier(random_state=0, **study.best_params)
optuna_4.fit(X=X_train, y=y_train)

In [25]:
# Held-out test AUC of the tuned model (class-1 probabilities).
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_true=y_test, y_score=optuna_4_proba)
print(auc_4)

0.7976821825343994


In [26]:
# bootstrap_auc indexes rows positionally, so hand it plain ndarrays.
X_train, y_train = X_train.values, y_train.values

In [27]:
# Fresh accumulator for this round's bootstrap AUCs.
auc_bootstrap = list()

In [28]:
# Seed the resampling RNG, then bootstrap the test AUC interval (displayed as the cell output).
rs = RandomState(4)
bootstrap_auc(
    clf=optuna_4,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    nsamples=2000,
)

array([0.78903712, 0.79733002])

In [29]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
t_4 = auc_bootstrap
print(auc_bootstrap)

[0.7963928037819663, 0.7944348289175875, 0.7947095582563561, 0.7957345100202243, 0.7910268643027263, 0.7949570788240738, 0.7930400491976847, 0.7935033907940312, 0.7920769115350397, 0.7917871249152038, 0.7932059434522489, 0.7923791138076852, 0.7924116058352512, 0.7901408621851973, 0.791528773671631, 0.7958539116174584, 0.7957923616790612, 0.7978692098150226, 0.7967372721067303, 0.7966189271607991, 0.7942155737722241, 0.7950424034167876, 0.7899731187908527, 0.7925568953894078, 0.7944118467517483, 0.79150526318014, 0.7945714010984947, 0.7923133372640762, 0.7934553131597466, 0.7931853387518412, 0.7941677603007652, 0.7925407814570375, 0.7909563328282541, 0.7950920660280266, 0.7922914117495398, 0.790766928082199, 0.7903360785134184, 0.7933797625915853, 0.7937712518993307, 0.7875969741733289, 0.792275033654344, 0.795499933430968, 0.7941342116218963, 0.7947251438630748, 0.7971380071133766, 0.7884208980268093, 0.793042690825942, 0.7961376224923024, 0.7941357965988508, 0.7944749816670998, 0.7905

In [30]:
# Release this round's splits and study before the next round.
del X_train, y_train
del X_val, y_val
del X_test, y_test
del study, optuna_auc

In [21]:
# Elimination step 5: feature to remove.
column_to_drop_4 = "Cat_현재 주택의 위치"

In [22]:
# Remove the feature chosen for this step from the working frame.
# A one-hot encoded categorical ("Cat_" prefix) lives in several "<name>_<category>"
# dummy columns; any other feature is a single column.
if column_to_drop_4.startswith('Cat_'):
    # Match the name literally (exact, or followed by "_") instead of as a regex
    # prefix, so pattern metacharacters and look-alike prefixes cannot select
    # extra columns.
    dropped_cols_5 = [
        col for col in comp_4.columns
        if col == column_to_drop_4 or col.startswith(column_to_drop_4 + '_')
    ]
    if not dropped_cols_5:
        # Mirror the numeric branch, which raises when the column is missing.
        raise KeyError(f"no columns found for {column_to_drop_4!r}")
else:
    dropped_cols_5 = [column_to_drop_4]

comp_5 = comp_4.drop(columns=dropped_cols_5)
X_5 = comp_5.drop(columns='target')
y_5 = comp_5['target']

print(X_5.shape)

(19949, 204)


In [33]:
# Same 64/16/20 stratified train/validation/test partition, on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X_5, y_5,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_5,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_train,
)

In [34]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one gradient-boosting configuration.

    Samples six GradientBoostingClassifier hyper-parameters from ``trial``, fits on
    the notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    class-1 probabilities on ``X_val``/``y_val``.
    """
    # The suggest_* calls are kept in their original order.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Tune on the validation split: seeded TPE search for the highest AUC, at most 200
# trials, halted after 10 consecutive trials without a new best value.
# NOTE: `objective` never calls trial.report()/should_prune(), so no trial is pruned here.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    sampler=sampler,
    pruner=HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [35]:
# Report the winning hyper-parameters and the validation AUC they achieved.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7975453715769829


In [36]:
# Refit the best configuration on the training split.
optuna_5 = GradientBoostingClassifier(random_state=0, **study.best_params)
optuna_5.fit(X=X_train, y=y_train)

In [37]:
# Held-out test AUC of the tuned model (class-1 probabilities).
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_true=y_test, y_score=optuna_5_proba)
print(auc_5)

0.7969847926744479


In [38]:
# bootstrap_auc indexes rows positionally, so hand it plain ndarrays.
X_train, y_train = X_train.values, y_train.values

In [39]:
# Fresh accumulator for this round's bootstrap AUCs.
auc_bootstrap = list()

In [40]:
# Seed the resampling RNG, then bootstrap the test AUC interval (displayed as the cell output).
rs = RandomState(5)
bootstrap_auc(
    clf=optuna_5,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    nsamples=2000,
)

array([0.78923394, 0.79725798])

In [41]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
t_5 = auc_bootstrap
print(auc_bootstrap)

[0.794080850731097, 0.7940063568142386, 0.7965439049182891, 0.7944673209451535, 0.7985510140682555, 0.7929383465097751, 0.7966448151177216, 0.791921583793505, 0.7894532463497981, 0.7935155422840153, 0.7924948171253591, 0.7928223790292757, 0.7923120164499475, 0.7906533380671312, 0.7894812476093265, 0.7965851143191045, 0.793449765740406, 0.7938005739729878, 0.7946871044161685, 0.7895776670407211, 0.7925101385692518, 0.7952740742149609, 0.7922573347450195, 0.7959685582838292, 0.7954872536153323, 0.7928458895207663, 0.7934664079984277, 0.7926628246825291, 0.7945484189326554, 0.7939912995331715, 0.7937440431282796, 0.7928337380307824, 0.7919324144693602, 0.7941770059996661, 0.7951364453827507, 0.7921339707053993, 0.7936243773682197, 0.7933441006101105, 0.7926324459575691, 0.7922929967264941, 0.7898442073318921, 0.7955263497135419, 0.7961476606796805, 0.794118361852352, 0.7926681079390439, 0.7948294881792419, 0.7907199070992175, 0.7936867197950942, 0.7913177075738651, 0.7927301862030927, 0.7

In [42]:
# Release this round's splits and study before the next round.
del X_train, y_train
del X_val, y_val
del X_test, y_test
del study, optuna_auc

In [23]:
# Elimination step 6: feature to remove.
column_to_drop_5 = "소득 중 사적이전소득의 비중(월평균)"

In [24]:
# Remove the feature chosen for this step from the working frame.
# A one-hot encoded categorical ("Cat_" prefix) lives in several "<name>_<category>"
# dummy columns; any other feature is a single column.
if column_to_drop_5.startswith('Cat_'):
    # Match the name literally (exact, or followed by "_") instead of as a regex
    # prefix, so pattern metacharacters and look-alike prefixes cannot select
    # extra columns.
    dropped_cols_6 = [
        col for col in comp_5.columns
        if col == column_to_drop_5 or col.startswith(column_to_drop_5 + '_')
    ]
    if not dropped_cols_6:
        # Mirror the numeric branch, which raises when the column is missing.
        raise KeyError(f"no columns found for {column_to_drop_5!r}")
else:
    dropped_cols_6 = [column_to_drop_5]

comp_6 = comp_5.drop(columns=dropped_cols_6)
X_6 = comp_6.drop(columns='target')
y_6 = comp_6['target']

print(X_6.shape)

(19949, 203)


In [45]:
# Same 64/16/20 stratified train/validation/test partition, on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X_6, y_6,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_6,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_train,
)

In [46]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one gradient-boosting configuration.

    Samples six GradientBoostingClassifier hyper-parameters from ``trial``, fits on
    the notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    class-1 probabilities on ``X_val``/``y_val``.
    """
    # The suggest_* calls are kept in their original order.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Tune on the validation split: seeded TPE search for the highest AUC, at most 200
# trials, halted after 10 consecutive trials without a new best value.
# NOTE: `objective` never calls trial.report()/should_prune(), so no trial is pruned here.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    sampler=sampler,
    pruner=HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [47]:
# Report the winning hyper-parameters and the validation AUC they achieved.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7967470580246816


In [48]:
# Refit the best configuration on the training split.
optuna_6 = GradientBoostingClassifier(random_state=0, **study.best_params)
optuna_6.fit(X=X_train, y=y_train)

In [49]:
# Held-out test AUC of the tuned model (class-1 probabilities).
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_true=y_test, y_score=optuna_proba_6)
print(auc_6)

0.798986882730725


In [50]:
# bootstrap_auc indexes rows positionally, so hand it plain ndarrays.
X_train, y_train = X_train.values, y_train.values

In [51]:
# Fresh accumulator for this round's bootstrap AUCs.
auc_bootstrap = list()

In [52]:
# Seed the resampling RNG, then bootstrap the test AUC interval (displayed as the cell output).
rs = RandomState(6)
bootstrap_auc(
    clf=optuna_6,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    nsamples=2000,
)

array([0.78907432, 0.79720189])

In [53]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
t_6 = auc_bootstrap
print(auc_bootstrap)

[0.7942124038183151, 0.7956689976394411, 0.7969488665301473, 0.7923075256819099, 0.7904636691582504, 0.7927410168789479, 0.793013368752285, 0.7904726506943256, 0.7940335655852897, 0.7930249919166175, 0.7956845832461596, 0.7918563355755474, 0.7939577508543025, 0.792662032194052, 0.7899747037678072, 0.7936381138351583, 0.7916584776190687, 0.7899052289446378, 0.7945687594702373, 0.7942047430963687, 0.7912109857922666, 0.7924686650056109, 0.7918822235324698, 0.794429281498247, 0.7927581874626211, 0.7915921727498082, 0.7937207967996145, 0.7945669103304572, 0.7964704676527337, 0.7896727656579874, 0.7953829092991656, 0.7965610755019622, 0.795014666320085, 0.7956663560111836, 0.7949819101296933, 0.7910612054700724, 0.7897715625548136, 0.7928136616560262, 0.7927249029465778, 0.7937281933587352, 0.7920903838391524, 0.7916994228570582, 0.7875510098416503, 0.793793177413867, 0.7941783268137949, 0.7908720648868433, 0.7965325459167825, 0.7948419038320518, 0.7941439856464487, 0.7943661465828954, 0.79

In [54]:
# Release this round's splits and study before the next round.
del X_train, y_train
del X_val, y_val
del X_test, y_test
del study, optuna_auc

In [25]:
# Elimination step 7: feature to remove.
column_to_drop_6 = "Cat_가구주 장애 여부"

In [26]:
# Remove the feature chosen for this step from the working frame.
# A one-hot encoded categorical ("Cat_" prefix) lives in several "<name>_<category>"
# dummy columns; any other feature is a single column.
if column_to_drop_6.startswith('Cat_'):
    # Match the name literally (exact, or followed by "_") instead of as a regex
    # prefix, so pattern metacharacters and look-alike prefixes cannot select
    # extra columns.
    dropped_cols_7 = [
        col for col in comp_6.columns
        if col == column_to_drop_6 or col.startswith(column_to_drop_6 + '_')
    ]
    if not dropped_cols_7:
        # Mirror the numeric branch, which raises when the column is missing.
        raise KeyError(f"no columns found for {column_to_drop_6!r}")
else:
    dropped_cols_7 = [column_to_drop_6]

comp_7 = comp_6.drop(columns=dropped_cols_7)
X_7 = comp_7.drop(columns='target')
y_7 = comp_7['target']

print(X_7.shape)

(19949, 201)


In [27]:
# Same 64/16/20 stratified train/validation/test partition, on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X_7, y_7,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_7,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_train,
)

In [30]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one gradient-boosting configuration.

    Samples six GradientBoostingClassifier hyper-parameters from ``trial``, fits on
    the notebook-level ``X_train``/``y_train`` and returns the ROC AUC of the
    class-1 probabilities on ``X_val``/``y_val``.
    """
    # The suggest_* calls are kept in their original order.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **search_space)
    model.fit(X_train, y_train)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Tune on the validation split: seeded TPE search for the highest AUC, at most 200
# trials, halted after 10 consecutive trials without a new best value.
# NOTE: `objective` never calls trial.report()/should_prune(), so no trial is pruned here.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    sampler=sampler,
    pruner=HyperbandPruner(),
    direction=direction,
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [31]:
# Report the winning hyper-parameters and the validation AUC they achieved.
print(study.best_params)
optuna_auc = study.best_value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7966360206174588


In [32]:
# Refit the best configuration on the training split.
optuna_7 = GradientBoostingClassifier(random_state=0, **study.best_params)
optuna_7.fit(X=X_train, y=y_train)

In [33]:
# Held-out test AUC of the tuned model (class-1 probabilities).
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_true=y_test, y_score=optuna_proba_7)
print(auc_7)

0.7987031718558814


In [34]:
# bootstrap_auc indexes rows positionally, so hand it plain ndarrays.
X_train, y_train = X_train.values, y_train.values

In [35]:
# Fresh accumulator for this round's bootstrap AUCs.
auc_bootstrap = list()

In [36]:
# Seed the resampling RNG, then bootstrap the test AUC interval (displayed as the cell output).
rs = RandomState(7)
bootstrap_auc(
    clf=optuna_7,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    nsamples=2000,
)

array([0.78909463, 0.79724479])

In [37]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
t_7 = auc_bootstrap
print(auc_bootstrap)

[0.7984715010577079, 0.7896640482847379, 0.7910123353473106, 0.7932923246962655, 0.7920890630250236, 0.7967858780666663, 0.7947954111747215, 0.7942625947552056, 0.7921506129634208, 0.7919303011667543, 0.7912638183574144, 0.7949267000991139, 0.7935385244498545, 0.7928863064331044, 0.7942105546785349, 0.7925259883387962, 0.7880743163994396, 0.7949951182709803, 0.7924720991223453, 0.793208849243332, 0.7928984579230884, 0.795291244798634, 0.7941519105312209, 0.791831504269928, 0.7930585405954864, 0.7930788811330682, 0.7921492921492921, 0.791883280183773, 0.7951195389619035, 0.7913700118133616, 0.7913253682958117, 0.7964004645039128, 0.7926424841449472, 0.7975913105223449, 0.7932146608254983, 0.7950365918346214, 0.7921191775871579, 0.7960626002497924, 0.7917950497999758, 0.794165911160985, 0.7910416574209678, 0.7966828545646281, 0.7961730203109514, 0.7928448328694635, 0.7978224529948668, 0.7969050155010746, 0.7919744163586527, 0.7934162170615373, 0.7940982854775958, 0.7902969824152091, 0.79

In [38]:
# Release this round's splits and study before the next round.
del X_train, y_train
del X_val, y_val
del X_test, y_test
del study, optuna_auc

In [27]:
# Elimination step 8: feature to remove.
column_to_drop_7 = "Cat_소득 계층"

In [28]:
# Remove the feature chosen for this step from the working frame.
# A one-hot encoded categorical ("Cat_" prefix) lives in several "<name>_<category>"
# dummy columns; any other feature is a single column.
if column_to_drop_7.startswith('Cat_'):
    # Match the name literally (exact, or followed by "_") instead of as a regex
    # prefix, so pattern metacharacters and look-alike prefixes cannot select
    # extra columns.
    dropped_cols_8 = [
        col for col in comp_7.columns
        if col == column_to_drop_7 or col.startswith(column_to_drop_7 + '_')
    ]
    if not dropped_cols_8:
        # Mirror the numeric branch, which raises when the column is missing.
        raise KeyError(f"no columns found for {column_to_drop_7!r}")
else:
    dropped_cols_8 = [column_to_drop_7]

comp_8 = comp_7.drop(columns=dropped_cols_8)
X_8 = comp_8.drop(columns='target')
y_8 = comp_8['target']

print(X_8.shape)

(19949, 199)


In [41]:
# Same 64/16/20 stratified train/validation/test partition, on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X_8, y_8,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_8,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_train,
)

In [42]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_float`` and ``suggest_int``.

    Returns
    -------
    The value of ``roc_auc_score`` computed from column 1 of ``predict_proba``.
    """
    # Hyperparameters are drawn one at a time, in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    booster.fit(X_train, y_train)
    positive_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Hyperparameter search for the current feature set: a TPE sampler seeded with
# 10 maximizes the value returned by `objective`, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has no intermediate values to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in this notebook; 10 is its
# `early_stopping_rounds` argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [43]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7970483863045059


In [44]:
optuna_8 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [45]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(auc_8)

0.7990008833604892


In [46]:
# bootstrap_auc indexes X_train/y_train with integer position arrays, so both
# must be plain ndarrays. np.asarray is used instead of `.values` so this cell
# can be re-run safely (an ndarray has no `.values` attribute).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [47]:
auc_bootstrap = []

In [48]:
# bootstrap_auc reads the global `rs` for its resampling indices and appends
# every AUC to the global `auc_bootstrap`; reset both here so that re-running
# only this cell does not pile new values on top of a previous run.
auc_bootstrap = []
rs = RandomState(seed = 8)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyperparameters and random_state) keeps `optuna_8`
# fitted on the full training split.
bootstrap_auc(sklearn.base.clone(optuna_8), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78910636, 0.79728789])

In [49]:
t_8 = auc_bootstrap
print(t_8)

[0.7950181004368196, 0.7954597806814556, 0.7937178910085314, 0.7919712464047439, 0.7948413755064001, 0.7946408759216641, 0.7925674619024372, 0.7935210897033558, 0.7919828695690764, 0.791577115468741, 0.7938483874444465, 0.7939944694870803, 0.7917049702763989, 0.7947425786095736, 0.78821564351121, 0.7949795326642617, 0.7927806413028087, 0.7937144568917968, 0.7935958477830398, 0.7929473280458502, 0.7932056792894231, 0.7973720553769816, 0.7943307487642461, 0.795874516317866, 0.7932659084136917, 0.7939852237881795, 0.7928601543133564, 0.7960073902192129, 0.7914331467287132, 0.7898159419095381, 0.7925550462496275, 0.7912706865908836, 0.7919202629793762, 0.7945790618204411, 0.7933316849573009, 0.7936867197950942, 0.7948691126031028, 0.7944517353384348, 0.7926517298438481, 0.7968397672831171, 0.7927676973243476, 0.79315628084101, 0.7945994023580231, 0.7974811546240118, 0.7900991244587304, 0.7917334998615787, 0.7939276362921681, 0.7930815227613258, 0.7958031923549165, 0.7938029514384195, 0.792

In [50]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [29]:
#9.
column_to_drop_8 = '부채 중 비금융기관 대출금의 비중'

In [30]:
# Remove the feature named in `column_to_drop_8` and rebuild X/y.
if column_to_drop_8.startswith('Cat_'):
    # The 'Cat_' case drops every column whose name begins with this prefix.
    # Plain prefix comparison (not a regex) so that names containing regex
    # metacharacters such as "(" or "." are matched literally.
    dropped_cols_8 = [col for col in comp_8.columns if str(col).startswith(column_to_drop_8)]
else:
    dropped_cols_8 = [column_to_drop_8]

comp_9 = comp_8.drop(columns=dropped_cols_8)
X_9 = comp_9.drop(columns='target')
y_9 = comp_9['target']

print(X_9.shape)

(19949, 198)


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [32]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_float`` and ``suggest_int``.

    Returns
    -------
    The value of ``roc_auc_score`` computed from column 1 of ``predict_proba``.
    """
    # Hyperparameters are drawn one at a time, in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    booster.fit(X_train, y_train)
    positive_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Hyperparameter search for the current feature set: a TPE sampler seeded with
# 10 maximizes the value returned by `objective`, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has no intermediate values to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in this notebook; 10 is its
# `early_stopping_rounds` argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [33]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.79592026275826


In [34]:
optuna_9 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [35]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(auc_9)

0.7978383027644111


In [36]:
# bootstrap_auc indexes X_train/y_train with integer position arrays, so both
# must be plain ndarrays. np.asarray is used instead of `.values` so this cell
# can be re-run safely (an ndarray has no `.values` attribute).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [37]:
auc_bootstrap = []

In [38]:
# bootstrap_auc reads the global `rs` for its resampling indices and appends
# every AUC to the global `auc_bootstrap`; reset both here so that re-running
# only this cell does not pile new values on top of a previous run.
auc_bootstrap = []
rs = RandomState(seed = 9)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyperparameters and random_state) keeps `optuna_9`
# fitted on the full training split.
bootstrap_auc(sklearn.base.clone(optuna_9), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7873732 , 0.79647043])

In [39]:
t_9 = auc_bootstrap
print(t_9)

[0.7909304448713315, 0.7917395756065707, 0.7949726644307925, 0.7935892437123965, 0.788648078056945, 0.7884298795628845, 0.7947853729873434, 0.7914873001079897, 0.7954647997751446, 0.7845530259323362, 0.7864631873252563, 0.7945891000078191, 0.7921788783857749, 0.7939160131278358, 0.7954418176093052, 0.7920272489238006, 0.7896783130773278, 0.7904708015545453, 0.7913454446705677, 0.7912154765603041, 0.7927510550663261, 0.7893288256588749, 0.7935895078752221, 0.7941656469981593, 0.7942795011760528, 0.792660975542749, 0.7940385846789787, 0.7925907082311023, 0.7914505814752121, 0.7896468777010649, 0.7936975504709495, 0.789908134735721, 0.7887698571196109, 0.7931919428224847, 0.7955456335998208, 0.7902911708330427, 0.7868861120092647, 0.7932590401802224, 0.7898854167327074, 0.7916444769893046, 0.7920354379713985, 0.7910429782350965, 0.7922084646222577, 0.7928794381996354, 0.7930305393359581, 0.7893103342610732, 0.7924060584159106, 0.7925323282466139, 0.7934265194117411, 0.7913533695553399, 0.

In [40]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [31]:
column_to_drop_9 = '소득 중 재산소득의 비중(월평균)'

In [32]:
# Remove the feature named in `column_to_drop_9` and rebuild X/y.
if column_to_drop_9.startswith('Cat_'):
    # The 'Cat_' case drops every column whose name begins with this prefix.
    # Plain prefix comparison (not a regex) so that names containing regex
    # metacharacters such as "(" or "." are matched literally.
    dropped_cols_9 = [col for col in comp_9.columns if str(col).startswith(column_to_drop_9)]
else:
    dropped_cols_9 = [column_to_drop_9]

comp_10 = comp_9.drop(columns=dropped_cols_9)
X_10 = comp_10.drop(columns='target')
y_10 = comp_10['target']

print(X_10.shape)

(19949, 197)


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [44]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_float`` and ``suggest_int``.

    Returns
    -------
    The value of ``roc_auc_score`` computed from column 1 of ``predict_proba``.
    """
    # Hyperparameters are drawn one at a time, in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    booster.fit(X_train, y_train)
    positive_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Hyperparameter search for the current feature set: a TPE sampler seeded with
# 10 maximizes the value returned by `objective`, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has no intermediate values to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in this notebook; 10 is its
# `early_stopping_rounds` argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [45]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 97, 'subsample': 0.7000000000000001, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 7}
0.7964816414713571


In [46]:
optuna_10 = GradientBoostingClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [47]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(auc_10)

0.7981690346222364


In [48]:
# bootstrap_auc indexes X_train/y_train with integer position arrays, so both
# must be plain ndarrays. np.asarray is used instead of `.values` so this cell
# can be re-run safely (an ndarray has no `.values` attribute).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [49]:
auc_bootstrap = []

In [50]:
# bootstrap_auc reads the global `rs` for its resampling indices and appends
# every AUC to the global `auc_bootstrap`; reset both here so that re-running
# only this cell does not pile new values on top of a previous run.
auc_bootstrap = []
rs = RandomState(seed = 10)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyperparameters and random_state) keeps `optuna_10`
# fitted on the full training split.
bootstrap_auc(sklearn.base.clone(optuna_10), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78259948, 0.79364814])

In [51]:
t_10 = auc_bootstrap
print(t_10)

[0.7865817964340132, 0.7900090449351533, 0.788278514263736, 0.7860521499684061, 0.7870966497813788, 0.7866071560652842, 0.7814263947268874, 0.7940848131734831, 0.7891100988391628, 0.7854686142863483, 0.7900690098965961, 0.7890255667349262, 0.7894281508813528, 0.7907653431052446, 0.7900779914326713, 0.7884576166595871, 0.7915221696009873, 0.7863112937004564, 0.7877237723296837, 0.7849421377746502, 0.7869426428539728, 0.7853809122282029, 0.7917733884482652, 0.7885965663059258, 0.7916743273886131, 0.7849809697100337, 0.7884895803615015, 0.785402045254262, 0.7864063923177224, 0.7862317806899088, 0.7876862612084287, 0.7880745805622653, 0.7908102507856203, 0.7866414972326301, 0.7881393004545715, 0.7833991627095076, 0.7931512617473209, 0.7797140912904459, 0.7898862092211846, 0.7824481765368465, 0.7852995500778752, 0.7898476414486267, 0.7888916361822764, 0.7924060584159106, 0.7859478056522391, 0.7886578520814974, 0.7899287394361285, 0.7849910078974118, 0.7875861434974735, 0.7919633215199717, 0

In [52]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [33]:
column_to_drop_10 = 'Cat_이사 예상 기간'

In [34]:
# Remove the feature named in `column_to_drop_10` and rebuild X/y.
if column_to_drop_10.startswith('Cat_'):
    # The 'Cat_' case drops every column whose name begins with this prefix.
    # Plain prefix comparison (not a regex) so that names containing regex
    # metacharacters such as "(" or "." are matched literally.
    dropped_cols_10 = [col for col in comp_10.columns if str(col).startswith(column_to_drop_10)]
else:
    dropped_cols_10 = [column_to_drop_10]

comp_11 = comp_10.drop(columns=dropped_cols_10)
X_11 = comp_11.drop(columns='target')
y_11 = comp_11['target']

print(X_11.shape)

(19949, 193)


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [36]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_float`` and ``suggest_int``.

    Returns
    -------
    The value of ``roc_auc_score`` computed from column 1 of ``predict_proba``.
    """
    # Hyperparameters are drawn one at a time, in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    booster.fit(X_train, y_train)
    positive_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Hyperparameter search for the current feature set: a TPE sampler seeded with
# 10 maximizes the value returned by `objective`, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has no intermediate values to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in this notebook; 10 is its
# `early_stopping_rounds` argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [37]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7966331311682002


In [38]:
optuna_11 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [39]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(auc_11)

0.7993648997343579


In [40]:
# bootstrap_auc indexes X_train/y_train with integer position arrays, so both
# must be plain ndarrays. np.asarray is used instead of `.values` so this cell
# can be re-run safely (an ndarray has no `.values` attribute).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [41]:
auc_bootstrap = []

In [42]:
# bootstrap_auc reads the global `rs` for its resampling indices and appends
# every AUC to the global `auc_bootstrap`; reset both here so that re-running
# only this cell does not pile new values on top of a previous run.
auc_bootstrap = []
rs = RandomState(seed = 11)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyperparameters and random_state) keeps `optuna_11`
# fitted on the full training split.
bootstrap_auc(sklearn.base.clone(optuna_11), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78925213, 0.79728584])

In [43]:
t_11 = auc_bootstrap
print(t_11)

[0.7957223585302403, 0.7893895831087949, 0.7944435462908369, 0.7904050250109363, 0.7947969961516761, 0.7958866678078502, 0.7895988000667803, 0.7951401436623112, 0.7985586747902019, 0.7934925601181759, 0.7929771784451587, 0.792916156832413, 0.7916819881105596, 0.7916183248695563, 0.7928683433609542, 0.7929145718554587, 0.7912807247782617, 0.7928672867096512, 0.7896516326319283, 0.7897425046439825, 0.7949206243541219, 0.7971604609535645, 0.7917741809367425, 0.7939376744795464, 0.7957543222321548, 0.7935842246187074, 0.7905172942118754, 0.7949628904062401, 0.7869796256495765, 0.7930669938059102, 0.792197105620751, 0.7925391964800833, 0.7933319491201265, 0.7927447151585082, 0.7916159474041247, 0.790131880649122, 0.791250081890476, 0.7963167248881535, 0.7913488787873024, 0.7974544741786121, 0.7933020987208179, 0.7930136329151106, 0.7932667009021689, 0.7933863666622287, 0.7962284945043565, 0.7945806467973956, 0.7946575181796857, 0.7902071670544577, 0.7898336408188624, 0.7923109597986445, 0.7

In [44]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [35]:
column_to_drop_11 = 'Cat_기초생활보장 수급가구 여부'

In [36]:
# Remove the feature named in `column_to_drop_11` and rebuild X/y.
if column_to_drop_11.startswith('Cat_'):
    # The 'Cat_' case drops every column whose name begins with this prefix.
    # Plain prefix comparison (not a regex) so that names containing regex
    # metacharacters such as "(" or "." are matched literally.
    dropped_cols_11 = [col for col in comp_11.columns if str(col).startswith(column_to_drop_11)]
else:
    dropped_cols_11 = [column_to_drop_11]

comp_12 = comp_11.drop(columns=dropped_cols_11)
X_12 = comp_12.drop(columns='target')
y_12 = comp_12['target']

print(X_12.shape)

(19949, 191)


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [48]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_float`` and ``suggest_int``.

    Returns
    -------
    The value of ``roc_auc_score`` computed from column 1 of ``predict_proba``.
    """
    # Hyperparameters are drawn one at a time, in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    booster.fit(X_train, y_train)
    positive_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Hyperparameter search for the current feature set: a TPE sampler seeded with
# 10 maximizes the value returned by `objective`, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has no intermediate values to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in this notebook; 10 is its
# `early_stopping_rounds` argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [49]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7971342442253325


In [50]:
optuna_12 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [51]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(auc_12)

0.7991467012402973


In [52]:
# bootstrap_auc indexes X_train/y_train with integer position arrays, so both
# must be plain ndarrays. np.asarray is used instead of `.values` so this cell
# can be re-run safely (an ndarray has no `.values` attribute).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [53]:
auc_bootstrap = []

In [54]:
# bootstrap_auc reads the global `rs` for its resampling indices and appends
# every AUC to the global `auc_bootstrap`; reset both here so that re-running
# only this cell does not pile new values on top of a previous run.
auc_bootstrap = []
rs = RandomState(seed = 12)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyperparameters and random_state) keeps `optuna_12`
# fitted on the full training split.
bootstrap_auc(sklearn.base.clone(optuna_12), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78928288, 0.79737245])

In [55]:
t_12 = auc_bootstrap
print(t_12)

[0.7932386996426406, 0.7925640277857027, 0.7952967922179746, 0.7947341253991501, 0.7934019522689474, 0.7896463493754133, 0.791063582935504, 0.7930138970779366, 0.792506968615343, 0.790270566132635, 0.7896101590682872, 0.7916764406912191, 0.7920692508130932, 0.7936729833281557, 0.7947103507448334, 0.7886132085639475, 0.7943616558148577, 0.7921075544228254, 0.791533528602494, 0.7935649407324283, 0.7941624770442504, 0.7926596547286203, 0.794783788010389, 0.7947454844006568, 0.7914085795859195, 0.7959392362101723, 0.7917826341471662, 0.793830688535122, 0.7937865733432237, 0.7959318396510515, 0.7929576303960542, 0.7934249344347867, 0.7939331837115088, 0.7926541073092797, 0.7930698995969931, 0.7922612971874057, 0.7913723892787933, 0.7946707263209727, 0.7972703526890719, 0.7946263469662485, 0.7950635364428468, 0.7943341828809809, 0.7943008983649378, 0.7926506731925451, 0.7938291035581676, 0.7945050962292342, 0.7926797311033764, 0.7915237545779419, 0.79317239477338, 0.795229430697411, 0.791766

In [56]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [37]:
column_to_drop_12 = '부채 중 금융기관 대출금의 비중'

In [38]:
# Remove the feature named in `column_to_drop_12` and rebuild X/y.
if column_to_drop_12.startswith('Cat_'):
    # The 'Cat_' case drops every column whose name begins with this prefix.
    # Plain prefix comparison (not a regex) so that names containing regex
    # metacharacters such as "(" or "." are matched literally.
    dropped_cols_12 = [col for col in comp_12.columns if str(col).startswith(column_to_drop_12)]
else:
    dropped_cols_12 = [column_to_drop_12]

comp_13 = comp_12.drop(columns=dropped_cols_12)
X_13 = comp_13.drop(columns='target')
y_13 = comp_13['target']

print(X_13.shape)

(19949, 190)


In [59]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [60]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_float`` and ``suggest_int``.

    Returns
    -------
    The value of ``roc_auc_score`` computed from column 1 of ``predict_proba``.
    """
    # Hyperparameters are drawn one at a time, in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    booster.fit(X_train, y_train)
    positive_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Hyperparameter search for the current feature set: a TPE sampler seeded with
# 10 maximizes the value returned by `objective`, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has no intermediate values to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in this notebook; 10 is its
# `early_stopping_rounds` argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [61]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7958026208955888


In [62]:
optuna_13 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [63]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(auc_13)

0.7994269779984065


In [64]:
# bootstrap_auc indexes X_train/y_train with integer position arrays, so both
# must be plain ndarrays. np.asarray is used instead of `.values` so this cell
# can be re-run safely (an ndarray has no `.values` attribute).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [65]:
auc_bootstrap = []

In [66]:
# bootstrap_auc reads the global `rs` for its resampling indices and appends
# every AUC to the global `auc_bootstrap`; reset both here so that re-running
# only this cell does not pile new values on top of a previous run.
auc_bootstrap = []
rs = RandomState(seed = 13)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyperparameters and random_state) keeps `optuna_13`
# fitted on the full training split.
bootstrap_auc(sklearn.base.clone(optuna_13), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7891896 , 0.79751841])

In [67]:
t_13 = auc_bootstrap
print(t_13)

[0.7934283685515212, 0.7929214400889277, 0.7957609263027984, 0.7923268095681889, 0.7975252698159102, 0.7916518735484253, 0.7949832309438221, 0.7939598641569086, 0.7913332931805838, 0.7952199208356844, 0.7944976996701135, 0.7923836045757229, 0.7952650926788857, 0.794321767228171, 0.7939495618067047, 0.7938906534965648, 0.7951242938927667, 0.796785085578189, 0.7917422172348281, 0.7956343923092691, 0.7913272174355919, 0.7910083729049247, 0.7926321817947433, 0.7915565107683334, 0.7892899937234914, 0.7931340911636477, 0.7927328278313501, 0.7947946186862442, 0.7913678985107556, 0.7955665024630543, 0.7938420475366288, 0.7921701610125256, 0.7901511645354009, 0.7940620951704696, 0.791056714702035, 0.7944477728960487, 0.7959907479611914, 0.7955318971328824, 0.7926010105813062, 0.7955921262571509, 0.7935630915926482, 0.7966567024448798, 0.7897649584841704, 0.7934333876452103, 0.7937532888271805, 0.7949776835244815, 0.7926250493984485, 0.7952917731242855, 0.795681149129425, 0.7964731092809911, 0.7

In [68]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [39]:
column_to_drop_13 = 'Cat_현재 주택의 구조'

In [40]:
# Remove the feature named in `column_to_drop_13` and rebuild X/y.
if column_to_drop_13.startswith('Cat_'):
    # The 'Cat_' case drops every column whose name begins with this prefix.
    # Plain prefix comparison (not a regex) so that names containing regex
    # metacharacters such as "(" or "." are matched literally.
    dropped_cols_13 = [col for col in comp_13.columns if str(col).startswith(column_to_drop_13)]
else:
    dropped_cols_13 = [column_to_drop_13]

comp_14 = comp_13.drop(columns=dropped_cols_13)
X_14 = comp_14.drop(columns='target')
y_14 = comp_14['target']

print(X_14.shape)

(19949, 188)


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [42]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_float`` and ``suggest_int``.

    Returns
    -------
    The value of ``roc_auc_score`` computed from column 1 of ``predict_proba``.
    """
    # Hyperparameters are drawn one at a time, in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    booster.fit(X_train, y_train)
    positive_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Hyperparameter search for the current feature set: a TPE sampler seeded with
# 10 maximizes the value returned by `objective`, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has no intermediate values to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in this notebook; 10 is its
# `early_stopping_rounds` argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [43]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7974194741450017


In [44]:
optuna_14 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [45]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(auc_14)

0.7988223092902896


In [46]:
# bootstrap_auc indexes X_train/y_train with integer position arrays, so both
# must be plain ndarrays. np.asarray is used instead of `.values` so this cell
# can be re-run safely (an ndarray has no `.values` attribute).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [47]:
auc_bootstrap = []

In [48]:
# bootstrap_auc reads the global `rs` for its resampling indices and appends
# every AUC to the global `auc_bootstrap`; reset both here so that re-running
# only this cell does not pile new values on top of a previous run.
auc_bootstrap = []
rs = RandomState(seed = 14)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyperparameters and random_state) keeps `optuna_14`
# fitted on the full training split.
bootstrap_auc(sklearn.base.clone(optuna_14), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78913646, 0.79737851])

In [49]:
t_14 = auc_bootstrap
print(t_14)

[0.7931346194892993, 0.7955102357811719, 0.7913163867597366, 0.7897789591139345, 0.7921738592920858, 0.7953837017876426, 0.7937408731743707, 0.7946104971967041, 0.7946020439862804, 0.7918050879873539, 0.7944364138945421, 0.7881902838799391, 0.7957368874856559, 0.7959730490518667, 0.7937868375060493, 0.7977294676802067, 0.7931626207488276, 0.7955480110652524, 0.791728216605064, 0.7944340364291104, 0.7910107503703563, 0.7914265426580697, 0.791341218065356, 0.7891703279634315, 0.7938029514384195, 0.7910139203242652, 0.7951485968727348, 0.7906797543497052, 0.7926134262341159, 0.7894690961193425, 0.7914019755152761, 0.7901685992818999, 0.795363097087235, 0.7893782241072882, 0.7922227294148476, 0.7910136561614395, 0.7933839891967971, 0.792480552332769, 0.7977722620579764, 0.7966012282514746, 0.7938008381358135, 0.7952278457204565, 0.791118264640432, 0.7953247934775027, 0.7946271394547257, 0.7921376689849596, 0.7894358116032992, 0.7918666379257512, 0.7931010708104305, 0.7954529124479863, 0.79

In [50]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [41]:
column_to_drop_14 = 'Cat_현재 상업시설 접근용이성'

In [42]:
# Remove the feature named in `column_to_drop_14` and rebuild X/y.
if column_to_drop_14.startswith('Cat_'):
    # The 'Cat_' case drops every column whose name begins with this prefix.
    # Plain prefix comparison (not a regex) so that names containing regex
    # metacharacters such as "(" or "." are matched literally.
    dropped_cols_14 = [col for col in comp_14.columns if str(col).startswith(column_to_drop_14)]
else:
    dropped_cols_14 = [column_to_drop_14]

comp_15 = comp_14.drop(columns=dropped_cols_14)
X_15 = comp_15.drop(columns='target')
y_15 = comp_15['target']

print(X_15.shape)

(19949, 184)


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [54]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_float`` and ``suggest_int``.

    Returns
    -------
    The value of ``roc_auc_score`` computed from column 1 of ``predict_proba``.
    """
    # Hyperparameters are drawn one at a time, in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    booster.fit(X_train, y_train)
    positive_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Hyperparameter search for the current feature set: a TPE sampler seeded with
# 10 maximizes the value returned by `objective`, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has no intermediate values to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in this notebook; 10 is its
# `early_stopping_rounds` argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [55]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7954645553323342


In [56]:
optuna_15 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [57]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(auc_15)

0.7984788976168287


In [58]:
# bootstrap_auc indexes X_train/y_train with integer position arrays, so both
# must be plain ndarrays. np.asarray is used instead of `.values` so this cell
# can be re-run safely (an ndarray has no `.values` attribute).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [59]:
auc_bootstrap = []

In [60]:
# bootstrap_auc reads the global `rs` for its resampling indices and appends
# every AUC to the global `auc_bootstrap`; reset both here so that re-running
# only this cell does not pile new values on top of a previous run.
auc_bootstrap = []
rs = RandomState(seed = 15)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyperparameters and random_state) keeps `optuna_15`
# fitted on the full training split.
bootstrap_auc(sklearn.base.clone(optuna_15), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78924215, 0.79739905])

In [61]:
t_15 = auc_bootstrap
print(t_15)

[0.7943674673970241, 0.7942834636184389, 0.7892429727405097, 0.7918130128721261, 0.7924092283698196, 0.7936523786277481, 0.7968173134429292, 0.7959524443514592, 0.7938711054474601, 0.7905254832594734, 0.7927790563258543, 0.7938655580281195, 0.7951895421107243, 0.7941479480888348, 0.7953726069489616, 0.7927201480157146, 0.7872535624998679, 0.7902499614322275, 0.7926142187225931, 0.793539052775506, 0.7913501996014312, 0.7972980897857745, 0.7933773851261536, 0.7926633530081806, 0.7912865363604279, 0.7954373268412678, 0.7907798720606602, 0.7945574004687306, 0.7925471213648554, 0.7899295319246058, 0.7953900416954605, 0.7915208487868588, 0.7911224912456439, 0.7950136096687821, 0.7904966895114678, 0.794995382433806, 0.7956478646133818, 0.7941373815758052, 0.7909856549019111, 0.7926591264029687, 0.7924221723482807, 0.7922171819955072, 0.7926237285843197, 0.7954336285617074, 0.7918888276031133, 0.7930543139902746, 0.7946522349231708, 0.7940369997020242, 0.7958412318018229, 0.7956156367486417, 0

In [62]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [43]:
column_to_drop_15 = 'Cat_현재 공공기관 접근용이성'

In [44]:
# Remove the feature named in `column_to_drop_15` and rebuild X/y.
if column_to_drop_15.startswith('Cat_'):
    # The 'Cat_' case drops every column whose name begins with this prefix.
    # Plain prefix comparison (not a regex) so that names containing regex
    # metacharacters such as "(" or "." are matched literally.
    dropped_cols_15 = [col for col in comp_15.columns if str(col).startswith(column_to_drop_15)]
else:
    dropped_cols_15 = [column_to_drop_15]

comp_16 = comp_15.drop(columns=dropped_cols_15)
X_16 = comp_16.drop(columns='target')
y_16 = comp_16['target']

print(X_16.shape)

(19949, 180)


In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [66]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a gradient-boosting configuration from ``trial``, fits it on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : object exposing ``suggest_float`` and ``suggest_int``.

    Returns
    -------
    The value of ``roc_auc_score`` computed from column 1 of ``predict_proba``.
    """
    # Hyperparameters are drawn one at a time, in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    booster = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    booster.fit(X_train, y_train)
    positive_proba = booster.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

# Hyperparameter search for the current feature set: a TPE sampler seeded with
# 10 maximizes the value returned by `objective`, for at most 200 trials.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never calls trial.report(), so the Hyperband pruner
# presumably has no intermediate values to act on — confirm it is needed.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in this notebook; 10 is its
# `early_stopping_rounds` argument.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [67]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7969006116138523


In [68]:
optuna_16 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [69]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(auc_16)

0.7984921057581157


In [70]:
# bootstrap_auc indexes X_train/y_train with integer position arrays, so both
# must be plain ndarrays. np.asarray is used instead of `.values` so this cell
# can be re-run safely (an ndarray has no `.values` attribute).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [71]:
auc_bootstrap = []

In [72]:
# bootstrap_auc reads the global `rs` for its resampling indices and appends
# every AUC to the global `auc_bootstrap`; reset both here so that re-running
# only this cell does not pile new values on top of a previous run.
auc_bootstrap = []
rs = RandomState(seed = 16)
# bootstrap_auc refits the estimator it receives on every resample. Passing an
# unfitted clone (same hyperparameters and random_state) keeps `optuna_16`
# fitted on the full training split.
bootstrap_auc(sklearn.base.clone(optuna_16), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78909319, 0.79756697])

In [73]:
t_16 = auc_bootstrap
print(t_16)

[0.7909782583427902, 0.7892889370721883, 0.792600218092829, 0.7931927353109618, 0.7964865815851037, 0.7934143679217571, 0.7935245238200903, 0.7906287709243375, 0.7940570760767807, 0.7948876040009045, 0.7906240159934741, 0.7948968496998055, 0.7949990807133663, 0.7934487090891031, 0.7926596547286202, 0.797162838418996, 0.7950091189007444, 0.7952452804669553, 0.7928231715177528, 0.7948680559517998, 0.7926311251434405, 0.7932719841586837, 0.7958058339831738, 0.7922470323948156, 0.792850380288804, 0.7934022164317731, 0.7957363591600044, 0.7900595000348695, 0.7936441895801502, 0.7918793177413866, 0.7945433998389664, 0.7912894421515111, 0.7905415971918435, 0.7918505239933811, 0.7927986043749591, 0.7918185602914667, 0.7970397385422017, 0.7936669075831637, 0.7954463083773429, 0.7928601543133562, 0.7917290090935412, 0.7933197976301425, 0.7933736868465933, 0.796071053460216, 0.7943698448624557, 0.7931583941436158, 0.7958924793900163, 0.7920322680174898, 0.7910630546098527, 0.7935335053561654, 0.7

In [74]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [45]:
column_to_drop_16 = 'Cat_현재 주변도로의 보행 안전'

In [46]:
# Remove the feature named in `column_to_drop_16` and rebuild X/y.
if column_to_drop_16.startswith('Cat_'):
    # The 'Cat_' case drops every column whose name begins with this prefix.
    # Plain prefix comparison (not a regex) so that names containing regex
    # metacharacters such as "(" or "." are matched literally.
    dropped_cols_16 = [col for col in comp_16.columns if str(col).startswith(column_to_drop_16)]
else:
    dropped_cols_16 = [column_to_drop_16]

comp_17 = comp_16.drop(columns=dropped_cols_16)
X_17 = comp_17.drop(columns='target')
y_17 = comp_17['target']

print(X_17.shape)

(19949, 176)


In [58]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [60]:
def objective(trial):
    """Return the validation ROC AUC for one sampled hyper-parameter set.

    A ``GradientBoostingClassifier`` (``random_state=0``) is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``
    using column 1 of ``predict_proba``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float`` and ``suggest_int``; the study in
        this cell passes an Optuna trial.

    Returns
    -------
    The value of ``roc_auc_score(y_val, ...)`` for the fitted model.
    """
    # Suggestions are requested in a fixed order (kept unchanged; a seeded
    # sampler may depend on call order).
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; the callback is built with
# early_stopping_rounds=10 and the search is capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [61]:
# Keep the best validation AUC, then report the winning hyper-parameters and that score.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 139, 'subsample': 0.6, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 9}
0.7971850159765905


In [62]:
# Fit a classifier with the study's best hyper-parameters on the training split.
optuna_17 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [63]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(auc_17)

0.7947769197769199


In [64]:
# Convert the training split to NumPy arrays: bootstrap_auc() indexes rows
# positionally with an integer array. np.asarray leaves an existing array
# unchanged, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [65]:
# Start from an empty list: bootstrap_auc() appends every resample's AUC to this module-level name.
auc_bootstrap = []

In [66]:
# Seed the module-level RandomState that bootstrap_auc() draws resampling indices from.
rs = RandomState(seed = 17)
# bootstrap_auc() refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyper-parameters) and leave optuna_17 as fitted on X_train.
# The cell's value is the (2.5, 97.5) percentile pair of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_17), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78779422, 0.79743562])

In [67]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
# Note: t_17 refers to the same list object as auc_bootstrap (no copy is made).
t_17 = auc_bootstrap
print(t_17)

[0.7945367957683228, 0.7897852990217521, 0.7918666379257512, 0.7913911448394207, 0.7917987480795363, 0.7916790823194764, 0.7921751801062146, 0.7971916321670016, 0.7926210869560624, 0.7903490224918797, 0.7964480138125458, 0.7893721483622962, 0.7907949293417273, 0.7928271339601388, 0.7929689893975609, 0.7918700720424857, 0.7932719841586837, 0.7939902428818685, 0.7937168343572285, 0.7945716652613205, 0.7916695724577498, 0.7867941833459073, 0.7920647600450557, 0.792071628278525, 0.7945066812061885, 0.7907521349639577, 0.7909238408006881, 0.7939022766608973, 0.7945629478880711, 0.7941413440181914, 0.7927172422246314, 0.7929121943900268, 0.7958013432151363, 0.7932355296887317, 0.7907312661007243, 0.7915332644396683, 0.795831986102922, 0.7964015211552157, 0.7930656729917813, 0.7923318286618779, 0.7874506279678694, 0.7964282016006154, 0.7978491334402664, 0.7942887468749538, 0.7963864638741486, 0.7879448766148274, 0.7917065552533533, 0.7936552844188312, 0.7952228266267675, 0.791676440691219, 0.

In [68]:
# Remove this round's split data, study and best validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [47]:
# Feature removed in this round; the 'Cat_' prefix makes the next cell drop every column starting with this name.
column_to_drop_17 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [48]:
# Build the next feature set by removing `column_to_drop_17` from comp_17.
# A name starting with 'Cat_' selects every column sharing that prefix
# (presumably one-hot dummies - confirm against the encoding step); any other
# name is treated as a single column.
if column_to_drop_17.startswith('Cat_'):
    # Match the prefix literally rather than as a regex, so metacharacters such
    # as '(' in a column name cannot change which columns are selected.
    removed_columns_17 = [c for c in comp_17.columns if str(c).startswith(column_to_drop_17)]
else:
    removed_columns_17 = [column_to_drop_17]

comp_18 = comp_17.drop(columns=removed_columns_17)
X_18 = comp_18.drop(columns='target')
y_18 = comp_18['target']

print(X_18.shape)

(19949, 172)


In [72]:
# Stratified hold-out: 20% of all rows become the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_18, y_18,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_18,
)
# ... and 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [74]:
def objective(trial):
    """Return the validation ROC AUC for one sampled hyper-parameter set.

    A ``GradientBoostingClassifier`` (``random_state=0``) is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``
    using column 1 of ``predict_proba``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float`` and ``suggest_int``; the study in
        this cell passes an Optuna trial.

    Returns
    -------
    The value of ``roc_auc_score(y_val, ...)`` for the fitted model.
    """
    # Suggestions are requested in a fixed order (kept unchanged; a seeded
    # sampler may depend on call order).
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; the callback is built with
# early_stopping_rounds=10 and the search is capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [75]:
# Keep the best validation AUC, then report the winning hyper-parameters and that score.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7962653455554285


In [76]:
# Fit a classifier with the study's best hyper-parameters on the training split.
optuna_18 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [77]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(auc_18)

0.798836045757228


In [78]:
# Convert the training split to NumPy arrays: bootstrap_auc() indexes rows
# positionally with an integer array. np.asarray leaves an existing array
# unchanged, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [79]:
# Start from an empty list: bootstrap_auc() appends every resample's AUC to this module-level name.
auc_bootstrap = []

In [80]:
# Seed the module-level RandomState that bootstrap_auc() draws resampling indices from.
rs = RandomState(seed = 18)
# bootstrap_auc() refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyper-parameters) and leave optuna_18 as fitted on X_train.
# The cell's value is the (2.5, 97.5) percentile pair of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_18), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78949281, 0.79753656])

In [81]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
# Note: t_18 refers to the same list object as auc_bootstrap (no copy is made).
t_18 = auc_bootstrap
print(t_18)

[0.7972259733343476, 0.7953646820641895, 0.7927087890142077, 0.7931911503340074, 0.7910458840261796, 0.7944644151540704, 0.7974050757301989, 0.7955041600361796, 0.7937173626828798, 0.7957463973473826, 0.7932001318700825, 0.7974587007838239, 0.792845361195115, 0.7971842356078809, 0.7938526140496585, 0.7939535242490908, 0.7924163607661145, 0.7961429057488171, 0.7923019782625693, 0.7933047403490754, 0.7944945297162047, 0.7937110227750622, 0.7959859930303279, 0.795417778792163, 0.7933430439588074, 0.7933551954487915, 0.7947552584252092, 0.7942002523283311, 0.7930057080303387, 0.7953424923868273, 0.7941360607616765, 0.7921622361277534, 0.791590323610028, 0.7941793834650978, 0.7929906507492714, 0.7934244061091351, 0.792298279983009, 0.7921498204749436, 0.7904224597574352, 0.7932130758485438, 0.7929217042517533, 0.7916690441320984, 0.7958134947051203, 0.7964358623225618, 0.7929523471395392, 0.7934426333441112, 0.793235265525906, 0.7953921549980664, 0.796266533951263, 0.7952962638923229, 0.795

In [82]:
# Remove this round's split data, study and best validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [49]:
# Feature removed in this round; without a 'Cat_' prefix the next cell drops it as a single column.
column_to_drop_18 = '소득 중 근로/사업소득의 비중(월평균)'

In [50]:
# Build the next feature set by removing `column_to_drop_18` from comp_18.
# A name starting with 'Cat_' selects every column sharing that prefix
# (presumably one-hot dummies - confirm against the encoding step); any other
# name is treated as a single column.
if column_to_drop_18.startswith('Cat_'):
    # Match the prefix literally rather than as a regex, so metacharacters such
    # as '(' in a column name cannot change which columns are selected.
    removed_columns_18 = [c for c in comp_18.columns if str(c).startswith(column_to_drop_18)]
else:
    removed_columns_18 = [column_to_drop_18]

comp_19 = comp_18.drop(columns=removed_columns_18)
X_19 = comp_19.drop(columns='target')
y_19 = comp_19['target']

print(X_19.shape)

(19949, 171)


In [57]:
# Stratified hold-out: 20% of all rows become the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_19, y_19,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_19,
)
# ... and 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [59]:
def objective(trial):
    """Return the validation ROC AUC for one sampled hyper-parameter set.

    A ``GradientBoostingClassifier`` (``random_state=0``) is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``
    using column 1 of ``predict_proba``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float`` and ``suggest_int``; the study in
        this cell passes an Optuna trial.

    Returns
    -------
    The value of ``roc_auc_score(y_val, ...)`` for the fitted model.
    """
    # Suggestions are requested in a fixed order (kept unchanged; a seeded
    # sampler may depend on call order).
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; the callback is built with
# early_stopping_rounds=10 and the search is capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [60]:
# Keep the best validation AUC, then report the winning hyper-parameters and that score.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 182, 'subsample': 0.5, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 4}
0.7974830420286907


In [61]:
# Fit a classifier with the study's best hyper-parameters on the training split.
optuna_19 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [62]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(auc_19)

0.7996784610085103


In [63]:
# Convert the training split to NumPy arrays: bootstrap_auc() indexes rows
# positionally with an integer array. np.asarray leaves an existing array
# unchanged, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [64]:
# Start from an empty list: bootstrap_auc() appends every resample's AUC to this module-level name.
auc_bootstrap = []

In [65]:
# Seed the module-level RandomState that bootstrap_auc() draws resampling indices from.
rs = RandomState(seed = 19)
# bootstrap_auc() refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyper-parameters) and leave optuna_19 as fitted on X_train.
# The cell's value is the (2.5, 97.5) percentile pair of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_19), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7894388, 0.7981082])

In [66]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
# Note: t_19 refers to the same list object as auc_bootstrap (no copy is made).
t_19 = auc_bootstrap
print(t_19)

[0.7909576536423827, 0.7916605909216747, 0.79550785831574, 0.7946078555684466, 0.7927595082767496, 0.7959891629842368, 0.7957088862261276, 0.7938396700711972, 0.795156521757507, 0.7948860190239501, 0.7951795039233462, 0.7943378811605413, 0.7926559564490598, 0.7948128459212203, 0.7924417203973855, 0.7952384122334861, 0.794856696950293, 0.7882954206845832, 0.7901839207257927, 0.793797404019079, 0.7943159556460049, 0.7925286299670536, 0.7960869032297604, 0.7944583394090783, 0.7885218082262418, 0.7975014951615937, 0.7966107381132012, 0.7958203629385896, 0.7919532833325937, 0.7950902168882465, 0.7923278662194917, 0.787482327506958, 0.794827374876636, 0.7924911188457987, 0.7910926408463355, 0.7956682051509638, 0.7913459729962193, 0.7952627152134542, 0.7925693110422176, 0.79552212310833, 0.7958134947051203, 0.7936320380901661, 0.7982023191382798, 0.795472460497091, 0.7941236451088668, 0.798079483424311, 0.7943640332802895, 0.7936116975525843, 0.7895575906659651, 0.7946237053379911, 0.79120623

In [67]:
# Remove this round's split data, study and best validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [51]:
# Feature removed in this round; without a 'Cat_' prefix the next cell drops it as a single column.
column_to_drop_19 = '자산 중 부동산 자산의 비중'

In [52]:
# Build the next feature set by removing `column_to_drop_19` from comp_19.
# A name starting with 'Cat_' selects every column sharing that prefix
# (presumably one-hot dummies - confirm against the encoding step); any other
# name is treated as a single column.
if column_to_drop_19.startswith('Cat_'):
    # Match the prefix literally rather than as a regex, so metacharacters such
    # as '(' in a column name cannot change which columns are selected.
    removed_columns_19 = [c for c in comp_19.columns if str(c).startswith(column_to_drop_19)]
else:
    removed_columns_19 = [column_to_drop_19]

comp_20 = comp_19.drop(columns=removed_columns_19)
X_20 = comp_20.drop(columns='target')
y_20 = comp_20['target']

print(X_20.shape)

(19949, 170)


In [70]:
# Stratified hold-out: 20% of all rows become the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_20, y_20,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_20,
)
# ... and 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [71]:
# Display the (rows, columns) shape of the train, test and validation splits.
X_train.shape, X_test.shape, X_val.shape

((12767, 170), (3990, 170), (3192, 170))

In [72]:
def objective(trial):
    """Return the validation ROC AUC for one sampled hyper-parameter set.

    A ``GradientBoostingClassifier`` (``random_state=0``) is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``
    using column 1 of ``predict_proba``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float`` and ``suggest_int``; the study in
        this cell passes an Optuna trial.

    Returns
    -------
    The value of ``roc_auc_score(y_val, ...)`` for the fitted model.
    """
    # Suggestions are requested in a fixed order (kept unchanged; a seeded
    # sampler may depend on call order).
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; the callback is built with
# early_stopping_rounds=10 and the search is capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [73]:
# Keep the best validation AUC, then report the winning hyper-parameters and that score.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 160, 'subsample': 0.6, 'max_depth': 9, 'min_samples_split': 4, 'min_samples_leaf': 8}
0.7982739255686127


In [74]:
# Fit a classifier with the study's best hyper-parameters on the training split.
optuna_20 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [75]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(auc_20)

0.7970019632581209


In [76]:
# Convert the training split to NumPy arrays: bootstrap_auc() indexes rows
# positionally with an integer array. np.asarray leaves an existing array
# unchanged, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [77]:
# Start from an empty list: bootstrap_auc() appends every resample's AUC to this module-level name.
auc_bootstrap = []

In [78]:
# Seed the module-level RandomState that bootstrap_auc() draws resampling indices from.
rs = RandomState(seed = 20)
# bootstrap_auc() refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyper-parameters) and leave optuna_20 as fitted on X_train.
# The cell's value is the (2.5, 97.5) percentile pair of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_20), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78614315, 0.79604128])

In [79]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
# Note: t_20 refers to the same list object as auc_bootstrap (no copy is made).
t_20 = auc_bootstrap
print(t_20)

[0.7901334656260764, 0.7895908751820081, 0.7901730900499373, 0.7872731105489726, 0.790767984733502, 0.7910921125206839, 0.7921992189233569, 0.7928889480613619, 0.7884372761220053, 0.7927927927927928, 0.7908467052555722, 0.7942662930347659, 0.790597863873726, 0.7957648887451843, 0.7843416956717449, 0.7880994118678848, 0.7948633010209365, 0.7927103739911623, 0.7897322022937787, 0.7923030349138723, 0.7960546753650202, 0.791720027557466, 0.7935245238200903, 0.7900555375924834, 0.7913618227657637, 0.7869864938830456, 0.7925600653433166, 0.7888258596386675, 0.7962744588360352, 0.789633405396952, 0.7916365521045323, 0.7922449190922097, 0.7888884662283677, 0.7932059434522489, 0.7858894256677509, 0.7979035509823686, 0.7931512617473209, 0.7915802854226499, 0.793827782744039, 0.7930284260333522, 0.7884169355844233, 0.7904142707098372, 0.7886316999617492, 0.7899855344436624, 0.7914666954075822, 0.7928717774776888, 0.7940459812380996, 0.789987119420617, 0.7948588102528988, 0.7912101933037894, 0.789

In [80]:
# Remove this round's split data, study and best validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [53]:
# Feature removed in this round; the 'Cat_' prefix makes the next cell drop every column starting with this name.
column_to_drop_20 = 'Cat_현재 의료시설 접근용이성'

In [54]:
# Build the next feature set by removing `column_to_drop_20` from comp_20.
# A name starting with 'Cat_' selects every column sharing that prefix
# (presumably one-hot dummies - confirm against the encoding step); any other
# name is treated as a single column.
if column_to_drop_20.startswith('Cat_'):
    # Match the prefix literally rather than as a regex, so metacharacters such
    # as '(' in a column name cannot change which columns are selected.
    removed_columns_20 = [c for c in comp_20.columns if str(c).startswith(column_to_drop_20)]
else:
    removed_columns_20 = [column_to_drop_20]

comp_21 = comp_20.drop(columns=removed_columns_20)
X_21 = comp_21.drop(columns='target')
y_21 = comp_21['target']

print(X_21.shape)

(19949, 166)


In [83]:
# Stratified hold-out: 20% of all rows become the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_21, y_21,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_21,
)
# ... and 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [85]:
def objective(trial):
    """Return the validation ROC AUC for one sampled hyper-parameter set.

    A ``GradientBoostingClassifier`` (``random_state=0``) is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``
    using column 1 of ``predict_proba``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float`` and ``suggest_int``; the study in
        this cell passes an Optuna trial.

    Returns
    -------
    The value of ``roc_auc_score(y_val, ...)`` for the fitted model.
    """
    # Suggestions are requested in a fixed order (kept unchanged; a seeded
    # sampler may depend on call order).
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; the callback is built with
# early_stopping_rounds=10 and the search is capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [86]:
# Keep the best validation AUC, then report the winning hyper-parameters and that score.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.795999929002104


In [87]:
# Fit a classifier with the study's best hyper-parameters on the training split.
optuna_21 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [88]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(auc_21)

0.7984094227936593


In [89]:
# Convert the training split to NumPy arrays: bootstrap_auc() indexes rows
# positionally with an integer array. np.asarray leaves an existing array
# unchanged, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [90]:
# Start from an empty list: bootstrap_auc() appends every resample's AUC to this module-level name.
auc_bootstrap = []

In [91]:
# Seed the module-level RandomState that bootstrap_auc() draws resampling indices from.
rs = RandomState(seed = 21)
# bootstrap_auc() refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyper-parameters) and leave optuna_21 as fitted on X_train.
# The cell's value is the (2.5, 97.5) percentile pair of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_21), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78893993, 0.79717202])

In [92]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
# Note: t_21 refers to the same list object as auc_bootstrap (no copy is made).
t_21 = auc_bootstrap
print(t_21)

[0.7910981882656759, 0.794499284647068, 0.7946667638785865, 0.7930012172623009, 0.7948112609442659, 0.7898389240753773, 0.7937799692725802, 0.792736790273736, 0.7945233234642102, 0.7949198318656447, 0.792963706141046, 0.7934542565084437, 0.7920253997840205, 0.7923410743607788, 0.7888454076877722, 0.7924129266493798, 0.7911016223824106, 0.7945566079802532, 0.7920378154368302, 0.7914386941480537, 0.7966102097875498, 0.7937104944494107, 0.7918798460670382, 0.7954103822330423, 0.7933855741737517, 0.7926044446980407, 0.7915710397237491, 0.7946099688710525, 0.7970608715682608, 0.796510620402246, 0.7936053576447666, 0.7927785280002029, 0.7903331727223353, 0.7942324801930714, 0.792746035972637, 0.7946181579186504, 0.7933055328375526, 0.7935068249107657, 0.79300623635599, 0.7940610385191665, 0.7964094460399879, 0.7898394524010288, 0.7914981307838451, 0.7914524306149922, 0.7936595110240431, 0.7912405720287494, 0.795957727607974, 0.7918156545003835, 0.7932217932217932, 0.7921920865270619, 0.79552

In [93]:
# Remove this round's split data, study and best validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [55]:
# Feature removed in this round; the 'Cat_' prefix makes the next cell drop every column starting with this name.
column_to_drop_21 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [56]:
# Build the next feature set by removing `column_to_drop_21` from comp_21.
# A name starting with 'Cat_' selects every column sharing that prefix
# (presumably one-hot dummies - confirm against the encoding step); any other
# name is treated as a single column.
if column_to_drop_21.startswith('Cat_'):
    # Match the prefix literally rather than as a regex, so metacharacters such
    # as '(' in a column name cannot change which columns are selected.
    removed_columns_21 = [c for c in comp_21.columns if str(c).startswith(column_to_drop_21)]
else:
    removed_columns_21 = [column_to_drop_21]

comp_22 = comp_21.drop(columns=removed_columns_21)
X_22 = comp_22.drop(columns='target')
y_22 = comp_22['target']

print(X_22.shape)

(19949, 162)


In [59]:
# Stratified hold-out: 20% of all rows become the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_22, y_22,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_22,
)
# ... and 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [61]:
def objective(trial):
    """Return the validation ROC AUC for one sampled hyper-parameter set.

    A ``GradientBoostingClassifier`` (``random_state=0``) is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``
    using column 1 of ``predict_proba``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float`` and ``suggest_int``; the study in
        this cell passes an Optuna trial.

    Returns
    -------
    The value of ``roc_auc_score(y_val, ...)`` for the fitted model.
    """
    # Suggestions are requested in a fixed order (kept unchanged; a seeded
    # sampler may depend on call order).
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; the callback is built with
# early_stopping_rounds=10 and the search is capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [62]:
# Keep the best validation AUC, then report the winning hyper-parameters and that score.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7959561744847596


In [63]:
# Fit a classifier with the study's best hyper-parameters on the training split.
optuna_22 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [64]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(auc_22)

0.7978884937013015


In [65]:
# Convert the training split to NumPy arrays: bootstrap_auc() indexes rows
# positionally with an integer array. np.asarray leaves an existing array
# unchanged, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [66]:
# Start from an empty list: bootstrap_auc() appends every resample's AUC to this module-level name.
auc_bootstrap = []

In [67]:
# Seed the module-level RandomState that bootstrap_auc() draws resampling indices from.
rs = RandomState(seed = 22)
# bootstrap_auc() refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyper-parameters) and leave optuna_22 as fitted on X_train.
# The cell's value is the (2.5, 97.5) percentile pair of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_22), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78887535, 0.79698113])

In [68]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
# Note: t_22 refers to the same list object as auc_bootstrap (no copy is made).
t_22 = auc_bootstrap
print(t_22)

[0.7916848939016427, 0.79133910476275, 0.7932722483215093, 0.7909851265762595, 0.7928131333303747, 0.7918777327644322, 0.7933039478605981, 0.7906522814158281, 0.7918180319658152, 0.7929475922086759, 0.7929280441595712, 0.7956188067025505, 0.7900029691901612, 0.792904005342429, 0.7903643439357725, 0.7952009011122312, 0.7895491374555414, 0.7923814912731169, 0.7952856973792934, 0.7961262634907956, 0.7918782610900836, 0.7952328648141456, 0.7945587212828592, 0.7940216782581314, 0.792941516463684, 0.7938375567685911, 0.7935369394729002, 0.7921212908897638, 0.7946311018971117, 0.7921020070034848, 0.7961286409562273, 0.792984839167105, 0.792543423085295, 0.7937651761543387, 0.7928797023624609, 0.7909396905702325, 0.7918254285249359, 0.7963700857789527, 0.7916679874807954, 0.794012960884882, 0.7958190421244608, 0.7942964075969002, 0.7956621294059718, 0.7918204094312469, 0.796142905748817, 0.7926873918253229, 0.7914101645628739, 0.7952167508817755, 0.7924266631163183, 0.7911187929660836, 0.78903

In [69]:
# Remove this round's split data, study and best validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [57]:
# Feature removed in this round; without a 'Cat_' prefix the next cell drops it as a single column.
column_to_drop_22 = '소득 중 정부 보조금의 비중(월평균)'

In [58]:
# Build the next feature set by removing `column_to_drop_22` from comp_22.
# A name starting with 'Cat_' selects every column sharing that prefix
# (presumably one-hot dummies - confirm against the encoding step); any other
# name is treated as a single column.
if column_to_drop_22.startswith('Cat_'):
    # Match the prefix literally rather than as a regex, so metacharacters such
    # as '(' in a column name cannot change which columns are selected.
    removed_columns_22 = [c for c in comp_22.columns if str(c).startswith(column_to_drop_22)]
else:
    removed_columns_22 = [column_to_drop_22]

comp_23 = comp_22.drop(columns=removed_columns_22)
X_23 = comp_23.drop(columns='target')
y_23 = comp_23['target']

print(X_23.shape)

(19949, 161)


In [59]:
# Stratified hold-out: 20% of all rows become the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_23, y_23,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_23,
)
# ... and 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [61]:
def objective(trial):
    """Return the validation ROC AUC for one sampled hyper-parameter set.

    A ``GradientBoostingClassifier`` (``random_state=0``) is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``
    using column 1 of ``predict_proba``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float`` and ``suggest_int``; the study in
        this cell passes an Optuna trial.

    Returns
    -------
    The value of ``roc_auc_score(y_val, ...)`` for the fitted model.
    """
    # Suggestions are requested in a fixed order (kept unchanged; a seeded
    # sampler may depend on call order).
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; the callback is built with
# early_stopping_rounds=10 and the search is capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [62]:
# Keep the best validation AUC, then report the winning hyper-parameters and that score.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7952490849733366


In [63]:
# Fit a classifier with the study's best hyper-parameters on the training split.
optuna_23 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [64]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(auc_23)

0.797918608263436


In [65]:
# Convert the training split to NumPy arrays: bootstrap_auc() indexes rows
# positionally with an integer array. np.asarray leaves an existing array
# unchanged, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [66]:
# Start from an empty list: bootstrap_auc() appends every resample's AUC to this module-level name.
auc_bootstrap = []

In [67]:
# Seed the module-level RandomState that bootstrap_auc() draws resampling indices from.
rs = RandomState(seed = 23)
# bootstrap_auc() refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyper-parameters) and leave optuna_23 as fitted on X_train.
# The cell's value is the (2.5, 97.5) percentile pair of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_23), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78865638, 0.79701833])

In [68]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
# Note: t_23 refers to the same list object as auc_bootstrap (no copy is made).
t_23 = auc_bootstrap
print(t_23)

[0.7939646190877717, 0.792492703822753, 0.792291147586714, 0.7928519652657583, 0.7907093405861878, 0.7934294252028242, 0.7906200535510881, 0.7909772016914873, 0.7907014157014157, 0.7925645561113542, 0.7957186602506798, 0.7911922302316391, 0.7929893299351427, 0.7929087602732923, 0.7936095842499784, 0.7923643206894437, 0.7910852442872147, 0.7902444140128869, 0.795032101066584, 0.7960961489286613, 0.7906543947184341, 0.7914511098008635, 0.7925769717641639, 0.7946152521275673, 0.7914777902462632, 0.793307117814507, 0.7891573839849702, 0.7950194212509483, 0.7933908574302664, 0.7920695149759189, 0.7906993023988098, 0.7961407924462112, 0.7936888330977001, 0.7904100441046253, 0.791211514117918, 0.794022206583783, 0.7909824849480022, 0.7918346742238367, 0.7974679464827248, 0.7939102015456696, 0.7935099948646747, 0.7910289776053323, 0.7919015074187489, 0.7903944584979068, 0.7927090531770334, 0.7954140805126027, 0.793424670271961, 0.7920597409513666, 0.7949132277950012, 0.7929436297662897, 0.7894

In [69]:
# Remove this round's split data, study and best validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [59]:
# Feature removed in this round; the 'Cat_' prefix makes the next cell drop every column starting with this name.
column_to_drop_23 = 'Cat_현재 주차시설 이용편의성'

In [60]:
# Build the next feature set by removing `column_to_drop_23` from comp_23.
# A name starting with 'Cat_' selects every column sharing that prefix
# (presumably one-hot dummies - confirm against the encoding step); any other
# name is treated as a single column.
if column_to_drop_23.startswith('Cat_'):
    # Match the prefix literally rather than as a regex, so metacharacters such
    # as '(' in a column name cannot change which columns are selected.
    removed_columns_23 = [c for c in comp_23.columns if str(c).startswith(column_to_drop_23)]
else:
    removed_columns_23 = [column_to_drop_23]

comp_24 = comp_23.drop(columns=removed_columns_23)
X_24 = comp_24.drop(columns='target')
y_24 = comp_24['target']

print(X_24.shape)

(19949, 157)


In [72]:
# Stratified hold-out: 20% of all rows become the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_24, y_24,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_24,
)
# ... and 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [74]:
def objective(trial):
    """Return the validation ROC AUC for one sampled hyper-parameter set.

    A ``GradientBoostingClassifier`` (``random_state=0``) is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``
    using column 1 of ``predict_proba``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float`` and ``suggest_int``; the study in
        this cell passes an Optuna trial.

    Returns
    -------
    The value of ``roc_auc_score(y_val, ...)`` for the fitted model.
    """
    # Suggestions are requested in a fixed order (kept unchanged; a seeded
    # sampler may depend on call order).
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; the callback is built with
# early_stopping_rounds=10 and the search is capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [75]:
# Keep the best validation AUC, then report the winning hyper-parameters and that score.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7966508806422172


In [76]:
# Fit a classifier with the study's best hyper-parameters on the training split.
optuna_24 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [77]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(auc_24)

0.7982176405821726


In [78]:
# Convert the training split to NumPy arrays: bootstrap_auc() indexes rows
# positionally with an integer array. np.asarray leaves an existing array
# unchanged, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [79]:
# Start from an empty list: bootstrap_auc() appends every resample's AUC to this module-level name.
auc_bootstrap = []

In [80]:
# Seed the module-level RandomState that bootstrap_auc() draws resampling indices from.
rs = RandomState(seed = 24)
# bootstrap_auc() refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyper-parameters) and leave optuna_24 as fitted on X_train.
# The cell's value is the (2.5, 97.5) percentile pair of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_24), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78879168, 0.79707531])

In [81]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
# Note: t_24 refers to the same list object as auc_bootstrap (no copy is made).
t_24 = auc_bootstrap
print(t_24)

[0.7930627672006981, 0.7930614463865696, 0.7917905590319383, 0.7950878394228147, 0.7968978831047796, 0.7904546876221753, 0.7920584201372379, 0.7908633475135938, 0.7945912133104251, 0.7923276020566661, 0.7908472335812239, 0.795670054290744, 0.7949502105906047, 0.793835707628811, 0.7905022369308083, 0.7920013609668781, 0.7980892574488634, 0.7934962583977361, 0.793818537045138, 0.7921521979403754, 0.7964371831366904, 0.7957141694826424, 0.7928530219170613, 0.7939767705777558, 0.7945626837252453, 0.7917313865589728, 0.7933898007789635, 0.7959339529536573, 0.7931755647272889, 0.790938105593278, 0.7941989315142024, 0.7917369339783134, 0.7937044187044187, 0.7954938576859759, 0.7909431246869669, 0.7934600680906101, 0.7909394264074068, 0.7954703471944851, 0.7912078158383578, 0.789749108714626, 0.7939693740186351, 0.7908477619068752, 0.7947029541857128, 0.7934970508862134, 0.7951396153366597, 0.7913190283879938, 0.7906831884664396, 0.7908134207395292, 0.7896915212186149, 0.7898307350277793, 0.79

In [82]:
# Remove this round's split data, study and best validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [61]:
# Feature removed in this round; the 'Cat_' prefix makes the next cell drop every column starting with this name.
column_to_drop_24 = 'Cat_현재 대중교통 접근용이성'

In [62]:
# Build the next feature set by removing `column_to_drop_24` from comp_24.
# A name starting with 'Cat_' selects every column sharing that prefix
# (presumably one-hot dummies - confirm against the encoding step); any other
# name is treated as a single column.
if column_to_drop_24.startswith('Cat_'):
    # Match the prefix literally rather than as a regex, so metacharacters such
    # as '(' in a column name cannot change which columns are selected.
    removed_columns_24 = [c for c in comp_24.columns if str(c).startswith(column_to_drop_24)]
else:
    removed_columns_24 = [column_to_drop_24]

comp_25 = comp_24.drop(columns=removed_columns_24)
X_25 = comp_25.drop(columns='target')
y_25 = comp_25['target']

print(X_25.shape)

(19949, 153)


In [63]:
# Stratified hold-out: 20% of all rows become the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_25, y_25,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_25,
)
# ... and 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [65]:
def objective(trial):
    """Return the validation ROC AUC for one sampled hyper-parameter set.

    A ``GradientBoostingClassifier`` (``random_state=0``) is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``
    using column 1 of ``predict_proba``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float`` and ``suggest_int``; the study in
        this cell passes an Optuna trial.

    Returns
    -------
    The value of ``roc_auc_score(y_val, ...)`` for the fitted model.
    """
    # Suggestions are requested in a fixed order (kept unchanged; a seeded
    # sampler may depend on call order).
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Maximise validation AUC with a seeded TPE sampler; the callback is built with
# early_stopping_rounds=10 and the search is capped at 200 trials.
direction = "maximize"
sampler = TPESampler(seed=10)
study = create_study(
    direction=direction,
    sampler=sampler,
    pruner=HyperbandPruner(),
)
early_stopping = EarlyStoppingCallback(early_stopping_rounds=10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [66]:
# Keep the best validation AUC, then report the winning hyper-parameters and that score.
optuna_auc = study.best_value
print(study.best_params)
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7951285536614069


In [67]:
# Fit a classifier with the study's best hyper-parameters on the training split.
optuna_25 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [68]:
# Score the held-out test split: ROC AUC of column 1 of predict_proba.
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(auc_25)

0.7988146485683432


In [69]:
# Convert the training split to NumPy arrays: bootstrap_auc() indexes rows
# positionally with an integer array. np.asarray leaves an existing array
# unchanged, so re-running this cell does not raise AttributeError.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [70]:
# Start from an empty list: bootstrap_auc() appends every resample's AUC to this module-level name.
auc_bootstrap = []

In [71]:
# Seed the module-level RandomState that bootstrap_auc() draws resampling indices from.
rs = RandomState(seed = 25)
# bootstrap_auc() refits the estimator it receives on every resample, so pass an
# unfitted clone (same hyper-parameters) and leave optuna_25 as fitted on X_train.
# The cell's value is the (2.5, 97.5) percentile pair of the bootstrap AUCs.
bootstrap_auc(sklearn.base.clone(optuna_25), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78858722, 0.79703631])

In [72]:
# Keep this round's bootstrap AUCs under a round-specific name and print them.
# Note: t_25 refers to the same list object as auc_bootstrap (no copy is made).
t_25 = auc_bootstrap
print(t_25)

[0.795708622063302, 0.7936418121147186, 0.7957577563488893, 0.7941009271058532, 0.7907782870837058, 0.7946617447848975, 0.7942932376429913, 0.7931343553264736, 0.7920647600450558, 0.7912659316600203, 0.7959675016325263, 0.7913074052236614, 0.7899488158108847, 0.7946694055068438, 0.7961965308024421, 0.7922583913963224, 0.7929872166325368, 0.7898357541214684, 0.7939178622676158, 0.7935535817309216, 0.7948836415585184, 0.7888845037859816, 0.7945130211140063, 0.7936101125756297, 0.793667700071641, 0.7944374705458449, 0.7909486721063076, 0.7945597779341622, 0.7936394346492868, 0.79065650802104, 0.7939989602551178, 0.7907088122605365, 0.7956716392676983, 0.7922597122104511, 0.7930894476460979, 0.7935165989353181, 0.7918938466968024, 0.7907016798642414, 0.7895900826935309, 0.7872929227609031, 0.789229500436397, 0.7929148360182844, 0.7952796216343013, 0.7961595480068386, 0.7928907972011419, 0.795171579038574, 0.7961487173309834, 0.7924813448212462, 0.792245711580687, 0.7939471843412731, 0.7939

In [73]:
# Remove this round's split data, study and best validation score from the namespace.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [63]:
# Feature removed in this round; the 'Cat_' prefix makes the next cell drop every column starting with this name.
column_to_drop_25 = 'Cat_현재 교육환경'

In [64]:
# Build the next feature set by removing `column_to_drop_25` from comp_25.
# A name starting with 'Cat_' selects every column sharing that prefix
# (presumably one-hot dummies - confirm against the encoding step); any other
# name is treated as a single column.
if column_to_drop_25.startswith('Cat_'):
    # Match the prefix literally rather than as a regex, so metacharacters such
    # as '(' in a column name cannot change which columns are selected.
    removed_columns_25 = [c for c in comp_25.columns if str(c).startswith(column_to_drop_25)]
else:
    removed_columns_25 = [column_to_drop_25]

comp_26 = comp_25.drop(columns=removed_columns_25)
X_26 = comp_26.drop(columns='target')
y_26 = comp_26['target']

print(X_26.shape)

(19949, 149)


In [76]:
# Stratified hold-out: 20% of all rows become the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_26, y_26,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_26,
)
# ... and 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=0, shuffle=True, stratify=y_train,
)

In [77]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled GradientBoostingClassifier.

    Hyperparameters are drawn from ``trial``; the model is fit on the global
    ``X_train``/``y_train`` and scored on the global ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.

    Returns:
        ROC AUC computed from the class-1 probabilities on the validation split.
    """
    sampled = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# probably has nothing to act on — confirm whether it is needed.
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [78]:
# Report the best trial's hyperparameters and keep its objective value (validation ROC AUC) as optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7956961240514867


In [79]:
# Train the round's final model with the best hyperparameters on the training split only.
optuna_26 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [80]:
# Score the final model on the held-out test split using the class-1 probability column.
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(auc_26)

0.7994961886587503


In [81]:
# bootstrap_auc() indexes X_train[idx] / y_train[idx] by position, so both are converted to
# NumPy arrays. np.asarray keeps the cell safe to re-run: on an ndarray it is a no-op,
# whereas `.values` raises AttributeError the second time.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [82]:
# Start a fresh list for this round; bootstrap_auc() appends every resample's AUC to this global.
auc_bootstrap = []

In [83]:
# bootstrap_auc() reads the global `rs` as its resampling RNG, so it is seeded here first.
rs = RandomState(seed = 26)
# Refits optuna_26 in place on 2000 resamples (with replacement) of the training rows, scores each
# refit on the test split, and displays the 2.5th/97.5th percentiles of those ROC AUCs.
bootstrap_auc(optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78725194, 0.79604122])

In [84]:
# Keep this round's bootstrap AUCs under a round-specific name. This is an alias, not a copy;
# it stays intact as long as auc_bootstrap is rebound (not appended to) before the next run.
# NOTE(review): print() dumps every bootstrap value into the saved output.
t_26 = auc_bootstrap
print(t_26)

[0.7876894311623375, 0.7914215235643807, 0.7915470009066068, 0.7897982430002134, 0.7954809137075146, 0.7940879831273919, 0.7870625727768585, 0.7921078185856512, 0.7924718349595197, 0.7915834553765587, 0.7945267575809448, 0.7918703362053114, 0.7926556922862341, 0.7911420392947486, 0.7883519515292915, 0.7905215208170873, 0.7911290953162875, 0.7893280331703977, 0.7921276307975815, 0.7948424321577031, 0.7881712641564858, 0.7912374020748404, 0.7868567899356077, 0.7910638470983298, 0.7918185602914667, 0.7886042270278724, 0.7918938466968024, 0.7932891547423566, 0.7938660863537711, 0.7900838030148375, 0.7915475292322584, 0.7939574866914769, 0.7951839946913839, 0.7937524963387033, 0.7917519912593804, 0.7872168438670901, 0.7917467080028656, 0.7926345592601751, 0.7916378729186612, 0.7914862434566868, 0.7910017688342811, 0.7912519310302562, 0.789614385673499, 0.7872604307333372, 0.7903669855640298, 0.7886010570739634, 0.7909835415993052, 0.7907331152405044, 0.7911880036264273, 0.7903513999573113, 

In [85]:
# Unbind this round's splits, Optuna study and optuna_auc before the next round starts.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [65]:
# Next feature to eliminate; the 'Cat_' prefix makes the following cell drop every column
# whose name starts with this string.
column_to_drop_26 = 'Cat_현재 문화시설 접근용이성'

In [66]:
# Build the next feature set by removing column_to_drop_26 from comp_26.
if column_to_drop_26.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns (presumably one-hot
    # dummies — confirm). Match the prefix literally with str.startswith rather than a
    # regex so characters like '(' or '.' in a name are never read as pattern syntax.
    cols_to_drop_26 = [col for col in comp_26.columns if str(col).startswith(column_to_drop_26)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_26 = [column_to_drop_26]

comp_27 = comp_26.drop(columns=cols_to_drop_26)
X_27 = comp_27.drop(columns='target')
y_27 = comp_27['target']

print(X_27.shape)

(19949, 145)


In [67]:
# Stratified 80/20 split into (train+val) and test, then a stratified 80/20 split of the
# remainder into train and val; random_state=0 keeps both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [69]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled GradientBoostingClassifier.

    Hyperparameters are drawn from ``trial``; the model is fit on the global
    ``X_train``/``y_train`` and scored on the global ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.

    Returns:
        ROC AUC computed from the class-1 probabilities on the validation split.
    """
    sampled = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# probably has nothing to act on — confirm whether it is needed.
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [70]:
# Report the best trial's hyperparameters and keep its objective value (validation ROC AUC) as optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.796919599423266


In [71]:
# Train the round's final model with the best hyperparameters on the training split only.
optuna_27 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [72]:
# Score the final model on the held-out test split using the class-1 probability column.
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(auc_27)

0.7997437092264678


In [73]:
# bootstrap_auc() indexes X_train[idx] / y_train[idx] by position, so both are converted to
# NumPy arrays. np.asarray keeps the cell safe to re-run: on an ndarray it is a no-op,
# whereas `.values` raises AttributeError the second time.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [74]:
# Start a fresh list for this round; bootstrap_auc() appends every resample's AUC to this global.
auc_bootstrap = []

In [75]:
# bootstrap_auc() reads the global `rs` as its resampling RNG, so it is seeded here first.
rs = RandomState(seed = 27)
# Refits optuna_27 in place on 2000 resamples (with replacement) of the training rows, scores each
# refit on the test split, and displays the 2.5th/97.5th percentiles of those ROC AUCs.
bootstrap_auc(optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7889527 , 0.79690765])

In [76]:
# Keep this round's bootstrap AUCs under a round-specific name. This is an alias, not a copy;
# it stays intact as long as auc_bootstrap is rebound (not appended to) before the next run.
# NOTE(review): print() dumps every bootstrap value into the saved output.
t_27 = auc_bootstrap
print(t_27)

[0.7892466710200701, 0.7917324432102757, 0.7945727219126234, 0.7920980445610987, 0.7924639100747475, 0.7891399492384714, 0.791921583793505, 0.792742073530251, 0.7919852470345081, 0.791237930400492, 0.7912849513834737, 0.7917990122423622, 0.793257455203268, 0.7875753128216182, 0.7899157954576674, 0.7956357131233978, 0.7937728368762851, 0.7926031238839121, 0.7931797913325007, 0.7924678725171337, 0.7915546616285533, 0.7919020357444002, 0.7934851635590552, 0.7950392334628789, 0.7933776492889794, 0.7935821113161015, 0.7924499094449833, 0.794796996151676, 0.7923326211503552, 0.7922132195531211, 0.7921022711663106, 0.7908371953938457, 0.7925526687841958, 0.79543098693345, 0.7981864693687353, 0.7961991724306996, 0.793235265525906, 0.7944749816670998, 0.7954774795907801, 0.7931359403034279, 0.795618278376899, 0.7890871166733235, 0.793455048996921, 0.7908939904013794, 0.7921500846377694, 0.790787268619781, 0.7897480520633231, 0.7915713038865747, 0.7956703184535697, 0.7933293074918691, 0.79397175

In [77]:
# Unbind this round's splits, Optuna study and optuna_auc before the next round starts.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [67]:
# Next feature to eliminate; the 'Cat_' prefix makes the following cell drop every column
# whose name starts with this string.
column_to_drop_27  = 'Cat_이사 계획 중인 주택의 유형'

In [68]:
# Build the next feature set by removing column_to_drop_27 from comp_27.
if column_to_drop_27.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns (presumably one-hot
    # dummies — confirm). Match the prefix literally with str.startswith rather than a
    # regex so characters like '(' or '.' in a name are never read as pattern syntax.
    cols_to_drop_27 = [col for col in comp_27.columns if str(col).startswith(column_to_drop_27)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_27 = [column_to_drop_27]

comp_28 = comp_27.drop(columns=cols_to_drop_27)
X_28 = comp_28.drop(columns='target')
y_28 = comp_28['target']

print(X_28.shape)

(19949, 126)


In [80]:
# Stratified 80/20 split into (train+val) and test, then a stratified 80/20 split of the
# remainder into train and val; random_state=0 keeps both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [81]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled GradientBoostingClassifier.

    Hyperparameters are drawn from ``trial``; the model is fit on the global
    ``X_train``/``y_train`` and scored on the global ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.

    Returns:
        ROC AUC computed from the class-1 probabilities on the validation split.
    """
    sampled = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# probably has nothing to act on — confirm whether it is needed.
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [82]:
# Report the best trial's hyperparameters and keep its objective value (validation ROC AUC) as optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7954269924919725


In [83]:
# Train the round's final model with the best hyperparameters on the training split only.
optuna_28 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [84]:
# Score the final model on the held-out test split using the class-1 probability column.
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(auc_28)

0.7988022329155334


In [85]:
# bootstrap_auc() indexes X_train[idx] / y_train[idx] by position, so both are converted to
# NumPy arrays. np.asarray keeps the cell safe to re-run: on an ndarray it is a no-op,
# whereas `.values` raises AttributeError the second time.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [86]:
# Start a fresh list for this round; bootstrap_auc() appends every resample's AUC to this global.
auc_bootstrap = []

In [87]:
# bootstrap_auc() reads the global `rs` as its resampling RNG, so it is seeded here first.
rs = RandomState(seed = 28)
# Refits optuna_28 in place on 2000 resamples (with replacement) of the training rows, scores each
# refit on the test split, and displays the 2.5th/97.5th percentiles of those ROC AUCs.
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78885267, 0.79680916])

In [88]:
# Keep this round's bootstrap AUCs under a round-specific name. This is an alias, not a copy;
# it stays intact as long as auc_bootstrap is rebound (not appended to) before the next run.
# NOTE(review): print() dumps every bootstrap value into the saved output.
t_28 = auc_bootstrap
print(t_28)

[0.7916621758986291, 0.7931504692588436, 0.795831986102922, 0.7895063430777718, 0.7923088464960386, 0.7913892956996404, 0.7948939439087221, 0.7916690441320983, 0.7879203094720336, 0.792374358876822, 0.791984454546031, 0.7922248427174535, 0.7920090216888246, 0.7923484709198995, 0.7926733911955587, 0.7909085193567953, 0.7935020699799025, 0.7917287449307154, 0.7942676138488947, 0.7921477071723376, 0.7937091736352819, 0.7923313003362265, 0.7939242021754338, 0.7922174461583329, 0.7954864611268552, 0.7934323309939074, 0.7892154998066327, 0.7924335313497874, 0.7936138108551901, 0.7933956123611297, 0.7949845517579508, 0.792230390136794, 0.7925521404585444, 0.7934587472764814, 0.7944953222046818, 0.7982855304283877, 0.7863480123332339, 0.789284182141325, 0.7943967894706812, 0.7925561029009305, 0.7902198468700932, 0.7895055505892945, 0.7931132223004143, 0.7943532026044341, 0.793849972421401, 0.7939168056163131, 0.7888369544773486, 0.7956048060727863, 0.7929800842362419, 0.7932735691356382, 0.792

In [89]:
# Unbind this round's splits, Optuna study and optuna_auc before the next round starts.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [69]:
# Next feature to eliminate; the 'Cat_' prefix makes the following cell drop every column
# whose name starts with this string.
column_to_drop_28 = 'Cat_현재 청소/쓰레기 처리상태'

In [70]:
# Build the next feature set by removing column_to_drop_28 from comp_28.
if column_to_drop_28.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns (presumably one-hot
    # dummies — confirm). Match the prefix literally with str.startswith rather than a
    # regex so characters like '(' or '.' in a name are never read as pattern syntax.
    cols_to_drop_28 = [col for col in comp_28.columns if str(col).startswith(column_to_drop_28)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_28 = [column_to_drop_28]

comp_29 = comp_28.drop(columns=cols_to_drop_28)
X_29 = comp_29.drop(columns='target')
y_29 = comp_29['target']

print(X_29.shape)

(19949, 122)


In [71]:
# Stratified 80/20 split into (train+val) and test, then a stratified 80/20 split of the
# remainder into train and val; random_state=0 keeps both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [73]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled GradientBoostingClassifier.

    Hyperparameters are drawn from ``trial``; the model is fit on the global
    ``X_train``/``y_train`` and scored on the global ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.

    Returns:
        ROC AUC computed from the class-1 probabilities on the validation split.
    """
    sampled = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# probably has nothing to act on — confirm whether it is needed.
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [74]:
# Report the best trial's hyperparameters and keep its objective value (validation ROC AUC) as optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7964283930493059


In [75]:
# Train the round's final model with the best hyperparameters on the training split only.
optuna_29 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [76]:
# Score the final model on the held-out test split using the class-1 probability column.
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(auc_29)

0.7988820100889066


In [77]:
# bootstrap_auc() indexes X_train[idx] / y_train[idx] by position, so both are converted to
# NumPy arrays. np.asarray keeps the cell safe to re-run: on an ndarray it is a no-op,
# whereas `.values` raises AttributeError the second time.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [78]:
# Start a fresh list for this round; bootstrap_auc() appends every resample's AUC to this global.
auc_bootstrap = []

In [79]:
# bootstrap_auc() reads the global `rs` as its resampling RNG, so it is seeded here first.
rs = RandomState(seed = 29)
# Refits optuna_29 in place on 2000 resamples (with replacement) of the training rows, scores each
# refit on the test split, and displays the 2.5th/97.5th percentiles of those ROC AUCs.
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78893571, 0.79718989])

In [80]:
# Keep this round's bootstrap AUCs under a round-specific name. This is an alias, not a copy;
# it stays intact as long as auc_bootstrap is rebound (not appended to) before the next run.
# NOTE(review): print() dumps every bootstrap value into the saved output.
t_29 = auc_bootstrap
print(t_29)

[0.7907487008472229, 0.7929676685834322, 0.7956851115718111, 0.7928295114255706, 0.7969618105086085, 0.789119080375238, 0.7938058572295026, 0.7941278717140787, 0.7890282083631837, 0.7924913830086244, 0.7943196539255654, 0.795104745843662, 0.7926023313954349, 0.7989261252808051, 0.7927457718098112, 0.7971842356078809, 0.7926115770943356, 0.793873218750066, 0.7938259336042587, 0.7971107983423255, 0.7949734569192698, 0.7955020467335738, 0.7937421939884993, 0.7928847214561501, 0.7923104314729931, 0.7920253997840205, 0.7945978173810686, 0.7933438364472847, 0.7918061446386568, 0.7934846352334037, 0.7911174721519549, 0.7846412563161331, 0.7943841096550456, 0.791005995439493, 0.7889933388701862, 0.7906002413391576, 0.7907975709699848, 0.788118431591338, 0.7937009845876841, 0.7900824822007088, 0.7929639703038718, 0.7933850458481002, 0.7948997554908885, 0.7943854304691743, 0.792351905036634, 0.7940018660462009, 0.789502908961037, 0.7924678725171336, 0.7923511125481568, 0.794678387042919, 0.79669

In [81]:
# Unbind this round's splits, Optuna study and optuna_auc before the next round starts.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [71]:
# Next feature to eliminate; without the 'Cat_' prefix the following cell drops this single
# column by name.
column_to_drop_29 = '현재 무주택 기간(총 개월)'

In [72]:
# Build the next feature set by removing column_to_drop_29 from comp_29.
if column_to_drop_29.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns (presumably one-hot
    # dummies — confirm). Match the prefix literally with str.startswith rather than a
    # regex so characters like '(' or '.' in a name are never read as pattern syntax.
    cols_to_drop_29 = [col for col in comp_29.columns if str(col).startswith(column_to_drop_29)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_29 = [column_to_drop_29]

comp_30 = comp_29.drop(columns=cols_to_drop_29)
X_30 = comp_30.drop(columns='target')
y_30 = comp_30['target']

print(X_30.shape)

(19949, 121)


In [84]:
# Stratified 80/20 split into (train+val) and test, then a stratified 80/20 split of the
# remainder into train and val; random_state=0 keeps both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [85]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled GradientBoostingClassifier.

    Hyperparameters are drawn from ``trial``; the model is fit on the global
    ``X_train``/``y_train`` and scored on the global ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.

    Returns:
        ROC AUC computed from the class-1 probabilities on the validation split.
    """
    sampled = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# probably has nothing to act on — confirm whether it is needed.
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [86]:
# Report the best trial's hyperparameters and keep its objective value (validation ROC AUC) as optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.08, 'n_estimators': 84, 'subsample': 0.6, 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 6}
0.7970657230000574


In [87]:
# Train the round's final model with the best hyperparameters on the training split only.
optuna_30 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [89]:
# Score the final model on the held-out test split using the class-1 probability column.
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(auc_30)

0.7953385299444413


In [90]:
# bootstrap_auc() indexes X_train[idx] / y_train[idx] by position, so both are converted to
# NumPy arrays. np.asarray keeps the cell safe to re-run: on an ndarray it is a no-op,
# whereas `.values` raises AttributeError the second time.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [91]:
# Start a fresh list for this round; bootstrap_auc() appends every resample's AUC to this global.
auc_bootstrap = []

In [92]:
# bootstrap_auc() reads the global `rs` as its resampling RNG, so it is seeded here first.
rs = RandomState(seed = 30)
# Refits optuna_30 in place on 2000 resamples (with replacement) of the training rows, scores each
# refit on the test split, and displays the 2.5th/97.5th percentiles of those ROC AUCs.
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78645994, 0.79526931])

In [93]:
# Keep this round's bootstrap AUCs under a round-specific name. This is an alias, not a copy;
# it stays intact as long as auc_bootstrap is rebound (not appended to) before the next run.
# NOTE(review): print() dumps every bootstrap value into the saved output.
t_30 = auc_bootstrap
print(t_30)

[0.7901781091436264, 0.7900499901731429, 0.7850707850707851, 0.791471978664097, 0.7922470323948155, 0.7932204724076646, 0.7905315590044654, 0.7929092885989437, 0.7943201822512168, 0.7918742986476976, 0.7935282220996507, 0.7879921617606347, 0.7916920262979376, 0.7923429235005589, 0.795966709144049, 0.7906654895571151, 0.7879546506393796, 0.7909145951017873, 0.7890747010205139, 0.7923262812425373, 0.7888773713896866, 0.7908136849023549, 0.7882338707461859, 0.790179429957755, 0.7880407677205705, 0.7895800445061527, 0.7893589402210092, 0.7936056218075922, 0.7866026652972464, 0.7884367477963536, 0.7895779312035469, 0.7932836073230161, 0.7866341006735095, 0.789361053523615, 0.7911475867140891, 0.7888498984558099, 0.790104407715245, 0.7883350451084442, 0.7931708097964256, 0.7857314562979588, 0.7904203464548291, 0.7913895598624663, 0.7922745053286926, 0.7926408991679927, 0.7933271941892632, 0.7937033620531158, 0.7894257734159212, 0.7908025900636738, 0.7930212936370571, 0.7903289461171236, 0.79

In [94]:
# Unbind this round's splits, Optuna study and optuna_auc before the next round starts.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [73]:
# Next feature to eliminate; the 'Cat_' prefix makes the following cell drop every column
# whose name starts with this string.
column_to_drop_30 = 'Cat_현재 대기오염 정도'

In [74]:
# Build the next feature set by removing column_to_drop_30 from comp_30.
if column_to_drop_30.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns (presumably one-hot
    # dummies — confirm). Match the prefix literally with str.startswith rather than a
    # regex so characters like '(' or '.' in a name are never read as pattern syntax.
    cols_to_drop_30 = [col for col in comp_30.columns if str(col).startswith(column_to_drop_30)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_30 = [column_to_drop_30]

comp_31 = comp_30.drop(columns=cols_to_drop_30)
X_31 = comp_31.drop(columns='target')
y_31 = comp_31['target']

print(X_31.shape)

(19949, 117)


In [97]:
# Stratified 80/20 split into (train+val) and test, then a stratified 80/20 split of the
# remainder into train and val; random_state=0 keeps both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [99]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled GradientBoostingClassifier.

    Hyperparameters are drawn from ``trial``; the model is fit on the global
    ``X_train``/``y_train`` and scored on the global ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.

    Returns:
        ROC AUC computed from the class-1 probabilities on the validation split.
    """
    sampled = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# probably has nothing to act on — confirm whether it is needed.
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [100]:
# Report the best trial's hyperparameters and keep its objective value (validation ROC AUC) as optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7952218415946127


In [101]:
# Train the round's final model with the best hyperparameters on the training split only.
optuna_31= GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [102]:
# Score the final model on the held-out test split using the class-1 probability column.
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(auc_31)

0.7987060776469643


In [103]:
# bootstrap_auc() indexes X_train[idx] / y_train[idx] by position, so both are converted to
# NumPy arrays. np.asarray keeps the cell safe to re-run: on an ndarray it is a no-op,
# whereas `.values` raises AttributeError the second time.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [104]:
# Start a fresh list for this round; bootstrap_auc() appends every resample's AUC to this global.
auc_bootstrap = []

In [105]:
# bootstrap_auc() reads the global `rs` as its resampling RNG, so it is seeded here first.
rs = RandomState(seed = 31)
# Refits optuna_31 in place on 2000 resamples (with replacement) of the training rows, scores each
# refit on the test split, and displays the 2.5th/97.5th percentiles of those ROC AUCs.
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78885724, 0.79688299])

In [106]:
# Keep this round's bootstrap AUCs under a round-specific name. This is an alias, not a copy;
# it stays intact as long as auc_bootstrap is rebound (not appended to) before the next run.
# NOTE(review): print() dumps every bootstrap value into the saved output.
t_31 = auc_bootstrap
print(t_31)

[0.7935826396417529, 0.790097539481776, 0.7938441608392348, 0.7937506471989231, 0.7977772811516655, 0.7910112786960078, 0.7909711259464953, 0.7909515778973907, 0.7909095760080982, 0.7945175118820439, 0.7911573607386415, 0.793026576893572, 0.793222057384619, 0.7952524128632503, 0.7909222558237337, 0.7943920345398178, 0.7925162143142439, 0.7924306255587044, 0.7916360237788809, 0.7932405487824208, 0.789319579959974, 0.7906551872069114, 0.7924715707966941, 0.7953213593607682, 0.7914706578499682, 0.7908136849023548, 0.7937770634814969, 0.7912130990948725, 0.7921165359589004, 0.7932931171847428, 0.7930741262022051, 0.7904274788511243, 0.7921796708742522, 0.7961363016781736, 0.7968244458392242, 0.7901918456105648, 0.7946918593470318, 0.7945288708835507, 0.791875883624652, 0.7922269560200594, 0.7913359348088411, 0.7933945557098266, 0.7949243226336822, 0.793654491930354, 0.7917205558831174, 0.7925019495216539, 0.7925222900592358, 0.7921144226562945, 0.7941006629430275, 0.7948334506216279, 0.789

In [107]:
# Unbind this round's splits, Optuna study and optuna_auc before the next round starts.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [75]:
# Next feature to eliminate; the 'Cat_' prefix makes the following cell drop every column
# whose name starts with this string.
column_to_drop_31 = 'Cat_이사 계획 첫 번째 이유'

In [76]:
# Build the next feature set by removing column_to_drop_31 from comp_31.
if column_to_drop_31.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns (presumably one-hot
    # dummies — confirm). Match the prefix literally with str.startswith rather than a
    # regex so characters like '(' or '.' in a name are never read as pattern syntax.
    cols_to_drop_31 = [col for col in comp_31.columns if str(col).startswith(column_to_drop_31)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_31 = [column_to_drop_31]

comp_32 = comp_31.drop(columns=cols_to_drop_31)
X_32 = comp_32.drop(columns='target')
y_32 = comp_32['target']

print(X_32.shape)

(19949, 104)


In [77]:
# Stratified 80/20 split into (train+val) and test, then a stratified 80/20 split of the
# remainder into train and val; random_state=0 keeps both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [79]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled GradientBoostingClassifier.

    Hyperparameters are drawn from ``trial``; the model is fit on the global
    ``X_train``/``y_train`` and scored on the global ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.

    Returns:
        ROC AUC computed from the class-1 probabilities on the validation split.
    """
    sampled = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# probably has nothing to act on — confirm whether it is needed.
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [80]:
# Report the best trial's hyperparameters and keep its objective value (validation ROC AUC) as optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.060000000000000005, 'n_estimators': 188, 'subsample': 0.5, 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 7}
0.7955198676467128


In [81]:
# Train the round's final model with the best hyperparameters on the training split only.
optuna_32 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [82]:
# Score the final model on the held-out test split using the class-1 probability column.
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(auc_32)

0.7953287559198889


In [83]:
# bootstrap_auc() indexes X_train[idx] / y_train[idx] by position, so both are converted to
# NumPy arrays. np.asarray keeps the cell safe to re-run: on an ndarray it is a no-op,
# whereas `.values` raises AttributeError the second time.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [84]:
# Start a fresh list for this round; bootstrap_auc() appends every resample's AUC to this global.
auc_bootstrap = []

In [85]:
# bootstrap_auc() reads the global `rs` as its resampling RNG, so it is seeded here first.
rs = RandomState(seed = 32)
# Refits optuna_32 in place on 2000 resamples (with replacement) of the training rows, scores each
# refit on the test split, and displays the 2.5th/97.5th percentiles of those ROC AUCs.
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78590102, 0.79412294])

In [86]:
# Keep this round's bootstrap AUCs under a round-specific name. This is an alias, not a copy;
# it stays intact as long as auc_bootstrap is rebound (not appended to) before the next run.
# NOTE(review): print() dumps every bootstrap value into the saved output.
t_32 = auc_bootstrap
print(t_32)

[0.7867719936685453, 0.78791185626161, 0.7885891697468053, 0.7903955151492097, 0.7890173776873284, 0.7886813625729882, 0.7879810669219536, 0.7914278634721984, 0.7926815802431566, 0.7858997280179545, 0.7884747872432603, 0.790896632029637, 0.7920063800605673, 0.7875407074914464, 0.7881881705773333, 0.7901025585754651, 0.7879306118222374, 0.7892070465962092, 0.7896772564260249, 0.7919833978947279, 0.786708594590368, 0.7917873890780294, 0.7902589429683027, 0.7921685760355711, 0.7916983662057554, 0.792629540166486, 0.7928897405498392, 0.7916756482027418, 0.790855158465996, 0.7905537486818275, 0.7930035947277327, 0.7867751636224543, 0.7924853072636324, 0.7915470009066068, 0.7896191406043622, 0.7932096417318091, 0.7890823617424603, 0.7893050510045584, 0.7903495508175311, 0.791132529433022, 0.7903537774227429, 0.7903630231216439, 0.7872720538976699, 0.7874028144964106, 0.789472530236077, 0.7881308472441477, 0.789931381064386, 0.7895372501283832, 0.7883551214832003, 0.7914519022893407, 0.789152

In [87]:
# Unbind this round's splits, Optuna study and optuna_auc before the next round starts.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [77]:
# Next feature to eliminate; without the 'Cat_' prefix the following cell drops this single
# column by name.
column_to_drop_32 = '자산 중 기타자산의 비중'

In [78]:
# Build the next feature set by removing column_to_drop_32 from comp_32.
if column_to_drop_32.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns (presumably one-hot
    # dummies — confirm). Match the prefix literally with str.startswith rather than a
    # regex so characters like '(' or '.' in a name are never read as pattern syntax.
    cols_to_drop_32 = [col for col in comp_32.columns if str(col).startswith(column_to_drop_32)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_32 = [column_to_drop_32]

comp_33 = comp_32.drop(columns=cols_to_drop_32)
X_33 = comp_33.drop(columns='target')
y_33 = comp_33['target']

print(X_33.shape)

(19949, 103)


In [90]:
# Stratified 80/20 split into (train+val) and test, then a stratified 80/20 split of the
# remainder into train and val; random_state=0 keeps both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [92]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled GradientBoostingClassifier.

    Hyperparameters are drawn from ``trial``; the model is fit on the global
    ``X_train``/``y_train`` and scored on the global ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.

    Returns:
        ROC AUC computed from the class-1 probabilities on the validation split.
    """
    sampled = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# probably has nothing to act on — confirm whether it is needed.
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [93]:
# Report the best trial's hyperparameters and keep its objective value (validation ROC AUC) as optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7964977398315121


In [94]:
# Train the round's final model with the best hyperparameters on the training split only.
optuna_33 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [95]:
# Score the final model on the held-out test split using the class-1 probability column.
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(auc_33)

0.8006310321581257


In [96]:
# bootstrap_auc() indexes X_train[idx] / y_train[idx] by position, so both are converted to
# NumPy arrays. np.asarray keeps the cell safe to re-run: on an ndarray it is a no-op,
# whereas `.values` raises AttributeError the second time.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [97]:
# Start a fresh list for this round; bootstrap_auc() appends every resample's AUC to this global.
auc_bootstrap = []

In [98]:
# bootstrap_auc() reads the global `rs` as its resampling RNG, so it is seeded here first.
rs = RandomState(seed = 33)
# Refits optuna_33 in place on 2000 resamples (with replacement) of the training rows, scores each
# refit on the test split, and displays the 2.5th/97.5th percentiles of those ROC AUCs.
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7880879 , 0.79645221])

In [99]:
# Keep this round's bootstrap AUCs under a round-specific name. This is an alias, not a copy;
# it stays intact as long as auc_bootstrap is rebound (not appended to) before the next run.
# NOTE(review): print() dumps every bootstrap value into the saved output.
t_33 = auc_bootstrap
print(t_33)

[0.7927748297206425, 0.7941138710843144, 0.7924866280777609, 0.7949510030790818, 0.7899955726310405, 0.7922171819955071, 0.7954037781623988, 0.7945780051691381, 0.7925574237150592, 0.7926694287531726, 0.7936967579824723, 0.7892479918341988, 0.7905257474222992, 0.7912992161760635, 0.792182840828161, 0.7912432136570068, 0.7895993283924319, 0.7930992216706502, 0.7937976681819047, 0.7925410456198633, 0.7985592031158533, 0.7938158954168806, 0.7935638840811254, 0.7939202397330476, 0.792410813346774, 0.7910511672826943, 0.7954114388843453, 0.7908224022756043, 0.7902906425073912, 0.7937382315461132, 0.7930730695509021, 0.7955630683463196, 0.7911132455467431, 0.7894424156739427, 0.7921434805671259, 0.7867239160342608, 0.794389657074386, 0.7946081197312724, 0.7909196141954763, 0.791840485806003, 0.7913425388794847, 0.7925777642526411, 0.7928308322396992, 0.7909489362691333, 0.7930952592282641, 0.7912220806309476, 0.7915031498775341, 0.7924752690762543, 0.7945365316054971, 0.7937297783356897, 0.7

In [100]:
# Unbind this round's splits, Optuna study and optuna_auc before the next round starts.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [79]:
# Next feature to eliminate; the 'Cat_' prefix makes the following cell drop every column
# whose name starts with this string.
column_to_drop_33 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [80]:
# Build the next feature set by removing column_to_drop_33 from comp_33.
if column_to_drop_33.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns (presumably one-hot
    # dummies — confirm). Match the prefix literally with str.startswith rather than a
    # regex so characters like '(' or '.' in a name are never read as pattern syntax.
    cols_to_drop_33 = [col for col in comp_33.columns if str(col).startswith(column_to_drop_33)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_33 = [column_to_drop_33]

comp_34 = comp_33.drop(columns=cols_to_drop_33)
X_34 = comp_34.drop(columns='target')
y_34 = comp_34['target']

print(X_34.shape)

(19949, 99)


In [103]:
# Stratified 80/20 split into (train+val) and test, then a stratified 80/20 split of the
# remainder into train and val; random_state=0 keeps both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [104]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled GradientBoostingClassifier.

    Hyperparameters are drawn from ``trial``; the model is fit on the global
    ``X_train``/``y_train`` and scored on the global ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.

    Returns:
        ROC AUC computed from the class-1 probabilities on the validation split.
    """
    sampled = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 10),
    )

    model = GradientBoostingClassifier(random_state=0, **sampled)
    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Seeded TPE search that maximises the validation AUC returned by objective().
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report(), so the Hyperband pruner
# probably has nothing to act on — confirm whether it is needed.
study = optuna.create_study(
    direction=direction,
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)
# At most 200 trials; EarlyStoppingCallback is defined earlier in the notebook.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, n_trials=200, callbacks=[early_stopping])

In [105]:
# Report the best trial's hyperparameters and keep its objective value (validation ROC AUC) as optuna_auc.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7965381921211323


In [106]:
# Train the round's final model with the best hyperparameters on the training split only.
optuna_34 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [107]:
# Score the final model on the held-out test split using the class-1 probability column.
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(auc_34)

0.8000136836343732


In [108]:
# bootstrap_auc() indexes X_train[idx] / y_train[idx] by position, so both are converted to
# NumPy arrays. np.asarray keeps the cell safe to re-run: on an ndarray it is a no-op,
# whereas `.values` raises AttributeError the second time.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [109]:
# Start a fresh list for this round; bootstrap_auc() appends every resample's AUC to this global.
auc_bootstrap = []

In [110]:
# bootstrap_auc() reads the global `rs` as its resampling RNG, so it is seeded here first.
rs = RandomState(seed = 34)
# Refits optuna_34 in place on 2000 resamples (with replacement) of the training rows, scores each
# refit on the test split, and displays the 2.5th/97.5th percentiles of those ROC AUCs.
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78784296, 0.796099  ])

In [111]:
# Keep this round's bootstrap AUCs under a round-specific name. This is an alias, not a copy;
# it stays intact as long as auc_bootstrap is rebound (not appended to) before the next run.
# NOTE(review): print() dumps every bootstrap value into the saved output.
t_34 = auc_bootstrap
print(t_34)

[0.7985856193984273, 0.7922137478787724, 0.7910081087420989, 0.7944393196856252, 0.7897055218483789, 0.7916899129953318, 0.789540420082292, 0.7947338612363243, 0.7907325869148529, 0.791621758986291, 0.7919604157288886, 0.7960179567322425, 0.7909946364379863, 0.7922002755746599, 0.7924129266493798, 0.7958037206805679, 0.7920113991542562, 0.7924932321484045, 0.7935958477830398, 0.792071364115699, 0.7926958450357464, 0.788670531897133, 0.7896997102662127, 0.7938674071678998, 0.794211611329838, 0.7913013294786694, 0.7925333848979168, 0.7929541962793194, 0.7894582654434872, 0.7926023313954349, 0.7919461509362987, 0.7969375075286406, 0.789671180681033, 0.7892014991768687, 0.7942942942942942, 0.7896024983463407, 0.7964263524608353, 0.7907693055476307, 0.7913552186951203, 0.7921416314273457, 0.7911626439951563, 0.7933758001491991, 0.7910622621213753, 0.7920906480019781, 0.7914083154230938, 0.7927901511645353, 0.7908128924138776, 0.7915789646085213, 0.7920079650375217, 0.7899781378845419, 0.793

In [112]:
# Unbind this round's splits, Optuna study and optuna_auc before the next round starts.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [81]:
# Next feature to eliminate; without the 'Cat_' prefix the following cell drops this single
# column by name.
column_to_drop_34 = '자산 중 금융자산의 비중'

In [82]:
# Build the next feature set by removing column_to_drop_34 from comp_34.
if column_to_drop_34.startswith('Cat_'):
    # 'Cat_' names act as a prefix shared by several columns (presumably one-hot
    # dummies — confirm). Match the prefix literally with str.startswith rather than a
    # regex so characters like '(' or '.' in a name are never read as pattern syntax.
    cols_to_drop_34 = [col for col in comp_34.columns if str(col).startswith(column_to_drop_34)]
else:
    # Any other name refers to exactly one column.
    cols_to_drop_34 = [column_to_drop_34]

comp_35 = comp_34.drop(columns=cols_to_drop_34)
X_35 = comp_35.drop(columns='target')
y_35 = comp_35['target']

print(X_35.shape)

(19949, 98)


In [83]:
# Stratified 80/20 split into (train+val) and test, then a stratified 80/20 split of the
# remainder into train and val; random_state=0 keeps both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [85]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled gradient-boosting model.

    Samples one hyperparameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on ``X_train``/``y_train`` and scores the
    second-column class probabilities on ``X_val``/``y_val``. The four data
    objects are read from the notebook's global scope at call time.

    Returns:
        The ROC AUC on the validation split.
    """
    # The suggest_* calls are kept in a fixed order so a seeded sampler
    # reproduces the same sequence of draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# Seeded TPE sampler (seed=10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so no
# trial is ever pruned by the Hyperband pruner here - confirm it is still wanted.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10 trials
# without improvement - verify against the EarlyStoppingCallback implementation.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [86]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7949254666563748


In [87]:
optuna_35 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [88]:
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(auc_35)

0.7996457048181187


In [89]:
# bootstrap_auc() selects rows positionally with an integer array
# (X_train[idx]), which needs NumPy arrays rather than pandas objects.
# np.asarray returns an existing ndarray unchanged, so re-running this cell is
# safe (`.values` would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [90]:
auc_bootstrap = []

In [91]:
# bootstrap_auc() draws its resampling indices from the module-level `rs` and
# appends every AUC to the module-level `auc_bootstrap`, so both must be set
# (and the list emptied) before this call.
rs = RandomState(seed = 35)
# NOTE: bootstrap_auc() refits the estimator it is given on each resample, so
# afterwards optuna_35 holds the last bootstrap fit, not the model scored above.
# The displayed value is the 2.5th/97.5th percentile pair of the 2000 test AUCs.
bootstrap_auc(optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78772205, 0.79603923])

In [92]:
t_35 = auc_bootstrap
print(t_35)

[0.791976265498433, 0.7917374623039648, 0.7890260950605779, 0.7898423581921119, 0.7915248112292448, 0.7901088984832827, 0.7937377032204619, 0.7927661123473932, 0.793016538706194, 0.7901102192974114, 0.7939318628973802, 0.7908776123061838, 0.7926126337456386, 0.7880582024670695, 0.7904890287895214, 0.7886155860293791, 0.792387302855283, 0.7893671292686071, 0.789403847901385, 0.7909539553628224, 0.7897010310803414, 0.7904116290815799, 0.7903112472077989, 0.7899118330152813, 0.7940488870291826, 0.7896936345212207, 0.7924871564034126, 0.7909967497405922, 0.7938629163998623, 0.7958842903424184, 0.7944794724351374, 0.791667194992318, 0.7925447438994238, 0.7960446371776421, 0.7936222640656139, 0.7942491224510929, 0.7908099866227944, 0.7866237983233058, 0.7879372158928808, 0.7928065292597313, 0.7907608523372069, 0.7945875150308647, 0.7911803429044809, 0.7941059461995423, 0.7951787114348691, 0.7901857698655728, 0.788466862358488, 0.7896880871018802, 0.7950183645996454, 0.7896817471940625, 0.789

In [93]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [101]:
column_to_drop_35 = '현재 주택 거주 기간(총 개월)'

In [116]:
# Eliminate this round's feature from comp_35 to build comp_36.
# A name starting with 'Cat_' stands for every column whose label begins with
# that name; any other name is a single column.
if column_to_drop_35.startswith('Cat_'):
    # Literal prefix test rather than a regex built from the name, so regex
    # metacharacters in a label (e.g. parentheses) cannot change the match.
    cols_to_drop_35 = [col for col in comp_35.columns
                       if str(col).startswith(column_to_drop_35)]
else:
    cols_to_drop_35 = [column_to_drop_35]

comp_36 = comp_35.drop(cols_to_drop_35, axis=1)
X_36 = comp_36.drop('target', axis=1)
y_36 = comp_36['target']

print(X_36.shape)

(19949, 97)


In [117]:
# Hold out 20% as the test set, then 20% of the remainder as the validation
# set (64% / 16% / 20% overall), stratifying on the target both times.
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [119]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled gradient-boosting model.

    Samples one hyperparameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on ``X_train``/``y_train`` and scores the
    second-column class probabilities on ``X_val``/``y_val``. The four data
    objects are read from the notebook's global scope at call time.

    Returns:
        The ROC AUC on the validation split.
    """
    # The suggest_* calls are kept in a fixed order so a seeded sampler
    # reproduces the same sequence of draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# Seeded TPE sampler (seed=10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so no
# trial is ever pruned by the Hyperband pruner here - confirm it is still wanted.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10 trials
# without improvement - verify against the EarlyStoppingCallback implementation.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [120]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 142, 'subsample': 0.30000000000000004, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 8}
0.7959268672137083


In [121]:
optuna_36 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [122]:
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(auc_36)

0.796958376391874


In [123]:
# bootstrap_auc() selects rows positionally with an integer array
# (X_train[idx]), which needs NumPy arrays rather than pandas objects.
# np.asarray returns an existing ndarray unchanged, so re-running this cell is
# safe (`.values` would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [124]:
auc_bootstrap = []

In [125]:
# bootstrap_auc() draws its resampling indices from the module-level `rs` and
# appends every AUC to the module-level `auc_bootstrap`, so both must be set
# (and the list emptied) before this call.
rs = RandomState(seed = 36)
# NOTE: bootstrap_auc() refits the estimator it is given on each resample, so
# afterwards optuna_36 holds the last bootstrap fit, not the model scored above.
# The displayed value is the 2.5th/97.5th percentile pair of the 2000 test AUCs.
bootstrap_auc(optuna_36, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78782685, 0.79551107])

In [126]:
t_36 = auc_bootstrap
print(t_36)

[0.791128566990636, 0.7908551584659959, 0.7896722373323359, 0.7928556635453188, 0.7922499381858987, 0.7938814077976639, 0.7920349096457471, 0.7954626864725387, 0.7970352477741641, 0.7908633475135938, 0.7890353407594786, 0.7893491661964569, 0.7920998937008789, 0.7906699803251527, 0.7932117550344151, 0.7922612971874057, 0.789710540942068, 0.7917134234868225, 0.7926871276624973, 0.7894825684234552, 0.7917654635634931, 0.7921511412890723, 0.793683814004011, 0.7938930309619965, 0.7949544371958165, 0.7912712149165352, 0.7913824274661714, 0.7953345675020551, 0.7909661068528063, 0.7940800582426197, 0.7941693452777197, 0.787990840946506, 0.7911536624590811, 0.790082218037883, 0.7908892354705163, 0.7896223105582711, 0.7911024148708878, 0.7891684788236513, 0.7897963938604332, 0.7901192008334865, 0.7891542140310612, 0.7918420707829574, 0.7941072670136708, 0.792456249352801, 0.7925323282466139, 0.7867424074320626, 0.7930973725308701, 0.7893586760581834, 0.7908963678668113, 0.793349648029451, 0.7852

In [127]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [103]:
column_to_drop_36 = '총 가구원 수'

In [104]:
# Eliminate this round's feature from comp_36 to build comp_37.
# A name starting with 'Cat_' stands for every column whose label begins with
# that name; any other name is a single column.
if column_to_drop_36.startswith('Cat_'):
    # Literal prefix test rather than a regex built from the name, so regex
    # metacharacters in a label (e.g. parentheses) cannot change the match.
    cols_to_drop_36 = [col for col in comp_36.columns
                       if str(col).startswith(column_to_drop_36)]
else:
    cols_to_drop_36 = [column_to_drop_36]

comp_37 = comp_36.drop(cols_to_drop_36, axis=1)
X_37 = comp_37.drop('target', axis=1)
y_37 = comp_37['target']

print(X_37.shape)

(19949, 96)


In [105]:
# Hold out 20% as the test set, then 20% of the remainder as the validation
# set (64% / 16% / 20% overall), stratifying on the target both times.
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [107]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled gradient-boosting model.

    Samples one hyperparameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on ``X_train``/``y_train`` and scores the
    second-column class probabilities on ``X_val``/``y_val``. The four data
    objects are read from the notebook's global scope at call time.

    Returns:
        The ROC AUC on the validation split.
    """
    # The suggest_* calls are kept in a fixed order so a seeded sampler
    # reproduces the same sequence of draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# Seeded TPE sampler (seed=10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so no
# trial is ever pruned by the Hyperband pruner here - confirm it is still wanted.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10 trials
# without improvement - verify against the EarlyStoppingCallback implementation.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [108]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 142, 'subsample': 0.30000000000000004, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 8}
0.7962934144910834


In [109]:
optuna_37 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [110]:
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(auc_37)

0.7967916896488325


In [111]:
# bootstrap_auc() selects rows positionally with an integer array
# (X_train[idx]), which needs NumPy arrays rather than pandas objects.
# np.asarray returns an existing ndarray unchanged, so re-running this cell is
# safe (`.values` would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [112]:
auc_bootstrap = []

In [113]:
# bootstrap_auc() draws its resampling indices from the module-level `rs` and
# appends every AUC to the module-level `auc_bootstrap`, so both must be set
# (and the list emptied) before this call.
rs = RandomState(seed = 37)
# NOTE: bootstrap_auc() refits the estimator it is given on each resample, so
# afterwards optuna_37 holds the last bootstrap fit, not the model scored above.
# The displayed value is the 2.5th/97.5th percentile pair of the 2000 test AUCs.
bootstrap_auc(optuna_37, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78756863, 0.79559293])

In [114]:
t_37 = auc_bootstrap
print(t_37)

[0.7922536364654591, 0.7894146785772402, 0.7934690496266851, 0.7965351875450397, 0.7918278059903676, 0.7922996007971377, 0.7948598669042019, 0.7938911818222163, 0.7906802826753565, 0.7904462344117515, 0.7899807795127993, 0.7880391827436164, 0.7927687539756505, 0.7946181579186506, 0.7907804003863117, 0.7916624400614549, 0.7910223735346887, 0.790412421570057, 0.7915036782031857, 0.7902710944582867, 0.789070474415302, 0.7932933813475684, 0.7924308897215302, 0.7910226376975146, 0.7931779421927205, 0.7925471213648554, 0.7907106614003165, 0.7936420762775442, 0.7900188189597056, 0.7933314207944749, 0.7906274501102087, 0.7936544919303541, 0.7941902141409531, 0.7896698598669041, 0.7881897555542876, 0.7945050962292343, 0.7925476496905068, 0.7906292992499889, 0.7934878051873127, 0.7924034167876532, 0.7906403940886699, 0.7896627274706092, 0.7911259253623786, 0.7912339679581059, 0.7886229825884998, 0.7902864159021794, 0.7917041777879216, 0.7913018578043208, 0.7924562493528011, 0.7949879858746854, 0

In [115]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [87]:
column_to_drop_37 = '소득 대비 주거관리비의 비율'

In [88]:
# Eliminate this round's feature from comp_37 to build comp_38.
# A name starting with 'Cat_' stands for every column whose label begins with
# that name; any other name is a single column.
if column_to_drop_37.startswith('Cat_'):
    # Literal prefix test rather than a regex built from the name, so regex
    # metacharacters in a label (e.g. parentheses) cannot change the match.
    cols_to_drop_37 = [col for col in comp_37.columns
                       if str(col).startswith(column_to_drop_37)]
else:
    cols_to_drop_37 = [column_to_drop_37]

comp_38 = comp_37.drop(cols_to_drop_37, axis=1)
X_38 = comp_38.drop('target', axis=1)
y_38 = comp_38['target']

print(X_38.shape)

(19949, 95)


In [89]:
# Hold out 20% as the test set, then 20% of the remainder as the validation
# set (64% / 16% / 20% overall), stratifying on the target both times.
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [92]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled gradient-boosting model.

    Samples one hyperparameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on ``X_train``/``y_train`` and scores the
    second-column class probabilities on ``X_val``/``y_val``. The four data
    objects are read from the notebook's global scope at call time.

    Returns:
        The ROC AUC on the validation split.
    """
    # The suggest_* calls are kept in a fixed order so a seeded sampler
    # reproduces the same sequence of draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# Seeded TPE sampler (seed=10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so no
# trial is ever pruned by the Hyperband pruner here - confirm it is still wanted.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10 trials
# without improvement - verify against the EarlyStoppingCallback implementation.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [93]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7959454422446562


In [94]:
optuna_38 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [95]:
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(auc_38)

0.7976650119507263


In [96]:
# bootstrap_auc() selects rows positionally with an integer array
# (X_train[idx]), which needs NumPy arrays rather than pandas objects.
# np.asarray returns an existing ndarray unchanged, so re-running this cell is
# safe (`.values` would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [97]:
auc_bootstrap = []

In [98]:
# bootstrap_auc() draws its resampling indices from the module-level `rs` and
# appends every AUC to the module-level `auc_bootstrap`, so both must be set
# (and the list emptied) before this call.
rs = RandomState(seed = 38)
# NOTE: bootstrap_auc() refits the estimator it is given on each resample, so
# afterwards optuna_38 holds the last bootstrap fit, not the model scored above.
# The displayed value is the 2.5th/97.5th percentile pair of the 2000 test AUCs.
bootstrap_auc(optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78679432, 0.79590779])

In [99]:
t_38 = auc_bootstrap
print(t_38)

[0.793131977861042, 0.7903086055795415, 0.7961402641205597, 0.7916965170659752, 0.7956079760266952, 0.7925428947596436, 0.7883353092712699, 0.7905022369308083, 0.7909108968222269, 0.7919292445154513, 0.788651248010854, 0.7937136644033196, 0.7899078705728951, 0.7889833006828082, 0.7912249864220308, 0.7910075804164474, 0.7896283863032632, 0.7938042722525481, 0.7915293019972822, 0.7890731160435593, 0.7907637581282901, 0.7913137451314792, 0.7926408991679927, 0.7931256379532241, 0.7916587417818945, 0.7910633187726783, 0.7903968359633384, 0.7934835785821007, 0.7918555430870702, 0.7911861544866471, 0.7901358430915082, 0.7907344360546331, 0.7914299767748043, 0.79441422421718, 0.7928374363103428, 0.7902729435980667, 0.7898146210954092, 0.7939934128357773, 0.7871058954802796, 0.7934201795039234, 0.790084331340489, 0.7900090449351533, 0.7880037849249673, 0.7918027105219223, 0.7941841383959609, 0.7897752608343742, 0.7899794586986705, 0.7908247797410358, 0.7905997130135062, 0.7937812900867087, 0.79

In [100]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [101]:
column_to_drop_38 = '소득 대비 생활비의 비율'

In [102]:
# Eliminate this round's feature from comp_38 to build comp_39.
# A name starting with 'Cat_' stands for every column whose label begins with
# that name; any other name is a single column.
if column_to_drop_38.startswith('Cat_'):
    # Literal prefix test rather than a regex built from the name, so regex
    # metacharacters in a label (e.g. parentheses) cannot change the match.
    cols_to_drop_38 = [col for col in comp_38.columns
                       if str(col).startswith(column_to_drop_38)]
else:
    cols_to_drop_38 = [column_to_drop_38]

comp_39 = comp_38.drop(cols_to_drop_38, axis=1)
X_39 = comp_39.drop('target', axis=1)
y_39 = comp_39['target']

print(X_39.shape)

(19949, 94)


In [103]:
# Hold out 20% as the test set, then 20% of the remainder as the validation
# set (64% / 16% / 20% overall), stratifying on the target both times.
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [104]:
X_train.shape, X_test.shape, X_val.shape

((12767, 94), (3990, 94), (3192, 94))

In [105]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled gradient-boosting model.

    Samples one hyperparameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on ``X_train``/``y_train`` and scores the
    second-column class probabilities on ``X_val``/``y_val``. The four data
    objects are read from the notebook's global scope at call time.

    Returns:
        The ROC AUC on the validation split.
    """
    # The suggest_* calls are kept in a fixed order so a seeded sampler
    # reproduces the same sequence of draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# Seeded TPE sampler (seed=10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so no
# trial is ever pruned by the Hyperband pruner here - confirm it is still wanted.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10 trials
# without improvement - verify against the EarlyStoppingCallback implementation.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [132]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7958038592309855


In [106]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7958038592309855


In [107]:
optuna_39 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [108]:
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(auc_39)

0.7986366028237949


In [109]:
# bootstrap_auc() selects rows positionally with an integer array
# (X_train[idx]), which needs NumPy arrays rather than pandas objects.
# np.asarray returns an existing ndarray unchanged, so re-running this cell is
# safe (`.values` would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [110]:
auc_bootstrap = []

In [111]:
# bootstrap_auc() draws its resampling indices from the module-level `rs` and
# appends every AUC to the module-level `auc_bootstrap`, so both must be set
# (and the list emptied) before this call.
rs = RandomState(seed = 39)
# NOTE: bootstrap_auc() refits the estimator it is given on each resample, so
# afterwards optuna_39 holds the last bootstrap fit, not the model scored above.
# The displayed value is the 2.5th/97.5th percentile pair of the 2000 test AUCs.
bootstrap_auc(optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78786876, 0.79637913])

In [112]:
t_39 = auc_bootstrap
print(t_39)

[0.7927938494440958, 0.7885677725579203, 0.7936957013311693, 0.7919976626873179, 0.7889187128719148, 0.7890752293461653, 0.7941725152316286, 0.7951958820185422, 0.7904956328601649, 0.7939009558467687, 0.7938193295336152, 0.7898973040598658, 0.7903453242123193, 0.7923077898447357, 0.7948736033711403, 0.7948139025725233, 0.7929238175543594, 0.7924205873713264, 0.7915858328419905, 0.7942256119596022, 0.7931134864632401, 0.7922808452365103, 0.7925996897671775, 0.7896241596980513, 0.7952529411889018, 0.791176116299269, 0.7935786771993669, 0.7969100345947637, 0.7897300889911727, 0.7908242514153844, 0.7904258938741697, 0.7907700980361079, 0.7949396440775751, 0.7897768458113286, 0.7907774945952286, 0.7908570076057762, 0.7932862489512736, 0.7905283890505566, 0.7915192638099042, 0.7934320668310816, 0.792122083378241, 0.7928139258188518, 0.7911240762225983, 0.790512803443838, 0.7915192638099041, 0.793541958566589, 0.7938391417455457, 0.7939881295792626, 0.7938320093492507, 0.7938415192109772, 0.7

In [113]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [125]:
column_to_drop_39 = '총 이사 횟수'

In [126]:
# Eliminate this round's feature from comp_39 to build comp_40.
# A name starting with 'Cat_' stands for every column whose label begins with
# that name; any other name is a single column.
if column_to_drop_39.startswith('Cat_'):
    # Literal prefix test rather than a regex built from the name, so regex
    # metacharacters in a label (e.g. parentheses) cannot change the match.
    cols_to_drop_39 = [col for col in comp_39.columns
                       if str(col).startswith(column_to_drop_39)]
else:
    cols_to_drop_39 = [column_to_drop_39]

comp_40 = comp_39.drop(cols_to_drop_39, axis=1)
X_40 = comp_40.drop('target', axis=1)
y_40 = comp_40['target']

print(X_40.shape)

(19949, 93)


In [140]:
# Hold out 20% as the test set, then 20% of the remainder as the validation
# set (64% / 16% / 20% overall), stratifying on the target both times.
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [143]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled gradient-boosting model.

    Samples one hyperparameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on ``X_train``/``y_train`` and scores the
    second-column class probabilities on ``X_val``/``y_val``. The four data
    objects are read from the notebook's global scope at call time.

    Returns:
        The ROC AUC on the validation split.
    """
    # The suggest_* calls are kept in a fixed order so a seeded sampler
    # reproduces the same sequence of draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# Seeded TPE sampler (seed=10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so no
# trial is ever pruned by the Hyperband pruner here - confirm it is still wanted.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10 trials
# without improvement - verify against the EarlyStoppingCallback implementation.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [144]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.796303733952721


In [145]:
optuna_40 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [146]:
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(auc_40)

0.7977222032024988


In [147]:
# bootstrap_auc() selects rows positionally with an integer array
# (X_train[idx]), which needs NumPy arrays rather than pandas objects.
# np.asarray returns an existing ndarray unchanged, so re-running this cell is
# safe (`.values` would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [148]:
auc_bootstrap = []

In [149]:
# bootstrap_auc() draws its resampling indices from the module-level `rs` and
# appends every AUC to the module-level `auc_bootstrap`, so both must be set
# (and the list emptied) before this call.
rs = RandomState(seed = 40)
# NOTE: bootstrap_auc() refits the estimator it is given on each resample, so
# afterwards optuna_40 holds the last bootstrap fit, not the model scored above.
# The displayed value is the 2.5th/97.5th percentile pair of the 2000 test AUCs.
bootstrap_auc(optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78758571, 0.79614063])

In [150]:
t_40 = auc_bootstrap
print(t_40)

[0.7922383150215662, 0.7891325526793507, 0.7961444907257715, 0.7978618132559019, 0.7927222613183205, 0.7935509401026641, 0.7930772961561139, 0.791226835561811, 0.7944691700849337, 0.7918676945770542, 0.789990157293113, 0.7971306105542559, 0.7953018113116634, 0.7944448671049658, 0.7910144486499167, 0.7878178142956468, 0.795722622693066, 0.791388767373989, 0.7903949868235582, 0.7906316767154206, 0.7924816089840722, 0.7895824219715846, 0.7850464820908171, 0.7937044187044187, 0.7910582996789893, 0.7926179170021534, 0.7916307405223661, 0.7901730900499373, 0.7880352203012302, 0.785727229692747, 0.7908905562846449, 0.7900971432375374, 0.7924629855048575, 0.7947357103761046, 0.7904126857328828, 0.7915238866593547, 0.7911833807769769, 0.7931034482758621, 0.7935757714082836, 0.7878193992726011, 0.7928844572933242, 0.7958733275851504, 0.7887949525880561, 0.791047469003134, 0.7932085850805063, 0.790099388621556, 0.7888866170885875, 0.7938045364153737, 0.7893734691764249, 0.7948894531406847, 0.7970

In [151]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [123]:
column_to_drop_40 = '중기부채부담지표'

In [127]:
# Eliminate this round's feature from comp_40 to build comp_41.
# A name starting with 'Cat_' stands for every column whose label begins with
# that name; any other name is a single column.
if column_to_drop_40.startswith('Cat_'):
    # Literal prefix test rather than a regex built from the name, so regex
    # metacharacters in a label (e.g. parentheses) cannot change the match.
    cols_to_drop_40 = [col for col in comp_40.columns
                       if str(col).startswith(column_to_drop_40)]
else:
    cols_to_drop_40 = [column_to_drop_40]

comp_41 = comp_40.drop(cols_to_drop_40, axis=1)
X_41 = comp_41.drop('target', axis=1)
y_41 = comp_41['target']

print(X_41.shape)

(19949, 92)


In [128]:
# Hold out 20% as the test set, then 20% of the remainder as the validation
# set (64% / 16% / 20% overall), stratifying on the target both times.
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [130]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled gradient-boosting model.

    Samples one hyperparameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on ``X_train``/``y_train`` and scores the
    second-column class probabilities on ``X_val``/``y_val``. The four data
    objects are read from the notebook's global scope at call time.

    Returns:
        The ROC AUC on the validation split.
    """
    # The suggest_* calls are kept in a fixed order so a seeded sampler
    # reproduces the same sequence of draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# Seeded TPE sampler (seed=10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so no
# trial is ever pruned by the Hyperband pruner here - confirm it is still wanted.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10 trials
# without improvement - verify against the EarlyStoppingCallback implementation.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [131]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 85, 'subsample': 0.8, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 6}
0.796217463253429


In [132]:
optuna_41 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [133]:
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(auc_41)

0.7984129889918068


In [134]:
# bootstrap_auc() selects rows positionally with an integer array
# (X_train[idx]), which needs NumPy arrays rather than pandas objects.
# np.asarray returns an existing ndarray unchanged, so re-running this cell is
# safe (`.values` would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [135]:
auc_bootstrap = []

In [136]:
# bootstrap_auc() draws its resampling indices from the module-level `rs` and
# appends every AUC to the module-level `auc_bootstrap`, so both must be set
# (and the list emptied) before this call.
rs = RandomState(seed = 41)
# NOTE: bootstrap_auc() refits the estimator it is given on each resample, so
# afterwards optuna_41 holds the last bootstrap fit, not the model scored above.
# The displayed value is the 2.5th/97.5th percentile pair of the 2000 test AUCs.
bootstrap_auc(optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78621247, 0.79503974])

In [137]:
t_41 = auc_bootstrap
print(t_41)

[0.7912240618521407, 0.791820145268421, 0.7946297810829831, 0.7918090504297401, 0.7900852559103789, 0.7919935681635188, 0.7887862352148066, 0.790657564672343, 0.788631964124575, 0.7978591716276444, 0.7898970398970399, 0.7898608495899136, 0.7905994488506803, 0.7916045884026179, 0.7914382979038151, 0.7910409970139034, 0.7887167603916372, 0.7872971493661148, 0.7899221353654852, 0.7910947541489413, 0.788163075108888, 0.788990565160516, 0.7916109283104358, 0.7924176815802432, 0.7901631839439721, 0.7896962761494781, 0.785494238080445, 0.7937707235736792, 0.7921000257822919, 0.7911729463453601, 0.7860047327411859, 0.7903051714628069, 0.7898712840215303, 0.7922686937465263, 0.7920121916427335, 0.7948430925647675, 0.7915167542630597, 0.7915594165594166, 0.790509765571342, 0.7915921727498083, 0.7910860367756919, 0.79115775698288, 0.7854337447933507, 0.7915283774273922, 0.7885616968129284, 0.7929263271012039, 0.7924158324404631, 0.790653470148544, 0.7917229333485492, 0.790713303028574, 0.79039128

In [138]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [125]:
column_to_drop_41 = 'Cat_현재 주택의 유형'

In [126]:
# Eliminate this round's feature from comp_41 to build comp_42.
# A name starting with 'Cat_' stands for every column whose label begins with
# that name; any other name is a single column.
if column_to_drop_41.startswith('Cat_'):
    # Literal prefix test rather than a regex built from the name, so regex
    # metacharacters in a label (e.g. parentheses) cannot change the match.
    cols_to_drop_41 = [col for col in comp_41.columns
                       if str(col).startswith(column_to_drop_41)]
else:
    cols_to_drop_41 = [column_to_drop_41]

comp_42 = comp_41.drop(cols_to_drop_41, axis=1)
X_42 = comp_42.drop('target', axis=1)
y_42 = comp_42['target']

print(X_42.shape)

(19949, 81)


In [134]:
# Hold out 20% as the test set, then 20% of the remainder as the validation
# set (64% / 16% / 20% overall), stratifying on the target both times.
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [136]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled gradient-boosting model.

    Samples one hyperparameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on ``X_train``/``y_train`` and scores the
    second-column class probabilities on ``X_val``/``y_val``. The four data
    objects are read from the notebook's global scope at call time.

    Returns:
        The ROC AUC on the validation split.
    """
    # The suggest_* calls are kept in a fixed order so a seeded sampler
    # reproduces the same sequence of draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# Seeded TPE sampler (seed=10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so no
# trial is ever pruned by the Hyperband pruner here - confirm it is still wanted.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10 trials
# without improvement - verify against the EarlyStoppingCallback implementation.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [137]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.04, 'n_estimators': 104, 'subsample': 0.6, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 8}
0.7951884065389062


In [138]:
optuna_42 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [139]:
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(auc_42)

0.7971118549936282


In [140]:
# bootstrap_auc() selects rows positionally with an integer array
# (X_train[idx]), which needs NumPy arrays rather than pandas objects.
# np.asarray returns an existing ndarray unchanged, so re-running this cell is
# safe (`.values` would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [141]:
auc_bootstrap = []

In [142]:
# bootstrap_auc() draws its resampling indices from the module-level `rs` and
# appends every AUC to the module-level `auc_bootstrap`, so both must be set
# (and the list emptied) before this call.
rs = RandomState(seed = 42)
# NOTE: bootstrap_auc() refits the estimator it is given on each resample, so
# afterwards optuna_42 holds the last bootstrap fit, not the model scored above.
# The displayed value is the 2.5th/97.5th percentile pair of the 2000 test AUCs.
bootstrap_auc(optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78706943, 0.79533997])

In [143]:
t_42 = auc_bootstrap
print(t_42)

[0.7937279291959095, 0.7927354694596074, 0.7898376032612486, 0.7872865828530854, 0.7908078733201886, 0.7923582449444517, 0.7894969652974579, 0.7912949895708516, 0.7913624831728279, 0.7934637663701702, 0.791465902919105, 0.7869923054652119, 0.7911687197401484, 0.7905455596342296, 0.7944747175042742, 0.7942261402852535, 0.7875655387970659, 0.7922962987618158, 0.7942758028964925, 0.790678697698402, 0.7964371831366905, 0.7899369284837265, 0.7905349931212, 0.793133827000822, 0.7925563670637562, 0.7866964431003839, 0.7893132400521563, 0.7876653923451953, 0.7931034482758621, 0.7939894503933913, 0.7935895078752221, 0.7899033798048576, 0.7938790303322323, 0.7897607318789585, 0.7900462918935827, 0.7941247017601698, 0.7886608899539934, 0.7949301342158485, 0.7941825534190066, 0.7928440403809861, 0.7905320873301168, 0.789525230719812, 0.7903342293736383, 0.7883189311760741, 0.7922199557051774, 0.7885799240479043, 0.7931216755108381, 0.7928186807497153, 0.7931565450038356, 0.7924513623405249, 0.7893

In [144]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [118]:
column_to_drop_42 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [119]:
# Eliminate this round's feature from comp_42 to build comp_43.
# A name starting with 'Cat_' stands for every column whose label begins with
# that name; any other name is a single column.
if column_to_drop_42.startswith('Cat_'):
    # Literal prefix test rather than a regex built from the name, so regex
    # metacharacters in a label (e.g. parentheses) cannot change the match.
    cols_to_drop_42 = [col for col in comp_42.columns
                       if str(col).startswith(column_to_drop_42)]
else:
    cols_to_drop_42 = [column_to_drop_42]

comp_43 = comp_42.drop(cols_to_drop_42, axis=1)
X_43 = comp_43.drop('target', axis=1)
y_43 = comp_43['target']

print(X_43.shape)

(19949, 77)


In [120]:
# Hold out 20% as the test set, then 20% of the remainder as the validation
# set (64% / 16% / 20% overall), stratifying on the target both times.
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [123]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one sampled gradient-boosting model.

    Samples one hyperparameter set from ``trial``, fits a
    ``GradientBoostingClassifier`` on ``X_train``/``y_train`` and scores the
    second-column class probabilities on ``X_val``/``y_val``. The four data
    objects are read from the notebook's global scope at call time.

    Returns:
        The ROC AUC on the validation split.
    """
    # The suggest_* calls are kept in a fixed order so a seeded sampler
    # reproduces the same sequence of draws.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximize the validation AUC returned by objective().
direction = "maximize"
# Seeded TPE sampler (seed=10).
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(), so no
# trial is ever pruned by the Hyperband pruner here - confirm it is still wanted.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# 10 is passed as early_stopping_rounds; presumably the study stops after 10 trials
# without improvement - verify against the EarlyStoppingCallback implementation.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# At most 200 trials.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [124]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.03, 'n_estimators': 144, 'subsample': 0.8, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 3}
0.7949324838902885


In [125]:
optuna_43 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [126]:
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(auc_43)

0.7968527112615782


In [127]:
# bootstrap_auc() selects rows positionally with an integer array
# (X_train[idx]), which needs NumPy arrays rather than pandas objects.
# np.asarray returns an existing ndarray unchanged, so re-running this cell is
# safe (`.values` would raise AttributeError on the second run).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [128]:
auc_bootstrap = []

In [129]:
# bootstrap_auc() draws its resampling indices from the module-level `rs` and
# appends every AUC to the module-level `auc_bootstrap`, so both must be set
# (and the list emptied) before this call.
rs = RandomState(seed = 43)
# NOTE: bootstrap_auc() refits the estimator it is given on each resample, so
# afterwards optuna_43 holds the last bootstrap fit, not the model scored above.
# The displayed value is the 2.5th/97.5th percentile pair of the 2000 test AUCs.
bootstrap_auc(optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78662956, 0.79473235])

In [130]:
t_43 = auc_bootstrap
print(t_43)

[0.7920486461126856, 0.7894601145832673, 0.7902248659637823, 0.7905328798185941, 0.7935858095956617, 0.7912376662376663, 0.7886076611446069, 0.7916296838710632, 0.7880402393949192, 0.7931240529762698, 0.7910091653934018, 0.7910897350552523, 0.7899583256726115, 0.7952698476097491, 0.7937548738041349, 0.7906924341653405, 0.785377742274294, 0.7920082292003474, 0.7930677862943873, 0.7874794217158749, 0.7887447616511656, 0.7914672237332335, 0.7914936400158075, 0.788937732595368, 0.7915467367437812, 0.789734843922036, 0.7896577083769201, 0.7879142337270415, 0.7898830392672758, 0.7866274966028661, 0.7901296352651033, 0.7928651734070454, 0.7957576242674764, 0.792291147586714, 0.7926896372093416, 0.7890995323261334, 0.7922318430323356, 0.7891917251523163, 0.7934759178601544, 0.7912776869057657, 0.7890028487319127, 0.7940348863994184, 0.7875772940428113, 0.790590731477431, 0.7888903153681478, 0.7925835758348074, 0.7914216556457936, 0.7916850259830557, 0.7933760643120249, 0.7903733254718477, 0.79

In [131]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [121]:
column_to_drop_43 = 'Cat_현재 주택의 점유형태'

In [122]:
# Eliminate this round's feature from comp_43 to build comp_44.
# A name starting with 'Cat_' stands for every column whose label begins with
# that name; any other name is a single column.
if column_to_drop_43.startswith('Cat_'):
    # Literal prefix test rather than a regex built from the name, so regex
    # metacharacters in a label (e.g. parentheses) cannot change the match.
    cols_to_drop_43 = [col for col in comp_43.columns
                       if str(col).startswith(column_to_drop_43)]
else:
    cols_to_drop_43 = [column_to_drop_43]

comp_44 = comp_43.drop(cols_to_drop_43, axis=1)
X_44 = comp_44.drop('target', axis=1)
y_44 = comp_44['target']

print(X_44.shape)

(19949, 73)


In [123]:
# Hold out 20% as the test set, then 20% of the remainder as the validation
# set (64% / 16% / 20% overall), stratifying on the target both times.
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [126]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from `trial`, fits a GradientBoostingClassifier
    (random_state=0) on the notebook-level X_train / y_train, and returns the
    ROC-AUC of probability column 1 on X_val / y_val.
    """
    # The suggestion order is kept fixed so a seeded sampler draws the same values.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise the value returned by `objective`.
direction = "maximize"
# Fixed TPE seed so the sequence of sampled hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the Hyperband
# pruner appears to have nothing to act on here; confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# The first argument is `early_stopping_rounds`; presumably the callback halts the
# study after 10 trials without improvement. Verify against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Upper bound of 200 trials; the callback may end the search earlier.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [127]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7952226671515439


In [20]:
# NOTE(review): this cell repeats the preceding cell verbatim (same code, same output)
# with an out-of-sequence execution count; it looks like a leftover from an earlier run.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.02, 'n_estimators': 179, 'subsample': 0.4, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}
0.7952226671515439


In [128]:
optuna_44 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [22]:
# Test-set ROC-AUC of the tuned model, using probability column 1.
# NOTE(review): the following cell contains exactly the same code and output; this
# copy carries an out-of-sequence execution count and looks like a leftover.
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(auc_44)

0.795631486518186


In [129]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(auc_44)

0.795631486518186


In [130]:
X_train = X_train.values
y_train = y_train.values

In [131]:
auc_bootstrap = []

In [132]:
# Seed the module-level generator that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 44)
# bootstrap_auc refits the estimator it receives on every resample, so hand it an
# unfitted clone; otherwise the tuned model `optuna_44` would be left fitted on the
# last bootstrap sample instead of on the training set.
bootstrap_auc(sklearn.base.clone(optuna_44), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78564977, 0.79395831])

In [133]:
t_44 = auc_bootstrap
print(t_44)

[0.7890691536011731, 0.7891774603597264, 0.7890538321572805, 0.7878191351097755, 0.7915921727498083, 0.7911541907847326, 0.7927679614871733, 0.7898745860568521, 0.790459178390213, 0.788406104908568, 0.7856170737944137, 0.788331082666058, 0.7910514314455201, 0.7923117522871218, 0.7880006149710583, 0.7876397685510987, 0.7903904960555207, 0.7914962816440649, 0.7921691043612226, 0.7885207515749388, 0.788172056644963, 0.793392178244395, 0.786718632777746, 0.7906792260240536, 0.7895058147521202, 0.7884837687793353, 0.7884187847242035, 0.7862407622259839, 0.7863921275251323, 0.7900267438444778, 0.7899419475774156, 0.7926179170021534, 0.7899754962562844, 0.7873217165089087, 0.7886266808680603, 0.7876749022069218, 0.7908311196488536, 0.7912712149165351, 0.7949776835244816, 0.7895005314956053, 0.7885368655073088, 0.7884169355844233, 0.7869027542672863, 0.7903646080985983, 0.7892313495761771, 0.792336847755567, 0.788979338240422, 0.7900491976846656, 0.7939973752781635, 0.7910511672826945, 0.78612

In [134]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [101]:
column_to_drop_44 = 'Cat_주택 보유 의식'

In [102]:
# Derive the next feature set by removing `column_to_drop_44` from comp_44.
if column_to_drop_44.startswith('Cat_'):
    # One-hot group: remove every column whose name begins with the prefix.
    # A literal prefix test replaces the former regex so that characters such as
    # '(' or '.' in a name can never be interpreted as pattern syntax.
    _cols_to_drop = [c for c in comp_44.columns if str(c).startswith(column_to_drop_44)]
else:
    # Plain (non one-hot) feature: a single column with exactly this name.
    _cols_to_drop = [column_to_drop_44]

comp_45 = comp_44.drop(columns=_cols_to_drop)
X_45 = comp_45.drop(columns='target')
y_45 = comp_45['target']

print(X_45.shape)

(19949, 71)


In [103]:
comp_45 = comp[[
 '현재 주택의 면적(㎡)',
 '가구주 나이',
 '소득 대비 주택 임대료의 비율',
 '장기부채부담지표',
 'target',
 'Cat_현재 거주 지역_강원도',
 'Cat_현재 거주 지역_경기도',
 'Cat_현재 거주 지역_경상남도',
 'Cat_현재 거주 지역_경상북도',
 'Cat_현재 거주 지역_광주광역시',
 'Cat_현재 거주 지역_대구광역시',
 'Cat_현재 거주 지역_대전광역시',
 'Cat_현재 거주 지역_부산광역시',
 'Cat_현재 거주 지역_서울특별시',
 'Cat_현재 거주 지역_세종특별자치시',
 'Cat_현재 거주 지역_울산광역시',
 'Cat_현재 거주 지역_인천광역시',
 'Cat_현재 거주 지역_전라남도',
 'Cat_현재 거주 지역_전라북도',
 'Cat_현재 거주 지역_제주특별자치도',
 'Cat_현재 거주 지역_충청남도',
 'Cat_현재 거주 지역_충청북도',
    
 'Cat_이사 계획 중인 거주 지역_국내 to 국외',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 비수도권',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 수도권',
 'Cat_이사 계획 중인 거주 지역_비수도권 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 거주 지역_수도권 to 비수도권',
 'Cat_이사 계획 중인 거주 지역_수도권 to 수도권',
 'Cat_이사 계획 중인 거주 지역_수도권 to 이사 계획 없음 및 모름',
    
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_무상 to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 없는 월세(사글세, 연세, 일세 포함) to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_보증금 있는 월세 to 전세',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 무상이나 기타',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 보증금 없는 월세(사글세, 연세, 일세 포함)',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 보증금 있는 월세',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 이사 계획 없음 및 모름',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 자가',
 'Cat_이사 계획 중인 주택의 점유형태_전세 to 전세',
    
 'Cat_현재 가장 필요한 주거지원 1순위_공공분양주택공급',
 'Cat_현재 가장 필요한 주거지원 1순위_없음',
 'Cat_현재 가장 필요한 주거지원 1순위_월세보조금 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_임대 후 분양전환 공공임대주택 공급',
 'Cat_현재 가장 필요한 주거지원 1순위_장기공공임대주택 공급',
 'Cat_현재 가장 필요한 주거지원 1순위_전세자금 대출 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_주거상담과 정보제공 등',
 'Cat_현재 가장 필요한 주거지원 1순위_주택 구입자금 대출 지원',
 'Cat_현재 가장 필요한 주거지원 1순위_주택개량 및 개보수 지원',
 'Cat_가구주 성별_남',
 'Cat_가구주 성별_여',
      
 'Cat_가구주 최종 학력_고등학교 졸업',
 'Cat_가구주 최종 학력_대학 졸업 이상',
 'Cat_가구주 최종 학력_중학교 졸업 이하',
 'Cat_가구주 종사상 지위_무급가족종사자',
 'Cat_가구주 종사상 지위_무직 및 기타',
 'Cat_가구주 종사상 지위_사업자 및 자영자',
 'Cat_가구주 종사상 지위_상용근로자',
 'Cat_가구주 종사상 지위_임시일용근로자']]

In [104]:
X_45 = comp_45.drop('target', axis=1)
y_45 = comp_45['target']

In [19]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the remaining
# rows become the validation set (about 64% / 16% / 20% train / val / test overall).
# random_state=0 fixes both shuffles.
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [22]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from `trial`, fits a GradientBoostingClassifier
    (random_state=0) on the notebook-level X_train / y_train, and returns the
    ROC-AUC of probability column 1 on X_val / y_val.
    """
    # The suggestion order is kept fixed so a seeded sampler draws the same values.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise the value returned by `objective`.
direction = "maximize"
# Fixed TPE seed so the sequence of sampled hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the Hyperband
# pruner appears to have nothing to act on here; confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# The first argument is `early_stopping_rounds`; presumably the callback halts the
# study after 10 trials without improvement. Verify against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Upper bound of 200 trials; the callback may end the search earlier.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [23]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.79402581599079


In [24]:
optuna_45 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [25]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(auc_45)

0.7942709158842163


In [26]:
X_train = X_train.values
y_train = y_train.values

In [27]:
auc_bootstrap = []

In [28]:
# Seed the module-level generator that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 45)
# bootstrap_auc refits the estimator it receives on every resample, so hand it an
# unfitted clone; otherwise the tuned model `optuna_45` would be left fitted on the
# last bootstrap sample instead of on the training set.
bootstrap_auc(sklearn.base.clone(optuna_45), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78397958, 0.79269626])

In [29]:
np.mean(auc_bootstrap)

0.788463985559275

In [30]:
t_45 = auc_bootstrap
print(t_45)

[0.7888086890549945, 0.788507411352239, 0.7885109775503863, 0.7864296386463874, 0.7875774261242241, 0.7883378188181145, 0.7894313208352618, 0.7862552911813996, 0.789689407916009, 0.7907418326137537, 0.7867228593829579, 0.7904787264393176, 0.7907806645491373, 0.7866982922401642, 0.7908831597255243, 0.7914320900774103, 0.7840261531763995, 0.7887703854452621, 0.7912971028734576, 0.7876638073682408, 0.7918462973881694, 0.7891938384549222, 0.7893032018647781, 0.7849261559236929, 0.7855291075734425, 0.79130872603779, 0.7912038533959717, 0.7855774493705528, 0.7897159562799958, 0.7848974942571003, 0.7864758671408918, 0.79123093008561, 0.7884317287026646, 0.7880093323443076, 0.7856733404762961, 0.7902098086827152, 0.7877861147565581, 0.788871823970346, 0.7849387036579155, 0.7878769867686124, 0.786301519675904, 0.7864161663422748, 0.7855098236871636, 0.7912918196169427, 0.7905231057940418, 0.7883045343020713, 0.7880835620983404, 0.7868755454962352, 0.792302242425395, 0.787111707062446, 0.7887933

In [31]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [19]:
# 46.
column_to_drop_45 = 'Cat_가구주 성별'

In [20]:
# Derive the next feature set by removing `column_to_drop_45` from comp_45.
if column_to_drop_45.startswith('Cat_'):
    # One-hot group: remove every column whose name begins with the prefix.
    # A literal prefix test replaces the former regex so that characters such as
    # '(' or '.' in a name can never be interpreted as pattern syntax.
    _cols_to_drop = [c for c in comp_45.columns if str(c).startswith(column_to_drop_45)]
else:
    # Plain (non one-hot) feature: a single column with exactly this name.
    _cols_to_drop = [column_to_drop_45]

comp_46 = comp_45.drop(columns=_cols_to_drop)
X_46 = comp_46.drop(columns='target')
y_46 = comp_46['target']

print(X_46.shape)

(19949, 69)


In [34]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the remaining
# rows become the validation set (about 64% / 16% / 20% train / val / test overall).
# random_state=0 fixes both shuffles.
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [35]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from `trial`, fits a GradientBoostingClassifier
    (random_state=0) on the notebook-level X_train / y_train, and returns the
    ROC-AUC of probability column 1 on X_val / y_val.
    """
    # The suggestion order is kept fixed so a seeded sampler draws the same values.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise the value returned by `objective`.
direction = "maximize"
# Fixed TPE seed so the sequence of sampled hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the Hyperband
# pruner appears to have nothing to act on here; confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# The first argument is `early_stopping_rounds`; presumably the callback halts the
# study after 10 trials without improvement. Verify against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Upper bound of 200 trials; the callback may end the search earlier.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [36]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7940245776553936


In [37]:
optuna_46 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [38]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(auc_46)

0.7958914227387134


In [39]:
X_train = X_train.values
y_train = y_train.values

In [40]:
auc_bootstrap = []

In [41]:
# Seed the module-level generator that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 46)
# bootstrap_auc refits the estimator it receives on every resample, so hand it an
# unfitted clone; otherwise the tuned model `optuna_46` would be left fitted on the
# last bootstrap sample instead of on the training set.
bootstrap_auc(sklearn.base.clone(optuna_46), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78421326, 0.79312856])

In [42]:
np.mean(auc_bootstrap)

0.7887157377512981

In [43]:
t_46 = auc_bootstrap
print(t_46)

[0.7901255407413041, 0.7869492469246164, 0.7881366588263139, 0.7906974532590296, 0.7910917162764454, 0.7949420215430069, 0.7866441388608876, 0.786641761395456, 0.7880214838342916, 0.7860666789238218, 0.7857348904146935, 0.7902249980451952, 0.7892236888542307, 0.7892611999754857, 0.7923270737310146, 0.7931293362327845, 0.7877541510546436, 0.7875915588354012, 0.7864507716724466, 0.7882256816985881, 0.7878981197946715, 0.7892128581783753, 0.7837991052276767, 0.7879773686423933, 0.7872297878455513, 0.7890301895843768, 0.787343642023445, 0.7889222790700623, 0.7883714995783961, 0.7858660472576728, 0.7874944789969421, 0.7885231290403704, 0.7885624893014055, 0.7905220491427388, 0.7919664914738807, 0.78804856052393, 0.7873687374918902, 0.7890820975796345, 0.793529014588128, 0.7911577569828802, 0.7893467887310252, 0.790131880649122, 0.7911869469751243, 0.7902312058715999, 0.7866755742371507, 0.7889840931712853, 0.7897924314180471, 0.7918492031792524, 0.7899260978078712, 0.7893701671411031, 0.790

In [44]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [21]:
# 47.
column_to_drop_46 = '가구주 나이'

In [22]:
# Derive the next feature set by removing `column_to_drop_46` from comp_46.
if column_to_drop_46.startswith('Cat_'):
    # One-hot group: remove every column whose name begins with the prefix.
    # A literal prefix test replaces the former regex so that characters such as
    # '(' or '.' in a name can never be interpreted as pattern syntax.
    _cols_to_drop = [c for c in comp_46.columns if str(c).startswith(column_to_drop_46)]
else:
    # Plain (non one-hot) feature: a single column with exactly this name.
    _cols_to_drop = [column_to_drop_46]

comp_47 = comp_46.drop(columns=_cols_to_drop)
X_47 = comp_47.drop(columns='target')
y_47 = comp_47['target']

print(X_47.shape)

(19949, 68)


In [47]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the remaining
# rows become the validation set (about 64% / 16% / 20% train / val / test overall).
# random_state=0 fixes both shuffles.
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [48]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from `trial`, fits a GradientBoostingClassifier
    (random_state=0) on the notebook-level X_train / y_train, and returns the
    ROC-AUC of probability column 1 on X_val / y_val.
    """
    # The suggestion order is kept fixed so a seeded sampler draws the same values.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise the value returned by `objective`.
direction = "maximize"
# Fixed TPE seed so the sequence of sampled hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the Hyperband
# pruner appears to have nothing to act on here; confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# The first argument is `early_stopping_rounds`; presumably the callback halts the
# study after 10 trials without improvement. Verify against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Upper bound of 200 trials; the callback may end the search earlier.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [49]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7935467865815627


In [50]:
optuna_47 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [51]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(auc_47)

0.795694489352125


In [52]:
X_train = X_train.values
y_train = y_train.values

In [53]:
auc_bootstrap = []

In [54]:
# Seed the module-level generator that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 47)
# bootstrap_auc refits the estimator it receives on every resample, so hand it an
# unfitted clone; otherwise the tuned model `optuna_47` would be left fitted on the
# last bootstrap sample instead of on the training set.
bootstrap_auc(sklearn.base.clone(optuna_47), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78540983, 0.79382609])

In [55]:
np.mean(auc_bootstrap)

0.7896835569735815

In [56]:
t_47 = auc_bootstrap
print(t_47)

[0.7916192494394465, 0.7886109631799287, 0.7875020076374755, 0.7946058743472537, 0.7854701992633026, 0.7934373500875964, 0.790474103589867, 0.7885496774043572, 0.7892040087237132, 0.7882363802930306, 0.7895908751820082, 0.7896623312263706, 0.7903296065241878, 0.7867874471938511, 0.7869982491287909, 0.7926732591141459, 0.7908453844414436, 0.7923878311809347, 0.7911326615144348, 0.7892814084316547, 0.7923120164499475, 0.7891635918113751, 0.7905274644806666, 0.7910572430276863, 0.7918568639011989, 0.7885014676886597, 0.7922673729323975, 0.7906569042652787, 0.7881999258230784, 0.7912889138258596, 0.7856367539249312, 0.7903035864858525, 0.7893887906203176, 0.7895626097596542, 0.7878098894108747, 0.7881492065605367, 0.7927062794673632, 0.7890308499914411, 0.7885980192014674, 0.7918892238473518, 0.7889202978488693, 0.7895719875399678, 0.7903891752413921, 0.7901003131914462, 0.7914998478422124, 0.7919369052373979, 0.7891429871109674, 0.7884609186949089, 0.7869764556956675, 0.790165297246578, 0

In [57]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [23]:
# 48
column_to_drop_47 = 'Cat_가구주 종사상 지위'

In [24]:
# Derive the next feature set by removing `column_to_drop_47` from comp_47.
if column_to_drop_47.startswith('Cat_'):
    # One-hot group: remove every column whose name begins with the prefix.
    # A literal prefix test replaces the former regex so that characters such as
    # '(' or '.' in a name can never be interpreted as pattern syntax.
    _cols_to_drop = [c for c in comp_47.columns if str(c).startswith(column_to_drop_47)]
else:
    # Plain (non one-hot) feature: a single column with exactly this name.
    _cols_to_drop = [column_to_drop_47]

comp_48 = comp_47.drop(columns=_cols_to_drop)
X_48 = comp_48.drop(columns='target')
y_48 = comp_48['target']

print(X_48.shape)

(19949, 63)


In [60]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the remaining
# rows become the validation set (about 64% / 16% / 20% train / val / test overall).
# random_state=0 fixes both shuffles.
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [61]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from `trial`, fits a GradientBoostingClassifier
    (random_state=0) on the notebook-level X_train / y_train, and returns the
    ROC-AUC of probability column 1 on X_val / y_val.
    """
    # The suggestion order is kept fixed so a seeded sampler draws the same values.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise the value returned by `objective`.
direction = "maximize"
# Fixed TPE seed so the sequence of sampled hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the Hyperband
# pruner appears to have nothing to act on here; confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# The first argument is `early_stopping_rounds`; presumably the callback halts the
# study after 10 trials without improvement. Verify against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Upper bound of 200 trials; the callback may end the search earlier.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [62]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.060000000000000005, 'n_estimators': 140, 'subsample': 0.8, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 8}
0.7941331383918233


In [63]:
optuna_48 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [64]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(auc_48)

0.794652102841758


In [65]:
X_train = X_train.values
y_train = y_train.values

In [66]:
auc_bootstrap = []

In [67]:
# Seed the module-level generator that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 48)
# bootstrap_auc refits the estimator it receives on every resample, so hand it an
# unfitted clone; otherwise the tuned model `optuna_48` would be left fitted on the
# last bootstrap sample instead of on the training set.
bootstrap_auc(sklearn.base.clone(optuna_48), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78447154, 0.79245824])

In [68]:
np.mean(auc_bootstrap)

0.7884876394251393

In [69]:
t_48 = auc_bootstrap
print(t_48)

[0.789027019630468, 0.7869491148432035, 0.7910647716682199, 0.7871183111330895, 0.7908605738039236, 0.7827653040091463, 0.7887203265897849, 0.7907797399792473, 0.7887439691626884, 0.786477716280672, 0.7907343039732202, 0.7862213462582921, 0.7928246244132944, 0.7916728744930716, 0.7875064984055132, 0.7866210246136355, 0.7916286272197601, 0.7902054499960904, 0.7915750021661352, 0.7907176617151986, 0.7886931178187334, 0.7893918284928139, 0.7866794045981238, 0.7882736272514598, 0.789676992263199, 0.7894640770256535, 0.790003101271574, 0.7848902297793924, 0.7882717781116796, 0.7863178977710996, 0.7899584577540243, 0.7868626015177739, 0.7863218602134858, 0.7884776930343433, 0.7903025298345495, 0.7895195512190587, 0.7880329749172114, 0.7856539245086043, 0.7860044685783601, 0.7859190119042336, 0.7865761169332598, 0.7878039457472954, 0.7879880672368357, 0.7878030211774054, 0.788469768149571, 0.7896916533000277, 0.7915079048083975, 0.7894943236692005, 0.7854803695320937, 0.7893972438307415, 0.78

In [70]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [25]:
# 49
column_to_drop_48 = 'Cat_이사 계획 중인 주택의 점유형태'

In [26]:
# Derive the next feature set by removing `column_to_drop_48` from comp_48.
if column_to_drop_48.startswith('Cat_'):
    # One-hot group: remove every column whose name begins with the prefix.
    # A literal prefix test replaces the former regex so that characters such as
    # '(' or '.' in a name can never be interpreted as pattern syntax.
    _cols_to_drop = [c for c in comp_48.columns if str(c).startswith(column_to_drop_48)]
else:
    # Plain (non one-hot) feature: a single column with exactly this name.
    _cols_to_drop = [column_to_drop_48]

comp_49 = comp_48.drop(columns=_cols_to_drop)
X_49 = comp_49.drop(columns='target')
y_49 = comp_49['target']

print(X_49.shape)

(19949, 39)


In [73]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the remaining
# rows become the validation set (about 64% / 16% / 20% train / val / test overall).
# random_state=0 fixes both shuffles.
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [74]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from `trial`, fits a GradientBoostingClassifier
    (random_state=0) on the notebook-level X_train / y_train, and returns the
    ROC-AUC of probability column 1 on X_val / y_val.
    """
    # The suggestion order is kept fixed so a seeded sampler draws the same values.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise the value returned by `objective`.
direction = "maximize"
# Fixed TPE seed so the sequence of sampled hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the Hyperband
# pruner appears to have nothing to act on here; confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# The first argument is `early_stopping_rounds`; presumably the callback halts the
# study after 10 trials without improvement. Verify against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Upper bound of 200 trials; the callback may end the search earlier.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [75]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.05, 'n_estimators': 115, 'subsample': 0.7000000000000001, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 7}
0.7893230309332054


In [76]:
optuna_49 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)

In [77]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(auc_49)

0.7935312599721467


In [78]:
X_train = X_train.values
y_train = y_train.values

In [79]:
auc_bootstrap = []

In [80]:
# Seed the module-level generator that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 49)
# bootstrap_auc refits the estimator it receives on every resample, so hand it an
# unfitted clone; otherwise the tuned model `optuna_49` would be left fitted on the
# last bootstrap sample instead of on the training set.
bootstrap_auc(sklearn.base.clone(optuna_49), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78169775, 0.79034429])

In [81]:
np.mean(auc_bootstrap)

0.7860589672844908

In [82]:
t_49 = auc_bootstrap
print(t_49)

[0.7880575420600051, 0.7885391108913276, 0.7877853222680808, 0.7859731652835101, 0.7857694957448653, 0.7834052384544995, 0.7869575680536272, 0.7853457785723796, 0.7847358266077478, 0.7860440930022212, 0.7844898910169846, 0.7904301204793817, 0.7856474525193737, 0.7884364836335278, 0.7865858909578121, 0.7854927851849034, 0.7889720737627142, 0.7859673537013439, 0.787436759419518, 0.7879409141724412, 0.7855128615596596, 0.785736343310235, 0.7934823898493849, 0.7879572922676371, 0.786371258661899, 0.7848038485353757, 0.7894014704359532, 0.7876294662008948, 0.7840791178229601, 0.7902257905336723, 0.7843265063092649, 0.7888254633944289, 0.789129250644029, 0.7906833205478525, 0.7872812995965706, 0.7867565401432396, 0.7828458736709968, 0.7849833471754655, 0.787424607929534, 0.7838560323166234, 0.7870948006415988, 0.783171718516546, 0.7865127178550825, 0.788505298049633, 0.7881798494483224, 0.7889670546690252, 0.7875945967078972, 0.7866755742371507, 0.7843119773538493, 0.7843856787822305, 0.7843

In [83]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [27]:
# 50
column_to_drop_49 = 'Cat_현재 거주 지역'

In [28]:
# Derive the next feature set by removing `column_to_drop_49` from comp_49.
if column_to_drop_49.startswith('Cat_'):
    # One-hot group: remove every column whose name begins with the prefix.
    # A literal prefix test replaces the former regex so that characters such as
    # '(' or '.' in a name can never be interpreted as pattern syntax.
    _cols_to_drop = [c for c in comp_49.columns if str(c).startswith(column_to_drop_49)]
else:
    # Plain (non one-hot) feature: a single column with exactly this name.
    _cols_to_drop = [column_to_drop_49]

comp_50 = comp_49.drop(columns=_cols_to_drop)
X_50 = comp_50.drop(columns='target')
y_50 = comp_50['target']

print(X_50.shape)

(19949, 22)


In [86]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the remaining
# rows become the validation set (about 64% / 16% / 20% train / val / test overall).
# random_state=0 fixes both shuffles.
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [87]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from `trial`, fits a GradientBoostingClassifier
    (random_state=0) on the notebook-level X_train / y_train, and returns the
    ROC-AUC of probability column 1 on X_val / y_val.
    """
    # The suggestion order is kept fixed so a seeded sampler draws the same values.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise the value returned by `objective`.
direction = "maximize"
# Fixed TPE seed so the sequence of sampled hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the Hyperband
# pruner appears to have nothing to act on here; confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# The first argument is `early_stopping_rounds`; presumably the callback halts the
# study after 10 trials without improvement. Verify against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Upper bound of 200 trials; the callback may end the search earlier.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [88]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 157, 'subsample': 0.6, 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 8}
0.7820273779445036


In [89]:
optuna_50 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [90]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(auc_50)

0.7862523853903164


In [91]:
X_train = X_train.values
y_train = y_train.values

In [92]:
auc_bootstrap = []

In [93]:
# Seed the module-level generator that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 50)
# bootstrap_auc refits the estimator it receives on every resample, so hand it an
# unfitted clone; otherwise the tuned model `optuna_50` would be left fitted on the
# last bootstrap sample instead of on the training set.
bootstrap_auc(sklearn.base.clone(optuna_50), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77983171, 0.78528307])

In [94]:
np.mean(auc_bootstrap)

0.7825878880287748

In [95]:
t_50 = auc_bootstrap
print(t_50)

[0.7834560897984544, 0.780708532247941, 0.782363380269784, 0.7805075043375537, 0.7834644109274651, 0.7832044747069378, 0.781535097729679, 0.7798901558138012, 0.7822611492562231, 0.7841323466323467, 0.7796355028497886, 0.7829695018734427, 0.7862513287390134, 0.7845642528524303, 0.7828318730412327, 0.7858426688475949, 0.7835216021792377, 0.7829366136016381, 0.7839255071397928, 0.7811367401884644, 0.7799099680257315, 0.7799818203143327, 0.7835250362959723, 0.7823345865217787, 0.7826457703304995, 0.7813735621617395, 0.7839593199814874, 0.7819457388422906, 0.7807650630926494, 0.7801835086317844, 0.7797547723656099, 0.7842549181834896, 0.7822677533268667, 0.7825937302538287, 0.7805115988613525, 0.7837235546595152, 0.784108704059443, 0.7829988239471, 0.7820426865993368, 0.7827159055607331, 0.7800590879408613, 0.7829511425570539, 0.7831961535779269, 0.7835235834004306, 0.7833809354745316, 0.7823076419135532, 0.7820956512458976, 0.78350192204872, 0.7798118315359694, 0.7820845564072165, 0.783887

In [96]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [29]:
# 51
column_to_drop_50 = 'Cat_가구주 최종 학력'

In [30]:
# Derive the next feature set by removing `column_to_drop_50` from comp_50.
if column_to_drop_50.startswith('Cat_'):
    # One-hot group: remove every column whose name begins with the prefix.
    # A literal prefix test replaces the former regex so that characters such as
    # '(' or '.' in a name can never be interpreted as pattern syntax.
    _cols_to_drop = [c for c in comp_50.columns if str(c).startswith(column_to_drop_50)]
else:
    # Plain (non one-hot) feature: a single column with exactly this name.
    _cols_to_drop = [column_to_drop_50]

comp_51 = comp_50.drop(columns=_cols_to_drop)
X_51 = comp_51.drop(columns='target')
y_51 = comp_51['target']

print(X_51.shape)

(19949, 19)


In [99]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the remaining
# rows become the validation set (about 64% / 16% / 20% train / val / test overall).
# random_state=0 fixes both shuffles.
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [100]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of one gradient-boosting configuration.

    Draws six hyperparameters from `trial`, fits a GradientBoostingClassifier
    (random_state=0) on the notebook-level X_train / y_train, and returns the
    ROC-AUC of probability column 1 on X_val / y_val.
    """
    # The suggestion order is kept fixed so a seeded sampler draws the same values.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

# Search for the hyperparameters that maximise the value returned by `objective`.
direction = "maximize"
# Fixed TPE seed so the sequence of sampled hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): `objective` never reports intermediate values, so the Hyperband
# pruner appears to have nothing to act on here; confirm before relying on it.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# The first argument is `early_stopping_rounds`; presumably the callback halts the
# study after 10 trials without improvement. Verify against EarlyStoppingCallback.
early_stopping = EarlyStoppingCallback(10, direction=direction)
# Upper bound of 200 trials; the callback may end the search earlier.
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [101]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.06999999999999999, 'n_estimators': 95, 'subsample': 0.8, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 7}
0.7800679185687154


In [102]:
optuna_51 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [103]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(auc_51)

0.784130893736805


In [104]:
X_train = X_train.values
y_train = y_train.values

In [105]:
auc_bootstrap = []

In [106]:
# Seed the module-level generator that bootstrap_auc draws its resampling indices from.
rs = RandomState(seed = 51)
# bootstrap_auc refits the estimator it receives on every resample, so hand it an
# unfitted clone; otherwise the tuned model `optuna_51` would be left fitted on the
# last bootstrap sample instead of on the training set.
bootstrap_auc(sklearn.base.clone(optuna_51), X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77627234, 0.78268315])

In [107]:
np.mean(auc_bootstrap)

0.7795671754838934

In [108]:
t_51 = auc_bootstrap
print(t_51)

[0.780816046518017, 0.7803501953748259, 0.7776032982313771, 0.7778118547822981, 0.778510829619204, 0.7811281548966278, 0.7793086013529363, 0.7796899203918908, 0.7802805884702437, 0.7806521334846458, 0.7809642418632565, 0.7780715268399997, 0.778779219050155, 0.7794554758840474, 0.7793878502006579, 0.781834922536893, 0.7813611465089298, 0.7813878269543294, 0.7800983161204837, 0.7794369844862457, 0.7772339986009937, 0.7811480991899711, 0.7777513614952039, 0.7805205803974277, 0.7772836612122326, 0.7791372917604443, 0.7808238393213762, 0.7819041331972367, 0.782396004378763, 0.779740375491607, 0.7797319222811835, 0.7768162250920871, 0.7804574454820761, 0.7766089893552948, 0.7780119581227956, 0.7785126787589842, 0.782130652820308, 0.7806731344292921, 0.7773629100599544, 0.7783944658944659, 0.7796551829803061, 0.7788711477135122, 0.7811377968397673, 0.7824386666751199, 0.7783420295735567, 0.7818934346027943, 0.7762515770520696, 0.777354192686705, 0.7767055408681025, 0.7797239973964112, 0.78005

In [109]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [31]:
# 52
column_to_drop_51 = '소득 대비 주택 임대료의 비율'

In [32]:
# Derive the next feature set by removing `column_to_drop_51` from comp_51.
if column_to_drop_51.startswith('Cat_'):
    # One-hot group: remove every column whose name begins with the prefix.
    # A literal prefix test replaces the former regex so that characters such as
    # '(' or '.' in a name can never be interpreted as pattern syntax.
    _cols_to_drop = [c for c in comp_51.columns if str(c).startswith(column_to_drop_51)]
else:
    # Plain (non one-hot) feature: a single column with exactly this name.
    _cols_to_drop = [column_to_drop_51]

comp_52 = comp_51.drop(columns=_cols_to_drop)
X_52 = comp_52.drop(columns='target')
y_52 = comp_52['target']

print(X_52.shape)

(19949, 18)


In [112]:
# Stratified hold-out: 20% of the rows become the test set, then 20% of the remaining
# rows become the validation set (about 64% / 16% / 20% train / val / test overall).
# random_state=0 fixes both shuffles.
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [113]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a GradientBoostingClassifier configuration from ``trial``, fits it on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the second-class probabilities on the validation split.
    """
    # Search space, sampled in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the probability of the second class.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximize the value returned by objective() using a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(),
# so the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; presumably it stops
# the study after 10 trials without improvement — confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [114]:
# Best hyperparameters found and the objective value (validation AUC) they reached.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.060000000000000005, 'n_estimators': 78, 'subsample': 0.6, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 6}
0.7762641650090171


In [115]:
# Refit the best configuration on the training split (validation rows are not added back).
optuna_52 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [116]:
# ROC AUC of the refit model on the held-out test split.
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(auc_52)

0.77660278152889


In [117]:
# bootstrap_auc() selects rows with X_train[idx] / y_train[idx], which needs
# positional (NumPy) indexing. np.asarray converts the pandas objects to arrays
# and leaves an existing ndarray untouched, so re-running this cell no longer
# raises AttributeError the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [118]:
# bootstrap_auc() appends to this module-level list, so start from an empty one.
auc_bootstrap = []

In [119]:
# bootstrap_auc() reads the module-level `rs` to draw its resamples, so seed it
# here. The call refits optuna_52 on every resample (2000 fits), appends each
# test AUC to auc_bootstrap and returns the 2.5th/97.5th percentiles.
# NOTE(review): optuna_52 is refit in place, so afterwards it presumably holds
# the fit from the last resample rather than the earlier fit — confirm.
rs = RandomState(seed = 52)
bootstrap_auc(optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76885508, 0.77632101])

In [120]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.772572292780113

In [121]:
# Keep the step-52 bootstrap AUCs. Copy the list: bootstrap_auc() appends to
# the module-level auc_bootstrap in place, so a plain alias would also pick up
# values appended by a later run that did not reset the list first.
t_52 = list(auc_bootstrap)
print(t_52)

[0.7732226860921442, 0.7762246324438442, 0.772007537093744, 0.7728315930286375, 0.7716494643834545, 0.7741477843078828, 0.7718125849283485, 0.7729764863385553, 0.7740587614356088, 0.773808335076808, 0.7733378610841665, 0.7731440976514868, 0.7748651184611774, 0.7704266546631078, 0.7710293421500318, 0.7717962068331526, 0.7704672036568587, 0.7730316963691348, 0.7716008584235184, 0.7719063627314859, 0.7746907709961897, 0.7737742580722877, 0.7732813302394583, 0.7697150000105666, 0.7745551233851725, 0.7708608062672102, 0.7733620319827216, 0.7738122975191941, 0.772789459057932, 0.7713071093612967, 0.7723148905414915, 0.7697513223991056, 0.7737051794933568, 0.7733370685956893, 0.7705596606458676, 0.7723827803877065, 0.7726013751260057, 0.7743580579171713, 0.773354503342188, 0.7729754296872523, 0.7690474605499236, 0.7727351735972426, 0.7696677148647592, 0.7731240212767306, 0.7723810633293391, 0.7736835181416462, 0.774322792179935, 0.7731534754318006, 0.7733695606232551, 0.7688959631693622, 0.77

In [122]:
# Remove this step's splits, study and best score from the namespace before the next step.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [37]:
# Step 53: feature removed from comp_52 in the next cell
# (a name starting with 'Cat_' is treated there as a column-name prefix).
column_to_drop_52 = '장기부채부담지표'

In [38]:
# Step 53: remove the selected feature from comp_52, then split into X / y.
if column_to_drop_52.startswith('Cat_'):
    # A 'Cat_' name is used as the prefix of every column to drop. Match it as
    # a literal prefix instead of building a regex from it, so characters such
    # as '(' or '.' in a feature name cannot be read as pattern syntax.
    comp_53 = comp_52.drop(
        columns=[c for c in comp_52.columns if str(c).startswith(column_to_drop_52)]
    )
else:
    comp_53 = comp_52.drop(columns=[column_to_drop_52])

# Both branches share the same feature/target split.
X_53 = comp_53.drop(columns='target')
y_53 = comp_53['target']

print(X_53.shape)

(19949, 17)


In [39]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
# (64% / 16% / 20% overall). Both splits are stratified and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [40]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a GradientBoostingClassifier configuration from ``trial``, fits it on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the second-class probabilities on the validation split.
    """
    # Search space, sampled in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the probability of the second class.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximize the value returned by objective() using a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(),
# so the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; presumably it stops
# the study after 10 trials without improvement — confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [127]:
# Best hyperparameters found and the objective value (validation AUC) they reached.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.09999999999999999, 'n_estimators': 87, 'subsample': 0.5, 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 3}
0.7669731409180276


In [128]:
# Refit the best configuration on the training split (validation rows are not added back).
optuna_53 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [129]:
# ROC AUC of the refit model on the held-out test split.
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(auc_53)

0.7693873060252371


In [130]:
# bootstrap_auc() selects rows with X_train[idx] / y_train[idx], which needs
# positional (NumPy) indexing. np.asarray converts the pandas objects to arrays
# and leaves an existing ndarray untouched, so re-running this cell no longer
# raises AttributeError the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [131]:
# bootstrap_auc() appends to this module-level list, so start from an empty one.
auc_bootstrap = []

In [132]:
# bootstrap_auc() reads the module-level `rs` to draw its resamples, so seed it
# here. The call refits optuna_53 on every resample (2000 fits), appends each
# test AUC to auc_bootstrap and returns the 2.5th/97.5th percentiles.
# NOTE(review): optuna_53 is refit in place, so afterwards it presumably holds
# the fit from the last resample rather than the earlier fit — confirm.
rs = RandomState(seed = 53)
bootstrap_auc(optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76364638, 0.76962718])

In [133]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7667421101828429

In [134]:
# Keep the step-53 bootstrap AUCs. Copy the list: bootstrap_auc() appends to
# the module-level auc_bootstrap in place, so a plain alias would also pick up
# values appended by a later run that did not reset the list first.
t_53 = list(auc_bootstrap)
print(t_53)

[0.7690638386451194, 0.7665170448421679, 0.7653782389004063, 0.7661868413099939, 0.7679912054912055, 0.7646944534259805, 0.7677382695855602, 0.7666559944885069, 0.7656529682391752, 0.7665607637898278, 0.7657907291527982, 0.7693359263556308, 0.7669603100637582, 0.765243912103518, 0.7683014647300362, 0.7655632849598367, 0.7685138516419304, 0.7679444486710496, 0.7680449626262433, 0.7653226326255882, 0.7678331040400005, 0.765671327555564, 0.7695521436284982, 0.7661616137601359, 0.7680971347843268, 0.7662569765402278, 0.7705391880268728, 0.7670404834813701, 0.7649724848000711, 0.7662660901577157, 0.769680130517569, 0.7682841620649502, 0.7673285530428386, 0.7657865025475863, 0.7680365094158198, 0.7634063954876762, 0.7662427117476378, 0.7644835194096278, 0.7662127292669164, 0.7664219462249019, 0.7664508720543204, 0.7636667279524423, 0.7689087750664105, 0.7669683670299434, 0.7646874531110985, 0.7670452384122335, 0.7666297102873457, 0.7635478546808595, 0.7676892673813857, 0.7637779405020784, 0.

In [135]:
# Remove this step's splits, study and best score from the namespace before the next step.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [35]:
# Step 54: feature removed from comp_53 in the next cell
# (a name starting with 'Cat_' is treated there as a column-name prefix).
column_to_drop_53 = 'Cat_이사 계획 중인 거주 지역'

In [36]:
# Step 54: remove the selected feature from comp_53, then split into X / y.
if column_to_drop_53.startswith('Cat_'):
    # A 'Cat_' name is used as the prefix of every column to drop. Match it as
    # a literal prefix instead of building a regex from it, so characters such
    # as '(' or '.' in a feature name cannot be read as pattern syntax.
    comp_54 = comp_53.drop(
        columns=[c for c in comp_53.columns if str(c).startswith(column_to_drop_53)]
    )
else:
    comp_54 = comp_53.drop(columns=[column_to_drop_53])

# Both branches share the same feature/target split.
X_54 = comp_54.drop(columns='target')
y_54 = comp_54['target']

print(X_54.shape)

(19949, 10)


In [138]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
# (64% / 16% / 20% overall). Both splits are stratified and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [139]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a GradientBoostingClassifier configuration from ``trial``, fits it on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the second-class probabilities on the validation split.
    """
    # Search space, sampled in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the probability of the second class.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximize the value returned by objective() using a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(),
# so the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; presumably it stops
# the study after 10 trials without improvement — confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [140]:
# Best hyperparameters found and the objective value (validation AUC) they reached.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.06999999999999999, 'n_estimators': 147, 'subsample': 0.5, 'max_depth': 3, 'min_samples_split': 4, 'min_samples_leaf': 7}
0.7660751413663051


In [141]:
# Refit the best configuration on the training split (validation rows are not added back).
optuna_54 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [142]:
# ROC AUC of the refit model on the held-out test split.
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(auc_54)

0.7638071304943226


In [143]:
# bootstrap_auc() selects rows with X_train[idx] / y_train[idx], which needs
# positional (NumPy) indexing. np.asarray converts the pandas objects to arrays
# and leaves an existing ndarray untouched, so re-running this cell no longer
# raises AttributeError the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [144]:
# bootstrap_auc() appends to this module-level list, so start from an empty one.
auc_bootstrap = []

In [145]:
# bootstrap_auc() reads the module-level `rs` to draw its resamples, so seed it
# here. The call refits optuna_54 on every resample (2000 fits), appends each
# test AUC to auc_bootstrap and returns the 2.5th/97.5th percentiles.
# NOTE(review): optuna_54 is refit in place, so afterwards it presumably holds
# the fit from the last resample rather than the earlier fit — confirm.
rs = RandomState(seed = 54)
bootstrap_auc(optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75834865, 0.76475698])

In [146]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7614565506965446

In [147]:
# Keep the step-54 bootstrap AUCs. Copy the list: bootstrap_auc() appends to
# the module-level auc_bootstrap in place, so a plain alias would also pick up
# values appended by a later run that did not reset the list first.
t_54 = list(auc_bootstrap)
print(t_54)

[0.760901999818256, 0.7602584991747553, 0.7595352213578817, 0.759418065144666, 0.7646821698545836, 0.7603322006031366, 0.7601298518786204, 0.7607218407711018, 0.7627794050207842, 0.7616568450928056, 0.7635485150879239, 0.7639204563465648, 0.763061266755848, 0.7617568307223479, 0.7635244762707817, 0.761133802697842, 0.7618233997544342, 0.7623299319727892, 0.7609479641499345, 0.7620916571039724, 0.7592577183094424, 0.7626161523944776, 0.7622807976872016, 0.7604444698040757, 0.7634309626304701, 0.761216881906537, 0.7574347306490163, 0.760888791676969, 0.7627307990608484, 0.7586723334876044, 0.7640930867531852, 0.7580360973218117, 0.7625803583315897, 0.7632205569397686, 0.761056931315552, 0.7612773751936313, 0.7609581344187256, 0.7623481592077651, 0.762210530375555, 0.7610173068916912, 0.7628524460421012, 0.7640279706166406, 0.759756457724438, 0.7624301817651572, 0.7619581227955613, 0.7622683820343918, 0.7645072940639444, 0.760529266071138, 0.7620382962131731, 0.7643848545942142, 0.7587325

In [148]:
# Remove this step's splits, study and best score from the namespace before the next step.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [43]:
# Step 55: feature removed from comp_54 in the next cell
# (a name starting with 'Cat_' is treated there as a column-name prefix).
column_to_drop_54 = '현재 주택의 면적(㎡)'

In [44]:
# Step 55: remove the selected feature from comp_54, then split into X / y.
if column_to_drop_54.startswith('Cat_'):
    # A 'Cat_' name is used as the prefix of every column to drop. Match it as
    # a literal prefix instead of building a regex from it, so characters such
    # as '(' or '.' in a feature name cannot be read as pattern syntax.
    comp_55 = comp_54.drop(
        columns=[c for c in comp_54.columns if str(c).startswith(column_to_drop_54)]
    )
else:
    comp_55 = comp_54.drop(columns=[column_to_drop_54])

# Both branches share the same feature/target split.
X_55 = comp_55.drop(columns='target')
y_55 = comp_55['target']

print(X_55.shape)

(19949, 9)


In [45]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
# (64% / 16% / 20% overall). Both splits are stratified and use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [46]:
def objective(trial):
    """Score one Optuna trial by validation ROC AUC.

    Draws a GradientBoostingClassifier configuration from ``trial``, fits it on
    the notebook-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the second-class probabilities on the validation split.
    """
    # Search space, sampled in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    subsample = trial.suggest_float('subsample', 0.1, 1.0, step=0.1)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 3, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the probability of the second class.
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Maximize the value returned by objective() using a seeded TPE sampler.
direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
# NOTE(review): objective() never calls trial.report()/trial.should_prune(),
# so the Hyperband pruner presumably has no effect here — confirm.
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
# EarlyStoppingCallback is defined earlier in the notebook; presumably it stops
# the study after 10 trials without improvement — confirm against its definition.
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [47]:
# Best hyperparameters found and the objective value (validation AUC) they reached.
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'learning_rate': 0.01, 'n_estimators': 127, 'subsample': 0.9, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 5}
0.7352387324894216


In [48]:
# Refit the best configuration on the training split (validation rows are not added back).
optuna_55 = GradientBoostingClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [49]:
# ROC AUC of the refit model on the held-out test split.
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(auc_55)

0.7423720342439556


In [50]:
# bootstrap_auc() selects rows with X_train[idx] / y_train[idx], which needs
# positional (NumPy) indexing. np.asarray converts the pandas objects to arrays
# and leaves an existing ndarray untouched, so re-running this cell no longer
# raises AttributeError the way a second `.values` access would.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [51]:
# bootstrap_auc() appends to this module-level list, so start from an empty one.
auc_bootstrap = []

In [52]:
# bootstrap_auc() reads the module-level `rs` to draw its resamples, so seed it
# here. The call refits optuna_55 on every resample (2000 fits), appends each
# test AUC to auc_bootstrap and returns the 2.5th/97.5th percentiles.
# NOTE(review): optuna_55 is refit in place, so afterwards it presumably holds
# the fit from the last resample rather than the earlier fit — confirm.
rs = RandomState(seed = 55)
bootstrap_auc(optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74119704, 0.74239105])

In [53]:
# Mean test AUC over the bootstrap refits collected in auc_bootstrap.
np.mean(auc_bootstrap)

0.7419330434410484

In [54]:
# Keep the step-55 bootstrap AUCs. Copy the list: bootstrap_auc() appends to
# the module-level auc_bootstrap in place, so a plain alias would also pick up
# values appended by a later run that did not reset the list first.
t_55 = list(auc_bootstrap)
print(t_55)

[0.74155524278677, 0.7423720342439556, 0.7414199914199915, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.74155524278677, 0.7423720342439556, 0.7414199914199915, 0.7423720342439556, 0.74155524278677, 0.7422367828771771, 0.7414199914199915, 0.7422367828771771, 0.74155524278677, 0.7423720342439556, 0.7422367828771771, 0.74155524278677, 0.7423720342439556, 0.7422558026006303, 0.7422367828771771, 0.7411970379950676, 0.7422367828771771, 0.7422367828771771, 0.7414199914199915, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7409777828497042, 0.74155524278677, 0.7422066683150428, 0.74155524278677, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7423910539674088, 0.7423720342439556, 0.74155524278677, 0.74155524278677, 0.74155524278677, 0.7423820724313337, 0.7415742625102232, 0.74155524278677, 0.7423720342439556, 0.7422066683150428, 0.7414199914199915, 0.74155524278677, 0.74155524278677, 0.7422367828771771, 0.742

In [160]:
# NOTE(review): this cell repeats the preceding `t_55` cell verbatim and can
# probably be deleted. Until then, take a copy for the same reason: bootstrap_auc()
# appends to the module-level auc_bootstrap in place, so a plain alias would also
# pick up values appended by a later run that did not reset the list first.
t_55 = list(auc_bootstrap)
print(t_55)

[0.74155524278677, 0.7423720342439556, 0.7414199914199915, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.74155524278677, 0.7423720342439556, 0.7414199914199915, 0.7423720342439556, 0.74155524278677, 0.7422367828771771, 0.7414199914199915, 0.7422367828771771, 0.74155524278677, 0.7423720342439556, 0.7422367828771771, 0.74155524278677, 0.7423720342439556, 0.7422558026006303, 0.7422367828771771, 0.7411970379950676, 0.7422367828771771, 0.7422367828771771, 0.7414199914199915, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7409777828497042, 0.74155524278677, 0.7422066683150428, 0.74155524278677, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7423910539674088, 0.7423720342439556, 0.74155524278677, 0.74155524278677, 0.74155524278677, 0.7423820724313337, 0.7415742625102232, 0.74155524278677, 0.7423720342439556, 0.7422066683150428, 0.7414199914199915, 0.74155524278677, 0.74155524278677, 0.7422367828771771, 0.742

In [161]:
# Remove this step's splits, study and best score from the namespace before the next step.
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc