In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
import decimal
context = decimal.getcontext()

context.rounding = decimal.ROUND_HALF_UP

In [3]:
class EarlyStoppingCallback(object):
    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [4]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [5]:
고령가구 = pd.read_csv('고령가구_변수추가.csv', encoding='cp949')
고령가구.rename(columns = {'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?':'target'}, inplace=True)

In [6]:
고령가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [7]:
cat = 고령가구.select_dtypes(include = 'object')
num = 고령가구.select_dtypes(exclude = 'object')
num_고령 = num.drop('target',axis=1)
target = 고령가구.target

In [8]:
scaler=RobustScaler()
scaler.fit(num_고령)
num_scaled_고령=scaler.transform(num_고령)
num_df_scaled_고령=pd.DataFrame(data=num_scaled_고령, columns=num_고령.columns)

In [9]:
enc = OneHotEncoder()
enc.fit(cat)

X_cat = enc.transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(X_cat, columns= new_feature_names)

In [10]:
comp =pd.concat([num_df_scaled_고령, target,cat2],axis=1)

In [11]:
X =comp.drop('target', axis = 1)
y=comp.target
X.shape

(10564, 210)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [14]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [15]:
print(study.best_trial.params)

{'n_estimators': 172, 'learning_rate': 0.08, 'max_depth': 5, 'num_leaves': 568, 'subsample': 0.9, 'colsample_bytree': 0.2, 'reg_alpha': 10, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}


In [16]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.8519372228939943


In [17]:
optuna_0 = LGBMClassifier(**study.best_trial.params, random_state = 0)

In [18]:
optuna_0.fit(X_train, y_train)

In [19]:
optuna_0_proba = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_0_proba)
print(decimal.Decimal(auc_0).quantize(decimal.Decimal('1.000')))

0.855


In [20]:
X_train = X_train.values
y_train = y_train.values

In [21]:
from sklearn.utils import resample
from numpy.random import RandomState

In [22]:
from scipy.stats import shapiro

In [23]:
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

In [24]:
auc_bootstrap = []
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000):
    for b in range(nsamples):
        idx = rs.randint(X_train.shape[0], size=X_train.shape[0])
        clf.fit(X_train[idx], y_train[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test.ravel(), pred.ravel())
        auc_bootstrap.append(roc_auc)
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [25]:
rs = RandomState(seed = 2024)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84775104, 0.85516119])

In [26]:
np.mean(auc_bootstrap)

0.8515342901832132

In [27]:
shapiro(auc_bootstrap)

ShapiroResult(statistic=0.9985254406929016, pvalue=0.07753599435091019)

In [28]:
t_0 = auc_bootstrap
print(t_0)

[0.8466001577569826, 0.8475942060162777, 0.8512737083647053, 0.8513866480226597, 0.8474740955863898, 0.8541214011688358, 0.8505763507941629, 0.8518796385930946, 0.8496172600480443, 0.8497570901007494, 0.8547757341077766, 0.8522327991108243, 0.8507251443117851, 0.8521386827291957, 0.8500242013552759, 0.8505422896274785, 0.8517935893298914, 0.850489405184468, 0.8509456455487434, 0.8517398085403892, 0.8504185938116238, 0.8480728550428455, 0.8501425190921803, 0.8490946900433831, 0.8536624717650854, 0.850468789215159, 0.8483103868631456, 0.853538775949231, 0.8518957728299451, 0.8535979348176832, 0.8516555519701696, 0.8513642393603671, 0.8515327525008067, 0.8516689971675451, 0.8515067584525475, 0.8536167580940088, 0.8459126599978488, 0.852830662220788, 0.8476184073715535, 0.8511132623426912, 0.8493107095478828, 0.8532618048832956, 0.8524210318740812, 0.849629808898928, 0.8531192857911154, 0.8490301530959808, 0.8512979097199813, 0.8485604675343301, 0.8557563371696963, 0.8530386146068625, 0.85

In [29]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [30]:
# 1.
column_to_drop = 'Cat_가구주 동거 여부'

In [31]:
if not column_to_drop.startswith('Cat_'):
    comp_1 = comp.drop(column_to_drop, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']
else:
    comp_1 = comp.drop(comp.filter(regex='^' + column_to_drop).columns, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']

print(X_1.shape)

(10564, 208)


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [33]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [34]:
print(study.best_trial.params)

{'n_estimators': 172, 'learning_rate': 0.08, 'max_depth': 5, 'num_leaves': 568, 'subsample': 0.9, 'colsample_bytree': 0.2, 'reg_alpha': 10, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}


In [35]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.8519372228939943


In [36]:
optuna_1 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [37]:
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(decimal.Decimal(auc_1).quantize(decimal.Decimal('1.000')))

0.855


In [38]:
X_train = X_train.values
y_train = y_train.values

In [39]:
auc_bootstrap = []

In [40]:
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84759123, 0.85526135])

In [41]:
shapiro(auc_bootstrap)

ShapiroResult(statistic=0.9993974566459656, pvalue=0.8036234974861145)

In [42]:
np.mean(auc_bootstrap)

0.8514348535369832

In [43]:
t_1 = auc_bootstrap
print(t_1)

[0.8491018608153167, 0.8465705783227564, 0.8478900003585386, 0.8502079523860744, 0.8485201319422035, 0.8514484959305868, 0.8528127352909541, 0.8488965974687175, 0.8521987379441396, 0.8492273493241548, 0.8485228209816787, 0.8501577569825391, 0.8505808325266215, 0.8504239718905741, 0.8505790398336381, 0.8527266860277509, 0.8502608368290846, 0.8508255351188556, 0.8489898175038544, 0.8525752034706536, 0.8556147144240077, 0.85202484672475, 0.8513579649349253, 0.8531757556200924, 0.8436431106808648, 0.8504696855616507, 0.8519504499659389, 0.8527481983435518, 0.8478317378365781, 0.8526621490803485, 0.8484762109641103, 0.8494944605786813, 0.8517156071851135, 0.852538453264494, 0.8535333978702808, 0.8515659173209995, 0.8501846473772903, 0.8495733390699509, 0.8472338747266144, 0.8552938223799793, 0.8476291635294539, 0.850668674482808, 0.8516582410096445, 0.8537664479581226, 0.8535826969273242, 0.8522847872073429, 0.8553511885554481, 0.8520866946326772, 0.8510881646409236, 0.8504562403642753, 0.8

In [44]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [45]:
#### 2. 
column_to_drop_1 = '자산 중 부동산 자산의 비중'

In [46]:
if not column_to_drop_1.startswith('Cat_'):
    comp_2 = comp_1.drop(column_to_drop_1, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']
else:
    comp_2 = comp_1.drop(comp_1.filter(regex='^' + column_to_drop_1).columns, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']

print(X_2.shape)

(10564, 207)


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [48]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [49]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 125, 'learning_rate': 0.08, 'max_depth': 2, 'num_leaves': 746, 'subsample': 0.4, 'colsample_bytree': 0.5, 'reg_alpha': 4, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8509757490259304


In [50]:
optuna_2 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [51]:
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(decimal.Decimal(auc_2).quantize(decimal.Decimal('1.000')))

0.854


In [52]:
X_train = X_train.values
y_train = y_train.values

In [53]:
auc_bootstrap = []

In [54]:
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84884984, 0.85409821])

In [55]:
shapiro(auc_bootstrap)

ShapiroResult(statistic=0.9980940818786621, pvalue=0.01901491917669773)

In [56]:
np.mean(auc_bootstrap)

0.8515775297138862

In [57]:
t_2 = auc_bootstrap
print(t_2)

[0.8512562296081173, 0.8532752500806712, 0.8526863504356244, 0.8500255458750134, 0.8523520131942204, 0.8503648130221217, 0.8510097343228997, 0.8508058154960381, 0.8506063784016351, 0.8508179161736762, 0.8497624681796996, 0.8514964504678928, 0.8518200315513966, 0.851254436915134, 0.8500071707719337, 0.8540268366139616, 0.8512737083647055, 0.8502626295220681, 0.8514614929547165, 0.8509725359434943, 0.8540945107740847, 0.8523954859990679, 0.849205837008354, 0.8485653974400345, 0.8523013696174393, 0.8520978989638235, 0.8509259259259259, 0.8525187336416765, 0.85274730199706, 0.8501362446667384, 0.8500811193574989, 0.8526209171417303, 0.8516994729482629, 0.8525890968412749, 0.8513705137858091, 0.8506041375354055, 0.8504436915133913, 0.8528073572120038, 0.8527464056505683, 0.8490265677100141, 0.8510800975224984, 0.8528974400344196, 0.8494061704492489, 0.8524331325517192, 0.8526343623391058, 0.8522292137248575, 0.8521924635186977, 0.8517317414219641, 0.8518778459001111, 0.849154745258327, 0.85

In [58]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [59]:
#### 3.
column_to_drop_2 = '부채 중 임대 보증금의 비중'

In [60]:
if not column_to_drop_2.startswith('Cat_'):
    comp_3 = comp_2.drop(column_to_drop_2, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']
else:
    comp_3 = comp_2.drop(comp_2.filter(regex='^' + column_to_drop_2).columns, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']

print(X_3.shape)

(10564, 206)


In [61]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [62]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [63]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 125, 'learning_rate': 0.08, 'max_depth': 2, 'num_leaves': 746, 'subsample': 0.4, 'colsample_bytree': 0.5, 'reg_alpha': 4, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8509757490259304


In [64]:
optuna_3 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [65]:
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(decimal.Decimal(auc_3).quantize(decimal.Decimal('1.000')))

0.854


In [66]:
X_train = X_train.values
y_train = y_train.values

In [67]:
auc_bootstrap = []

In [68]:
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84896299, 0.85408472])

In [69]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9981704354286194, pvalue=0.02438974194228649),
 0.8515778878043097)

In [70]:
t_3 = auc_bootstrap
print(t_3)

[0.8499417374780395, 0.8529458427449714, 0.8522059087160733, 0.8519813739199025, 0.852368147431071, 0.8492255566311713, 0.8507838550069914, 0.8516819941916749, 0.8508076081890215, 0.8513624466673837, 0.8525187336416766, 0.8529552543831345, 0.8526832132229035, 0.8511939335269442, 0.8537713778638271, 0.8543114266250762, 0.8504069413072317, 0.8494729482628804, 0.8525805815496037, 0.8514735936323545, 0.85204859990678, 0.8520472553870424, 0.8530735721200386, 0.8532407407407407, 0.8528987845541574, 0.8506615037108743, 0.8523475314617619, 0.8518984618694203, 0.8505857624323259, 0.8505037467283354, 0.8521548169660463, 0.8523968305188053, 0.853078950198989, 0.8518370621347389, 0.8493483561005342, 0.8526415331110394, 0.8515775698253917, 0.8511473235093757, 0.8514718009393711, 0.8503849808181851, 0.852144060808146, 0.8508555627263275, 0.8519244559176795, 0.8514193646696067, 0.8539936717937686, 0.852069215876089, 0.8507851995267292, 0.8536306514646301, 0.8527132408303753, 0.8501048725395288, 0.851

In [71]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [72]:
### 4. 
column_to_drop_3 = '부채 중 비금융기관 대출금의 비중'

In [73]:
if not column_to_drop_3.startswith('Cat_'):
    comp_4 = comp_3.drop(column_to_drop_3, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']
else:
    comp_4 = comp_3.drop(comp_3.filter(regex='^' + column_to_drop_3).columns, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']

print(X_4.shape)

(10564, 205)


In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [75]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [76]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 141, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 932, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8522941018406557


In [77]:
optuna_4 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [78]:
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(decimal.Decimal(auc_4).quantize(decimal.Decimal('1.000')))

0.856


In [79]:
X_train = X_train.values
y_train = y_train.values

In [80]:
auc_bootstrap = []

In [81]:
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84836536, 0.85531905])

In [82]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9993829727172852, pvalue=0.7864808440208435),
 0.8519941275859596)

In [83]:
t_4 = auc_bootstrap
print(t_4)

[0.8534796170807788, 0.8522910616327848, 0.8534975440106127, 0.8583046502455989, 0.8519576207378725, 0.850143415438672, 0.8515811552113585, 0.8515569538560828, 0.8499811767236745, 0.8530269621024704, 0.8511132623426911, 0.8538471191423758, 0.8529355347603169, 0.8537772041160231, 0.8535477394141479, 0.8521225484923451, 0.8531309382955075, 0.8518778459001113, 0.8507950593381377, 0.852033810189667, 0.8525106665232512, 0.8513893370621348, 0.854681617726148, 0.8541016815460184, 0.8519629988168227, 0.8530323401814204, 0.8547542217919759, 0.8520705603958265, 0.8524255136065397, 0.8506489548599905, 0.8505888996450468, 0.8530753648130222, 0.8488069628195474, 0.8518428883869349, 0.8527248933347675, 0.8530914990498728, 0.8520158832598328, 0.851854540891327, 0.8498709261051952, 0.8524515076547992, 0.8526227098347137, 0.8520311211501919, 0.8527795704707613, 0.8503262701229787, 0.8524183428346062, 0.8505906923380302, 0.850337026280879, 0.8524031049442473, 0.8490238786705389, 0.8490911046574163, 0.85

In [84]:
## 5.현재 주택의 위치
column_to_drop_4 = '부채 중 금융기관 대출금의 비중'

In [85]:
if not column_to_drop_4.startswith('Cat_'):
    comp_5 = comp_4.drop(column_to_drop_4, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']
else:
    comp_5 = comp_4.drop(comp_4.filter(regex='^' + column_to_drop_4).columns, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']

print(X_5.shape)

(10564, 204)


In [86]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [87]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [88]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 122, 'learning_rate': 0.09, 'max_depth': 3, 'num_leaves': 602, 'subsample': 0.4, 'colsample_bytree': 0.4, 'reg_alpha': 8, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8514389918939496


In [89]:
optuna_5 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [90]:
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(decimal.Decimal(auc_5).quantize(decimal.Decimal('1.000')))

0.855


In [91]:
X_train = X_train.values
y_train = y_train.values

In [92]:
auc_bootstrap = []

In [93]:
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84889563, 0.85535914])

In [94]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9984696507453918, pvalue=0.06472457945346832),
 0.8521926214997669)

In [95]:
t_5 = auc_bootstrap
print(t_5)

[0.8518052418342834, 0.8533971532035425, 0.8547461546735506, 0.8552059804237926, 0.851928937650138, 0.8536077946290919, 0.8530959807823313, 0.8534204582123266, 0.8531865117779929, 0.8505844179125883, 0.8526935212075579, 0.8511428417769173, 0.8536633681115771, 0.8501443117851637, 0.8515981857947009, 0.8539313757125955, 0.8553995912659997, 0.8530269621024703, 0.8532680793087375, 0.8508434620486895, 0.8518384066544763, 0.8514117457244272, 0.8548671614499301, 0.8530027607471945, 0.8534374887956688, 0.8563165537270088, 0.8517523573912732, 0.8545955684629452, 0.8522435552687248, 0.8514189164963608, 0.8518330285755261, 0.8528243877953461, 0.8512073787243195, 0.8496262235129612, 0.8552732064106701, 0.8535459467211646, 0.8516080456061095, 0.8521368900362125, 0.8528853393567818, 0.8526899358215911, 0.8531210784840988, 0.8520230540317666, 0.8530153095980783, 0.8523403606898283, 0.8541456025241116, 0.8523045068301602, 0.8514207091893442, 0.8508533218600983, 0.8518025527948083, 0.8538453264493923, 

In [96]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [97]:
## 6
column_to_drop_5 = '소득 중 재산소득의 비중(월평균)'

In [98]:
if not column_to_drop_5.startswith('Cat_'):
    comp_6 = comp_5.drop(column_to_drop_5, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']
else:
    comp_6 = comp_5.drop(comp_5.filter(regex='^' + column_to_drop_5).columns, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']

print(X_6.shape)

(10564, 203)


In [99]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [100]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [101]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 112, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 846, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8513452237001209


In [102]:
optuna_6 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [103]:
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(decimal.Decimal(auc_6).quantize(decimal.Decimal('1.000')))

0.856


In [104]:
X_train = X_train.values
y_train = y_train.values

In [105]:
auc_bootstrap = []

In [106]:
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.8484923 , 0.85580089])

In [107]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9972283244132996, pvalue=0.0012343010166659951),
 0.8523021104478147)

In [108]:
t_6 = auc_bootstrap
print(t_6)

[0.849334014556667, 0.851415331110394, 0.8503576422501883, 0.8535486357606397, 0.8530421999928293, 0.8530547488437131, 0.8512235129611703, 0.8569744720519163, 0.852866516080456, 0.8550894553798716, 0.854279158151375, 0.8536122763615502, 0.8522713420099675, 0.8532134021727439, 0.8525259044136101, 0.8521539206195546, 0.854135742712703, 0.8517003692947547, 0.8531972679358932, 0.8528593453085225, 0.8518375103079845, 0.8522749273959341, 0.8524613674662076, 0.8522014269836148, 0.8546072209673371, 0.8513292818471909, 0.8508515291671148, 0.8527383385321429, 0.8538256068265749, 0.8546502455989387, 0.8476336452619123, 0.8482736366569861, 0.8530619196156466, 0.8534482449535692, 0.8512459216234628, 0.8531578286902585, 0.8534823061202538, 0.8523493241547453, 0.8527894302821699, 0.8534007385895092, 0.8519746513212147, 0.8515533684701158, 0.8506399913950737, 0.8526594600408735, 0.8544889032304328, 0.85302158402352, 0.8529982790147359, 0.8505602165573124, 0.8531336273349825, 0.8525940267469793, 0.8521

In [109]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [110]:
## 7 .
column_to_drop_6 = 'Cat_가구주 주민등록상 등재 여부'

In [111]:
if not column_to_drop_6.startswith('Cat_'):
    comp_7 = comp_6.drop(column_to_drop_6, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']
else:
    comp_7 = comp_6.drop(comp_6.filter(regex='^' + column_to_drop_6).columns, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']

print(X_7.shape)

(10564, 201)


In [112]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [113]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [114]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 141, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 932, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8504173384388016


In [115]:
optuna_7 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [116]:
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(decimal.Decimal(auc_7).quantize(decimal.Decimal('1.000')))

0.856


In [117]:
X_train = X_train.values
y_train = y_train.values

In [118]:
auc_bootstrap = []

In [119]:
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84877823, 0.85567499])

In [120]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9990171790122986, pvalue=0.3483233153820038),
 0.852209962443082)

In [121]:
t_7 = auc_bootstrap
print(t_7)

[0.8511096769567245, 0.8506785342942168, 0.8535952457782079, 0.8521835000537809, 0.8518554372378188, 0.8538524972213257, 0.8527356494926679, 0.8494460578681295, 0.8529391201462838, 0.8533586103043993, 0.8550930407658385, 0.8534625864974366, 0.8515775698253918, 0.851729052382489, 0.8494469542146212, 0.8529964863217526, 0.8517523573912733, 0.8510290057724714, 0.8520849019396939, 0.8519226632246961, 0.8545695744146857, 0.8542334444802984, 0.8527562654619769, 0.8537207342870461, 0.8457719335986518, 0.8507197662328349, 0.8507484493205694, 0.8535056111290381, 0.8525545875013445, 0.8531201821376071, 0.8490839338854829, 0.8497687426051416, 0.8495572048331003, 0.8554883295686782, 0.8504203865046073, 0.8544530493707647, 0.8561103940339179, 0.8541205048223441, 0.8502070560395827, 0.8506507475529741, 0.8547990391165609, 0.8550455344017783, 0.8514673192069127, 0.8526209171417303, 0.8492183858592377, 0.8529444982252341, 0.8543069448926176, 0.8534114947474097, 0.8499874511491161, 0.8500743967588111, 

In [122]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [123]:
## 8 .
column_to_drop_7 = 'Cat_현재 주택의 위치'

In [124]:
if not column_to_drop_7.startswith('Cat_'):
    comp_8 = comp_7.drop(column_to_drop_7, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']
else:
    comp_8 = comp_7.drop(comp_7.filter(regex='^' + column_to_drop_7).columns, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']

print(X_8.shape)

(10564, 197)


In [125]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [126]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [127]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 185, 'learning_rate': 0.09, 'max_depth': 2, 'num_leaves': 392, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.6, 'reg_alpha': 10, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.851290642214161


In [128]:
optuna_8 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [129]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(decimal.Decimal(auc_8).quantize(decimal.Decimal('1.000')))

0.855


In [130]:
X_train = X_train.values
y_train = y_train.values

In [131]:
auc_bootstrap = []

In [132]:
rs = RandomState(seed = 8)
bootstrap_auc(optuna_8, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84895605, 0.85464628])

In [133]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9968605637550354, pvalue=0.00041273710667155683),
 0.8519538090244164)

In [134]:
t_8 = auc_bootstrap
print(t_8)

[0.8500761894517944, 0.8528978882076657, 0.8529337420673335, 0.848313075902621, 0.8513552758954502, 0.8524290989925064, 0.8514942096016636, 0.8511903481409773, 0.852863827040981, 0.8538560826072927, 0.8521673658169302, 0.8504652038291922, 0.8532653902692624, 0.8555555555555554, 0.8533012441289304, 0.852654978308415, 0.8524353734179485, 0.8521996342906315, 0.852493635939909, 0.8501452081316553, 0.8524837761285002, 0.8510549998207307, 0.8496441504427952, 0.8542316517873149, 0.8530789501989889, 0.8497499193288156, 0.8522148721809902, 0.8544602201426984, 0.8512665375927719, 0.8523753182030047, 0.8527661252733856, 0.8522381771897745, 0.8519517944856763, 0.8533783299272166, 0.8545390986339679, 0.8526119536768133, 0.8530870173174143, 0.8511365673514755, 0.851298806066473, 0.8487173281703775, 0.8508829012943244, 0.85148703882973, 0.8511437381234089, 0.8522543114266252, 0.8522991287512101, 0.8532501523789036, 0.8526675271592988, 0.8500286830877344, 0.8531596213832419, 0.850152378903589, 0.85338

In [135]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [136]:
#9.
column_to_drop_8 = 'Cat_이사 예상 기간'

In [137]:
if not column_to_drop_8.startswith('Cat_'):
    comp_9 = comp_8.drop(column_to_drop_8, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']
else:
    comp_9 = comp_8.drop(comp_8.filter(regex='^' + column_to_drop_8).columns, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']

print(X_9.shape)

(10564, 193)


In [138]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [139]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [140]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 80, 'learning_rate': 0.09, 'max_depth': 6, 'num_leaves': 430, 'subsample': 0.8, 'colsample_bytree': 0.4, 'reg_alpha': 10, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8511870773433651


In [141]:
optuna_9 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [142]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(decimal.Decimal(auc_9).quantize(decimal.Decimal('1.000')))

0.855


In [143]:
X_train = X_train.values
y_train = y_train.values

In [144]:
auc_bootstrap = []

In [145]:
rs = RandomState(seed = 9)
bootstrap_auc(optuna_9, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84743132, 0.85487993])

In [146]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991679191589355, pvalue=0.5134629607200623),
 0.8512325288623571)

In [147]:
t_9 = auc_bootstrap
print(t_9)

[0.8522543114266251, 0.8491914954644868, 0.8532349144885446, 0.8540245957477323, 0.8487827614642716, 0.8536427521422681, 0.852404001290739, 0.8528530708830806, 0.849793840306909, 0.8492533433724141, 0.8498287978200854, 0.8493564232189594, 0.851161665053243, 0.8526280879136638, 0.8496450467892869, 0.852872790505898, 0.8495823025348679, 0.8520822129002187, 0.8463124305331469, 0.8493438743680757, 0.8488168226309561, 0.852155713312538, 0.8506507475529741, 0.8480101107884264, 0.8511966225664194, 0.8520391882686171, 0.8528799612778315, 0.850095909074612, 0.8520042307554409, 0.850525259044136, 0.8506695708293, 0.8488230970563981, 0.8496280162059446, 0.8496333942848947, 0.8498243160876269, 0.8486214190957657, 0.8507538273995195, 0.8492811301136567, 0.849800114732351, 0.8508846939873077, 0.8542477860241655, 0.8539421318704958, 0.8520176759528163, 0.8512826718296225, 0.8535154709404468, 0.8506068265748807, 0.850056469828977, 0.8532331217955613, 0.8544826288049908, 0.8535360869097557, 0.851203793

In [148]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [149]:
# 10.
column_to_drop_9 = 'Cat_소득 계층'

In [150]:
if not column_to_drop_9.startswith('Cat_'):
    comp_10 = comp_9.drop(column_to_drop_9, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']
else:
    comp_10 = comp_9.drop(comp_9.filter(regex='^' + column_to_drop_9).columns, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']

print(X_10.shape)

(10564, 191)


In [151]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [152]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [153]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 132, 'learning_rate': 0.09, 'max_depth': 2, 'num_leaves': 878, 'subsample': 0.4, 'colsample_bytree': 0.8, 'reg_alpha': 3, 'reg_lambda': 9, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8498939159836983


In [154]:
optuna_10 = LGBMClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [155]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(decimal.Decimal(auc_10).quantize(decimal.Decimal('1.000')))

0.854


In [156]:
X_train = X_train.values
y_train = y_train.values

In [157]:
auc_bootstrap = []

In [158]:
rs = RandomState(seed = 10)
bootstrap_auc(optuna_10, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84898406, 0.85453372])

In [159]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9957255721092224, pvalue=1.820829311327543e-05),
 0.8519408402352013)

In [160]:
t_10 = auc_bootstrap
print(t_10)

[0.8521682621634219, 0.8530502671112544, 0.851047829048797, 0.8525178372951849, 0.8508049191495464, 0.8505727654081962, 0.850415008425657, 0.8500582625219606, 0.8511643540927181, 0.8490238786705387, 0.8536503710874476, 0.8483444480298306, 0.8530323401814206, 0.8525066329640386, 0.8504526549783084, 0.8522641712380337, 0.8498037001183177, 0.8515569538560827, 0.85090172457065, 0.8518240651106092, 0.8515762253056541, 0.8518760532071277, 0.8520920727116276, 0.8501098024452333, 0.8530287547954537, 0.8539968090064897, 0.8515237890358897, 0.8515067584525474, 0.8512763974041806, 0.8546067727940914, 0.8539779857301639, 0.8488710515937041, 0.8527535764225019, 0.8507896812591875, 0.8517093327596716, 0.8526124018500592, 0.8505709727152128, 0.8528620343479976, 0.8517711806675989, 0.8517093327596716, 0.8526764906242157, 0.8507273851780145, 0.8554417195511096, 0.8518289950163135, 0.8521359936897207, 0.8529037144598616, 0.8515605392420494, 0.8500040335592125, 0.848984439424904, 0.8520822129002187, 0.85

In [161]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [162]:
# 11.
column_to_drop_10 = 'Cat_이사 계획 첫 번째 이유'

In [163]:
if not column_to_drop_10.startswith('Cat_'):
    comp_11 = comp_10.drop(column_to_drop_10, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']
else:
    comp_11 = comp_10.drop(comp_10.filter(regex='^' + column_to_drop_10).columns, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']

print(X_11.shape)

(10564, 179)


In [164]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [165]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [166]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 183, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 660, 'subsample': 0.2, 'colsample_bytree': 0.4, 'reg_alpha': 10, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.851010737157956


In [167]:
optuna_11 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [168]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(decimal.Decimal(auc_11).quantize(decimal.Decimal('1.000')))

0.854


In [169]:
X_train = X_train.values
y_train = y_train.values

In [170]:
auc_bootstrap = []

In [171]:
rs = RandomState(seed = 11)
bootstrap_auc(optuna_11, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.846093  , 0.85387502])

In [172]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.999437689781189, pvalue=0.8486235737800598),
 0.8500138879925424)

In [173]:
t_11 = auc_bootstrap
print(t_11)

[0.8475458033057259, 0.8510164569215874, 0.8536346850238428, 0.8467525366605715, 0.8507681689433868, 0.8491287512100677, 0.8511840737155355, 0.846189631063784, 0.8539134487827615, 0.8523000250977019, 0.8517774550930408, 0.8529588397691013, 0.8523995195582805, 0.8496118819690941, 0.8484161557491663, 0.851256677781363, 0.8500699150263525, 0.8496540102542041, 0.849569753683984, 0.8512997024129647, 0.8545525438313434, 0.8484654548062098, 0.8521028288695277, 0.8463877236384498, 0.8507843031802373, 0.8489225915169769, 0.8502025743071241, 0.8489674088415617, 0.8500699150263527, 0.852254311426625, 0.8508425657021979, 0.8504625147897171, 0.8509770176759528, 0.8531721702341257, 0.8474041805600372, 0.8539555770678713, 0.8526003011724212, 0.8524326843784734, 0.8536266179054176, 0.8482754293499697, 0.8484233265211, 0.8479052382488975, 0.8486913341221181, 0.8481884837402747, 0.8525536911548528, 0.8527562654619769, 0.8496602846796457, 0.8509564017066437, 0.8502321537413503, 0.8486572729554337, 0.8495

In [174]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [175]:
# 12
column_to_drop_11 = 'Cat_현재 주택의 구조'

In [176]:
if not column_to_drop_11.startswith('Cat_'):
    comp_12 = comp_11.drop(column_to_drop_11, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']
else:
    comp_12 = comp_11.drop(comp_11.filter(regex='^' + column_to_drop_11).columns, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']

print(X_12.shape)

(10564, 177)


In [177]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [178]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [179]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 155, 'learning_rate': 0.05, 'max_depth': 4, 'num_leaves': 724, 'subsample': 0.5, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 6, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8519596152984907


In [180]:
optuna_12 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [181]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(decimal.Decimal(auc_12).quantize(decimal.Decimal('1.000')))

0.856


In [182]:
X_train = X_train.values
y_train = y_train.values

In [183]:
auc_bootstrap = []

In [184]:
rs = RandomState(seed = 12)
bootstrap_auc(optuna_12, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.85008549, 0.85614304])

In [185]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9993724822998047, pvalue=0.7737897038459778),
 0.8531838213939981)

In [186]:
t_12 = auc_bootstrap
print(t_12)

[0.8519898892115736, 0.8530430963393208, 0.8559176795382022, 0.8509797067154279, 0.8517935893298914, 0.8531954752429098, 0.854147395217095, 0.8546394894410383, 0.8511786956365852, 0.8549603814850668, 0.8530986698218064, 0.8557966727618228, 0.8550222293929941, 0.8548689541429135, 0.8564017066437202, 0.8541922125416801, 0.8515695027069663, 0.8515524721236242, 0.853458104764978, 0.8508990355311749, 0.8537126671686206, 0.8518823276325697, 0.8524864651679753, 0.8546717579147395, 0.8546359040550714, 0.8537144598616041, 0.853956473414363, 0.8531372127209494, 0.8535253307518553, 0.8547748377612849, 0.8542316517873147, 0.8519934745975404, 0.855473988024811, 0.8534177691728514, 0.8514807644042881, 0.8562762181348822, 0.8548967408841563, 0.8529453945717257, 0.8545140009322003, 0.851540819619232, 0.8542442006381987, 0.8536768133089526, 0.8518563335843103, 0.8520499444265175, 0.8545373059409845, 0.8543804453049371, 0.8557204833100284, 0.8528028754795454, 0.8529462909182174, 0.8517066437201963, 0.85

In [187]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [188]:
# 13.
column_to_drop_12 = 'Cat_가구주 최종 학력'

In [189]:
if not column_to_drop_12.startswith('Cat_'):
    comp_13 = comp_12.drop(column_to_drop_12, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']
else:
    comp_13 = comp_12.drop(comp_12.filter(regex='^' + column_to_drop_12).columns, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']

print(X_13.shape)

(10564, 174)


In [190]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [191]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [192]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 151, 'learning_rate': 0.08, 'max_depth': 2, 'num_leaves': 672, 'subsample': 0.4, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8516657149894756


In [193]:
optuna_13 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [194]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(decimal.Decimal(auc_13).quantize(decimal.Decimal('1.000')))

0.854


In [195]:
X_train = X_train.values
y_train = y_train.values

In [196]:
auc_bootstrap = []

In [197]:
rs = RandomState(seed = 13)
bootstrap_auc(optuna_13, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84938075, 0.85429991])

In [198]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9979087114334106, pvalue=0.010426516644656658),
 0.8519407618048833)

In [199]:
t_13 = auc_bootstrap
print(t_13)

[0.8521126886809366, 0.8517451866193396, 0.8522471406546915, 0.8496979312322971, 0.8527948083611201, 0.851842440213689, 0.8517846258649744, 0.85310584059374, 0.852985730163852, 0.8536660571510525, 0.852183500053781, 0.8522865799003263, 0.8511087806102327, 0.8537216306335378, 0.8542307554408232, 0.8527356494926679, 0.8514511849700619, 0.8511903481409773, 0.8517039546807215, 0.8499959664407875, 0.8497732243376, 0.8526182281022552, 0.85072245527231, 0.8528190097163959, 0.8519253522641712, 0.851388440715643, 0.8523851780144133, 0.8532940733569969, 0.8528660679072102, 0.8513983005270519, 0.8532116094797605, 0.8509097916890753, 0.8502115377720411, 0.8523493241547452, 0.854514897278692, 0.851179591983077, 0.8526182281022552, 0.851493313255172, 0.8520607005844179, 0.8508631816715069, 0.8523672510845792, 0.8524255136065398, 0.8518886020580114, 0.8533657810763329, 0.8520571151984511, 0.852126133878312, 0.8510209386540463, 0.8524506113083073, 0.8520517371195011, 0.851594600408734, 0.8488159262844

In [200]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [201]:
#14.
column_to_drop_13 = '총 가구원 수'

In [202]:
if not column_to_drop_13.startswith('Cat_'):
    comp_14 = comp_13.drop(column_to_drop_13, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']
else:
    comp_14 = comp_13.drop(comp_13.filter(regex='^' + column_to_drop_13).columns, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']

print(X_14.shape)

(10564, 173)


In [203]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [204]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [205]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 180, 'learning_rate': 0.060000000000000005, 'max_depth': 4, 'num_leaves': 678, 'subsample': 0.2, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 4, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8521191611805276


In [206]:
optuna_14 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [207]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(decimal.Decimal(auc_14).quantize(decimal.Decimal('1.000')))

0.856


In [208]:
X_train = X_train.values
y_train = y_train.values

In [209]:
auc_bootstrap = []

In [210]:
rs = RandomState(seed = 14)
bootstrap_auc(optuna_14, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84949327, 0.85670494])

In [211]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9978638291358948, pvalue=0.009022600017488003),
 0.853203040855473)

In [212]:
t_14 = auc_bootstrap
print(t_14)

[0.854288121616292, 0.850394392456348, 0.852254311426625, 0.8548976372306478, 0.8533989458965257, 0.8527974974005952, 0.8525563801943279, 0.8496925531533469, 0.8552767917966368, 0.8543768599189703, 0.8558934781829265, 0.8537063927431788, 0.8545713671076693, 0.8548142770069198, 0.8503235810835035, 0.851457459395504, 0.8544010612742461, 0.8554148291563587, 0.8517747660535656, 0.8524362697644402, 0.8511034025312825, 0.8566213115341866, 0.852630776953139, 0.8532098167867772, 0.8562842852533075, 0.8545901903839948, 0.8538327775985086, 0.8531721702341256, 0.8533801226202001, 0.8555134272704457, 0.8557330321609122, 0.8536875694668531, 0.8532779391201462, 0.8517451866193395, 0.8533872933921336, 0.8544261589760138, 0.8563371696963178, 0.8543992685812628, 0.8506211681187479, 0.8557025563801943, 0.8564330787709297, 0.8523663547380875, 0.8516824423649205, 0.8555896167222401, 0.8515470940446739, 0.854715678892833, 0.8537852712344484, 0.8530843282779392, 0.855231974472052, 0.8508811086013409, 0.8539

In [213]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [214]:
## 15.
column_to_drop_14 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [215]:
if not column_to_drop_14.startswith('Cat_'):
    comp_15 = comp_14.drop(column_to_drop_14, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']
else:
    comp_15 = comp_14.drop(comp_14.filter(regex='^' + column_to_drop_14).columns, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']

print(X_15.shape)

(10564, 169)


In [216]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [217]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [218]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 112, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 846, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8516587173630704


In [219]:
optuna_15 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [220]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(decimal.Decimal(auc_15).quantize(decimal.Decimal('1.000')))

0.857


In [221]:
X_train = X_train.values
y_train = y_train.values

In [222]:
auc_bootstrap = []

In [223]:
rs = RandomState(seed = 15)
bootstrap_auc(optuna_15, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84860524, 0.85566507])

In [224]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991265535354614, pvalue=0.46422335505485535),
 0.8521931725287727)

In [225]:
t_15 = auc_bootstrap
print(t_15)

[0.8547757341077766, 0.8525743071241618, 0.8490167078986053, 0.851415331110394, 0.8506104119608476, 0.8562439496611811, 0.8508846939873078, 0.8532465669929369, 0.8528674124269477, 0.8558907891434513, 0.851343623391058, 0.8530027607471944, 0.8525841669355706, 0.8514251909218027, 0.8488266824423649, 0.851615216378043, 0.8525886486680292, 0.8533093112473559, 0.8489378294073358, 0.8540479007565166, 0.84968358968843, 0.8554811587967445, 0.8478398049550034, 0.8515461976981822, 0.8501048725395289, 0.8551172421211144, 0.853610483668567, 0.8515040694130722, 0.8506561256319242, 0.8511473235093757, 0.8509564017066438, 0.8519208705317127, 0.8492560324118891, 0.8511455308163923, 0.851824065110609, 0.855082284607938, 0.8564277006919795, 0.8521001398300527, 0.8558872037574845, 0.8551163457746227, 0.8555268724678211, 0.8528916137822237, 0.8542764691118999, 0.850370191101072, 0.8554237926212757, 0.8546574163708723, 0.8497239252805564, 0.8539394428310206, 0.8544180918575884, 0.8546260442436627, 0.851072

In [226]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [227]:
# 16.
column_to_drop_15 = 'Cat_가구주 성별'

In [228]:
if not column_to_drop_15.startswith('Cat_'):
    comp_16 = comp_15.drop(column_to_drop_15, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']
else:
    comp_16 = comp_15.drop(comp_15.filter(regex='^' + column_to_drop_15).columns, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']

print(X_16.shape)

(10564, 167)


In [229]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [230]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [231]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 139, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 1000, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8511912759192083


In [232]:
optuna_16 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [233]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(decimal.Decimal(auc_16).quantize(decimal.Decimal('1.000')))

0.856


In [234]:
X_train = X_train.values
y_train = y_train.values

In [235]:
auc_bootstrap = []

In [236]:
rs = RandomState(seed = 16)
bootstrap_auc(optuna_16, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84851088, 0.8553076 ])

In [237]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9985681772232056, pvalue=0.08899028599262238),
 0.8520636643540928)

In [238]:
t_16 = auc_bootstrap
print(t_16)

[0.8537691369975978, 0.8515444050051988, 0.8554237926212757, 0.8523000250977018, 0.8536060019361084, 0.8531282492560324, 0.8535495321071314, 0.8533299272166648, 0.8503558495572048, 0.8538354666379835, 0.8511634577462264, 0.849827005127102, 0.8527607471944354, 0.850011652504392, 0.8537646552651392, 0.8526504965759564, 0.8512181348822201, 0.8529202968699581, 0.8502518733641677, 0.8529588397691013, 0.8550231257394857, 0.8526433258040229, 0.8514467032376036, 0.8518787422466028, 0.8541411207916532, 0.8522847872073429, 0.8507986447241046, 0.853234018142053, 0.8524927395934172, 0.8532555304578537, 0.8514475995840953, 0.8517093327596714, 0.8534563120719946, 0.8509071026496002, 0.8510442436628303, 0.8479070309418809, 0.8520571151984511, 0.8520382919221254, 0.8532447742999535, 0.8521969452511564, 0.8513301781936826, 0.8522991287512102, 0.8507690652898785, 0.8538784912695851, 0.8564483166612886, 0.8489683051880537, 0.8522175612204652, 0.8499569753683983, 0.8508945537987165, 0.8509832921013948, 0.

In [239]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [240]:
# 17.
column_to_drop_16 = 'Cat_현재 상업시설 접근용이성'

In [241]:
if not column_to_drop_16.startswith('Cat_'):
    comp_17 = comp_16.drop(column_to_drop_16, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']
else:
    comp_17 = comp_16.drop(comp_16.filter(regex='^' + column_to_drop_16).columns, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']

print(X_17.shape)

(10564, 163)


In [242]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [243]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [244]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 121, 'learning_rate': 0.09, 'max_depth': 4, 'num_leaves': 394, 'subsample': 0.5, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 5, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.852079974472659


In [245]:
optuna_17 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [246]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(decimal.Decimal(auc_17).quantize(decimal.Decimal('1.000')))

0.855


In [247]:
X_train = X_train.values
y_train = y_train.values

In [248]:
auc_bootstrap = []

In [249]:
rs = RandomState(seed = 17)
bootstrap_auc(optuna_17, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84861398, 0.85576203])

In [250]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988976120948792, pvalue=0.24764730036258698),
 0.8522601856333584)

In [251]:
t_17 = auc_bootstrap
print(t_17)

[0.8537736187300562, 0.8516681008210534, 0.8494756373023556, 0.8529462909182173, 0.8511374636979671, 0.8482198558674842, 0.8511025061847909, 0.8516600337026281, 0.8491879100785199, 0.8526101609838299, 0.8502169158509914, 0.8528521745365889, 0.8525913377075043, 0.8512226166146786, 0.8546242515506794, 0.853296762396472, 0.8520114015273743, 0.8544691836076155, 0.8517066437201964, 0.8530045534401779, 0.8508434620486895, 0.8519970599835072, 0.8522552077731168, 0.8547246423577499, 0.8524317880319817, 0.8504087340002151, 0.8524775017030584, 0.8558872037574846, 0.8513427270445664, 0.8547407765946005, 0.8523529095407121, 0.8523125739485855, 0.853034132874404, 0.8506328206231402, 0.8538614606862428, 0.8524497149618155, 0.8524290989925067, 0.8503773618730057, 0.8498099745437595, 0.8534410741816356, 0.852078627514252, 0.8498359685920189, 0.8523242264529778, 0.853081639238464, 0.8532098167867771, 0.8500869456096949, 0.8557617152486464, 0.8526944175540496, 0.8560333082356315, 0.8547004410024741, 0.8

In [252]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [253]:
## 18.
column_to_drop_17 ='Cat_현재 대기오염 정도'

In [254]:
if not column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(column_to_drop_17, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']
else:
    comp_18 = comp_17.drop(comp_17.filter(regex='^' + column_to_drop_17).columns, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']

print(X_18.shape)

(10564, 159)


In [255]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [256]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [257]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 103, 'learning_rate': 0.08, 'max_depth': 10, 'num_leaves': 374, 'subsample': 0.5, 'colsample_bytree': 0.2, 'reg_alpha': 2, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.852935084419365


In [258]:
optuna_18 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [259]:
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(decimal.Decimal(auc_18).quantize(decimal.Decimal('1.000')))

0.856


In [260]:
X_train = X_train.values
y_train = y_train.values

In [261]:
auc_bootstrap = []

In [262]:
rs = RandomState(seed = 18)
bootstrap_auc(optuna_18, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84371524, 0.85453143])

In [263]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9947937726974487, pvalue=1.8216071566712344e-06),
 0.8493069968807142)

In [264]:
t_18 = auc_bootstrap
print(t_18)

[0.8513669283998423, 0.8461994908751929, 0.849477429995339, 0.8432451328385502, 0.8463796565200246, 0.8458077874583199, 0.8497427485568821, 0.8500878419561866, 0.8479778423147252, 0.8469855867484134, 0.8503101358861281, 0.8493438743680758, 0.8493582159119428, 0.8483229357140296, 0.8570775518984619, 0.8491260621705927, 0.8499148470832885, 0.8525070811372844, 0.8530090351726364, 0.8498852676490625, 0.8467480549281129, 0.8473109605249005, 0.8480396902226525, 0.8520114015273744, 0.8486805779642178, 0.8480791294682872, 0.8508407730092145, 0.8565576709332761, 0.8461206123839231, 0.8516421067727942, 0.8478720734287045, 0.848055824459503, 0.8484995159728945, 0.8529758703524435, 0.8516403140798107, 0.851200207952386, 0.8541940052346635, 0.8488006883941057, 0.8458696353662472, 0.8440697716109138, 0.8503576422501883, 0.8450584417912588, 0.8538865583880106, 0.8463402172743895, 0.8490973790828583, 0.8527015883259832, 0.8469120863360942, 0.8502455989387258, 0.8513364526191244, 0.8527759850847944, 0.

In [265]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [266]:
# 19
column_to_drop_18 = 'Cat_현재 공공기관 접근용이성'

In [267]:
if not column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(column_to_drop_18, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']
else:
    comp_19 = comp_18.drop(comp_18.filter(regex='^' + column_to_drop_18).columns, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']

print(X_19.shape)

(10564, 155)


In [268]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [269]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [270]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 139, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 1000, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8504327332168928


In [271]:
optuna_19 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [272]:
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(decimal.Decimal(auc_19).quantize(decimal.Decimal('1.000')))

0.856


In [273]:
X_train = X_train.values
y_train = y_train.values

In [274]:
auc_bootstrap = []

In [275]:
rs = RandomState(seed = 19)
bootstrap_auc(optuna_19, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84842736, 0.85550339])

In [276]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9984383583068848, pvalue=0.05846213549375534),
 0.8521406972679358)

In [277]:
t_19 = auc_bootstrap
print(t_19)

[0.8516071492596178, 0.8521342009967372, 0.8526182281022552, 0.8538749058836184, 0.8511544942813094, 0.8502715929869851, 0.853045785378796, 0.8498763041841454, 0.8542316517873149, 0.8499578717148901, 0.8526101609838299, 0.8506355096626153, 0.8530959807823313, 0.8534527266860277, 0.8507206625793268, 0.852827973181313, 0.8544673909146319, 0.8503065505001615, 0.8512127568032697, 0.8539528880283962, 0.8549603814850668, 0.8483023197447206, 0.8539008999318776, 0.8506516438994659, 0.8538480154888672, 0.85144491054462, 0.8534733426553367, 0.8522229392994156, 0.8504311426625075, 0.8530377182603708, 0.8518536445448353, 0.8518186870316589, 0.8515013803735971, 0.8480289340647521, 0.8536544046466602, 0.8516734789000037, 0.8530556451902046, 0.8522381771897745, 0.8540954071205765, 0.8522749273959341, 0.8501317629342799, 0.8497418522103906, 0.8518294431895594, 0.8548483381736044, 0.8521467498476212, 0.8552543831343444, 0.8518016564483167, 0.853669642537019, 0.8529104370585494, 0.8494899788462229, 0.85

In [278]:
# 20.
column_to_drop_19 = 'Cat_현재 청소/쓰레기 처리상태'

In [279]:
if not column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(column_to_drop_19, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']
else:
    comp_20 = comp_19.drop(comp_19.filter(regex='^' + column_to_drop_19).columns, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']

print(X_20.shape)

(10564, 151)


In [280]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [281]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [282]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 125, 'learning_rate': 0.08, 'max_depth': 3, 'num_leaves': 850, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 6, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8529014958126204


In [283]:
optuna_20 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [284]:
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(decimal.Decimal(auc_20).quantize(decimal.Decimal('1.000')))

0.856


In [285]:
X_train = X_train.values
y_train = y_train.values

In [286]:
auc_bootstrap = []

In [287]:
rs = RandomState(seed = 20)
bootstrap_auc(optuna_20, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.85002059, 0.85593751])

In [288]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988405704498291, pvalue=0.20889036357402802),
 0.8530461322648882)

In [289]:
t_20 = auc_bootstrap
print(t_20)

[0.8530350292208956, 0.8508058154960382, 0.8525348678785272, 0.8511482198558674, 0.8528145279839374, 0.8546287332831379, 0.8543427987522856, 0.8545364095944927, 0.8511724212111433, 0.8520544261589761, 0.8529704922734932, 0.8535567028790649, 0.8526953139005413, 0.8535979348176832, 0.8530475780717794, 0.8536992219712453, 0.8558504535513248, 0.8513929224481016, 0.8526343623391057, 0.8515479903911656, 0.8528441074181636, 0.8543132193180596, 0.8548662651034382, 0.853191889856943, 0.8514538740095371, 0.8513023914524399, 0.8531820300455344, 0.8547784231472517, 0.8490149152056219, 0.8527213079488007, 0.8549182531999571, 0.8545848123050447, 0.8528252841418378, 0.853018894984045, 0.8531264565630489, 0.8554937076476281, 0.854386719730379, 0.8525994048259296, 0.8515524721236242, 0.8527670216198774, 0.8521628840844717, 0.8526585636943818, 0.8539627478398049, 0.8511249148470833, 0.8532662866157541, 0.8536373740633179, 0.8490857265784663, 0.8539394428310207, 0.8530708830805637, 0.8531856154315012, 0.

In [290]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [291]:
# 21
column_to_drop_20 = '자산 중 기타자산의 비중'

In [292]:
if not column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(column_to_drop_20, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']
else:
    comp_21 = comp_20.drop(comp_20.filter(regex='^' + column_to_drop_20).columns, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']

print(X_21.shape)

(10564, 150)


In [293]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [294]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [295]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 141, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 932, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.851175881141117


In [296]:
optuna_21 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [297]:
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(decimal.Decimal(auc_21).quantize(decimal.Decimal('1.000')))

0.855


In [298]:
X_train = X_train.values
y_train = y_train.values

In [299]:
auc_bootstrap = []

In [300]:
rs = RandomState(seed = 21)
bootstrap_auc(optuna_21, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.8483552 , 0.85527251])

In [301]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9990760087966919, pvalue=0.4079742431640625),
 0.8518362070201857)

In [302]:
t_21 = auc_bootstrap
print(t_21)

[0.8486106629378651, 0.8488069628195476, 0.8490364275214227, 0.8509357857373345, 0.852576996163637, 0.8552104621562511, 0.8512530923953963, 0.8559114051127603, 0.8515148255709728, 0.8549935463052597, 0.850973432289986, 0.855440823204618, 0.8508353949302642, 0.8526253988741888, 0.8508694560969489, 0.8543320425943852, 0.8487182245168693, 0.8522238356459073, 0.8516734789000037, 0.8534052203219678, 0.8479509519199742, 0.8522408662292495, 0.8526433258040228, 0.8510756157900398, 0.8493913807321358, 0.8507269370047686, 0.8536544046466601, 0.8500017926929835, 0.852845900111147, 0.8503666057151054, 0.8550123695815856, 0.852126133878312, 0.8502527697106594, 0.8529462909182175, 0.8506919794915923, 0.8532483596859202, 0.8495939550392599, 0.8463393209278979, 0.8517102291061632, 0.8536714352300024, 0.8538722168441433, 0.8534984403571045, 0.8520140905668494, 0.8542092431250224, 0.848838334946757, 0.8528799612778315, 0.8509429565092682, 0.8537072890896705, 0.8509949446057867, 0.8508371876232478, 0.855

In [303]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [304]:
# 22
column_to_drop_21 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [305]:
if not column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(column_to_drop_21, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']
else:
    comp_22 = comp_21.drop(comp_21.filter(regex='^' + column_to_drop_21).columns, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']

print(X_22.shape)

(10564, 149)


In [306]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [307]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [308]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 144, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 968, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8520071991580456


In [309]:
optuna_22 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [310]:
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(decimal.Decimal(auc_22).quantize(decimal.Decimal('1.000')))

0.855


In [311]:
X_train = X_train.values
y_train = y_train.values

In [312]:
auc_bootstrap = []

In [313]:
rs = RandomState(seed = 22)
bootstrap_auc(optuna_22, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84841519, 0.85526276])

In [314]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9987152814865112, pvalue=0.1420203298330307),
 0.8518491565379513)

In [315]:
t_22 = auc_bootstrap
print(t_22)

[0.853519952672905, 0.8533980495500341, 0.8492614104908395, 0.8487513893370621, 0.8515040694130724, 0.8518886020580115, 0.8528530708830806, 0.8537978200853321, 0.8516949912158043, 0.8539735039977054, 0.8518626080097524, 0.8537261123659963, 0.8524371661109319, 0.8513319708866659, 0.8553709081782654, 0.852266860277509, 0.8528441074181635, 0.8524120684091643, 0.8531748592736007, 0.8499058836183716, 0.8538184360546412, 0.8496136746620774, 0.8537207342870461, 0.8523251227994694, 0.8514655265139291, 0.850011652504392, 0.854175181958338, 0.8541769746513211, 0.8535916603922412, 0.8535952457782081, 0.8531623104227171, 0.8495410705962498, 0.852989315549819, 0.8485299917536122, 0.8521324083037538, 0.8487729016528629, 0.8511509088953426, 0.8459646480943673, 0.8538498081818507, 0.8515632282815244, 0.8513965078340683, 0.8516188017640098, 0.8551539923272741, 0.8531228711770822, 0.8539761930371804, 0.852992900935786, 0.8480594098454699, 0.8464065469147753, 0.8538238141335914, 0.8506077229213724, 0.852

In [316]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [317]:
# 23.
column_to_drop_22 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [318]:
if not column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(column_to_drop_22, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']
else:
    comp_23 = comp_22.drop(comp_22.filter(regex='^' + column_to_drop_22).columns, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']

print(X_23.shape)

(10564, 145)


In [319]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [320]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [321]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 151, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 652, 'subsample': 0.2, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8507070401719737


In [322]:
optuna_23 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [323]:
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(decimal.Decimal(auc_23).quantize(decimal.Decimal('1.000')))

0.855


In [324]:
X_train = X_train.values
y_train = y_train.values

In [325]:
auc_bootstrap = []

In [326]:
rs = RandomState(seed = 23)
bootstrap_auc(optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84693427, 0.85481251])

In [327]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989327788352966, pvalue=0.2744574546813965),
 0.8508832127747301)

In [328]:
t_23 = auc_bootstrap
print(t_23)

[0.8518043454877918, 0.8523045068301602, 0.8514628374744541, 0.8517433939263561, 0.846981105015955, 0.8472491126169732, 0.8477403104944248, 0.8502948979957693, 0.8503361299343876, 0.8516259725359434, 0.8507538273995197, 0.848940518446811, 0.8523152629880607, 0.8475511813846761, 0.8513310745401743, 0.8493178803198165, 0.8506803269872002, 0.8520481517335341, 0.8498458284034276, 0.8523089885626188, 0.8504562403642751, 0.8505162955792192, 0.8488670180344914, 0.8531730665806174, 0.8536373740633179, 0.8523511168477287, 0.8496970348858053, 0.8508882793732746, 0.8497777060700584, 0.8472768993582158, 0.8473154422573592, 0.8492300383636299, 0.8540667240328421, 0.8527419239181098, 0.8527338567996845, 0.851729052382489, 0.8514906242156968, 0.8467229572263453, 0.852845900111147, 0.8534312143702271, 0.848721809902836, 0.8474104549854791, 0.8500340611666846, 0.8494496432540963, 0.8517281560359972, 0.8553251945071888, 0.8472222222222223, 0.8541196084758524, 0.8514915205621886, 0.8496307052454196, 0.85

In [329]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [330]:
# 24
column_to_drop_23 = 'Cat_가구주 장애 여부'

In [331]:
if not column_to_drop_23.startswith('Cat_'):
    comp_24 = comp_23.drop(column_to_drop_23, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']
else:
    comp_24 = comp_23.drop(comp_23.filter(regex='^' + column_to_drop_23).columns, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']

print(X_24.shape)

(10564, 143)


In [332]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [333]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [334]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 87, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 436, 'subsample': 0.5, 'colsample_bytree': 0.4, 'reg_alpha': 10, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.851126897756281


In [335]:
optuna_24 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [336]:
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(decimal.Decimal(auc_24).quantize(decimal.Decimal('1.000')))

0.854


In [337]:
X_train = X_train.values
y_train = y_train.values

In [338]:
auc_bootstrap = []

In [339]:
rs = RandomState(seed = 24)
bootstrap_auc(optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84822998, 0.85461825])

In [340]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9979223012924194, pvalue=0.010892732068896294),
 0.8515439404736295)

In [341]:
t_24 = auc_bootstrap
print(t_24)

[0.852108206948478, 0.8532465669929368, 0.8514511849700622, 0.848934244021369, 0.850394392456348, 0.8506005521494389, 0.8482306120253845, 0.8516430031192856, 0.8538148506686746, 0.8516824423649206, 0.8492363127890716, 0.849307124161916, 0.8515686063604746, 0.8533021404754222, 0.8522543114266251, 0.8514493922770786, 0.8525832705890788, 0.8489324513283854, 0.8534123910939012, 0.851406367645477, 0.8511652504392099, 0.8517962783693664, 0.8500537807895019, 0.8509313040048762, 0.851125811193575, 0.8514915205621885, 0.8514816607507798, 0.853359506650891, 0.8509725359434943, 0.8535423613351977, 0.8529606324620846, 0.8508443583951812, 0.8508685597504572, 0.8544324334014557, 0.8549818938008675, 0.8524120684091643, 0.851365135706859, 0.8511042988777743, 0.849011329819655, 0.8525832705890789, 0.8505664909827542, 0.851937901115055, 0.8528674124269477, 0.8543141156645513, 0.849533899824316, 0.8550544978666954, 0.854544476712918, 0.8540120468968485, 0.8518742605141444, 0.8486949195080851, 0.850475063

In [342]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [343]:
column_to_drop_24 = 'Cat_현재 주차시설 이용편의성'

In [344]:
if not column_to_drop_24.startswith('Cat_'):
    comp_25 = comp_24.drop(column_to_drop_24, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']
else:
    comp_25 = comp_24.drop(comp_24.filter(regex='^' + column_to_drop_24).columns, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']

print(X_25.shape)

(10564, 139)


In [345]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [346]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [347]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 152, 'learning_rate': 0.060000000000000005, 'max_depth': 5, 'num_leaves': 864, 'subsample': 0.4, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8523584800035827


In [348]:
optuna_25 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [349]:
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(decimal.Decimal(auc_25).quantize(decimal.Decimal('1.000')))

0.856


In [350]:
X_train = X_train.values
y_train = y_train.values

In [351]:
auc_bootstrap = []

In [352]:
rs = RandomState(seed = 25)
bootstrap_auc(optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84903004, 0.85590184])

In [353]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.999056339263916, pvalue=0.38725990056991577),
 0.8525219506292353)

In [354]:
t_25 = auc_bootstrap
print(t_25)

[0.854714782546341, 0.8510872682944319, 0.8524084830231974, 0.8528889247427485, 0.8513642393603671, 0.8547470510200423, 0.8497687426051415, 0.8516967839087877, 0.8523224337599942, 0.8519119070667958, 0.8530162059445701, 0.8512405435445126, 0.8511535979348177, 0.853407013014951, 0.8520615969309095, 0.8535764225018824, 0.8503477824387796, 0.8531900971639595, 0.8509788103689362, 0.8542092431250223, 0.8562609802445234, 0.8522605858520669, 0.8550249184324692, 0.8525303861460687, 0.8512835681761142, 0.8544485676383062, 0.8528028754795453, 0.8546081173138287, 0.8505127101932524, 0.851269226632247, 0.8533164820192894, 0.8514897278692051, 0.8552776881431287, 0.853308414900864, 0.8508703524434407, 0.8509088953425836, 0.8532689756552293, 0.8536284105984009, 0.8542666093004911, 0.8492900935785737, 0.8549693449499839, 0.8522345918038077, 0.8554040729984583, 0.8528799612778316, 0.8515444050051989, 0.8518195833781507, 0.8531865117779929, 0.8554282743537341, 0.8541464988706035, 0.8543750672259868, 0.8

In [355]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [356]:
# 26
column_to_drop_25 = 'Cat_현재 문화시설 접근용이성'

In [357]:
if not column_to_drop_25.startswith('Cat_'):
    comp_26 = comp_25.drop(column_to_drop_25, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']
else:
    comp_26 = comp_25.drop(comp_25.filter(regex='^' + column_to_drop_25).columns, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']

print(X_26.shape)

(10564, 135)


In [358]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [359]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [360]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 141, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 932, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8509169689641274


In [361]:
optuna_26 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [362]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(decimal.Decimal(auc_26).quantize(decimal.Decimal('1.000')))

0.855


In [363]:
X_train = X_train.values
y_train = y_train.values

In [364]:
auc_bootstrap = []

In [365]:
rs = RandomState(seed = 26)
bootstrap_auc(optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84811852, 0.85498402])

In [366]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9993158578872681, pvalue=0.7025040984153748),
 0.8515667217919759)

In [367]:
t_26 = auc_bootstrap
print(t_26)

[0.8494093076619698, 0.8534993367035961, 0.8506632964038578, 0.8516456921587608, 0.8531694811946506, 0.8505503567459038, 0.8533550249184325, 0.8504678928686673, 0.8546627944498225, 0.8536768133089527, 0.8514269836147861, 0.853075364813022, 0.8504634111362088, 0.8517693879746155, 0.8487065720124771, 0.8542872252698003, 0.8521386827291957, 0.8499820730701659, 0.8563326879638593, 0.848907353626618, 0.851376788211251, 0.8527912229751533, 0.853386397045642, 0.8509483345882185, 0.8497552974077659, 0.849728407013015, 0.8541025778925101, 0.850361227636155, 0.8532017496683518, 0.8496342906313864, 0.8537288014054714, 0.8532447742999534, 0.8491502635258685, 0.8526164354092719, 0.8510648596321393, 0.8522014269836147, 0.851451184970062, 0.8499067799648632, 0.8517738697070739, 0.8519477609264638, 0.8534052203219676, 0.8514664228604208, 0.8544781470725323, 0.8491914954644867, 0.8490507690652898, 0.851051414434764, 0.8528476928041303, 0.8530448890323045, 0.8517962783693664, 0.8514054712989854, 0.85151

In [368]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [369]:
# 27
column_to_drop_26 = 'Cat_현재 대중교통 접근용이성'

In [370]:
if not column_to_drop_26.startswith('Cat_'):
    comp_27 = comp_26.drop(column_to_drop_26, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']
else:
    comp_27 = comp_26.drop(comp_26.filter(regex='^' + column_to_drop_26).columns, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']

print(X_27.shape)

(10564, 131)


In [371]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [372]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [373]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 143, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 774, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8519610148237717


In [374]:
optuna_27 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [375]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(decimal.Decimal(auc_27).quantize(decimal.Decimal('1.000')))

0.855


In [376]:
X_train = X_train.values
y_train = y_train.values

In [377]:
auc_bootstrap = []

In [378]:
rs = RandomState(seed = 27)
bootstrap_auc(optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84701723, 0.85519292])

In [379]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989576935768127, pvalue=0.29481640458106995),
 0.8511845913556344)

In [380]:
t_27 = auc_bootstrap
print(t_27)

[0.8489916101968378, 0.8482530206876769, 0.8481642823849987, 0.8544145064716216, 0.8517245706500306, 0.8508703524434406, 0.8507161808468682, 0.853272561041196, 0.8494424724821626, 0.8502975870352443, 0.8543436950987773, 0.85229912875121, 0.8478012620558604, 0.8473226130292926, 0.8506731562152665, 0.8555806532573231, 0.8485497113764298, 0.8495948513857516, 0.8478102255207772, 0.8522014269836148, 0.8551772973360582, 0.8494935642321896, 0.8533720555017748, 0.8468358968842995, 0.8534123910939012, 0.8515148255709727, 0.8508353949302644, 0.8512109641102865, 0.8497257179735398, 0.8511347746584921, 0.8543454877917609, 0.849823419741135, 0.855757233516188, 0.8499677315262988, 0.8522435552687246, 0.8498825786095874, 0.8480038363629845, 0.8532295364095945, 0.8516949912158044, 0.8544001649277544, 0.8526675271592987, 0.8511096769567245, 0.8454868954142913, 0.8520571151984512, 0.8522596895055753, 0.8481329102577893, 0.850468789215159, 0.8534930622781541, 0.8506435767810404, 0.8532806281596215, 0.853

In [381]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [382]:
# 28
column_to_drop_27  = '자산 중 금융자산의 비중'

In [383]:
if not column_to_drop_27.startswith('Cat_'):
    comp_28 = comp_27.drop(column_to_drop_27, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']
else:
    comp_28 = comp_27.drop(comp_27.filter(regex='^' + column_to_drop_27).columns, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']

print(X_28.shape)

(10564, 130)


In [384]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [385]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [386]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 169, 'learning_rate': 0.05, 'max_depth': 4, 'num_leaves': 394, 'subsample': 0.2, 'colsample_bytree': 0.4, 'reg_alpha': 9, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8512962403152851


In [387]:
optuna_28 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [388]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(decimal.Decimal(auc_28).quantize(decimal.Decimal('1.000')))

0.856


In [389]:
X_train = X_train.values
y_train = y_train.values

In [390]:
auc_bootstrap = []

In [391]:
rs = RandomState(seed = 28)
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84893198, 0.85536401])

In [392]:
np.mean(auc_bootstrap)

0.8522153669642537

In [393]:
t_28 = auc_bootstrap
print(t_28)

[0.8503424043598293, 0.8511697321716682, 0.8516725825535119, 0.8514063676454771, 0.8527553691154852, 0.8518670897422107, 0.8522041160230899, 0.8530690903875802, 0.8493268437847333, 0.8505539421318705, 0.853998601699473, 0.8510182496145711, 0.8493644903373847, 0.8510325911584382, 0.8507663762504034, 0.8495052167365817, 0.8560386863145817, 0.8512916352945393, 0.8546143917392708, 0.8511374636979672, 0.8548815029937973, 0.8495912659997849, 0.8490211896310638, 0.8517837295184827, 0.8562233336918719, 0.8518706751281775, 0.8489324513283856, 0.8511060915707576, 0.8517039546807215, 0.8524371661109319, 0.851762217202682, 0.852753576422502, 0.8502671112545264, 0.8533783299272166, 0.8523851780144132, 0.8508918647592414, 0.8507547237460112, 0.8522175612204653, 0.851932523036105, 0.8556039582661075, 0.8523627693521207, 0.8497956329998926, 0.8550051988096519, 0.852992004589294, 0.8531892008174681, 0.8538282958660499, 0.8513866480226596, 0.8504723746011259, 0.8516743752464953, 0.8512441289304795, 0.85

In [394]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [395]:
# 29
column_to_drop_28 = 'Cat_현재 교육환경'

In [396]:
if not column_to_drop_28.startswith('Cat_'):
    comp_29 = comp_28.drop(column_to_drop_28, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']
else:
    comp_29 = comp_28.drop(comp_28.filter(regex='^' + column_to_drop_28).columns, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']

print(X_29.shape)

(10564, 126)


In [397]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [398]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [399]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 104, 'learning_rate': 0.09999999999999999, 'max_depth': 5, 'num_leaves': 564, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 10, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8512640512338215


In [400]:
optuna_29 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [401]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(decimal.Decimal(auc_29).quantize(decimal.Decimal('1.000')))

0.854


In [402]:
X_train = X_train.values
y_train = y_train.values

In [403]:
auc_bootstrap = []

In [404]:
rs = RandomState(seed = 29)
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84654573, 0.8541357 ])

In [405]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9984933137893677, pvalue=0.06989236921072006),
 0.8505106996880715)

In [406]:
t_29 = auc_bootstrap
print(t_29)

[0.8475995840952278, 0.8520571151984511, 0.8494702592234054, 0.8494335090172457, 0.8540989925065434, 0.8505754544476714, 0.8528790649313399, 0.8502940016492776, 0.8519011509088954, 0.8466494568140261, 0.8498377612850023, 0.8494173747803953, 0.8508094008820051, 0.849080348499516, 0.8510558961672224, 0.849522247319924, 0.8487782797318132, 0.8524111720626726, 0.8495760281094261, 0.8522820981678677, 0.8508183643469218, 0.848261984152594, 0.8508775232153741, 0.8526513929224482, 0.8470026173317557, 0.8487343587537198, 0.8523287081854363, 0.8553332616256141, 0.8520087124878992, 0.8521001398300527, 0.8516053565666344, 0.8493286364777167, 0.8493824172672189, 0.8509806030619196, 0.8506776379477251, 0.8511831773690437, 0.8501443117851637, 0.8535540138395897, 0.8500161342368506, 0.8505691800222295, 0.8507780287547955, 0.8536624717650856, 0.8492210748987128, 0.8514565630490123, 0.8510209386540462, 0.8517801441325159, 0.8504221791975906, 0.85058531425908, 0.8489799576924455, 0.8497024129647557, 0.85

In [407]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [408]:
column_to_drop_29 = '현재 무주택 기간(총 개월)'

In [409]:
if not column_to_drop_29.startswith('Cat_'):
    comp_30 = comp_29.drop(column_to_drop_29, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']
else:
    comp_30 = comp_29.drop(comp_29.filter(regex='^' + column_to_drop_29).columns, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']

print(X_30.shape)

(10564, 125)


In [410]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [411]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [412]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 151, 'learning_rate': 0.060000000000000005, 'max_depth': 4, 'num_leaves': 694, 'subsample': 0.2, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 5, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8520939697254691


In [413]:
optuna_30 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [414]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(decimal.Decimal(auc_30).quantize(decimal.Decimal('1.000')))

0.855


In [415]:
X_train = X_train.values
y_train = y_train.values

In [416]:
auc_bootstrap = []

In [417]:
rs = RandomState(seed = 30)
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84930405, 0.85567215])

In [418]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992355704307556, pvalue=0.5984939336776733),
 0.852516743752465)

In [419]:
t_30 = auc_bootstrap
print(t_30)

[0.8530664013481053, 0.8526791796636908, 0.8544082320461798, 0.8506632964038578, 0.8551853644544836, 0.8507457602810944, 0.8517487720053064, 0.8546905811910653, 0.8542289627478397, 0.850074396758811, 0.8524371661109319, 0.8504643074827005, 0.8540317665196658, 0.8525832705890789, 0.8512566777813632, 0.8555994765336488, 0.8548797103008139, 0.8522211466064322, 0.8505584238643291, 0.8532528414183788, 0.8514744899788462, 0.853962747839805, 0.8503397153203541, 0.8541850417697465, 0.8500941163816285, 0.8523815926284465, 0.8517980710623497, 0.8532644939227708, 0.8507896812591875, 0.8540819619232011, 0.8561301136567352, 0.8543042558531424, 0.8538596679932595, 0.853484995159729, 0.8495939550392599, 0.8554632318669104, 0.852845900111147, 0.8498843713025708, 0.8518590226237854, 0.8522552077731167, 0.8507421748951274, 0.8552023950378258, 0.8532393962210032, 0.8523278118389445, 0.852359183966154, 0.8515390269262485, 0.8544781470725324, 0.8540003943924563, 0.850605930228389, 0.8530628159621384, 0.851

In [420]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [421]:
# 31
column_to_drop_30 = '소득 중 정부 보조금의 비중(월평균)'

In [422]:
if not column_to_drop_30.startswith('Cat_'):
    comp_31 = comp_30.drop(column_to_drop_30, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']
else:
    comp_31 = comp_30.drop(comp_30.filter(regex='^' + column_to_drop_30).columns, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']

print(X_31.shape)

(10564, 124)


In [423]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [424]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [425]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 139, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 1000, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8514263961664204


In [426]:
optuna_31= LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [427]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(decimal.Decimal(auc_31).quantize(decimal.Decimal('1.000')))

0.856


In [428]:
X_train = X_train.values
y_train = y_train.values

In [429]:
auc_bootstrap = []

In [430]:
rs = RandomState(seed = 31)
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84809867, 0.85506378])

In [431]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9986035823822021, pvalue=0.09969918429851532),
 0.8516266400899932)

In [432]:
t_31 = auc_bootstrap
print(t_31)

[0.8506758452547416, 0.8513920261016099, 0.853657990032627, 0.85053780789502, 0.8519773403606897, 0.85301620594457, 0.8500896346491699, 0.8492721666487397, 0.8516098382990929, 0.8537763077695315, 0.8515847405973251, 0.8515533684701158, 0.8557043490731776, 0.8492667885697895, 0.8534123910939013, 0.8530565415366964, 0.8541832490767631, 0.8539869491950809, 0.8514440141981284, 0.8499515972894482, 0.8526899358215912, 0.8533120002868309, 0.8504876124914846, 0.8515013803735971, 0.8517640098956651, 0.8519952672905238, 0.8535880750062743, 0.8515049657595639, 0.8550213330465025, 0.8515990821411925, 0.8517182962245886, 0.8493949661181026, 0.8524452332293572, 0.8516878204438708, 0.8510603778996808, 0.8516430031192859, 0.8523448424222868, 0.851764906242157, 0.8523529095407121, 0.8540452117170413, 0.8493770391882687, 0.8513059768384066, 0.8488490911046574, 0.8510944390663656, 0.8510460363558136, 0.8524622638126995, 0.854557025563802, 0.8525357642250188, 0.8527320641067011, 0.8532178839052025, 0.8507

In [433]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [434]:
# 32
column_to_drop_31 = 'Cat_현재 주변도로의 보행 안전'

In [435]:
if not column_to_drop_31.startswith('Cat_'):
    comp_32 = comp_31.drop(column_to_drop_31, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']
else:
    comp_32 = comp_31.drop(comp_31.filter(regex='^' + column_to_drop_31).columns, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']

print(X_32.shape)

(10564, 120)


In [436]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [437]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [438]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 124, 'learning_rate': 0.09, 'max_depth': 9, 'num_leaves': 614, 'subsample': 0.1, 'colsample_bytree': 0.2, 'reg_alpha': 8, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8510751153208832


In [439]:
optuna_32 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [440]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(decimal.Decimal(auc_32).quantize(decimal.Decimal('1.000')))

0.854


In [441]:
X_train = X_train.values
y_train = y_train.values

In [442]:
auc_bootstrap = []

In [443]:
rs = RandomState(seed = 32)
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84449885, 0.85273986])

In [444]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9986037015914917, pvalue=0.09973210841417313),
 0.8488417065540855)

In [445]:
t_32 = auc_bootstrap
print(t_32)

[0.8493860026531856, 0.8461690150944748, 0.8476694991215805, 0.8462676132085618, 0.849420960166362, 0.846252375318203, 0.8475798644724104, 0.8489476892187444, 0.8489539636441863, 0.8478908967050303, 0.848091678319171, 0.8489620307626116, 0.8487962066616471, 0.8454653830984906, 0.8491413000609517, 0.8505485640529202, 0.846977519629988, 0.8497615718332078, 0.8451776558746549, 0.8478935857445054, 0.8518061381807752, 0.8496002294647019, 0.850110250618479, 0.8459234161557492, 0.8517416012333727, 0.8480719586963537, 0.8489315549818937, 0.8518886020580116, 0.8483901617009071, 0.848199239898175, 0.8504284536230324, 0.8492121114337958, 0.8460300813882615, 0.8498476210964111, 0.8446013050804919, 0.8481113979419884, 0.8521216521458534, 0.849701516618264, 0.848339966297372, 0.8432021082069485, 0.8502491843246925, 0.8497875658814672, 0.8500797748377612, 0.8510083898031623, 0.849280233767165, 0.8441961564662436, 0.852439855150407, 0.8470770140905668, 0.8468430676562333, 0.8483498261087807, 0.8511670

In [446]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [447]:
# 33.
column_to_drop_32 = '소득 대비 주거관리비의 비율'

In [448]:
if not column_to_drop_32.startswith('Cat_'):
    comp_33 = comp_32.drop(column_to_drop_32, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']
else:
    comp_33 = comp_32.drop(comp_32.filter(regex='^' + column_to_drop_32).columns, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']

print(X_33.shape)

(10564, 119)


In [449]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [450]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [451]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 91, 'learning_rate': 0.08, 'max_depth': 6, 'num_leaves': 416, 'subsample': 0.8, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 8, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8516993035962201


In [452]:
optuna_33 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [453]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(decimal.Decimal(auc_33).quantize(decimal.Decimal('1.000')))

0.856


In [454]:
X_train = X_train.values
y_train = y_train.values

In [455]:
auc_bootstrap = []

In [456]:
rs = RandomState(seed = 33)
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.8483211 , 0.85540497])

In [457]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988381266593933, pvalue=0.20735618472099304),
 0.85194600408734)

In [458]:
t_33 = auc_bootstrap
print(t_33)

[0.8570605213151197, 0.8533612993438743, 0.8540900290416262, 0.8515730880929332, 0.8498547918683448, 0.8522874762468179, 0.8516994729482629, 0.8514682155534044, 0.8514682155534045, 0.8514395324656701, 0.8505871069520634, 0.8516447958122693, 0.855037467283353, 0.8497490229823239, 0.850982395754903, 0.8527562654619769, 0.8516600337026281, 0.8549971316912267, 0.8527526800760101, 0.8515560575095908, 0.852153024273063, 0.8511276038865584, 0.8547138861998493, 0.8504015632282815, 0.852377110895988, 0.8535531174930981, 0.8495885769603098, 0.8559015453013518, 0.8493949661181026, 0.8521413717686709, 0.853765551611631, 0.8516905094833458, 0.8537189415940627, 0.852833351260263, 0.8498404503244774, 0.853102255207773, 0.8496540102542038, 0.8531560359972752, 0.8539188268617116, 0.8526863504356244, 0.8515237890358898, 0.8542199992829228, 0.8494630884514718, 0.8515650209745079, 0.8513776845577428, 0.8547291240902084, 0.8521144813739199, 0.853054748843713, 0.8505897959915385, 0.852592234053996, 0.853295

In [459]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [460]:
# 34
column_to_drop_33 = '소득 대비 생활비의 비율'

In [461]:
if not column_to_drop_33.startswith('Cat_'):
    comp_34 = comp_33.drop(column_to_drop_33, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']
else:
    comp_34 = comp_33.drop(comp_33.filter(regex='^' + column_to_drop_33).columns, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']

print(X_34.shape)

(10564, 118)


In [462]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [463]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [464]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 161, 'learning_rate': 0.06999999999999999, 'max_depth': 9, 'num_leaves': 656, 'subsample': 0.9, 'colsample_bytree': 0.2, 'reg_alpha': 7, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8509869452281785


In [465]:
optuna_34 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [466]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(decimal.Decimal(auc_34).quantize(decimal.Decimal('1.000')))

0.855


In [467]:
X_train = X_train.values
y_train = y_train.values

In [468]:
auc_bootstrap = []

In [469]:
rs = RandomState(seed = 34)
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84702294, 0.8548696 ])

In [470]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.997577428817749, pvalue=0.0036298190243542194),
 0.8511580146821555)

In [471]:
t_34 = auc_bootstrap
print(t_34)

[0.856611451722778, 0.8518903947509949, 0.8477232799110825, 0.8526603563873649, 0.8525626546197698, 0.8483327955254383, 0.8484860707755189, 0.8523609766591373, 0.85063282062314, 0.8477734753146176, 0.8534016349360007, 0.8498933347674876, 0.851200207952386, 0.85094385285576, 0.8496647664121042, 0.8499318776666309, 0.8533415797210571, 0.8542684019934745, 0.850197196228174, 0.849376142841777, 0.8481212577533972, 0.8508649743644903, 0.8549505216736581, 0.8536427521422683, 0.8513391416585995, 0.8507609981714531, 0.85283693664623, 0.8490068480871966, 0.8512539887418881, 0.8525124592162348, 0.8515668136674913, 0.8521225484923451, 0.8529821447778853, 0.8520445663475674, 0.8506471621670073, 0.8501577569825391, 0.8484851744290272, 0.8464092359542505, 0.8492452762539887, 0.8531004625147898, 0.8507359004696856, 0.8551656448316661, 0.8500484027105518, 0.8519863038256068, 0.8526289842601557, 0.8568615323939622, 0.8525913377075041, 0.8541097486644438, 0.8533406833745654, 0.8523672510845792, 0.8529911

In [472]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [473]:
# 35
column_to_drop_34 = '소득 중 근로/사업소득의 비중(월평균)'

In [474]:
if not column_to_drop_34.startswith('Cat_'):
    comp_35 = comp_34.drop(column_to_drop_34, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']
else:
    comp_35 = comp_34.drop(comp_34.filter(regex='^' + column_to_drop_34).columns, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']

print(X_35.shape)

(10564, 117)


In [475]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [476]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [477]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 176, 'learning_rate': 0.060000000000000005, 'max_depth': 3, 'num_leaves': 610, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8514235971158584


In [478]:
optuna_35 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [479]:
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(decimal.Decimal(auc_35).quantize(decimal.Decimal('1.000')))

0.855


In [480]:
X_train = X_train.values
y_train = y_train.values

In [481]:
auc_bootstrap = []

In [482]:
rs = RandomState(seed = 35)
bootstrap_auc(optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84963965, 0.85552743])

In [483]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9987006187438965, pvalue=0.13564735651016235),
 0.8526462729912875)

In [484]:
t_35 = auc_bootstrap
print(t_35)

[0.8525043920978093, 0.8514762826718296, 0.8524568857337492, 0.8519396938080384, 0.8545937757699615, 0.8510630669391559, 0.852756265461977, 0.8514789717113047, 0.852314366641569, 0.850982395754903, 0.8533890860851171, 0.8500313721272095, 0.8547085081208992, 0.8512997024129647, 0.8505216736581694, 0.8499363593990893, 0.8537054963966871, 0.8529767666989351, 0.8545632999892436, 0.8541258829012943, 0.8506991502635257, 0.8524434405363738, 0.8523322935714029, 0.8545050374672833, 0.85171381449213, 0.852612850023305, 0.8524855688214836, 0.8533155856727977, 0.8549137714674984, 0.8538309849055251, 0.8525124592162345, 0.8515668136674913, 0.8529937972822775, 0.8533379943350903, 0.8525043920978094, 0.8530502671112544, 0.853697429278262, 0.85288444301029, 0.8521404754221792, 0.8507699616363702, 0.8526388440715643, 0.85252590441361, 0.8533514395324656, 0.8531013588612815, 0.8539968090064896, 0.8506830160266753, 0.8508820049478325, 0.85170843641318, 0.8546099100068123, 0.8536535083001686, 0.8528055645

In [485]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [486]:
# 36
column_to_drop_35 = 'Cat_현재 거주 지역'

In [487]:
if not column_to_drop_35.startswith('Cat_'):
    comp_36 = comp_35.drop(column_to_drop_35, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']
else:
    comp_36 = comp_35.drop(comp_35.filter(regex='^' + column_to_drop_35).columns, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']

print(X_36.shape)

(10564, 100)


In [488]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [489]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [490]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 144, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 940, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.1, 'reg_alpha': 4, 'reg_lambda': 10, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8500100765820234


In [491]:
optuna_36 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [492]:
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(decimal.Decimal(auc_36).quantize(decimal.Decimal('1.000')))

0.855


In [493]:
X_train = X_train.values
y_train = y_train.values

In [494]:
auc_bootstrap = []

In [495]:
rs = RandomState(seed = 36)
bootstrap_auc(optuna_36, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84936492, 0.85529165])

In [496]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9966732263565063, pvalue=0.00024011974164750427),
 0.8524568041662185)

In [497]:
t_36 = auc_bootstrap
print(t_36)

[0.852305403176652, 0.8523278118389445, 0.8521368900362124, 0.8534132874403929, 0.8528978882076655, 0.8542988777741924, 0.8498521028288695, 0.851269226632247, 0.8532353626617905, 0.8545606109497687, 0.8506767416012333, 0.8534186655193432, 0.8540174249757986, 0.8512118604567782, 0.8525429349969526, 0.853846222795884, 0.8508703524434404, 0.8530103796923738, 0.8516134236850598, 0.8519656878562977, 0.8508712487899324, 0.8530601269226632, 0.8533254454842063, 0.8526316732996306, 0.8545704707611774, 0.8525608619267865, 0.8526245025276972, 0.85216198773798, 0.8490301530959807, 0.8522498296941665, 0.8519719622817397, 0.8526379477250726, 0.8536570936861354, 0.8555241834283461, 0.8537619662256641, 0.8523959341723136, 0.8519271449571546, 0.8525536911548528, 0.8493142949338497, 0.8498691334122118, 0.8527867412426948, 0.8524568857337492, 0.8515721917464415, 0.8509474382417268, 0.8536472338747266, 0.8549191495464487, 0.8523036104836685, 0.854159944067979, 0.8521539206195546, 0.8539770893836722, 0.855

In [498]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [499]:
# 37
column_to_drop_36 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [500]:
if not column_to_drop_36.startswith('Cat_'):
    comp_37 = comp_36.drop(column_to_drop_36, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']
else:
    comp_37 = comp_36.drop(comp_36.filter(regex='^' + column_to_drop_36).columns, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']

print(X_37.shape)

(10564, 96)


In [501]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [502]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [503]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 180, 'learning_rate': 0.08, 'max_depth': 3, 'num_leaves': 426, 'subsample': 0.1, 'colsample_bytree': 0.2, 'reg_alpha': 8, 'reg_lambda': 9, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8516993035962203


In [504]:
optuna_37 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [505]:
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(decimal.Decimal(auc_37).quantize(decimal.Decimal('1.000')))

0.855


In [506]:
X_train = X_train.values
y_train = y_train.values

In [507]:
auc_bootstrap = []

In [508]:
rs = RandomState(seed = 37)
bootstrap_auc(optuna_37, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.85042827, 0.85582634])

In [509]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9971431493759155, pvalue=0.0009541664621792734),
 0.8532129152325123)

In [510]:
t_37 = auc_bootstrap
print(t_37)

[0.8522471406546915, 0.8535809042343409, 0.8530565415366964, 0.8525904413610126, 0.8520768348212685, 0.8521216521458534, 0.8538874547345021, 0.851941486501022, 0.8535387759492309, 0.8544808361120074, 0.8503155139650784, 0.8488795668853752, 0.8535988311641748, 0.8548599906779966, 0.8541975906206303, 0.8536212398264673, 0.8564160481875873, 0.8527688143128609, 0.8533406833745653, 0.8512772937506723, 0.8514744899788463, 0.852361873005629, 0.8526173317557635, 0.8518554372378185, 0.8523995195582804, 0.8532367071815281, 0.8569233803018895, 0.8523529095407121, 0.8520866946326772, 0.8518697787816858, 0.8530888100103976, 0.8534652755369116, 0.8533353052956151, 0.8516430031192858, 0.8540353519056326, 0.854897637230648, 0.8543670001075616, 0.8547407765946005, 0.8560342045821233, 0.8522417625757412, 0.8542146212039727, 0.8521960489046646, 0.8554614391739273, 0.8522749273959341, 0.8514906242156968, 0.8529678032340182, 0.8527517837295184, 0.8515730880929331, 0.8532116094797604, 0.8552597612132947, 0.

In [511]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [512]:
# 38
column_to_drop_37 = '장기부채부담지표'

In [513]:
if not column_to_drop_37.startswith('Cat_'):
    comp_38 = comp_37.drop(column_to_drop_37, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']
else:
    comp_38 = comp_37.drop(comp_37.filter(regex='^' + column_to_drop_37).columns, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']

print(X_38.shape)

(10564, 95)


In [514]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [515]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [516]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 185, 'learning_rate': 0.09, 'max_depth': 8, 'num_leaves': 766, 'subsample': 0.9, 'colsample_bytree': 0.2, 'reg_alpha': 9, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8512808455371939


In [517]:
optuna_38 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [518]:
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(decimal.Decimal(auc_38).quantize(decimal.Decimal('1.000')))

0.854


In [519]:
X_train = X_train.values
y_train = y_train.values

In [520]:
auc_bootstrap = []

In [521]:
rs = RandomState(seed = 38)
bootstrap_auc(optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84708219, 0.85439869])

In [522]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9973800182342529, pvalue=0.0019632710609585047),
 0.8508733785091965)

In [523]:
t_38 = auc_bootstrap
print(t_38)

[0.8549245276253988, 0.8473145459108673, 0.8498081818507763, 0.8470823921695171, 0.852326019145961, 0.8508049191495465, 0.8515399232727402, 0.8521225484923451, 0.8460408375461618, 0.8510415546233552, 0.8490749704205658, 0.8499336703596142, 0.8507063210354595, 0.8513274891542074, 0.8504096303467069, 0.8487119500914274, 0.8508120899214802, 0.8528656197339644, 0.8517209852640637, 0.848013696174393, 0.8502133304650246, 0.8533926714710839, 0.8511975189129111, 0.8490310494424724, 0.8475108457925495, 0.8511849700620271, 0.851343623391058, 0.8549164605069736, 0.851836613961493, 0.8495885769603098, 0.8489494819117278, 0.8496988275787888, 0.8513678247463339, 0.8506157900397978, 0.8521342009967373, 0.8531461761858664, 0.8507869922197124, 0.850060055214944, 0.8520203649922913, 0.8494460578681295, 0.8517398085403893, 0.8507036319959844, 0.8526926248610665, 0.8517523573912732, 0.8531425907998995, 0.8513391416585996, 0.8509564017066438, 0.8504732709476176, 0.8482279229859093, 0.8454304255853143, 0.84

In [524]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [525]:
# 38
column_to_drop_38 = '소득 중 사적이전소득의 비중(월평균)'

In [526]:
if not column_to_drop_38.startswith('Cat_'):
    comp_39 = comp_38.drop(column_to_drop_38, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']
else:
    comp_39 = comp_38.drop(comp_38.filter(regex='^' + column_to_drop_38).columns, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']

print(X_39.shape)

(10564, 94)


In [527]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [528]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [529]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 106, 'learning_rate': 0.09, 'max_depth': 4, 'num_leaves': 880, 'subsample': 0.2, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 5, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8531408146356756


In [530]:
optuna_39 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [531]:
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(decimal.Decimal(auc_39).quantize(decimal.Decimal('1.000')))

0.855


In [532]:
X_train = X_train.values
y_train = y_train.values

In [533]:
auc_bootstrap = []

In [534]:
rs = RandomState(seed = 39)
bootstrap_auc(optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84912868, 0.85567847])

In [535]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9985260367393494, pvalue=0.07769444584846497),
 0.8525379775285935)

In [536]:
t_39 = auc_bootstrap
print(t_39)

[0.8535997275106665, 0.8506570219784161, 0.8498162489692016, 0.8547873866121688, 0.8534598974579615, 0.8530646086551219, 0.8508264314653472, 0.8542280664013482, 0.8491126169732173, 0.8545444767129181, 0.8545650926822272, 0.8535190563264135, 0.853093291742856, 0.8519970599835073, 0.8521216521458534, 0.8496710408375463, 0.8547703560288265, 0.8553162310422717, 0.8509958409522785, 0.8534052203219676, 0.8543392133663188, 0.8534052203219676, 0.8541348463662113, 0.8532250546771359, 0.8551997059983507, 0.8538946255064358, 0.852507977483776, 0.8523224337599943, 0.8540326628661576, 0.854452153024273, 0.853726112365996, 0.8538040945107741, 0.8528494854971138, 0.8531363163744577, 0.852424617260048, 0.8518500591588685, 0.8534473486070777, 0.8521942562116811, 0.8522148721809903, 0.8507816141407624, 0.8550760101824961, 0.8539457172564627, 0.8516474848517441, 0.854467390914632, 0.8538480154888674, 0.8531694811946506, 0.8522552077731168, 0.851597289448209, 0.8509743286364778, 0.8495356925172994, 0.8491

In [537]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [538]:
# 40
column_to_drop_39 = 'Cat_이사 계획 중인 주택의 유형'

In [539]:
if not column_to_drop_39.startswith('Cat_'):
    comp_40 = comp_39.drop(column_to_drop_39, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']
else:
    comp_40 = comp_39.drop(comp_39.filter(regex='^' + column_to_drop_39).columns, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']

print(X_40.shape)

(10564, 78)


In [540]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [541]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [542]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 149, 'learning_rate': 0.08, 'max_depth': 6, 'num_leaves': 468, 'subsample': 0.5, 'colsample_bytree': 0.2, 'reg_alpha': 4, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8540533051189036


In [543]:
optuna_40 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [544]:
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(decimal.Decimal(auc_40).quantize(decimal.Decimal('1.000')))

0.856


In [545]:
X_train = X_train.values
y_train = y_train.values

In [546]:
auc_bootstrap = []

In [547]:
rs = RandomState(seed = 40)
bootstrap_auc(optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84688508, 0.85478308])

In [548]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988248348236084, pvalue=0.19918860495090485),
 0.8509203085224624)

In [549]:
t_40 = auc_bootstrap
print(t_40)

[0.8530036570936861, 0.8504069413072317, 0.8494541249865548, 0.8490238786705389, 0.8527455093040766, 0.8480432756086194, 0.8529453945717256, 0.851919974185221, 0.8536275142519092, 0.8518330285755262, 0.8497418522103904, 0.8510155605750958, 0.8504965759564017, 0.8499596644078736, 0.8526872467821162, 0.8502698002940017, 0.8507995410705963, 0.8551557850202574, 0.8521261338783119, 0.851152701588326, 0.8527688143128609, 0.8522865799003262, 0.8490570434907317, 0.846914775375569, 0.8500322684737013, 0.8473109605249005, 0.8506731562152665, 0.8521297192642787, 0.8513776845577427, 0.8537046000501953, 0.8515587465490659, 0.8528270768348213, 0.8544118174321467, 0.8466727618228101, 0.8512647448997884, 0.854189523502205, 0.8519540353519056, 0.8489934028898212, 0.8504974723028935, 0.8523179520275359, 0.8490471836793231, 0.8483704420780896, 0.8508488401276396, 0.8535979348176831, 0.8496638700656125, 0.8498942311139794, 0.8508040228030547, 0.8497409558638988, 0.8514502886235703, 0.8515058621060556, 0.8

In [550]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [551]:
# 41.
column_to_drop_40 = 'Cat_주택 보유 의식'

In [552]:
if not column_to_drop_40.startswith('Cat_'):
    comp_41 = comp_40.drop(column_to_drop_40, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']
else:
    comp_41 = comp_40.drop(comp_40.filter(regex='^' + column_to_drop_40).columns, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']

print(X_41.shape)

(10564, 76)


In [553]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [554]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [555]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 126, 'learning_rate': 0.09999999999999999, 'max_depth': 4, 'num_leaves': 760, 'subsample': 0.4, 'colsample_bytree': 0.2, 'reg_alpha': 9, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8546620986161494


In [556]:
optuna_41 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [557]:
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(decimal.Decimal(auc_41).quantize(decimal.Decimal('1.000')))

0.854


In [558]:
X_train = X_train.values
y_train = y_train.values

In [559]:
auc_bootstrap = []

In [560]:
rs = RandomState(seed = 41)
bootstrap_auc(optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84856217, 0.85433855])

In [561]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9986822009086609, pvalue=0.12800052762031555),
 0.8515255465472733)

In [562]:
t_41 = auc_bootstrap
print(t_41)

[0.8531416944534078, 0.8515686063604746, 0.8509743286364777, 0.8539941199670145, 0.8497839804955003, 0.8501326592807716, 0.850372880140547, 0.8515354415402818, 0.8493806245742355, 0.8535495321071314, 0.8525160446022014, 0.8491350256355096, 0.8514054712989854, 0.8519423828475137, 0.8514807644042881, 0.8507932666451542, 0.85031103223262, 0.8533433724140405, 0.8510316948119465, 0.8474292782618049, 0.8504526549783085, 0.8493891398659066, 0.8478998601699472, 0.8497346814384568, 0.8516707898605285, 0.8515860851170627, 0.8516170090710266, 0.852377110895988, 0.8499300849736474, 0.8514897278692051, 0.8514189164963608, 0.8498373131117565, 0.851800760101825, 0.8498888530350291, 0.8480603061919615, 0.8504643074827005, 0.8569977770607007, 0.8528575526155391, 0.853209816786777, 0.8528871320497652, 0.8491726721881611, 0.850194507188699, 0.8513077695313902, 0.8531766519665842, 0.8481409773762145, 0.8534769280413036, 0.8484780036570937, 0.8518966691764368, 0.8510487253952889, 0.850654332938941, 0.84792

In [563]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [564]:
# 42.
column_to_drop_41 = 'Cat_가구주 종사상 지위'

In [565]:
if not column_to_drop_41.startswith('Cat_'):
    comp_42 = comp_41.drop(column_to_drop_41, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']
else:
    comp_42 = comp_41.drop(comp_41.filter(regex='^' + column_to_drop_41).columns, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']

print(X_42.shape)

(10564, 71)


In [566]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [567]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [568]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 155, 'learning_rate': 0.08, 'max_depth': 5, 'num_leaves': 494, 'subsample': 0.2, 'colsample_bytree': 0.2, 'reg_alpha': 8, 'reg_lambda': 10, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8530036611581352


In [569]:
optuna_42 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [570]:
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(decimal.Decimal(auc_42).quantize(decimal.Decimal('1.000')))

0.852


In [571]:
X_train = X_train.values
y_train = y_train.values

In [572]:
auc_bootstrap = []

In [573]:
rs = RandomState(seed = 42)
bootstrap_auc(optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84720512, 0.85281025])

In [574]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992753267288208, pvalue=0.650010883808136),
 0.8500416969183607)

In [575]:
t_42 = auc_bootstrap
print(t_42)

[0.8483543078412391, 0.8505198809651859, 0.8490606288766986, 0.8493268437847333, 0.8484197411351333, 0.849351941486501, 0.8486008031264566, 0.8505593202108206, 0.8484923452009607, 0.8503146176185866, 0.851072030404073, 0.8509304076583843, 0.8486751998852676, 0.8506444731275321, 0.8528656197339644, 0.8490391165608977, 0.8469085009501272, 0.8497875658814671, 0.8492829228066402, 0.8503253737764871, 0.8477609264637338, 0.848306801477179, 0.8524353734179485, 0.8472491126169732, 0.8474642357749812, 0.8520203649922914, 0.8489709942275285, 0.8493716611093185, 0.8480361048366857, 0.8486241081352408, 0.8523547022336954, 0.8508802122548492, 0.8500259940482593, 0.850865870710982, 0.8502097450790578, 0.8524721236241082, 0.8522381771897746, 0.8487083647054605, 0.8505575275178373, 0.848468143845685, 0.85036391667563, 0.8501720985264063, 0.8511500125488509, 0.8477098347137069, 0.8475547667706429, 0.8513230074217488, 0.8500394392456349, 0.8483991251658242, 0.8453192786203434, 0.8520642859703849, 0.8485

In [576]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [577]:
# 43.
column_to_drop_42 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [578]:
if not column_to_drop_42.startswith('Cat_'):
    comp_43 = comp_42.drop(column_to_drop_42, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']
else:
    comp_43 = comp_42.drop(comp_42.filter(regex='^' + column_to_drop_42).columns, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']

print(X_43.shape)

(10564, 67)


In [579]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [580]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [581]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 112, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 578, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8520855725737829


In [582]:
optuna_43 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [583]:
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(decimal.Decimal(auc_43).quantize(decimal.Decimal('1.000')))

0.854


In [584]:
X_train = X_train.values
y_train = y_train.values

In [585]:
auc_bootstrap = []

In [586]:
rs = RandomState(seed = 43)
bootstrap_auc(optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84737574, 0.85355112])

In [587]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9981030821800232, pvalue=0.019581155851483345),
 0.8505975639543222)

In [588]:
t_43 = auc_bootstrap
print(t_43)

[0.8492739593417232, 0.8508219497328886, 0.8489306586354021, 0.8508730414829158, 0.8490695923416155, 0.8489082499731095, 0.8505476677064286, 0.849092001003908, 0.8501604460220142, 0.8493600086049263, 0.8485165465562368, 0.8523224337599942, 0.8515202036499229, 0.8506381987020903, 0.8492909899250655, 0.8468986411387186, 0.8469604890466458, 0.8509348893908429, 0.8496522175612204, 0.8495858879208347, 0.8511598723602596, 0.852687246782116, 0.852287476246818, 0.8490776594600409, 0.848486070775519, 0.8498377612850023, 0.8524779498763042, 0.8526065755978631, 0.849782187802517, 0.8509608834391023, 0.8463823455594994, 0.8517756624000573, 0.8479688788498082, 0.8523278118389443, 0.8513310745401742, 0.8522955433652433, 0.8511939335269441, 0.849560790219067, 0.8503388189738625, 0.8512190312287118, 0.8519782367071816, 0.8522641712380338, 0.8533290308701731, 0.8520194686457996, 0.8507959556846296, 0.8492596177978559, 0.8513041841454234, 0.8500071707719337, 0.8515847405973254, 0.8511401527374423, 0.849

In [589]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [590]:
## 44
column_to_drop_43 = 'Cat_현재 의료시설 접근용이성'

In [591]:
if not column_to_drop_43.startswith('Cat_'):
    comp_44 = comp_43.drop(column_to_drop_43, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']
else:
    comp_44 = comp_43.drop(comp_43.filter(regex='^' + column_to_drop_43).columns, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']

print(X_44.shape)

(10564, 63)


In [592]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [593]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [594]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 144, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 968, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8513186327197816


In [595]:
optuna_44 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [596]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(decimal.Decimal(auc_44).quantize(decimal.Decimal('1.000')))

0.854


In [597]:
X_train = X_train.values
y_train = y_train.values

In [598]:
auc_bootstrap = []

In [599]:
rs = RandomState(seed = 44)
bootstrap_auc(optuna_44, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84713315, 0.85330234])

In [600]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9953986406326294, pvalue=7.923064913484268e-06),
 0.85033399976695)

In [601]:
t_44 = auc_bootstrap
print(t_44)

[0.8492076297013373, 0.8506529884192033, 0.849387795346169, 0.851292979814277, 0.8457311498332796, 0.8479868057796422, 0.8486380015058621, 0.8501474489978846, 0.8499780395109535, 0.8510245240400129, 0.8517218816105554, 0.8514198128428525, 0.8501156286974293, 0.8512315800795957, 0.849653562080958, 0.8504029077480192, 0.8477555483847837, 0.8515538166433615, 0.8511459789896383, 0.8488567100498368, 0.851173317557635, 0.8511921408339608, 0.8516564483166613, 0.8499273959341722, 0.8496015739844395, 0.8504822344125347, 0.8468081101430569, 0.8541850417697465, 0.8499708687390198, 0.8514726972858628, 0.8505037467283352, 0.8496737298770214, 0.8500286830877345, 0.8488939084292425, 0.8498229715678893, 0.8520826610734646, 0.8489575490301531, 0.8491731203614068, 0.8508111935749885, 0.8472325302068768, 0.8496889677673802, 0.8527916711483992, 0.85049567960991, 0.8517245706500304, 0.8486465167975332, 0.8502966906887527, 0.849979384030691, 0.8500578143487147, 0.8508344985837726, 0.8512674339392636, 0.8497

In [602]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [603]:
# 45
column_to_drop_44 = '중기부채부담지표'

In [604]:
if not column_to_drop_44.startswith('Cat_'):
    comp_45 = comp_44.drop(column_to_drop_44, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']
else:
    comp_45 = comp_44.drop(comp_44.filter(regex='^' + column_to_drop_44).columns, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']

print(X_45.shape)

(10564, 62)


In [605]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [606]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [607]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 141, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 932, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.851290642214161


In [608]:
optuna_45 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [609]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(decimal.Decimal(auc_45).quantize(decimal.Decimal('1.000')))

0.851


In [610]:
X_train = X_train.values
y_train = y_train.values

In [611]:
auc_bootstrap = []

In [612]:
rs = RandomState(seed = 45)
bootstrap_auc(optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84619182, 0.85248208])

In [613]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9981501698493958, pvalue=0.02282916195690632),
 0.8493929011598723)

In [614]:
t_45 = auc_bootstrap
print(t_45)

[0.8499829694166579, 0.8462035244344054, 0.8510729267505648, 0.8485869097558354, 0.850588451471801, 0.8516237316697143, 0.8494980459646482, 0.8458759097916893, 0.8481279803520849, 0.8493649385106307, 0.8490848302319745, 0.8516022193539134, 0.8510142160553584, 0.8515336488472984, 0.8498646516797532, 0.8503419561865835, 0.8496383241905991, 0.8493308773439461, 0.849940392958302, 0.8495132838550069, 0.849381969093973, 0.8508394284894768, 0.8513803735972179, 0.849955182675415, 0.8467682227241762, 0.8477909540712057, 0.8445242192822056, 0.8497279588397691, 0.8492161449930085, 0.8525138037359721, 0.8497270624932773, 0.852370836470546, 0.8501398300527051, 0.850513606539744, 0.8491381628482306, 0.8506390950485822, 0.8502196048904664, 0.8511141586891829, 0.8477739234878635, 0.8514910723889426, 0.8484345308522463, 0.8503899107238895, 0.8465880570793447, 0.8495253845326449, 0.8488154781112187, 0.850185543723782, 0.8496800043024633, 0.8468596500663297, 0.8504410024739163, 0.8508524255136065, 0.8446

In [615]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [616]:
# 46.
column_to_drop_45 = '현재 주택 거주 기간(총 개월)'

In [617]:
if not column_to_drop_45.startswith('Cat_'):
    comp_46 = comp_45.drop(column_to_drop_45, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']
else:
    comp_46 = comp_45.drop(comp_45.filter(regex='^' + column_to_drop_45).columns, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']

print(X_46.shape)

(10564, 61)


In [618]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [619]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [620]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 124, 'learning_rate': 0.08, 'max_depth': 2, 'num_leaves': 592, 'subsample': 0.4, 'colsample_bytree': 0.5, 'reg_alpha': 7, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8490122150566528


In [621]:
optuna_46 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [622]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(decimal.Decimal(auc_46).quantize(decimal.Decimal('1.000')))

0.850


In [623]:
X_train = X_train.values
y_train = y_train.values

In [624]:
auc_bootstrap = []

In [625]:
rs = RandomState(seed = 46)
bootstrap_auc(optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84736652, 0.8518135 ])

In [626]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9977104067802429, pvalue=0.005524962674826384),
 0.849651062618766)

In [627]:
t_46 = auc_bootstrap
print(t_46)

[0.8472903445555913, 0.8500407837653724, 0.850295346169015, 0.8507350041231939, 0.8492658922232978, 0.8502043670001076, 0.8503065505001615, 0.850731866910473, 0.8512060342045822, 0.8479782904879711, 0.849394069771611, 0.8493125022408663, 0.8494962532716648, 0.8502213975834498, 0.8495693055107383, 0.8490516654117815, 0.8497606754867162, 0.8498135599297265, 0.8491583306442938, 0.8481252913126097, 0.8492416908680219, 0.8502845900111147, 0.8498140081029724, 0.8517008174680005, 0.8500766376250404, 0.8500820157039906, 0.8502572514431178, 0.8498068373310388, 0.8493169839733247, 0.8487755906923381, 0.849662525545875, 0.8483175576350794, 0.8497853250152378, 0.849205837008354, 0.84936717937686, 0.8503217883905202, 0.8511123659961997, 0.8478478720734288, 0.8483229357140296, 0.8510765121365316, 0.8489373812340898, 0.8496486321752537, 0.8512132049765158, 0.8502455989387259, 0.85054318597397, 0.8508058154960382, 0.8487576637625042, 0.8506202717722563, 0.8496858305546592, 0.8502240866229249, 0.849770

In [628]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [629]:
# 47.
column_to_drop_46 = 'Cat_기초생활보장 수급가구 여부'

In [630]:
if not column_to_drop_46.startswith('Cat_'):
    comp_47 = comp_46.drop(column_to_drop_46, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']
else:
    comp_47 = comp_46.drop(comp_46.filter(regex='^' + column_to_drop_46).columns, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']

print(X_47.shape)

(10564, 59)


In [631]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [632]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [633]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.09, 'max_depth': 2, 'num_leaves': 864, 'subsample': 0.4, 'colsample_bytree': 0.5, 'reg_alpha': 2, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8480234504456089


In [634]:
optuna_47 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [635]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(decimal.Decimal(auc_47).quantize(decimal.Decimal('1.000')))

0.852


In [636]:
X_train = X_train.values
y_train = y_train.values

In [637]:
auc_bootstrap = []

In [638]:
rs = RandomState(seed = 47)
bootstrap_auc(optuna_47, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84711995, 0.85166861])

In [639]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9956626892089844, pvalue=1.5480340152862482e-05),
 0.8495775406941307)

In [640]:
t_47 = auc_bootstrap
print(t_47)

[0.8502715929869851, 0.8481660750779823, 0.8481835538345702, 0.8503692947545805, 0.8490946900433832, 0.8490767631135492, 0.8479186834462731, 0.850928614965401, 0.8496566992936789, 0.8505472195331828, 0.8491914954644868, 0.8500645369474024, 0.8482216485604676, 0.8485981140869814, 0.8471527553691155, 0.8505539421318704, 0.8512665375927719, 0.848133806604281, 0.8493425298483381, 0.8489019755476677, 0.8484210856548707, 0.8497579864472411, 0.8463245312107848, 0.8488311641748234, 0.8467296798250332, 0.8517604245096985, 0.848868362554229, 0.8499090208310924, 0.8491121687999712, 0.8481598006525403, 0.8512929798142771, 0.8487746943458464, 0.8498816822630957, 0.8503271664694705, 0.8495688573374924, 0.8506113083073392, 0.849522247319924, 0.8478272561041197, 0.8492667885697895, 0.8487845541572551, 0.849809078197268, 0.8482579505933814, 0.8487321178874906, 0.8492847154996236, 0.85008963464917, 0.8499861066293786, 0.850964468825069, 0.8478138109067441, 0.8472791402244452, 0.8506704671757914, 0.84917

In [641]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [642]:
# 48
column_to_drop_47 = '총 이사 횟수'

In [643]:
if not column_to_drop_47.startswith('Cat_'):
    comp_48 = comp_47.drop(column_to_drop_47, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']
else:
    comp_48 = comp_47.drop(comp_47.filter(regex='^' + column_to_drop_47).columns, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']

print(X_48.shape)

(10564, 58)


In [644]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [645]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [646]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 177, 'learning_rate': 0.09999999999999999, 'max_depth': 2, 'num_leaves': 870, 'subsample': 0.5, 'colsample_bytree': 0.4, 'reg_alpha': 5, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8451803148372072


In [647]:
optuna_48 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [648]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(decimal.Decimal(auc_48).quantize(decimal.Decimal('1.000')))

0.850


In [649]:
X_train = X_train.values
y_train = y_train.values

In [650]:
auc_bootstrap = []

In [651]:
rs = RandomState(seed = 48)
bootstrap_auc(optuna_48, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84525874, 0.85066713])

In [652]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9963644742965698, pvalue=0.00010068422852782533),
 0.8481251131637446)

In [653]:
t_48 = auc_bootstrap
print(t_48)

[0.8498556882148363, 0.8502550105768887, 0.8465880570793446, 0.8477340360689828, 0.8492174895127462, 0.8482660177118067, 0.8465598221648561, 0.8503710874475637, 0.8484228783478542, 0.8485797389839016, 0.8510518626080097, 0.847506364060091, 0.8483139722491128, 0.8501169732171668, 0.8466992040443155, 0.8481490444946399, 0.8513162848230611, 0.8489934028898211, 0.8488957011222258, 0.8472186368362554, 0.8478357713957909, 0.8482265784661718, 0.8487715571331254, 0.8464307482700513, 0.848399125165824, 0.8480365530099316, 0.8486846115234304, 0.847801710229106, 0.8469197052812735, 0.8484757627908646, 0.847631852568929, 0.8490476318525689, 0.8490050553942132, 0.849931429493385, 0.8486039403391775, 0.8478989638234555, 0.8461770822129002, 0.8501456563049011, 0.8477551002115378, 0.8479563299989243, 0.8495383815567745, 0.8450642680434548, 0.8504203865046072, 0.8495307626115951, 0.8467677745509304, 0.8470066508909684, 0.8458951812412606, 0.8486330716001578, 0.846413717686709, 0.8487679717471587, 0.850

In [654]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [655]:
# 49
column_to_drop_48 = 'Cat_이사 계획 중인 주택의 점유형태'

In [656]:
if not column_to_drop_48.startswith('Cat_'):
    comp_49 = comp_48.drop(column_to_drop_48, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']
else:
    comp_49 = comp_48.drop(comp_48.filter(regex='^' + column_to_drop_48).columns, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']

print(X_49.shape)

(10564, 34)


In [657]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [658]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [659]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 113, 'learning_rate': 0.06999999999999999, 'max_depth': 2, 'num_leaves': 680, 'subsample': 0.4, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 1, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.845789108334453


In [660]:
optuna_49 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)


In [661]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(decimal.Decimal(auc_49).quantize(decimal.Decimal('1.000')))

0.850


In [662]:
X_train = X_train.values
y_train = y_train.values

In [663]:
auc_bootstrap = []

In [664]:
rs = RandomState(seed = 49)
bootstrap_auc(optuna_49, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84644778, 0.85006814])

In [665]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9965223670005798, pvalue=0.000156482303282246),
 0.8483635787530027)

In [666]:
t_49 = auc_bootstrap
print(t_49)

[0.8487746943458462, 0.8484210856548708, 0.8481602488257861, 0.8496571474669248, 0.8485286472338747, 0.8463384245814063, 0.846177530386146, 0.848480244523323, 0.8472325302068767, 0.8485645010935428, 0.8477716826216343, 0.8462420673335486, 0.8472334265533685, 0.8469690043383171, 0.8488746369796709, 0.8467615001254885, 0.8488387831200028, 0.8478120182137607, 0.8491740167078986, 0.8480396902226526, 0.8483180058083253, 0.8470487791760783, 0.8483856799684487, 0.8476515721917464, 0.847882381413359, 0.846228173962927, 0.8486760962317594, 0.8482014807644043, 0.8485134093435158, 0.8485860134093435, 0.8472195331827471, 0.848910490839339, 0.8469461475027787, 0.8480213151195726, 0.8476000322684738, 0.8478447348607079, 0.8472571797353986, 0.8473849091104656, 0.8489835430784124, 0.8486008031264566, 0.8490565953174859, 0.8503867735111685, 0.8491628123767524, 0.8480293822379981, 0.8497687426051415, 0.8485107203040407, 0.8479836685669211, 0.8479294396041733, 0.8463939980638917, 0.8477179018321321, 0.84

In [667]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [668]:
# 50
column_to_drop_49 = '현재 주택의 면적(㎡)'

In [669]:
if not column_to_drop_49.startswith('Cat_'):
    comp_50 = comp_49.drop(column_to_drop_49, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']
else:
    comp_50 = comp_49.drop(comp_49.filter(regex='^' + column_to_drop_49).columns, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']

print(X_50.shape)

(10564, 33)


In [670]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [671]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [672]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 68, 'learning_rate': 0.08, 'max_depth': 3, 'num_leaves': 366, 'subsample': 1.0, 'colsample_bytree': 0.5, 'reg_alpha': 8, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8446442966545747


In [673]:
optuna_50 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [674]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(decimal.Decimal(auc_50).quantize(decimal.Decimal('1.000')))

0.848


In [675]:
X_train = X_train.values
y_train = y_train.values

In [676]:
auc_bootstrap = []

In [677]:
rs = RandomState(seed = 50)
bootstrap_auc(optuna_50, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84537926, 0.84954882])

In [678]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9970484972000122, pvalue=0.0007185438298620284),
 0.8475585596608225)

In [679]:
t_50 = auc_bootstrap
print(t_50)

[0.8477609264637338, 0.8454895844537665, 0.8495168692409738, 0.8478783478541465, 0.8463312538094725, 0.8478259115843821, 0.8475050195403535, 0.8472786920511994, 0.8490521135850275, 0.8452758058154961, 0.8462694059015453, 0.847659191136926, 0.8486285898676993, 0.8504607220967337, 0.8476757735470224, 0.8476094439066365, 0.8471720268186871, 0.8457289089670502, 0.8468036284105984, 0.8477429995338999, 0.8474073177727582, 0.8483059051306874, 0.8469631780861211, 0.8481418737227062, 0.8467422286759169, 0.8482808074289196, 0.8478599727510666, 0.8487406331791617, 0.847267935893299, 0.8466369079631422, 0.8487289806747695, 0.8477331397224911, 0.8448849987451149, 0.8473651894876484, 0.8455729446774946, 0.8481898282600122, 0.846064142554946, 0.8488948047757341, 0.8468919185400308, 0.8487267398085403, 0.846368900362124, 0.8486859560431681, 0.848829819655086, 0.848104227170055, 0.8483838872754651, 0.8463931017173999, 0.8469519737549748, 0.8485712236922306, 0.8487289806747697, 0.847682944318956, 0.8479

In [680]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [681]:
# 51
column_to_drop_50 = 'Cat_현재 주택의 유형'

In [682]:
if not column_to_drop_50.startswith('Cat_'):
    comp_51 = comp_50.drop(column_to_drop_50, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']
else:
    comp_51 = comp_50.drop(comp_50.filter(regex='^' + column_to_drop_50).columns, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']

print(X_51.shape)

(10564, 22)


In [683]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [684]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [685]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 124, 'learning_rate': 0.08, 'max_depth': 2, 'num_leaves': 592, 'subsample': 0.4, 'colsample_bytree': 0.5, 'reg_alpha': 7, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8416409154014959


In [686]:
optuna_51 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [687]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(decimal.Decimal(auc_51).quantize(decimal.Decimal('1.000')))

0.847


In [688]:
X_train = X_train.values
y_train = y_train.values

In [689]:
auc_bootstrap = []

In [690]:
rs = RandomState(seed = 51)
bootstrap_auc(optuna_51, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84390066, 0.84766508])

In [691]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9929308295249939, pvalue=3.171228968312789e-08),
 0.845952933966154)

In [692]:
t_51 = auc_bootstrap
print(t_51)

[0.8455285755261555, 0.8461833566383421, 0.8475798644724107, 0.8451848266465884, 0.8457141192499373, 0.8482597432863648, 0.8463984797963501, 0.8457275644473127, 0.8464710838621776, 0.8445206338962389, 0.8445658993940697, 0.847363396794665, 0.8453627514251909, 0.8461914237567675, 0.8436726901150908, 0.846899985658456, 0.8451431465347244, 0.8458145100570076, 0.8449800114732352, 0.8471957800007172, 0.8475937578430318, 0.8459785414649887, 0.8463137750528845, 0.8464011688358253, 0.844119967014449, 0.8466221182460293, 0.8460865512172386, 0.8452480190742533, 0.8461067190133017, 0.844332401132982, 0.8472249112616972, 0.8458158545767452, 0.8466010541034742, 0.846796009465419, 0.8440536373740632, 0.8456558567279767, 0.8455133376357964, 0.8463845864257287, 0.8435368936215983, 0.8456343444121762, 0.8447913305367323, 0.8460892402567136, 0.8458234735219247, 0.8448890323043275, 0.8446264027822594, 0.8450203470653617, 0.845843193144742, 0.8479339213366319, 0.8461882865440464, 0.8470066508909684, 0.846

In [693]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [694]:
# 52
column_to_drop_51 = '가구주 나이'

In [695]:
if not column_to_drop_51.startswith('Cat_'):
    comp_52 = comp_51.drop(column_to_drop_51, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']
else:
    comp_52 = comp_51.drop(comp_51.filter(regex='^' + column_to_drop_51).columns, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']

print(X_52.shape)

(10564, 21)


In [696]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [697]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [698]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 100, 'learning_rate': 0.09, 'max_depth': 4, 'num_leaves': 870, 'subsample': 0.2, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 4, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8404590163016705


In [699]:
optuna_52 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [700]:
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(decimal.Decimal(auc_52).quantize(decimal.Decimal('1.000')))

0.846


In [701]:
X_train = X_train.values
y_train = y_train.values

In [702]:
auc_bootstrap = []

In [703]:
rs = RandomState(seed = 52)
bootstrap_auc(optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.84212188, 0.84616642])

In [704]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988720417022705, pvalue=0.22957101464271545),
 0.8442105547040264)

In [705]:
t_52 = auc_bootstrap
print(t_52)

[0.846416406726184, 0.845162417984296, 0.8445103259115843, 0.8443794593237962, 0.842910795597146, 0.8444529597361156, 0.8427844107418164, 0.8445954788282959, 0.8452888028396257, 0.8449589473306801, 0.8449912158043813, 0.845469864830949, 0.844242766483812, 0.8457876196622566, 0.8437515686063602, 0.8452027535764224, 0.8454479043419023, 0.8447245527231005, 0.8429999820730701, 0.8438542002796601, 0.8462738876340037, 0.843414990498727, 0.8436139794198846, 0.8431716324262306, 0.8430102900577247, 0.8431165071169912, 0.845786723315765, 0.8444668531067369, 0.8441692660714925, 0.8443911118281883, 0.8426625076189452, 0.8434893872575381, 0.8434015453013517, 0.8448361478613172, 0.8447492022516224, 0.8445726219927575, 0.8436915133914166, 0.8418463841382525, 0.8447043849270374, 0.8439568319529597, 0.8447630956222437, 0.8446895952099243, 0.8433446272991287, 0.8433298375820155, 0.8463070524541966, 0.8450028683087735, 0.8449396758811084, 0.8448478003657093, 0.8449719443548096, 0.8445811372844286, 0.8444

In [706]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [707]:
# 53
column_to_drop_52 = 'Cat_현재 주택의 점유형태'

In [708]:
if not column_to_drop_52.startswith('Cat_'):
    comp_53 = comp_52.drop(column_to_drop_52, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']
else:
    comp_53 = comp_52.drop(comp_52.filter(regex='^' + column_to_drop_52).columns, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']

print(X_53.shape)

(10564, 17)


In [709]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [710]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [711]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 132, 'learning_rate': 0.09, 'max_depth': 2, 'num_leaves': 878, 'subsample': 0.4, 'colsample_bytree': 0.8, 'reg_alpha': 3, 'reg_lambda': 9, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8364661706749071


In [712]:
optuna_53 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [713]:
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(decimal.Decimal(auc_53).quantize(decimal.Decimal('1.000')))

0.841


In [714]:
X_train = X_train.values
y_train = y_train.values

In [715]:
auc_bootstrap = []

In [716]:
rs = RandomState(seed = 53)
bootstrap_auc(optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.8378611 , 0.84232456])

In [717]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9982060790061951, pvalue=0.027397233992815018),
 0.8401143220483309)

In [718]:
t_53 = auc_bootstrap
print(t_53)

[0.8391008748341758, 0.8366865655587823, 0.8382905776056792, 0.8415331110394033, 0.8395114015273745, 0.8383425657021978, 0.8396584023520132, 0.840826341830698, 0.8393621598365063, 0.8395817647269728, 0.8418141156645513, 0.8381892904521172, 0.8401442221505144, 0.8423411674016708, 0.8391479330249902, 0.839984224301746, 0.8397081495823024, 0.8402504392097808, 0.841435409271808, 0.8390323043275608, 0.8393527481983436, 0.8419875587106951, 0.8397090459287941, 0.8398511168477286, 0.8397794091283927, 0.8402598508479437, 0.8392442902728479, 0.8403087017317414, 0.8394074253343372, 0.8387530923953963, 0.8407057832275644, 0.8400348678785271, 0.841522354881503, 0.8392666989351404, 0.8404942454555233, 0.8416805600372881, 0.8404709404467391, 0.8414242049406618, 0.8394791330536732, 0.8388624466673837, 0.8387598149940841, 0.8397206984331863, 0.839022444516152, 0.8403149761571832, 0.8401379477250726, 0.840536821913879, 0.8401988992865081, 0.8405556451902046, 0.8393778459001111, 0.8403956473414362, 0.840

In [719]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [720]:
# 54
column_to_drop_53 = 'Cat_이사 계획 중인 거주 지역'

In [721]:
if not column_to_drop_53.startswith('Cat_'):
    comp_54 = comp_53.drop(column_to_drop_53, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']
else:
    comp_54 = comp_53.drop(comp_53.filter(regex='^' + column_to_drop_53).columns, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']

print(X_54.shape)

(10564, 10)


In [722]:
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [723]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [724]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 56, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 534, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 10, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8315538369385105


In [725]:
optuna_54 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [726]:
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(decimal.Decimal(auc_54).quantize(decimal.Decimal('1.000')))

0.826


In [727]:
X_train = X_train.values
y_train = y_train.values

In [728]:
auc_bootstrap = []

In [729]:
rs = RandomState(seed = 54)
bootstrap_auc(optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.82326423, 0.83104589])

In [730]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.996757984161377, pvalue=0.00030638056341558695),
 0.8269933058603134)

In [731]:
t_54 = auc_bootstrap
print(t_54)

[0.8263431752178121, 0.8248180416621849, 0.8234901043347317, 0.8257650317306658, 0.8264740418056004, 0.8277809149904987, 0.829147395217095, 0.8238795668853751, 0.8243281883044711, 0.825856907246065, 0.8265210999964147, 0.8282223656376608, 0.8307227241762576, 0.827409827542935, 0.8286893621598366, 0.8290824100964469, 0.8297174715858163, 0.8270414291348465, 0.825701391129755, 0.8249112616973217, 0.8233269692732423, 0.8289564734143631, 0.827708759097917, 0.8251142841776917, 0.8264821089240257, 0.8283016923021762, 0.8292917070022587, 0.8281600695564877, 0.8278974400344197, 0.8289488544691836, 0.8248668925459826, 0.8241471263131477, 0.830345362303252, 0.8278481409773762, 0.8266084937793554, 0.8249502527697107, 0.8283146893263059, 0.8289277903266287, 0.8254678928686673, 0.8292979814277008, 0.8255445304937077, 0.8294799397655157, 0.826367824746334, 0.8273753182030046, 0.8259048617833709, 0.8273067476963896, 0.8283572657846616, 0.8252890717435731, 0.8271821555340432, 0.8259680542110358, 0.8272

In [732]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [733]:
# 55
column_to_drop_54 = '소득 대비 주택 임대료의 비율'

In [734]:
if not column_to_drop_54.startswith('Cat_'):
    comp_55 = comp_54.drop(column_to_drop_54, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']
else:
    comp_55 = comp_54.drop(comp_54.filter(regex='^' + column_to_drop_54).columns, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']

print(X_55.shape)

(10564, 9)


In [735]:
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [736]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [737]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 75, 'learning_rate': 0.01, 'max_depth': 7, 'num_leaves': 978, 'subsample': 0.1, 'colsample_bytree': 0.6, 'reg_alpha': 9, 'reg_lambda': 7, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7768177034349949


In [738]:
optuna_55 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [739]:
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(decimal.Decimal(auc_55).quantize(decimal.Decimal('1.000')))

0.778


In [740]:
X_train = X_train.values
y_train = y_train.values

In [741]:
auc_bootstrap = []

In [742]:
rs = RandomState(seed = 55)
bootstrap_auc(optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77800904, 0.77869653])

In [743]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.5622764825820923, pvalue=0.0), 0.7781624439783441)

In [744]:
t_55 = auc_bootstrap
print(t_55)

[0.7780090351726363, 0.7781246638700656, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7780090351726363, 0.7780090351726363, 0.7780090351726363, 0.7781040479007565, 0.7786015202036499, 0.7786965329317701, 0.7766851314043957, 0.7766851314043957, 0.7786965329317701, 0.7781040479007565, 0.7781040479007565, 0.7786965329317701, 0.7786015202036499, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7786965329317701, 0.7786965329317701, 0.7781040479007565, 0.7786965329317701, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7766851314043957, 0.7786965329317701, 0.7787171489010791, 0.7781040479007565, 0.7781040479007565, 0.7786965329317701, 0.7781246638700656, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565, 0.7781040479007565,

In [745]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc