In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [55]:
from sklearn.utils import resample
from numpy.random import RandomState

In [56]:
import decimal
context = decimal.getcontext()

context.rounding = decimal.ROUND_HALF_UP

In [57]:
class EarlyStoppingCallback(object):
    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [58]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [59]:
중장년가구 = pd.read_csv('중장년가구_변수추가.csv', encoding='cp949')
중장년가구.rename(columns = {'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?':'target'}, inplace=True)

In [60]:
중장년가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'target'    
]

In [61]:
cat = 중장년가구.select_dtypes(include = 'object')
num = 중장년가구.select_dtypes(exclude = 'object')
num_중장년 = num.drop('target',axis=1)
target = 중장년가구.target

In [62]:
scaler=RobustScaler()
scaler.fit(num_중장년)
num_scaled_중장년=scaler.transform(num_중장년)
num_df_scaled_중장년=pd.DataFrame(data=num_scaled_중장년, columns=num_중장년.columns)

In [63]:
enc = OneHotEncoder()
enc.fit(cat)

X_cat = enc.transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(X_cat, columns= new_feature_names)

In [64]:
comp =pd.concat([num_df_scaled_중장년, target,cat2],axis=1)

In [65]:
X =comp.drop('target', axis = 1)
y=comp.target
X.shape

(19949, 214)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)

In [67]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [14]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [15]:
print(study.best_trial.params)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}


In [16]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.7972291832724003


In [17]:
optuna_0 = LGBMClassifier(**study.best_trial.params, random_state = 0)

In [18]:
optuna_0.fit(X_train, y_train)

In [19]:
optuna_0_proba = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_0_proba)
print(decimal.Decimal(auc_0).quantize(decimal.Decimal('1.000')))

0.794


In [20]:
X_train = X_train.values
y_train = y_train.values

In [23]:
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

In [207]:
auc_bootstrap = []
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000):
    for b in range(nsamples):
        idx = rs.randint(X_train.shape[0], size=X_train.shape[0])
        clf.fit(X_train[idx], y_train[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test.ravel(), pred.ravel())
        auc_bootstrap.append(roc_auc)
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [25]:
rs = RandomState(seed = 2024)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77933695, 0.78995656])

In [26]:
np.mean(auc_bootstrap)

0.7847051715684721

In [27]:
shapiro(auc_bootstrap)

ShapiroResult(statistic=0.9988760352134705, pvalue=0.23233012855052948)

In [28]:
t_0 = auc_bootstrap
print(t_0)

[0.783391501987561, 0.7860915102294411, 0.7816686320380902, 0.7829910311437406, 0.7776953589761472, 0.7841977269317171, 0.7858994638551289, 0.7797695654838511, 0.7825963718820863, 0.7861543809819672, 0.7857668541166077, 0.7795978596471207, 0.7831703977024174, 0.7796446164672766, 0.7816459140350767, 0.783000541005467, 0.7874300232674617, 0.7834659959044195, 0.7816424799183421, 0.7818944912540972, 0.7874168151261747, 0.7799359880640669, 0.7803660451443702, 0.7862061568958121, 0.7834567502055186, 0.7855925066516198, 0.7863070670952444, 0.7853566092482348, 0.7855544672047136, 0.7865984386920347, 0.7844222653335955, 0.7896170273017564, 0.7792615803699547, 0.7864988493067311, 0.7820379316684735, 0.7915768513059154, 0.7876115031287445, 0.7891708562890829, 0.7825020657532973, 0.7822902071670544, 0.7811432121776949, 0.783874655795838, 0.7900642549657328, 0.7867585213644326, 0.7794906095398707, 0.784902777513615, 0.7860801512279345, 0.7899342868554692, 0.7826848664287087, 0.7826185615594483, 0.7

In [68]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [69]:
# 1.
column_to_drop = '부채 중 임대 보증금의 비중'

In [70]:
if not column_to_drop.startswith('Cat_'):
    comp_1 = comp.drop(column_to_drop, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']
else:
    comp_1 = comp.drop(comp.filter(regex='^' + column_to_drop).columns, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']

print(X_1.shape)

(19949, 213)


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [33]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [34]:
print(study.best_trial.params)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}


In [35]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.7972291832724003


In [36]:
optuna_1 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [37]:
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(decimal.Decimal(auc_1).quantize(decimal.Decimal('1.000')))

0.794


In [38]:
X_train = X_train.values
y_train = y_train.values

In [39]:
auc_bootstrap = []

In [40]:
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77928243, 0.78995342])

In [41]:
shapiro(auc_bootstrap)

ShapiroResult(statistic=0.9984325766563416, pvalue=0.057379234582185745)

In [42]:
np.mean(auc_bootstrap)

0.784735041912074

In [43]:
t_1 = auc_bootstrap
print(t_1)

[0.779507515960718, 0.7824553089331414, 0.7821922027587053, 0.7853436652697736, 0.7822262797632257, 0.7832874218342198, 0.7855021629652172, 0.7821528424976701, 0.787888609932945, 0.7831939081939081, 0.784329544181761, 0.7829588032790004, 0.7814261305640616, 0.7868095047898003, 0.7863432574023707, 0.7830589209899554, 0.7822196756925821, 0.7850264057160609, 0.783279232786622, 0.7871032538520224, 0.7817111622530342, 0.7856345085409125, 0.7786616665927011, 0.785445632120509, 0.7801053164353656, 0.7832240227560423, 0.7817175021608519, 0.7941777984881433, 0.7816395741272588, 0.7824590072127018, 0.7847094103251739, 0.7881651884114937, 0.7801494316272641, 0.7793957750854303, 0.7845802347033874, 0.7799336105986352, 0.7832602130631687, 0.7843416956717449, 0.7845165714623843, 0.783218475336702, 0.7874767800876175, 0.7854720484030828, 0.7871922767242965, 0.7839816417402625, 0.7840545506801665, 0.7827715118355512, 0.7832496465501391, 0.7867199535918747, 0.7907980992956362, 0.7866679135152042, 0.780

In [13]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

NameError: name 'X_val' is not defined

In [71]:
#### 2. 
column_to_drop_1 = 'Cat_가구주 동거 여부'

In [72]:
if not column_to_drop_1.startswith('Cat_'):
    comp_2 = comp_1.drop(column_to_drop_1, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']
else:
    comp_2 = comp_1.drop(comp_1.filter(regex='^' + column_to_drop_1).columns, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']

print(X_2.shape)

(19949, 211)


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [48]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [49]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 148, 'learning_rate': 0.08, 'max_depth': 8, 'num_leaves': 432, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 7, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7978982971649962


In [50]:
optuna_2 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [51]:
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(decimal.Decimal(auc_2).quantize(decimal.Decimal('1.000')))

0.795


In [52]:
X_train = X_train.values
y_train = y_train.values

In [53]:
auc_bootstrap = []

In [54]:
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78191382, 0.79160633])

In [55]:
shapiro(auc_bootstrap)

ShapiroResult(statistic=0.9981198310852051, pvalue=0.020676197484135628)

In [56]:
np.mean(auc_bootstrap)

0.7869371690039793

In [57]:
t_2 = auc_bootstrap
print(t_2)

[0.7895853277626677, 0.7863390307971589, 0.7878587595336364, 0.7890823617424603, 0.7865432286614553, 0.7896838604966685, 0.7887455541396428, 0.7892582941844025, 0.7815117193196011, 0.7914376374967508, 0.7875333109323257, 0.7879720853858785, 0.7879065730050951, 0.7853827613679831, 0.7879718212230528, 0.7837880103889956, 0.7865036042375945, 0.7833439526789281, 0.7849416094489986, 0.7869870222086971, 0.787856117905379, 0.7869043392442407, 0.7846848431823801, 0.7874091544042282, 0.7855584296470997, 0.7881221298708982, 0.7889859423110653, 0.7910318833964154, 0.7876978843727611, 0.7823668143865189, 0.7889568844002341, 0.7875465190736126, 0.7861942695686538, 0.7899221353654852, 0.7847139010932115, 0.7881387721289198, 0.7817669006092651, 0.7886031703765694, 0.7890585870881437, 0.7893879981318405, 0.7905680134744174, 0.7905516353792216, 0.7899810436756249, 0.7882721743559183, 0.7850195374825917, 0.7870155517938769, 0.78710774462006, 0.788067712328796, 0.7883221011299829, 0.788911448394207, 0.78

In [58]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [73]:
#### 3.
column_to_drop_2 = 'Cat_가구주 주민등록상 등재 여부'

In [74]:
if not column_to_drop_2.startswith('Cat_'):
    comp_3 = comp_2.drop(column_to_drop_2, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']
else:
    comp_3 = comp_2.drop(comp_2.filter(regex='^' + column_to_drop_2).columns, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']

print(X_3.shape)

(19949, 209)


In [61]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [62]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [63]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 162, 'learning_rate': 0.08, 'max_depth': 8, 'num_leaves': 470, 'subsample': 1.0, 'colsample_bytree': 0.5, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7984605014350243


In [64]:
optuna_3 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [65]:
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(decimal.Decimal(auc_3).quantize(decimal.Decimal('1.000')))

0.795


In [66]:
X_train = X_train.values
y_train = y_train.values

In [67]:
auc_bootstrap = []

In [68]:
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7818019 , 0.79165389])

In [69]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989611506462097, pvalue=0.29777535796165466),
 0.7866885666894903)

In [70]:
t_3 = auc_bootstrap
print(t_3)

[0.7906044679443693, 0.7870142309797483, 0.790611072015013, 0.7845382328140948, 0.7852979651009206, 0.784621179941377, 0.7831703977024174, 0.7843379973921846, 0.7878724960005747, 0.7875153478601755, 0.7872934510865546, 0.7830726574568939, 0.7841636499271968, 0.7852210937186307, 0.7857333054377389, 0.7836709862571931, 0.7860508291542774, 0.7902599996196055, 0.780487824207036, 0.7848161321067724, 0.7876701472760586, 0.7865796831314072, 0.7828777052914985, 0.7895044939379914, 0.786220421688402, 0.7889410346306898, 0.7851038054240024, 0.7880655990261901, 0.7879601980587201, 0.7835785292681844, 0.7871579355569503, 0.7874918373686846, 0.7891375717730398, 0.7866071560652842, 0.785570052811432, 0.7861192473261439, 0.7898059037221599, 0.7904232522459123, 0.7857348904146932, 0.7882235683959822, 0.786523152286699, 0.7856416409372073, 0.7838551077467333, 0.7843871316777721, 0.7879805385963021, 0.7921133660049917, 0.7872163155414387, 0.7854472170974633, 0.786602136971595, 0.7847773001713887, 0.7878

In [71]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [75]:
### 4. 
column_to_drop_3 = '소득 중 사회보험 수혜금의 비중(월평균)'

In [76]:
if not column_to_drop_3.startswith('Cat_'):
    comp_4 = comp_3.drop(column_to_drop_3, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']
else:
    comp_4 = comp_3.drop(comp_3.filter(regex='^' + column_to_drop_3).columns, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']

print(X_4.shape)

(19949, 208)


In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [75]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [76]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7976324678332062


In [77]:
optuna_4 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [78]:
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(decimal.Decimal(auc_4).quantize(decimal.Decimal('1.000')))

0.794


In [79]:
X_train = X_train.values
y_train = y_train.values

In [80]:
auc_bootstrap = []

In [81]:
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77893907, 0.7901864 ])

In [82]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991553425788879, pvalue=0.4981984496116638),
 0.7845869384954977)

In [83]:
t_4 = auc_bootstrap
print(t_4)

[0.7910279209540293, 0.786858374912562, 0.787712413328177, 0.7862782733472389, 0.7818773206704241, 0.7861387953752484, 0.7861821180786699, 0.7796792217974484, 0.7860669430866476, 0.7844492099418208, 0.7834062951058025, 0.7815568911628025, 0.7851627137341424, 0.784932099587272, 0.7854126117672916, 0.7903321160710324, 0.7864964718412994, 0.7953369449674867, 0.7870350998429817, 0.7883598764140637, 0.7886961556912295, 0.785587487557931, 0.7825688989482094, 0.7839309224777204, 0.7837930294826847, 0.7849888945948059, 0.7874928940199877, 0.7860262620114837, 0.7865540593373106, 0.7885735841400865, 0.7836435133233162, 0.787716375770563, 0.7831968139849914, 0.7861834388927986, 0.782418326137538, 0.7832562506207826, 0.7827110185484569, 0.7903265686516918, 0.7856670005684784, 0.779400001690642, 0.7804246892916844, 0.7870525345894803, 0.7898336408188624, 0.786604250274201, 0.7876989410240641, 0.7835624153358144, 0.785557108832971, 0.7824267793479616, 0.78744191059462, 0.7809686005498814, 0.77853196

In [77]:
## 5.현재 주택의 위치
column_to_drop_4 = '소득 중 재산소득의 비중(월평균)'

In [78]:
if not column_to_drop_4.startswith('Cat_'):
    comp_5 = comp_4.drop(column_to_drop_4, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']
else:
    comp_5 = comp_4.drop(comp_4.filter(regex='^' + column_to_drop_4).columns, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']

print(X_5.shape)

(19949, 207)


In [86]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [87]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [88]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7967771908526641


In [89]:
optuna_5 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [90]:
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(decimal.Decimal(auc_5).quantize(decimal.Decimal('1.000')))

0.793


In [91]:
X_train = X_train.values
y_train = y_train.values

In [92]:
auc_bootstrap = []

In [93]:
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7794116, 0.7901317])

In [94]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991281628608704, pvalue=0.46610867977142334),
 0.7847451347811569)

In [95]:
t_5 = auc_bootstrap
print(t_5)

[0.7881105067065659, 0.7823250766600521, 0.7894622278858733, 0.7860183371267115, 0.7898027337682512, 0.7834678450441998, 0.7905032935821112, 0.7822556018368827, 0.7810349054191419, 0.7835676985923291, 0.7824360250468625, 0.7863712586618992, 0.782025516015664, 0.7838408429541435, 0.7811125692899092, 0.7890020562434356, 0.7880106531584363, 0.7875045171843201, 0.7899057572702892, 0.7828660821271659, 0.7822463561379818, 0.7858038369122113, 0.783523319237605, 0.7816517256172428, 0.7871534447889128, 0.7865796831314074, 0.7899802511871478, 0.7830214098687005, 0.7840114921395711, 0.7874218342198638, 0.7856773029186823, 0.7835064128167577, 0.7834023326634164, 0.7841480643204781, 0.7889140900224644, 0.7830002768426414, 0.7823517571054517, 0.7880349561384045, 0.7828063813285489, 0.7800221051452578, 0.7780097127387768, 0.7884496917748149, 0.7887682721426563, 0.7842682584061894, 0.7814950770615795, 0.7885120342016894, 0.7831574537239561, 0.784968554057224, 0.7823055286109473, 0.7846737483436991, 0.

In [96]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [79]:
## 6
column_to_drop_5 = '소득 중 사적이전소득의 비중(월평균)'

In [80]:
if not column_to_drop_5.startswith('Cat_'):
    comp_6 = comp_5.drop(column_to_drop_5, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']
else:
    comp_6 = comp_5.drop(comp_5.filter(regex='^' + column_to_drop_5).columns, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']

print(X_6.shape)

(19949, 206)


In [99]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [100]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [101]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7969476683589207


In [102]:
optuna_6 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [103]:
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(decimal.Decimal(auc_6).quantize(decimal.Decimal('1.000')))

0.794


In [104]:
X_train = X_train.values
y_train = y_train.values

In [105]:
auc_bootstrap = []

In [106]:
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77940935, 0.79000442])

In [107]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9976827502250671, pvalue=0.005060562398284674),
 0.7847671831314073)

In [108]:
t_6 = auc_bootstrap
print(t_6)

[0.7834451270411861, 0.7871846160023499, 0.7879596697330686, 0.7867809752046206, 0.7805406567721839, 0.7872810354337447, 0.7879718212230529, 0.7843554321386834, 0.7854876340098015, 0.7849643274520122, 0.7856696421967359, 0.7830362029869419, 0.7861461919343692, 0.7871832951882214, 0.7813146538515997, 0.7847279017229756, 0.7891534215425842, 0.7814189981677666, 0.7816794627139455, 0.7831841341693558, 0.7846999004634473, 0.7897892614641384, 0.7816123653562077, 0.7857990819813481, 0.7837224980082123, 0.780407782870837, 0.7834097292225372, 0.7864404693222427, 0.7846433696187392, 0.7835510563343075, 0.7845192130906417, 0.791515301367518, 0.7881358663378368, 0.7821723905467748, 0.7872308444968543, 0.7825628232032172, 0.7827704551842483, 0.7839237900814255, 0.7858616885710481, 0.786508095005632, 0.7806497560192142, 0.7826172407453196, 0.783324932955475, 0.7861662683091255, 0.784253465287948, 0.7837494426164378, 0.7890305858286154, 0.7848589264845423, 0.7868192788143527, 0.7877655100561505, 0.78

In [109]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [81]:
## 7 .
column_to_drop_6 = 'Cat_현재 주택의 위치'

In [82]:
if not column_to_drop_6.startswith('Cat_'):
    comp_7 = comp_6.drop(column_to_drop_6, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']
else:
    comp_7 = comp_6.drop(comp_6.filter(regex='^' + column_to_drop_6).columns, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']

print(X_7.shape)

(19949, 202)


In [112]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [113]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [114]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 145, 'learning_rate': 0.08, 'max_depth': 8, 'num_leaves': 402, 'subsample': 0.5, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 6, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7972543627587967


In [115]:
optuna_7 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [116]:
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(decimal.Decimal(auc_7).quantize(decimal.Decimal('1.000')))

0.793


In [117]:
X_train = X_train.values
y_train = y_train.values

In [118]:
auc_bootstrap = []

In [119]:
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78111328, 0.79127674])

In [120]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989916086196899, pvalue=0.3244906961917877),
 0.7862544543135677)

In [121]:
t_7 = auc_bootstrap
print(t_7)

[0.7902119219853211, 0.7843461864397825, 0.7865559084770908, 0.7904356678987221, 0.784796319894842, 0.7877192815616462, 0.7874773084132689, 0.7856598681721834, 0.781459150917279, 0.7814496410555524, 0.7837726889451028, 0.7868309019786852, 0.7861118507670233, 0.786213289292107, 0.7874355706868023, 0.7905722400796292, 0.7828967250149517, 0.7849405527976956, 0.784896701768623, 0.7845448368847384, 0.7860505649914518, 0.7856831145008485, 0.7861208323030983, 0.7809236928695057, 0.7851141077742063, 0.7878933648638082, 0.7851550530121959, 0.7823879474125779, 0.7881569993638959, 0.7842246715399425, 0.7842703717087953, 0.7902275075920396, 0.7836123421098791, 0.7884687114982682, 0.7837124598208343, 0.7844415492198744, 0.7892744081167726, 0.7891127404674203, 0.7874659494117622, 0.7861712874028144, 0.7851360332887427, 0.7899292677617802, 0.7862124968036299, 0.7877911338502472, 0.7929261950197911, 0.7886512480108538, 0.7896598216795261, 0.7847553746568525, 0.7885661875809659, 0.7884967127577964, 0.7

In [122]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [83]:
## 8 .
column_to_drop_7 = 'Cat_가구주 장애 여부'

In [84]:
if not column_to_drop_7.startswith('Cat_'):
    comp_8 = comp_7.drop(column_to_drop_7, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']
else:
    comp_8 = comp_7.drop(comp_7.filter(regex='^' + column_to_drop_7).columns, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']

print(X_8.shape)

(19949, 200)


In [125]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [126]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [127]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7973088495162441


In [128]:
optuna_8 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [129]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(decimal.Decimal(auc_8).quantize(decimal.Decimal('1.000')))

0.794


In [130]:
X_train = X_train.values
y_train = y_train.values

In [131]:
auc_bootstrap = []

In [132]:
rs = RandomState(seed = 8)
bootstrap_auc(optuna_8, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77953821, 0.78994426])

In [133]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.998736560344696, pvalue=0.15179851651191711),
 0.7847793845481653)

In [134]:
t_8 = auc_bootstrap
print(t_8)

[0.7858688209673431, 0.786709651241671, 0.7842645601266289, 0.7818966045567031, 0.7866219491835256, 0.7904076666391937, 0.7870615161255555, 0.7877493961237805, 0.7847529971914209, 0.7855135219667239, 0.7861604567269591, 0.7823102835418108, 0.7801185245766526, 0.7909135384504842, 0.7829683131407268, 0.7860513574799288, 0.7828032113746399, 0.7870720826385851, 0.7848462466689068, 0.7847786209855176, 0.7869186040368307, 0.7856865486175832, 0.7891425908667288, 0.7900985961330789, 0.7843961132138473, 0.7847506197259891, 0.7852052439490863, 0.7875494248646958, 0.7833114606513621, 0.7779735224316504, 0.7839784717863535, 0.7881926613453707, 0.7833397260737162, 0.7877718499639682, 0.7866274966028661, 0.7835676985923291, 0.7859919208441376, 0.7872799787824418, 0.7851825259460727, 0.7870704976616307, 0.785870141781472, 0.785938824116164, 0.7861633625180423, 0.7896656332616923, 0.7823314165678698, 0.7845997827524921, 0.7868084481384974, 0.7833804071488801, 0.7863778627325426, 0.7877977379208907, 0.

In [135]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [85]:
#9.
column_to_drop_8 = '부채 중 비금융기관 대출금의 비중'

In [86]:
if not column_to_drop_8.startswith('Cat_'):
    comp_9 = comp_8.drop(column_to_drop_8, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']
else:
    comp_9 = comp_8.drop(comp_8.filter(regex='^' + column_to_drop_8).columns, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']

print(X_9.shape)

(19949, 199)


In [138]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [139]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [140]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7968159920284222


In [141]:
optuna_9 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [142]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(decimal.Decimal(auc_9).quantize(decimal.Decimal('1.000')))

0.794


In [143]:
X_train = X_train.values
y_train = y_train.values

In [144]:
auc_bootstrap = []

In [145]:
rs = RandomState(seed = 9)
bootstrap_auc(optuna_9, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77940222, 0.78988384])

In [146]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9993559718132019, pvalue=0.7534233927726746),
 0.7845597556124034)

In [147]:
t_9 = auc_bootstrap
print(t_9)

[0.787599879964412, 0.7821805795943726, 0.7852068289260408, 0.785198375715617, 0.780849463115473, 0.7799454979257936, 0.7850279906930153, 0.788749516582029, 0.7879351025902749, 0.7789263577440917, 0.7796528055148745, 0.7867194252662233, 0.7839932649045951, 0.788187378088856, 0.7835970206659861, 0.7889729983326041, 0.7839095252888357, 0.783551584659959, 0.7848718704630036, 0.7838786182382242, 0.7801061089238429, 0.7812988040820552, 0.7856741329647734, 0.7816681037124387, 0.7848940601403656, 0.7873235656486888, 0.7827598886712187, 0.7837901236916015, 0.7850958805392302, 0.7815418338817353, 0.7863921275251324, 0.7855597504612283, 0.7838535227697789, 0.7851397315683031, 0.7894733227245542, 0.7805353735156692, 0.7762215945713482, 0.7898320558419081, 0.7801824519804814, 0.7850372363919161, 0.7825303311756514, 0.784056135657121, 0.7838920905423369, 0.7874339857098478, 0.7838025393444114, 0.7856001673735664, 0.784368904442796, 0.7829733322344159, 0.7848795311849499, 0.7823020944942127, 0.78156

In [148]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [None]:
# 여기서 다시 3월 25일 오전 3시

In [87]:
# 10.
column_to_drop_9 = 'Cat_현재 의료시설 접근용이성'

In [208]:
if not column_to_drop_9.startswith('Cat_'):
    comp_10 = comp_9.drop(column_to_drop_9, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']
else:
    comp_10 = comp_9.drop(comp_9.filter(regex='^' + column_to_drop_9).columns, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']

print(X_10.shape)

(19949, 195)


In [209]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [210]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [211]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7968651126658183


In [212]:
optuna_10 = LGBMClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [213]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(decimal.Decimal(auc_10).quantize(decimal.Decimal('1.000')))

0.795


In [214]:
X_train = X_train.values
y_train = y_train.values

In [215]:
auc_bootstrap = []

In [216]:
rs = RandomState(seed = 10)
bootstrap_auc(optuna_10, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77893949, 0.78993159])

In [218]:
from scipy.stats import shapiro

In [219]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9982767701148987, pvalue=0.03451215848326683),
 0.78456024140784)

In [220]:
t_10 = auc_bootstrap
print(t_10)

[0.7833291595606867, 0.7857742506757284, 0.7873822097960028, 0.7861382670495971, 0.7840886276846868, 0.7838905055653824, 0.7825818429266704, 0.7877150549564342, 0.7867262934996925, 0.7817140680441172, 0.7889146183481158, 0.7828571005910907, 0.7840777970088315, 0.7882148510227326, 0.783025900636738, 0.7853354762221756, 0.7870887248966068, 0.7807834224090382, 0.7849009283738347, 0.783234589269072, 0.7805137121639584, 0.7824637621435652, 0.7829852195615743, 0.7886338132643551, 0.7875288201642882, 0.7827931731872619, 0.7846618610165408, 0.7815008886437458, 0.7872620157102915, 0.7848663230436629, 0.7862806508126705, 0.78533521205935, 0.7849484776824678, 0.7846903906017206, 0.7853209472667602, 0.780357327771121, 0.7833434243532766, 0.7773567022335495, 0.7867999949280737, 0.7831986631247715, 0.7817045581823907, 0.7844774753641748, 0.7838429562567494, 0.7866803291680139, 0.7834599201594277, 0.7852707563298696, 0.7836635896980724, 0.7801203737164328, 0.785457783610493, 0.7834321830627249, 0.778

In [221]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [222]:
# 11.
column_to_drop_10 = 'Cat_현재 공공기관 접근용이성'

In [223]:
if not column_to_drop_10.startswith('Cat_'):
    comp_11 = comp_10.drop(column_to_drop_10, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']
else:
    comp_11 = comp_10.drop(comp_10.filter(regex='^' + column_to_drop_10).columns, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']

print(X_11.shape)

(19949, 191)


In [224]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [225]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [226]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7973612723813643


In [227]:
optuna_11 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [228]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(decimal.Decimal(auc_11).quantize(decimal.Decimal('1.000')))

0.793


In [229]:
X_train = X_train.values
y_train = y_train.values

In [230]:
auc_bootstrap = []

In [231]:
rs = RandomState(seed = 11)
bootstrap_auc(optuna_11, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77921164, 0.79005433])

In [232]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9990311861038208, pvalue=0.3619132339954376),
 0.784553287189371)

In [233]:
t_11 = auc_bootstrap
print(t_11)

[0.7850501803703774, 0.7837792930157462, 0.7802957778327236, 0.7832905917881288, 0.7867804468789691, 0.7877906055245957, 0.7824539881190128, 0.7868699980768946, 0.7910701870061476, 0.7887407992087795, 0.7843372049037073, 0.7839972273469811, 0.7839317149661977, 0.7818641125291372, 0.7827273966436528, 0.7838498244902186, 0.7843033920620127, 0.7833278387465579, 0.7805403926093581, 0.7817019165541332, 0.7838852223088675, 0.7851106736574717, 0.7802672482475439, 0.7875388583516663, 0.784906739956001, 0.7835222625863019, 0.787412588520963, 0.7824288926505676, 0.7787845023066697, 0.7815254557865394, 0.784003303091973, 0.78578904379397, 0.7858883690164478, 0.782983370421794, 0.7851896583423678, 0.780551751610865, 0.7839216767788195, 0.7825612382262628, 0.7832723645531527, 0.789223424691405, 0.7848819086503815, 0.7872490717318303, 0.7823718334802079, 0.7820957833273103, 0.7909716542721469, 0.7812361974923552, 0.7853790630884226, 0.7837869537376927, 0.7829218204833968, 0.7849574592185429, 0.78512

In [234]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [235]:
# 12
column_to_drop_11 = 'Cat_현재 주변도로의 보행 안전'

In [236]:
if not column_to_drop_11.startswith('Cat_'):
    comp_12 = comp_11.drop(column_to_drop_11, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']
else:
    comp_12 = comp_11.drop(comp_11.filter(regex='^' + column_to_drop_11).columns, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']

print(X_12.shape)

(19949, 187)


In [237]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [238]:
X_train.shape

(12767, 187)

In [239]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [240]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 198, 'learning_rate': 0.05, 'max_depth': 8, 'num_leaves': 648, 'subsample': 0.6, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 6, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7985116859647479


In [241]:
optuna_12 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [242]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(decimal.Decimal(auc_12).quantize(decimal.Decimal('1.000')))

0.794


In [243]:
X_train = X_train.values
y_train = y_train.values

In [244]:
auc_bootstrap = []

In [245]:
rs = RandomState(seed = 12)
bootstrap_auc(optuna_12, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78314283, 0.79271364])

In [246]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991815686225891, pvalue=0.5302231311798096),
 0.7877996645924601)

In [247]:
t_12 = auc_bootstrap
print(t_12)

[0.7863422007510678, 0.7844066797268768, 0.7907368135200648, 0.7889526577950223, 0.7877219231899035, 0.7850884839801097, 0.784457134826593, 0.7883873493479404, 0.7896186122787108, 0.7843044487133157, 0.7891856494073244, 0.787494478996942, 0.791242949494181, 0.7897411838298538, 0.7885424129266493, 0.7830885072264383, 0.7881900197171132, 0.7844127554718687, 0.7834110500366658, 0.7856950018280068, 0.7894992106814767, 0.7849664407546182, 0.789937456809378, 0.7859927133326148, 0.7864819428858838, 0.7844999292043628, 0.7910149769755681, 0.7878333999023654, 0.7891824794534155, 0.7903595890049092, 0.7893156175175879, 0.7873367737899758, 0.7898500189140583, 0.787099555572462, 0.7869083016866267, 0.7837956711109421, 0.7834506744605265, 0.7841232330148586, 0.7913367272973184, 0.7870432888905795, 0.7869608700889488, 0.7930765036676367, 0.7871489540208753, 0.7871259718550359, 0.7874057202874936, 0.7860497725029745, 0.7874572320385128, 0.7833217630015661, 0.7853835538564602, 0.7862505362505363, 0.78

In [248]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [249]:
# 13.
column_to_drop_12 = 'Cat_현재 주택의 구조'

In [250]:
if not column_to_drop_12.startswith('Cat_'):
    comp_13 = comp_12.drop(column_to_drop_12, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']
else:
    comp_13 = comp_12.drop(comp_12.filter(regex='^' + column_to_drop_12).columns, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']

print(X_13.shape)

(19949, 185)


In [251]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [252]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [253]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 148, 'learning_rate': 0.060000000000000005, 'max_depth': 8, 'num_leaves': 152, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.6, 'reg_alpha': 7, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7973414590150196


In [254]:
optuna_13 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [255]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(decimal.Decimal(auc_13).quantize(decimal.Decimal('1.000')))

0.795


In [256]:
X_train = X_train.values
y_train = y_train.values

In [257]:
auc_bootstrap = []

In [258]:
rs = RandomState(seed = 13)
bootstrap_auc(optuna_13, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78444848, 0.7934541 ])

In [259]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992426037788391, pvalue=0.6075040698051453),
 0.788948353922184)

In [260]:
t_13 = auc_bootstrap
print(t_13)

[0.7900925203880869, 0.790385212799006, 0.787781095662869, 0.7905526920305245, 0.7925986331158745, 0.7872223912864307, 0.7893932813883553, 0.7891029664428679, 0.7881223940337242, 0.7877980020837163, 0.7891719129403858, 0.7880640140492357, 0.790002176701684, 0.7900576508950894, 0.7883744053694792, 0.7880088040186563, 0.7915047348544886, 0.7938774453552778, 0.7867413507807597, 0.790339512630153, 0.7837745380848828, 0.7870858191055236, 0.7888147647999865, 0.7893887906203176, 0.7886029062137436, 0.7877673591959307, 0.7889243923726683, 0.7866296099054719, 0.7848900976979793, 0.7940893039415207, 0.7913113676660475, 0.7853262305232749, 0.7866512712571825, 0.7892453502059414, 0.7875068946497519, 0.7889877914508456, 0.7928131333303747, 0.7894410948598141, 0.7886985331566612, 0.7872099756336208, 0.7880027282736642, 0.7909785225056161, 0.7847279017229757, 0.7902623770850372, 0.7867907492291729, 0.7904113649187541, 0.7921284232860586, 0.7903281536286463, 0.7878812133738242, 0.79150341404036, 0.787

In [261]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [262]:
#14.
column_to_drop_13 = 'Cat_이사 예상 기간'

In [263]:
if not column_to_drop_13.startswith('Cat_'):
    comp_14 = comp_13.drop(column_to_drop_13, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']
else:
    comp_14 = comp_13.drop(comp_13.filter(regex='^' + column_to_drop_13).columns, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']

print(X_14.shape)

(19949, 181)


In [264]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [265]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [266]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7976204972577062


In [267]:
optuna_14 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [268]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(decimal.Decimal(auc_14).quantize(decimal.Decimal('1.000')))

0.794


In [269]:
X_train = X_train.values
y_train = y_train.values

In [270]:
auc_bootstrap = []

In [271]:
rs = RandomState(seed = 14)
bootstrap_auc(optuna_14, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77922155, 0.7901766 ])

In [272]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9993759393692017, pvalue=0.7780375480651855),
 0.7846538295420685)

In [273]:
t_14 = auc_bootstrap
print(t_14)

[0.7847115236277796, 0.7876091256633129, 0.7797463191551861, 0.7816900292269751, 0.783595171526206, 0.7874421747574458, 0.7887500449076805, 0.7838096717407063, 0.7851027487726994, 0.7865897213187854, 0.7832829310661823, 0.7787641617690879, 0.7834242581779527, 0.7906269217845572, 0.7851196551935468, 0.7880156722521254, 0.7869389445744126, 0.7861533243306642, 0.7892860312811052, 0.7860230920575748, 0.7798918728721684, 0.7810840397047294, 0.7839858683454742, 0.7798419460981038, 0.7855124653154211, 0.7815130401337298, 0.7886914007603663, 0.7864333369259477, 0.784681937391297, 0.7828565722654393, 0.7849532326133312, 0.7848504732741186, 0.785850593732367, 0.7819261907931858, 0.7818775848332499, 0.7804529547140384, 0.7817056148336936, 0.7810343770934903, 0.7903727971461961, 0.7874643644348077, 0.7825622948775659, 0.7861012842539937, 0.7803787249600057, 0.7858896898305765, 0.7863617488001724, 0.7842310114477602, 0.7824484406996722, 0.7818963403938772, 0.7834992804204627, 0.785297700938095, 0.7

In [274]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [275]:
## 15.
column_to_drop_14 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [276]:
if not column_to_drop_14.startswith('Cat_'):
    comp_15 = comp_14.drop(column_to_drop_14, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']
else:
    comp_15 = comp_14.drop(comp_14.filter(regex='^' + column_to_drop_14).columns, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']

print(X_15.shape)

(19949, 177)


In [277]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [278]:
X_train.shape

(12767, 177)

In [279]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [280]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 150, 'learning_rate': 0.08, 'max_depth': 8, 'num_leaves': 140, 'subsample': 0.4, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 5, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7980374035078739


In [281]:
optuna_15 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [282]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(decimal.Decimal(auc_15).quantize(decimal.Decimal('1.000')))

0.795


In [283]:
X_train = X_train.values
y_train = y_train.values

In [284]:
auc_bootstrap = []

In [285]:
rs = RandomState(seed = 15)
bootstrap_auc(optuna_15, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78268735, 0.79216176])

In [286]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992210268974304, pvalue=0.5798124670982361),
 0.7875419756050913)

In [287]:
t_15 = auc_bootstrap
print(t_15)

[0.7896928420327436, 0.7878864966303389, 0.786632515696555, 0.7828613271963026, 0.7872374485674978, 0.7883778394862138, 0.7876548258321657, 0.7895935168102656, 0.7880571458157666, 0.784791300801153, 0.7884084823739996, 0.7867498039911833, 0.7905878256863479, 0.7889352230485236, 0.7899105122011526, 0.7848074147335231, 0.7836049455507583, 0.7837460084997031, 0.7880132947866938, 0.7874009653566304, 0.784978592244602, 0.7922636746528373, 0.7885310539251426, 0.7875631613316343, 0.7874516846191724, 0.7842936180374604, 0.7847477139349062, 0.7892110090385953, 0.785662245637615, 0.785687605268886, 0.7866681776780299, 0.7843834333982117, 0.7850895406314125, 0.7898611137527394, 0.7887460824652943, 0.7891848569188471, 0.7906855659318713, 0.7899208145513563, 0.7882011145557944, 0.7896495193293223, 0.7863871084314433, 0.7847096744879996, 0.7853806480653771, 0.7899506649506649, 0.785182261783247, 0.7879459332661303, 0.7882996472897951, 0.788022012159943, 0.7895538923864047, 0.7878360415306228, 0.7896

In [288]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [289]:
# 16.
column_to_drop_15 = 'Cat_소득 계층'

In [290]:
if not column_to_drop_15.startswith('Cat_'):
    comp_16 = comp_15.drop(column_to_drop_15, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']
else:
    comp_16 = comp_15.drop(comp_15.filter(regex='^' + column_to_drop_15).columns, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']

print(X_16.shape)

(19949, 175)


In [291]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [292]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [293]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 166, 'learning_rate': 0.060000000000000005, 'max_depth': 8, 'num_leaves': 368, 'subsample': 0.5, 'colsample_bytree': 0.5, 'reg_alpha': 3, 'reg_lambda': 5, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7973777835199849


In [294]:
optuna_16 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [295]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(decimal.Decimal(auc_16).quantize(decimal.Decimal('1.000')))

0.797


In [296]:
X_train = X_train.values
y_train = y_train.values

In [297]:
auc_bootstrap = []

In [298]:
rs = RandomState(seed = 16)
bootstrap_auc(optuna_16, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78479852, 0.79457462])

In [299]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.998386561870575, pvalue=0.049394410103559494),
 0.7898427266992538)

In [300]:
t_16 = auc_bootstrap
print(t_16)

[0.7868570540984335, 0.787777133220483, 0.789040095690342, 0.789522192847316, 0.7912170615372587, 0.7893887906203176, 0.7916032675884893, 0.7846563135972002, 0.7884245963063696, 0.7930453324541995, 0.7860096197534622, 0.7892168206207616, 0.7893264481934432, 0.7893607893607895, 0.7888533325725444, 0.7902721511095896, 0.7887880843545868, 0.7885672442322689, 0.7903656647499011, 0.7927724522552109, 0.7902312058715999, 0.7901667501421197, 0.7895700063187748, 0.7891484024488951, 0.7882634569826688, 0.7888395961056061, 0.7937543454784833, 0.7916074941937011, 0.7903017373460723, 0.7897002385918642, 0.7883432341560421, 0.788455767519807, 0.7865601350823026, 0.7888454076877722, 0.78807986381878, 0.7869563793209113, 0.7946469516666561, 0.7865083591684576, 0.7909167084043931, 0.7888567666892791, 0.7889545069348025, 0.7893700350596902, 0.787092423176167, 0.7871124995509232, 0.7897221641064005, 0.7896170273017563, 0.789168742986477, 0.7922465040691642, 0.7884610507763217, 0.7920232864814145, 0.79278

In [301]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [302]:
# 17.
column_to_drop_16 = '자산 중 부동산 자산의 비중'

In [303]:
if not column_to_drop_16.startswith('Cat_'):
    comp_17 = comp_16.drop(column_to_drop_16, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']
else:
    comp_17 = comp_16.drop(comp_16.filter(regex='^' + column_to_drop_16).columns, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']

print(X_17.shape)

(19949, 174)


In [304]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [305]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [306]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 143, 'learning_rate': 0.06999999999999999, 'max_depth': 9, 'num_leaves': 510, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7970702635631781


In [307]:
optuna_17 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [308]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(decimal.Decimal(auc_17).quantize(decimal.Decimal('1.000')))

0.795


In [309]:
X_train = X_train.values
y_train = y_train.values

In [310]:
auc_bootstrap = []

In [311]:
rs = RandomState(seed = 17)
bootstrap_auc(optuna_17, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7832801 , 0.79250811])

In [312]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9987480640411377, pvalue=0.15733180940151215),
 0.7880008887758272)

In [313]:
t_17 = auc_bootstrap
print(t_17)

[0.7893148250291109, 0.78879099014567, 0.7883411208534361, 0.7880489567681686, 0.7883952742327127, 0.7841089682222687, 0.7839840192056939, 0.7898703594516403, 0.7878497779975613, 0.7876688264619298, 0.7901995063325112, 0.7891071930480797, 0.7863102370491533, 0.7863274076328264, 0.7871510673234811, 0.785679680384114, 0.7884026707918332, 0.7883559139716775, 0.7876939219303751, 0.7908126282510519, 0.7884560316826327, 0.783753140895998, 0.7828357034022058, 0.7894078103437709, 0.788984357334111, 0.7870179292593087, 0.7882349273974891, 0.7893275048447462, 0.7906810751638338, 0.7863321625636897, 0.7895847994370162, 0.7875399150029692, 0.7864869619795727, 0.7860471308747171, 0.7874527412704752, 0.7877898130361184, 0.7879596697330687, 0.7913916731650723, 0.7912099291409636, 0.7869893996741288, 0.7865416436845007, 0.7886298508219691, 0.787959405570243, 0.7902267151035623, 0.7905196716773071, 0.7840896843359897, 0.7873161690895681, 0.7878944215151111, 0.7917609727954557, 0.7863702020105962, 0.790

In [314]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [315]:
## 18.
column_to_drop_17 ='소득 중 정부 보조금의 비중(월평균)'

In [316]:
if not column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(column_to_drop_17, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']
else:
    comp_18 = comp_17.drop(comp_17.filter(regex='^' + column_to_drop_17).columns, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']

print(X_18.shape)

(19949, 173)


In [317]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [318]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [319]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 143, 'learning_rate': 0.06999999999999999, 'max_depth': 9, 'num_leaves': 510, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7977922130993594


In [320]:
optuna_18 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [321]:
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(decimal.Decimal(auc_18).quantize(decimal.Decimal('1.000')))

0.797


In [322]:
X_train = X_train.values
y_train = y_train.values

In [323]:
auc_bootstrap = []

In [324]:
rs = RandomState(seed = 18)
bootstrap_auc(optuna_18, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78332065, 0.79259708])

In [325]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.998451828956604, pvalue=0.061078932136297226),
 0.7880830179229195)

In [326]:
t_18 = auc_bootstrap
print(t_18)

[0.7896291787917404, 0.7886140010524247, 0.7883342526199668, 0.7888726164588233, 0.7873513027453913, 0.7871497465093524, 0.7914088437487452, 0.7890506622033715, 0.7902531313861363, 0.7910062596023187, 0.7891568556593187, 0.789161610590182, 0.7901598819086504, 0.7869186040368307, 0.7842967879913693, 0.7848739837656095, 0.7885593193474967, 0.7897295606655212, 0.7845324212319287, 0.7868192788143527, 0.7849965553167523, 0.7874059844503194, 0.7908585925827305, 0.7908187039960439, 0.7853941203694899, 0.7891148537700261, 0.7881525085958583, 0.7851674686650056, 0.7873586993045121, 0.7887009106220929, 0.7891307035395705, 0.7893050510045584, 0.7861461919343693, 0.7865382095677662, 0.7848784745336469, 0.7859147852990217, 0.7896534817717084, 0.7914796393860433, 0.7858257624267476, 0.7848013389885311, 0.785779798095069, 0.7851191268678954, 0.7881210732195953, 0.7912738565447924, 0.7891880268727559, 0.787085290779872, 0.7872136739131812, 0.7866813858193169, 0.7914653745934535, 0.7860579615505724, 0.

In [327]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [328]:
# 19
column_to_drop_18 = 'Cat_현재 상업시설 접근용이성'

In [329]:
if not column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(column_to_drop_18, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']
else:
    comp_19 = comp_18.drop(comp_18.filter(regex='^' + column_to_drop_18).columns, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']

print(X_19.shape)

(19949, 169)


In [330]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [331]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [332]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7976015094482927


In [333]:
optuna_19 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [334]:
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(decimal.Decimal(auc_19).quantize(decimal.Decimal('1.000')))

0.793


In [335]:
X_train = X_train.values
y_train = y_train.values

In [336]:
auc_bootstrap = []

In [337]:
rs = RandomState(seed = 19)
bootstrap_auc(optuna_19, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77868161, 0.78961082])

In [338]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9983934164047241, pvalue=0.05051054060459137),
 0.7843987162743321)

In [339]:
t_19 = auc_bootstrap
print(t_19)

[0.7805409209350096, 0.7771535610205561, 0.7853526468058487, 0.7852953234726634, 0.783777443875966, 0.7859390882789897, 0.7862220066653565, 0.7808177635763842, 0.7827921165359589, 0.7861842313812757, 0.7878315507625853, 0.7844848719232955, 0.7846425771302619, 0.7852456608614244, 0.7836189461805225, 0.7860933593692213, 0.78651813319301, 0.7813384285059162, 0.78227250825773, 0.7783362179913903, 0.7819652868913953, 0.7800728244077998, 0.7838894489140795, 0.7844684938280997, 0.7762258211765601, 0.7852464533499017, 0.7858202150074072, 0.7890276800375323, 0.783988245810906, 0.786456319091787, 0.7852313960688345, 0.7764474537873554, 0.785506389570429, 0.7828172120044041, 0.7794388336260256, 0.7876006724528892, 0.7863342758662956, 0.7832345892690722, 0.7856014881876952, 0.7838559002352106, 0.7837230263338637, 0.7854923889406649, 0.7845136656713011, 0.7869484544361391, 0.7841094965479203, 0.7911753238107918, 0.7836440416489678, 0.7861583434243532, 0.7869516243900481, 0.7883213086415057, 0.78644

In [340]:
# 20.
column_to_drop_19 = '소득 중 근로/사업소득의 비중(월평균)'

In [341]:
if not column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(column_to_drop_19, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']
else:
    comp_20 = comp_19.drop(comp_19.filter(regex='^' + column_to_drop_19).columns, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']

print(X_20.shape)

(19949, 168)


In [342]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [343]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [344]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7977108957416535


In [345]:
optuna_20 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [346]:
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(decimal.Decimal(auc_20).quantize(decimal.Decimal('1.000')))

0.793


In [347]:
X_train = X_train.values
y_train = y_train.values

In [348]:
auc_bootstrap = []

In [349]:
rs = RandomState(seed = 20)
bootstrap_auc(optuna_20, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77880293, 0.78983953])

In [350]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991673231124878, pvalue=0.5127094984054565),
 0.7843265973133584)

In [351]:
t_20 = auc_bootstrap
print(t_20)

[0.7862703484624667, 0.7800958065736392, 0.7865030759119429, 0.782435496721211, 0.7839034495438436, 0.7851418448709089, 0.7842584843816371, 0.7851130511229033, 0.7809004465408407, 0.7831157159974894, 0.7841298370855021, 0.7823488513143686, 0.7827342648771219, 0.7836289843679006, 0.7795463478961016, 0.7814707740816116, 0.7902663395274233, 0.7884145581189916, 0.7809701855268358, 0.7861229456057042, 0.7877655100561505, 0.7830459770114943, 0.7887616680720129, 0.7833104040000592, 0.7857269655299213, 0.7820257801784896, 0.7838873356114735, 0.7814715665700888, 0.7842109350730041, 0.7838572210493393, 0.7857153423655887, 0.7869561151580855, 0.7831101685781487, 0.7840978733835877, 0.7801095430405774, 0.7904578575760841, 0.7843239967624204, 0.782238959578861, 0.7844964950876281, 0.7831254900220418, 0.7871204244356954, 0.7831947006823854, 0.786849393376487, 0.7843179210174285, 0.7797978309062052, 0.7813331452494013, 0.7879900484580289, 0.7793249794481323, 0.7927286012261383, 0.7828341184252515, 0.

In [352]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [353]:
# 21
column_to_drop_20 = '부채 중 금융기관 대출금의 비중'

In [354]:
if not column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(column_to_drop_20, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']
else:
    comp_21 = comp_20.drop(comp_20.filter(regex='^' + column_to_drop_20).columns, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']

print(X_21.shape)

(19949, 167)


In [355]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [356]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [357]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7967994808898018


In [358]:
optuna_21 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [359]:
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(decimal.Decimal(auc_21).quantize(decimal.Decimal('1.000')))

0.792


In [360]:
X_train = X_train.values
y_train = y_train.values

In [361]:
auc_bootstrap = []

In [362]:
rs = RandomState(seed = 21)
bootstrap_auc(optuna_21, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77877058, 0.78926753])

In [363]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9987772107124329, pvalue=0.17219695448875427),
 0.7840811948031776)

In [364]:
t_21 = auc_bootstrap
print(t_21)

[0.7821671072902601, 0.783305649069196, 0.7825200288254475, 0.7837925011570331, 0.7873560576762546, 0.783069487502985, 0.7832335326177691, 0.7838115208804864, 0.7867379166640251, 0.7863416724254163, 0.7818181481974585, 0.7828536664743562, 0.7837018933078046, 0.7811326456646653, 0.7777101520943885, 0.7826402229111589, 0.7819713626363873, 0.7836984591910701, 0.7806748514876594, 0.7826933196391325, 0.7864565832546128, 0.7866800650051882, 0.7850261415532352, 0.7891344018191309, 0.7835315082852028, 0.786563833361863, 0.7811772891822153, 0.786317633608274, 0.7883960667211899, 0.7842962596657178, 0.7874551187359069, 0.7816947841578383, 0.7777241527241527, 0.7849231180511969, 0.7824867443094043, 0.7852604539796658, 0.7819652868913952, 0.7854464246089862, 0.78482062287481, 0.783667023814807, 0.7870163442823541, 0.7838035959957143, 0.7815489662780304, 0.7792356924130324, 0.7823100193789848, 0.7848158679439468, 0.7823454171976338, 0.7835278100056425, 0.7854857848700212, 0.7847249959318925, 0.7881

In [365]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [366]:
# 22
column_to_drop_21 = 'Cat_현재 문화시설 접근용이성'

In [367]:
if not column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(column_to_drop_21, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']
else:
    comp_22 = comp_21.drop(comp_21.filter(regex='^' + column_to_drop_21).columns, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']

print(X_22.shape)

(19949, 163)


In [368]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [369]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [370]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.797003393451765


In [371]:
optuna_22 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [372]:
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(decimal.Decimal(auc_22).quantize(decimal.Decimal('1.000')))

0.796


In [373]:
X_train = X_train.values
y_train = y_train.values

In [374]:
auc_bootstrap = []

In [375]:
rs = RandomState(seed = 22)
bootstrap_auc(optuna_22, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78463599, 0.79347983])

In [376]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989874362945557, pvalue=0.320699006319046),
 0.7892169877037488)

In [377]:
t_22 = auc_bootstrap
print(t_22)

[0.7885955096546229, 0.7916587417818945, 0.7931008066476046, 0.7900555375924834, 0.7911372843638853, 0.7912442703083097, 0.7872982060174178, 0.7881908122055905, 0.7900425936140221, 0.7901709767473315, 0.7907777587580543, 0.7919157722113388, 0.7868346002582456, 0.7841731597889232, 0.7855777135333787, 0.7895620814340025, 0.7861290213506963, 0.7856883977573632, 0.7909344073137177, 0.7916827805990369, 0.78930557933021, 0.7884877312217213, 0.7872820920850477, 0.7907473800330944, 0.7904528384823951, 0.7892910503747943, 0.7866111185076703, 0.7862737825792012, 0.7922008039003113, 0.7884610507763217, 0.7925468572020296, 0.7917316507217985, 0.7872355994277176, 0.7866066277396326, 0.7893024093763009, 0.7909053494028863, 0.7923453009659907, 0.7876410893652273, 0.787223183774908, 0.7911718896940572, 0.7912614408919827, 0.7928025668173451, 0.7900383670088104, 0.7837948786224649, 0.7923804346218138, 0.7890493413892429, 0.7873003193200236, 0.7907373418457162, 0.7883450832958222, 0.7864542057891812, 0.

In [378]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [379]:
# 23.
column_to_drop_22 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [380]:
if not column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(column_to_drop_22, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']
else:
    comp_23 = comp_22.drop(comp_22.filter(regex='^' + column_to_drop_22).columns, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']

print(X_23.shape)

(19949, 159)


In [381]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [382]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [383]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 145, 'learning_rate': 0.05, 'max_depth': 8, 'num_leaves': 640, 'subsample': 0.5, 'colsample_bytree': 0.6, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7966682173377687


In [384]:
optuna_23 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [385]:
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(decimal.Decimal(auc_23).quantize(decimal.Decimal('1.000')))

0.796


In [386]:
X_train = X_train.values
y_train = y_train.values

In [387]:
auc_bootstrap = []

In [388]:
rs = RandomState(seed = 23)
bootstrap_auc(optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78597396, 0.79482678])

In [389]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989867210388184, pvalue=0.3200758099555969),
 0.7902936873802021)

In [390]:
t_23 = auc_bootstrap
print(t_23)

[0.7904459702489259, 0.787097442269856, 0.7877517735892121, 0.790294869112603, 0.7879694437576212, 0.7905307665159881, 0.7932653800880402, 0.7902161485905328, 0.789065455321613, 0.7911930227201163, 0.794392827028295, 0.7886435872889075, 0.7914286559606757, 0.7910445632120509, 0.790183656562967, 0.7897599393904813, 0.7902124503109725, 0.7932460962017611, 0.7934801444653661, 0.7914320900774102, 0.7882283233268454, 0.7895269477781793, 0.7892672757204776, 0.7894519255356693, 0.7905096334899291, 0.7863945049905642, 0.7915649639787572, 0.7925162143142439, 0.7883257994095432, 0.7885379221586117, 0.7920090216888246, 0.7925170068027212, 0.7912730640563153, 0.7921934073411906, 0.7870158159567027, 0.7914233727041609, 0.7860450175721111, 0.7873513027453913, 0.7970302286804749, 0.7910007121829783, 0.7932754182754183, 0.786311557863282, 0.7929364973699949, 0.7863628054514754, 0.7904208747804806, 0.7909779941799646, 0.789239538623775, 0.7877631325907188, 0.7887415916972568, 0.7887907259828444, 0.7874

In [391]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [392]:
# 24
column_to_drop_23 = 'Cat_현재 대중교통 접근용이성'

In [393]:
if not column_to_drop_23.startswith('Cat_'):
    comp_24 = comp_23.drop(column_to_drop_23, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']
else:
    comp_24 = comp_23.drop(comp_23.filter(regex='^' + column_to_drop_23).columns, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']

print(X_24.shape)

(19949, 155)


In [394]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [395]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [396]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 162, 'learning_rate': 0.08, 'max_depth': 8, 'num_leaves': 470, 'subsample': 1.0, 'colsample_bytree': 0.5, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7973761324061228


In [397]:
optuna_24 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [398]:
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(decimal.Decimal(auc_24).quantize(decimal.Decimal('1.000')))

0.796


In [399]:
X_train = X_train.values
y_train = y_train.values

In [400]:
auc_bootstrap = []

In [401]:
rs = RandomState(seed = 24)
bootstrap_auc(optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78145963, 0.79133512])

In [402]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9981054663658142, pvalue=0.019730759784579277),
 0.7865336668917334)

In [403]:
t_24 = auc_bootstrap
print(t_24)

[0.7854369147472596, 0.7880426168603509, 0.7835978131544635, 0.7874495713165663, 0.7890134152449424, 0.7870741959411911, 0.7829973710515582, 0.7844180387283834, 0.7903640797729468, 0.7852784170518161, 0.7850771249786028, 0.7855563163444936, 0.7887809519582918, 0.7795389513369809, 0.7862492154364076, 0.7840027747663216, 0.788702759761873, 0.7875676520996719, 0.7824875367978815, 0.7890081319884275, 0.7902127144737983, 0.7870755167553196, 0.7879554431278569, 0.7880228046484203, 0.7892572375330996, 0.7822196756925821, 0.7874646285976334, 0.7833331220030728, 0.7870242691671263, 0.7874260608250756, 0.7857528534868435, 0.7896767281003734, 0.7841813488365212, 0.787798530409368, 0.7837729531079285, 0.7857974970043935, 0.7845033633210974, 0.788613736889599, 0.7836781186534881, 0.7837198563799548, 0.7847696394494424, 0.7842872781296427, 0.7870776300579256, 0.7871291418089448, 0.7907378701713677, 0.7855674111831747, 0.7846314822915809, 0.7846642384819725, 0.7865873438533537, 0.7867252368483895, 0.

In [404]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [405]:
column_to_drop_24 = 'Cat_현재 교육환경'

In [406]:
if not column_to_drop_24.startswith('Cat_'):
    comp_25 = comp_24.drop(column_to_drop_24, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']
else:
    comp_25 = comp_24.drop(comp_24.filter(regex='^' + column_to_drop_24).columns, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']

print(X_25.shape)

(19949, 151)


In [407]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [408]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [409]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7976671412243093


In [410]:
optuna_25 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [411]:
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(decimal.Decimal(auc_25).quantize(decimal.Decimal('1.000')))

0.795


In [412]:
X_train = X_train.values
y_train = y_train.values

In [413]:
auc_bootstrap = []

In [414]:
rs = RandomState(seed = 25)
bootstrap_auc(optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78412929, 0.79328835])

In [415]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992559552192688, pvalue=0.6248380541801453),
 0.7887515254082372)

In [416]:
t_25 = auc_bootstrap
print(t_25)

[0.7895879693909251, 0.790649375624745, 0.7896775205888507, 0.7887772536787316, 0.7862761600446329, 0.7894788701438947, 0.7887598189322327, 0.7907191146107402, 0.7888586158290593, 0.7858809724573271, 0.7896270654891345, 0.789224481342708, 0.7886980048310097, 0.7918325609212309, 0.7917995405680135, 0.790387061938786, 0.7873370379528015, 0.7871608413480334, 0.7904797830906205, 0.7905257474222991, 0.7905466162855326, 0.7884177280729006, 0.7886068686561298, 0.7910593563302921, 0.7921371406593081, 0.7924517585847635, 0.7845009858556657, 0.787410475218357, 0.7856841711521515, 0.786594212086823, 0.7891074572109055, 0.786055319922315, 0.7898201685147497, 0.7862788016728903, 0.7894440006508971, 0.7898376032612485, 0.7865590784309997, 0.7882024353699231, 0.7908007409238936, 0.7834834306509182, 0.786097850137259, 0.7877475469840002, 0.7910054671138416, 0.7910976599400245, 0.7877398862620538, 0.7898582079616562, 0.7907019440270672, 0.7887054013901305, 0.7855217110143219, 0.7882587020518055, 0.7918

In [417]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [418]:
# 26
column_to_drop_25 = 'Cat_현재 주차시설 이용편의성'

In [419]:
if not column_to_drop_25.startswith('Cat_'):
    comp_26 = comp_25.drop(column_to_drop_25, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']
else:
    comp_26 = comp_25.drop(comp_25.filter(regex='^' + column_to_drop_25).columns, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']

print(X_26.shape)

(19949, 147)


In [420]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [421]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [422]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 136, 'learning_rate': 0.060000000000000005, 'max_depth': 7, 'num_leaves': 524, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.5, 'reg_alpha': 7, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7965522265889597


In [423]:
optuna_26 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [424]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(decimal.Decimal(auc_26).quantize(decimal.Decimal('1.000')))

0.796


In [425]:
X_train = X_train.values
y_train = y_train.values

In [426]:
auc_bootstrap = []

In [427]:
rs = RandomState(seed = 26)
bootstrap_auc(optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78657746, 0.79464543])

In [428]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.999151349067688, pvalue=0.49343034625053406),
 0.7906430892098995)

In [429]:
t_26 = auc_bootstrap
print(t_26)

[0.7858534995234503, 0.7894041120642106, 0.7892059899449062, 0.7890102452910335, 0.792030947203361, 0.789779487439586, 0.7887178170429401, 0.7932083209176805, 0.7928271339601389, 0.7889494878411134, 0.79092991654568, 0.7900365178690302, 0.7908654608161997, 0.7935271654483478, 0.7857412303225111, 0.7911850978353441, 0.7891219861663212, 0.7896933703583949, 0.7911821920442611, 0.7940863981504376, 0.7894815117721521, 0.789386413154886, 0.7890326991312212, 0.7902050537518518, 0.7911248687110757, 0.7909130101248327, 0.7912416286800523, 0.7902364891281146, 0.7904628766697732, 0.7893742616649021, 0.7906662820455925, 0.7922182386468101, 0.7927772071860741, 0.7895108338458092, 0.794580118471744, 0.7891174953982836, 0.7904573292504327, 0.7920518160665944, 0.7905069918616717, 0.7882901374280684, 0.7904689524147652, 0.7912712149165352, 0.7884803346626006, 0.786391599199481, 0.7913369914601441, 0.7876749022069219, 0.7928287189370933, 0.7896579725397459, 0.788733138486833, 0.7920605334398437, 0.78676

In [430]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [431]:
# 27
column_to_drop_26 = 'Cat_기초생활보장 수급가구 여부'

In [432]:
if not column_to_drop_26.startswith('Cat_'):
    comp_27 = comp_26.drop(column_to_drop_26, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']
else:
    comp_27 = comp_26.drop(comp_26.filter(regex='^' + column_to_drop_26).columns, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']

print(X_27.shape)

(19949, 145)


In [433]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [434]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [435]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 139, 'learning_rate': 0.08, 'max_depth': 9, 'num_leaves': 432, 'subsample': 0.9, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7972795422451928


In [436]:
optuna_27 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [437]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(decimal.Decimal(auc_27).quantize(decimal.Decimal('1.000')))

0.794


In [438]:
X_train = X_train.values
y_train = y_train.values

In [439]:
auc_bootstrap = []

In [440]:
rs = RandomState(seed = 27)
bootstrap_auc(optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78241562, 0.79182073])

In [441]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9987279176712036, pvalue=0.14777104556560516),
 0.7872153324594827)

In [442]:
t_27 = auc_bootstrap
print(t_27)

[0.7842244073771167, 0.7876809779519138, 0.7880103889956107, 0.7864203929474864, 0.7867849376470066, 0.7897298248283469, 0.7804003863117164, 0.7879287626824573, 0.7850047443643502, 0.789478605981069, 0.7833561041689119, 0.7862835566037536, 0.7873283205795522, 0.7831355282094199, 0.7832713079018496, 0.7864465450672347, 0.784842020063695, 0.7887252136020608, 0.789926361970697, 0.7867503323168348, 0.7853000784035267, 0.7838189174396071, 0.7867582572016069, 0.7919369052373979, 0.7875747844959667, 0.7874334573841962, 0.7903376634903728, 0.7883672729731843, 0.7825694272738608, 0.7860207145921432, 0.7863482764960599, 0.7885865281185478, 0.7870665352192445, 0.788261343680063, 0.7881686225282285, 0.7903067564397613, 0.7880397110692677, 0.7903107188821474, 0.7919086398150437, 0.7859142569733703, 0.7923268095681888, 0.7824753853078976, 0.7848177170837269, 0.7864631873252562, 0.7850948238879274, 0.7850998429816164, 0.7857348904146934, 0.7867149344981857, 0.7876307870150234, 0.7897506936915805, 0.7

In [443]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [444]:
# 28
column_to_drop_27  = 'Cat_이사 계획 중인 주택의 유형'

In [445]:
if not column_to_drop_27.startswith('Cat_'):
    comp_28 = comp_27.drop(column_to_drop_27, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']
else:
    comp_28 = comp_27.drop(comp_27.filter(regex='^' + column_to_drop_27).columns, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']

print(X_28.shape)

(19949, 126)


In [446]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [447]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [448]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 136, 'learning_rate': 0.060000000000000005, 'max_depth': 7, 'num_leaves': 524, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.5, 'reg_alpha': 7, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7956622762173147


In [449]:
optuna_28 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [450]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(decimal.Decimal(auc_28).quantize(decimal.Decimal('1.000')))

0.797


In [451]:
X_train = X_train.values
y_train = y_train.values

In [452]:
auc_bootstrap = []

In [453]:
rs = RandomState(seed = 28)
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78643505, 0.7946142 ])

In [454]:
np.mean(auc_bootstrap)

0.7905559755744486

In [455]:
t_28 = auc_bootstrap
print(t_28)

[0.7923981335311385, 0.7925114593833805, 0.7941682886264168, 0.7905038219077629, 0.7895655155507373, 0.7893721483622962, 0.7924808164955948, 0.7897594110648297, 0.7840690796355821, 0.7896357828623837, 0.7919170930254673, 0.7922990724714862, 0.792044155344648, 0.7929246100428367, 0.7881422062456545, 0.7906821318151368, 0.7900106299121077, 0.791341218065356, 0.7911520774821268, 0.7877166399333887, 0.7890160568731996, 0.7880930719600671, 0.7912292130272425, 0.7913235191560315, 0.7913274815984175, 0.7925220258964101, 0.7910104862075306, 0.7905077843501489, 0.7906665462084181, 0.789089758301581, 0.7916143624271703, 0.7900468202192341, 0.7905320873301168, 0.7911058489876224, 0.7928860422702788, 0.7947478618660885, 0.7867207460803519, 0.7894683036308652, 0.7908741781894492, 0.7911732105081859, 0.7909095760080982, 0.7859987890776068, 0.7908675741188057, 0.7941344757847222, 0.7924411920717339, 0.7909359922906721, 0.788560111835974, 0.7939228813613051, 0.7921104602139084, 0.7895103055201576, 0.7

In [456]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [457]:
# 29
column_to_drop_28 = '현재 무주택 기간(총 개월)'

In [458]:
if not column_to_drop_28.startswith('Cat_'):
    comp_29 = comp_28.drop(column_to_drop_28, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']
else:
    comp_29 = comp_28.drop(comp_28.filter(regex='^' + column_to_drop_28).columns, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']

print(X_29.shape)

(19949, 125)


In [459]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [460]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [461]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'num_leaves': 536, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7957464830242792


In [462]:
optuna_29 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [463]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(decimal.Decimal(auc_29).quantize(decimal.Decimal('1.000')))

0.795


In [464]:
X_train = X_train.values
y_train = y_train.values

In [465]:
auc_bootstrap = []

In [466]:
rs = RandomState(seed = 29)
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78548018, 0.79402727])

In [467]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.999273419380188, pvalue=0.6474655270576477),
 0.7897907781550023)

In [468]:
t_29 = auc_bootstrap
print(t_29)

[0.7870662710564189, 0.7887518940474605, 0.7913808424892168, 0.7890115661051622, 0.7928944954807023, 0.7874516846191724, 0.7922148045300754, 0.7903223420464799, 0.7885284122968852, 0.7885680367207462, 0.7899430042287185, 0.793428632714347, 0.7927964910723532, 0.7948146950610004, 0.788574112465738, 0.7919789071266905, 0.7903860052874832, 0.7901913172849134, 0.7887431766742112, 0.7932820223460617, 0.7900861804802691, 0.7899427400658927, 0.7886581162443231, 0.787480214204352, 0.7880373336038361, 0.7912012117677143, 0.7915158296931695, 0.7921448013812545, 0.7872466942663987, 0.7893909039229235, 0.7885239215288475, 0.7829183863666622, 0.7910308267451125, 0.7887759328646027, 0.7877747557550513, 0.7867204819175262, 0.7908736498637976, 0.7886832117127682, 0.7916896488325058, 0.7857092666205967, 0.789575817900941, 0.7900838030148376, 0.7898782843364125, 0.7947890712669039, 0.7871579355569502, 0.7892028199909974, 0.7884393894246111, 0.7899348151811206, 0.7895634022481313, 0.7908807822600925, 0.7

In [469]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [470]:
column_to_drop_29 = 'Cat_현재 대기오염 정도'

In [471]:
if not column_to_drop_29.startswith('Cat_'):
    comp_30 = comp_29.drop(column_to_drop_29, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']
else:
    comp_30 = comp_29.drop(comp_29.filter(regex='^' + column_to_drop_29).columns, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']

print(X_30.shape)

(19949, 121)


In [472]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [473]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [474]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 178, 'learning_rate': 0.04, 'max_depth': 10, 'num_leaves': 718, 'subsample': 0.8, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 7, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7956503056418147


In [475]:
optuna_30 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [476]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(decimal.Decimal(auc_30).quantize(decimal.Decimal('1.000')))

0.795


In [477]:
X_train = X_train.values
y_train = y_train.values

In [478]:
auc_bootstrap = []

In [479]:
rs = RandomState(seed = 30)
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7843797 , 0.79322632])

In [481]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992871880531311, pvalue=0.6653820872306824),
 0.7887400724968459)

In [483]:
t_30 = auc_bootstrap
print(t_30)

[0.7909241049635138, 0.7882840616830765, 0.7843105244583077, 0.7895018523097341, 0.789512154659938, 0.7892176131092387, 0.7878796283968698, 0.7888501626186355, 0.7905999771763319, 0.7855885442092339, 0.7900404803114163, 0.7879200453092079, 0.7872157872157872, 0.7916225514747681, 0.7917778792163028, 0.788485089593464, 0.7880508059079487, 0.7874699118541484, 0.7892535392535394, 0.7893827148753256, 0.785342344455645, 0.7911618515066791, 0.7874028144964105, 0.7891692713121284, 0.7853328345939183, 0.7887312893470528, 0.7859765994002447, 0.7926646738223093, 0.7872324294738087, 0.7870078910719306, 0.7874728176452314, 0.7943925628654692, 0.7861633625180422, 0.7910149769755681, 0.7921321215656192, 0.7858904823190537, 0.7873410003951875, 0.786164683332171, 0.7913351423203641, 0.7861036617194254, 0.7893385996834272, 0.7877620759394159, 0.7873975312398958, 0.7940253765376918, 0.790071123199202, 0.7882137943714298, 0.7877610192881129, 0.7878376265075773, 0.7874918373686846, 0.7894360757661252, 0.79

In [485]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

NameError: name 'X_train' is not defined

In [486]:
# 31
column_to_drop_30 = 'Cat_현재 청소/쓰레기 처리상태'

In [487]:
if not column_to_drop_30.startswith('Cat_'):
    comp_31 = comp_30.drop(column_to_drop_30, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']
else:
    comp_31 = comp_30.drop(comp_30.filter(regex='^' + column_to_drop_30).columns, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']

print(X_31.shape)

(19949, 117)


In [488]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [489]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [490]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 136, 'learning_rate': 0.060000000000000005, 'max_depth': 7, 'num_leaves': 524, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.5, 'reg_alpha': 7, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7957625813844342


In [491]:
optuna_31= LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [492]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(decimal.Decimal(auc_31).quantize(decimal.Decimal('1.000')))

0.796


In [493]:
X_train = X_train.values
y_train = y_train.values

In [494]:
auc_bootstrap = []

In [495]:
rs = RandomState(seed = 31)
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78682095, 0.79471465])

In [496]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991985559463501, pvalue=0.5513724684715271),
 0.7907533753405059)

In [497]:
t_31 = auc_bootstrap
print(t_31)

[0.7933551954487915, 0.7901955438901253, 0.7913803141635654, 0.7906599421377746, 0.7944792082723118, 0.7869666816711152, 0.789241651926381, 0.7881112991950432, 0.7895285327551338, 0.7905859765465677, 0.7879562356163342, 0.7925296866183567, 0.7924131908122055, 0.7905796366387499, 0.7907236053787778, 0.7930006889366495, 0.7926755044981646, 0.79111033975566, 0.7886937782257979, 0.7911581532271188, 0.7869738140674101, 0.7878566462310304, 0.7905241624453447, 0.794296671759726, 0.7902261867779108, 0.7878880816072935, 0.7923202054975453, 0.7878949498407627, 0.7909827491108279, 0.790486387161264, 0.7898064320478113, 0.7898962474085626, 0.7893969796679157, 0.7948490362283466, 0.7938259336042588, 0.7889880556136714, 0.7893824507125, 0.7889299397920086, 0.7887458183024686, 0.7916447411521303, 0.7895433258733752, 0.790816590693438, 0.7919236970961109, 0.7883324034801867, 0.7893626385005694, 0.7906948116307723, 0.7905334081442456, 0.7893523361503657, 0.7924372296293478, 0.7920771756978653, 0.790999

In [498]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [499]:
# 32
column_to_drop_31 = 'Cat_이사 계획 첫 번째 이유'

In [500]:
if not column_to_drop_31.startswith('Cat_'):
    comp_32 = comp_31.drop(column_to_drop_31, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']
else:
    comp_32 = comp_31.drop(comp_31.filter(regex='^' + column_to_drop_31).columns, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']

print(X_32.shape)

(19949, 104)


In [501]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [502]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [503]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 143, 'learning_rate': 0.06999999999999999, 'max_depth': 9, 'num_leaves': 510, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7952887117060259


In [504]:
optuna_32 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [505]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(decimal.Decimal(auc_32).quantize(decimal.Decimal('1.000')))

0.795


In [506]:
X_train = X_train.values
y_train = y_train.values

In [507]:
auc_bootstrap = []

In [508]:
rs = RandomState(seed = 32)
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78336275, 0.7922787 ])

In [509]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991471767425537, pvalue=0.4884590804576874),
 0.7878556881124614)

In [510]:
t_32 = auc_bootstrap
print(t_32)

[0.7827519637864466, 0.7827163018049716, 0.7871204244356954, 0.787797209595239, 0.787775019917877, 0.7898698311259887, 0.7869864938830455, 0.7843924149342868, 0.7885138833414695, 0.7856345085409124, 0.7866491579545767, 0.788222775907505, 0.7856749254532505, 0.7848790028592985, 0.7848119055015605, 0.788102053496142, 0.7892730873026439, 0.7887167603916372, 0.7870831774772662, 0.7880600516068497, 0.7874757234363146, 0.78942894336983, 0.789071531066605, 0.7930997499963017, 0.7860246770345292, 0.7899004740137745, 0.7890562096227121, 0.7908374595566714, 0.7898764351966323, 0.7859221818581424, 0.7875845585205191, 0.7854731050543858, 0.7836728353969732, 0.784602952706401, 0.7850076501554335, 0.7905315590044655, 0.7881337530352308, 0.7869606059261232, 0.7844848719232956, 0.7862185725486218, 0.7878830625136044, 0.7882312291179286, 0.7849672332430953, 0.7871088012713628, 0.7876297303637204, 0.7863303134239095, 0.7861419653291574, 0.7860957368346531, 0.7883110062913019, 0.7901049360408966, 0.78819

In [511]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [512]:
# 33.
column_to_drop_32 = '자산 중 기타자산의 비중'

In [513]:
if not column_to_drop_32.startswith('Cat_'):
    comp_33 = comp_32.drop(column_to_drop_32, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']
else:
    comp_33 = comp_32.drop(comp_32.filter(regex='^' + column_to_drop_32).columns, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']

print(X_33.shape)

(19949, 103)


In [514]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [515]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [516]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 136, 'learning_rate': 0.060000000000000005, 'max_depth': 7, 'num_leaves': 524, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.5, 'reg_alpha': 7, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7962352127274461


In [517]:
optuna_33 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [518]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(decimal.Decimal(auc_33).quantize(decimal.Decimal('1.000')))

0.796


In [519]:
X_train = X_train.values
y_train = y_train.values

In [520]:
auc_bootstrap = []

In [521]:
rs = RandomState(seed = 33)
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78636225, 0.79434109])

In [522]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9982876181602478, pvalue=0.03575947880744934),
 0.7904739077131319)

In [523]:
t_33 = auc_bootstrap
print(t_33)

[0.7931980185674767, 0.79136763434793, 0.7903352860249412, 0.791622815637594, 0.7851532038724156, 0.7896938986840465, 0.7952183358587299, 0.7928530219170613, 0.788477957197169, 0.7909499929204362, 0.7908353462540655, 0.7885664517437916, 0.7881702075051829, 0.789142326703903, 0.7911261895252044, 0.7880016716223612, 0.7884906370128046, 0.7919635856827975, 0.7886457005915135, 0.7914582421971583, 0.7934645588586475, 0.7960884882067149, 0.7914680162217108, 0.7919398110284809, 0.793765968642816, 0.7883302901775808, 0.7909325581739375, 0.7895042297751658, 0.7890393032018648, 0.7909827491108279, 0.7896931061955693, 0.7894115086233313, 0.7906388091117156, 0.7906055245956723, 0.7925360265261743, 0.7891719129403859, 0.7912556293098165, 0.7921881240846759, 0.7905730325681065, 0.7902079595429349, 0.7889481670269847, 0.7920003043155752, 0.7917414247463508, 0.7872102397964467, 0.7909111609850525, 0.7894078103437709, 0.7895887618794023, 0.7909919948097288, 0.7919538116582451, 0.7923273378938404, 0.789

In [524]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [525]:
# 34
column_to_drop_33 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [526]:
if not column_to_drop_33.startswith('Cat_'):
    comp_34 = comp_33.drop(column_to_drop_33, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']
else:
    comp_34 = comp_33.drop(comp_33.filter(regex='^' + column_to_drop_33).columns, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']

print(X_34.shape)

(19949, 99)


In [527]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [528]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [529]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 145, 'learning_rate': 0.05, 'max_depth': 8, 'num_leaves': 640, 'subsample': 0.5, 'colsample_bytree': 0.6, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7951050252888727


In [530]:
optuna_34 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [531]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(decimal.Decimal(auc_34).quantize(decimal.Decimal('1.000')))

0.797


In [532]:
X_train = X_train.values
y_train = y_train.values

In [533]:
auc_bootstrap = []

In [534]:
rs = RandomState(seed = 34)
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78568827, 0.79418813])

In [535]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9990262985229492, pvalue=0.35714247822761536),
 0.7899180691071085)

In [536]:
t_34 = auc_bootstrap
print(t_34)

[0.7948860190239501, 0.7922948458662744, 0.7897971863489104, 0.7945925341245539, 0.7882531546324649, 0.7925162143142439, 0.7886084536330842, 0.7920790248376455, 0.787789284710467, 0.7902835101110963, 0.7908113074369232, 0.7935023341427281, 0.7898782843364124, 0.7919931719192803, 0.7895354009886029, 0.7918761477874778, 0.7873042817624099, 0.7905143884207924, 0.7896857096364485, 0.7898994173624715, 0.7919971343616663, 0.787114612853529, 0.7884893161986757, 0.7889693000530438, 0.7897113334305453, 0.7899821003269278, 0.7890113019423364, 0.7887886126802384, 0.7869413220398443, 0.7891024381172165, 0.7887024955990474, 0.7932120191972408, 0.7876310511778493, 0.7881009968448393, 0.7897575619250496, 0.7888414452453861, 0.7921722743151314, 0.790580693290053, 0.7887616680720129, 0.792623464421494, 0.7898275650738705, 0.7890057545229959, 0.7890831542309376, 0.7905891465004765, 0.7898426223549375, 0.7897617885302615, 0.7892131223412011, 0.7874971206251994, 0.7892527467650621, 0.7875626330059828, 0.7

In [537]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [538]:
# 35
column_to_drop_34 = '자산 중 금융자산의 비중'

In [539]:
if not column_to_drop_34.startswith('Cat_'):
    comp_35 = comp_34.drop(column_to_drop_34, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']
else:
    comp_35 = comp_34.drop(comp_34.filter(regex='^' + column_to_drop_34).columns, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']

print(X_35.shape)

(19949, 98)


In [540]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [541]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [542]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 143, 'learning_rate': 0.06999999999999999, 'max_depth': 9, 'num_leaves': 510, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.793191797101222


In [543]:
optuna_35 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [544]:
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(decimal.Decimal(auc_35).quantize(decimal.Decimal('1.000')))

0.795


In [545]:
X_train = X_train.values
y_train = y_train.values

In [546]:
auc_bootstrap = []

In [547]:
rs = RandomState(seed = 35)
bootstrap_auc(optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78347881, 0.7924458 ])

In [548]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9994248747825623, pvalue=0.8348005414009094),
 0.7878723647116505)

In [549]:
t_35 = auc_bootstrap
print(t_35)

[0.7857459852533744, 0.7868520350047443, 0.7843895091432038, 0.7842452762403502, 0.7892403311122523, 0.7835365273788919, 0.7889558277489311, 0.7871877859562588, 0.7875547081212106, 0.7866013444831178, 0.7878582312079848, 0.7857784772809403, 0.7903138888360564, 0.7847585446107613, 0.7875628971688085, 0.7837362344751507, 0.7885572060448908, 0.7857145498771114, 0.7843060336902701, 0.7902008271466399, 0.7859287859287859, 0.7857652691396533, 0.7895744970868124, 0.787165332116071, 0.7921667268957909, 0.7873563218390804, 0.7868684130999402, 0.7838487678389155, 0.7868517708419185, 0.7864793012576264, 0.789993195165609, 0.7867669745748563, 0.788910391742904, 0.7907037931668474, 0.7890596437394466, 0.7876815062775653, 0.7888889945540191, 0.7843731310480079, 0.7856773029186823, 0.7888261238014933, 0.7867210102431776, 0.7891085138622084, 0.7875742561703153, 0.7882491921900789, 0.7885688292092233, 0.7876553541578173, 0.786758257201607, 0.7855040121049973, 0.7914833376656036, 0.784001453952193, 0.78

In [550]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [551]:
# 36
column_to_drop_35 = '소득 대비 생활비의 비율'

In [552]:
if not column_to_drop_35.startswith('Cat_'):
    comp_36 = comp_35.drop(column_to_drop_35, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']
else:
    comp_36 = comp_35.drop(comp_35.filter(regex='^' + column_to_drop_35).columns, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']

print(X_36.shape)

(19949, 97)


In [553]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [554]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [555]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 199, 'learning_rate': 0.05, 'max_depth': 9, 'num_leaves': 258, 'subsample': 0.6, 'colsample_bytree': 1.0, 'reg_alpha': 6, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7927071951827103


In [556]:
optuna_36 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [557]:
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(decimal.Decimal(auc_36).quantize(decimal.Decimal('1.000')))

0.793


In [558]:
X_train = X_train.values
y_train = y_train.values

In [559]:
auc_bootstrap = []

In [560]:
rs = RandomState(seed = 36)
bootstrap_auc(optuna_36, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77864535, 0.788985  ])

In [561]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992402195930481, pvalue=0.6044871807098389),
 0.7839145602322942)

In [562]:
t_36 = auc_bootstrap
print(t_36)

[0.7881533010843356, 0.7830190324032689, 0.7823179442637571, 0.7836184178548711, 0.7878476646949553, 0.7886853250153744, 0.7825910886255714, 0.7869217739907395, 0.7839457155959619, 0.7853151356845938, 0.7844584556407217, 0.779887382104131, 0.7796641645163812, 0.7821396343563831, 0.7825691631110351, 0.7861715515656403, 0.7810639633299732, 0.7803515161889547, 0.7772515654289054, 0.783721705519735, 0.7851941491104053, 0.7824436857688091, 0.7875124420690923, 0.7897419763183309, 0.7866396480928499, 0.7838540510954304, 0.7813553349267635, 0.7858645943621312, 0.7843728668851822, 0.7857095307834223, 0.7858616885710481, 0.7837259321249468, 0.7866742534230219, 0.7863828818262315, 0.7815193800415476, 0.7802704182014526, 0.7823158309611513, 0.7776021094986612, 0.7794398902773287, 0.7829725397459388, 0.7814015634212679, 0.7831318299298595, 0.7892419160892068, 0.7863237093532661, 0.7818934346027941, 0.7783018768240443, 0.7847994898487509, 0.7798213413976961, 0.7818073175216032, 0.7841377619702743, 0

In [563]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [564]:
# 37
column_to_drop_36 = '소득 대비 주거관리비의 비율'

In [565]:
if not column_to_drop_36.startswith('Cat_'):
    comp_37 = comp_36.drop(column_to_drop_36, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']
else:
    comp_37 = comp_36.drop(comp_36.filter(regex='^' + column_to_drop_36).columns, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']

print(X_37.shape)

(19949, 96)


In [566]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [567]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [568]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 145, 'learning_rate': 0.05, 'max_depth': 8, 'num_leaves': 640, 'subsample': 0.5, 'colsample_bytree': 0.6, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7937044679553886


In [569]:
optuna_37 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [570]:
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(decimal.Decimal(auc_37).quantize(decimal.Decimal('1.000')))

0.796


In [571]:
X_train = X_train.values
y_train = y_train.values

In [572]:
auc_bootstrap = []

In [573]:
rs = RandomState(seed = 37)
bootstrap_auc(optuna_37, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78648679, 0.79477717])

In [574]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9973708987236023, pvalue=0.0019091066205874085),
 0.7906704161938152)

In [575]:
t_37 = auc_bootstrap
print(t_37)

[0.7911732105081859, 0.7902573579913481, 0.7909563328282541, 0.7954806495446889, 0.7920222298301116, 0.792601010581306, 0.7915921727498082, 0.794443546290837, 0.7878310224369337, 0.7895731762726838, 0.7900872371315721, 0.788348517412557, 0.7894252450902696, 0.7916767048540447, 0.7899017948279032, 0.7914236368669866, 0.789095834046573, 0.7910786402165713, 0.7913272174355918, 0.7913805783263912, 0.7890678327870446, 0.7916724782488329, 0.7902848309252251, 0.7926139545597675, 0.7928585693364019, 0.7916854222272941, 0.7906430357169274, 0.7873499819312628, 0.7887518940474605, 0.7924897980316699, 0.7905896748261281, 0.7891272694228358, 0.7931155997658461, 0.7894373965802537, 0.7888829188090272, 0.7906308842269433, 0.7908839522140014, 0.788922543232888, 0.792592029045231, 0.7933734226837675, 0.7900838030148375, 0.7905812216157044, 0.791364464394021, 0.7898777560107609, 0.7917057627648761, 0.7893066359815128, 0.7897588827391783, 0.7918378441777456, 0.7930543139902746, 0.7929357048815177, 0.7877

In [576]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [577]:
# 38
column_to_drop_37 = '총 이사 횟수'

In [578]:
if not column_to_drop_37.startswith('Cat_'):
    comp_38 = comp_37.drop(column_to_drop_37, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']
else:
    comp_38 = comp_37.drop(comp_37.filter(regex='^' + column_to_drop_37).columns, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']

print(X_38.shape)

(19949, 95)


In [579]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [580]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [581]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7936301678315962


In [582]:
optuna_38 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [583]:
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(decimal.Decimal(auc_38).quantize(decimal.Decimal('1.000')))

0.795


In [584]:
X_train = X_train.values
y_train = y_train.values

In [585]:
auc_bootstrap = []

In [586]:
rs = RandomState(seed = 38)
bootstrap_auc(optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78560965, 0.79379934])

In [587]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9980401396751404, pvalue=0.01595306769013405),
 0.7897409618009987)

In [588]:
t_38 = auc_bootstrap
print(t_38)

[0.7932500586441473, 0.7865984386920346, 0.7952471296067356, 0.7901598819086504, 0.7932331522233, 0.7920700433015704, 0.7879171395181247, 0.7891814228021125, 0.7922686937465262, 0.7909539553628224, 0.7876891669995119, 0.7924504377706347, 0.7892297645992228, 0.7892501051368047, 0.7891457608206377, 0.7883004397782721, 0.7877192815616462, 0.7900241022162203, 0.7932690783676004, 0.7869389445744126, 0.790555333658782, 0.789106928885254, 0.7879266493798514, 0.7891983292229596, 0.7875536514699077, 0.7899432683915442, 0.7871497465093524, 0.7898072245362886, 0.7878827983507786, 0.7880169930662542, 0.7876759588582248, 0.7876550899949915, 0.7905297098646853, 0.7919680764508351, 0.7914608838254158, 0.7901556553034386, 0.7868882253118706, 0.7919752088471299, 0.7870739317783653, 0.7921749159433888, 0.7879572922676371, 0.7880661273518417, 0.7900444427538024, 0.7885759616055183, 0.7895549490377076, 0.7863556730551805, 0.7904058174994135, 0.7911626439951563, 0.7902541880374392, 0.7920412495535648, 0.79

In [589]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [590]:
# 38
column_to_drop_38 = '총 가구원 수'

In [591]:
if not column_to_drop_38.startswith('Cat_'):
    comp_39 = comp_38.drop(column_to_drop_38, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']
else:
    comp_39 = comp_38.drop(comp_38.filter(regex='^' + column_to_drop_38).columns, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']

print(X_39.shape)

(19949, 94)


In [592]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [593]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [594]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 121, 'learning_rate': 0.060000000000000005, 'max_depth': 8, 'num_leaves': 438, 'subsample': 0.6, 'colsample_bytree': 0.9, 'reg_alpha': 3, 'reg_lambda': 5, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7921004108384067


In [595]:
optuna_39 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [596]:
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(decimal.Decimal(auc_39).quantize(decimal.Decimal('1.000')))

0.796


In [597]:
X_train = X_train.values
y_train = y_train.values

In [598]:
auc_bootstrap = []

In [599]:
rs = RandomState(seed = 39)
bootstrap_auc(optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78360121, 0.79321314])

In [600]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989610910415649, pvalue=0.29770857095718384),
 0.7883986132508299)

In [601]:
t_39 = auc_bootstrap
print(t_39)

[0.7914479398469547, 0.782457686398573, 0.7934125187819769, 0.785398082811876, 0.7853589867136666, 0.7882647777967975, 0.7888329920349626, 0.7884296154000586, 0.787122801901127, 0.7894196976709291, 0.7885933963520171, 0.7868134672321865, 0.7858540278491017, 0.7887030239246988, 0.7908472335812237, 0.7880365411153589, 0.7893655442916527, 0.7870240050043007, 0.785141844870909, 0.7905740892194094, 0.7870512137753516, 0.786559870919477, 0.7881218657080726, 0.7858416121962919, 0.7926913542677089, 0.7903276253029947, 0.7869352462948522, 0.7898479056114525, 0.7841581025078561, 0.7871838235138727, 0.7877050167690561, 0.7851907149936707, 0.7883181386875968, 0.7856780954071595, 0.7838873356114736, 0.7835682269179806, 0.7860907177409641, 0.7892123298527239, 0.7868639223319025, 0.7883891984877205, 0.7907840986658721, 0.7896521609575797, 0.7893066359815128, 0.7893903755972721, 0.7869156982457475, 0.7888274446156219, 0.7910775835652684, 0.7913066127351841, 0.7929298932993515, 0.7876450518076135, 0.78

In [602]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [603]:
# 40
column_to_drop_39 = '현재 주택 거주 기간(총 개월)'

In [604]:
if not column_to_drop_39.startswith('Cat_'):
    comp_40 = comp_39.drop(column_to_drop_39, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']
else:
    comp_40 = comp_39.drop(comp_39.filter(regex='^' + column_to_drop_39).columns, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']

print(X_40.shape)

(19949, 93)


In [605]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [606]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [607]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 122, 'learning_rate': 0.060000000000000005, 'max_depth': 8, 'num_leaves': 422, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 3, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7936334700593204


In [608]:
optuna_40 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [609]:
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(decimal.Decimal(auc_40).quantize(decimal.Decimal('1.000')))

0.798


In [610]:
X_train = X_train.values
y_train = y_train.values

In [611]:
auc_bootstrap = []

In [612]:
rs = RandomState(seed = 40)
bootstrap_auc(optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78622153, 0.79498912])

In [613]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991217255592346, pvalue=0.45867201685905457),
 0.7906819468351179)

In [614]:
t_40 = auc_bootstrap
print(t_40)

[0.7930331809642154, 0.788066919840319, 0.7915538691400761, 0.7947460127263083, 0.7922380508587406, 0.7924810806584206, 0.7911209062686896, 0.7908342896027625, 0.7951211239388578, 0.7923394893838244, 0.7888683898536115, 0.7946234411751654, 0.789086588347672, 0.792976914282333, 0.7911983059766311, 0.7882732310072211, 0.7962633639973542, 0.789983949466708, 0.7903410976071075, 0.787603842406798, 0.7942356501469802, 0.7911166796634777, 0.7862803866498448, 0.7902956616010803, 0.7907222845646491, 0.7902047895890261, 0.7898162060723637, 0.788043409348828, 0.78877672535308, 0.7864565832546128, 0.789957665265547, 0.7900935770393899, 0.7912801964526103, 0.7952180716959042, 0.7899916101886546, 0.7882853824972051, 0.7875594630520738, 0.7906316767154205, 0.7906073737354526, 0.7872805071080933, 0.7914489964982576, 0.7939448068758413, 0.7898560946590502, 0.7924126624865541, 0.7925331207350911, 0.7897554486224436, 0.7893811298983713, 0.7910619979585496, 0.7864565832546129, 0.7916722140860072, 0.793926

In [615]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [616]:
# 41.
column_to_drop_40 = 'Cat_현재 주택의 유형'

In [617]:
if not column_to_drop_40.startswith('Cat_'):
    comp_41 = comp_40.drop(column_to_drop_40, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']
else:
    comp_41 = comp_40.drop(comp_40.filter(regex='^' + column_to_drop_40).columns, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']

print(X_41.shape)

(19949, 82)


In [618]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [619]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [620]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 124, 'learning_rate': 0.08, 'max_depth': 6, 'num_leaves': 856, 'subsample': 0.8, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 4, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7927583797124337


In [621]:
optuna_41 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [622]:
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(decimal.Decimal(auc_41).quantize(decimal.Decimal('1.000')))

0.796


In [623]:
X_train = X_train.values
y_train = y_train.values

In [624]:
auc_bootstrap = []

In [625]:
rs = RandomState(seed = 41)
bootstrap_auc(optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78467932, 0.79306595])

In [626]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989213347434998, pvalue=0.26550179719924927),
 0.789006041073621)

In [627]:
t_41 = auc_bootstrap
print(t_41)

[0.7891845927560213, 0.7899680996971638, 0.7925461967949653, 0.7869561151580857, 0.7880280879049351, 0.7880398431506805, 0.7872403543585811, 0.7898920208033507, 0.7891095705135114, 0.7937847242034435, 0.786582060596839, 0.7872406185214067, 0.7903503433060084, 0.7871618979993364, 0.7920357021342244, 0.7912696299395807, 0.784916778143379, 0.7870102685373622, 0.7886475497312935, 0.78959510178722, 0.7852218862071079, 0.7879881993182485, 0.7925951989991399, 0.7927890945132325, 0.7887859710519809, 0.7908694232585858, 0.7902695094813321, 0.7892203868189089, 0.7887954809137074, 0.7870744601040167, 0.7873391512554073, 0.7883751978579565, 0.7894400382085112, 0.7852160746249417, 0.7907074914464077, 0.7915063198314429, 0.7936703416998984, 0.7923276020566661, 0.7903677780525071, 0.7911623798323305, 0.7886443797773847, 0.7882660986109261, 0.7859298425800888, 0.78974699541202, 0.7875296126527653, 0.7893214290997541, 0.7884074257226967, 0.7895282685923081, 0.7900011200503811, 0.7891542140310613, 0.786

In [628]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [629]:
# 42.
column_to_drop_41 = '중기부채부담지표'

In [630]:
if not column_to_drop_41.startswith('Cat_'):
    comp_42 = comp_41.drop(column_to_drop_41, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']
else:
    comp_42 = comp_41.drop(comp_41.filter(regex='^' + column_to_drop_41).columns, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']

print(X_42.shape)

(19949, 81)


In [631]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [632]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [633]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 110, 'learning_rate': 0.04, 'max_depth': 8, 'num_leaves': 898, 'subsample': 0.1, 'colsample_bytree': 0.9, 'reg_alpha': 5, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7925111254115917


In [634]:
optuna_42 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [635]:
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(decimal.Decimal(auc_42).quantize(decimal.Decimal('1.000')))

0.794


In [636]:
X_train = X_train.values
y_train = y_train.values

In [637]:
auc_bootstrap = []

In [638]:
rs = RandomState(seed = 42)
bootstrap_auc(optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7832546 , 0.79217971])

In [639]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989175200462341, pvalue=0.26253998279571533),
 0.7878321158729102)

In [640]:
t_42 = auc_bootstrap
print(t_42)

[0.7897805440908889, 0.7873346604873699, 0.7858822932714559, 0.7841707823234916, 0.7864468092300605, 0.7892271229709653, 0.7883186670132483, 0.7865304167644069, 0.7870995555724619, 0.7929132510413298, 0.787713205816654, 0.7803647243302416, 0.7854805016135066, 0.7873177540665226, 0.7890675686242188, 0.7903697592737001, 0.7878582312079849, 0.7870158159567027, 0.7913433313679619, 0.789907870572895, 0.790643299879753, 0.790146145441712, 0.7865656825016432, 0.7899020589907289, 0.7889912255675802, 0.7832557222951313, 0.7872542229069324, 0.785026405716061, 0.79074579505614, 0.7888427660595148, 0.7924398712576053, 0.7880471076283884, 0.7907180579594373, 0.7869398691443026, 0.7869154340829217, 0.7908715365611917, 0.7826415437252876, 0.7895258911268763, 0.7902784910174073, 0.7903669855640298, 0.7901070493435025, 0.7885122983645151, 0.7891642522184394, 0.7849461002170361, 0.7910429782350964, 0.7850879556544582, 0.7876928652790722, 0.7876442593191362, 0.7879924259234603, 0.787568444588149, 0.78533

In [641]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [642]:
# 43.
column_to_drop_42 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [643]:
if not column_to_drop_42.startswith('Cat_'):
    comp_43 = comp_42.drop(column_to_drop_42, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']
else:
    comp_43 = comp_42.drop(comp_42.filter(regex='^' + column_to_drop_42).columns, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']

print(X_43.shape)

(19949, 77)


In [644]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [645]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [646]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 169, 'learning_rate': 0.09, 'max_depth': 6, 'num_leaves': 456, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7919216777628398


In [647]:
optuna_43 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [648]:
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(decimal.Decimal(auc_43).quantize(decimal.Decimal('1.000')))

0.796


In [649]:
X_train = X_train.values
y_train = y_train.values

In [650]:
auc_bootstrap = []

In [651]:
rs = RandomState(seed = 43)
bootstrap_auc(optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7824616 , 0.79119399])

In [652]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9984950423240662, pvalue=0.07027559727430344),
 0.7869856871297758)

In [653]:
t_43 = auc_bootstrap
print(t_43)

[0.7887011747849186, 0.7865294921945167, 0.7855409949006008, 0.7879012897485804, 0.791511338925132, 0.7847030704173561, 0.7872591099192083, 0.7870010228384612, 0.7853457785723795, 0.7907756454554484, 0.7869265289216028, 0.790030706286864, 0.7857547026266238, 0.7909880323673426, 0.7899741754421558, 0.7873330755104153, 0.7859409374187699, 0.7896371036765125, 0.7886414739863016, 0.7829857478872257, 0.7894656620026079, 0.7862177800601445, 0.7893977721563928, 0.7861097374644173, 0.7840006614637157, 0.7826499969357112, 0.7843836975610374, 0.7870858191055236, 0.7849059474675238, 0.7871880501190847, 0.7857187764823232, 0.7867173119636174, 0.7897932239065244, 0.7904562725991298, 0.7901265973926073, 0.7877414712390083, 0.7915332644396683, 0.7853217397552372, 0.7889328455830918, 0.7895467599901098, 0.7862621594148688, 0.7879110637731327, 0.7820035905011274, 0.7813371076917875, 0.784555403397768, 0.7905157092349211, 0.7852821153313764, 0.78817099999366, 0.7923336778016581, 0.7866623660958636, 0.78

In [654]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [655]:
## 44
column_to_drop_43 = '가구주 나이'

In [656]:
if not column_to_drop_43.startswith('Cat_'):
    comp_44 = comp_43.drop(column_to_drop_43, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']
else:
    comp_44 = comp_43.drop(comp_43.filter(regex='^' + column_to_drop_43).columns, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']

print(X_44.shape)

(19949, 76)


In [657]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [658]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [659]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 156, 'learning_rate': 0.08, 'max_depth': 6, 'num_leaves': 414, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 5, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.790419164148374


In [660]:
optuna_44 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [661]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(decimal.Decimal(auc_44).quantize(decimal.Decimal('1.000')))

0.796


In [662]:
X_train = X_train.values
y_train = y_train.values

In [663]:
auc_bootstrap = []

In [664]:
rs = RandomState(seed = 44)
bootstrap_auc(optuna_44, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78431221, 0.79285703])

In [665]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992485642433167, pvalue=0.615230917930603),
 0.7886905916296311)

In [666]:
t_44 = auc_bootstrap
print(t_44)

[0.7921772934088205, 0.7866174584154879, 0.7881460366066277, 0.7878312865997594, 0.7927750938834683, 0.7878365698562744, 0.7888747297614292, 0.7880228046484204, 0.7876228621302512, 0.7908596492340336, 0.7892178772720644, 0.7856498299848054, 0.7919538116582452, 0.7918205415126597, 0.7844576631522444, 0.7900584433835665, 0.7886023778880922, 0.786983059766311, 0.7913126884801761, 0.7891201370265408, 0.7872810354337447, 0.7906216385280425, 0.7826909421737008, 0.7869302272011631, 0.7887246852764094, 0.7879139695642159, 0.7858569336401848, 0.7865049250517231, 0.786483792025664, 0.7914920550388531, 0.7860666789238218, 0.793307117814507, 0.7891481382860692, 0.7882579095633283, 0.7849133440266445, 0.7830089942158908, 0.7919178855139446, 0.7879617830356747, 0.7903368710018955, 0.789556534014662, 0.7869109433148842, 0.7882823446247091, 0.7877480753096516, 0.7875599913777253, 0.7876498067384767, 0.7884787496856462, 0.7870718184757595, 0.7901176158565322, 0.7921297441001873, 0.7885895659910438, 0.7

In [667]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [668]:
# 45
column_to_drop_44 = 'Cat_현재 주택의 점유형태'

In [669]:
if not column_to_drop_44.startswith('Cat_'):
    comp_45 = comp_44.drop(column_to_drop_44, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']
else:
    comp_45 = comp_44.drop(comp_44.filter(regex='^' + column_to_drop_44).columns, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']

print(X_45.shape)

(19949, 72)


In [670]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [671]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [672]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 122, 'learning_rate': 0.09, 'max_depth': 6, 'num_leaves': 266, 'subsample': 0.6, 'colsample_bytree': 0.4, 'reg_alpha': 1, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7917303549440747


In [673]:
optuna_45 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [674]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(decimal.Decimal(auc_45).quantize(decimal.Decimal('1.000')))

0.798


In [675]:
X_train = X_train.values
y_train = y_train.values

In [676]:
auc_bootstrap = []

In [677]:
rs = RandomState(seed = 45)
bootstrap_auc(optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78618163, 0.79451711])

In [678]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9990577101707458, pvalue=0.3886953294277191),
 0.7905226344615198)

In [679]:
t_45 = auc_bootstrap
print(t_45)

[0.7916016826115349, 0.7917875211594424, 0.7877995870606709, 0.789528268592308, 0.7915381514519446, 0.7903068885211744, 0.7938070459622184, 0.7890493413892429, 0.7906522814158282, 0.7903170587899653, 0.7885038451540916, 0.7919601515660629, 0.7945391732337546, 0.7889991504523524, 0.7943133140177475, 0.7926140866411802, 0.7900178943898156, 0.7886169068435078, 0.7918835443465986, 0.7897973184303234, 0.7934185945269688, 0.7882160397554486, 0.7919211875492664, 0.7843559604643349, 0.7864703197215512, 0.7937252875676521, 0.7941035687341107, 0.7902185260559644, 0.7908370633124328, 0.7885457149619712, 0.7891433833552061, 0.7922137478787725, 0.7890861921034336, 0.7901899964707846, 0.7881786607156066, 0.7900271400887164, 0.7906355070763937, 0.7857156065284144, 0.7893434866957035, 0.7932486057486058, 0.791039412036949, 0.78976126020461, 0.7869672099967666, 0.7927827546054148, 0.7913792575122625, 0.7889673188318508, 0.7906724898719973, 0.790277830610343, 0.7919218479563308, 0.7899498724621877, 0.79

In [680]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [681]:
# 46.
column_to_drop_45 = 'Cat_주택 보유 의식'

In [682]:
if not column_to_drop_45.startswith('Cat_'):
    comp_46 = comp_45.drop(column_to_drop_45, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']
else:
    comp_46 = comp_45.drop(comp_45.filter(regex='^' + column_to_drop_45).columns, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']

print(X_46.shape)

(19949, 70)


In [683]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [684]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [685]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7895238476566772


In [686]:
optuna_46 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [687]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(decimal.Decimal(auc_46).quantize(decimal.Decimal('1.000')))

0.794


In [688]:
X_train = X_train.values
y_train = y_train.values

In [689]:
auc_bootstrap = []

In [690]:
rs = RandomState(seed = 46)
bootstrap_auc(optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78386469, 0.79183731])

In [691]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9990719556808472, pvalue=0.4036564230918884),
 0.7878594365829588)

In [692]:
t_46 = auc_bootstrap
print(t_46)

[0.7884296154000588, 0.7864217137616152, 0.787034835680156, 0.7895998567180833, 0.7899066818401793, 0.7899480233224075, 0.7839177143364335, 0.7859538813972311, 0.7845223830445505, 0.7909560686654282, 0.7883254031653046, 0.7892684644531935, 0.788083826261166, 0.7878278524830249, 0.786450243346795, 0.7910866971827563, 0.7873398116624718, 0.7869027542672863, 0.7857487589630446, 0.7877795106859146, 0.7854371789100854, 0.7870504212868745, 0.7865940800054101, 0.7877818881513463, 0.7863498614730142, 0.7856102055609444, 0.7884763722202146, 0.7896348582924937, 0.7865283034618009, 0.7859298425800889, 0.7877414712390082, 0.7881501311304266, 0.7870250616556035, 0.7885712066746549, 0.7905423896803208, 0.7851447506619921, 0.7862560836698769, 0.7863737682087435, 0.7887277231489054, 0.7855343908299574, 0.7845408744423523, 0.7860270544999609, 0.7924341917568518, 0.7874672702258909, 0.7873658317008071, 0.7892876162580597, 0.7876994693497157, 0.7873452270003994, 0.7889199016046307, 0.7879822556546694, 0.

In [693]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [694]:
# 47.
column_to_drop_46 = 'Cat_가구주 성별'

In [695]:
if not column_to_drop_46.startswith('Cat_'):
    comp_47 = comp_46.drop(column_to_drop_46, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']
else:
    comp_47 = comp_46.drop(comp_46.filter(regex='^' + column_to_drop_46).columns, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']

print(X_47.shape)

(19949, 68)


In [696]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [697]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [698]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 143, 'learning_rate': 0.06999999999999999, 'max_depth': 9, 'num_leaves': 510, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7901568434335408


In [699]:
optuna_47 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [700]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(decimal.Decimal(auc_47).quantize(decimal.Decimal('1.000')))

0.794


In [701]:
X_train = X_train.values
y_train = y_train.values

In [702]:
auc_bootstrap = []

In [703]:
rs = RandomState(seed = 47)
bootstrap_auc(optuna_47, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78366633, 0.79166901])

In [704]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9986288547515869, pvalue=0.10806354880332947),
 0.7878630536324501)

In [705]:
t_47 = auc_bootstrap
print(t_47)

[0.7884156147702945, 0.7889020706138932, 0.7867425395134755, 0.7898453960646079, 0.7870896494664968, 0.7911205100244508, 0.7887805557140533, 0.7880542400246834, 0.7886413419048887, 0.788467786928378, 0.7872111643663368, 0.7876509954711925, 0.7879584810003529, 0.7892230284471664, 0.7869335292364849, 0.7887554602456079, 0.7886373794625026, 0.7891981971415469, 0.7882321536878187, 0.7885240536102605, 0.7904083270462581, 0.784660672283825, 0.786369277440706, 0.7875568214238164, 0.7899824965711665, 0.7860792266580443, 0.7918192206985311, 0.7884607866134961, 0.7846873527292246, 0.7866368743831798, 0.7874676664701294, 0.7892084994917506, 0.787236259834782, 0.7907052460623889, 0.7865481156737315, 0.787568444588149, 0.7898018091983608, 0.7876116352101574, 0.7864086377017412, 0.7898049791522698, 0.7864658289535136, 0.7891554027637773, 0.7881595089107404, 0.7889242602912553, 0.7893264481934432, 0.7882995152083823, 0.7863402195298748, 0.7883602726583023, 0.7838211628236258, 0.7881748303546333, 0.79

In [706]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [707]:
# 48
column_to_drop_47 = 'Cat_가구주 종사상 지위'

In [708]:
if not column_to_drop_47.startswith('Cat_'):
    comp_48 = comp_47.drop(column_to_drop_47, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']
else:
    comp_48 = comp_47.drop(comp_47.filter(regex='^' + column_to_drop_47).columns, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']

print(X_48.shape)

(19949, 63)


In [709]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [710]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [711]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'num_leaves': 536, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7901155655869895


In [712]:
optuna_48 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [713]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(decimal.Decimal(auc_48).quantize(decimal.Decimal('1.000')))

0.793


In [714]:
X_train = X_train.values
y_train = y_train.values

In [715]:
auc_bootstrap = []

In [716]:
rs = RandomState(seed = 48)
bootstrap_auc(optuna_48, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78351663, 0.79091748])

In [717]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991487860679626, pvalue=0.4903935492038727),
 0.7872621518862283)

In [718]:
t_48 = auc_bootstrap
print(t_48)

[0.7877738311851612, 0.7873967387514187, 0.7886290583334918, 0.7886608899539933, 0.7856024127575851, 0.7843185814244927, 0.7875768977985727, 0.7897641659956931, 0.7874297591046359, 0.7853045691715642, 0.7865216993911576, 0.7880817129585602, 0.7910358458388016, 0.7887477995236616, 0.7849943099327336, 0.7879166111924732, 0.7901034831453551, 0.7872997909943722, 0.7870494967169843, 0.7882322857692315, 0.7870954610486629, 0.7881785286341938, 0.7888448793621208, 0.7876771475909408, 0.7868299774087952, 0.787645448051852, 0.788615718110792, 0.7857598538017257, 0.7873894742737109, 0.7832940259048633, 0.7882505130042076, 0.7865234164495248, 0.785596469094006, 0.7898517359724255, 0.787983708550211, 0.7889595260284916, 0.7889320530946147, 0.7876179751179752, 0.7883946138256483, 0.785242094663277, 0.7834818456739638, 0.7898723406728334, 0.7860430363509181, 0.7883157612221651, 0.7874602699110089, 0.7897731475317682, 0.7889728662511915, 0.7888006320888095, 0.7849599687653875, 0.788934694722872, 0.786

In [719]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [720]:
# 49
column_to_drop_48 = 'Cat_현재 거주 지역'

In [721]:
if not column_to_drop_48.startswith('Cat_'):
    comp_49 = comp_48.drop(column_to_drop_48, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']
else:
    comp_49 = comp_48.drop(comp_48.filter(regex='^' + column_to_drop_48).columns, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']

print(X_49.shape)

(19949, 46)


In [722]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [723]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [724]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 181, 'learning_rate': 0.09, 'max_depth': 4, 'num_leaves': 240, 'subsample': 0.4, 'colsample_bytree': 0.2, 'reg_alpha': 3, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7864763042457981


In [725]:
optuna_49 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)


In [726]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(decimal.Decimal(auc_49).quantize(decimal.Decimal('1.000')))

0.788


In [727]:
X_train = X_train.values
y_train = y_train.values

In [728]:
auc_bootstrap = []

In [729]:
rs = RandomState(seed = 49)
bootstrap_auc(optuna_49, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78241437, 0.78870502])

In [730]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9975117444992065, pvalue=0.0029545733705163),
 0.7856184936035613)

In [731]:
t_49 = auc_bootstrap
print(t_49)

[0.7859610137935261, 0.7853704777965861, 0.7840389650734478, 0.7885360730188316, 0.7862024586162517, 0.7863720511503762, 0.7885899622352825, 0.7849521759620282, 0.7845299116850841, 0.785294663065599, 0.7835227909119535, 0.7864255441225885, 0.7834518631932424, 0.7862584611353085, 0.7870505533682874, 0.787748735716716, 0.78744283516451, 0.78476078999478, 0.784601896055098, 0.7868998484762032, 0.7852077534959309, 0.7858134788553507, 0.7864601494527603, 0.7867760881923442, 0.7864229024943311, 0.7825963718820861, 0.7856753216974892, 0.7861753819266135, 0.7829860120500514, 0.7867087266717808, 0.7844517194886653, 0.7879749911769617, 0.7851554492564345, 0.7881707358308343, 0.78650201926064, 0.7845705927602479, 0.7868346002582456, 0.786111058278546, 0.7874684589586067, 0.783883901494739, 0.7868249583151061, 0.7831307732785566, 0.7873203956947799, 0.7871073483758213, 0.7862049681630963, 0.7884770326272789, 0.7876623544726993, 0.785997468263478, 0.7864361106356179, 0.7874609303180732, 0.784435077

In [732]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [733]:
# 50
column_to_drop_49 = '소득 대비 주택 임대료의 비율'

In [734]:
if not column_to_drop_49.startswith('Cat_'):
    comp_50 = comp_49.drop(column_to_drop_49, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']
else:
    comp_50 = comp_49.drop(comp_49.filter(regex='^' + column_to_drop_49).columns, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']

print(X_50.shape)

(19949, 45)


In [735]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [736]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [737]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 181, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 1020, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.785548997423024


In [738]:
optuna_50 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [739]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(decimal.Decimal(auc_50).quantize(decimal.Decimal('1.000')))

0.787


In [740]:
X_train = X_train.values
y_train = y_train.values

In [741]:
auc_bootstrap = []

In [742]:
rs = RandomState(seed = 50)
bootstrap_auc(optuna_50, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.78208027, 0.78721864])

In [743]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9974501132965088, pvalue=0.0024387093726545572),
 0.7846772903709479)

In [744]:
t_50 = auc_bootstrap
print(t_50)

[0.7844939855407834, 0.7842808061404121, 0.7832039463812863, 0.7862419509586998, 0.7845185526835774, 0.7856191870970197, 0.7853163244173096, 0.7816634808629883, 0.7850359155777875, 0.7872215987979534, 0.7816373287432401, 0.785272341306824, 0.7877453015999814, 0.7863543522410518, 0.7832373629787422, 0.7869710403577399, 0.7844942497036093, 0.7847150898259272, 0.7868157126162052, 0.7843376011479459, 0.7848768895566925, 0.7845659699107975, 0.7859496547920194, 0.7840270777462895, 0.7837578958268614, 0.7837475934766575, 0.7842802778147606, 0.7841413281684217, 0.7831624728176453, 0.7836369092526727, 0.7801087505521004, 0.7860131859516096, 0.782763454869366, 0.7856922281183365, 0.783209229637801, 0.7848223399331773, 0.7850651055700317, 0.7861854201139915, 0.7841019679073865, 0.7842376155184037, 0.7843105244583077, 0.7869153020015089, 0.783870825434865, 0.7865320017413613, 0.7857409661596853, 0.785779798095069, 0.7843394502877261, 0.7842242752957038, 0.7839165256037176, 0.7841485926461297, 0.78

In [745]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [746]:
# 51
column_to_drop_50 = 'Cat_가구주 최종 학력'

In [747]:
if not column_to_drop_50.startswith('Cat_'):
    comp_51 = comp_50.drop(column_to_drop_50, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']
else:
    comp_51 = comp_50.drop(comp_50.filter(regex='^' + column_to_drop_50).columns, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']

print(X_51.shape)

(19949, 42)


In [748]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [749]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [750]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 137, 'learning_rate': 0.09999999999999999, 'max_depth': 6, 'num_leaves': 428, 'subsample': 0.9, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 8, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7817295582816364


In [751]:
optuna_51 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [752]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(decimal.Decimal(auc_51).quantize(decimal.Decimal('1.000')))

0.785


In [753]:
X_train = X_train.values
y_train = y_train.values

In [754]:
auc_bootstrap = []

In [755]:
rs = RandomState(seed = 51)
bootstrap_auc(optuna_51, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77802147, 0.7841638 ])

In [756]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9990549087524414, pvalue=0.3858190178871155),
 0.7812721295803192)

In [757]:
t_51 = auc_bootstrap
print(t_51)

[0.7841013075003223, 0.7820945945945946, 0.7815905719230842, 0.7813686751494634, 0.7799498566124182, 0.7829227450532869, 0.7810156215328629, 0.7820288180509856, 0.7818094308242092, 0.7816669149797228, 0.7814891333980003, 0.7802753052137289, 0.7812937849883663, 0.7808388966024433, 0.7807243820174855, 0.780533392294476, 0.7840121525466354, 0.7795203278577663, 0.7824123824739588, 0.7820710841031038, 0.7805822624172378, 0.7819113976749444, 0.778794408412635, 0.7820305351093529, 0.7832377592229809, 0.7800420494386011, 0.7796990340093788, 0.7813780529297771, 0.7821150672135895, 0.7797115817436014, 0.7824127787181974, 0.7788640153172174, 0.7817528999795009, 0.780450313085781, 0.7796979773580759, 0.7794099077966073, 0.7824184582189508, 0.7806593979623536, 0.7800432381713169, 0.781228272607583, 0.7817263516155142, 0.7814129224227747, 0.7822257514375741, 0.7823357752544944, 0.7812824259868596, 0.7825408976886808, 0.7784504684135226, 0.7828742711747638, 0.7775422766186313, 0.7813916573153027, 0.7

In [758]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [759]:
# 52
column_to_drop_51 = '장기부채부담지표'

In [760]:
if not column_to_drop_51.startswith('Cat_'):
    comp_52 = comp_51.drop(column_to_drop_51, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']
else:
    comp_52 = comp_51.drop(comp_51.filter(regex='^' + column_to_drop_51).columns, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']

print(X_52.shape)

(19949, 41)


In [761]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [762]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [763]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 145, 'learning_rate': 0.09, 'max_depth': 9, 'num_leaves': 160, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.9, 'reg_alpha': 10, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7777621380603623


In [764]:
optuna_52 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [765]:
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(decimal.Decimal(auc_52).quantize(decimal.Decimal('1.000')))

0.777


In [766]:
X_train = X_train.values
y_train = y_train.values

In [767]:
auc_bootstrap = []

In [768]:
rs = RandomState(seed = 52)
bootstrap_auc(optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.77013955, 0.77679298])

In [769]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9987909197807312, pvalue=0.1796136051416397),
 0.7735119134132373)

In [770]:
t_52 = auc_bootstrap
print(t_52)

[0.7715142130166761, 0.7777483236227078, 0.77002539133081, 0.7724803885518171, 0.7714411719953591, 0.7715552903360785, 0.7743852666882223, 0.7743317737160103, 0.774394644468536, 0.7735790417440664, 0.7726185457096788, 0.7751862083758636, 0.7763795639411403, 0.7747092623939915, 0.7716383695447735, 0.7713974530476994, 0.7704192581039871, 0.7730495273598722, 0.7725706001568071, 0.7719226087452689, 0.7763013717447214, 0.7720713324161599, 0.7739406806525033, 0.7722660204187297, 0.7722001117937078, 0.771330884015613, 0.7742943946761681, 0.7738029197388804, 0.7740789698917778, 0.772324532484631, 0.7720021217558164, 0.7732748582502277, 0.7716209347982748, 0.7761754981582567, 0.774903289989497, 0.7733642773667405, 0.7733012745328016, 0.774347095159903, 0.7749702552658218, 0.7727570991117789, 0.7711977459514405, 0.7745317449750947, 0.7687955812955812, 0.7734023168136468, 0.7740570443772414, 0.7755968494884752, 0.7739162455911225, 0.7728194415386533, 0.7743390381937181, 0.7708021621198962, 0.7706

In [771]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [772]:
# 53
column_to_drop_52 = 'Cat_이사 계획 중인 거주 지역'

In [773]:
if not column_to_drop_52.startswith('Cat_'):
    comp_53 = comp_52.drop(column_to_drop_52, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']
else:
    comp_53 = comp_52.drop(comp_52.filter(regex='^' + column_to_drop_52).columns, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']

print(X_53.shape)

(19949, 34)


In [774]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [775]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [776]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.775596083062585


In [777]:
optuna_53 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [778]:
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(decimal.Decimal(auc_53).quantize(decimal.Decimal('1.000')))

0.775


In [779]:
X_train = X_train.values
y_train = y_train.values

In [780]:
auc_bootstrap = []

In [781]:
rs = RandomState(seed = 53)
bootstrap_auc(optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76995342, 0.77616416])

In [782]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.99881911277771, pvalue=0.19577518105506897),
 0.7731087002290821)

In [783]:
t_53 = auc_bootstrap
print(t_53)

[0.7749895391521007, 0.773379730892046, 0.7727527404251542, 0.7728050446646505, 0.7725248999879542, 0.7699674075905604, 0.7742423545994973, 0.7728996149562652, 0.7740414587705228, 0.7753976707178678, 0.7735234354692483, 0.7757541584512029, 0.7735457572280232, 0.7746039935079344, 0.7716769373173314, 0.7735836645935168, 0.7738755645159586, 0.7758273315539325, 0.7738129579262585, 0.772394403552039, 0.776591554608796, 0.7732661408769783, 0.7746539202819991, 0.7728607830208816, 0.7718479827469975, 0.7707095730494745, 0.7705372068056797, 0.7734433941330492, 0.7704636374587113, 0.7749320837375024, 0.7759196564615284, 0.7723395897656982, 0.7729846753861532, 0.7738208828110307, 0.7729117664462491, 0.771105156881019, 0.7714510781013244, 0.7710741177489945, 0.7703927097400004, 0.7727784963006636, 0.7696895082978827, 0.7704588825278481, 0.771934496072427, 0.7711439888164027, 0.771458606741858, 0.7734287330962208, 0.7742662613352268, 0.7709135067509452, 0.774467685489853, 0.7725992618233998, 0.7692

In [784]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [785]:
# 54
column_to_drop_53 = 'Cat_이사 계획 중인 주택의 점유형태'

In [786]:
if not column_to_drop_53.startswith('Cat_'):
    comp_54 = comp_53.drop(column_to_drop_53, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']
else:
    comp_54 = comp_53.drop(comp_53.filter(regex='^' + column_to_drop_53).columns, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']

print(X_54.shape)

(19949, 10)


In [787]:
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [788]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [789]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 139, 'learning_rate': 0.09, 'max_depth': 6, 'num_leaves': 436, 'subsample': 0.8, 'colsample_bytree': 0.4, 'reg_alpha': 8, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7662227096677258


In [790]:
optuna_54 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [791]:
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(decimal.Decimal(auc_54).quantize(decimal.Decimal('1.000')))

0.766


In [792]:
X_train = X_train.values
y_train = y_train.values

In [793]:
auc_bootstrap = []

In [794]:
rs = RandomState(seed = 54)
bootstrap_auc(optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76216961, 0.76711483])

In [795]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9995373487472534, pvalue=0.9365851283073425),
 0.7647019016949743)

In [796]:
t_54 = auc_bootstrap
print(t_54)

[0.7659440756731398, 0.7645448051851991, 0.7629425255656783, 0.7631772342363475, 0.7664051718854674, 0.7646717354229668, 0.7643970060841982, 0.7626401912116199, 0.7654990933931822, 0.7660570052811433, 0.7667561121994619, 0.7655282833854263, 0.7652131371343194, 0.7656566665187352, 0.7661894829382514, 0.7637953752485772, 0.7646404321281168, 0.7636366133903079, 0.7649965236172133, 0.7640838410542845, 0.7642843406390204, 0.7666429184286327, 0.7666002561322758, 0.7649078969891777, 0.7645261817059847, 0.7645418993941162, 0.7634297738977542, 0.7652394213354804, 0.7657808230468328, 0.7623659901985024, 0.7663082241284211, 0.7626107370565499, 0.7663526034831453, 0.7652856498299849, 0.765019109538814, 0.764370986045863, 0.7632702195510077, 0.7662914497889868, 0.7645566925123575, 0.7639322115923103, 0.7651060191084823, 0.7657658978471786, 0.76340058390551, 0.7651435302297372, 0.7653752010279103, 0.7643991193868042, 0.7664832320004734, 0.7653497093152265, 0.7660194941598881, 0.7671143169911644, 0.7

In [797]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [798]:
# 55
column_to_drop_54 = '현재 주택의 면적(㎡)'

In [799]:
if not column_to_drop_54.startswith('Cat_'):
    comp_55 = comp_54.drop(column_to_drop_54, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']
else:
    comp_55 = comp_54.drop(comp_54.filter(regex='^' + column_to_drop_54).columns, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']

print(X_55.shape)

(19949, 9)


In [800]:
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [801]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [802]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 166, 'learning_rate': 0.01, 'max_depth': 7, 'num_leaves': 768, 'subsample': 0.5, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 2, 'reg_lambda': 8, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7352387324894216


In [803]:
optuna_55 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [804]:
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(decimal.Decimal(auc_55).quantize(decimal.Decimal('1.000')))

0.742


In [805]:
X_train = X_train.values
y_train = y_train.values

In [806]:
auc_bootstrap = []

In [807]:
rs = RandomState(seed = 55)
bootstrap_auc(optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74220667, 0.74237203])

In [808]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.2366672158241272, pvalue=0.0), 0.7423214084950539)

In [809]:
t_55 = auc_bootstrap
print(t_55)

[0.7423720342439556, 0.7423419196818213, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7422367828771771, 0.7422367828771771, 0.7422367828771771, 0.7423720342439556, 0.7423419196818213, 0.7423720342439556, 0.7423419196818213, 0.74155524278677, 0.7423720342439556, 0.7423419196818213, 0.7423720342439556, 0.7423720342439556, 0.7422367828771771, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7423419196818213, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0.7422066683150428, 0.7423720342439556, 0.7423720342439556, 0.7423720342439556, 0

In [810]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc