In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, precision_recall_curve,auc, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OneHotEncoder
import matplotlib
import sklearn
#from skopt import BayesSearchCV, space
import optuna
import optuna.study
from optuna import Trial
from optuna import distributions
from optuna import integration
from optuna.study import create_study
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
import joblib
plt.rcParams['font.family'] = 'NanumGothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import operator

In [2]:
from sklearn.utils import resample
from numpy.random import RandomState

In [3]:
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=2000):
    for b in range(nsamples):
        idx = rs.randint(X_train.shape[0], size=X_train.shape[0])
        clf.fit(X_train[idx], y_train[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test.ravel(), pred.ravel())
        auc_bootstrap.append(roc_auc)
    return np.percentile(auc_bootstrap, (2.5, 97.5))

In [4]:
import decimal
context = decimal.getcontext()

context.rounding = decimal.ROUND_HALF_UP

In [5]:
class EarlyStoppingCallback(object):
    def __init__(self, early_stopping_rounds: int, direction: str = "minimize"):
        self.early_stopping_rounds = early_stopping_rounds

        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            ValueError(f"invalid direction: {direction}")

    def __call__(self, study, trial):
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

In [6]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [7]:
신혼가구 = pd.read_csv('신혼가구_변수추가.csv', encoding='cp949')
신혼가구.rename(columns = {'문41. 귀 가구는 공공임대주택 입주 기회를 준다면 입주할 의향이 있으십니까?':'target'}, inplace=True)

In [8]:
신혼가구.columns = [
    'Cat_현재 거주 지역', 'Cat_현재 주택의 유형','Cat_현재 주택의 위치',
    '현재 주택 거주 기간(총 개월)','현재 무주택 기간(총 개월)',
    'Cat_현재 주택의 점유형태','Cat_현재 주택의 구조', '현재 주택의 면적(㎡)',
    'Cat_현재 상업시설 접근용이성', 'Cat_현재 의료시설 접근용이성',
    'Cat_현재 공공기관 접근용이성', 'Cat_현재 문화시설 접근용이성',
    'Cat_현재 도시공원 및 녹지 접근용이성', 'Cat_현재 대중교통 접근용이성',
    'Cat_현재 주차시설 이용편의성', 'Cat_현재 주변도로의 보행 안전',
    'Cat_현재 교육환경', 'Cat_현재 치안 및 범죄 등 방범 상태',
    'Cat_현재 자동차 경적/집주변의 소음 정도', 'Cat_현재 청소/쓰레기 처리상태',
    'Cat_현재 대기오염 정도', 'Cat_현재 주택에 대한 전반적인 만족도',
    '총 이사 횟수', 'Cat_이사 예상 기간','Cat_이사 계획 첫 번째 이유',
    'Cat_이사 계획 중인 거주 지역', 'Cat_이사 계획 중인 주택의 유형', 'Cat_이사 계획 중인 주택의 점유형태',
    'Cat_주택 보유 의식', 'Cat_현재 가장 필요한 주거지원 1순위',
    '가구주 나이','Cat_가구주 성별','Cat_가구주 주민등록상 등재 여부','Cat_가구주 동거 여부','Cat_가구주 장애 여부',
    '총 가구원 수','Cat_기초생활보장 수급가구 여부','Cat_소득 계층',
    '소득 대비 주택 임대료의 비율', '소득 중 근로/사업소득의 비중(월평균)',
    '소득 중 재산소득의 비중(월평균)', '소득 중 사회보험 수혜금의 비중(월평균)',
    '소득 중 정부 보조금의 비중(월평균)', '소득 중 사적이전소득의 비중(월평균)', 
    '소득 대비 생활비의 비율', '소득 대비 주거관리비의 비율',
    '자산 중 부동산 자산의 비중', '자산 중 금융자산의 비중', '자산 중 기타자산의 비중',
    '부채 중 금융기관 대출금의 비중', '부채 중 비금융기관 대출금의 비중', '부채 중 임대 보증금의 비중',
    '중기부채부담지표', '장기부채부담지표', 'Cat_가구주 최종 학력', 'Cat_가구주 종사상 지위',
    'Cat_주택 마련 예상 소요연수','Cat_남편/아내의 부모님과 동거 의향','Cat_가족계획 시 중요 고려 사항 1순위',
    'target'    
]

In [9]:
cat = 신혼가구.select_dtypes(include = 'object')
num = 신혼가구.select_dtypes(exclude = 'object')
num_신혼 = num.drop('target',axis=1)
target = 신혼가구.target

In [10]:
scaler=RobustScaler()
scaler.fit(num_신혼)
num_scaled_신혼=scaler.transform(num_신혼)
num_df_scaled_신혼=pd.DataFrame(data=num_scaled_신혼, columns=num_신혼.columns)

In [11]:
enc = OneHotEncoder()
enc.fit(cat)

X_cat = enc.transform(cat).toarray()
new_feature_names = enc.get_feature_names_out(cat.columns)
cat2 = pd.DataFrame(X_cat, columns= new_feature_names)

In [12]:
comp =pd.concat([num_df_scaled_신혼, target,cat2],axis=1)

In [13]:
X =comp.drop('target', axis = 1)
y=comp.target
X.shape

(6119, 221)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state = 0)

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [16]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [17]:
print(study.best_trial.params)

{'n_estimators': 74, 'learning_rate': 0.08, 'max_depth': 5, 'num_leaves': 674, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 5, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}


In [18]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.795365570664196


In [19]:
optuna_0 = LGBMClassifier(**study.best_trial.params, random_state = 0)

In [20]:
optuna_0.fit(X_train, y_train)

In [21]:
optuna_0_proba = optuna_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, optuna_0_proba)
print(decimal.Decimal(auc_0).quantize(decimal.Decimal('1.000')))

0.779


In [22]:
X_train = X_train.values
y_train = y_train.values

In [23]:
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

In [24]:
auc_bootstrap = []

In [25]:
rs = RandomState(seed = 2024)
bootstrap_auc(optuna_0, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.759642  , 0.77914455])

In [26]:
np.mean(auc_bootstrap)

0.7695165621795967

In [27]:
from scipy.stats import shapiro

In [28]:
shapiro(auc_bootstrap)

ShapiroResult(statistic=0.9989734888076782, pvalue=0.30837133526802063)

In [29]:
t_0 = auc_bootstrap
print(t_0)

[0.7684872693096376, 0.7691440960355433, 0.7687115516062885, 0.7690025845864663, 0.7638948222829802, 0.7718915541695147, 0.7705538704716337, 0.7740542763157896, 0.7705618805536568, 0.7679799641148326, 0.7683030374231032, 0.7704737696514012, 0.7705405203349283, 0.7697688824333562, 0.7706900418660287, 0.7740943267259057, 0.7644368378332194, 0.7628775418660287, 0.7727566430280246, 0.7647118506493507, 0.7698730134996582, 0.7738326640464799, 0.7731331168831168, 0.765390037593985, 0.7726044514695831, 0.7627333603896105, 0.7634329075529733, 0.7692909475393028, 0.7754907510252905, 0.7730263157894737, 0.7769459159261791, 0.7630537636705399, 0.7663539174641147, 0.7715818309979494, 0.7654701384142173, 0.775274478810663, 0.7682790071770336, 0.7741503973000684, 0.7758565447710184, 0.7625758287764867, 0.7702147769993165, 0.7591875640806561, 0.7693203178400547, 0.7704737696514012, 0.7693229878673957, 0.7565682672590568, 0.7694110987696514, 0.7650669642857142, 0.7795972530758715, 0.7734001196172249, 0

In [30]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [31]:
# 1.
column_to_drop = '소득 중 사회보험 수혜금의 비중(월평균)'

In [32]:
if not column_to_drop.startswith('Cat_'):
    comp_1 = comp.drop(column_to_drop, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']
else:
    comp_1 = comp.drop(comp.filter(regex='^' + column_to_drop).columns, axis=1)
    X_1 = comp_1.drop('target', axis=1)
    y_1 = comp_1['target']

print(X_1.shape)

(6119, 220)


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, shuffle=True, stratify=y_1, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [34]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [35]:
print(study.best_trial.params)

{'n_estimators': 74, 'learning_rate': 0.08, 'max_depth': 5, 'num_leaves': 674, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 5, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}


In [36]:
optuna_auc = study.best_trial.value
print(optuna_auc)

0.795365570664196


In [37]:
optuna_1 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_1.fit(X_train, y_train)

In [38]:
optuna_1_proba = optuna_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, optuna_1_proba)
print(decimal.Decimal(auc_1).quantize(decimal.Decimal('1.000')))

0.779


In [39]:
X_train = X_train.values
y_train = y_train.values

In [40]:
auc_bootstrap = []

In [41]:
rs = RandomState(seed = 1)
bootstrap_auc(optuna_1, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7599024 , 0.77882915])

In [42]:
shapiro(auc_bootstrap)

ShapiroResult(statistic=0.9986600875854492, pvalue=0.11935637146234512)

In [43]:
np.mean(auc_bootstrap)

0.7694551562232997

In [44]:
t_1 = auc_bootstrap
print(t_1)

[0.7593851461038962, 0.7702601674641147, 0.7674833390293917, 0.7613022257347914, 0.7610352230006836, 0.7593531057758031, 0.7752744788106629, 0.768631450786056, 0.7638974923103212, 0.7721185064935064, 0.7753171992481203, 0.7669413234791524, 0.7664019779562543, 0.7711572966507177, 0.7727112525632261, 0.7670881749829118, 0.7789697966507179, 0.7674432886192755, 0.7598657510252906, 0.769899713773069, 0.7674032382091592, 0.7776214328434723, 0.768994574504443, 0.7710157852016404, 0.7689571941216679, 0.7659721035543403, 0.7747912038619276, 0.7711412764866712, 0.7658920027341081, 0.7701239960697197, 0.767560769822283, 0.7699611244019138, 0.7725430408407381, 0.7666769907723855, 0.7763798701298701, 0.766191045796309, 0.7771701982228298, 0.7698089328434723, 0.7665915498974709, 0.7698890336637048, 0.7650829844497606, 0.775205058099795, 0.7738166438824334, 0.7724442498291182, 0.7664874188311688, 0.7751569976076556, 0.7624076170539987, 0.7641057544429255, 0.7662257561517429, 0.7624957279562544, 0.764

In [45]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [46]:
#### 2. 
column_to_drop_1 = '소득 중 재산소득의 비중(월평균)'

In [47]:
if not column_to_drop_1.startswith('Cat_'):
    comp_2 = comp_1.drop(column_to_drop_1, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']
else:
    comp_2 = comp_1.drop(comp_1.filter(regex='^' + column_to_drop_1).columns, axis=1)
    X_2 = comp_2.drop('target', axis=1)
    y_2 = comp_2['target']

print(X_2.shape)

(6119, 219)


In [48]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, shuffle=True, stratify=y_2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [49]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [50]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 120, 'learning_rate': 0.06999999999999999, 'max_depth': 4, 'num_leaves': 144, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 10, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7939840900174459


In [51]:
optuna_2 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_2.fit(X_train, y_train)

In [52]:
optuna_2_proba = optuna_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, optuna_2_proba)
print(decimal.Decimal(auc_2).quantize(decimal.Decimal('1.000')))

0.777


In [53]:
X_train = X_train.values
y_train = y_train.values

In [54]:
auc_bootstrap = []

In [55]:
rs = RandomState(seed = 2)
bootstrap_auc(optuna_2, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75974246, 0.77837545])

In [56]:
shapiro(auc_bootstrap)

ShapiroResult(statistic=0.998955249786377, pvalue=0.2927720546722412)

In [57]:
np.mean(auc_bootstrap)

0.769193359375

In [58]:
t_2 = auc_bootstrap
print(t_2)

[0.7666663106630212, 0.7661296351674642, 0.7647892814422419, 0.7721051563568011, 0.7648159817156528, 0.7667837918660287, 0.776246368762816, 0.7762356886534518, 0.7701560363978127, 0.7703589584757348, 0.7716512517088175, 0.7769245557074504, 0.7732719583048531, 0.7683163875598085, 0.7752718087833219, 0.7667437414559124, 0.7596468087833219, 0.7769405758714968, 0.7739608253588517, 0.7677369916267942, 0.7778136748120301, 0.7725724111414901, 0.7653179468557757, 0.769899713773069, 0.7816638542378673, 0.7637426307245386, 0.7779418361244019, 0.7698463132262474, 0.7725777511961722, 0.7639535628844839, 0.7702681775461382, 0.7752958390293916, 0.7657852016404649, 0.770564550580998, 0.7682816772043747, 0.7706713516746412, 0.7774612312030076, 0.7671015251196173, 0.7695472701640466, 0.7653660073479152, 0.764687820403281, 0.7669359834244702, 0.7689091336295284, 0.7694244489063569, 0.7655395591250854, 0.7624957279562542, 0.7701533663704716, 0.7624610176008202, 0.7789511064593302, 0.7715417805878333, 0.7

In [59]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [60]:
#### 3.
column_to_drop_2 = 'Cat_가구주 장애 여부'

In [61]:
if not column_to_drop_2.startswith('Cat_'):
    comp_3 = comp_2.drop(column_to_drop_2, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']
else:
    comp_3 = comp_2.drop(comp_2.filter(regex='^' + column_to_drop_2).columns, axis=1)
    X_3 = comp_3.drop('target', axis=1)
    y_3 = comp_3['target']

print(X_3.shape)

(6119, 217)


In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, stratify=y_3, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [63]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [64]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 120, 'learning_rate': 0.06999999999999999, 'max_depth': 4, 'num_leaves': 144, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 10, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7939840900174459


In [65]:
optuna_3 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_3.fit(X_train, y_train)

In [66]:
optuna_3_proba = optuna_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, optuna_3_proba)
print(decimal.Decimal(auc_3).quantize(decimal.Decimal('1.000')))

0.777


In [67]:
X_train = X_train.values
y_train = y_train.values

In [68]:
auc_bootstrap = []

In [69]:
rs = RandomState(seed = 3)
bootstrap_auc(optuna_3, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75935865, 0.77841243])

In [70]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9990593194961548, pvalue=0.39034736156463623),
 0.7690812983274948)

In [71]:
t_3 = auc_bootstrap
print(t_3)

[0.7709997650375939, 0.76953125, 0.769699461722488, 0.7795945830485305, 0.7687062115516061, 0.7782488892686261, 0.7612221249145591, 0.770132006151743, 0.773106416609706, 0.767194976076555, 0.7745268711551607, 0.7695018796992481, 0.7660255041011619, 0.7719716549897471, 0.7727673231373889, 0.7638868122009569, 0.7685486799384826, 0.7756269224196857, 0.7665514994873548, 0.7692028366370474, 0.7701934167805878, 0.7636144694121667, 0.7625037380382775, 0.7707888328776487, 0.7710504955570745, 0.769731502050581, 0.770169386534518, 0.757211743848257, 0.7567177887901572, 0.7668825828776487, 0.7688530630553657, 0.7720384056732742, 0.7661189550580998, 0.7663058569719754, 0.7707915029049899, 0.7665888798701298, 0.7743319591592619, 0.7627680707450445, 0.7738647043745728, 0.7666075700615174, 0.7634943181818181, 0.77544002050581, 0.7757337235133288, 0.7612488251879699, 0.7759873761107313, 0.7695365900546822, 0.7668211722488039, 0.7721398667122351, 0.7674593087833219, 0.7740222359876965, 0.76620973598769

In [72]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [73]:
### 4. 
column_to_drop_3 = 'Cat_가구주 동거 여부'

In [74]:
if not column_to_drop_3.startswith('Cat_'):
    comp_4 = comp_3.drop(column_to_drop_3, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']
else:
    comp_4 = comp_3.drop(comp_3.filter(regex='^' + column_to_drop_3).columns, axis=1)
    X_4 = comp_4.drop('target', axis=1)
    y_4 = comp_4['target']

print(X_4.shape)

(6119, 215)


In [75]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, shuffle=True, stratify=y_4, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [76]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [77]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 120, 'learning_rate': 0.06999999999999999, 'max_depth': 4, 'num_leaves': 144, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 10, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7939840900174459


In [78]:
optuna_4 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_4.fit(X_train, y_train)

In [79]:
optuna_4_proba = optuna_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, optuna_4_proba)
print(decimal.Decimal(auc_4).quantize(decimal.Decimal('1.000')))

0.777


In [80]:
X_train = X_train.values
y_train = y_train.values

In [81]:
auc_bootstrap = []

In [82]:
rs = RandomState(seed = 4)
bootstrap_auc(optuna_4, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75936926, 0.7784681 ])

In [83]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9965349435806274, pvalue=0.00016210651665460318),
 0.7693221628289473)

In [84]:
t_4 = auc_bootstrap
print(t_4)

[0.7774745813397128, 0.7651844454887218, 0.7675340695488722, 0.7609871625085441, 0.7640657040328093, 0.7793542805878333, 0.7641751751537936, 0.7653446471291866, 0.7675580997949419, 0.7684792592276146, 0.7666769907723855, 0.7760728169856459, 0.7678144224196856, 0.7699370941558442, 0.7739608253588516, 0.7692722573479154, 0.7623355263157894, 0.7631071642173617, 0.7705912508544087, 0.7604638371496925, 0.7666743207450444, 0.7690186047505126, 0.7693897385509226, 0.774807224025974, 0.7611660543403964, 0.7732559381408066, 0.7687382518796991, 0.7707487824675325, 0.7655529092617908, 0.767761021872864, 0.7681214755639096, 0.7716619318181819, 0.768158855946685, 0.7637533108339029, 0.7724762901572112, 0.7584666566985647, 0.7740809765892003, 0.7599351717361585, 0.772564401059467, 0.7630671138072453, 0.7787081339712919, 0.7623889268626112, 0.7745322112098427, 0.7668745727956254, 0.7639455528024606, 0.759750939849624, 0.7679078733766234, 0.7728340738209158, 0.7703696385850992, 0.7701773966165414, 0.76

In [85]:
## 5.
column_to_drop_4 = 'Cat_가구주 주민등록상 등재 여부'

In [86]:
if not column_to_drop_4.startswith('Cat_'):
    comp_5 = comp_4.drop(column_to_drop_4, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']
else:
    comp_5 = comp_4.drop(comp_4.filter(regex='^' + column_to_drop_4).columns, axis=1)
    X_5 = comp_5.drop('target', axis=1)
    y_5 = comp_5['target']

print(X_5.shape)

(6119, 213)


In [87]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, shuffle=True, stratify=y_5, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [88]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [89]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7961251763370313


In [90]:
optuna_5 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_5.fit(X_train, y_train)

In [91]:
optuna_5_proba = optuna_5.predict_proba(X_test)[:, 1]
auc_5 = roc_auc_score(y_test, optuna_5_proba)
print(decimal.Decimal(auc_5).quantize(decimal.Decimal('1.000')))

0.775


In [92]:
X_train = X_train.values
y_train = y_train.values

In [93]:
auc_bootstrap = []

In [94]:
rs = RandomState(seed = 5)
bootstrap_auc(optuna_5, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75534366, 0.77648674])

In [95]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991717338562012, pvalue=0.518109142780304),
 0.7658939932394908)

In [96]:
t_5 = auc_bootstrap
print(t_5)

[0.76715759569378, 0.7655982997265893, 0.7631285244360901, 0.7619296821599453, 0.7721318566302118, 0.774606971975393, 0.763683890123035, 0.7678518028024608, 0.7767456638755981, 0.7626078691045797, 0.7726925623718386, 0.7655021787423104, 0.7669439935064934, 0.7667731117566643, 0.76295497265892, 0.7539703306561858, 0.755505596377307, 0.7585841379015721, 0.7590967831510594, 0.7726204716336295, 0.7725777511961722, 0.7589873120300752, 0.7661590054682158, 0.7721238465481886, 0.7686901913875599, 0.7592142643540669, 0.7657852016404647, 0.7573986457621327, 0.7679532638414217, 0.7617988508202325, 0.7631952751196173, 0.7653206168831168, 0.7697154818865345, 0.7757978041695147, 0.7582877648667122, 0.7659694335269993, 0.7647225307587151, 0.765224495898838, 0.764153814935065, 0.771304148154477, 0.7649414730006835, 0.7810791182501708, 0.7754800709159262, 0.7665675196514012, 0.7674032382091593, 0.7626799598427887, 0.7740969967532467, 0.767061474709501, 0.7714723598769649, 0.7583037850307588, 0.75943587

In [97]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [98]:
## 6
column_to_drop_5 = 'Cat_기초생활보장 수급가구 여부'

In [99]:
if not column_to_drop_5.startswith('Cat_'):
    comp_6 = comp_5.drop(column_to_drop_5, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']
else:
    comp_6 = comp_5.drop(comp_5.filter(regex='^' + column_to_drop_5).columns, axis=1)
    X_6 = comp_6.drop('target', axis=1)
    y_6 = comp_6['target']

print(X_6.shape)

(6119, 211)


In [100]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y_6, test_size=0.2, shuffle=True, stratify=y_6, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [101]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [102]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7961251763370313


In [103]:
optuna_6 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_6.fit(X_train, y_train)

In [104]:
optuna_proba_6 = optuna_6.predict_proba(X_test)[:, 1]
auc_6 = roc_auc_score(y_test, optuna_proba_6)
print(decimal.Decimal(auc_6).quantize(decimal.Decimal('1.000')))

0.775


In [105]:
X_train = X_train.values
y_train = y_train.values

In [106]:
auc_bootstrap = []

In [107]:
rs = RandomState(seed = 6)
bootstrap_auc(optuna_6, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75563015, 0.77589172])

In [108]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9981856942176819, pvalue=0.025631437078118324),
 0.7659252152042036)

In [109]:
t_6 = auc_bootstrap
print(t_6)

[0.761454417293233, 0.7704844497607656, 0.7624369873547505, 0.762319506151743, 0.7569153708133971, 0.7720624359193438, 0.7661563354408749, 0.7693390080314422, 0.7631525546821599, 0.7731838474025974, 0.7648186517429938, 0.7648346719070402, 0.7649574931647298, 0.7696967916951469, 0.7662391062884484, 0.77019875683527, 0.7619376922419686, 0.7631712448735475, 0.7671362354750513, 0.7655529092617908, 0.7689198137388926, 0.7688210227272726, 0.7648987525632263, 0.7613289260082023, 0.7705031399521531, 0.7604424769309638, 0.7672403665413534, 0.767293767088175, 0.7645836893369788, 0.7714323094668489, 0.7614650974025974, 0.7781180579289131, 0.7596708390293917, 0.7661296351674642, 0.761489127648667, 0.761147364149009, 0.779517152255639, 0.7703028879015721, 0.7727352828092959, 0.7593584458304853, 0.7647252007860561, 0.7616172889610389, 0.7674753289473685, 0.7621673145933013, 0.7570355220437457, 0.7663832877648667, 0.7739928656869446, 0.7698329630895422, 0.7730903964456596, 0.7613582963089542, 0.77075

In [110]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [111]:
## 7 .
column_to_drop_6 = '부채 중 임대 보증금의 비중'

In [112]:
if not column_to_drop_6.startswith('Cat_'):
    comp_7 = comp_6.drop(column_to_drop_6, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']
else:
    comp_7 = comp_6.drop(comp_6.filter(regex='^' + column_to_drop_6).columns, axis=1)
    X_7 = comp_7.drop('target', axis=1)
    y_7 = comp_7['target']

print(X_7.shape)

(6119, 210)


In [113]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y_7, test_size=0.2, shuffle=True, stratify=y_7, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [114]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [115]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7961251763370313


In [116]:
optuna_7 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_7.fit(X_train, y_train)

In [117]:
optuna_proba_7 = optuna_7.predict_proba(X_test)[:, 1]
auc_7 = roc_auc_score(y_test, optuna_proba_7)
print(decimal.Decimal(auc_7).quantize(decimal.Decimal('1.000')))

0.775


In [118]:
X_train = X_train.values
y_train = y_train.values

In [119]:
auc_bootstrap = []

In [120]:
rs = RandomState(seed = 7)
bootstrap_auc(optuna_7, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75513166, 0.77564721])

In [121]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9977033138275146, pvalue=0.005401795729994774),
 0.7659747588965311)

In [122]:
t_7 = auc_bootstrap
print(t_7)

[0.7681989063568011, 0.7650322539302803, 0.7611206638755981, 0.7580474624060151, 0.776174278024607, 0.7648320018796992, 0.7659801136363636, 0.7663298872180451, 0.7677316515721122, 0.759582728127136, 0.7732425880041013, 0.7658759825700615, 0.7728287337662337, 0.7702040968899521, 0.7651390550239234, 0.7706713516746411, 0.7604825273410799, 0.772537700786056, 0.7637079203691046, 0.7701880767259057, 0.7678865131578948, 0.7625624786397813, 0.7702922077922078, 0.7632166353383458, 0.7581168831168831, 0.7617080698906356, 0.7706419813738892, 0.7592756749829118, 0.7610966336295284, 0.763021723342447, 0.7687622821257689, 0.7636091293574847, 0.7670054041353382, 0.7595960782638415, 0.7666636406356802, 0.764754571086808, 0.7679265635680109, 0.7620071129528366, 0.7639055023923444, 0.7636465097402598, 0.7670027341079972, 0.7642953263841422, 0.7671148752563226, 0.7592222744360901, 0.7681188055365687, 0.766826512303486, 0.7729408749145591, 0.7656036397812714, 0.7659267130895419, 0.7608002605946685, 0.759

In [123]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [124]:
## 8 .
column_to_drop_7 = '자산 중 부동산 자산의 비중'

In [125]:
if not column_to_drop_7.startswith('Cat_'):
    comp_8 = comp_7.drop(column_to_drop_7, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']
else:
    comp_8 = comp_7.drop(comp_7.filter(regex='^' + column_to_drop_7).columns, axis=1)
    X_8 = comp_8.drop('target', axis=1)
    y_8 = comp_8['target']

print(X_8.shape)

(6119, 209)


In [126]:
X_train, X_test, y_train, y_test = train_test_split(X_8, y_8, test_size=0.2, shuffle=True, stratify=y_8, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [127]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [128]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 102, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 542, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 10, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7932829155502132


In [129]:
optuna_8 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_8.fit(X_train, y_train)

In [130]:
optuna_proba_8 = optuna_8.predict_proba(X_test)[:, 1]
auc_8 = roc_auc_score(y_test, optuna_proba_8)
print(decimal.Decimal(auc_8).quantize(decimal.Decimal('1.000')))

0.775


In [131]:
X_train = X_train.values
y_train = y_train.values

In [132]:
auc_bootstrap = []

In [133]:
rs = RandomState(seed = 8)
bootstrap_auc(optuna_8, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76020164, 0.77794471])

In [134]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9981907606124878, pvalue=0.026059385389089584),
 0.7691385503887559)

In [135]:
t_8 = auc_bootstrap
print(t_8)

[0.7764145804853042, 0.7725510509227614, 0.7777576042378674, 0.765291246582365, 0.7698169429254955, 0.7662497863978126, 0.7693496881408066, 0.7647839413875598, 0.7721104964114831, 0.776406570403281, 0.7725163405673274, 0.773699162679426, 0.7716726119275462, 0.7668825828776487, 0.7755521616541353, 0.7630804639439508, 0.7745989618933697, 0.770132006151743, 0.7627333603896103, 0.7677850521189338, 0.7696247009569377, 0.7662310962064252, 0.7677263115174299, 0.7744975008544087, 0.771133266404648, 0.7672724068694463, 0.7677797120642516, 0.7625064080656185, 0.7734561901913876, 0.7720143754272042, 0.768025354579631, 0.7590513926862612, 0.7701186560150375, 0.767392558099795, 0.765827922077922, 0.7666689806903623, 0.7797734748803828, 0.7659160329801777, 0.7724255596377306, 0.7669600136705401, 0.77009996582365, 0.7658626324333562, 0.7668238422761449, 0.7616493292891319, 0.7716779519822283, 0.7642392558099795, 0.7602021744702666, 0.769731502050581, 0.7667197112098428, 0.7723534688995215, 0.77251100

In [136]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [137]:
#9.
column_to_drop_8 = 'Cat_현재 주택의 위치'

In [138]:
if not column_to_drop_8.startswith('Cat_'):
    comp_9 = comp_8.drop(column_to_drop_8, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']
else:
    comp_9 = comp_8.drop(comp_8.filter(regex='^' + column_to_drop_8).columns, axis=1)
    X_9 = comp_9.drop('target', axis=1)
    y_9 = comp_9['target']

print(X_9.shape)

(6119, 206)


In [139]:
X_train, X_test, y_train, y_test = train_test_split(X_9, y_9, test_size=0.2, shuffle=True, stratify=y_9, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [140]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [141]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.796517500146078


In [142]:
optuna_9 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_9.fit(X_train, y_train)

In [143]:
optuna_proba_9 = optuna_9.predict_proba(X_test)[:, 1]
auc_9 = roc_auc_score(y_test, optuna_proba_9)
print(decimal.Decimal(auc_9).quantize(decimal.Decimal('1.000')))

0.778


In [144]:
X_train = X_train.values
y_train = y_train.values

In [145]:
auc_bootstrap = []

In [146]:
rs = RandomState(seed = 9)
bootstrap_auc(optuna_9, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75148901, 0.77473073])

In [147]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9980487823486328, pvalue=0.01640745811164379),
 0.763150520121326)

In [148]:
t_9 = auc_bootstrap
print(t_9)

[0.7649307928913193, 0.7672190063226249, 0.7582397043745728, 0.7693630382775118, 0.764823991797676, 0.7680039943609023, 0.7679372436773751, 0.7578899307928914, 0.7587203092959672, 0.770033215140123, 0.7712827879357484, 0.7710424854750513, 0.7698863636363636, 0.7605706382433356, 0.7601861543062202, 0.7663192071086808, 0.7598897812713603, 0.7672777469241286, 0.7697795625427204, 0.75019758202324, 0.7640176435406698, 0.7544909859876966, 0.7622687756322625, 0.7733974495898839, 0.7613582963089542, 0.7643674171223513, 0.7582610645933014, 0.7761208774777855, 0.769632711038961, 0.7718862141148325, 0.7550410116199591, 0.7638521018455228, 0.7568032296650717, 0.7621112440191387, 0.7656356801093643, 0.7620792036910458, 0.7604424769309637, 0.7806091934381408, 0.759483937115516, 0.7618816216678057, 0.7686821813055366, 0.7677423316814764, 0.7663352272727273, 0.763590439166097, 0.761019202836637, 0.7626051990772386, 0.758383885850991, 0.7742865686944634, 0.7549635808270677, 0.75751612696514, 0.76230615

In [149]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [150]:
# 10.
column_to_drop_9 = '소득 중 사적이전소득의 비중(월평균)'

In [151]:
if not column_to_drop_9.startswith('Cat_'):
    comp_10 = comp_9.drop(column_to_drop_9, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']
else:
    comp_10 = comp_9.drop(comp_9.filter(regex='^' + column_to_drop_9).columns, axis=1)
    X_10 = comp_10.drop('target', axis=1)
    y_10 = comp_10['target']

print(X_10.shape)

(6119, 205)


In [152]:
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, stratify=y_10, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [153]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [154]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 88, 'learning_rate': 0.08, 'max_depth': 4, 'num_leaves': 10, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7935708979206838


In [155]:
optuna_10 = LGBMClassifier(**study.best_trial.params, random_state=0)
optuna_10.fit(X_train, y_train)

In [156]:
optuna_proba_10 = optuna_10.predict_proba(X_test)[:, 1]
auc_10 = roc_auc_score(y_test, optuna_proba_10)
print(decimal.Decimal(auc_10).quantize(decimal.Decimal('1.000')))

0.778


In [157]:
X_train = X_train.values
y_train = y_train.values

In [158]:
auc_bootstrap = []

In [159]:
rs = RandomState(seed = 10)
bootstrap_auc(optuna_10, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76191046, 0.77934794])

In [160]:
from scipy.stats import shapiro

In [161]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9983963966369629, pvalue=0.05100443586707115),
 0.7707717427001453)

In [162]:
t_10 = auc_bootstrap
print(t_10)

[0.7749247052289816, 0.7661403152768285, 0.769328327922078, 0.7678384526657552, 0.7743105989405332, 0.7719369446343131, 0.7633421266233765, 0.7725163405673274, 0.7706526614832536, 0.7817866754955571, 0.7703562884483938, 0.7748980049555707, 0.7663058569719755, 0.7710771958304854, 0.7743266191045797, 0.7729008245044429, 0.7655368890977443, 0.7747244531784006, 0.7748579545454546, 0.7716779519822282, 0.7713869190020506, 0.7739634953861927, 0.7763932202665755, 0.7696433911483254, 0.7661536654135338, 0.7700412252221462, 0.7651176948051948, 0.7713895890293917, 0.7643006664388244, 0.7612568352699932, 0.7638173914900889, 0.7711492865686944, 0.7685727101845523, 0.7729622351332877, 0.7718595138414217, 0.7655742694805194, 0.7713522086466165, 0.7740195659603556, 0.7776935235816815, 0.7763451597744362, 0.7645649991455913, 0.7724308996924129, 0.7758218344155844, 0.7646103896103897, 0.7716699419002051, 0.7740916566985645, 0.7686127605946685, 0.7747324632604238, 0.7701453562884484, 0.762722680280246, 0

In [163]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [164]:
# 11.
column_to_drop_10 = '부채 중 비금융기관 대출금의 비중'

In [165]:
if not column_to_drop_10.startswith('Cat_'):
    comp_11 = comp_10.drop(column_to_drop_10, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']
else:
    comp_11 = comp_10.drop(comp_10.filter(regex='^' + column_to_drop_10).columns, axis=1)
    X_11 = comp_11.drop('target', axis=1)
    y_11 = comp_11['target']

print(X_11.shape)

(6119, 204)


In [166]:
X_train, X_test, y_train, y_test = train_test_split(X_11, y_11, test_size=0.2, shuffle=True, stratify=y_11, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [167]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])

        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [168]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 129, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 510, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 10, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7943513718812345


In [169]:
optuna_11 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_11.fit(X_train, y_train)

In [170]:
optuna_proba_11 = optuna_11.predict_proba(X_test)[:, 1]
auc_11 = roc_auc_score(y_test, optuna_proba_11)
print(decimal.Decimal(auc_11).quantize(decimal.Decimal('1.000')))

0.774


In [171]:
X_train = X_train.values
y_train = y_train.values

In [172]:
auc_bootstrap = []

In [173]:
rs = RandomState(seed = 11)
bootstrap_auc(optuna_11, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75650078, 0.77701767])

In [174]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9979067444801331, pvalue=0.010359610430896282),
 0.7672732839734279)

In [175]:
t_11 = auc_bootstrap
print(t_11)

[0.7650108937115516, 0.7732452580314423, 0.7687542720437457, 0.7604264567669172, 0.7664286782296651, 0.763451597744361, 0.7742011278195489, 0.7638654519822283, 0.7683404178058784, 0.7647892814422419, 0.7632006151742994, 0.7612701854066984, 0.7751249572795625, 0.767365857826384, 0.7662444463431305, 0.7752584586466166, 0.7734214798359536, 0.7728741242310321, 0.7691200657894738, 0.7709890849282298, 0.7690453050239234, 0.7721425367395762, 0.7713789089200274, 0.7666876708817498, 0.7644848983253588, 0.7679132134313055, 0.7697608723513328, 0.7641484748803826, 0.7675180493848257, 0.7601808142515379, 0.7590220223855091, 0.7757577537593985, 0.7639028323650033, 0.7658065618591934, 0.7677049512987013, 0.7627520505809979, 0.7759152853725222, 0.763315426349966, 0.7627440404989747, 0.7592142643540669, 0.7658519523239917, 0.7629576426862611, 0.7645836893369788, 0.7699985047846891, 0.7657398111756665, 0.7582931049213945, 0.7624289772727273, 0.7671041951469582, 0.7685059595010253, 0.7693603682501708, 0.

In [176]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [177]:
# 12
column_to_drop_11 = 'Cat_현재 주택의 구조'

In [178]:
if not column_to_drop_11.startswith('Cat_'):
    comp_12 = comp_11.drop(column_to_drop_11, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']
else:
    comp_12 = comp_11.drop(comp_11.filter(regex='^' + column_to_drop_11).columns, axis=1)
    X_12 = comp_12.drop('target', axis=1)
    y_12 = comp_12['target']

print(X_12.shape)

(6119, 202)


In [179]:
X_train, X_test, y_train, y_test = train_test_split(X_12, y_12, test_size=0.2, shuffle=True, stratify=y_12, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [180]:
X_train.shape

(3916, 202)

In [181]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [182]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7959457090626798


In [183]:
optuna_12 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_12.fit(X_train, y_train)

In [184]:
optuna_proba_12 = optuna_12.predict_proba(X_test)[:, 1]
auc_12 = roc_auc_score(y_test, optuna_proba_12)
print(decimal.Decimal(auc_12).quantize(decimal.Decimal('1.000')))

0.775


In [185]:
X_train = X_train.values
y_train = y_train.values

In [186]:
auc_bootstrap = []

In [187]:
rs = RandomState(seed = 12)
bootstrap_auc(optuna_12, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7555078 , 0.77564835])

In [188]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9985117316246033, pvalue=0.07417694479227066),
 0.7658818806604579)

In [189]:
t_12 = auc_bootstrap
print(t_12)

[0.7664660586124401, 0.7742972488038278, 0.7710451555023923, 0.7569207108680793, 0.7647358808954203, 0.7720410757006153, 0.7689678742310321, 0.7631472146274778, 0.7637720010252905, 0.7673711978810663, 0.7692589072112097, 0.7736511021872864, 0.7648827323991797, 0.7641297846889952, 0.771304148154477, 0.7698890336637046, 0.7630804639439508, 0.778345010252905, 0.7799203263841422, 0.7639962833219412, 0.7731304468557758, 0.7620284731715652, 0.7600900333219411, 0.770631301264525, 0.7665888798701299, 0.7707968429596719, 0.7669600136705401, 0.7723107484620644, 0.7705138200615174, 0.7636651999316473, 0.7686261107313739, 0.7665514994873548, 0.7621165840738209, 0.7629389524948735, 0.7679399137047164, 0.7684151785714285, 0.7584719967532468, 0.7600339627477785, 0.757011491797676, 0.7637159304511278, 0.7627306903622693, 0.7676809210526315, 0.7701133159603555, 0.7754560406698565, 0.7709089841079972, 0.7626132091592618, 0.7568646402939165, 0.7671629357484622, 0.7697715524606972, 0.7639722530758715, 0.7

In [190]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [191]:
# 13.
column_to_drop_12 = 'Cat_가구주 성별'

In [192]:
if not column_to_drop_12.startswith('Cat_'):
    comp_13 = comp_12.drop(column_to_drop_12, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']
else:
    comp_13 = comp_12.drop(comp_12.filter(regex='^' + column_to_drop_12).columns, axis=1)
    X_13 = comp_13.drop('target', axis=1)
    y_13 = comp_13['target']

print(X_13.shape)

(6119, 200)


In [193]:
X_train, X_test, y_train, y_test = train_test_split(X_13, y_13, test_size=0.2, shuffle=True, stratify=y_13, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [194]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [195]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7940425212230485


In [196]:
optuna_13 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_13.fit(X_train, y_train)

In [197]:
optuna_proba_13 = optuna_13.predict_proba(X_test)[:, 1]
auc_13 = roc_auc_score(y_test, optuna_proba_13)
print(decimal.Decimal(auc_13).quantize(decimal.Decimal('1.000')))

0.776


In [198]:
X_train = X_train.values
y_train = y_train.values

In [199]:
auc_bootstrap = []

In [200]:
rs = RandomState(seed = 13)
bootstrap_auc(optuna_13, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75485585, 0.77541919])

In [201]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988414645195007, pvalue=0.2094595581293106),
 0.7655914671266234)

In [202]:
t_13 = auc_bootstrap
print(t_13)

[0.7720277255639099, 0.7662951768626111, 0.7559194506151743, 0.7523229237867397, 0.7676809210526315, 0.7740649564251538, 0.7719396146616542, 0.7743506493506493, 0.7635797590567328, 0.7698436431989064, 0.761419706937799, 0.7677476717361584, 0.772636491797676, 0.772001025290499, 0.7688076725905674, 0.7662150760423786, 0.7574066558441559, 0.764826661825017, 0.7656143198906356, 0.768663491114149, 0.7672109962406015, 0.7657878716678058, 0.7648720522898155, 0.7702761876281612, 0.7638280715994531, 0.750200252050581, 0.760485197368421, 0.7702014268626111, 0.7629763328776487, 0.7720437457279563, 0.7740542763157896, 0.7761929682159945, 0.7634222274436089, 0.7655956296992481, 0.7689518540669856, 0.762084543745728, 0.7635690789473684, 0.7670134142173616, 0.7700011748120301, 0.7627493805536567, 0.7670000640806562, 0.7654728084415584, 0.7623435363978126, 0.7570515422077921, 0.7561544130211894, 0.7639295326384142, 0.7639589029391662, 0.762084543745728, 0.7716192113807245, 0.7617721505468216, 0.763539

In [203]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [204]:
#14.
column_to_drop_13 = 'Cat_소득 계층'

In [205]:
if not column_to_drop_13.startswith('Cat_'):
    comp_14 = comp_13.drop(column_to_drop_13, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']
else:
    comp_14 = comp_13.drop(comp_13.filter(regex='^' + column_to_drop_13).columns, axis=1)
    X_14 = comp_14.drop('target', axis=1)
    y_14 = comp_14['target']

print(X_14.shape)

(6119, 198)


In [206]:
X_train, X_test, y_train, y_test = train_test_split(X_14, y_14, test_size=0.2, shuffle=True, stratify=y_14, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [207]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [208]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 149, 'learning_rate': 0.05, 'max_depth': 6, 'num_leaves': 456, 'subsample': 0.9, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 8, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7942011202096845


In [209]:
optuna_14 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_14.fit(X_train, y_train)

In [210]:
optuna_proba_14 = optuna_14.predict_proba(X_test)[:, 1]
auc_14 = roc_auc_score(y_test, optuna_proba_14)
print(decimal.Decimal(auc_14).quantize(decimal.Decimal('1.000')))

0.775


In [211]:
X_train = X_train.values
y_train = y_train.values

In [212]:
auc_bootstrap = []

In [213]:
rs = RandomState(seed = 14)
bootstrap_auc(optuna_14, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7548965 , 0.77622808])

In [214]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9984474778175354, pvalue=0.06022633984684944),
 0.7659983285628844)

In [215]:
t_14 = auc_bootstrap
print(t_14)

[0.7676462106971975, 0.7573986457621327, 0.7729515550239235, 0.7683083774777854, 0.768628780758715, 0.7553961252563226, 0.7582049940191387, 0.7519651401230348, 0.7584933569719754, 0.7681027853725222, 0.7656597103554341, 0.7565709372863978, 0.7669012730690362, 0.7628828819207107, 0.759483937115516, 0.7683911483253589, 0.7645089285714286, 0.7646103896103896, 0.763521018455229, 0.7698329630895421, 0.7734882305194805, 0.7640790541695147, 0.7719956852358167, 0.767827772556391, 0.7737952836637046, 0.7669012730690361, 0.7649040926179084, 0.7613796565276828, 0.7597963303144223, 0.7678197624743677, 0.7635797590567328, 0.7589846420027342, 0.7676782510252904, 0.7593130553656869, 0.7593557758031443, 0.7658145719412166, 0.767494019138756, 0.7768471249145591, 0.7698409731715653, 0.7735816814764183, 0.7616012687969926, 0.7659454032809295, 0.7725804212235133, 0.765491498632946, 0.7707567925495556, 0.7644715481886535, 0.763590439166097, 0.7589499316473, 0.770035885167464, 0.7672136662679425, 0.75806348

In [216]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [217]:
## 15.
column_to_drop_14 = 'Cat_현재 주변도로의 보행 안전'

In [218]:
if not column_to_drop_14.startswith('Cat_'):
    comp_15 = comp_14.drop(column_to_drop_14, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']
else:
    comp_15 = comp_14.drop(comp_14.filter(regex='^' + column_to_drop_14).columns, axis=1)
    X_15 = comp_15.drop('target', axis=1)
    y_15 = comp_15['target']

print(X_15.shape)

(6119, 194)


In [219]:
X_train, X_test, y_train, y_test = train_test_split(X_15, y_15, test_size=0.2, shuffle=True, stratify=y_15, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [220]:
X_train.shape

(3916, 194)

In [221]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [222]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 78, 'learning_rate': 0.09, 'max_depth': 7, 'num_leaves': 822, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.4, 'reg_alpha': 9, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.798057579779464


In [223]:
optuna_15 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_15.fit(X_train, y_train)

In [224]:
optuna_proba_15 = optuna_15.predict_proba(X_test)[:, 1]
auc_15 = roc_auc_score(y_test, optuna_proba_15)
print(decimal.Decimal(auc_15).quantize(decimal.Decimal('1.000')))

0.777


In [225]:
X_train = X_train.values
y_train = y_train.values

In [226]:
auc_bootstrap = []

In [227]:
rs = RandomState(seed = 15)
bootstrap_auc(optuna_15, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75657608, 0.77773731])

In [228]:
shapiro(auc_bootstrap),np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992523193359375, pvalue=0.6201215386390686),
 0.766927268455229)

In [229]:
t_15 = auc_bootstrap
print(t_15)

[0.7721398667122352, 0.7643674171223513, 0.7615638884142174, 0.7544455955228981, 0.7648506920710869, 0.763157894736842, 0.7707247522214629, 0.7681268156185919, 0.7649334629186604, 0.7595506877990431, 0.7629736628503075, 0.7716165413533835, 0.7621059039644567, 0.7671415755297335, 0.7609043916609706, 0.7694751794258374, 0.7701266660970609, 0.7717740729665071, 0.7657745215311005, 0.7690559851332878, 0.7729622351332877, 0.7643140165755298, 0.7599325017088174, 0.7690399649692413, 0.7750288362952836, 0.7688450529733425, 0.7606881194463431, 0.7641191045796308, 0.7707087320574163, 0.7752771488380041, 0.7494579844497608, 0.7640149735133288, 0.7659454032809296, 0.7662497863978127, 0.7698730134996583, 0.7619617224880382, 0.7618068609022557, 0.7712988080997949, 0.7683163875598086, 0.7691814764183185, 0.7766361927546138, 0.7756990131578947, 0.7616066088516746, 0.7636465097402597, 0.7655609193438142, 0.7723694890635681, 0.7721211765208474, 0.7676782510252904, 0.7681001153451811, 0.774137047163363, 0

In [230]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [231]:
# 16.
column_to_drop_15 = 'Cat_현재 공공기관 접근용이성'

In [232]:
if not column_to_drop_15.startswith('Cat_'):
    comp_16 = comp_15.drop(column_to_drop_15, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']
else:
    comp_16 = comp_15.drop(comp_15.filter(regex='^' + column_to_drop_15).columns, axis=1)
    X_16 = comp_16.drop('target', axis=1)
    y_16 = comp_16['target']

print(X_16.shape)

(6119, 190)


In [233]:
X_train, X_test, y_train, y_test = train_test_split(X_16, y_16, test_size=0.2, shuffle=True, stratify=y_16, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [234]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [235]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 102, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 542, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.5, 'reg_alpha': 10, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7948480371288575


In [236]:
optuna_16 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_16.fit(X_train, y_train)

In [237]:
optuna_proba_16 = optuna_16.predict_proba(X_test)[:, 1]
auc_16 = roc_auc_score(y_test, optuna_proba_16)
print(decimal.Decimal(auc_16).quantize(decimal.Decimal('1.000')))

0.779


In [238]:
X_train = X_train.values
y_train = y_train.values

In [239]:
auc_bootstrap = []

In [240]:
rs = RandomState(seed = 16)
bootstrap_auc(optuna_16, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75866864, 0.77852924])

In [241]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988211393356323, pvalue=0.19698192179203033),
 0.7687445544792378)

In [242]:
t_16 = auc_bootstrap
print(t_16)

[0.7698917036910458, 0.7679052033492824, 0.760485197368421, 0.7650189037935748, 0.7675661098769652, 0.7747885338345866, 0.7610405630553656, 0.7683137175324675, 0.7718087833219411, 0.7683751281613123, 0.7593370856117565, 0.7519598000683527, 0.7695232399179768, 0.7792661696855776, 0.77103981544771, 0.7705298402255638, 0.7737712534176351, 0.7700786056049214, 0.7705458603896105, 0.7708796138072453, 0.772502990430622, 0.7582877648667122, 0.7748312542720437, 0.7711439465140124, 0.7641992053998633, 0.7703055579289132, 0.7769405758714969, 0.7737045027341081, 0.7688183526999317, 0.7693523581681477, 0.7785826426862611, 0.7665167891319207, 0.7660495343472318, 0.7734882305194806, 0.7720063653451812, 0.7634782980177717, 0.7619937628161313, 0.7747244531784006, 0.7716085312713602, 0.7681935663021189, 0.766057544429255, 0.7720570958646615, 0.7704470693779903, 0.7745802717019823, 0.7688824333561175, 0.7631285244360901, 0.7721532168489406, 0.7767910543403964, 0.76953125, 0.7683137175324676, 0.7687863123

In [243]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [244]:
# 17.
column_to_drop_16 = 'Cat_현재 대기오염 정도'

In [245]:
if not column_to_drop_16.startswith('Cat_'):
    comp_17 = comp_16.drop(column_to_drop_16, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']
else:
    comp_17 = comp_16.drop(comp_16.filter(regex='^' + column_to_drop_16).columns, axis=1)
    X_17 = comp_17.drop('target', axis=1)
    y_17 = comp_17['target']

print(X_17.shape)

(6119, 186)


In [246]:
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, test_size=0.2, shuffle=True, stratify=y_17, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [247]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [248]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7959331880900509


In [249]:
optuna_17 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_17.fit(X_train, y_train)

In [250]:
optuna_proba_17 = optuna_17.predict_proba(X_test)[:, 1]
auc_17 = roc_auc_score(y_test, optuna_proba_17)
print(decimal.Decimal(auc_17).quantize(decimal.Decimal('1.000')))

0.776


In [251]:
X_train = X_train.values
y_train = y_train.values

In [252]:
auc_bootstrap = []

In [253]:
rs = RandomState(seed = 17)
bootstrap_auc(optuna_17, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75631628, 0.77634516])

In [254]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9987847805023193, pvalue=0.1762365996837616),
 0.7664704427973342)

In [255]:
t_17 = auc_bootstrap
print(t_17)

[0.7655155288790156, 0.7720837961380724, 0.7600927033492823, 0.7679586038961039, 0.7658706425153794, 0.7659747735816815, 0.7690132646958305, 0.7699531143198906, 0.758880510936432, 0.7614250469924813, 0.7685673701298702, 0.7736938226247437, 0.7567204588174982, 0.7627627306903622, 0.7630644437799043, 0.7788416353383458, 0.7701560363978126, 0.7752691387559809, 0.7662497863978127, 0.7685753802118932, 0.7595586978810663, 0.7667090311004785, 0.7639028323650034, 0.7619590524606972, 0.7742865686944636, 0.7700705955228981, 0.765123034859877, 0.7587283193779906, 0.7655021787423104, 0.76382006151743, 0.77220127734108, 0.7654274179767602, 0.7668291823308271, 0.7681081254272044, 0.7753278793574846, 0.7753599196855776, 0.7619083219412166, 0.7772583091250855, 0.7629229323308271, 0.7693843984962406, 0.7659080228981545, 0.7593584458304854, 0.7738299940191388, 0.7705431903622693, 0.7705004699248119, 0.7763398197197539, 0.7697475222146275, 0.7701400162337662, 0.7597589499316474, 0.7739688354408749, 0.761

In [256]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [257]:
## 18.
column_to_drop_17 ='Cat_현재 문화시설 접근용이성'

In [258]:
if not column_to_drop_17.startswith('Cat_'):
    comp_18 = comp_17.drop(column_to_drop_17, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']
else:
    comp_18 = comp_17.drop(comp_17.filter(regex='^' + column_to_drop_17).columns, axis=1)
    X_18 = comp_18.drop('target', axis=1)
    y_18 = comp_18['target']

print(X_18.shape)

(6119, 182)


In [259]:
X_train, X_test, y_train, y_test = train_test_split(X_18, y_18, test_size=0.2, shuffle=True, stratify=y_18, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [260]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [261]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 142, 'learning_rate': 0.08, 'max_depth': 5, 'num_leaves': 448, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7962253441180646


In [262]:
optuna_18 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_18.fit(X_train, y_train)

In [263]:
optuna_proba_18 = optuna_18.predict_proba(X_test)[:, 1]
auc_18 = roc_auc_score(y_test, optuna_proba_18)
print(decimal.Decimal(auc_18).quantize(decimal.Decimal('1.000')))

0.782


In [264]:
X_train = X_train.values
y_train = y_train.values

In [265]:
auc_bootstrap = []

In [266]:
rs = RandomState(seed = 18)
bootstrap_auc(optuna_18, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75516337, 0.77678698])

In [267]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9982642531394958, pvalue=0.03313184529542923),
 0.7663748090930452)

In [268]:
t_18 = auc_bootstrap
print(t_18)

[0.7636785500683527, 0.7645463089542036, 0.7677423316814764, 0.7618976418318524, 0.7656570403280929, 0.7726765422077921, 0.7639722530758716, 0.7674512987012987, 0.7748259142173617, 0.7789190661312373, 0.759318395420369, 0.7596895292207793, 0.762789430963773, 0.7626746197881067, 0.7534122949419002, 0.7641164345522897, 0.7774986115857827, 0.7664553785030759, 0.7661803656869446, 0.7684231886534518, 0.773501580656186, 0.7581729536910458, 0.766725051264525, 0.7693016276486673, 0.7719663149350651, 0.7729088345864663, 0.765259206254272, 0.7772369489063568, 0.7676782510252905, 0.7634382476076554, 0.7624102870813398, 0.7623408663704716, 0.767761021872864, 0.7756189123376623, 0.7680573949077238, 0.7611046437115516, 0.7662417763157895, 0.7662097359876965, 0.7650082236842105, 0.7735603212576896, 0.7720624359193438, 0.7674059082365003, 0.7674352785372522, 0.768529989747095, 0.7675580997949418, 0.7503417634996582, 0.7624690276828434, 0.7704764396787424, 0.7644047975051265, 0.7723507988721805, 0.7643

In [269]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [270]:
# 19
column_to_drop_18 = 'Cat_현재 자동차 경적/집주변의 소음 정도'

In [271]:
if not column_to_drop_18.startswith('Cat_'):
    comp_19 = comp_18.drop(column_to_drop_18, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']
else:
    comp_19 = comp_18.drop(comp_18.filter(regex='^' + column_to_drop_18).columns, axis=1)
    X_19 = comp_19.drop('target', axis=1)
    y_19 = comp_19['target']

print(X_19.shape)

(6119, 178)


In [272]:
X_train, X_test, y_train, y_test = train_test_split(X_19, y_19, test_size=0.2, shuffle=True, stratify=y_19, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [273]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [274]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7973104950792578


In [275]:
optuna_19 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_19.fit(X_train, y_train)

In [276]:
optuna_proba_19 = optuna_19.predict_proba(X_test)[:, 1]
auc_19 = roc_auc_score(y_test, optuna_proba_19)
print(decimal.Decimal(auc_19).quantize(decimal.Decimal('1.000')))

0.775


In [277]:
X_train = X_train.values
y_train = y_train.values

In [278]:
auc_bootstrap = []

In [279]:
rs = RandomState(seed = 19)
bootstrap_auc(optuna_19, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75215218, 0.77495434])

In [280]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988178610801697, pvalue=0.19501076638698578),
 0.7635639257946002)

In [281]:
t_19 = auc_bootstrap
print(t_19)

[0.7609471120984279, 0.7466491156869446, 0.7670160842447026, 0.7662711466165414, 0.759916481544771, 0.767859812884484, 0.7648693822624744, 0.7697421821599453, 0.7695766404647985, 0.765192455570745, 0.7648854024265207, 0.7613876666097061, 0.763988273239918, 0.7661323051948052, 0.7569741114149009, 0.7689758843130553, 0.77400621582365, 0.7549368805536569, 0.7629763328776487, 0.7787882347915243, 0.7569607612781956, 0.767128225393028, 0.778873675666439, 0.7665461594326726, 0.7600526529391661, 0.7675554297676008, 0.7705725606630212, 0.7612621753246753, 0.7664660586124403, 0.7628615217019823, 0.7696834415584416, 0.7713468685919345, 0.7667784518113465, 0.769197496582365, 0.7733921095352015, 0.76953125, 0.7736564422419687, 0.7674486286739577, 0.7665434894053316, 0.7596201085099111, 0.7657985517771703, 0.7829214371155161, 0.7684525589542037, 0.7704230391319207, 0.7587229793233082, 0.7619777426520848, 0.7623755767259056, 0.7620658535543403, 0.7740142259056731, 0.7615612183868763, 0.77019875683527

In [282]:
# 20.
column_to_drop_19 = '총 이사 횟수'

In [283]:
if not column_to_drop_19.startswith('Cat_'):
    comp_20 = comp_19.drop(column_to_drop_19, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']
else:
    comp_20 = comp_19.drop(comp_19.filter(regex='^' + column_to_drop_19).columns, axis=1)
    X_20 = comp_20.drop('target', axis=1)
    y_20 = comp_20['target']

print(X_20.shape)

(6119, 177)


In [284]:
X_train, X_test, y_train, y_test = train_test_split(X_20, y_20, test_size=0.2, shuffle=True, stratify=y_20, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [285]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [286]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 173, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 726, 'subsample': 0.8, 'colsample_bytree': 0.4, 'reg_alpha': 2, 'reg_lambda': 5, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.800507516757235


In [287]:
optuna_20 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_20.fit(X_train, y_train)

In [288]:
optuna_proba_20 = optuna_20.predict_proba(X_test)[:, 1]
auc_20 = roc_auc_score(y_test, optuna_proba_20)
print(decimal.Decimal(auc_20).quantize(decimal.Decimal('1.000')))

0.779


In [289]:
X_train = X_train.values
y_train = y_train.values

In [290]:
auc_bootstrap = []

In [291]:
rs = RandomState(seed = 20)
bootstrap_auc(optuna_20, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75395391, 0.77704804])

In [292]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991893768310547, pvalue=0.5398953557014465),
 0.765503820809125)

In [293]:
t_20 = auc_bootstrap
print(t_20)

[0.7658759825700616, 0.7700091848940532, 0.7691067156527682, 0.7673525076896788, 0.7552599538619275, 0.7806278836295284, 0.7675527597402598, 0.7722840481886535, 0.7599271616541353, 0.7700705955228981, 0.7644234876965139, 0.7775359919685577, 0.7719876751537936, 0.7735496411483254, 0.759782980177717, 0.7708876238892686, 0.7622020249487356, 0.7647118506493507, 0.7600606630211892, 0.755641767771702, 0.7688744232740943, 0.7561357228298018, 0.7619296821599454, 0.7647438909774436, 0.7651817754613808, 0.7691307458988379, 0.7751703477443609, 0.756242523923445, 0.7599565319548873, 0.7738807245386193, 0.7648827323991798, 0.7598363807245386, 0.7631632347915243, 0.7698943737183869, 0.7508223684210527, 0.7662658065618593, 0.7679505938140806, 0.7694618292891319, 0.7607735603212578, 0.7756803229665072, 0.762020463089542, 0.765125704887218, 0.7679746240601504, 0.761852251367054, 0.7616386491797676, 0.7666022300068354, 0.7598363807245386, 0.763157894736842, 0.7593397556390977, 0.7632086252563226, 0.7666

In [294]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [295]:
# 21
column_to_drop_20 = 'Cat_이사 계획 중인 주택의 유형'

In [296]:
if not column_to_drop_20.startswith('Cat_'):
    comp_21 = comp_20.drop(column_to_drop_20, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']
else:
    comp_21 = comp_20.drop(comp_20.filter(regex='^' + column_to_drop_20).columns, axis=1)
    X_21 = comp_21.drop('target', axis=1)
    y_21 = comp_21['target']

print(X_21.shape)

(6119, 163)


In [297]:
X_train, X_test, y_train, y_test = train_test_split(X_21, y_21, test_size=0.2, shuffle=True, stratify=y_21, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [298]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [299]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 84, 'learning_rate': 0.06999999999999999, 'max_depth': 6, 'num_leaves': 456, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 4, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7975233516139535


In [300]:
optuna_21 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_21.fit(X_train, y_train)

In [301]:
optuna_proba_21 = optuna_21.predict_proba(X_test)[:, 1]
auc_21 = roc_auc_score(y_test, optuna_proba_21)
print(decimal.Decimal(auc_21).quantize(decimal.Decimal('1.000')))

0.780


In [302]:
X_train = X_train.values
y_train = y_train.values

In [303]:
auc_bootstrap = []

In [304]:
rs = RandomState(seed = 21)
bootstrap_auc(optuna_21, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75543471, 0.77712641])

In [305]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9984245896339417, pvalue=0.05590683966875076),
 0.7669862720544258)

In [306]:
t_21 = auc_bootstrap
print(t_21)

[0.7697395121326043, 0.7642125555365687, 0.7591074632604239, 0.7718728639781272, 0.7658546223513328, 0.7661189550580998, 0.7630671138072453, 0.7732746283321941, 0.7692749273752564, 0.7763745300751879, 0.7693763884142173, 0.7724949803485989, 0.7727432928913192, 0.769830293062201, 0.7669439935064934, 0.7699237440191388, 0.762522428229665, 0.7801392686261107, 0.7680440447710184, 0.770828883287765, 0.7646557800751879, 0.7605919984620643, 0.774943395420369, 0.77410500683527, 0.7726311517429939, 0.7648800623718388, 0.7714189593301435, 0.769469839371155, 0.7613663063909775, 0.7734588602187286, 0.7643273667122351, 0.765325956937799, 0.7716859620642516, 0.7596788491114148, 0.7625491285030759, 0.7655395591250854, 0.7605546180792891, 0.7771061175666438, 0.7661563354408749, 0.7732853084415585, 0.7802834501025291, 0.7685379998291183, 0.7665995599794942, 0.7552572838345865, 0.7695713004101161, 0.7581836338004102, 0.7640336637047164, 0.7667036910457963, 0.7587336594326726, 0.764524948735475, 0.764682

In [307]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [308]:
# 22
column_to_drop_21 = 'Cat_가족계획 시 중요 고려 사항 1순위'

In [309]:
if not column_to_drop_21.startswith('Cat_'):
    comp_22 = comp_21.drop(column_to_drop_21, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']
else:
    comp_22 = comp_21.drop(comp_21.filter(regex='^' + column_to_drop_21).columns, axis=1)
    X_22 = comp_22.drop('target', axis=1)
    y_22 = comp_22['target']

print(X_22.shape)

(6119, 156)


In [310]:
X_train, X_test, y_train, y_test = train_test_split(X_22, y_22, test_size=0.2, shuffle=True, stratify=y_22, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [311]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [312]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 102, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 542, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.5, 'reg_alpha': 10, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7947896059232548


In [313]:
optuna_22 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_22.fit(X_train, y_train)

In [314]:
optuna_proba_22 = optuna_22.predict_proba(X_test)[:, 1]
auc_22 = roc_auc_score(y_test, optuna_proba_22)
print(decimal.Decimal(auc_22).quantize(decimal.Decimal('1.000')))

0.778


In [315]:
X_train = X_train.values
y_train = y_train.values

In [316]:
auc_bootstrap = []

In [317]:
rs = RandomState(seed = 22)
bootstrap_auc(optuna_22, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75978411, 0.77883122])

In [318]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9987693428993225, pvalue=0.16804194450378418),
 0.7696781402191558)

In [319]:
t_22 = auc_bootstrap
print(t_22)

[0.7671202153110048, 0.7676595608339029, 0.7629709928229664, 0.7611340140123036, 0.7611527042036911, 0.7708101930963773, 0.7709890849282297, 0.766927973342447, 0.7616973897812713, 0.7723294386534518, 0.7740195659603555, 0.7683083774777855, 0.7659347231715652, 0.7698917036910458, 0.7732853084415584, 0.7642205656185919, 0.7691627862269309, 0.7763184595010253, 0.7704390592959672, 0.7671041951469583, 0.7613102358168148, 0.7747191131237183, 0.7746470223855092, 0.7771648581681476, 0.7703482783663705, 0.7731144266917294, 0.7611900845864661, 0.7596281185919345, 0.7738780545112782, 0.7597242395762133, 0.7741584073820915, 0.7693069677033493, 0.7736751324333562, 0.7650002136021872, 0.765090994531784, 0.7658466122693096, 0.7620578434723171, 0.7647171907040329, 0.7708956339712918, 0.7563493250170882, 0.7815009825700616, 0.7702254571086808, 0.7705351802802459, 0.76666364063568, 0.7723881792549556, 0.7693550281954886, 0.7667864618933697, 0.7688744232740944, 0.7711145762132605, 0.7685032894736842, 0.7

In [320]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [321]:
# 23.
column_to_drop_22 = 'Cat_현재 의료시설 접근용이성'

In [322]:
if not column_to_drop_22.startswith('Cat_'):
    comp_23 = comp_22.drop(column_to_drop_22, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']
else:
    comp_23 = comp_22.drop(comp_22.filter(regex='^' + column_to_drop_22).columns, axis=1)
    X_23 = comp_23.drop('target', axis=1)
    y_23 = comp_23['target']

print(X_23.shape)

(6119, 152)


In [323]:
X_train, X_test, y_train, y_test = train_test_split(X_23, y_23, test_size=0.2, shuffle=True, stratify=y_23, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [324]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [325]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7945099708678705


In [326]:
optuna_23 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_23.fit(X_train, y_train)

In [327]:
optuna_proba_23 = optuna_23.predict_proba(X_test)[:, 1]
auc_23 = roc_auc_score(y_test, optuna_proba_23)
print(decimal.Decimal(auc_23).quantize(decimal.Decimal('1.000')))

0.777


In [328]:
X_train = X_train.values
y_train = y_train.values

In [329]:
auc_bootstrap = []

In [330]:
rs = RandomState(seed = 23)
bootstrap_auc(optuna_23, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75767439, 0.77682216])

In [331]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.999014675617218, pvalue=0.34596359729766846),
 0.7674719286675495)

In [332]:
t_23 = auc_bootstrap
print(t_23)

[0.7675447496582365, 0.7674299384825701, 0.7724175495557074, 0.7664660586124403, 0.763387517088175, 0.7718835440874915, 0.7674139183185237, 0.7699531143198907, 0.7614704374572796, 0.7798215353725222, 0.7775199718045114, 0.7643861073137389, 0.7663378973000683, 0.7705565404989746, 0.7658786525974026, 0.7719155844155845, 0.7690506450786057, 0.7595693779904307, 0.7734855604921395, 0.7725323607313739, 0.7652004656527682, 0.764089734278879, 0.7691307458988379, 0.7658653024606973, 0.7617321001367054, 0.7659560833902939, 0.7656196599453179, 0.759916481544771, 0.760084693267259, 0.7726525119617225, 0.770329588174983, 0.7671736158578264, 0.7726284817156528, 0.7594492267600821, 0.7676408706425155, 0.7598791011619959, 0.7671415755297334, 0.7683350777511962, 0.7675420796308955, 0.7640550239234449, 0.7674219284005468, 0.7603330058099795, 0.767793062200957, 0.772775333219412, 0.7652164858168149, 0.7636384996582366, 0.7663432373547505, 0.7709436944634314, 0.765956083390294, 0.7718194634313056, 0.76985

In [333]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [334]:
# 24
column_to_drop_23 = '부채 중 금융기관 대출금의 비중'

In [335]:
if not column_to_drop_23.startswith('Cat_'):
    comp_24 = comp_23.drop(column_to_drop_23, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']
else:
    comp_24 = comp_23.drop(comp_23.filter(regex='^' + column_to_drop_23).columns, axis=1)
    X_24 = comp_24.drop('target', axis=1)
    y_24 = comp_24['target']

print(X_24.shape)

(6119, 151)


In [336]:
X_train, X_test, y_train, y_test = train_test_split(X_24, y_24, test_size=0.2, shuffle=True, stratify=y_24, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [337]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [338]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 141, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 932, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7947896059232548


In [339]:
optuna_24 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_24.fit(X_train, y_train)

In [340]:
optuna_proba_24 = optuna_24.predict_proba(X_test)[:, 1]
auc_24 = roc_auc_score(y_test, optuna_proba_24)
print(decimal.Decimal(auc_24).quantize(decimal.Decimal('1.000')))

0.781


In [341]:
X_train = X_train.values
y_train = y_train.values

In [342]:
auc_bootstrap = []

In [343]:
rs = RandomState(seed = 24)
bootstrap_auc(optuna_24, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76149453, 0.780241  ])

In [344]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988231062889099, pvalue=0.19816850125789642),
 0.7708377891639612)

In [345]:
t_24 = auc_bootstrap
print(t_24)

[0.7713708988380041, 0.772703242481203, 0.7880131792549556, 0.7692028366370472, 0.7668371924128503, 0.7693363380041012, 0.7709196642173616, 0.7754907510252906, 0.7688770933014354, 0.7628962320574162, 0.7733921095352015, 0.7731971975393028, 0.7736190618591935, 0.7721932672590568, 0.7729435449419002, 0.7682042464114832, 0.7724469198564593, 0.7705351802802461, 0.7749994659945318, 0.7703482783663704, 0.7686768412508544, 0.7689625341763499, 0.7665835398154477, 0.7706473214285714, 0.7675981502050582, 0.766759761619959, 0.772433569719754, 0.7694191088516747, 0.7678758330485305, 0.7726391618250171, 0.7681161355092275, 0.7762970992822966, 0.7736778024606972, 0.7602128545796307, 0.7710718557758031, 0.7688984535201642, 0.7674352785372521, 0.7758004741968558, 0.7650295839029392, 0.7714216293574847, 0.764455528024607, 0.7772102486329461, 0.7762276785714286, 0.7733120087149693, 0.7705378503075871, 0.7636785500683526, 0.7634756279904306, 0.7682069164388243, 0.7757791139781272, 0.7769726161995898, 0.7

In [346]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [347]:
column_to_drop_24 = 'Cat_주택 보유 의식'

In [348]:
if not column_to_drop_24.startswith('Cat_'):
    comp_25 = comp_24.drop(column_to_drop_24, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']
else:
    comp_25 = comp_24.drop(comp_24.filter(regex='^' + column_to_drop_24).columns, axis=1)
    X_25 = comp_25.drop('target', axis=1)
    y_25 = comp_25['target']

print(X_25.shape)

(6119, 149)


In [349]:
X_train, X_test, y_train, y_test = train_test_split(X_25, y_25, test_size=0.2, shuffle=True, stratify=y_25, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [350]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [351]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.79554921159609


In [352]:
optuna_25 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_25.fit(X_train, y_train)

In [353]:
optuna_proba_25 = optuna_25.predict_proba(X_test)[:, 1]
auc_25 = roc_auc_score(y_test, optuna_proba_25)
print(decimal.Decimal(auc_25).quantize(decimal.Decimal('1.000')))

0.778


In [354]:
X_train = X_train.values
y_train = y_train.values

In [355]:
auc_bootstrap = []

In [356]:
rs = RandomState(seed = 25)
bootstrap_auc(optuna_25, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75724512, 0.77646311])

In [357]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9985522627830505, pvalue=0.08455441147089005),
 0.7669983098726931)

In [358]:
t_25 = auc_bootstrap
print(t_25)

[0.7690212747778538, 0.7631952751196173, 0.770596590909091, 0.7670481245727956, 0.7688610731373888, 0.7581836338004101, 0.7608563311688312, 0.7712053571428571, 0.7707274222488039, 0.7755655117908407, 0.7695686303827751, 0.7649628332194122, 0.7700225350307588, 0.7674005681818181, 0.7736964926520848, 0.7642766361927547, 0.7700438952494872, 0.7689785543403965, 0.768126815618592, 0.7642472658920028, 0.7701320061517429, 0.7629629827409432, 0.7741477272727273, 0.7691601161995898, 0.7719823350991113, 0.7721371966848942, 0.7611527042036911, 0.7650482740943267, 0.763627819548872, 0.7719449547163364, 0.7632406655844156, 0.7631472146274777, 0.7692215268284347, 0.7804036013328777, 0.7627146701982228, 0.7719129143882433, 0.7754827409432673, 0.7652725563909776, 0.7749086850649352, 0.7667544215652768, 0.7717340225563911, 0.7687275717703349, 0.769165456254272, 0.7658172419685577, 0.7678224325017089, 0.7585334073820916, 0.764922782809296, 0.7627093301435406, 0.7698356331168831, 0.7703803186944634, 0.76

In [359]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [360]:
# 26
column_to_drop_25 = 'Cat_현재 상업시설 접근용이성'

In [361]:
if not column_to_drop_25.startswith('Cat_'):
    comp_26 = comp_25.drop(column_to_drop_25, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']
else:
    comp_26 = comp_25.drop(comp_25.filter(regex='^' + column_to_drop_25).columns, axis=1)
    X_26 = comp_26.drop('target', axis=1)
    y_26 = comp_26['target']

print(X_26.shape)

(6119, 145)


In [362]:
X_train, X_test, y_train, y_test = train_test_split(X_26, y_26, test_size=0.2, shuffle=True, stratify=y_26, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [363]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [364]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 170, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 630, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 3, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7970183390512442


In [365]:
optuna_26 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_26.fit(X_train, y_train)

In [366]:
optuna_proba_26 = optuna_26.predict_proba(X_test)[:, 1]
auc_26 = roc_auc_score(y_test, optuna_proba_26)
print(decimal.Decimal(auc_26).quantize(decimal.Decimal('1.000')))

0.780


In [367]:
X_train = X_train.values
y_train = y_train.values

In [368]:
auc_bootstrap = []

In [369]:
rs = RandomState(seed = 26)
bootstrap_auc(optuna_26, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75745959, 0.77843379])

In [370]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.999053955078125, pvalue=0.3848409354686737),
 0.7682305855636962)

In [371]:
t_26 = auc_bootstrap
print(t_26)

[0.7669493335611758, 0.7672804169514695, 0.7662951768626112, 0.7697742224880382, 0.7642712961380724, 0.766295176862611, 0.7565602571770335, 0.7756616327751196, 0.7678304425837321, 0.767224346377307, 0.7727886833561176, 0.7624743677375256, 0.758346505468216, 0.7593103853383459, 0.765793211722488, 0.7637105903964456, 0.7828733766233766, 0.7579112910116199, 0.7674326085099112, 0.7637025803144224, 0.7699504442925496, 0.772502990430622, 0.7642445958646616, 0.7668051520847573, 0.7775039516404649, 0.7574413661995899, 0.7767563439849623, 0.7772556390977443, 0.7700171949760766, 0.7725804212235134, 0.7828600264866714, 0.7770260167464114, 0.7694484791524264, 0.7690960355434038, 0.7700625854408749, 0.7610619232740943, 0.7715524606971976, 0.764188525290499, 0.770065255468216, 0.7722867182159945, 0.769031954887218, 0.7622260551948052, 0.7678651529391661, 0.7671148752563225, 0.7722920582706767, 0.7686955314422419, 0.7713281784005469, 0.7637960312713601, 0.761951042378674, 0.761286205570745, 0.7827879

In [372]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [373]:
# 27
column_to_drop_26 = '소득 중 근로/사업소득의 비중(월평균)'

In [374]:
if not column_to_drop_26.startswith('Cat_'):
    comp_27 = comp_26.drop(column_to_drop_26, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']
else:
    comp_27 = comp_26.drop(comp_26.filter(regex='^' + column_to_drop_26).columns, axis=1)
    X_27 = comp_27.drop('target', axis=1)
    y_27 = comp_27['target']

print(X_27.shape)

(6119, 144)


In [375]:
X_train, X_test, y_train, y_test = train_test_split(X_27, y_27, test_size=0.2, shuffle=True, stratify=y_27, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [376]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [377]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7937086286196045


In [378]:
optuna_27 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_27.fit(X_train, y_train)

In [379]:
optuna_proba_27 = optuna_27.predict_proba(X_test)[:, 1]
auc_27 = roc_auc_score(y_test, optuna_proba_27)
print(decimal.Decimal(auc_27).quantize(decimal.Decimal('1.000')))

0.777


In [380]:
X_train = X_train.values
y_train = y_train.values

In [381]:
auc_bootstrap = []

In [382]:
rs = RandomState(seed = 27)
bootstrap_auc(optuna_27, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75741981, 0.77806326])

In [383]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.999136209487915, pvalue=0.4754830598831177),
 0.7677681228105776)

In [384]:
t_27 = auc_bootstrap
print(t_27)

[0.7685460099111414, 0.7725483808954204, 0.7665514994873547, 0.7639268626110732, 0.7621993549213945, 0.7731197667464115, 0.7755334714627479, 0.7662017259056731, 0.7723855092276144, 0.768292357313739, 0.7631285244360902, 0.7677102913533835, 0.7715070702323992, 0.7645142686261108, 0.7625571385850991, 0.7729141746411483, 0.7697395121326042, 0.7714349794941899, 0.7769859663362955, 0.7659694335269993, 0.766562179596719, 0.7718461637047164, 0.7666022300068353, 0.7708262132604238, 0.7608990516062883, 0.7714937200956938, 0.7665381493506493, 0.761152704203691, 0.7747618335611756, 0.770233467190704, 0.7633367865686945, 0.7724843002392344, 0.7726952323991797, 0.7691467660628843, 0.764693160457963, 0.7736484321599453, 0.7755815319548872, 0.7668719027682844, 0.7783396701982228, 0.7593904861585783, 0.7653312969924813, 0.7632006151742994, 0.7692028366370471, 0.7649922035201641, 0.7680093344155845, 0.7705805707450445, 0.7592970352016404, 0.7697261619958988, 0.7655181989063569, 0.7661563354408749, 0.76

In [385]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [386]:
# 28
column_to_drop_27  = '총 가구원 수'

In [387]:
if not column_to_drop_27.startswith('Cat_'):
    comp_28 = comp_27.drop(column_to_drop_27, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']
else:
    comp_28 = comp_27.drop(comp_27.filter(regex='^' + column_to_drop_27).columns, axis=1)
    X_28 = comp_28.drop('target', axis=1)
    y_28 = comp_28['target']

print(X_28.shape)

(6119, 143)


In [388]:
X_train, X_test, y_train, y_test = train_test_split(X_28, y_28, test_size=0.2, shuffle=True, stratify=y_28, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [389]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [390]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 200, 'learning_rate': 0.04, 'max_depth': 4, 'num_leaves': 710, 'subsample': 0.2, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 2, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.800661942086328


In [391]:
optuna_28 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_28.fit(X_train, y_train)

In [392]:
optuna_proba_28 = optuna_28.predict_proba(X_test)[:, 1]
auc_28 = roc_auc_score(y_test, optuna_proba_28)
print(decimal.Decimal(auc_28).quantize(decimal.Decimal('1.000')))

0.781


In [393]:
X_train = X_train.values
y_train = y_train.values

In [394]:
auc_bootstrap = []

In [395]:
rs = RandomState(seed = 28)
bootstrap_auc(optuna_28, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76066789, 0.78006791])

In [396]:
np.mean(auc_bootstrap)

0.7708056126644738

In [397]:
t_28 = auc_bootstrap
print(t_28)

[0.7701239960697197, 0.7703883287764867, 0.7722226375598087, 0.7700145249487356, 0.7668104921394394, 0.7635770890293916, 0.7675367395762133, 0.7783583603896104, 0.7710878759398496, 0.7699451042378673, 0.7651684253246753, 0.7648667122351333, 0.762650589542037, 0.7714136192754614, 0.778075337491456, 0.7758218344155845, 0.7621593045112782, 0.7798242053998633, 0.7730316558441559, 0.7747217831510594, 0.7728741242310322, 0.7609444420710868, 0.7678518028024607, 0.7718621838687629, 0.7759259654818865, 0.7685940704032809, 0.7708101930963773, 0.7629309424128503, 0.7716699419002051, 0.769533920027341, 0.7612194548872181, 0.7729275247778538, 0.773301328605605, 0.7726124615516062, 0.7650936645591251, 0.7773117096719071, 0.7732265678400547, 0.775741733595352, 0.771766062884484, 0.7700812756322625, 0.7675580997949418, 0.7762303485987697, 0.7664713986671223, 0.7713468685919344, 0.7630938140806561, 0.763854771872864, 0.7725430408407382, 0.7737685833902939, 0.7714429895762133, 0.7699290840738209, 0.7700

In [398]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [399]:
# 29
column_to_drop_28 = 'Cat_현재 치안 및 범죄 등 방범 상태'

In [400]:
if not column_to_drop_28.startswith('Cat_'):
    comp_29 = comp_28.drop(column_to_drop_28, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']
else:
    comp_29 = comp_28.drop(comp_28.filter(regex='^' + column_to_drop_28).columns, axis=1)
    X_29 = comp_29.drop('target', axis=1)
    y_29 = comp_29['target']

print(X_29.shape)

(6119, 139)


In [401]:
X_train, X_test, y_train, y_test = train_test_split(X_29, y_29, test_size=0.2, shuffle=True, stratify=y_29, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [402]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [403]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7949440312523477


In [404]:
optuna_29 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_29.fit(X_train, y_train)

In [405]:
optuna_proba_29 = optuna_29.predict_proba(X_test)[:, 1]
auc_29 = roc_auc_score(y_test, optuna_proba_29)
print(decimal.Decimal(auc_29).quantize(decimal.Decimal('1.000')))

0.778


In [406]:
X_train = X_train.values
y_train = y_train.values

In [407]:
auc_bootstrap = []

In [408]:
rs = RandomState(seed = 29)
bootstrap_auc(optuna_29, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75759549, 0.77688224])

In [409]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9994390606880188, pvalue=0.8500477075576782),
 0.7674338113572282)

In [410]:
t_29 = auc_bootstrap
print(t_29)

[0.76629250683527, 0.7694324589883801, 0.7649014225905675, 0.7696247009569377, 0.7683163875598085, 0.7670561346548188, 0.7567791994190021, 0.7713842489747096, 0.7637720010252905, 0.7749487354750513, 0.7722920582706767, 0.7635263585099111, 0.76325134569378, 0.7679078733766234, 0.7706419813738893, 0.7703509483937114, 0.7664153280929598, 0.770332258202324, 0.7633367865686944, 0.7581889738550923, 0.7623915968899522, 0.7645196086807928, 0.7670294343814081, 0.7665301392686261, 0.7629976930963773, 0.7658759825700615, 0.7607682202665755, 0.758909881237184, 0.7584719967532467, 0.7715391105604922, 0.768730241797676, 0.7672643967874231, 0.7574627264183186, 0.7690372949419002, 0.7670454545454546, 0.7701506963431306, 0.7588404605263158, 0.7780459671907041, 0.763219305365687, 0.7738433441558441, 0.768329737696514, 0.7642392558099795, 0.7559595010252904, 0.7582023239917977, 0.7577804596719071, 0.7651550751879699, 0.7703135680109363, 0.7608403110047848, 0.7713975991114148, 0.7736724624060151, 0.762455

In [411]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [412]:
column_to_drop_29 = 'Cat_이사 예상 기간'

In [413]:
if not column_to_drop_29.startswith('Cat_'):
    comp_30 = comp_29.drop(column_to_drop_29, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']
else:
    comp_30 = comp_29.drop(comp_29.filter(regex='^' + column_to_drop_29).columns, axis=1)
    X_30 = comp_30.drop('target', axis=1)
    y_30 = comp_30['target']

print(X_30.shape)

(6119, 135)


In [414]:
X_train, X_test, y_train, y_test = train_test_split(X_30, y_30, test_size=0.2, shuffle=True, stratify=y_30, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [415]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [416]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 107, 'learning_rate': 0.06999999999999999, 'max_depth': 5, 'num_leaves': 442, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 7, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7956285110894079


In [417]:
optuna_30 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_30.fit(X_train, y_train)

In [418]:
optuna_proba_30 = optuna_30.predict_proba(X_test)[:, 1]
auc_30 = roc_auc_score(y_test, optuna_proba_30)
print(decimal.Decimal(auc_30).quantize(decimal.Decimal('1.000')))

0.777


In [419]:
X_train = X_train.values
y_train = y_train.values

In [420]:
auc_bootstrap = []

In [421]:
rs = RandomState(seed = 30)
bootstrap_auc(optuna_30, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75998577, 0.7795014 ])

In [422]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9971650838851929, pvalue=0.0010192056652158499),
 0.769978795310364)

In [423]:
t_30 = auc_bootstrap
print(t_30)

[0.7721772470950103, 0.7728314037935748, 0.7672884270334928, 0.7691494360902256, 0.7670454545454545, 0.7734909005468216, 0.7694858595352017, 0.7646344198564594, 0.7714830399863295, 0.7694030886876282, 0.7681855562200957, 0.7661082749487356, 0.7735790114490773, 0.7635957792207793, 0.7675607698222829, 0.7673097872522215, 0.7652458561175666, 0.7706286312371837, 0.7703055579289133, 0.7739661654135338, 0.7695419301093643, 0.7625384483937114, 0.7691974965823649, 0.7718061132946001, 0.7721265165755298, 0.7768204246411483, 0.7733654092617909, 0.7751009270334928, 0.7745081809637732, 0.7709864149008885, 0.7748205741626795, 0.7678838431305537, 0.7697849025974025, 0.7738940746753247, 0.7680066643882433, 0.7691814764183186, 0.7662684765892003, 0.7686875213602187, 0.7713201683185236, 0.7624983979835953, 0.7714563397129187, 0.7796987141148325, 0.7781073778195489, 0.7764573009227614, 0.7703215780929596, 0.7638654519822283, 0.763422227443609, 0.771699312200957, 0.7703669685577581, 0.7613422761449078, 0

In [424]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [425]:
# 31
column_to_drop_30 = '자산 중 기타자산의 비중'

In [426]:
if not column_to_drop_30.startswith('Cat_'):
    comp_31 = comp_30.drop(column_to_drop_30, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']
else:
    comp_31 = comp_30.drop(comp_30.filter(regex='^' + column_to_drop_30).columns, axis=1)
    X_31 = comp_31.drop('target', axis=1)
    y_31 = comp_31['target']

print(X_31.shape)

(6119, 134)


In [427]:
X_train, X_test, y_train, y_test = train_test_split(X_31, y_31, test_size=0.2, shuffle=True, stratify=y_31, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [428]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [429]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7945391864706717


In [430]:
optuna_31= LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_31.fit(X_train, y_train)

In [431]:
optuna_proba_31 = optuna_31.predict_proba(X_test)[:, 1]
auc_31 = roc_auc_score(y_test, optuna_proba_31)
print(decimal.Decimal(auc_31).quantize(decimal.Decimal('1.000')))

0.777


In [432]:
X_train = X_train.values
y_train = y_train.values

In [433]:
auc_bootstrap = []

In [434]:
rs = RandomState(seed = 31)
bootstrap_auc(optuna_31, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75691457, 0.7772539 ])

In [435]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9983747005462646, pvalue=0.04751453548669815),
 0.7672802660949248)

In [436]:
t_31 = auc_bootstrap
print(t_31)

[0.7604157766575529, 0.7698676734449761, 0.7569153708133971, 0.7640470138414217, 0.7668825828776487, 0.7627493805536569, 0.772641831852358, 0.7639749231032126, 0.7610352230006836, 0.763189935064935, 0.7702574974367737, 0.7723961893369788, 0.7716085312713602, 0.7715764909432674, 0.7581676136363636, 0.7703108979835953, 0.7691227358168148, 0.7678411226930963, 0.7712907980177718, 0.7742064678742311, 0.7662871667805877, 0.765325956937799, 0.76886374316473, 0.7738299940191387, 0.7648987525632265, 0.7743853597060834, 0.7610645933014355, 0.7672777469241285, 0.768028024606972, 0.7655876196172249, 0.7763024393369787, 0.7715230903964456, 0.7746550324675324, 0.7717126623376622, 0.7694110987696514, 0.7616760295625428, 0.7661937158236499, 0.7696727614490771, 0.7733814294258373, 0.763221975393028, 0.7654060577580314, 0.7710211252563225, 0.7679105434039645, 0.763825401572112, 0.7674539687286399, 0.7562745642515378, 0.7725510509227613, 0.7616973897812713, 0.770399008885851, 0.7618709415584416, 0.764396

In [437]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [438]:
# 32
column_to_drop_31 = 'Cat_현재 도시공원 및 녹지 접근용이성'

In [439]:
if not column_to_drop_31.startswith('Cat_'):
    comp_32 = comp_31.drop(column_to_drop_31, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']
else:
    comp_32 = comp_31.drop(comp_31.filter(regex='^' + column_to_drop_31).columns, axis=1)
    X_32 = comp_32.drop('target', axis=1)
    y_32 = comp_32['target']

print(X_32.shape)

(6119, 130)


In [440]:
X_train, X_test, y_train, y_test = train_test_split(X_32, y_32, test_size=0.2, shuffle=True, stratify=y_32, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [441]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [442]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7948104742109702


In [443]:
optuna_32 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_32.fit(X_train, y_train)

In [444]:
optuna_proba_32 = optuna_32.predict_proba(X_test)[:, 1]
auc_32 = roc_auc_score(y_test, optuna_proba_32)
print(decimal.Decimal(auc_32).quantize(decimal.Decimal('1.000')))

0.777


In [445]:
X_train = X_train.values
y_train = y_train.values

In [446]:
auc_bootstrap = []

In [447]:
rs = RandomState(seed = 32)
bootstrap_auc(optuna_32, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75637082, 0.77629189])

In [448]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9987779259681702, pvalue=0.1725769340991974),
 0.7667346460077752)

In [449]:
t_32 = auc_bootstrap
print(t_32)

[0.7691734663362952, 0.7717954331852358, 0.7662124060150377, 0.7618015208475736, 0.7624129571086807, 0.7643140165755298, 0.7699317541011621, 0.769200166609706, 0.7718408236500343, 0.7749113550922763, 0.7645142686261107, 0.7676862611073137, 0.7625598086124402, 0.7636972402597403, 0.7650162337662337, 0.7660522043745728, 0.7713869190020506, 0.7608136107313739, 0.7745802717019823, 0.7651737653793576, 0.7701346761790842, 0.7668772428229664, 0.7715070702323992, 0.7779578562884484, 0.7588885210184553, 0.7707941729323309, 0.7703963388585099, 0.7641404647983594, 0.7651657552973342, 0.7667998120300752, 0.7657932117224879, 0.7718968942241968, 0.7712560876623377, 0.7624583475734792, 0.7599351717361587, 0.7558099794941899, 0.7623408663704717, 0.7626585996240601, 0.7655288790157211, 0.7702014268626111, 0.773034325871497, 0.7729221847231715, 0.764690490430622, 0.7694137687969925, 0.7640683740601504, 0.7730316558441559, 0.7688904434381408, 0.7690372949419003, 0.7681722060833902, 0.7727539730006836, 0.

In [450]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [451]:
# 33.
column_to_drop_32 = '현재 주택 거주 기간(총 개월)'

In [452]:
if not column_to_drop_32.startswith('Cat_'):
    comp_33 = comp_32.drop(column_to_drop_32, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']
else:
    comp_33 = comp_32.drop(comp_32.filter(regex='^' + column_to_drop_32).columns, axis=1)
    X_33 = comp_33.drop('target', axis=1)
    y_33 = comp_33['target']

print(X_33.shape)

(6119, 129)


In [453]:
X_train, X_test, y_train, y_test = train_test_split(X_33, y_33, test_size=0.2, shuffle=True, stratify=y_33, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [454]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [455]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7956577266922094


In [456]:
optuna_33 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_33.fit(X_train, y_train)

In [457]:
optuna_proba_33 = optuna_33.predict_proba(X_test)[:, 1]
auc_33 = roc_auc_score(y_test, optuna_proba_33)
print(decimal.Decimal(auc_33).quantize(decimal.Decimal('1.000')))

0.778


In [458]:
X_train = X_train.values
y_train = y_train.values

In [459]:
auc_bootstrap = []

In [460]:
rs = RandomState(seed = 33)
bootstrap_auc(optuna_33, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75725927, 0.77748539])

In [461]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989117383956909, pvalue=0.2581450641155243),
 0.7674893479259228)

In [462]:
t_33 = auc_bootstrap
print(t_33)

[0.764754571086808, 0.767795732228298, 0.7657932117224879, 0.7654354280587832, 0.758744339542037, 0.770903644053315, 0.7643460569036227, 0.7737525632262473, 0.766428678229665, 0.7674833390293916, 0.7655796095352017, 0.769093365516063, 0.7689438439849624, 0.7750688867053999, 0.7662764866712235, 0.7642178955912509, 0.767929233595352, 0.7765694420710869, 0.763155224709501, 0.7639455528024607, 0.7647278708133971, 0.7565015165755298, 0.7620391532809296, 0.7655849495898838, 0.7671495856117567, 0.7708288832877649, 0.7610886235475052, 0.7689758843130554, 0.7746416823308271, 0.7682683270676693, 0.7633047462406015, 0.7744788106630213, 0.7738139738550922, 0.7647065105946684, 0.7652218258714969, 0.7753171992481203, 0.7700412252221462, 0.7719289345522898, 0.7652244958988379, 0.7652031356801094, 0.769929084073821, 0.7563493250170881, 0.7718061132946001, 0.7652244958988381, 0.7709517045454546, 0.765259206254272, 0.7732292378673957, 0.765555579289132, 0.7699424342105263, 0.7691440960355433, 0.76573714

In [463]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [464]:
# 34
column_to_drop_33 = 'Cat_현재 주택에 대한 전반적인 만족도'

In [465]:
if not column_to_drop_33.startswith('Cat_'):
    comp_34 = comp_33.drop(column_to_drop_33, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']
else:
    comp_34 = comp_33.drop(comp_33.filter(regex='^' + column_to_drop_33).columns, axis=1)
    X_34 = comp_34.drop('target', axis=1)
    y_34 = comp_34['target']

print(X_34.shape)

(6119, 125)


In [466]:
X_train, X_test, y_train, y_test = train_test_split(X_34, y_34, test_size=0.2, shuffle=True, stratify=y_34, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [467]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [468]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.796258733378409


In [469]:
optuna_34 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_34.fit(X_train, y_train)

In [470]:
optuna_proba_34 = optuna_34.predict_proba(X_test)[:, 1]
auc_34 = roc_auc_score(y_test, optuna_proba_34)
print(decimal.Decimal(auc_34).quantize(decimal.Decimal('1.000')))

0.777


In [471]:
X_train = X_train.values
y_train = y_train.values

In [472]:
auc_bootstrap = []

In [473]:
rs = RandomState(seed = 34)
bootstrap_auc(optuna_34, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7570725 , 0.77660629])

In [474]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988400340080261, pvalue=0.20855283737182617),
 0.7670225991114148)

In [475]:
t_34 = auc_bootstrap
print(t_34)

[0.7691814764183185, 0.7638734620642514, 0.7666850008544086, 0.7762837491455912, 0.765625, 0.7725136705399863, 0.7730770463089542, 0.7676675709159262, 0.7678090823650034, 0.7630591037252221, 0.766460718557758, 0.7617721505468216, 0.7587069591592618, 0.7717126623376623, 0.7698569933356118, 0.7712827879357484, 0.7655075187969924, 0.7703883287764867, 0.7684231886534518, 0.7679132134313055, 0.7770500469924813, 0.764791951469583, 0.778278259569378, 0.7705191601161996, 0.7645329588174983, 0.7622020249487355, 0.7596574888926861, 0.7579326512303486, 0.7653660073479153, 0.7672697368421053, 0.7694805194805195, 0.7717099923103212, 0.7620097829801777, 0.7669973940533151, 0.7621726546479836, 0.7715204203691046, 0.7643620770676691, 0.7674913491114149, 0.7667490815105946, 0.7713281784005468, 0.7553267045454545, 0.764223235645933, 0.7608029306220097, 0.7720357356459331, 0.7617481203007519, 0.762586508885851, 0.7671335654477102, 0.7656036397812713, 0.7728314037935748, 0.7679399137047165, 0.770276187628

In [476]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [477]:
# 35
column_to_drop_34 = 'Cat_현재 교육환경'

In [478]:
if not column_to_drop_34.startswith('Cat_'):
    comp_35 = comp_34.drop(column_to_drop_34, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']
else:
    comp_35 = comp_34.drop(comp_34.filter(regex='^' + column_to_drop_34).columns, axis=1)
    X_35 = comp_35.drop('target', axis=1)
    y_35 = comp_35['target']

print(X_35.shape)

(6119, 121)


In [479]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y_35, test_size=0.2, shuffle=True, stratify=y_35, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [480]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [481]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7976276930525297


In [482]:
optuna_35 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_35.fit(X_train, y_train)

In [483]:
optuna_proba_35 = optuna_35.predict_proba(X_test)[:, 1]
auc_35 = roc_auc_score(y_test, optuna_proba_35)
print(decimal.Decimal(auc_35).quantize(decimal.Decimal('1.000')))

0.778


In [484]:
X_train = X_train.values
y_train = y_train.values

In [485]:
auc_bootstrap = []

In [486]:
rs = RandomState(seed = 35)
bootstrap_auc(optuna_35, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75814886, 0.7775153 ])

In [487]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9978082180023193, pvalue=0.00754776643589139),
 0.7684502280203349)

In [488]:
t_35 = auc_bootstrap
print(t_35)

[0.7666422804169514, 0.7703909988038278, 0.766527469241285, 0.7697475222146275, 0.7707621326042378, 0.7738273239917977, 0.7693710483595353, 0.765021573820916, 0.7623782467532467, 0.7711893369788105, 0.7574013157894737, 0.7729408749145591, 0.7626719497607656, 0.767995984278879, 0.777915135850991, 0.763352806732741, 0.768628780758715, 0.7698623333902939, 0.7737498931989064, 0.7713255083732058, 0.7747511534518113, 0.7711466165413534, 0.7754854109706083, 0.7692482271018455, 0.7659454032809295, 0.7692268668831168, 0.7672136662679425, 0.7708502435064934, 0.7714830399863295, 0.7700358851674642, 0.7713789089200274, 0.7731891874572795, 0.7722760381066303, 0.7667036910457964, 0.7708742737525632, 0.7677797120642516, 0.782416801948052, 0.7660441942925497, 0.7608456510594669, 0.7767002734107997, 0.7678144224196854, 0.7631044941900206, 0.7646611201298701, 0.7685540199931647, 0.7676141703691046, 0.7709784048188655, 0.7608109407040328, 0.7735122607655502, 0.7714910500683527, 0.7579112910116199, 0.7672

In [489]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [490]:
# 36
column_to_drop_35 = '현재 무주택 기간(총 개월)'

In [491]:
if not column_to_drop_35.startswith('Cat_'):
    comp_36 = comp_35.drop(column_to_drop_35, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']
else:
    comp_36 = comp_35.drop(comp_35.filter(regex='^' + column_to_drop_35).columns, axis=1)
    X_36 = comp_36.drop('target', axis=1)
    y_36 = comp_36['target']

print(X_36.shape)

(6119, 120)


In [492]:
X_train, X_test, y_train, y_test = train_test_split(X_36, y_36, test_size=0.2, shuffle=True, stratify=y_36, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [493]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [494]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 85, 'learning_rate': 0.08, 'max_depth': 6, 'num_leaves': 200, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.5, 'reg_alpha': 7, 'reg_lambda': 5, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7989382215210479


In [495]:
optuna_36 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_36.fit(X_train, y_train)

In [496]:
optuna_proba_36 = optuna_36.predict_proba(X_test)[:, 1]
auc_36 = roc_auc_score(y_test, optuna_proba_36)
print(decimal.Decimal(auc_36).quantize(decimal.Decimal('1.000')))

0.779


In [497]:
X_train = X_train.values
y_train = y_train.values

In [498]:
auc_bootstrap = []

In [499]:
rs = RandomState(seed = 36)
bootstrap_auc(optuna_36, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75957866, 0.77961614])

In [500]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9994848966598511, pvalue=0.8950105905532837),
 0.7696162970458817)

In [501]:
t_36 = auc_bootstrap
print(t_36)

[0.7687783022898155, 0.780179319036227, 0.7691254058441557, 0.7678651529391662, 0.7708101930963772, 0.7701747265892003, 0.7740569463431305, 0.7688717532467532, 0.7730957365003418, 0.7714082792207791, 0.7716218814080655, 0.7624636876281613, 0.7672884270334928, 0.7763157894736842, 0.7732826384142173, 0.7720757860560492, 0.7679212235133288, 0.7657905416951469, 0.7594946172248803, 0.7616680194805194, 0.7691013755980862, 0.7645302887901573, 0.7653633373205742, 0.7729942754613807, 0.7597082194121668, 0.7709784048188653, 0.7743292891319207, 0.7767189636021872, 0.7731224367737526, 0.7677156314080655, 0.7718621838687628, 0.7607415199931646, 0.7706393113465482, 0.7681802161654137, 0.7702548274094326, 0.7654300880041012, 0.7710852059125085, 0.7643460569036227, 0.7699798145933014, 0.7776401230348597, 0.7652672163362954, 0.7735977016404648, 0.7693763884142173, 0.7768524649692412, 0.7707808227956253, 0.7736190618591933, 0.7742678785030759, 0.773368079289132, 0.7708101930963773, 0.7777442541011621, 0

In [502]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [503]:
# 37
column_to_drop_36 = 'Cat_현재 주차시설 이용편의성'

In [504]:
if not column_to_drop_36.startswith('Cat_'):
    comp_37 = comp_36.drop(column_to_drop_36, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']
else:
    comp_37 = comp_36.drop(comp_36.filter(regex='^' + column_to_drop_36).columns, axis=1)
    X_37 = comp_37.drop('target', axis=1)
    y_37 = comp_37['target']

print(X_37.shape)

(6119, 116)


In [505]:
X_train, X_test, y_train, y_test = train_test_split(X_37, y_37, test_size=0.2, shuffle=True, stratify=y_37, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [506]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [507]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7981786158482125


In [508]:
optuna_37 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_37.fit(X_train, y_train)

In [509]:
optuna_proba_37 = optuna_37.predict_proba(X_test)[:, 1]
auc_37 = roc_auc_score(y_test, optuna_proba_37)
print(decimal.Decimal(auc_37).quantize(decimal.Decimal('1.000')))

0.777


In [510]:
X_train = X_train.values
y_train = y_train.values

In [511]:
auc_bootstrap = []

In [512]:
rs = RandomState(seed = 37)
bootstrap_auc(optuna_37, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75768968, 0.77693797])

In [513]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9980981945991516, pvalue=0.019271645694971085),
 0.7674284753075872)

In [514]:
t_37 = auc_bootstrap
print(t_37)

[0.7725083304853041, 0.76438877734108, 0.7662337662337663, 0.7650776443950786, 0.7635824290840738, 0.766693010936432, 0.7723988593643198, 0.7598764311346549, 0.7729275247778536, 0.7687435919343812, 0.7704710996240602, 0.7645890293916611, 0.766962683697881, 0.7713495386192755, 0.7695659603554341, 0.7661029348940533, 0.7725537209501027, 0.7557539089200274, 0.7648373419343814, 0.7641431348257006, 0.7666022300068354, 0.7649414730006835, 0.7683110475051265, 0.7771515080314422, 0.7712721078263842, 0.7730316558441558, 0.7598871112440192, 0.7678998632946001, 0.762850841592618, 0.7674806690020506, 0.7629603127136021, 0.7647625811688311, 0.7769405758714969, 0.7704497394053316, 0.7687489319890636, 0.7802914601845523, 0.7636091293574845, 0.7686581510594668, 0.7682629870129871, 0.7658412722146274, 0.7661269651401231, 0.7704977998974709, 0.7636598598769652, 0.7688824333561176, 0.7621486244019138, 0.7667570915926178, 0.7611794044771019, 0.779316900205058, 0.7715845010252904, 0.7703829887218044, 0.768

In [515]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [516]:
# 38
column_to_drop_37 = 'Cat_현재 청소/쓰레기 처리상태'

In [517]:
if not column_to_drop_37.startswith('Cat_'):
    comp_38 = comp_37.drop(column_to_drop_37, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']
else:
    comp_38 = comp_37.drop(comp_37.filter(regex='^' + column_to_drop_37).columns, axis=1)
    X_38 = comp_38.drop('target', axis=1)
    y_38 = comp_38['target']

print(X_38.shape)

(6119, 112)


In [518]:
X_train, X_test, y_train, y_test = train_test_split(X_38, y_38, test_size=0.2, shuffle=True, stratify=y_38, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [519]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [520]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 116, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 9, 'reg_lambda': 6, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.799209509261346


In [521]:
optuna_38 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_38.fit(X_train, y_train)

In [522]:
optuna_proba_38 = optuna_38.predict_proba(X_test)[:, 1]
auc_38 = roc_auc_score(y_test, optuna_proba_38)
print(decimal.Decimal(auc_38).quantize(decimal.Decimal('1.000')))

0.778


In [523]:
X_train = X_train.values
y_train = y_train.values

In [524]:
auc_bootstrap = []

In [525]:
rs = RandomState(seed = 38)
bootstrap_auc(optuna_38, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75798064, 0.77803842])

In [526]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992244243621826, pvalue=0.584194540977478),
 0.7679333614576213)

In [527]:
t_38 = auc_bootstrap
print(t_38)

[0.7760300965481887, 0.7786253631237183, 0.7688343728639782, 0.76362247949419, 0.7683537679425838, 0.7662070659603555, 0.7602422248803827, 0.768727571770335, 0.7624983979835953, 0.7709036440533151, 0.7662764866712235, 0.7742224880382775, 0.7581008629528365, 0.766660970608339, 0.7689571941216677, 0.7706072710184553, 0.7573639354066986, 0.7645596590909092, 0.7685193096377306, 0.7604291267942583, 0.7634809680451127, 0.7702922077922078, 0.764020313568011, 0.7691841464456597, 0.767427268455229, 0.7623435363978127, 0.7700625854408749, 0.7501201512303486, 0.7768684851332877, 0.7621806647300069, 0.7670935150375939, 0.771934274606972, 0.7654434381408066, 0.7728527640123035, 0.7687168916609706, 0.7618175410116199, 0.7665568395420369, 0.7691360859535201, 0.7759126153451812, 0.7710958860218728, 0.7555910372522214, 0.764655780075188, 0.7634702879357484, 0.7688103426179084, 0.7664633885850991, 0.7734989106288448, 0.7757123632946001, 0.7634435876623376, 0.7666209201982228, 0.7750768967874231, 0.77541

In [528]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [529]:
# 38
column_to_drop_38 = '소득 대비 생활비의 비율'

In [530]:
if not column_to_drop_38.startswith('Cat_'):
    comp_39 = comp_38.drop(column_to_drop_38, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']
else:
    comp_39 = comp_38.drop(comp_38.filter(regex='^' + column_to_drop_38).columns, axis=1)
    X_39 = comp_39.drop('target', axis=1)
    y_39 = comp_39['target']

print(X_39.shape)

(6119, 111)


In [531]:
X_train, X_test, y_train, y_test = train_test_split(X_39, y_39, test_size=0.2, shuffle=True, stratify=y_39, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [532]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [533]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 152, 'learning_rate': 0.09999999999999999, 'max_depth': 5, 'num_leaves': 636, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8028364176662577


In [534]:
optuna_39 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_39.fit(X_train, y_train)

In [535]:
optuna_proba_39 = optuna_39.predict_proba(X_test)[:, 1]
auc_39 = roc_auc_score(y_test, optuna_proba_39)
print(decimal.Decimal(auc_39).quantize(decimal.Decimal('1.000')))

0.779


In [536]:
X_train = X_train.values
y_train = y_train.values

In [537]:
auc_bootstrap = []

In [538]:
rs = RandomState(seed = 39)
bootstrap_auc(optuna_39, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7537315 , 0.77668979])

In [539]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9978386163711548, pvalue=0.008321705274283886),
 0.7657674472936602)

In [540]:
t_39 = auc_bootstrap
print(t_39)

[0.7574093258714969, 0.7766869232740944, 0.7679238935406699, 0.7597642899863295, 0.7587870599794942, 0.759753609876965, 0.7669012730690361, 0.7650429340396445, 0.773872714456596, 0.7667864618933697, 0.7677503417634997, 0.7609230818523582, 0.7832712106971975, 0.7626372394053316, 0.7680013243335612, 0.767728981544771, 0.7601461038961039, 0.769392408578264, 0.7678571428571429, 0.7707060620300752, 0.763088474025974, 0.7687062115516061, 0.7621673145933014, 0.7691948265550239, 0.7650322539302802, 0.7691627862269309, 0.774441430280246, 0.7727966934381408, 0.7586936090225564, 0.7729515550239234, 0.7717206724196854, 0.7534603554340396, 0.763120514354067, 0.7655982997265891, 0.7618575914217361, 0.7643513969583049, 0.770764802631579, 0.7620738636363636, 0.7748526144907724, 0.769563290328093, 0.7624369873547505, 0.7617401102187287, 0.7623702366712235, 0.7620418233082706, 0.7557485688653451, 0.7679185534859877, 0.7740889866712235, 0.7690880254613808, 0.7622340652768284, 0.7694271189336979, 0.761611

In [541]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [542]:
# 40
column_to_drop_39 = 'Cat_가구주 종사상 지위'

In [543]:
if not column_to_drop_39.startswith('Cat_'):
    comp_40 = comp_39.drop(column_to_drop_39, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']
else:
    comp_40 = comp_39.drop(comp_39.filter(regex='^' + column_to_drop_39).columns, axis=1)
    X_40 = comp_40.drop('target', axis=1)
    y_40 = comp_40['target']

print(X_40.shape)

(6119, 106)


In [544]:
X_train, X_test, y_train, y_test = train_test_split(X_40, y_40, test_size=0.2, shuffle=True, stratify=y_40, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [545]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [546]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 136, 'learning_rate': 0.08, 'max_depth': 5, 'num_leaves': 542, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8021686324593693


In [547]:
optuna_40 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_40.fit(X_train, y_train)

In [548]:
optuna_proba_40 = optuna_40.predict_proba(X_test)[:, 1]
auc_40 = roc_auc_score(y_test, optuna_proba_40)
print(decimal.Decimal(auc_40).quantize(decimal.Decimal('1.000')))

0.780


In [549]:
X_train = X_train.values
y_train = y_train.values

In [550]:
auc_bootstrap = []

In [551]:
rs = RandomState(seed = 40)
bootstrap_auc(optuna_40, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75806615, 0.77932498])

In [552]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991445541381836, pvalue=0.48536911606788635),
 0.7687331681476419)

In [553]:
t_40 = auc_bootstrap
print(t_40)

[0.7690212747778539, 0.7711946770334928, 0.7679399137047163, 0.7598737611073137, 0.7731998675666438, 0.775338559466849, 0.7719716549897471, 0.7694911995898839, 0.7673631877990431, 0.7715604707792207, 0.7525098257006152, 0.7721665669856459, 0.7721265165755299, 0.7632513456937798, 0.761854921394395, 0.7734828904647983, 0.7670427845181136, 0.7686768412508543, 0.7603650461380725, 0.7653660073479154, 0.7680654049897472, 0.7725510509227614, 0.7689892344497606, 0.7721104964114833, 0.7634275674982911, 0.759777640123035, 0.7592302845181134, 0.7706366413192071, 0.7670855049555707, 0.7749460654477102, 0.7652511961722488, 0.7645649991455912, 0.7758939251537935, 0.7659881237183869, 0.7707114020847574, 0.7697261619958988, 0.7684525589542036, 0.7693817284688995, 0.7687916524265208, 0.7740435962064252, 0.7676328605604921, 0.7710985560492138, 0.772070446001367, 0.7730209757347914, 0.7603677161654134, 0.771266767771702, 0.7683591079972658, 0.768094775290499, 0.771165306732741, 0.7635770890293916, 0.7714

In [554]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [555]:
# 41.
column_to_drop_40 = '자산 중 금융자산의 비중'

In [556]:
if not column_to_drop_40.startswith('Cat_'):
    comp_41 = comp_40.drop(column_to_drop_40, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']
else:
    comp_41 = comp_40.drop(comp_40.filter(regex='^' + column_to_drop_40).columns, axis=1)
    X_41 = comp_41.drop('target', axis=1)
    y_41 = comp_41['target']

print(X_41.shape)

(6119, 105)


In [557]:
X_train, X_test, y_train, y_test = train_test_split(X_41, y_41, test_size=0.2, shuffle=True, stratify=y_41, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [558]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [559]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 141, 'learning_rate': 0.08, 'max_depth': 6, 'num_leaves': 362, 'subsample': 0.6, 'colsample_bytree': 0.5, 'reg_alpha': 4, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8019056920341572


In [560]:
optuna_41 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_41.fit(X_train, y_train)

In [561]:
optuna_proba_41 = optuna_41.predict_proba(X_test)[:, 1]
auc_41 = roc_auc_score(y_test, optuna_proba_41)
print(decimal.Decimal(auc_41).quantize(decimal.Decimal('1.000')))

0.778


In [562]:
X_train = X_train.values
y_train = y_train.values

In [563]:
auc_bootstrap = []

In [564]:
rs = RandomState(seed = 41)
bootstrap_auc(optuna_41, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75192469, 0.77546252])

In [565]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9992793202400208, pvalue=0.655146062374115),
 0.7639114084928229)

In [566]:
t_41 = auc_bootstrap
print(t_41)

[0.7761529178058783, 0.761515827922078, 0.768796992481203, 0.7636117993848257, 0.7705965909090909, 0.7631498846548189, 0.7703108979835954, 0.7694885295625427, 0.7663005169172932, 0.7621539644565961, 0.7657878716678058, 0.7633367865686944, 0.7652859065276829, 0.7657051008202324, 0.7519250897129186, 0.7565362269309637, 0.7692108467190705, 0.7570114917976759, 0.7710531655844156, 0.7542640336637048, 0.7577377392344498, 0.76982762303486, 0.7718301435406698, 0.7559568309979494, 0.755073051948052, 0.7677183014354068, 0.7612621753246753, 0.7742919087491457, 0.7596014183185236, 0.7568619702665755, 0.7720330656185919, 0.7669493335611757, 0.7717180023923444, 0.757078242481203, 0.7670855049555707, 0.7685647001025291, 0.7668398624401913, 0.7648373419343814, 0.7566216678058784, 0.758311795112782, 0.7712053571428572, 0.7651176948051949, 0.7672563867053999, 0.7632032852016404, 0.7597963303144224, 0.7655475692071085, 0.7641324547163364, 0.7700065148667122, 0.7609711423444976, 0.7665061090225564, 0.7609

In [567]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [568]:
# 42.
column_to_drop_41 = 'Cat_현재 대중교통 접근용이성'

In [569]:
if not column_to_drop_41.startswith('Cat_'):
    comp_42 = comp_41.drop(column_to_drop_41, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']
else:
    comp_42 = comp_41.drop(comp_41.filter(regex='^' + column_to_drop_41).columns, axis=1)
    X_42 = comp_42.drop('target', axis=1)
    y_42 = comp_42['target']

print(X_42.shape)

(6119, 101)


In [570]:
X_train, X_test, y_train, y_test = train_test_split(X_42, y_42, test_size=0.2, shuffle=True, stratify=y_42, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [571]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [572]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 136, 'learning_rate': 0.08, 'max_depth': 5, 'num_leaves': 542, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8036878438050401


In [573]:
optuna_42 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_42.fit(X_train, y_train)

In [574]:
optuna_proba_42 = optuna_42.predict_proba(X_test)[:, 1]
auc_42 = roc_auc_score(y_test, optuna_proba_42)
print(decimal.Decimal(auc_42).quantize(decimal.Decimal('1.000')))

0.778


In [575]:
X_train = X_train.values
y_train = y_train.values

In [576]:
auc_bootstrap = []

In [577]:
rs = RandomState(seed = 42)
bootstrap_auc(optuna_42, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75645332, 0.77708222])

In [578]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991364479064941, pvalue=0.47580280900001526),
 0.7668987659133629)

In [579]:
t_42 = auc_bootstrap
print(t_42)

[0.7621059039644565, 0.7612728554340397, 0.7698756835269993, 0.7645997095010254, 0.7591208133971292, 0.7694671693438141, 0.7701133159603555, 0.759347765721121, 0.7661509953861927, 0.7793970010252905, 0.7628001110731373, 0.7705458603896104, 0.7637666609706084, 0.7582877648667122, 0.7712267173615858, 0.7679105434039645, 0.765053614149009, 0.7725136705399863, 0.7629416225222145, 0.7576709885509229, 0.7670507946001367, 0.7646531100478469, 0.7652164858168148, 0.7587042891319207, 0.766789131920711, 0.7630270633971292, 0.7759526657552974, 0.7661296351674641, 0.7662017259056733, 0.7600072624743677, 0.7704337192412851, 0.7673311474709501, 0.7599912423103212, 0.7676115003417635, 0.7696727614490773, 0.7673818779904307, 0.7770633971291866, 0.7616546693438142, 0.7676996112440192, 0.7689198137388927, 0.7731678272385509, 0.767194976076555, 0.7703135680109363, 0.7692749273752563, 0.7703429383116884, 0.7713815789473685, 0.7668799128503075, 0.7662551264524949, 0.759248974709501, 0.7727860133287764, 0.76

In [580]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [581]:
# 43.
column_to_drop_42 = 'Cat_남편/아내의 부모님과 동거 의향'

In [582]:
if not column_to_drop_42.startswith('Cat_'):
    comp_43 = comp_42.drop(column_to_drop_42, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']
else:
    comp_43 = comp_42.drop(comp_42.filter(regex='^' + column_to_drop_42).columns, axis=1)
    X_43 = comp_43.drop('target', axis=1)
    y_43 = comp_43['target']

print(X_43.shape)

(6119, 96)


In [583]:
X_train, X_test, y_train, y_test = train_test_split(X_43, y_43, test_size=0.2, shuffle=True, stratify=y_43, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [584]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [585]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 144, 'learning_rate': 0.060000000000000005, 'max_depth': 6, 'num_leaves': 212, 'subsample': 0.9, 'colsample_bytree': 0.6, 'reg_alpha': 4, 'reg_lambda': 5, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8009666190869706


In [586]:
optuna_43 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_43.fit(X_train, y_train)

In [587]:
optuna_proba_43 = optuna_43.predict_proba(X_test)[:, 1]
auc_43 = roc_auc_score(y_test, optuna_proba_43)
print(decimal.Decimal(auc_43).quantize(decimal.Decimal('1.000')))

0.780


In [588]:
X_train = X_train.values
y_train = y_train.values

In [589]:
auc_bootstrap = []

In [590]:
rs = RandomState(seed = 43)
bootstrap_auc(optuna_43, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75510943, 0.77707862])

In [591]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9986960887908936, pvalue=0.13373291492462158),
 0.7665281808035713)

In [592]:
t_43 = auc_bootstrap
print(t_43)

[0.7664980989405332, 0.7629950230690363, 0.7584559765892003, 0.7624396573820916, 0.7778937756322626, 0.764551649008886, 0.7700278750854408, 0.7634062072795627, 0.7726338217703349, 0.7728047035201641, 0.7640550239234449, 0.76428998632946, 0.7590647428229665, 0.7673551777170198, 0.7651310449419002, 0.7707087320574163, 0.7744227400888585, 0.7598363807245386, 0.7621245941558441, 0.7630804639439507, 0.7669787038619276, 0.7790312072795625, 0.7744494403622694, 0.7627013200615174, 0.7615238380041012, 0.7697048017771702, 0.7654060577580315, 0.7576549683868763, 0.7764039003759399, 0.7610645933014353, 0.7641538149350648, 0.7754213303144224, 0.7694244489063568, 0.7750341763499659, 0.765189785543404, 0.7651283749145591, 0.7631258544087491, 0.7751356373889269, 0.7760701469583049, 0.7641778451811346, 0.7666716507177034, 0.7619403622693097, 0.756877990430622, 0.7598977913533834, 0.7678144224196856, 0.7756002221462748, 0.7601247436773751, 0.7635770890293916, 0.7653499871838688, 0.7695846505468216, 0.76

In [593]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [594]:
## 44
column_to_drop_43 = '소득 중 정부 보조금의 비중(월평균)'

In [595]:
if not column_to_drop_43.startswith('Cat_'):
    comp_44 = comp_43.drop(column_to_drop_43, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']
else:
    comp_44 = comp_43.drop(comp_43.filter(regex='^' + column_to_drop_43).columns, axis=1)
    X_44 = comp_44.drop('target', axis=1)
    y_44 = comp_44['target']

print(X_44.shape)

(6119, 95)


In [596]:
X_train, X_test, y_train, y_test = train_test_split(X_44, y_44, test_size=0.2, shuffle=True, stratify=y_44, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [597]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [598]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 160, 'learning_rate': 0.08, 'max_depth': 4, 'num_leaves': 402, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.7000000000000001, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.8015384101703688


In [599]:
optuna_44 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_44.fit(X_train, y_train)

In [600]:
optuna_proba_44 = optuna_44.predict_proba(X_test)[:, 1]
auc_44 = roc_auc_score(y_test, optuna_proba_44)
print(decimal.Decimal(auc_44).quantize(decimal.Decimal('1.000')))

0.781


In [601]:
X_train = X_train.values
y_train = y_train.values

In [602]:
auc_bootstrap = []

In [603]:
rs = RandomState(seed = 44)
bootstrap_auc(optuna_44, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75779261, 0.77893308])

In [604]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9982903599739075, pvalue=0.03608318790793419),
 0.7685812622821258)

In [605]:
t_44 = auc_bootstrap
print(t_44)

[0.777279669343814, 0.7634462576896788, 0.7717954331852358, 0.7690853554340396, 0.7603623761107314, 0.7636091293574847, 0.7748419343814079, 0.7675394096035544, 0.7634222274436091, 0.775506771189337, 0.7679772940874915, 0.7540664516404648, 0.7827959458304853, 0.7714830399863294, 0.7764893412508544, 0.7607655502392343, 0.7693763884142174, 0.7643193566302119, 0.7605118976418319, 0.7654701384142173, 0.7783583603896104, 0.7589152212918662, 0.7670214242993848, 0.7606961295283664, 0.7743880297334245, 0.7596094284005468, 0.7724255596377306, 0.768869083219412, 0.7739448051948052, 0.7724415798017772, 0.7758351845522898, 0.7703936688311688, 0.7684525589542037, 0.7706980519480519, 0.7670214242993849, 0.7648506920710868, 0.7717900931305537, 0.7694511491797675, 0.7653766874572796, 0.7738113038277512, 0.7776828434723171, 0.7713815789473683, 0.7691120557074504, 0.7582744147300068, 0.7628001110731374, 0.7722306476418318, 0.7733226888243336, 0.7657611713943949, 0.7647038405673273, 0.7601995044429255, 0.

In [606]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [607]:
# 45
column_to_drop_44 = '가구주 나이'

In [608]:
if not column_to_drop_44.startswith('Cat_'):
    comp_45 = comp_44.drop(column_to_drop_44, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']
else:
    comp_45 = comp_44.drop(comp_44.filter(regex='^' + column_to_drop_44).columns, axis=1)
    X_45 = comp_45.drop('target', axis=1)
    y_45 = comp_45['target']

print(X_45.shape)

(6119, 94)


In [609]:
X_train, X_test, y_train, y_test = train_test_split(X_45, y_45, test_size=0.2, shuffle=True, stratify=y_45, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [610]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [611]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 127, 'learning_rate': 0.06999999999999999, 'max_depth': 7, 'num_leaves': 526, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 7, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7957203315553553


In [612]:
optuna_45 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_45.fit(X_train, y_train)

In [613]:
optuna_proba_45 = optuna_45.predict_proba(X_test)[:, 1]
auc_45 = roc_auc_score(y_test, optuna_proba_45)
print(decimal.Decimal(auc_45).quantize(decimal.Decimal('1.000')))

0.775


In [614]:
X_train = X_train.values
y_train = y_train.values

In [615]:
auc_bootstrap = []

In [616]:
rs = RandomState(seed = 45)
bootstrap_auc(optuna_45, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75133955, 0.77331755])

In [617]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9989024996757507, pvalue=0.2512438893318176),
 0.762521816793404)

In [618]:
t_45 = auc_bootstrap
print(t_45)

[0.7734828904647983, 0.7651764354066986, 0.766693010936432, 0.7582717447026658, 0.7629736628503075, 0.7458481074846206, 0.7723187585440876, 0.7619991028708134, 0.7639135124743677, 0.7600419728298018, 0.7582263542378673, 0.7634729579630896, 0.761985752734108, 0.7599752221462748, 0.7631739149008887, 0.7663699376281613, 0.752774158407382, 0.759214264354067, 0.7606400589542037, 0.7552252435064936, 0.7610485731373889, 0.7618202110389611, 0.7562745642515379, 0.7590513926862611, 0.7680226845522898, 0.7564908364661653, 0.7683430878332194, 0.7691414260082023, 0.7718248034859877, 0.7651737653793576, 0.7628508415926178, 0.7701400162337663, 0.7606267088174984, 0.7636465097402598, 0.7684926093643198, 0.7568886705399863, 0.7624476674641147, 0.7680707450444293, 0.7706419813738893, 0.7706339712918662, 0.7645302887901573, 0.7679158834586467, 0.7640870642515378, 0.7509932501708818, 0.7596628289473685, 0.7600473128844839, 0.7699744745386192, 0.7647412209501024, 0.7612434851332877, 0.7680974453178401, 0.7

In [619]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [620]:
# 46.
column_to_drop_45 = 'Cat_이사 계획 첫 번째 이유'

In [621]:
if not column_to_drop_45.startswith('Cat_'):
    comp_46 = comp_45.drop(column_to_drop_45, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']
else:
    comp_46 = comp_45.drop(comp_45.filter(regex='^' + column_to_drop_45).columns, axis=1)
    X_46 = comp_46.drop('target', axis=1)
    y_46 = comp_46['target']

print(X_46.shape)

(6119, 82)


In [622]:
X_train, X_test, y_train, y_test = train_test_split(X_46, y_46, test_size=0.2, shuffle=True, stratify=y_46, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [623]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [624]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 146, 'learning_rate': 0.08, 'max_depth': 5, 'num_leaves': 634, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 10, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7951819297323015


In [625]:
optuna_46 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_46.fit(X_train, y_train)

In [626]:
optuna_proba_46 = optuna_46.predict_proba(X_test)[:, 1]
auc_46 = roc_auc_score(y_test, optuna_proba_46)
print(decimal.Decimal(auc_46).quantize(decimal.Decimal('1.000')))

0.778


In [627]:
X_train = X_train.values
y_train = y_train.values

In [628]:
auc_bootstrap = []

In [629]:
rs = RandomState(seed = 46)
bootstrap_auc(optuna_46, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75767092, 0.77740583])

In [630]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9980925917625427, pvalue=0.01892155595123768),
 0.7679304751580656)

In [631]:
t_46 = auc_bootstrap
print(t_46)

[0.7658653024606972, 0.7679372436773753, 0.7718194634313056, 0.764960163192071, 0.7627360304169515, 0.7729835953520163, 0.7659721035543404, 0.7630297334244702, 0.7674165883458648, 0.7691280758714968, 0.7684338687628162, 0.7671656057758032, 0.7600366327751196, 0.7703669685577581, 0.767291097060834, 0.7699958347573479, 0.7679399137047164, 0.7640763841421736, 0.767526059466849, 0.7639989533492821, 0.7655929596719071, 0.7615291780587833, 0.7669706937799043, 0.761724090054682, 0.7728073735475052, 0.7716379015721121, 0.7685914003759399, 0.7679719540328094, 0.7713708988380041, 0.7757497436773753, 0.7707194121667805, 0.7749073500512644, 0.769096035543404, 0.7716779519822283, 0.7714536696855776, 0.7679586038961039, 0.7769859663362952, 0.7632353255297335, 0.7717660628844839, 0.7665007689678742, 0.7732185577580314, 0.7628802118933697, 0.7637586508885851, 0.771264097744361, 0.7647438909774437, 0.7676782510252906, 0.7639455528024608, 0.7723828392002734, 0.7715791609706083, 0.7767643540669857, 0.767

In [632]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [633]:
# 47.
column_to_drop_46 = '소득 대비 주거관리비의 비율'

In [634]:
if not column_to_drop_46.startswith('Cat_'):
    comp_47 = comp_46.drop(column_to_drop_46, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']
else:
    comp_47 = comp_46.drop(comp_46.filter(regex='^' + column_to_drop_46).columns, axis=1)
    X_47 = comp_47.drop('target', axis=1)
    y_47 = comp_47['target']

print(X_47.shape)

(6119, 81)


In [635]:
X_train, X_test, y_train, y_test = train_test_split(X_47, y_47, test_size=0.2, shuffle=True, stratify=y_47, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [636]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [637]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 126, 'learning_rate': 0.09, 'max_depth': 4, 'num_leaves': 1010, 'subsample': 0.4, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 4, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7936835866743462


In [638]:
optuna_47 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_47.fit(X_train, y_train)

In [639]:
optuna_proba_47 = optuna_47.predict_proba(X_test)[:, 1]
auc_47 = roc_auc_score(y_test, optuna_proba_47)
print(decimal.Decimal(auc_47).quantize(decimal.Decimal('1.000')))

0.780


In [640]:
X_train = X_train.values
y_train = y_train.values

In [641]:
auc_bootstrap = []

In [642]:
rs = RandomState(seed = 47)
bootstrap_auc(optuna_47, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76358176, 0.78228617])

In [643]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9979938268661499, pvalue=0.013728522695600986),
 0.7731911412497864)

In [644]:
t_47 = auc_bootstrap
print(t_47)

[0.7743266191045797, 0.7662844967532467, 0.7715257604237868, 0.7742358381749829, 0.7643674171223513, 0.773805963773069, 0.7798135252904991, 0.7758298444976076, 0.7698810235816815, 0.7818053656869446, 0.7764706510594668, 0.7648253268113465, 0.7683591079972658, 0.767897193267259, 0.753171992481203, 0.7703055579289131, 0.7796212833219412, 0.7741277020676691, 0.7647385509227614, 0.7763945552802461, 0.7744467703349283, 0.7746977529049897, 0.7668371924128503, 0.775674982911825, 0.7760047312884484, 0.7811191686602871, 0.7760541267942583, 0.7765160415242653, 0.7665541695146958, 0.7720303955912509, 0.7704043489405331, 0.7720984812884484, 0.7709810748462065, 0.7780619873547505, 0.7653099367737526, 0.7771461679767602, 0.7739821855775804, 0.7828440063226246, 0.7733573991797675, 0.7695232399179768, 0.7677129613807245, 0.7769352358168147, 0.767093515037594, 0.7744761406356802, 0.7647652511961722, 0.7710705207621326, 0.7835141831852358, 0.7686875213602187, 0.770169386534518, 0.7682202665755298, 0.770

In [645]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [646]:
# 48
column_to_drop_47 = 'Cat_가구주 최종 학력'

In [647]:
if not column_to_drop_47.startswith('Cat_'):
    comp_48 = comp_47.drop(column_to_drop_47, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']
else:
    comp_48 = comp_47.drop(comp_47.filter(regex='^' + column_to_drop_47).columns, axis=1)
    X_48 = comp_48.drop('target', axis=1)
    y_48 = comp_48['target']

print(X_48.shape)

(6119, 78)


In [648]:
X_train, X_test, y_train, y_test = train_test_split(X_48, y_48, test_size=0.2, shuffle=True, stratify=y_48, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [649]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [650]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 142, 'learning_rate': 0.08, 'max_depth': 5, 'num_leaves': 448, 'subsample': 1.0, 'colsample_bytree': 0.4, 'reg_alpha': 7, 'reg_lambda': 3, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.792606783028239


In [651]:
optuna_48 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_48.fit(X_train, y_train)

In [652]:
optuna_proba_48 = optuna_48.predict_proba(X_test)[:, 1]
auc_48 = roc_auc_score(y_test, optuna_proba_48)
print(decimal.Decimal(auc_48).quantize(decimal.Decimal('1.000')))

0.780


In [653]:
X_train = X_train.values
y_train = y_train.values

In [654]:
auc_bootstrap = []

In [655]:
rs = RandomState(seed = 48)
bootstrap_auc(optuna_48, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75931576, 0.77917018])

In [656]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9984193444252014, pvalue=0.05496254563331604),
 0.7695459932234705)

In [657]:
t_48 = auc_bootstrap
print(t_48)

[0.7716832920369106, 0.7596441387559808, 0.7775413320232398, 0.7607388499658236, 0.7755975521189337, 0.7692215268284347, 0.7725750811688311, 0.7801312585440875, 0.7721104964114832, 0.7762810791182502, 0.773504250683527, 0.7692589072112098, 0.7647038405673274, 0.7726712021531101, 0.7749113550922759, 0.7640510188824333, 0.773402789644566, 0.7555883672248804, 0.77106384569378, 0.7674059082365005, 0.7671202153110048, 0.7657945467361585, 0.7722733680792891, 0.7675167143711552, 0.775907275290499, 0.7652725563909775, 0.7644528579972658, 0.7655021787423104, 0.7700545753588517, 0.7701453562884484, 0.7732425880041012, 0.7613396061175666, 0.7670748248462064, 0.7681802161654135, 0.765590289644566, 0.7708208732057418, 0.7691093856801094, 0.7662471163704716, 0.7736671223513328, 0.7617481203007519, 0.7715591357655504, 0.7756963431305537, 0.7727192626452495, 0.7742598684210525, 0.769397748632946, 0.7628641917293234, 0.7730316558441559, 0.770033215140123, 0.7661456553315106, 0.780199344241285, 0.771891

In [658]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [659]:
# 49
column_to_drop_48 = '장기부채부담지표'

In [660]:
if not column_to_drop_48.startswith('Cat_'):
    comp_49 = comp_48.drop(column_to_drop_48, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']
else:
    comp_49 = comp_48.drop(comp_48.filter(regex='^' + column_to_drop_48).columns, axis=1)
    X_49 = comp_49.drop('target', axis=1)
    y_49 = comp_49['target']

print(X_49.shape)

(6119, 77)


In [661]:
X_train, X_test, y_train, y_test = train_test_split(X_49, y_49, test_size=0.2, shuffle=True, stratify=y_49, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [662]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [663]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 141, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 932, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'reg_alpha': 6, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7916468417933371


In [664]:
optuna_49 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_49.fit(X_train, y_train)


In [665]:
optuna_proba_49 = optuna_49.predict_proba(X_test)[:, 1]
auc_49 = roc_auc_score(y_test, optuna_proba_49)
print(decimal.Decimal(auc_49).quantize(decimal.Decimal('1.000')))

0.782


In [666]:
X_train = X_train.values
y_train = y_train.values

In [667]:
auc_bootstrap = []

In [668]:
rs = RandomState(seed = 49)
bootstrap_auc(optuna_49, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76591547, 0.78254246])

In [669]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9995548725128174, pvalue=0.9479307532310486),
 0.7744074782125769)

In [670]:
t_49 = auc_bootstrap
print(t_49)

[0.7813594711209844, 0.7742011278195489, 0.7719850051264524, 0.7747645035885169, 0.7732492630724537, 0.7752597936602871, 0.7804850371667806, 0.7878089221633628, 0.7744614554853042, 0.7746617075358853, 0.7724602699931649, 0.7757497436773753, 0.7705405203349283, 0.7715297654647983, 0.7761021872863978, 0.7790806027853725, 0.7806105284518113, 0.7781714584757349, 0.7782195189678742, 0.7753438995215312, 0.7763504998291182, 0.7745642515379357, 0.7803368506493507, 0.7765494168660287, 0.775840524606972, 0.7710504955570745, 0.771588506066302, 0.7777202238550923, 0.7715297654647983, 0.7753799448906357, 0.7760060663021189, 0.7739154348940533, 0.7800431476418318, 0.7768364448051948, 0.7772222637559809, 0.7725804212235133, 0.7734655352870814, 0.7695085547676008, 0.7760741519993164, 0.7727192626452495, 0.7736671223513328, 0.7727499679596719, 0.7749113550922762, 0.7715591357655502, 0.7777602742652084, 0.772203947368421, 0.7741103468899522, 0.7666569655673274, 0.7747324632604238, 0.7773851354237867, 0.

In [671]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [672]:
# 50
column_to_drop_49 = 'Cat_현재 주택의 유형'

In [673]:
if not column_to_drop_49.startswith('Cat_'):
    comp_50 = comp_49.drop(column_to_drop_49, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']
else:
    comp_50 = comp_49.drop(comp_49.filter(regex='^' + column_to_drop_49).columns, axis=1)
    X_50 = comp_50.drop('target', axis=1)
    y_50 = comp_50['target']

print(X_50.shape)

(6119, 66)


In [674]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y_50, test_size=0.2, shuffle=True, stratify=y_50, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [675]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [676]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 164, 'learning_rate': 0.09999999999999999, 'max_depth': 7, 'num_leaves': 680, 'subsample': 0.5, 'colsample_bytree': 0.2, 'reg_alpha': 6, 'reg_lambda': 2, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.792527483534921


In [677]:
optuna_50 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_50.fit(X_train, y_train)

In [678]:
optuna_proba_50 = optuna_50.predict_proba(X_test)[:, 1]
auc_50 = roc_auc_score(y_test, optuna_proba_50)
print(decimal.Decimal(auc_50).quantize(decimal.Decimal('1.000')))

0.783


In [679]:
X_train = X_train.values
y_train = y_train.values

In [680]:
auc_bootstrap = []

In [681]:
rs = RandomState(seed = 50)
bootstrap_auc(optuna_50, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.76026278, 0.77954485])

In [682]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9991621375083923, pvalue=0.5064297914505005),
 0.7700753989020848)

In [683]:
t_50 = auc_bootstrap
print(t_50)

[0.7748446044087491, 0.7798108552631579, 0.7697688824333562, 0.7673765379357484, 0.7688076725905674, 0.7653713474025973, 0.7586135082023241, 0.7620978938824333, 0.7690506450786055, 0.7783690404989747, 0.776038106630212, 0.7692642472658919, 0.7802353896103896, 0.7653633373205742, 0.7723134184894054, 0.7740249060150376, 0.7727352828092959, 0.7660655545112782, 0.773069036226931, 0.7640683740601504, 0.7626933099794941, 0.7725937713602187, 0.7709730647641831, 0.7746523624401914, 0.7760567968215994, 0.776644202836637, 0.7659747735816815, 0.7700091848940533, 0.7647625811688312, 0.7632753759398495, 0.7699424342105262, 0.7718141233766234, 0.7705245001708817, 0.7716832920369106, 0.7715204203691045, 0.7664513734620643, 0.7657985517771703, 0.7667651016746411, 0.7659373931989064, 0.775004806049214, 0.7707060620300753, 0.7703215780929596, 0.7736537722146275, 0.7741210269993164, 0.7662524564251538, 0.7663699376281613, 0.7673364875256323, 0.7670240943267259, 0.7699584543745728, 0.7693496881408066, 0.7

In [684]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [685]:
# 51
column_to_drop_50 = 'Cat_이사 계획 중인 주택의 점유형태'

In [686]:
if not column_to_drop_50.startswith('Cat_'):
    comp_51 = comp_50.drop(column_to_drop_50, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']
else:
    comp_51 = comp_50.drop(comp_50.filter(regex='^' + column_to_drop_50).columns, axis=1)
    X_51 = comp_51.drop('target', axis=1)
    y_51 = comp_51['target']

print(X_51.shape)

(6119, 46)


In [687]:
X_train, X_test, y_train, y_test = train_test_split(X_51, y_51, test_size=0.2, shuffle=True, stratify=y_51, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [688]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [689]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 189, 'learning_rate': 0.08, 'max_depth': 4, 'num_leaves': 782, 'subsample': 0.6, 'colsample_bytree': 0.5, 'reg_alpha': 4, 'reg_lambda': 5, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7938672276062405


In [690]:
optuna_51 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_51.fit(X_train, y_train)

In [691]:
optuna_proba_51 = optuna_51.predict_proba(X_test)[:, 1]
auc_51 = roc_auc_score(y_test, optuna_proba_51)
print(decimal.Decimal(auc_51).quantize(decimal.Decimal('1.000')))

0.781


In [692]:
X_train = X_train.values
y_train = y_train.values

In [693]:
auc_bootstrap = []

In [694]:
rs = RandomState(seed = 51)
bootstrap_auc(optuna_51, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75835952, 0.77822222])

In [695]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9983912110328674, pvalue=0.05015162006020546),
 0.7685324588548786)

In [696]:
t_51 = auc_bootstrap
print(t_51)

[0.7659280481032125, 0.7762330186261107, 0.7738219839371155, 0.7699637944292549, 0.7652191558441559, 0.7613796565276827, 0.7741637474367736, 0.7747378033151059, 0.7776828434723171, 0.7657958817498292, 0.76752872949419, 0.7696700914217363, 0.7652431860902256, 0.7706873718386876, 0.7726605220437457, 0.7648346719070404, 0.7666850008544087, 0.7724469198564593, 0.765874647556391, 0.7739554853041695, 0.7626292293233082, 0.7678264375427205, 0.7633634868421052, 0.7680120044429255, 0.7756416075700615, 0.7674299384825701, 0.7673364875256322, 0.7640950743335613, 0.7669706937799042, 0.7692909475393029, 0.7537567284688995, 0.7644221526828435, 0.7647652511961723, 0.7727913533834587, 0.764554319036227, 0.7625184231886536, 0.7753171992481204, 0.7732158877306904, 0.7668892579460014, 0.769667421394395, 0.7627413704716336, 0.7803448607313739, 0.7648012965652768, 0.7716605968045114, 0.7740542763157895, 0.773974175495557, 0.7653660073479153, 0.7697048017771702, 0.7689838943950784, 0.7663779477101844, 0.775

In [697]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [698]:
# 52
column_to_drop_51 = 'Cat_현재 거주 지역'

In [699]:
if not column_to_drop_51.startswith('Cat_'):
    comp_52 = comp_51.drop(column_to_drop_51, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']
else:
    comp_52 = comp_51.drop(comp_51.filter(regex='^' + column_to_drop_51).columns, axis=1)
    X_52 = comp_52.drop('target', axis=1)
    y_52 = comp_52['target']

print(X_52.shape)

(6119, 29)


In [700]:
X_train, X_test, y_train, y_test = train_test_split(X_52, y_52, test_size=0.2, shuffle=True, stratify=y_52, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [701]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [702]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 176, 'learning_rate': 0.06999999999999999, 'max_depth': 6, 'num_leaves': 398, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 8, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7879510680389653


In [703]:
optuna_52 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_52.fit(X_train, y_train)

In [704]:
optuna_proba_52 = optuna_52.predict_proba(X_test)[:, 1]
auc_52 = roc_auc_score(y_test, optuna_proba_52)
print(decimal.Decimal(auc_52).quantize(decimal.Decimal('1.000')))

0.771


In [705]:
X_train = X_train.values
y_train = y_train.values

In [706]:
auc_bootstrap = []

In [707]:
rs = RandomState(seed = 52)
bootstrap_auc(optuna_52, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74676486, 0.76858486])

In [708]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9974387884140015, pvalue=0.0023546924348920584),
 0.7581138726610561)

In [709]:
t_52 = auc_bootstrap
print(t_52)

[0.7517582130041012, 0.7592863550922762, 0.7527034026828434, 0.7599845672419686, 0.7649521531100478, 0.7525031506322624, 0.7562972594839371, 0.7580661525974025, 0.7477705271701982, 0.7620111179938482, 0.7645663341592618, 0.7533615644224196, 0.7550610368250171, 0.7605960035030759, 0.7496689166097061, 0.7472311816473001, 0.7548260744190021, 0.7596721740430622, 0.7576576384142173, 0.7598243656015037, 0.7669293083561176, 0.7648920774948736, 0.7529250149521531, 0.7631659048188655, 0.7546338324504442, 0.7572304340396446, 0.7624156271360218, 0.752391009483937, 0.752037230861244, 0.7537313632091593, 0.753073201469583, 0.7655876196172249, 0.759463911910458, 0.760314315618592, 0.7630444185748463, 0.7686968664559125, 0.7621085739917977, 0.7572744894907724, 0.7546204823137388, 0.7541665776657552, 0.7657825316131237, 0.7662885017942584, 0.7586482185577581, 0.7424798679938482, 0.7712106971975393, 0.7687022065105947, 0.7581409133629526, 0.7647986265379357, 0.7460763948222828, 0.7600593280075187, 0.76

In [710]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [711]:
# 53
column_to_drop_52 = '소득 대비 주택 임대료의 비율'

In [712]:
if not column_to_drop_52.startswith('Cat_'):
    comp_53 = comp_52.drop(column_to_drop_52, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']
else:
    comp_53 = comp_52.drop(comp_52.filter(regex='^' + column_to_drop_52).columns, axis=1)
    X_53 = comp_53.drop('target', axis=1)
    y_53 = comp_53['target']

print(X_53.shape)

(6119, 28)


In [713]:
X_train, X_test, y_train, y_test = train_test_split(X_53, y_53, test_size=0.2, shuffle=True, stratify=y_53, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [714]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [715]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 198, 'learning_rate': 0.060000000000000005, 'max_depth': 8, 'num_leaves': 438, 'subsample': 0.4, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7874648369351998


In [716]:
optuna_53 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_53.fit(X_train, y_train)

In [717]:
optuna_proba_53 = optuna_53.predict_proba(X_test)[:, 1]
auc_53 = roc_auc_score(y_test, optuna_proba_53)
print(decimal.Decimal(auc_53).quantize(decimal.Decimal('1.000')))

0.777


In [718]:
X_train = X_train.values
y_train = y_train.values

In [719]:
auc_bootstrap = []

In [720]:
rs = RandomState(seed = 53)
bootstrap_auc(optuna_53, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.75840084, 0.77599352])

In [721]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9988407492637634, pvalue=0.20900993049144745),
 0.7674096723075445)

In [722]:
t_53 = auc_bootstrap
print(t_53)

[0.7643780972317157, 0.7696166908749146, 0.7663993079289133, 0.760403761534518, 0.7760754870129869, 0.7628161312371838, 0.7701867417122351, 0.7679826341421736, 0.7677356566131237, 0.7723961893369788, 0.7631311944634314, 0.7660361842105263, 0.7624783727785371, 0.7658479472829802, 0.771899564251538, 0.7693109727443609, 0.7710598406527682, 0.7676048252734108, 0.768248301862611, 0.7712373974709501, 0.7672697368421052, 0.7702708475734792, 0.771701982228298, 0.7701800666438825, 0.7618602614490773, 0.7695085547676009, 0.7672977721291866, 0.7710077751196172, 0.7670801649008886, 0.7628348214285715, 0.7757817840054684, 0.758326480263158, 0.7675701149179768, 0.7622994809466849, 0.7744387602529049, 0.7632606907894737, 0.7627747458133971, 0.7693924085782639, 0.7743813546650717, 0.7725283556903622, 0.7590006621667805, 0.7649855284518113, 0.7640710440874916, 0.7704871197881066, 0.756741819036227, 0.7697128118591934, 0.7756416075700614, 0.7761595928742311, 0.768025354579631, 0.7664700636534518, 0.7599

In [723]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [724]:
# 54
column_to_drop_53 = 'Cat_주택 마련 예상 소요연수'

In [725]:
if not column_to_drop_53.startswith('Cat_'):
    comp_54 = comp_53.drop(column_to_drop_53, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']
else:
    comp_54 = comp_53.drop(comp_53.filter(regex='^' + column_to_drop_53).columns, axis=1)
    X_54 = comp_54.drop('target', axis=1)
    y_54 = comp_54['target']

print(X_54.shape)

(6119, 22)


In [726]:
X_train, X_test, y_train, y_test = train_test_split(X_54, y_54, test_size=0.2, shuffle=True, stratify=y_54, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [727]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [728]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 139, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 610, 'subsample': 0.1, 'colsample_bytree': 0.9, 'reg_alpha': 5, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7838713177906327


In [729]:
optuna_54 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_54.fit(X_train, y_train)

In [730]:
optuna_proba_54 = optuna_54.predict_proba(X_test)[:, 1]
auc_54 = roc_auc_score(y_test, optuna_proba_54)
print(decimal.Decimal(auc_54).quantize(decimal.Decimal('1.000')))

0.766


In [731]:
X_train = X_train.values
y_train = y_train.values

In [732]:
auc_bootstrap = []

In [733]:
rs = RandomState(seed = 54)
bootstrap_auc(optuna_54, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.7480428 , 0.76618324])

In [734]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9952969551086426, pvalue=6.1495275076595135e-06),
 0.7576593465642089)

In [735]:
t_54 = auc_bootstrap
print(t_54)

[0.7505580357142858, 0.7508624188311688, 0.7572918446684894, 0.7537914388243336, 0.7616466592617908, 0.7609818224538619, 0.755157157809296, 0.7580047419685578, 0.7623301862611074, 0.765123034859877, 0.7648346719070402, 0.7610111927546138, 0.7531332770847573, 0.7597108894395077, 0.7509198244190021, 0.7559915413533834, 0.7608910415242653, 0.7568058996924129, 0.7617307651230347, 0.7563506600307587, 0.758544087491456, 0.756648368079289, 0.7608683462918661, 0.7604544920539986, 0.7679599389097744, 0.7528035287081339, 0.7556657980177717, 0.7564320958646616, 0.7612822005297334, 0.7640950743335612, 0.7566817434210527, 0.759399831254272, 0.7609391020164046, 0.7542359983766235, 0.7655715994531784, 0.7486716613978127, 0.7564654712064252, 0.7606200337491456, 0.7596801841250855, 0.7615825786056049, 0.7476904263499659, 0.7592916951469584, 0.7539916908749147, 0.7576109129357484, 0.7524217147983595, 0.7559661760936432, 0.7438162166780588, 0.7481683612440191, 0.7561957984449761, 0.7599311666951469, 0.75

In [736]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [737]:
# 55
column_to_drop_54 = 'Cat_이사 계획 중인 거주 지역'

In [738]:
if not column_to_drop_54.startswith('Cat_'):
    comp_55 = comp_54.drop(column_to_drop_54, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']
else:
    comp_55 = comp_54.drop(comp_54.filter(regex='^' + column_to_drop_54).columns, axis=1)
    X_55 = comp_55.drop('target', axis=1)
    y_55 = comp_55['target']

print(X_55.shape)

(6119, 15)


In [739]:
X_train, X_test, y_train, y_test = train_test_split(X_55, y_55, test_size=0.2, shuffle=True, stratify=y_55, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [740]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [741]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 140, 'learning_rate': 0.09999999999999999, 'max_depth': 3, 'num_leaves': 796, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.5, 'reg_alpha': 4, 'reg_lambda': 1, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7728779038222355


In [742]:
optuna_55 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_55.fit(X_train, y_train)

In [743]:
optuna_proba_55 = optuna_55.predict_proba(X_test)[:, 1]
auc_55 = roc_auc_score(y_test, optuna_proba_55)
print(decimal.Decimal(auc_55).quantize(decimal.Decimal('1.000')))

0.757


In [744]:
X_train = X_train.values
y_train = y_train.values

In [745]:
auc_bootstrap = []

In [746]:
rs = RandomState(seed = 55)
bootstrap_auc(optuna_55, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.74121845, 0.75746686])

In [747]:
shapiro(auc_bootstrap), np.mean(auc_bootstrap)

(ShapiroResult(statistic=0.9968487024307251, pvalue=0.00039872623165138066),
 0.749697881733809)

In [748]:
t_55 = auc_bootstrap
print(t_55)

[0.7461204502734109, 0.752400354579631, 0.7399620322112099, 0.7566069826555024, 0.7472378567156528, 0.7563333048530417, 0.753557811431989, 0.7448909026828434, 0.7557792741797675, 0.7558940853554341, 0.7511894971804511, 0.7601354237867395, 0.7424344775290499, 0.7559020954374573, 0.7531506322624745, 0.7562545390464799, 0.7570341870300752, 0.7505607057416268, 0.7523990195659604, 0.7521266767771703, 0.7496916118421053, 0.7527848385167464, 0.7410140229835953, 0.7469201234620643, 0.7516140315276829, 0.7495127200102529, 0.7436253097231715, 0.7523295988550922, 0.7475021894224196, 0.7517995984278878, 0.7546792229152426, 0.7424224624060151, 0.7469855391319207, 0.7492750875768968, 0.7504645847573479, 0.7493364982057417, 0.7520265507518796, 0.7339184253246753, 0.7482217617908407, 0.752234812884484, 0.75046992481203, 0.7527941836124401, 0.7531639823991798, 0.7545390464798359, 0.7460403494531784, 0.7562825743335612, 0.7528515892002734, 0.7504191942925496, 0.7483512581168831, 0.7485848855092276, 0.74

In [749]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [750]:
# 56
column_to_drop_55 = '중기부채부담지표'

In [751]:
if not column_to_drop_55.startswith('Cat_'):
    comp_56 = comp_55.drop(column_to_drop_55, axis=1)
    X_56 = comp_56.drop('target', axis=1)
    y_56 = comp_56['target']
else:
    comp_56 = comp_55.drop(comp_55.filter(regex='^' + column_to_drop_55).columns, axis=1)
    X_56 = comp_56.drop('target', axis=1)
    y_56 = comp_56['target']

print(X_56.shape)

(6119, 14)


In [752]:
X_train, X_test, y_train, y_test = train_test_split(X_56, y_56, test_size=0.2, shuffle=True, stratify=y_56, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [753]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [754]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 80, 'learning_rate': 0.09, 'max_depth': 6, 'num_leaves': 494, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 5, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.769171695924006


In [755]:
optuna_56 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_56.fit(X_train, y_train)

In [756]:
optuna_proba_56 = optuna_56.predict_proba(X_test)[:, 1]
auc_56 = roc_auc_score(y_test, optuna_proba_56)
print(decimal.Decimal(auc_56).quantize(decimal.Decimal('1.000')))

0.749


In [757]:
X_train = X_train.values
y_train = y_train.values

In [758]:
auc_bootstrap = []

In [759]:
rs = RandomState(seed = 56)
bootstrap_auc(optuna_56, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.73455383, 0.74956499])

In [760]:
np.mean(auc_bootstrap)

0.7423326934167805

In [761]:
t_56 = auc_bootstrap
print(t_56)

[0.7493044578776487, 0.7473406527682843, 0.7359876965140124, 0.7478279327580314, 0.7449523133116883, 0.7427935962064252, 0.7458187371838687, 0.7500253652597404, 0.7406549043062202, 0.7490841806220097, 0.7427054853041695, 0.7450217340225564, 0.7346139674470267, 0.740249060150376, 0.7437214307074504, 0.7398058356117566, 0.7366378481715653, 0.7455864448051948, 0.7430966443096377, 0.7462552866541354, 0.7458574525803143, 0.7368180750170882, 0.7483552631578947, 0.7400608232228298, 0.7461685107655502, 0.7410847787081339, 0.7475489149008886, 0.7428243015208476, 0.7279709394224197, 0.7413144010594669, 0.7475395698051948, 0.7420873739747094, 0.7396896894224196, 0.7424545027341081, 0.7432221355946684, 0.7422876260252904, 0.7400835184552289, 0.7390835932159945, 0.7463460675837321, 0.7468026422590568, 0.7490721654989747, 0.7409552823820915, 0.745258031442242, 0.7437387858851676, 0.7400501431134655, 0.7394667421394395, 0.7446012047163364, 0.7470856651572112, 0.7448655374231032, 0.7425719839371154, 0

In [762]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [763]:
# 57
column_to_drop_56 = 'Cat_현재 주택의 점유형태'

In [764]:
if not column_to_drop_56.startswith('Cat_'):
    comp_57 = comp_56.drop(column_to_drop_56, axis=1)
    X_57 = comp_57.drop('target', axis=1)
    y_57 = comp_57['target']
else:
    comp_57 = comp_56.drop(comp_56.filter(regex='^' + column_to_drop_56).columns, axis=1)
    X_57 = comp_57.drop('target', axis=1)
    y_57 = comp_57['target']

print(X_57.shape)

(6119, 10)


In [765]:
X_train, X_test, y_train, y_test = train_test_split(X_57, y_57, test_size=0.2, shuffle=True, stratify=y_57, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [766]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [767]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 125, 'learning_rate': 0.08, 'max_depth': 6, 'num_leaves': 424, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.9, 'reg_alpha': 6, 'reg_lambda': 4, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7620368283541599


In [768]:
optuna_57 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_57.fit(X_train, y_train)

In [769]:
optuna_proba_57 = optuna_57.predict_proba(X_test)[:, 1]
auc_57 = roc_auc_score(y_test, optuna_proba_57)
print(decimal.Decimal(auc_57).quantize(decimal.Decimal('1.000')))

0.734


In [770]:
X_train = X_train.values
y_train = y_train.values

In [771]:
auc_bootstrap = []

In [772]:
rs = RandomState(seed = 57)
bootstrap_auc(optuna_57, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.71637685, 0.73563302])

In [773]:
np.mean(auc_bootstrap)

0.7266035376527256

In [774]:
t_57 = auc_bootstrap
print(t_57)

[0.7237683163875598, 0.729296607997266, 0.7245573094668489, 0.7279762794771018, 0.716379015721121, 0.7282953477443608, 0.727143230946685, 0.71845362696514, 0.7308946193609022, 0.7281458262132605, 0.7295088751708818, 0.734340289644566, 0.7262247415413534, 0.7244171330314423, 0.7233130767259057, 0.7261539858168148, 0.7268882433356116, 0.718732644822283, 0.7237242609364319, 0.7211129741968558, 0.7336367374401914, 0.7270604600991115, 0.7266799812030076, 0.7280323500512645, 0.7222450657894738, 0.7222050153793576, 0.7251153451811346, 0.7235533791866028, 0.7271952964798359, 0.7278254229323309, 0.7273047676008202, 0.7223291716507177, 0.718448286910458, 0.7230687692241968, 0.7299787999829119, 0.7176713089542038, 0.7349063354408749, 0.7232663512474367, 0.725704086209843, 0.7301136363636362, 0.7340105412679426, 0.7237830015379357, 0.7287399072966507, 0.7301443416780586, 0.7210702537593985, 0.7353829353212576, 0.7229993485133288, 0.719069068267259, 0.7337008180963774, 0.7341413726076554, 0.7288093

In [775]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc

In [776]:
# 58
column_to_drop_57 = '현재 주택의 면적(㎡)'

In [777]:
if not column_to_drop_57.startswith('Cat_'):
    comp_58 = comp_57.drop(column_to_drop_57, axis=1)
    X_58 = comp_58.drop('target', axis=1)
    y_58 = comp_58['target']
else:
    comp_58 = comp_57.drop(comp_57.filter(regex='^' + column_to_drop_57).columns, axis=1)
    X_58 = comp_58.drop('target', axis=1)
    y_58 = comp_58['target']

print(X_58.shape)

(6119, 9)


In [778]:
X_train, X_test, y_train, y_test = train_test_split(X_58, y_58, test_size=0.2, shuffle=True, stratify=y_58, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state = 0)

In [779]:
def objective(trial):
    params = {
        'n_estimators' :trial.suggest_int('n_estimators', 50, 200),
        'learning_rate' :trial.suggest_float('learning_rate',0.01, 0.1,step=0.01),
        'max_depth' :trial.suggest_int('max_depth', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=2),
        'subsample':trial.suggest_float('subsample', 0.1, 1.0,step=0.1),
        'colsample_bytree' :trial.suggest_float('colsample_bytree', 0.1, 1.0,step=0.1),
        'reg_alpha' :trial.suggest_int('reg_alpha', 1, 10),
        'reg_lambda' :trial.suggest_int('reg_lambda', 1, 10),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt']),
        'objective' : trial.suggest_categorical('objective', ['binary'])
        }
        
    clf = LGBMClassifier(**params,random_state = 0)
    clf.fit(X_train, y_train)
    clf_proba = clf.predict_proba(X_val)[:, 1]
        
    AUC_SCORE = roc_auc_score(y_val, clf_proba)
    return AUC_SCORE

direction = "maximize"
sampler = optuna.samplers.TPESampler(seed=10)
study =  optuna.create_study(direction=direction, sampler=sampler, pruner=optuna.pruners.HyperbandPruner())
early_stopping = EarlyStoppingCallback(10, direction=direction)
study.optimize(objective, callbacks=[early_stopping], n_trials=200) 

In [780]:
print(study.best_trial.params)
optuna_auc = study.best_trial.value
print(optuna_auc)

{'n_estimators': 166, 'learning_rate': 0.01, 'max_depth': 7, 'num_leaves': 768, 'subsample': 0.5, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 2, 'reg_lambda': 8, 'boosting_type': 'gbdt', 'objective': 'binary'}
0.7172597434035342


In [781]:
optuna_58 = LGBMClassifier(**study.best_trial.params, random_state = 0)
optuna_58.fit(X_train, y_train)

In [782]:
optuna_proba_58 = optuna_58.predict_proba(X_test)[:, 1]
auc_58 = roc_auc_score(y_test, optuna_proba_58)
print(decimal.Decimal(auc_58).quantize(decimal.Decimal('1.000')))

0.706


In [783]:
X_train = X_train.values
y_train = y_train.values

In [784]:
auc_bootstrap = []

In [785]:
rs = RandomState(seed = 58)
bootstrap_auc(optuna_58, X_train, y_train, X_test, y_test, nsamples=2000)

array([0.70177931, 0.7079444 ])

In [786]:
np.mean(auc_bootstrap)

0.7060823363006237

In [787]:
t_58 = auc_bootstrap
print(t_58)

[0.706453189080656, 0.706453189080656, 0.7057683270676691, 0.706453189080656, 0.7059245236671222, 0.706453189080656, 0.7064251537935747, 0.7059245236671222, 0.706453189080656, 0.7064251537935747, 0.7072421821599453, 0.7017793062200957, 0.7064812243677374, 0.7066854814593301, 0.706453189080656, 0.706453189080656, 0.7013067113807244, 0.7064251537935747, 0.706453189080656, 0.7058964883800409, 0.706453189080656, 0.7064251537935747, 0.706453189080656, 0.7032985517771702, 0.7074157339371154, 0.7010770890293916, 0.7058964883800409, 0.706453189080656, 0.7059245236671222, 0.7079443993506492, 0.7057963623547504, 0.706453189080656, 0.7079443993506492, 0.7066854814593301, 0.7064812243677374, 0.7059525589542036, 0.706453189080656, 0.7072875726247436, 0.7059245236671222, 0.706453189080656, 0.7057683270676691, 0.706453189080656, 0.7074157339371154, 0.706453189080656, 0.7064812243677374, 0.7059245236671222, 0.7064812243677374, 0.7079443993506492, 0.706453189080656, 0.7072875726247436, 0.70183537679425

In [788]:
del X_train, X_test, y_train, y_test, X_val, y_val,study, optuna_auc