In [1]:
# ================================================== [ setting ] =======================================================
import pandas as pd 
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold 
from xgboost import XGBClassifier
import lightgbm as lgb 
from bayes_opt import BayesianOptimization
from functools import partial 

import os 
# Working directory that holds train.csv / test.csv (both are read with relative paths below).
# The original location stays the default; set SANTANDER_DATA_DIR to run on another machine.
os.chdir(os.environ.get("SANTANDER_DATA_DIR",
                        "C:/Users/knuser/Desktop/Santander Customer Satisfaction"))

  return f(*args, **kwds)


In [3]:
# =============================================== [ preprocessing ] ====================================================
def Preprocessing(file_path, missing_code = -999999) : 
    """Load a CSV file and prepare it for modelling.

    Steps:
      1. read the file into a DataFrame,
      2. replace the `missing_code` sentinel in column "var3" with the median of the
         remaining (valid) "var3" values,
      3. drop the "ID" column,
      4. split the frame into "every column but the last" and "the last column".

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; it must contain the columns "var3" and "ID".
    missing_code : scalar, default -999999
        Value in "var3" that marks a missing entry.

    Returns
    -------
    df : pandas.DataFrame
        The cleaned frame without "ID".
    X_features : pandas.DataFrame
        All columns of `df` except the last one.
    y_labels : pandas.Series
        The last column of `df`.

    Notes
    -----
    The last column is always treated as the label. For a file without a label column
    the last feature therefore ends up in `y_labels`; use `df` in that case.
    """
    df = pd.read_csv(file_path)

    # The median must be taken over the valid rows only: including the sentinel rows
    # drags the median towards (or onto) the sentinel itself.
    is_missing = df["var3"] == missing_code
    if is_missing.any() and not is_missing.all() :
        fill_value = df.loc[~is_missing, "var3"].median()
        df["var3"] = df["var3"].mask(is_missing, fill_value)

    # Non-mutating drop keeps the function free of in-place side effects.
    df = df.drop(columns = "ID")
    
    X_features = df.iloc[:, :-1]
    y_labels = df.iloc[:, -1]
    
    return df, X_features, y_labels  

# Training data: the cleaned frame plus the feature / label split used for cross-validation.
train_df, X_features, y_labels = Preprocessing(file_path = "train.csv")

# Test data: only the cleaned frame is kept; the feature / label split is discarded.
test_df, _, _ = Preprocessing(file_path = "test.csv")

In [22]:
# ================================================= [ modeling ] =======================================================
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda, x_data = None, y_data = None, n_splits = 5, output = "score") :
    """Cross-validate a LightGBM classifier for one hyper-parameter setting.

    One `LGBMClassifier` is fitted per stratified fold and scored with ROC-AUC on the
    held-out part of that fold.

    Parameters
    ----------
    num_leaves, n_estimators : number
        Cast to `int` before use, so float values (as proposed by the optimiser) are fine.
    learning_rate, reg_alpha, reg_lambda : float
        Passed to `LGBMClassifier` unchanged.
    subsample, colsample_bytree : float
        Clipped into (0, 1] before use.
    x_data : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    y_data : array-like
        Binary labels aligned row-by-row with `x_data`.
    n_splits : int, default 5
        Number of stratified folds.
    output : {"score", "model"}, default "score"
        "score" returns the mean ROC-AUC over the folds, "model" returns the list of
        fitted fold models.

    Returns
    -------
    float or list
        See `output`.

    Raises
    ------
    ValueError
        If `output` is not one of the supported values.
    """
    # "socre" was the historical (misspelled) default and made the function return None;
    # keep accepting it, but treat it as "score".
    if output == "socre" :
        output = "score"
    if output not in ("score", "model") :
        raise ValueError("output must be 'score' or 'model', got {!r}".format(output))

    # Fold indices are positions, not labels: index a plain array so the lookup is correct
    # even when `y_data` is a Series whose index is not 0..n-1.
    y_array = np.asarray(y_data)

    # A fraction of exactly 0 is rejected by LightGBM, and the optimiser may probe the
    # lower bound of its search interval, so keep the fractions strictly positive.
    min_fraction = 1e-3

    kf = StratifiedKFold(n_splits = n_splits)
    models = []
    fold_scores = []
    
    for train_index, valid_index in kf.split(x_data, y_array) : 
        x_train, y_train = x_data.iloc[train_index], y_array[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_array[valid_index]
        
        # NOTE(review): LightGBM documents that row subsampling is only active when
        # `subsample_freq` is non-zero, which is not set here - confirm whether
        # `subsample` is meant to have an effect.
        model = lgb.LGBMClassifier(num_leaves = int(num_leaves), 
                                   learning_rate = learning_rate, 
                                   n_estimators = int(n_estimators), 
                                   subsample = np.clip(subsample, min_fraction, 1), 
                                   colsample_bytree = np.clip(colsample_bytree, min_fraction, 1), 
                                   reg_alpha = reg_alpha, 
                                   reg_lambda = reg_lambda,)
        
        model.fit(x_train, y_train)
        models.append(model)
    
        # Probability of the positive class on the held-out fold.
        pred = model.predict_proba(x_valid)[:, 1]    
        fold_scores.append(roc_auc_score(y_valid, pred))
        
    if output == "score" :
        return float(np.mean(fold_scores))
    return models

In [23]:
# Freeze the data and CV arguments so that only the seven hyper-parameters remain free.
func_fixed = partial(
    lgb_cv,
    x_data = X_features,
    y_data = y_labels,
    n_splits = 5,
    output = 'score',
)

# Each hyper-parameter is searched inside its (lower, upper) interval.
lgbBO = BayesianOptimization(
    f = func_fixed,
    pbounds = dict(
        num_leaves = (16, 1024),
        learning_rate = (0.0001, 0.1),
        n_estimators = (16, 1024),
        subsample = (0, 1),
        colsample_bytree = (0, 1),
        reg_alpha = (0, 10),
        reg_lambda = (0, 50),
    ),
    random_state = 2109,
)

lgbBO.maximize(init_points = 5, n_iter = 30)

|   iter    |  target   | colsam... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8187  [0m | [0m 0.6128  [0m | [0m 0.09316 [0m | [0m 1.001e+0[0m | [0m 436.0   [0m | [0m 3.19    [0m | [0m 22.89   [0m | [0m 0.5532  [0m |
| [0m 2       [0m | [0m 0.8035  [0m | [0m 0.3464  [0m | [0m 0.01621 [0m | [0m 20.21   [0m | [0m 71.48   [0m | [0m 1.063   [0m | [0m 37.63   [0m | [0m 0.2233  [0m |
| [95m 3       [0m | [95m 0.8332  [0m | [95m 0.09816 [0m | [95m 0.02952 [0m | [95m 492.0   [0m | [95m 128.9   [0m | [95m 8.813   [0m | [95m 7.464   [0m | [95m 0.5725  [0m |
| [0m 4       [0m | [0m 0.8219  [0m | [0m 0.02332 [0m | [0m 0.07836 [0m | [0m 498.4   [0m | [0m 833.4   [0m | [0m 0.4697  [0m | [0m 23.6    [0m | [0m 0.2591  [0m |
| [0m 5       [0m | [0m 0.8124  [0m | 

In [35]:
# Refit one LightGBM model per fold with the best hyper-parameters found by the search.
# ROC-AUC : 0.839
params = lgbBO.max['params']
models = lgb_cv(
    **params,  # keys are exactly the seven hyper-parameter names of lgb_cv
    x_data = X_features,
    y_data = y_labels,
    n_splits = 5,
    output = 'model',
)

In [40]:
# One positive-class probability vector per LightGBM fold model ...
preds = []
for model in models :
    preds.append(model.predict_proba(test_df)[:, 1])

# ... averaged element-wise into the ensemble prediction.
pred = np.mean(preds, axis = 0)

In [46]:
# ================================================= [ modeling (2) ] ===================================================
def xgb_cv(max_leaves, eta, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda, x_data = None, y_data = None, n_splits = 5, output = "score") :
    """Cross-validate an XGBoost classifier for one hyper-parameter setting.

    One `XGBClassifier` is fitted per stratified fold and scored with ROC-AUC on the
    held-out part of that fold.

    Parameters
    ----------
    max_leaves, n_estimators : number
        Cast to `int` before use, so float values (as proposed by the optimiser) are fine.
    eta : float
        Learning rate; handed to the classifier as `learning_rate`.
    reg_alpha, reg_lambda : float
        Passed to `XGBClassifier` unchanged.
    subsample, colsample_bytree : float
        Clipped into (0, 1] before use.
    x_data : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    y_data : array-like
        Binary labels aligned row-by-row with `x_data`.
    n_splits : int, default 5
        Number of stratified folds.
    output : {"score", "model"}, default "score"
        "score" returns the mean ROC-AUC over the folds, "model" returns the list of
        fitted fold models.

    Returns
    -------
    float or list
        See `output`.

    Raises
    ------
    ValueError
        If `output` is not one of the supported values.
    """
    # "socre" was the historical (misspelled) default and made the function return None;
    # keep accepting it, but treat it as "score".
    if output == "socre" :
        output = "score"
    if output not in ("score", "model") :
        raise ValueError("output must be 'score' or 'model', got {!r}".format(output))

    # Fold indices are positions, not labels: index a plain array so the lookup is correct
    # even when `y_data` is a Series whose index is not 0..n-1.
    y_array = np.asarray(y_data)

    # The sampling fractions must be strictly positive, and the optimiser may probe the
    # lower bound of its search interval.
    min_fraction = 1e-3

    kf = StratifiedKFold(n_splits = n_splits)
    models = []
    fold_scores = []
    
    for train_index, valid_index in kf.split(x_data, y_array) : 
        x_train, y_train = x_data.iloc[train_index], y_array[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_array[valid_index]
        
        # `learning_rate` is the scikit-learn wrapper's own name for eta; using it avoids
        # relying on the alias being forwarded through **kwargs.
        # NOTE(review): XGBoost documents `max_leaves` as unused by the exact tree method -
        # confirm the tree method / growth policy in use actually honours it.
        model = XGBClassifier(max_leaves = int(max_leaves),
                              learning_rate = eta,
                              n_estimators = int(n_estimators),
                              subsample = np.clip(subsample, min_fraction, 1),
                              colsample_bytree = np.clip(colsample_bytree, min_fraction, 1),
                              reg_alpha = reg_alpha,
                              reg_lambda = reg_lambda, )
        
        model.fit(x_train, y_train)
        models.append(model)
    
        # Probability of the positive class on the held-out fold.
        pred = model.predict_proba(x_valid)[:, 1]    
        fold_scores.append(roc_auc_score(y_valid, pred))
        
    if output == "score" :
        return float(np.mean(fold_scores))
    return models

In [None]:
# Freeze the data and CV arguments so that only the seven hyper-parameters remain free.
xgb_func_fixed = partial(
    xgb_cv,
    x_data = X_features,
    y_data = y_labels,
    n_splits = 5,
    output = 'score',
)

# Each hyper-parameter is searched inside its (lower, upper) interval.
xgbBO = BayesianOptimization(
    f = xgb_func_fixed,
    pbounds = dict(
        max_leaves = (16, 1024),
        eta = (0.0001, 0.1),
        n_estimators = (16, 1024),
        subsample = (0, 1),
        colsample_bytree = (0, 1),
        reg_alpha = (0, 10),
        reg_lambda = (0, 50),
    ),
    random_state = 2109,
)

xgbBO.maximize(init_points = 5, n_iter = 30)

|   iter    |  target   | colsam... |    eta    | max_le... | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8388  [0m | [0m 0.6128  [0m | [0m 0.09316 [0m | [0m 1.001e+0[0m | [0m 436.0   [0m | [0m 3.19    [0m | [0m 22.89   [0m | [0m 0.5532  [0m |
| [0m 2       [0m | [0m 0.8298  [0m | [0m 0.3464  [0m | [0m 0.01621 [0m | [0m 20.21   [0m | [0m 71.48   [0m | [0m 1.063   [0m | [0m 37.63   [0m | [0m 0.2233  [0m |
| [0m 3       [0m | [0m 0.8267  [0m | [0m 0.09816 [0m | [0m 0.02952 [0m | [0m 492.0   [0m | [0m 128.9   [0m | [0m 8.813   [0m | [0m 7.464   [0m | [0m 0.5725  [0m |
| [0m 4       [0m | [0m 0.8222  [0m | [0m 0.02332 [0m | [0m 0.07836 [0m | [0m 498.4   [0m | [0m 833.4   [0m | [0m 0.4697  [0m | [0m 23.6    [0m | [0m 0.2591  [0m |
| [0m 5       [0m | [0m 0.8355  [0m | [0m 0.068

In [None]:
# Refit one XGBoost model per fold with the best hyper-parameters found by xgbBO.
# Dedicated names are used on purpose: `models`, `preds` and `pred` belong to the
# LightGBM ensemble and must not be reused or modified here.
xgb_params = xgbBO.max['params']
xgb_models = xgb_cv(
    xgb_params['max_leaves'], 
    xgb_params['eta'], 
    xgb_params['n_estimators'], 
    xgb_params['subsample'], 
    xgb_params['colsample_bytree'], 
    xgb_params['reg_alpha'], 
    xgb_params['reg_lambda'], 
    x_data = X_features, y_data = y_labels, n_splits = 5, output = 'model')

# One positive-class probability vector per XGBoost fold model, averaged element-wise.
xgb_fold_preds = [xgb_model.predict_proba(test_df)[:, 1] for xgb_model in xgb_models]
preds_2 = np.mean(xgb_fold_preds, axis = 0)

In [None]:
# `preds` is a Python list with one probability vector per fold model, so it cannot be
# multiplied by a float directly; reduce it to a single averaged vector first.
# `preds_2` is already an averaged vector.
lgb_pred = np.asarray(preds, dtype = float)
if lgb_pred.ndim > 1 :
    lgb_pred = lgb_pred.mean(axis = 0)

# Weighted blend: 70 % first ensemble, 30 % second ensemble.
total_preds = lgb_pred * 0.7 + preds_2 * 0.3 

In [None]:
# NumPy has no `cortest`; `np.corrcoef` returns the 2x2 Pearson correlation matrix of the
# two prediction vectors. `preds` is a list of per-fold vectors, so average it first.
lgb_mean = np.asarray(preds, dtype = float)
if lgb_mean.ndim > 1 :
    lgb_mean = lgb_mean.mean(axis = 0)

np.corrcoef(lgb_mean, preds_2)