In [1]:
# Load packages
import time
from scipy.stats import uniform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer, accuracy_score

from lightgbm import LGBMClassifier
import lightgbm as lgbm

from hyperopt import hp, fmin, tpe
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from bayes_opt import BayesianOptimization

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

In [2]:
# !conda install -c conda-forge imbalanced-learn -y

In [3]:
# Make scorer: accuracy
acc_score = make_scorer(accuracy_score)

In [4]:
# Load dataset
trainSet = pd.read_csv('train.csv')
testSet = pd.read_csv('test.csv')
submitSet = pd.read_csv('sample_submission.csv')

trainSet.head()

# Remove not used variables
train = trainSet.drop(columns=['Name', 'Ticket'])
train['Cabin_letter'] = train['Cabin'].str[0:1]
train['Cabin_no'] = train['Cabin'].str[1:]

train.head()

# Feature generation: training data
train = trainSet.drop(columns=['Name', 'Ticket', 'Cabin'])
train = train.dropna(axis=0)
train = pd.get_dummies(train)

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
2,2,0,3,0.33,1,2,71.29,0,1,0,0,1
3,3,0,3,19.0,0,0,13.04,0,1,0,0,1
4,4,1,3,25.0,0,0,7.76,0,1,0,0,1
5,5,0,2,35.0,0,0,6.71,0,1,0,0,1
6,6,0,3,7.0,0,0,9.77,0,1,0,0,1


In [5]:
train.groupby('Survived').size()

Survived
0    55093
1    41239
dtype: int64

In [6]:
# train validation split
X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=['PassengerId','Survived'], axis=0),
                                                  train['Survived'],
                                                  test_size=0.2, random_state=111,
                                                  stratify=train['Survived'])

In [7]:
sm = SMOTE(random_state = 42)
lgbm = LGBMClassifier()

In [8]:
# pipeline = Pipeline([('transformer', sm), ('estimator', lgbm)])

In [9]:
# lgbm_pip = make_pipeline(sm, lgbm)
# lgbm_pip = make_pipeline(SMOTE(random_state = 42),LGBMClassifier())
lgbm_pip = make_pipeline(SMOTE(random_state = 42), GradientBoostingClassifier())


In [10]:
# myModel = make_pipeline(SMOTE(random_state = 42),LGBMClassifier())

# skfold = StratifiedKFold (n_splits = 3)
# accs = cross_val_score (myModel, X_train, y_train, cv = skfold)

In [11]:
params_gbm = {}
learning_rate = (0.01, 1)
params_gbm['learning_rate'] = learning_rate

In [12]:
lgbm_pip

Pipeline(steps=[('smote', SMOTE(random_state=42)),
                ('gradientboostingclassifier', GradientBoostingClassifier())])

In [14]:
cross_val_score(lgbm_pip,
                             X_train, y_train, scoring='roc_auc', cv=5)

array([0.84020428, 0.84453879, 0.84311761, 0.83738358, 0.83866223])

In [26]:
# Gradient Boosting Machine
# def gbm_cl_bo(max_depth, max_features, learning_rate, n_estimators, subsample):
def gbm_cl_bo(learning_rate):
    params_gbm = {}
    
#     params_gbm['max_depth'] = round(max_depth)
#     params_gbm['max_features'] = max_features
    params_gbm['learning_rate'] = learning_rate
#     params_gbm['n_estimators'] = round(n_estimators)
#     params_gbm['subsample'] = subsample
#     lgbm_pip.fit(X_train, y_train)

    model_pip = make_pipeline(SMOTE(random_state = 42), GradientBoostingClassifier(random_state=42, **params_gbm))
    
    scores = cross_val_score(LGBMClassifier(random_state=123, **params_gbm),
                             X_train, y_train, scoring=acc_score, cv=5).mean()
    
    score = scores.mean()
    return score

In [27]:
# Run Bayesian Optimization
start = time.time()

params_gbm ={
#     'max_depth':(3, 10),
#     'max_features':(0.8, 1),
    'learning_rate':(0.01, 1)#,
#     'n_estimators':(80, 150)#,
#     'subsample': (0.8, 1)
}

gbm_bo = BayesianOptimization(gbm_cl_bo, params_gbm, random_state=111)
# gbm_bo.maximize(init_points=20, n_iter=4)

# print('It takes %s minutes' % ((time.time() - start)/60))

In [28]:
gbm_bo

<bayes_opt.bayesian_optimization.BayesianOptimization at 0x7fa86112de50>

In [29]:
gbm_bo.maximize(init_points=20, n_iter=4)

print('It takes %s minutes' % ((time.time() - start)/60))

|   iter    |  target   | learni... |
-------------------------------------
| [0m 1       [0m | [0m 0.7602  [0m | [0m 0.616   [0m |
| [95m 2       [0m | [95m 0.7704  [0m | [95m 0.1774  [0m |
| [0m 3       [0m | [0m 0.7663  [0m | [0m 0.4417  [0m |
| [0m 4       [0m | [0m 0.7571  [0m | [0m 0.7716  [0m |
| [0m 5       [0m | [0m 0.768   [0m | [0m 0.3024  [0m |
| [95m 6       [0m | [95m 0.7705  [0m | [95m 0.1577  [0m |
| [95m 7       [0m | [95m 0.7711  [0m | [95m 0.03225 [0m |
| [0m 8       [0m | [0m 0.7659  [0m | [0m 0.426   [0m |
| [0m 9       [0m | [0m 0.7706  [0m | [0m 0.2463  [0m |
| [0m 10      [0m | [0m 0.7684  [0m | [0m 0.3443  [0m |
| [0m 11      [0m | [0m 0.7511  [0m | [0m 0.9908  [0m |
| [0m 12      [0m | [0m 0.7696  [0m | [0m 0.2453  [0m |
| [95m 13      [0m | [95m 0.7726  [0m | [95m 0.09038 [0m |
| [0m 14      [0m | [0m 0.759   [0m | [0m 0.6729  [0m |
| [0m 15      [0m | [0m 0.7594  [0m | [

In [21]:
params_gbm = gbm_bo.max['params']

# params_gbm['max_depth'] = round(params_gbm['max_depth'])
# params_gbm['n_estimators'] = round(params_gbm['n_estimators'])

params_gbm

{'learning_rate': 0.08321798850177216,
 'max_depth': 9.305419254124786,
 'n_estimators': 135.577379233574}

In [40]:
# params_gbm = gbm_bo.max['params']

# # params_gbm['max_depth'] = round(params_gbm['max_depth'])
# # params_gbm['n_estimators'] = round(params_gbm['n_estimators'])

# params_gbm

In [15]:
from bayes_opt import BayesianOptimization
# Gradient Boosting Machine
def gbm_cl_bo(max_depth, max_features, learning_rate, n_estimators, subsample):
    params_gbm = {}
    params_gbm['max_depth'] = round(max_depth)
    params_gbm['max_features'] = max_features
    params_gbm['learning_rate'] = learning_rate
    params_gbm['n_estimators'] = round(n_estimators)
    params_gbm['subsample'] = subsample
    
    model_pip = make_pipeline(SMOTE(random_state = 42), GradientBoostingClassifier(random_state=42, **params_gbm))
    
    scores = cross_val_score(model_pip,
                             X_train, y_train, scoring='roc_auc', cv=5).mean()
    score = scores.mean()
    return score
# Run Bayesian Optimization
start = time.time()
params_gbm ={
    'max_depth':(3, 10),
    'max_features':(0.8, 1),
    'learning_rate':(0.01, 1),
    'n_estimators':(80, 150),
    'subsample': (0.8, 1)
}
gbm_bo = BayesianOptimization(gbm_cl_bo, params_gbm, random_state=111)
gbm_bo.maximize(init_points=20, n_iter=4)
print('It takes %s minutes' % ((time.time() - start)/60))

|   iter    |  target   | learni... | max_depth | max_fe... | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8314  [0m | [0m 0.616   [0m | [0m 4.183   [0m | [0m 0.8872  [0m | [0m 133.8   [0m | [0m 0.8591  [0m |
| [95m 2       [0m | [95m 0.8411  [0m | [95m 0.1577  [0m | [95m 3.157   [0m | [95m 0.884   [0m | [95m 96.71   [0m | [95m 0.8675  [0m |
| [0m 3       [0m | [0m 0.8095  [0m | [0m 0.9908  [0m | [0m 4.664   [0m | [0m 0.8162  [0m | [0m 126.9   [0m | [0m 0.9242  [0m |
| [0m 4       [0m | [0m 0.8372  [0m | [0m 0.2815  [0m | [0m 6.264   [0m | [0m 0.8237  [0m | [0m 85.18   [0m | [0m 0.9802  [0m |
| [0m 5       [0m | [0m 0.7634  [0m | [0m 0.796   [0m | [0m 8.884   [0m | [0m 0.963   [0m | [0m 149.4   [0m | [0m 0.9155  [0m |
| [0m 6       [0m | [0m 0.8021  [0m | [0m 0.8156  [0m | [0m 5.949   [0m | [0m 0.8055  [0m | [0m 111.

In [16]:
gbm_bo.max['params']

{'learning_rate': 0.07864837617488214,
 'max_depth': 5.5531193523155205,
 'max_features': 0.8723008386644597,
 'n_estimators': 113.04444496691903,
 'subsample': 0.8358969695415375}

In [20]:
import sklearn

In [21]:
sklearn.__version__

'0.22.2.post1'

In [42]:
from bayes_opt import BayesianOptimization
# Gradient Boosting Machine
def gbm_cl_bo(max_depth, max_features, learning_rate, n_estimators, subsample):
    params_gbm = {}
    params_gbm['max_depth'] = round(max_depth)
    params_gbm['max_features'] = max_features
    params_gbm['learning_rate'] = learning_rate
    params_gbm['n_estimators'] = round(n_estimators)
    params_gbm['subsample'] = subsample
    scores = cross_val_score(GradientBoostingClassifier(random_state=123, **params_gbm),
                             X_train, y_train, scoring=acc_score, cv=5).mean()
    score = scores.mean()
    return score
# Run Bayesian Optimization
start = time.time()
params_gbm ={
    'max_depth':(3, 10),
    'max_features':(0.8, 1),
    'learning_rate':(0.01, 1),
    'n_estimators':(80, 150),
    'subsample': (0.8, 1)
}
gbm_bo = BayesianOptimization(gbm_cl_bo, params_gbm, random_state=111)
gbm_bo.maximize(init_points=20, n_iter=4)
print('It takes %s minutes' % ((time.time() - start)/60))

|   iter    |  target   | learni... | max_depth | max_fe... | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7647  [0m | [0m 0.616   [0m | [0m 4.183   [0m | [0m 0.8872  [0m | [0m 133.8   [0m | [0m 0.8591  [0m |
| [95m 2       [0m | [95m 0.7711  [0m | [95m 0.1577  [0m | [95m 3.157   [0m | [95m 0.884   [0m | [95m 96.71   [0m | [95m 0.8675  [0m |


KeyboardInterrupt: 