In [26]:
import numpy as np
import pandas as pd
from datetime import datetime
import gc
import warnings
from bayes_opt import BayesianOptimization

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
import warnings
import time
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, roc_auc_score
from datetime import datetime
warnings.simplefilter(action='ignore', category=FutureWarning)

In [27]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod(
            (datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [28]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [29]:
print('Load Train Data.')
train = pd.read_csv('./data/train.csv')
train = reduce_mem_usage(train)
print('\nShape of Train Data: {}'.format(train.shape))

target = train['target']                      
train_ids = np.array(train.index)                     
train.drop(['ID_code', 'target'], axis=1, inplace=True)

Load Train Data.
Mem. usage decreased to 78.01 Mb (74.7% reduction)

Shape of Train Data: (200000, 202)


In [30]:
features = [c for c in train.columns]

In [55]:
def LGB_CV(
          max_depth,
          num_leaves,
          min_data_in_leaf,
          feature_fraction,
          bagging_fraction,
          lambda_l1
         ):
    
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(train.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                               label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train.iloc[val_idx][features],
                               label=target.iloc[val_idx])
    
        param = {
            'num_leaves': int(num_leaves),
            'min_data_in_leaf': int(min_data_in_leaf), 
            'objective':'regression',
            'max_depth': int(max_depth),
            'learning_rate': 0.01,
            "boosting": "gbdt",
            "feature_fraction": feature_fraction,
            "bagging_freq": 1,
            "bagging_fraction": bagging_fraction ,
            "bagging_seed": 11,
            "metric": 'auc',
            "lambda_l1": lambda_l1,
            "verbosity": -1
        }
        
        
    
        clf = lgb.train(param,
                        trn_data,
                        15000,
                        valid_sets = [trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds = 200)
        
        oof[val_idx] = clf.predict(train.iloc[val_idx][features],
                                   num_iteration=clf.best_iteration)
        
        
        
        del clf, trn_idx, val_idx
        gc.collect()
        
    return -roc_auc_score(target, oof, sample_weight=None)

In [56]:
LGB_BO = BayesianOptimization(LGB_CV, {
    'max_depth': (4, 10),
    'num_leaves': (5, 130),
    'min_data_in_leaf': (10, 150),
    'feature_fraction': (0.7, 1.0),
    'bagging_fraction': (0.7, 1.0),
    'lambda_l1': (0, 6)
    })

In [57]:
print('-'*126)

start_time = timer(None)
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=2, n_iter=20, acq='ei', xi=0.0)
timer(start_time)

------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | featur... | lambda_l1 | max_depth | min_da... | num_le... |
-------------------------------------------------------------------------------------------------
fold n°0
Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.90027	valid_1's auc: 0.842262
[1000]	training's auc: 0.938136	valid_1's auc: 0.866655
[1500]	training's auc: 0.95699	valid_1's auc: 0.878068
[2000]	training's auc: 0.968541	valid_1's auc: 0.884383
[2500]	training's auc: 0.976174	valid_1's auc: 0.888368
[3000]	training's auc: 0.981612	valid_1's auc: 0.890889
[3500]	training's auc: 0.985529	valid_1's auc: 0.892551
[4000]	training's auc: 0.988511	valid_1's auc: 0.893378
[4500]	training's auc: 0.990791	valid_1's auc: 0.893952
[5000]	training's auc: 0.992576	valid_1's auc: 0.89424
Early stopping, best iteration is:
[5254]	trainin

In [66]:
print(LGB_BO.max)

{'target': -0.8888258488813868, 'params': {'bagging_fraction': 0.951163427814695, 'feature_fraction': 0.9450348885475112, 'lambda_l1': 0.24642982155987636, 'max_depth': 9.865442205008463, 'min_data_in_leaf': 46.341585013143494, 'num_leaves': 127.87400737010466}}
