In [1]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn import ensemble, model_selection
import lightgbm

In [2]:
# Load the training sample into memory; the relative path is resolved against
# the directory the notebook kernel is running in.
df = pd.read_csv(filepath_or_buffer='../data/train_sample.csv')

In [3]:
def append_time_feature(df):
    """Add an integer ``hour`` column to ``df``, modifying it in place.

    The hour is read from characters 11-12 of each ``click_time`` value,
    which is where the ``HH`` field sits in a ``YYYY-MM-DD HH:MM:SS``
    timestamp string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``click_time`` column. Values are converted to ``str``
        before slicing, so a column that has already been parsed to
        datetimes is handled as well.

    Returns
    -------
    None
        The new column is written to ``df`` itself.

    Raises
    ------
    ValueError
        If any value does not carry two digits at positions 11-12
        (for example a missing timestamp).
    """
    # Vectorised string slicing instead of a Python-level lambda per row.
    hour_text = df['click_time'].astype(str).str.slice(11, 13)
    # Fixed-width integer dtype so the column is the same on every platform.
    df['hour'] = hour_text.astype('int64')
# Add the 'hour' column to df in place; the function itself returns nothing.
append_time_feature(df)

In [4]:
# Columns fed to the model.
xnames = [
    'ip',
    'app',
    'device',
    'os',
    'channel',
    'hour',
]

# Binary target column.
yname = 'is_attributed'

# Every model input is declared categorical. This is an independent copy, so
# the two lists can be edited separately.
categorical_names = list(xnames)

In [33]:
# Keep 1% of the rows for training/cross-validation and hold out the rest.
# NOTE(review): test_size=0.99 looks like a speed trade-off for tuning --
# confirm the ratio is intended before using the result for a final model.
#
# random_state makes the split repeatable across kernel restarts (same value
# as the seed passed to lightgbm.cv). stratify keeps the positive rate of the
# target identical in both parts, which matters because the target is
# extremely unbalanced and the training part is small.
df_train, df_test = model_selection.train_test_split(
    df,
    test_size=0.99,
    random_state=2333,
    stratify=df[yname],
)

data_train = lightgbm.Dataset(df_train[xnames], df_train[yname])
# The held-out Dataset references the training Dataset, as LightGBM asks for
# validation data.
data_test = lightgbm.Dataset(
    df_test[xnames],
    df_test[yname],
    reference=data_train,
)

In [36]:
# Echo the training Dataset object; only its default repr is shown.
data_train

<lightgbm.basic.Dataset at 0x7f3aaa2c8940>

In [40]:
from bayes_opt import BayesianOptimization

def lgb_evaluate(learning_rate):
    """Score a single learning rate with 5-fold cross-validated AUC.

    This is the objective handed to the Bayesian optimiser: every LightGBM
    setting except ``learning_rate`` is fixed. The function reads the
    module-level ``data_train`` (training Dataset) and ``categorical_names``.

    Parameters
    ----------
    learning_rate : float
        Boosting learning rate to evaluate.

    Returns
    -------
    float
        Last entry of the ``'auc-mean'`` series returned by ``lightgbm.cv``.
    """
    # Task definition and the one tuned knob.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': learning_rate,
    }

    # Class imbalance: positives are up-weighted explicitly; this is used in
    # place of LightGBM's 'is_unbalance' switch.
    params.update({
        'scale_pos_weight': 99,
    })

    # Tree shape and histogram resolution.
    params.update({
        'num_leaves': 31,         # keep below 2^(max_depth) when a depth limit is set
        'max_depth': -1,          # -1: no depth limit
        'min_child_samples': 20,  # minimum rows in a leaf (min_data_in_leaf)
        'max_bin': 255,           # number of histogram bins per feature
    })

    # Row / column sampling.
    # NOTE(review): with subsample_freq at 0 row subsampling is not enabled,
    # so the 0.6 ratio below is presumably not applied -- confirm intent.
    params.update({
        'subsample': 0.6,
        'subsample_freq': 0,
        'colsample_bytree': 0.3,  # fraction of columns sampled per tree
    })

    # Leaf constraints and binning sample size.
    params.update({
        'min_child_weight': 5,        # minimum hessian sum in a leaf
        'subsample_for_bin': 200000,  # rows sampled to build the bins
    })

    # Regularisation is switched off entirely.
    params.update({
        'min_split_gain': 0,
        'reg_alpha': 0,   # L1
        'reg_lambda': 0,  # L2
    })

    # Runtime settings.
    params.update({
        'nthread': 8,
        'verbose': 0,
    })

    history = lightgbm.cv(
        params,
        data_train,
        num_boost_round=300,
        early_stopping_rounds=50,
        nfold=5,
        categorical_feature=categorical_names,
        seed=2333,
    )

    mean_auc_per_round = history['auc-mean']
    return mean_auc_per_round[-1]


# Search learning_rate over [0.001, 1], maximising the cross-validated AUC
# returned by lgb_evaluate. random_state seeds the optimiser so the sequence
# of probed learning rates is repeatable from a fresh kernel (same value as
# the seed passed to lightgbm.cv).
# The name says "xgb" although the model is LightGBM; it is left unchanged so
# that any other cell referring to xgbBO keeps working.
xgbBO = BayesianOptimization(
    lgb_evaluate,
    {'learning_rate': (0.001, 1)},
    random_state=2333,
)

# 5 initial probes followed by 25 optimisation steps.
xgbBO.maximize(n_iter=25, init_points=5)

[31mInitialization[0m
[94m-----------------------------------------------[0m
 Step |   Time |      Value |   learning_rate | 
    1 | 00m00s | [35m   0.92209[0m | [32m         0.5476[0m | 
    2 | 00m00s |    0.92181 |          0.5525 | 
    3 | 00m00s | [35m   0.92409[0m | [32m         0.4881[0m | 
    4 | 00m00s |    0.92199 |          0.5031 | 
    5 | 00m00s |    0.92157 |          0.8070 | 




[31mBayesian Optimization[0m
[94m-----------------------------------------------[0m
 Step |   Time |      Value |   learning_rate | 
    6 | 00m00s |    0.91957 |          0.0010 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


    7 | 00m03s |    0.91667 |          1.0000 | 
    8 | 00m09s |    0.92381 |          0.1926 | 
    9 | 00m09s |    0.92352 |          0.3226 | 


  " state: %s" % convergence_dict)


   10 | 00m09s | [35m   0.92708[0m | [32m         0.9120[0m | 


  " state: %s" % convergence_dict)


   11 | 00m10s |    0.92342 |          0.0846 | 
   12 | 00m11s |    0.92629 |          0.4001 | 


  " state: %s" % convergence_dict)


KeyboardInterrupt: 