In [17]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from rdkit import Chem
%matplotlib inline

import lightgbm as lgbm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, f1_score, accuracy_score

from bayes_opt import BayesianOptimization

import warnings
warnings.simplefilter('ignore')

### load data

In [6]:
CURRENT_PATH = '/Users/skcc10170/Desktop'

df_train = pd.read_csv(CURRENT_PATH + '/data/org/train_.csv')
df_valid = pd.read_csv(CURRENT_PATH + '/data/org/valid_.csv')
df_test = pd.read_csv(CURRENT_PATH + '/data/org/predict_input.csv')

df_tot = pd.concat([df_train, df_valid, df_test], sort=True).reset_index(drop=True)

### data feature

In [7]:
# find all columns
cols = df_train.columns

# smiles code
cols_smiles = 'SMILES'

# node-edge level (3 footprints)
cols_ecfp = list(cols[cols.str.contains('ecfp_')]) # ecfp 1024개
cols_fcfp = list(cols[cols.str.contains('fcfp_')]) # fcfp 1024개
cols_ptfp = list(cols[cols.str.contains('ptfp_')]) # ptfp 1024개

# graph level
cols_mol = ['MolWt', 'clogp', 'sa_score', 'qed']

# input cols
cols_input1 = cols_ecfp + cols_fcfp + cols_ptfp
cols_input2 = cols_mol
cols_input  = cols_input1 + cols_input2

# label
cols_label = 'label'

### CV start!

In [8]:
n_splits     = 5
random_state = 2020
num_test     = len(df_test) # 927

kfold = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)

In [11]:
x_train, y_train = df_train[cols_input].values, df_train[cols_label].values
x_valid, y_valid = df_valid[cols_input].values, df_valid[cols_label].values
x_test = df_test[cols_input].values

trainset_x = np.vstack([x_train, x_valid])
trainset_y = np.hstack([y_train, y_valid])

d_train = lgbm.Dataset(trainset_x, trainset_y)

In [None]:
def learning_rate_005_decay_power_099(current_iter):
    base_learning_rate = 0.05
    lr = base_learning_rate  * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3

def lgb_f1_score(y_hat, data):
    y_true = data.get_label().astype(int)
    y_hat = np.round(y_hat).astype(int) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True


n_folds = 5
random_seed=2020

def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    params = {'application':'binary',
              'num_iterations': 30000 , 
              'learning_rate':0.02, 
              'early_stopping_round':100,
              "objective" : "binary",
              "num_threads" : 20 ,
             }
    params["num_leaves"] = int(round(num_leaves))
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['max_depth'] = int(round(max_depth))
    params['lambda_l1'] = max(lambda_l1, 0)
    params['lambda_l2'] = max(lambda_l2, 0)
    params['min_split_gain'] = min_split_gain
    params['min_child_weight'] = min_child_weight
    cv_result = lgbm.cv(params, d_train,
                       nfold=n_folds, seed=random_seed, 
                       stratified=True, verbose_eval =200, 
                       metrics=["None"], 
                       feval=lgb_f1_score
                      )
    return max(cv_result['f1-mean'])

lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
                                        'feature_fraction': (0.1, 0.9),
                                        'bagging_fraction': (0.8, 1),
                                        'max_depth': (5, 8.99),
                                        'lambda_l1': (0, 5),
                                        'lambda_l2': (0, 3),
                                        'min_split_gain': (0.001, 0.1),
                                        'min_child_weight': (5, 50)}, random_state=0)
init_round=5
opt_round = 10
lgbBO.maximize(init_points=init_round, n_iter=opt_round)
# lgbBO.points_to_csv("lgb_bayes_opt_result.csv")
params = lgbBO.res['max']['max_params']
lgb2 = lgbm.train(params, d_train , 100)
lgb_prob = lgb2.predict( lgb_test.values )

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
[200]	cv_agg's f1: 0.778245 + 0.0101776
[400]	cv_agg's f1: 0.78905 + 0.00988243
[600]	cv_agg's f1: 0.792864 + 0.0100368
[800]	cv_agg's f1: 0.797835 + 0.00826073
[1000]	cv_agg's f1: 0.801347 + 0.00815231
[1200]	cv_agg's f1: 0.805922 + 0.00773312
[1400]	cv_agg's f1: 0.807214 + 0.00710659
[1600]	cv_agg's f1: 0.80855 + 0.00603789
[1800]	cv_agg's f1: 0.809988 + 0.00573007
| [0m 1       [0m | [0m 0.8103  [0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 3.014   [0m | [0m 1.635   [0m | [0m 6.69    [0m | [0m 34.07   [0m | [0m 0.04432 [0m | [0m 42.73   [0m |
[200]	cv_agg's f1: 0.771531 + 0.0113234
[400]	cv_agg's f1: 0.780214 + 0.00971325
[600]	cv_agg's f1: 0.78762 + 0.00962899
[800]	cv_agg's f1: 0.790942 + 0.00867544
[1000]	cv_agg's f1: 0.