In [1]:
import warnings
warnings.filterwarnings('ignore')

from evaluation_metric import lgb_amex_metric
import os
import gc
import random
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterGrid
from sklearn.calibration import CalibratedClassifierCV
import lightgbm as lgb

# Index of the outer fold this notebook works on: it selects which
# 'Output/*_fold_{main_fold}.pkl' files are loaded and is embedded in the
# file names of every model written under 'Models/'.
main_fold = 3

class CFG:
    """Static settings shared by the cells of this notebook."""

    # Name of the label column inside the validation frame.
    target = 'target'
    # Number of StratifiedKFold splits applied to the validation frame.
    n_folds = 5
    # Seed used for the CV shuffle and for the LightGBM 'seed' parameter.
    seed = 42
    # Input data directory (not referenced by the cells visible here).
    input_dir = 'Data/'

def seed_everything(seed):
    """Seed Python's `random`, NumPy's legacy global RNG and PYTHONHASHSEED.

    Args:
        seed: value passed to `random.seed` / `np.random.seed` and stored,
            as a string, in the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)

def save_model(fold):
    """Build a LightGBM callback that checkpoints the booster on each new best score.

    The callback compares the first evaluation result of every iteration with
    the module-level ``max_score``. When the score is strictly higher it
    updates ``max_score``, pickles ``env.model`` to
    ``Models/fold_{main_fold}_{fold}_iter_{iteration}_score_{score}.pkl`` and
    removes older checkpoints of the same (main_fold, fold) pair, so at most
    one checkpoint per pair is kept.

    Args:
        fold: index of the inner CV fold; used in the checkpoint file name.

    Returns:
        A callable taking the LightGBM callback environment, with
        ``order = 0`` set on it.

    Notes:
        ``max_score`` must exist as a module-level global before the first
        iteration (``train_and_evaluate`` sets it at the start of each fold).
    """
    model_dir = 'Models/'
    # The trailing '_iter_' makes the match exact: without it the prefix for
    # fold 1 would also match (and delete) checkpoints of folds 10, 11, ...
    prefix = 'fold_{}_{}_iter_'.format(main_fold, fold)
    # Fail early / create the directory now instead of crashing at the first
    # high score, which can be many iterations into training.
    os.makedirs(model_dir, exist_ok=True)

    def callback(env):
        global max_score
        iteration = env.iteration
        # Index 2 of the first evaluation entry is used as the score
        # (presumably the metric value of the first validation set/metric).
        score = env.evaluation_result_list[0][2]
        if iteration % 200 == 0:
            print('iteration {}, score= {:.05f}, max_score= {:.05f}'.format(iteration,score, max_score))
        if score > max_score:
            max_score = score
            new_name = '{}{}_score_{:.05f}.pkl'.format(prefix, iteration, score)

            print('High Score: iteration {}, score={:.05f}'.format(iteration, score))
            # Write the new checkpoint first so a failed dump never leaves the
            # fold without its previous best checkpoint.
            joblib.dump(env.model, os.path.join(model_dir, new_name))
            for fname in os.listdir(model_dir):
                if fname.startswith(prefix) and fname != new_name:
                    os.remove(os.path.join(model_dir, fname))

    callback.order = 0
    return callback

# Categorical columns handed to LightGBM: each base name with the '_last' suffix.
cat_features = [
    f"{base_name}_last"
    for base_name in (
        "B_30", "B_38",
        "D_114", "D_116", "D_117", "D_120", "D_126",
        "D_63", "D_64", "D_66", "D_68",
    )
]

In [2]:
# Load the pre-split features/labels of the selected outer fold.
# NOTE(review): read_pickle executes pickle payloads; only load trusted files.
x_train = pd.read_pickle(f'Output/x_train_fold_{main_fold}.pkl')
y_train = pd.read_pickle(f'Output/y_train_fold_{main_fold}.pkl')
x_val = pd.read_pickle(f'Output/x_val_fold_{main_fold}.pkl')
y_val = pd.read_pickle(f'Output/y_val_fold_{main_fold}.pkl')

# Display the four shapes as the cell output.
tuple(part.shape for part in (x_train, x_val, y_train, y_val))

((367131, 1103), (91782, 1103), (367131,), (91782,))

In [3]:
# Put the validation labels next to the validation features (column-wise) so
# the combined frame can be re-split by train_and_evaluate.
val = pd.concat((x_val, y_val), axis='columns')
val.shape

(91782, 1104)

In [4]:
def train_and_evaluate(x_train_org, y_train_org, val, parameters, min_score=0.785):
    """Train one LightGBM model per stratified split of ``val`` and export the best checkpoints.

    ``val`` is split into ``CFG.n_folds`` stratified folds. For every fold the
    training part of ``val`` is appended to the original training data, the
    held-out part is used as the validation set, and training runs for 15000
    boosting rounds with the ``save_model`` callback checkpointing the best
    iteration. Afterwards the checkpoint written for the fold is reloaded and
    exported as ``Models/cp_{main_fold}_{fold}_model.txt``.

    Args:
        x_train_org: feature frame of the original training data; its columns
            (minus ``CFG.target``) define the feature list.
        y_train_org: labels matching ``x_train_org``.
        val: frame holding the same feature columns plus the ``CFG.target``
            column.
        parameters: LightGBM parameter dict passed to ``lgb.train``.
        min_score: score a checkpoint must exceed before it is saved; the
            module-level ``max_score`` is reset to this value at the start of
            every fold. Defaults to the previously hard-coded 0.785.

    Returns:
        None. Results are the files written under ``Models/``.
    """
    global max_score

    model_dir = 'Models/'
    os.makedirs(model_dir, exist_ok=True)

    # Loop-invariant: every training column except the label is a feature.
    features = [col for col in x_train_org.columns if col != CFG.target]

    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(val, val[CFG.target])):
        # Reset the threshold read and updated by the save_model callback.
        max_score = min_score
        print(' ')
        print(f'Training fold {fold} with {len(features)} features...')
        x_extra, x_holdout = val[features].iloc[trn_ind], val[features].iloc[val_ind]
        y_extra, y_holdout = val[CFG.target].iloc[trn_ind], val[CFG.target].iloc[val_ind]
        # Original training data plus the non-held-out part of the validation frame.
        x_train_new = pd.concat([x_train_org, x_extra], axis=0)
        y_train_new = pd.concat([y_train_org, y_extra], axis=0)

        lgb_train = lgb.Dataset(x_train_new, y_train_new, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_holdout, y_holdout, categorical_feature = cat_features)
        del x_extra, x_holdout, y_extra, y_holdout, x_train_new, y_train_new; gc.collect()

        lgb.train(
            params = parameters,
            train_set = lgb_train,
            num_boost_round = 15000,
            valid_sets = [lgb_valid],
            feval = lgb_amex_metric,
            callbacks=[save_model(fold)],
            )

        # max_score only moves when the callback wrote a checkpoint during this
        # run. If it did not, any matching file on disk is left over from an
        # earlier run and must not be exported as this run's model.
        if max_score <= min_score:
            print('Fold {}: score never exceeded {:.05f}, no checkpoint to export'.format(fold, min_score))
            continue

        # Trailing '_iter_' keeps fold 1 from matching files of fold 10, 11, ...
        prefix = 'fold_{}_{}_iter_'.format(main_fold, fold)
        for fname in os.listdir(model_dir):
            if fname.startswith(prefix):
                best_model = joblib.load(os.path.join(model_dir, fname))
                best_model.save_model('Models/cp_{}_{}_model.txt'.format(main_fold, fold))

In [5]:
# LightGBM parameters for the training run; evaluation relies on the custom
# feval that train_and_evaluate passes to lgb.train.
params = dict(
    objective='binary',
    metric='None',
    boosting='dart',
    seed=CFG.seed,
    num_leaves=100,
    learning_rate=0.01,
    drop_rate=0.1,
    feature_fraction=0.20,
    bagging_freq=10,
    bagging_fraction=0.50,
    n_jobs=-1,
    lambda_l2=2,
    min_data_in_leaf=40,
)

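# Grid-search variant of `params`: every value is a list of candidates, for use
# with the commented-out ParameterGrid sweep in the last cell.
# params = {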
#     'objective': ['binary'],
#     'metric': ['amex_metric'],
#     'boosting': ['dart'],
#     'seed': [42],
#     'num_leaves': [25, 50, 100, 250, 500, 1000],
#     'learning_rate': [0.01, 0.005],
#     'drop_rate': [0.01, 0.025, 0.05, 0.1, 0.2],
#     'feature_fraction': [0.20,0.30],
#     'bagging_freq': [10],
#     'bagging_fraction': [0.50,0.60],
#     'n_jobs': [-1],
#     'lambda_l2': [2],
#     'min_data_in_leaf': [40]
#     }

In [6]:
# Run the full training over all inner folds with the parameters defined above.
train_and_evaluate(
    x_train_org=x_train,
    y_train_org=y_train,
    val=val,
    parameters=params,
)

 
Training fold 0 with 1103 features...
[LightGBM] [Info] Number of positive: 114075, number of negative: 326481
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 187413
[LightGBM] [Info] Number of data points in the train set: 440556, number of used features: 1095
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258934 -> initscore=-1.051516
[LightGBM] [Info] Start training from score -1.051516
iteration 0, score= 0.70588, max_score= 0.78500
iteration 200, score= 0.76140, max_score= 0.78500
iteration 400, score= 0.76451, max_score= 0.78500
iteration 600, score= 0.76905, max_score= 0.78500
iteration 800, score= 0.77080, max_score= 0.78500
iteration 1000, score= 0.77459, max_score= 0.78500
iteration 1200, score= 0.77561, max_score= 0.78500
iteration 1400, score= 0.77930, max_score= 0.78500
iteration 1600, score= 0.78238, max_score= 0.78500
iteration 1800, score= 0.78300, max_score= 0.78500
High Score: iteration 1998, score=0.78522
High Score: iterat

In [None]:
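# Optional hyper-parameter sweep. It needs the grid-style `params` (every value
# a list of candidates), not the scalar dict used for the single run.
# grid  = list(ParameterGrid(params))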
# len_grid = len(grid)
# for run, parameters in enumerate(grid):
#     print('-' * 50)
#     print(run, len_grid, parameters)
#     train_and_evaluate(x_train,y_train, val, parameters)