In [1]:
import warnings
warnings.filterwarnings('ignore')

from evaluation_metric import lgb_amex_metric
import os
import gc
import random
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterGrid
import lightgbm as lgb

class CFG:
    """Notebook-wide settings read by the data-loading and training cells."""

    # Name of the label column.
    target = 'target'
    # Folder prefix (trailing slash included) that is prepended to input file names.
    input_dir = 'Data/'
    # Number of cross-validation splits.
    n_folds = 5
    # Random state used for the cross-validation shuffle.
    seed = 42

# Best validation score recorded so far for each fold index (0-4). Every fold
# starts at 0.78, which acts as the floor a score must exceed before the
# save_model callback writes a checkpoint.
score_dic = dict.fromkeys(range(5), 0.78)

def seed_everything(seed):
    """Seed Python's and NumPy's global random generators with ``seed``.

    Also exports ``PYTHONHASHSEED`` with the same value. Hash randomization is
    fixed when an interpreter starts, so that variable can only influence
    processes launched after this call, not the running kernel.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)

def save_model(fold, path='models_DART_slope/'):
    """Build a LightGBM callback that keeps the best checkpoint of one fold on disk.

    After every boosting iteration the callback reads the first entry of
    ``env.evaluation_result_list``. When its score beats the best value stored
    in ``score_dic[fold]``, the current booster is pickled as
    ``fold_<fold>_iter_<iteration>_score_<score>.pkl`` and older checkpoints of
    the same fold are removed.

    The new checkpoint is fully written before any older one is deleted, so an
    interrupt or a failed dump never leaves the fold without a saved model.

    Args:
        fold: Fold index; used as the key into ``score_dic`` and in file names.
        path: Directory that receives the checkpoints. Created if missing.

    Returns:
        The callback, with ``order = 0`` as LightGBM expects on callbacks.
    """
    # Create the folder once so neither the save nor the cleanup below can
    # fail on a missing directory.
    os.makedirs(path, exist_ok=True)
    prefix = "fold_{}_iter".format(fold)

    def callback(env):
        iteration = env.iteration
        score = env.evaluation_result_list[0][2]
        if iteration % 200 == 0:
            print('iteration {}, score= {:.05f}, max_score= {:.05f}'.format(iteration,score, score_dic[fold]))
        if score <= score_dic[fold]:
            return

        fname = 'fold_{}_iter_{}_score_{:.05f}.pkl'.format(fold, iteration, score)
        # Dump under a name that does not match the checkpoint prefix, then
        # rename: a partially written file is never mistaken for a checkpoint.
        tmp_path = os.path.join(path, 'tmp_fold_{}.pkl'.format(fold))
        joblib.dump(env.model, tmp_path)
        os.replace(tmp_path, os.path.join(path, fname))

        # Raise the bar only once the model is safely on disk.
        score_dic[fold] = score
        print('High Score: iteration {}, score={:.05f}'.format(iteration, score))

        # Older checkpoints of this fold are now redundant.
        for stale in os.listdir(path):
            if stale.startswith(prefix) and stale != fname:
                os.remove(os.path.join(path, stale))

    callback.order = 0
    return callback

In [2]:
# Load the training feature table and attach the label column. Both file
# locations and the label column name come from CFG so they stay in one place.
train = pd.read_parquet(CFG.input_dir + 'train_all_slopes.parquet')
# NOTE: read_pickle can execute arbitrary code; only load label files from a trusted source.
# .loc[train.index] puts the labels in the row order of `train`
# (assumes both objects share the same index keys -- TODO confirm).
labels = pd.read_pickle(CFG.input_dir + 'train_labels.pkl').loc[train.index]

train[CFG.target] = labels

# Categorical columns handed to LightGBM. Each base name gets a "_last" suffix
# (presumably the last-observation aggregate of that field -- confirm against
# the feature-engineering step).
cat_features = [
    f"{base_name}_last"
    for base_name in (
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    )
]

In [3]:
def train_and_evaluate(train, parameters):
    """Train one LightGBM model per stratified fold and export each fold's best checkpoint.

    For every fold the data is split with ``StratifiedKFold``, a booster is
    trained with the ``save_model`` callback (which pickles the best-scoring
    iteration), and the surviving ``fold_<fold>_iter...`` pickle is re-saved as
    the text model ``cp_<fold>_model.txt`` in the same directory.

    Args:
        train: DataFrame holding the feature columns plus the ``CFG.target`` column.
        parameters: LightGBM parameter dict for this run.

    Returns:
        None. Results are written to the checkpoint directory.
    """
    path = 'models_DART_slope/'
    # The checkpoint callback and the export loop below both list this folder;
    # create it up front so a missing directory cannot abort a long run.
    os.makedirs(path, exist_ok=True)

    # The feature list is identical for every fold, so build it once. The
    # label column is taken from CFG, matching the split and the y slices.
    features = [col for col in train.columns if col != CFG.target]

    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):

        print(' ')
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)
        # The Dataset objects keep their own references; drop the local ones.
        del x_train, x_val, y_train, y_val; gc.collect()

        model = lgb.train(
            params = parameters,
            train_set = lgb_train,
            num_boost_round = 15000,
            valid_sets = [lgb_valid],
            feval = lgb_amex_metric,
            callbacks=[save_model(fold)],
            # init_model= path + 'cp_{}_model.txt'.format(fold),
            )

        # Re-save the best pickled checkpoint of this fold as a LightGBM text model.
        prefix = "fold_{}_iter".format(fold)
        exported = False
        for fname in os.listdir(path):
            if fname.startswith(prefix):
                model = joblib.load(os.path.join(path, fname))
                model.save_model(os.path.join(path, 'cp_{}_model.txt'.format(fold)))
                exported = True
        if not exported:
            # No iteration beat the stored best score, so no pickle exists to export.
            print(f'No checkpoint found for fold {fold}; nothing exported.')

        # Release this fold's booster and datasets before the next fold allocates its own.
        del model, lgb_train, lgb_valid; gc.collect()

In [4]:
# LightGBM search space. Every value is a list of candidates so the dict can be
# expanded into concrete parameter sets; with one candidate per key it yields a
# single combination.
params = dict(
    objective=['binary'],
    metric=['amex_metric'],
    boosting=['dart'],
    seed=[42],
    num_leaves=[100],
    learning_rate=[0.01],
    drop_rate=[0.1],
    feature_fraction=[0.30],
    bagging_freq=[10],
    bagging_fraction=[0.25],
    n_jobs=[-1],
    lambda_l2=[2],
    min_data_in_leaf=[40],
)

In [5]:
# Expand the search space into concrete parameter sets and train each one in
# turn, printing the run index, the total number of runs and the parameters.
grid = list(ParameterGrid(params))
len_grid = len(grid)
for run in range(len_grid):
    parameters = grid[run]
    print('-' * 50)
    print(run, len_grid, parameters)
    train_and_evaluate(train, parameters)

--------------------------------------------------
0 1 {'bagging_fraction': 0.25, 'bagging_freq': 10, 'boosting': 'dart', 'drop_rate': 0.1, 'feature_fraction': 0.3, 'lambda_l2': 2, 'learning_rate': 0.01, 'metric': 'amex_metric', 'min_data_in_leaf': 40, 'n_jobs': -1, 'num_leaves': 100, 'objective': 'binary', 'seed': 42}
 
Training fold 0 with 1107 features...
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 188406
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 1099
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
iteration 0, score= 0.69550, max_score= 0.78000
iteration 200, score= 0.76654, max_score= 0.78000
iteration 400, score= 0.77170, max_score= 0.78000
iteration 600, score= 0.77503, max_score= 0.78000
High Score: iteration 764, score=0.78004
High

KeyboardInterrupt: 