In [1]:
import warnings
warnings.filterwarnings('ignore')

from evaluation_metric import lgb_amex_metric
import os
import gc
import random
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterGrid
from sklearn.calibration import CalibratedClassifierCV
import lightgbm as lgb

class CFG:
    """Notebook-wide settings read by the data-loading and training cells."""
    # Name of the label column in the training frame.
    target = 'target'
    # Folder holding the parquet inputs; file names are appended by plain string
    # concatenation, so the trailing slash is required.
    input_dir = 'Data/'
    # Number of splits and shuffle seed handed to StratifiedKFold.
    n_folds = 5
    seed = 42

def seed_everything(seed):
    """Seed the global random number generators with one value.

    Args:
        seed: Integer seed applied to Python's ``random`` module and NumPy's
            global generator, and written (as a string) to the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

def save_model(fold):
    """Build a LightGBM callback that keeps the best-scoring snapshot of a fold.

    The callback compares the score of the first evaluation entry with the
    module-level ``max_score``. Whenever the score is higher, the current model
    is pickled to ``Models/fold_{fold}_iter_{iteration}_score_{score}.pkl`` and
    earlier snapshots of the same fold are removed.

    Args:
        fold: Fold number embedded in the snapshot file name.

    Returns:
        A callable taking the LightGBM callback environment, with ``order = 0``.

    Notes:
        ``max_score`` must be defined at module level before training starts;
        the callback both reads and raises it.
    """
    model_dir = 'Models/'
    # The trailing underscore keeps fold 1 from matching files of fold 10, 11, ...
    prefix = 'fold_{}_'.format(fold)

    def callback(env):
        global max_score
        iteration = env.iteration
        # Third field of the first evaluation entry is the metric value.
        score = env.evaluation_result_list[0][2]
        if iteration % 200 == 0:
            print('iteration {}, score= {:.05f}'.format(iteration,score))
        if score > max_score:
            os.makedirs(model_dir, exist_ok=True)
            new_name = 'fold_{}_iter_{}_score_{:.05f}.pkl'.format(fold, iteration, score)

            print('High Score: iteration {}, score={:.05f}'.format(iteration, score))
            # Write the new snapshot first so a failed dump never leaves the
            # fold without its previous best model on disk.
            joblib.dump(env.model, os.path.join(model_dir, new_name))
            max_score = score

            for fname in os.listdir(model_dir):
                if fname.startswith(prefix) and fname != new_name:
                    os.remove(os.path.join(model_dir, fname))

    # NOTE(review): presumably LightGBM uses `order` to sort callbacks - confirm
    # against the LightGBM callback documentation.
    callback.order = 0
    return callback

In [2]:
# Read the train and test tables from parquet (train first, then test).
train, test = (
    pd.read_parquet(CFG.input_dir + file_name)
    for file_name in ('train_all.parquet', 'test_all.parquet')
)

# Categorical columns are the "_last" variants of these base names.
cat_features = [
    f"{cf}_last"
    for cf in (
        "B_30", "B_38", "D_114", "D_116", "D_117", "D_120",
        "D_126", "D_63", "D_64", "D_66", "D_68",
    )
]

# Integer-encode each categorical column; the encoder is fitted on train only,
# so a test value never seen in train makes `transform` raise.
for cat_col in cat_features:
    encoder = LabelEncoder()
    train[cat_col] = encoder.fit_transform(train[cat_col])
    test[cat_col] = encoder.transform(test[cat_col])

# float32/float64 columns whose name contains 'last' get a 2-decimal rounded copy.
num_cols = [
    col
    for col, dtype in train.dtypes.items()
    if (dtype == 'float32' or dtype == 'float64') and 'last' in col
]
for col in num_cols:
    train[f'{col}_round2'] = train[col].round(2)
    test[f'{col}_round2'] = test[col].round(2)

# Every column except the label is a model input.
features = [name for name in train.columns if name != 'target']

train.shape, test.shape

((458913, 1104), (924621, 1103))

In [3]:
def train_and_evaluate(train, parameters, folds=(1, 3), baseline_score=0.785):
    """Continue training the per-fold LightGBM checkpoints with one parameter set.

    For every selected fold of a stratified K-fold split, a booster is trained
    for 1500 rounds starting from ``Models/cp_{fold}_model.txt`` (when that file
    exists). The ``save_model`` callback pickles the best-scoring iteration as
    ``Models/fold_{fold}_*.pkl``; afterwards that snapshot is reloaded and
    written back as the fold's text checkpoint.

    Args:
        train: DataFrame containing the feature columns and the ``CFG.target``
            column.
        parameters: LightGBM parameter dict passed to ``lgb.train``.
        folds: Fold indices to train; other folds are skipped. Defaults to the
            previously hard-coded folds 1 and 3.
        baseline_score: Score an iteration has to beat before a snapshot is
            written. Defaults to the previously hard-coded 0.785.

    Returns:
        None. Results are the files written under ``Models/``.

    Notes:
        Sets the module-level ``max_score`` that the ``save_model`` callback
        reads and updates.
    """
    global max_score
    model_dir = 'Models/'
    os.makedirs(model_dir, exist_ok=True)

    # The feature list does not depend on the fold, so build it once.
    features = [col for col in train.columns if col != CFG.target]

    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        if fold not in folds:
            continue

        # Reset the threshold for every fold: scores of different validation
        # folds are not comparable, and a threshold carried over from an earlier
        # fold would suppress snapshots for the current one.
        max_score = baseline_score

        print(' ')
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)
        del x_train, x_val, y_train, y_val; gc.collect()

        checkpoint = os.path.join(model_dir, 'cp_{}_model.txt'.format(fold))
        lgb.train(
            params = parameters,
            train_set = lgb_train,
            num_boost_round = 1500,
            valid_sets = [lgb_valid],
            feval = lgb_amex_metric,
            callbacks=[save_model(fold)],
            # Start from scratch when no checkpoint has been written yet.
            init_model=checkpoint if os.path.exists(checkpoint) else None,
            )

        # Promote the best snapshot of this fold to its text checkpoint. The
        # trailing underscore keeps fold 1 from picking up files of fold 10, 11, ...
        prefix = 'fold_{}_'.format(fold)
        for fname in os.listdir(model_dir):
            if fname.startswith(prefix):
                # NOTE: joblib.load unpickles the file; only load snapshots
                # written by this notebook.
                best_model = joblib.load(os.path.join(model_dir, fname))
                best_model.save_model(checkpoint)

In [4]:
# Search space for ParameterGrid: every value is a list of candidates.
# Only `seed` (2), `num_leaves` (4) and `drop_rate` (4) vary, giving 32 combinations.
params = dict(
    objective=['binary'],
    metric=['amex_metric'],
    boosting=['dart'],
    seed=[42, 43],
    num_leaves=[100, 250, 500, 1000],
    learning_rate=[0.01],
    drop_rate=[0.01, 0.025, 0.1, 0.2],
    feature_fraction=[0.20],
    bagging_freq=[10],
    bagging_fraction=[0.50],
    n_jobs=[-1],
    lambda_l2=[2],
    min_data_in_leaf=[40],
)

In [5]:
# Expand the search space into concrete parameter dicts and train with each in turn.
grid = list(ParameterGrid(params))
len_grid = len(grid)
for run in range(len_grid):
    parameters = grid[run]
    print('-' * 50)
    print(run, len_grid, parameters)
    train_and_evaluate(train, parameters)

--------------------------------------------------
0 32 {'bagging_fraction': 0.5, 'bagging_freq': 10, 'boosting': 'dart', 'drop_rate': 0.01, 'feature_fraction': 0.2, 'lambda_l2': 2, 'learning_rate': 0.01, 'metric': 'amex_metric', 'min_data_in_leaf': 40, 'n_jobs': -1, 'num_leaves': 100, 'objective': 'binary', 'seed': 42}
 
Training fold 1 with 1103 features...
