In [None]:
import warnings
warnings.filterwarnings('ignore')

from evaluation_metric import lgb_amex_metric
import os
import gc
import random
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterGrid
import lightgbm as lgb


class CFG:
    """Notebook-wide settings read by the training cells."""

    # Directory prefix for input data (not referenced by the cells visible here).
    input_dir = "Data/"
    # Seed handed to StratifiedKFold so the fold assignment is repeatable.
    seed = 42
    # Number of stratified splits made over the validation frame.
    n_folds = 5
    # Name of the label column.
    target = "target"

def seed_everything(seed):
    """Seed the stdlib and NumPy global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to `random`, `numpy.random` and written
            (as a string) to the `PYTHONHASHSEED` environment variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this value
    # is only picked up by child processes, not by the running kernel.
    os.environ['PYTHONHASHSEED'] = f'{seed}'

def save_model(fold):
    """Build a LightGBM callback that pickles the booster on every new best score.

    The returned callback relies on two notebook-level globals:
    `max_score` (the score to beat, updated in place when beaten) and
    `main_fold` (used in the file name). Both must be set before training.

    Files are written as
    `Models/fold_<main_fold>_<fold>_iter_<iteration>_score_<score>.pkl`;
    older pickles for the same (main_fold, fold) pair are removed so only the
    best one is kept.

    Args:
        fold: Index of the fold being trained; becomes part of the file name.

    Returns:
        A callable taking the LightGBM callback environment, with `order = 0`.
    """
    model_dir = 'Models/'

    def callback(env):
        global max_score
        iteration = env.iteration
        # Value of the first evaluation entry -- presumably the custom feval
        # result on the single validation set; verify if more metrics are added.
        score = env.evaluation_result_list[0][2]
        if iteration % 200 == 0:
            print('iteration {}, score= {:.05f}, max_score= {:.05f}'.format(iteration, score, max_score))
        if score > max_score:
            max_score = score
            # The trailing underscore keeps fold 1 from also matching fold 10, 11, ...
            prefix = 'fold_{}_{}_'.format(main_fold, fold)
            best_name = '{}iter_{}_score_{:.05f}.pkl'.format(prefix, iteration, score)

            print('High Score: iteration {}, score={:.05f}'.format(iteration, score))
            # Write the new best model first so that a failed dump never leaves
            # the directory without any saved model for this fold.
            joblib.dump(env.model, os.path.join(model_dir, best_name))

            for fname in os.listdir(model_dir):
                if fname.startswith(prefix) and fname != best_name:
                    os.remove(os.path.join(model_dir, fname))

    # LightGBM sorts callbacks by their `order` attribute.
    callback.order = 0
    return callback

# Categorical columns handed to lgb.Dataset via `categorical_feature`.
# Each base name is suffixed with "_last" to form the actual column name.
cat_features = [
    f"{base}_last"
    for base in (
        "B_30", "B_38",
        "D_114", "D_116", "D_117", "D_120", "D_126",
        "D_63", "D_64", "D_66", "D_68",
    )
]

In [None]:
def train_and_evaluate(x_train_org, y_train_org, val, parameters, main_fold, weak_fold, num_boost_round=1200):
    """Continue training one weak fold's checkpoint with a given parameter set.

    `val` is split into `CFG.n_folds` stratified folds. Only the split whose
    index equals `weak_fold` is used: its validation part is the evaluation
    set, and its training part is appended to the original training data.
    Training resumes from `Models/cp_<main_fold>_<weak_fold>_model.txt`, and
    the `save_model` callback pickles the booster whenever the score improves.
    Afterwards the pickled best model for this fold is exported back to that
    same checkpoint path, so the next call starts from it.

    Args:
        x_train_org: Feature DataFrame of the original training data.
        y_train_org: Labels aligned with `x_train_org`.
        val: DataFrame holding the feature columns plus the `CFG.target` column.
        parameters: LightGBM parameter dict for this run.
        main_fold: Outer fold id, used in checkpoint and pickle file names.
            NOTE: the `save_model` callback reads the *global* `main_fold`
            rather than this argument; the two must agree.
        weak_fold: Index of the stratified split of `val` to train on.
        num_boost_round: Boosting rounds passed to `lgb.train` (default 1200).

    Returns:
        None. Results are written to the `Models/` directory.
    """
    model_dir = 'Models/'
    # Inside the loop body `fold == weak_fold`, so the checkpoint that is
    # loaded as `init_model` and the one rewritten at the end are the same file.
    checkpoint_path = 'Models/cp_{}_{}_model.txt'.format(main_fold, weak_fold)
    features = [col for col in x_train_org.columns if col != CFG.target]

    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(val, val[CFG.target])):
        if fold != weak_fold:
            continue

        print(' ')
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = val[features].iloc[trn_ind], val[features].iloc[val_ind]
        y_train, y_val = val[CFG.target].iloc[trn_ind], val[CFG.target].iloc[val_ind]
        x_train_new = pd.concat([x_train_org, x_train], axis=0)
        y_train_new = pd.concat([y_train_org, y_train], axis=0)

        lgb_train = lgb.Dataset(x_train_new, y_train_new, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)
        del x_train, x_val, y_train, y_val, x_train_new, y_train_new; gc.collect()

        model = lgb.train(
            params = parameters,
            train_set = lgb_train,
            num_boost_round = num_boost_round,
            valid_sets = [lgb_valid],
            feval = lgb_amex_metric,
            callbacks=[save_model(fold)],
            init_model=checkpoint_path,
            )

        # Export the best pickled booster for this fold as the new checkpoint.
        # The trailing underscore keeps fold 1 from also matching fold 10, 11, ...
        prefix = 'fold_{}_{}_'.format(main_fold, fold)
        for fname in os.listdir(model_dir):
            if fname.startswith(prefix):
                model = joblib.load(os.path.join(model_dir, fname))
                model.save_model(checkpoint_path)

        # Each fold index occurs exactly once, so nothing is left to do.
        break

# path = 'Models/'
# for fname in os.listdir(path):
#     if fname.startswith("fold_3_4"):
#         print(fname)
#         print('{:.02f} MB'.format(os.path.getsize('Models/' + fname)/1000000))
#         model = joblib.load('Models/' + fname)
        
#         model.save_model('Models/cp_3_4_model.txt')

In [None]:
# Search space expanded by ParameterGrid: every value is a list of candidates.
# Single-element lists are fixed settings; the multi-element ones
# (num_leaves, learning_rate, drop_rate, bagging_fraction) give 2*2*3*2 = 24 runs.
params = {
    "objective": ["binary"],
    "metric": ["amex_metric"],
    "boosting": ["dart"],
    "seed": [42],
    "num_leaves": [100, 110],
    "learning_rate": [0.01, 0.005],
    "drop_rate": [0.01, 0.025, 0.50],
    "feature_fraction": [0.30],
    "bagging_freq": [10],
    "bagging_fraction": [0.25, 0.50],
    "n_jobs": [-1],
    "lambda_l2": [2],
    "min_data_in_leaf": [40],
}

In [None]:
# Score to beat for each '<main_fold>_<weak_fold>' pair; the driver loop
# copies the matching entry into `max_score` before running the grid.
scores = {
    '1_1': 0.79249,
    '1_2': 0.79170,
    '3_2': 0.79015,
    '3_4': 0.79371,
}

In [None]:
# Expand the search space into a concrete list of parameter dicts.
grid  = list(ParameterGrid(params))
len_grid = len(grid)
main_fold_list = [1,3]
# Weak folds to re-train for each main fold. An explicit mapping makes an
# unmapped main fold fail with KeyError instead of silently reusing the list
# left over from the previous iteration.
weak_folds_by_main_fold = {1: [1, 2], 3: [2, 4]}

for main_fold in main_fold_list:
    # Looked up before the pickles are read so a bad fold id fails fast.
    weak_fold_list = weak_folds_by_main_fold[main_fold]

    x_train_org = pd.read_pickle('Output/x_train_fold_{}.pkl'.format(main_fold))
    x_val = pd.read_pickle('Output/x_val_fold_{}.pkl'.format(main_fold))
    y_train_org = pd.read_pickle('Output/y_train_fold_{}.pkl'.format(main_fold))
    y_val = pd.read_pickle('Output/y_val_fold_{}.pkl'.format(main_fold))
    # train_and_evaluate expects features and the target column in one frame.
    val = pd.concat([x_val, y_val], axis=1)

    for weak_fold in weak_fold_list:
        print(main_fold, weak_fold)

        # `max_score` (like `main_fold`) is a module-level name that the
        # save_model callback reads and updates. It is set once per
        # (main_fold, weak_fold) pair, so later grid runs must beat the best
        # score reached by earlier ones.
        max_score = scores['{}_{}'.format(main_fold, weak_fold)]
        print(max_score)
        for run, parameters in enumerate(grid):
            print('-' * 50)
            print(run, len_grid, parameters)
            train_and_evaluate(x_train_org,y_train_org, val, parameters, main_fold, weak_fold)