In [None]:
import warnings
warnings.filterwarnings("ignore")

from evaluation_metric import lgb_amex_metric

import os
import gc
import random
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterGrid
import lightgbm as lgb
import pickle

class CFG:
    """Notebook-wide configuration constants (read as class attributes)."""

    # Reproducibility and cross-validation layout.
    seed = 42
    n_folds = 5

    # Name of the label column in the training frame.
    target = "target"

    # Directory the input data is read from, and directory that receives the
    # hold-out frame, pickled fold datasets and model checkpoints.
    input_dir = "Data/"
    path = "Models_DART_all_10corr_5folds/"

# Best validation score recorded so far for each fold, keyed by fold index.
# Every fold starts at 0.785, so the save_model callback only writes a
# checkpoint once a score exceeds that floor.
score_dic = {i: 0.785 for i in range(CFG.n_folds)}

def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


def save_model(fold, prefix):
    """Build a LightGBM callback that checkpoints the best booster of a fold.

    After each boosting iteration the returned callback reads the first entry
    of ``env.evaluation_result_list`` and compares its value with
    ``score_dic[fold]``. When the score improves, the previous checkpoint of
    this ``prefix``/``fold`` pair is deleted from ``CFG.path`` and the current
    booster is dumped with joblib under a name carrying the iteration and score.

    Args:
        fold: Fold index; used as the key into ``score_dic`` and in file names.
        prefix: File-name prefix identifying the training stage (e.g. "HT0").

    Returns:
        A callable taking LightGBM's callback environment, suitable for the
        ``callbacks`` argument of ``lgb.train``.
    """
    def callback(env):
        iteration = env.iteration
        # Value of the first evaluation result (index 2 of the result tuple).
        score = env.evaluation_result_list[0][2]
        if iteration % 500 == 0:
            print(
                "iteration {}, score= {:.05f}, max_score= {:.05f}".format(
                    iteration, score, score_dic[fold]
                )
            )
        if score > score_dic[fold]:
            score_dic[fold] = score

            # Keep a single checkpoint per (prefix, fold): drop the older one
            # before writing the new best model.
            stale_prefix = "{}_fold_{}_iter".format(prefix, fold)
            for fname in os.listdir(CFG.path):
                if fname.startswith(stale_prefix):
                    os.remove(os.path.join(CFG.path, fname))

            print("High Score: iteration {}, score={:.05f}".format(iteration, score))
            joblib.dump(
                env.model,
                CFG.path
                + "{}_fold_{}_iter_{}_score_{:.05f}.pkl".format(prefix, fold, iteration, score),
            )

    # lgb.train sorts its callbacks by an `order` attribute, so a plain
    # function must define one.
    callback.order = 0
    # The factory has to hand the closure back; without this the caller would
    # pass None to lgb.train.
    return callback

In [None]:
# Load the engineered training features and attach the labels, aligned on the
# feature frame's index.
train = pd.read_parquet(CFG.input_dir + "train_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate.parquet")
# NOTE(review): read_pickle executes pickle data; only load label files that
# come from a trusted source.
labels = pd.read_pickle(CFG.input_dir + "train_labels.pkl").loc[train.index]
train[CFG.target] = labels

# Base names of the categorical variables.
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68",
]

# The columns handed to LightGBM as categorical are the "<name>_last" variants.
cat_features = [f"{cf}_last" for cf in cat_features]
train.shape, labels.shape


In [None]:
# Whitelist of "corr_*" features to keep; every other "corr_*" column is dropped.
top_corr = [
    "corr_D_39-B_26",
    "corr_D_48-B_4",
    "corr_P_2-D_44",
    "corr_D_47-B_4",
    "corr_D_47-D_39",
    "corr_P_2-B_4",
    "corr_D_39-B_10",
    "corr_D_44-B_4",
    "corr_D_39-B_2",
    "corr_D_46-B_4",
]

# All columns currently in the frame whose name starts with "corr_".
corr_col = list(train.columns[train.columns.str.startswith("corr_")])

corr_to_remove = set(corr_col) - set(top_corr)
train.drop(columns=corr_to_remove, inplace=True)
train.shape, len(top_corr)

In [None]:
# Set aside a seeded 5% sample as a hold-out frame and remove those rows from
# the training frame.
test = train.sample(frac=0.05, random_state=CFG.seed)
# This is the first write into CFG.path; create the directory so the cell
# also works on a fresh checkout.
os.makedirs(CFG.path, exist_ok=True)
test.to_parquet(CFG.path + "validation.parquet")
train = train.drop(test.index)
train.shape, test.shape

In [None]:
# Materialise one (train, valid) pair of LightGBM datasets per stratified fold
# and pickle them into CFG.path so training can reload a single fold at a time.
folds = [i for i in range(0,CFG.n_folds,1)]

# Every column except the label is a feature; the list is the same for all folds.
features = [col for col in train.columns if col != CFG.target]

kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):

    print("Saving fold {} lgb dataset ...".format(fold))

    x_train, x_val = (
        train[features].iloc[trn_ind],
        train[features].iloc[val_ind],
    )
    y_train, y_val = (
        train[CFG.target].iloc[trn_ind],
        train[CFG.target].iloc[val_ind],
    )

    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)

    # Context managers close both files; the train-fold handle used to be
    # rebound without ever being closed.
    with open(CFG.path + 'train_fold_{}.pkl'.format(fold), "wb") as train_file:
        pickle.dump(lgb_train, train_file)
    with open(CFG.path + 'valid_fold_{}.pkl'.format(fold), "wb") as valid_file:
        pickle.dump(lgb_valid, valid_file)

    # Free the per-fold copies before building the next fold.
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

In [None]:
def train_and_evaluate(parameters, rounds, folds = [i for i in range(0,CFG.n_folds,1)], load_model=False, prefix='HT0'):
    """Train one LightGBM model per fold and refresh each fold's checkpoint.

    For every fold the pickled train/valid datasets are loaded from
    ``CFG.path``, a model is trained with the ``save_model`` callback, and any
    joblib checkpoint named ``<prefix>_fold_<fold>_iter...`` found in
    ``CFG.path`` afterwards is re-saved as ``cp_<fold>_model.txt``.

    Args:
        parameters: LightGBM parameter dict passed to ``lgb.train``.
        rounds: Value for ``num_boost_round``.
        folds: Fold indices to train (the default list is never mutated here).
        load_model: When True, training continues from the fold's
            ``cp_<fold>_model.txt`` via ``init_model``.
        prefix: Stage prefix used in checkpoint file names.
    """
    for fold in folds:
        print("Training fold {} ...".format(fold))
        # NOTE(review): pickle.load runs arbitrary code; these files are
        # expected to be the ones this notebook wrote itself.
        with open(CFG.path + 'train_fold_{}.pkl'.format(fold), "rb") as train_file:
            lgb_train = pickle.load(train_file)
        with open(CFG.path + 'valid_fold_{}.pkl'.format(fold), "rb") as valid_file:
            lgb_valid = pickle.load(valid_file)

        checkpoint_path = CFG.path + "cp_{}_model.txt".format(fold)

        # The warm-start and from-scratch calls differ only by init_model.
        train_kwargs = dict(
            params=parameters,
            train_set=lgb_train,
            num_boost_round=rounds,
            valid_sets=[lgb_valid],
            feval=lgb_amex_metric,
            callbacks=[save_model(fold, prefix)],
        )
        if load_model:
            train_kwargs["init_model"] = checkpoint_path
        model = lgb.train(**train_kwargs)

        del lgb_train, lgb_valid
        gc.collect()

        # Promote the checkpoint written by the callback (if any) to the
        # text model that the next warm-started stage reads.
        best_prefix = "{}_fold_{}_iter".format(prefix, fold)
        for fname in os.listdir(CFG.path):
            if fname.startswith(best_prefix):
                model = joblib.load(CFG.path + fname)
                model.save_model(checkpoint_path)


In [None]:
# Stage HT0: train every fold from scratch (load_model=False) for 10000 rounds.
# Reset the per-fold best scores to the 0.785 floor first.
score_dic = dict.fromkeys(range(CFG.n_folds), 0.785)

# ParameterGrid expects a list of candidates per key, so each single value is
# wrapped in a one-element list.
params = {
    name: [value]
    for name, value in {
        "objective": "binary",
        "metric": "amex_metric",
        "boosting": "dart",
        "seed": 42,
        "num_leaves": 100,
        "learning_rate": 0.01,
        "drop_rate": 0.1,
        "feature_fraction": 0.2,
        "bagging_freq": 10,
        "bagging_fraction": 0.50,
        "n_jobs": -1,
        "lambda_l1": 0,
        "lambda_l2": 20,
        "min_data_in_leaf": 40,
        "force_col_wise": True,
    }.items()
}

grid = list(ParameterGrid(params))
len_grid = len(grid)
print(f"{len_grid} models to train")

for run, parameters in enumerate(grid):
    print("-" * 50)
    print(f"Training run {run} ...")
    train_and_evaluate(
        parameters,
        rounds=10000,
        folds=[0, 1, 2, 3, 4],
        load_model=False,
        prefix="HT0",
    )

In [None]:
# Stage HT1: one run per seed 1..5, each warm-started from the fold's
# cp_<fold>_model.txt (load_model=True) for 1000 rounds. score_dic is not
# reset here, so a checkpoint is only written when an earlier best is beaten.
params = {
    name: [value]
    for name, value in {
        "objective": "binary",
        "metric": "amex_metric",
        "boosting": "dart",
        "num_leaves": 100,
        "learning_rate": 0.01,
        "drop_rate": 0.1,
        "feature_fraction": 0.2,
        "bagging_freq": 10,
        "bagging_fraction": 0.50,
        "n_jobs": -1,
        "lambda_l1": 0,
        "lambda_l2": 20,
        "min_data_in_leaf": 40,
        "force_col_wise": True,
    }.items()
}
# The only key with several candidates: seeds 1 through 5.
params["seed"] = list(range(1, 6))

grid = list(ParameterGrid(params))
len_grid = len(grid)
print(f"{len_grid} models to train")

for run, parameters in enumerate(grid):
    print("-" * 50)
    print(f"Training run {run} ...")
    train_and_evaluate(
        parameters,
        rounds=1000,
        folds=[0, 1, 2, 3, 4],
        load_model=True,
        prefix="HT1",
    )

In [None]:
# Stage HT2: one run per seed 6..10, each warm-started from the fold's
# cp_<fold>_model.txt (load_model=True) for 1000 rounds. score_dic is not
# reset here, so a checkpoint is only written when an earlier best is beaten.
params = {
    name: [value]
    for name, value in {
        "objective": "binary",
        "metric": "amex_metric",
        "boosting": "dart",
        "num_leaves": 100,
        "learning_rate": 0.01,
        "drop_rate": 0.1,
        "feature_fraction": 0.2,
        "bagging_freq": 10,
        "bagging_fraction": 0.50,
        "n_jobs": -1,
        "lambda_l1": 0,
        "lambda_l2": 20,
        "min_data_in_leaf": 40,
        "force_col_wise": True,
    }.items()
}
# The only key with several candidates: seeds 6 through 10.
params["seed"] = list(range(6, 11))

grid = list(ParameterGrid(params))
len_grid = len(grid)
print(f"{len_grid} models to train")

for run, parameters in enumerate(grid):
    print("-" * 50)
    print(f"Training run {run} ...")
    train_and_evaluate(
        parameters,
        rounds=1000,
        folds=[0, 1, 2, 3, 4],
        load_model=True,
        prefix="HT2",
    )

In [None]:
# Stage HT3: one run per seed 11..15, each warm-started from the fold's
# cp_<fold>_model.txt (load_model=True) for 1000 rounds. score_dic is not
# reset here, so a checkpoint is only written when an earlier best is beaten.
params = {
    name: [value]
    for name, value in {
        "objective": "binary",
        "metric": "amex_metric",
        "boosting": "dart",
        "num_leaves": 100,
        "learning_rate": 0.01,
        "drop_rate": 0.1,
        "feature_fraction": 0.2,
        "bagging_freq": 10,
        "bagging_fraction": 0.50,
        "n_jobs": -1,
        "lambda_l1": 0,
        "lambda_l2": 20,
        "min_data_in_leaf": 40,
        "force_col_wise": True,
    }.items()
}
# The only key with several candidates: seeds 11 through 15.
params["seed"] = list(range(11, 16))

grid = list(ParameterGrid(params))
len_grid = len(grid)
print(f"{len_grid} models to train")

for run, parameters in enumerate(grid):
    print("-" * 50)
    print(f"Training run {run} ...")
    train_and_evaluate(
        parameters,
        rounds=1000,
        folds=[0, 1, 2, 3, 4],
        load_model=True,
        prefix="HT3",
    )

In [None]:
# Stage HT4: one run per seed 16..20, each warm-started from the fold's
# cp_<fold>_model.txt (load_model=True) for 1000 rounds. score_dic is not
# reset here, so a checkpoint is only written when an earlier best is beaten.
params = {
    name: [value]
    for name, value in {
        "objective": "binary",
        "metric": "amex_metric",
        "boosting": "dart",
        "num_leaves": 100,
        "learning_rate": 0.01,
        "drop_rate": 0.1,
        "feature_fraction": 0.2,
        "bagging_freq": 10,
        "bagging_fraction": 0.50,
        "n_jobs": -1,
        "lambda_l1": 0,
        "lambda_l2": 20,
        "min_data_in_leaf": 40,
        "force_col_wise": True,
    }.items()
}
# The only key with several candidates: seeds 16 through 20.
params["seed"] = list(range(16, 21))

grid = list(ParameterGrid(params))
len_grid = len(grid)
print(f"{len_grid} models to train")

for run, parameters in enumerate(grid):
    print("-" * 50)
    print(f"Training run {run} ...")
    train_and_evaluate(
        parameters,
        rounds=1000,
        folds=[0, 1, 2, 3, 4],
        load_model=True,
        prefix="HT4",
    )

In [None]:
# Display the (rows, columns) shape of the 5% hold-out frame sampled from train.
test.shape