In [1]:
import warnings
warnings.filterwarnings("ignore")

from evaluation_metric import lgb_amex_metric
import os
import gc
import random
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterGrid
import lightgbm as lgb


class CFG:
    """Notebook-wide configuration constants (read as plain class attributes)."""

    # Reproducibility and cross-validation layout.
    seed = 42
    n_folds = 5

    # Name of the label column in the training frame.
    target = "target"

    # Filesystem locations: input data and the model/checkpoint directory.
    # Both end with "/" because callers build paths by string concatenation.
    input_dir = "Data/"
    path = "models_DART_all_corr_pacslope_lag_avediff/"


def seed_everything(seed):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and written
            (as a string) to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


def save_model(fold):
    """Build a LightGBM callback that checkpoints the best booster of one fold.

    On every boosting iteration the returned callback reads the value of the
    first entry in ``env.evaluation_result_list`` and compares it with
    ``score_dic[fold]`` (a module-level dict). When the new score is strictly
    higher, ``env.model`` is dumped with joblib into ``CFG.path`` as
    ``fold_<fold>_iter_<iteration>_score_<score>.pkl``, the record in
    ``score_dic`` is updated, and older ``fold_<fold>_iter*`` files are removed.

    The new checkpoint is written *before* the old ones are deleted, so a
    failed dump never leaves the fold without a checkpoint. The target
    directory is created on demand.

    Args:
        fold: Fold identifier; key into ``score_dic`` and part of the file name.

    Returns:
        A callable accepting LightGBM's callback environment. Its ``order``
        attribute is set to 0.

    Raises:
        KeyError: raised by the callback if ``score_dic`` has no entry for
            ``fold``.
    """
    prefix = "fold_{}_iter".format(fold)

    def callback(env):
        iteration = env.iteration
        # First evaluation entry; element 2 is used as the score.
        # NOTE(review): this assumes the custom metric is the first (only)
        # evaluation result -- confirm if a built-in metric is ever enabled.
        score = env.evaluation_result_list[0][2]
        if iteration % 500 == 0:
            print(
                "iteration {}, score= {:.05f}, max_score= {:.05f}".format(
                    iteration, score, score_dic[fold]
                )
            )
        if score > score_dic[fold]:
            print("High Score: iteration {}, score={:.05f}".format(iteration, score))

            os.makedirs(CFG.path, exist_ok=True)
            new_name = "fold_{}_iter_{}_score_{:.05f}.pkl".format(
                fold, iteration, score
            )
            joblib.dump(env.model, os.path.join(CFG.path, new_name))
            # Only record the new best once it is safely on disk.
            score_dic[fold] = score

            # Drop superseded checkpoints of this fold, keeping the new one.
            for fname in os.listdir(CFG.path):
                if fname.startswith(prefix) and fname != new_name:
                    os.remove(os.path.join(CFG.path, fname))

    callback.order = 0
    return callback


In [2]:
# Load the engineered training features; paths and the label column name come
# from CFG so they stay consistent with the rest of the notebook.
train = pd.read_parquet(CFG.input_dir + "train_all_slopes_corr_pcaslope_lag_avediff.parquet")
# Re-order the labels to the feature frame's index so the rows line up.
# NOTE(review): read_pickle unpickles arbitrary objects -- only use trusted files.
labels = pd.read_pickle(CFG.input_dir + "train_labels.pkl").loc[train.index]
train[CFG.target] = labels

# Base names of the categorical columns; the frame holds them with a "_last" suffix.
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68",
]

cat_features = [f"{cf}_last" for cf in cat_features]
train.shape, labels.shape


((458913, 3240), (458913, 1))

In [3]:
# Hand-picked shortlist of pairwise-correlation features to keep.
top_corr = [
    "corr_D_39-B_26",
    "corr_D_48-B_4",
    "corr_P_2-D_44",
    "corr_D_47-B_4",
    "corr_D_47-D_39",
    "corr_P_2-B_4",
    "corr_D_39-B_10",
    "corr_D_44-B_4",
    "corr_D_39-B_2",
    "corr_D_46-B_4",
    "corr_D_48-B_3",
    "corr_D_48-B_9",
    "corr_S_5-S_24",
    "corr_S_7-S_3",
    "corr_D_43-D_144",
    "corr_D_48-D_39",
    "corr_P_3-D_46",
    "corr_S_5-D_43",
    "corr_R_1-B_4",
    "corr_P_3-D_47",
    "corr_D_39-B_3",
    "corr_R_6-D_39",
    "corr_S_27-B_2",
    "corr_S_23-D_43",
    "corr_R_6-D_69",
    "corr_P_2-D_48",
    "corr_S_25-B_4",
    "corr_D_43-B_4",
    "corr_R_27-D_69",
    "corr_S_7-S_27",
    "corr_D_39-B_11",
    "corr_S_3-D_39",
    "corr_S_12-B_4",
    "corr_D_39-B_15",
    "corr_R_27-B_26",
    "corr_S_23-D_39",
    "corr_R_27-R_1",
    "corr_R_1-D_39",
    "corr_S_19-D_39",
    "corr_S_27-B_3",
    "corr_S_16-D_39",
    "corr_R_27-B_5",
    "corr_S_3-D_62",
    "corr_D_71-D_62",
    "corr_R_27-D_39",
    "corr_D_48-D_43",
    "corr_D_61-B_36",
    "corr_S_25-D_39",
    "corr_R_6-D_43",
    "corr_S_27-R_27",
    "corr_S_27-S_12",
    "corr_S_27-D_39",
    "corr_D_46-B_3",
    "corr_D_62-D_47",
    "corr_B_4-B_3",
    "corr_R_1-D_48",
    "corr_S_16-D_46",
    "corr_D_61-D_48",
    "corr_P_2-D_39",
    "corr_R_27-B_2",
]

# Every column of the frame whose name begins with "corr_".
corr_col = [name for name in train.columns if name.startswith("corr_")]

# Remove all correlation columns that are not on the shortlist.
corr_to_remove = set(corr_col) - set(top_corr)
train.drop(columns=list(corr_to_remove), inplace=True)
train.shape, len(top_corr)


((458913, 2075), 60)

In [4]:
def train_and_evaluate(train, parameters, rounds, load_model=False, folds=None):
    """Train one LightGBM booster per stratified CV fold and export its best checkpoint.

    For every selected fold the function builds train/validation
    ``lgb.Dataset`` objects, runs ``lgb.train`` with the ``lgb_amex_metric``
    custom metric and the ``save_model(fold)`` callback, and afterwards
    converts every ``fold_<fold>_iter*`` pickle found in ``CFG.path`` into
    ``cp_<fold>_model.txt`` via the booster's ``save_model``.

    Relies on module-level names: ``CFG``, ``cat_features``,
    ``lgb_amex_metric`` and ``save_model``.

    Args:
        train: DataFrame holding the feature columns plus the label column
            named ``CFG.target``.
        parameters: LightGBM parameter dict passed to ``lgb.train``.
        rounds: Value for ``num_boost_round``.
        load_model: When True, training starts from
            ``CFG.path/cp_<fold>_model.txt`` (passed as ``init_model``).
        folds: Optional iterable of fold indices to train. ``None`` (default)
            trains every fold in ``range(CFG.n_folds)``.

    Returns:
        None. Results are written to ``CFG.path``.
    """
    # The checkpoint directory is listed below; make sure it exists.
    os.makedirs(CFG.path, exist_ok=True)
    selected_folds = set(range(CFG.n_folds)) if folds is None else set(folds)

    # Column positions of every feature, i.e. all columns except the label.
    # Using CFG.target (not a literal) keeps this in sync with the label lookup
    # below, and positional slicing copies each fold's rows only once.
    feature_pos = [i for i, col in enumerate(train.columns) if col != CFG.target]
    n_features = len(feature_pos)

    kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        if fold not in selected_folds:
            continue

        print(" ")
        print(f"Training fold {fold} with {n_features} features...")

        lgb_train = lgb.Dataset(
            train.iloc[trn_ind, feature_pos],
            train[CFG.target].iloc[trn_ind],
            categorical_feature=cat_features,
        )
        lgb_valid = lgb.Dataset(
            train.iloc[val_ind, feature_pos],
            train[CFG.target].iloc[val_ind],
            categorical_feature=cat_features,
        )
        gc.collect()

        checkpoint_txt = os.path.join(CFG.path, "cp_{}_model.txt".format(fold))
        # init_model=None is lgb.train's default, i.e. training from scratch.
        lgb.train(
            params=parameters,
            train_set=lgb_train,
            num_boost_round=rounds,
            valid_sets=[lgb_valid],
            feval=lgb_amex_metric,
            callbacks=[save_model(fold)],
            init_model=checkpoint_txt if load_model else None,
        )

        # Release the fold's datasets before the next fold is materialised.
        del lgb_train, lgb_valid
        gc.collect()

        # Export the pickled best booster(s) of this fold to LightGBM text format.
        # NOTE(review): joblib.load unpickles arbitrary objects; acceptable only
        # because these files are produced by the save_model callback.
        prefix = "fold_{}_iter".format(fold)
        for fname in os.listdir(CFG.path):
            if fname.startswith(prefix):
                best_model = joblib.load(os.path.join(CFG.path, fname))
                best_model.save_model(checkpoint_txt)


In [5]:
# Single-point "grid": every hyper-parameter has exactly one candidate value.
params = dict(
    objective=["binary"],
    metric=["amex_metric"],
    boosting=["dart"],
    seed=[42],
    num_leaves=[100],
    learning_rate=[0.01],
    drop_rate=[0.1],
    feature_fraction=[0.20],
    bagging_freq=[10],
    bagging_fraction=[0.50],
    n_jobs=[-1],
    lambda_l1=[0],
    lambda_l2=[20],
    min_data_in_leaf=[40],
)

# Per-fold score a model must beat before the save_model callback stores it.
score_dic = dict.fromkeys(range(5), 0.789)

grid = list(ParameterGrid(params))
len_grid = len(grid)
print(f"{len_grid} models to train")

# Train from scratch for every parameter combination.
for run, parameters in enumerate(grid):
    print("-" * 50)
    print(run, len_grid, parameters)
    train_and_evaluate(train, parameters, rounds=15000, load_model=False)


1 models to train
--------------------------------------------------
0 1 {'bagging_fraction': 0.5, 'bagging_freq': 10, 'boosting': 'dart', 'drop_rate': 0.1, 'feature_fraction': 0.2, 'lambda_l1': 0, 'lambda_l2': 20, 'learning_rate': 0.01, 'metric': 'amex_metric', 'min_data_in_leaf': 40, 'n_jobs': -1, 'num_leaves': 100, 'objective': 'binary', 'seed': 42}
 
Training fold 0 with 2074 features...
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 368871
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 2060
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
iteration 0, score= 0.70976, max_score= 0.78900
iteration 500, score= 0.77048, max_score= 0.78900
iteration 1000, score= 0.78120, max_score= 0.78900
iteration 1500, score= 0.78807, max_score= 0.78900
High Scor

In [None]:
# Best score recorded so far for each fold (updated by the save_model callback).
print(score_dic)

In [None]:
# Grid over learning_rate x min_data_in_leaf; everything else is fixed.
params = dict(
    objective=["binary"],
    metric=["amex_metric"],
    boosting=["dart"],
    seed=[42],
    num_leaves=[100],
    learning_rate=[0.01, 0.005],
    drop_rate=[0.1],
    feature_fraction=[0.50],
    bagging_freq=[10],
    bagging_fraction=[0.80],
    n_jobs=[-1],
    lambda_l1=[0],
    lambda_l2=[20],
    min_data_in_leaf=[100, 200, 300, 400, 500],
)

grid = list(ParameterGrid(params))
len_grid = len(grid)
print(f"{len_grid} models to train")

# Each combination continues training from the fold's saved checkpoint.
for run, parameters in enumerate(grid):
    print("-" * 50)
    print(run, len_grid, parameters)
    train_and_evaluate(train, parameters, rounds=3000, load_model=True)


In [None]:
# Best score recorded so far for each fold (updated by the save_model callback).
print(score_dic)

In [None]:
# Grid over learning_rate x lambda_l1 x lambda_l2; everything else is fixed.
params = dict(
    objective=["binary"],
    metric=["amex_metric"],
    boosting=["dart"],
    seed=[42],
    num_leaves=[100],
    learning_rate=[0.01, 0.005],
    drop_rate=[0.1],
    feature_fraction=[0.50],
    bagging_freq=[10],
    bagging_fraction=[0.80],
    n_jobs=[-1],
    lambda_l1=[0, 20, 40],
    lambda_l2=[20, 40, 60],
    min_data_in_leaf=[100],
)

grid = list(ParameterGrid(params))
len_grid = len(grid)
print(f"{len_grid} models to train")

# Each combination continues training from the fold's saved checkpoint.
for run, parameters in enumerate(grid):
    print("-" * 50)
    print(run, len_grid, parameters)
    train_and_evaluate(train, parameters, rounds=3000, load_model=True)

In [None]:
# Best score recorded so far for each fold (updated by the save_model callback).
print(score_dic)