In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterGrid
from evaluation_metric import lgb_amex_metric

import joblib
import lightgbm as lgb
# Best validation score seen so far. The checkpoint callback only saves a model
# whose score is strictly greater than this value, so the initial value acts as
# a minimum bar. (A `global` statement is a no-op at module level; plain
# assignment already creates the module-level name.)
max_score = 0.780

def save_model():
    """Build a LightGBM callback that checkpoints the best-scoring booster.

    On every iteration the returned callback takes the value at index 2 of the
    first entry in ``env.evaluation_result_list`` as the score and compares it
    with the module-level ``max_score``. When the score is strictly higher:

    * ``Models/`` is created if it does not exist yet,
    * ``env.model`` is dumped to ``Models/weak_fold_score_<score>.pkl``,
    * ``max_score`` is raised to the new score,
    * every other ``weak_fold*`` file in ``Models/`` is deleted.

    The new checkpoint is written *before* older ones are removed, so a failed
    dump never leaves the directory without the previous best model.

    Returns:
        callable: a callback accepting LightGBM's callback environment. Its
        ``order`` attribute is set to 0.
    """
    model_dir = 'Models/'

    def callback(env):
        global max_score
        iteration = env.iteration
        score = env.evaluation_result_list[0][2]

        # Periodic progress line, independent of whether a new best was found.
        if iteration % 500 == 0:
            print('iteration {}, score= {:.05f}'.format(iteration,score))

        if score > max_score:
            print('High Score: iteration {}, score={:.05f}'.format(iteration, score))

            # A missing directory used to crash the run at the first new best.
            os.makedirs(model_dir, exist_ok=True)
            best_path = 'Models/weak_fold_score_{:.05f}.pkl'.format(score)
            joblib.dump(env.model, best_path)

            # Only record the new best once it is safely on disk.
            max_score = score

            # Prune superseded checkpoints, keeping the file just written.
            keep_name = os.path.basename(best_path)
            for fname in os.listdir(model_dir):
                if fname.startswith("weak_fold") and fname != keep_name:
                    os.remove(os.path.join(model_dir, fname))

    callback.order = 0
    return callback

In [2]:
# Row labels of the held-out fold, and the full training table.
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by this project.
val_indx = pd.read_pickle('Data/weak_fold_indx.pkl')['weak_index'].tolist()
train_val = pd.read_parquet('Data/train_all.parquet')

# Labels that are not in the held-out fold (kept as a set, as before).
train_indx = set(train_val.index).difference(val_indx)

# Select training rows with a boolean mask instead of `.loc[<set>]`:
# pandas >= 2.0 rejects set indexers with a TypeError, and a set also
# scrambled the row order. The mask keeps the rows in their original order.
is_val_row = train_val.index.isin(val_indx)
train = train_val.loc[~is_val_row]
val = train_val.loc[val_indx]
train.shape, val.shape

In [None]:
# Categorical inputs: every base feature is used through its "_last" column.
cat_features = [
    f"{base}_last"
    for base in (
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    )
]

# Integer-encode each categorical column. A fresh encoder is fitted on the
# training rows and then applied as-is to the validation rows.
# NOTE(review): transform() will fail on a validation category that never
# occurs in the training rows - confirm the fold cannot produce that.
for cat_col in cat_features:
    encoder = LabelEncoder()
    train[cat_col] = encoder.fit_transform(train[cat_col])
    val[cat_col] = encoder.transform(val[cat_col])

# Float-typed columns whose name contains 'last' get a companion column
# rounded to two decimals, added to both frames under '<name>_round2'.
float_mask = (train.dtypes == 'float32') | (train.dtypes == 'float64')
num_cols = [name for name in train.dtypes[float_mask].index if 'last' in name]
for col in num_cols:
    rounded_name = col + '_round2'
    train[rounded_name] = train[col].round(2)
    val[rounded_name] = val[col].round(2)

# Everything except the label column is a model input.
features = [col for col in train.columns if col != 'target']

train.shape, val.shape

In [None]:
def train_and_evaluate(lgb_train, lgb_valid, parameters, num_boost_round=15000):
    """Train one LightGBM model and return the resulting booster.

    Training is evaluated on ``lgb_valid`` with the custom ``lgb_amex_metric``
    and uses the ``save_model()`` callback, which checkpoints the booster
    whenever the validation score beats the best one seen so far.

    Args:
        lgb_train: training ``lgb.Dataset``.
        lgb_valid: validation ``lgb.Dataset``; the only entry in ``valid_sets``.
        parameters: dict of LightGBM parameters for this run.
        num_boost_round: number of boosting rounds; defaults to 15000, the
            value that was previously hard-coded.

    Returns:
        The object returned by ``lgb.train``. It used to be discarded, which
        left callers with no handle on the trained model.
    """
    model = lgb.train(
        params=parameters,
        train_set=lgb_train,
        num_boost_round=num_boost_round,
        valid_sets=[lgb_valid],
        feval=lgb_amex_metric,
        callbacks=[save_model()],
    )
    return model

In [None]:
# Search space: every value is a list of candidates, so the mapping can be
# expanded into one parameter dict per combination. Only `learning_rate` and
# `drop_rate` have more than one candidate.
params = dict(
    objective=['binary'],
    metric=['amex_metric'],
    boosting=['dart'],
    seed=[42],
    num_leaves=[100],
    learning_rate=[0.01, 0.005],
    drop_rate=[0.01, 0.025, 0.05, 0.1, 0.2],
    feature_fraction=[0.20],
    bagging_freq=[10],
    bagging_fraction=[0.50],
    n_jobs=[-1],
    lambda_l2=[2],
    min_data_in_leaf=[40],
)

# Training and validation datasets share the same feature list and the same
# categorical column declaration; 'target' is the label in both frames.
lgb_train = lgb.Dataset(
    train[features],
    label=train['target'],
    categorical_feature=cat_features,
)
lgb_valid = lgb.Dataset(
    val[features],
    label=val['target'],
    categorical_feature=cat_features,
)

In [None]:
# Expand the search space into concrete parameter dicts and train one model
# per combination, logging the run index, the grid size and the parameters.
grid = list(ParameterGrid(params))
len_grid = len(grid)
for run in range(len_grid):
    parameters = grid[run]
    print('-' * 50)
    print(run, len_grid, parameters)
    train_and_evaluate(lgb_train, lgb_valid, parameters)