In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterGrid
from evaluation_metric import lgb_amex_metric
from sklearn.model_selection import StratifiedKFold

import joblib
import lightgbm as lgb

import gc

def save_model(fold):
    """Build a LightGBM callback that checkpoints the best-scoring model of a fold.

    The callback reads the score from the first entry of
    ``env.evaluation_result_list`` (value at index 2), prints it every 200
    iterations, and whenever it beats ``score_dic[fold]`` it updates that
    entry and pickles ``env.model`` to
    ``Models/weak_fold_<fold>_score_<score>.pkl``.

    Parameters
    ----------
    fold : int
        Fold number; used as the key into the module-level ``score_dic`` and
        in the checkpoint file name.

    Returns
    -------
    callable
        A function taking the callback environment, with ``order`` set to 0.
    """
    model_dir = 'Models/'
    # Trailing underscore so that fold 1 never matches files of fold 10, 11, ...
    prefix = 'weak_fold_{}_'.format(fold)

    def callback(env):
        iteration = env.iteration
        score = env.evaluation_result_list[0][2]
        if iteration % 200 == 0:
            print('iteration {}, score= {:.05f}'.format(iteration, score))

        # A fold with no seeded entry is treated as having no best score yet
        # instead of raising KeyError on the first iteration.
        if score > score_dic.get(fold, float('-inf')):
            score_dic[fold] = score
            print('High Score: iteration {}, score={:.05f}'.format(iteration, score))

            # Write the new checkpoint first and only then remove the older
            # ones, so a failed dump cannot leave the fold without any file.
            best_name = 'weak_fold_{}_score_{:.05f}.pkl'.format(fold, score)
            joblib.dump(env.model, os.path.join(model_dir, best_name))
            for fname in os.listdir(model_dir):
                if fname.startswith(prefix) and fname != best_name:
                    os.remove(os.path.join(model_dir, fname))

    # LightGBM sorts callbacks by this attribute before running them.
    callback.order = 0
    return callback

In [2]:
# Index labels listed in the weak-fold file form the validation frame;
# every other row of the full table is a training row.
val_indx = pd.read_pickle('Data/weak_fold_indx.pkl')['weak_index'].tolist()
train_val = pd.read_parquet('Data/train_all.parquet')

# Kept as a set for any later membership checks, but no longer used as an
# indexer: `.loc[set]` raises TypeError on pandas >= 2.0 and gives the rows no
# defined order. The boolean mask selects the same rows in file order.
# NOTE(review): row order of `train` may differ from earlier runs that used
# the set indexer — confirm this is acceptable for seeded bagging.
train_indx = set(train_val.index).difference(val_indx)
train = train_val.loc[~train_val.index.isin(val_indx)]
val = train_val.loc[val_indx]
train.shape, val.shape

((385487, 1011), (73426, 1011))

In [3]:
# Base names of the categorical columns; the frames hold them with a
# `_last` suffix, which is appended below.
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68"
]

cat_features = [f"{cf}_last" for cf in cat_features]
for cat_col in cat_features:
    encoder = LabelEncoder()
    # Fit on the values of both frames: a category that occurs only in `val`
    # would otherwise make `transform` raise ValueError. When `val` has no
    # extra categories the resulting codes are the same as fitting on `train`.
    encoder.fit(pd.concat([train[cat_col], val[cat_col]], axis=0))
    train[cat_col] = encoder.transform(train[cat_col])
    val[cat_col] = encoder.transform(val[cat_col])

# Add a 2-decimal rounded copy of every float `last` column.
# Columns that already carry the suffix are skipped so that re-running this
# cell overwrites the rounded columns instead of creating `*_round2_round2`.
# NOTE(review): assumes the source table has no columns ending in `_round2`.
ROUND_SUFFIX = '_round2'
num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
num_cols = [col for col in num_cols if 'last' in col and not col.endswith(ROUND_SUFFIX)]
for col in num_cols:
    train[col + ROUND_SUFFIX] = train[col].round(2)
    val[col + ROUND_SUFFIX] = val[col].round(2)

# Every column except the label is a model input.
features = [col for col in train.columns if col != 'target']

train.shape, val.shape

((385487, 1104), (73426, 1104))

In [4]:
# Per-fold score that a new iteration has to beat before `save_model`
# writes a checkpoint; the callback raises the entry on every improvement.
score_dic = dict([
    (4, 0.78990),
    (7, 0.77869),
    (9, 0.78098),
])

In [5]:
def train_and_evaluate(train_org, val, parameters):
    """Continue training folds 4, 7 and 9 with one LightGBM parameter set.

    ``val`` is split into 10 stratified, shuffled folds (seed 42). For each
    selected fold, the other nine parts of ``val`` are appended to
    ``train_org`` as training data and the held-out part is the validation
    set. Training runs for 1200 rounds starting from
    ``Models/weak_<fold>_model.txt`` and is scored with ``lgb_amex_metric``;
    checkpoints are written by the ``save_model`` callback.

    Parameters
    ----------
    train_org : pandas.DataFrame
        Frame holding the module-level ``features`` columns and ``'target'``.
    val : pandas.DataFrame
        Frame with the same columns; split into folds here.
    parameters : dict
        Passed to ``lgb.train`` as ``params``.

    Returns
    -------
    None
    """
    selected_folds = (4, 7, 9)
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    targets = val['target']

    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(val, targets)):
        if fold not in selected_folds:
            continue

        print(f'Training fold {fold} with {len(features)} features...')

        # Training data = all of train_org plus the non-held-out part of val.
        fit_x = pd.concat([train_org[features], val[features].iloc[fit_rows]], axis=0)
        fit_y = pd.concat([train_org['target'], targets.iloc[fit_rows]], axis=0)

        lgb_train = lgb.Dataset(fit_x, fit_y, categorical_feature=cat_features)
        lgb_valid = lgb.Dataset(
            val[features].iloc[holdout_rows],
            targets.iloc[holdout_rows],
            categorical_feature=cat_features,
        )
        # Drop the local references to the concatenated frames before training.
        del fit_x, fit_y
        gc.collect()

        # The returned booster is not used further; the callback persists
        # the best iteration.
        model = lgb.train(
            params=parameters,
            train_set=lgb_train,
            num_boost_round=1200,
            valid_sets=[lgb_valid],
            feval=lgb_amex_metric,
            callbacks=[save_model(fold)],
            init_model='Models/weak_{}_model.txt'.format(fold),
        )

In [6]:
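# One-off conversion, kept commented out: load a pickled fold-4 checkpoint and
# re-save it at the path that `train_and_evaluate` passes as `init_model`.
# model = joblib.load('Models/weak_fold_4_score_0.78990.pkl')
# model.save_model('Models/weak_4_model.txt')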

In [7]:
# params = {
#     'objective': ['binary'],
#     'metric': ['amex_metric'],
#     'boosting': ['dart'],
#     'seed': [42],
#     'num_leaves': [100],
#     'learning_rate': [0.01],
#     'drop_rate': [0.1],
#     'feature_fraction': [0.20],
#     'bagging_freq': [10],
#     'bagging_fraction': [0.50],
#     'n_jobs': [-1],
#     'lambda_l2': [2],
#     'min_data_in_leaf': [40]
#     }

# Search space handed to ParameterGrid: each value is the list of candidates
# for that key, and single-element lists are held fixed.
# NOTE(review): drop_rate 0.50 is 20x the next candidate — confirm it is not
# a typo for 0.05.
# NOTE(review): 'amex_metric' looks like a custom metric name rather than a
# LightGBM built-in; scoring appears to come from `feval` — confirm.
params = dict(
    objective=['binary'],
    metric=['amex_metric'],
    boosting=['dart'],
    seed=[42],
    num_leaves=[100, 110],
    learning_rate=[0.01, 0.005],
    drop_rate=[0.01, 0.025, 0.50],
    feature_fraction=[0.30],
    bagging_freq=[10],
    bagging_fraction=[0.25, 0.50],
    n_jobs=[-1],
    lambda_l2=[2],
    min_data_in_leaf=[40],
)

In [8]:
# Expand the search space into concrete parameter sets and train the
# selected folds once per set, printing the run index, the total and the set.
grid = [*ParameterGrid(params)]
len_grid = len(grid)

for run, parameters in zip(range(len_grid), grid):
    print('-' * 50)
    print(run, len_grid, parameters)
    train_and_evaluate(train_org=train, val=val, parameters=parameters)

--------------------------------------------------
0 24 {'bagging_fraction': 0.25, 'bagging_freq': 10, 'boosting': 'dart', 'drop_rate': 0.01, 'feature_fraction': 0.3, 'lambda_l2': 2, 'learning_rate': 0.01, 'metric': 'amex_metric', 'min_data_in_leaf': 40, 'n_jobs': -1, 'num_leaves': 100, 'objective': 'binary', 'seed': 42}
Training fold 4 with 1103 features...
[LightGBM] [Info] Number of positive: 116926, number of negative: 334644
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 187417
[LightGBM] [Info] Number of data points in the train set: 451570, number of used features: 1095
High Score: iteration 16602, score=0.78991
High Score: iteration 16603, score=0.78991
iteration 16800, score= 0.78755
iteration 17000, score= 0.78883
iteration 17200, score= 0.78914
iteration 17400, score= 0.78812
iteration 17600, score= 0.78779
iteration 17800, score= 0.78887
Training fold 7 with 1103 features...
[LightGBM] [Info] Number of positive: 116927, number of nega