In [None]:
from evaluation_metric import lgb_amex_metric, amex_metric
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterGrid
import lightgbm as lgb
import joblib
import gc
import os

In [None]:
# Load the training feature table (one row per customer_ID) and pull the
# label rows for exactly those customers, in the same row order.
train_csv_path = 'Output/train_DART_folds.csv'
labels_pkl_path = 'Data/train_labels.pkl'

data = pd.read_csv(train_csv_path, index_col='customer_ID')
# Snapshot of the original input columns; later cells add derived columns to
# `data` and use this to aggregate over the original ones only.
columns = data.columns

all_labels = pd.read_pickle(labels_pkl_path)
labels = all_labels.loc[data.index]
del all_labels

data.shape, labels.shape

In [None]:
# Append row-wise summary statistics computed over the original input columns
# only (`columns` was captured before any derived column was added).
base_features = data[columns]
for stat_name in ('mean', 'std', 'min', 'max', 'median', 'skew', 'kurtosis'):
    # Each name is a DataFrame reduction method; axis=1 reduces across columns.
    data[stat_name] = getattr(base_features, stat_name)(axis=1)
del base_features
data.shape

In [None]:
# Sanity check: enumerate the stratified splits and print each fold number.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = kfold.split(data, labels)
for fold, (trn_ind, val_ind) in enumerate(fold_splits):
    print(fold)

In [None]:
def save_model(fold, ave_score):
    """Build a training callback that checkpoints the best-scoring model of a fold.

    The returned callback reads the score from the first entry of
    ``env.evaluation_result_list`` (its third element) on every invocation and
    compares it against ``score_dict[fold]``, a module-level dict that holds
    the best score seen so far per fold. When the score improves, the dict is
    updated and ``env.model`` is dumped with joblib into ``Ensemble_DART/``;
    older checkpoints of the same fold are then removed, so at most one
    checkpoint per fold remains.

    Parameters
    ----------
    fold : int
        Fold number; key into ``score_dict`` and part of the checkpoint name.
    ave_score : float
        Reference score that is only echoed in the log lines.

    Returns
    -------
    callable
        ``callback(env)`` carrying an ``order`` attribute set to 0.
    """
    path = 'Ensemble_DART/'
    prefix = "fold_{}_iter".format(fold)

    def callback(env):
        iteration = env.iteration
        score = env.evaluation_result_list[0][2]

        # Periodic progress line.
        if iteration % 50 == 0:
            print('iteration {}, score= {:.05f}, max_score= {:.05f}, ave_score= {:.05f}'.format(iteration,score, score_dict[fold], ave_score))

        if score > score_dict[fold]:
            score_dict[fold] = score

            # Fix: create the output directory on demand instead of failing
            # in os.listdir() when it does not exist yet.
            os.makedirs(path, exist_ok=True)

            print('High Score: iteration {}, score={:.05f}, ave_score={:.05f}'.format(iteration, score, ave_score))

            # Fix: write the new checkpoint first and prune the old ones
            # afterwards, so a failed dump never leaves the fold without any
            # saved model.
            new_name = 'fold_{}_iter_{}_score_{:.05f}.pkl'.format(fold, iteration, score)
            joblib.dump(env.model, path + new_name)

            for fname in os.listdir(path):
                if fname.startswith(prefix) and fname != new_name:
                    os.remove(os.path.join(path, fname))

    # NOTE(review): presumably LightGBM uses `order` to sort callbacks - confirm.
    callback.order = 0
    return callback


def train_and_evaluate(data, parameters):
    """Train one LightGBM model per stratified fold and export each fold's checkpoint.

    For each of the 5 folds the function trains for 250 boosting rounds with
    the ``save_model`` callback attached, evaluating on the fold's validation
    split with ``lgb_amex_metric``. After training, every file in
    ``Ensemble_DART/`` whose name starts with ``fold_<fold>_iter`` is loaded
    with joblib and re-saved as ``cp_<fold>_model.txt`` in the same directory.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; must contain a ``'mean'`` column, which is scored
        against the validation targets to produce the reference score passed
        to the callback. Rows are matched positionally with the module-level
        ``labels`` frame.
    parameters : dict
        LightGBM parameters forwarded to ``lgb.train``.

    Returns
    -------
    bool
        Always ``True``.
    """
    path = 'Ensemble_DART/'
    # Fix: make sure the checkpoint directory exists before training; the
    # os.listdir() below would otherwise raise FileNotFoundError when no
    # checkpoint has been written yet.
    os.makedirs(path, exist_ok=True)

    kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(data, labels)):

        print('')

        x_train, x_val = data.iloc[trn_ind], data.iloc[val_ind]
        y_train, y_val = labels.iloc[trn_ind], labels.iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_valid = lgb.Dataset(x_val, y_val)

        # Reference score: the 'mean' column used directly as the prediction.
        ave_score = amex_metric(y_val['target'], x_val['mean'])
        del x_train, x_val, y_train, y_val; gc.collect()

        # The booster returned by lgb.train is intentionally not kept: the
        # model that gets exported is the checkpoint written by the callback.
        lgb.train(
            params = parameters,
            train_set = lgb_train,
            num_boost_round = 250,
            valid_sets = [lgb_valid],
            feval = lgb_amex_metric,
            callbacks=[save_model(fold, ave_score)],
            # init_model= path + 'cp_{}_model.txt'.format(fold),
            )

        # Re-export this fold's joblib checkpoint in LightGBM's text format.
        checkpoint_prefix = "fold_{}_iter".format(fold)
        for fname in os.listdir(path):
            if fname.startswith(checkpoint_prefix):
                best_model = joblib.load(path + fname)
                best_model.save_model(path + 'cp_{}_model.txt'.format(fold))

    return True

In [None]:
# Best score seen so far for each of the 5 folds. The save_model callback only
# writes a checkpoint when a score exceeds the value stored here, so 0.9 acts
# as the initial threshold.
score_dict = dict.fromkeys(range(5), 0.9)

In [None]:
# Single candidate value per LightGBM parameter. ParameterGrid expects a list
# of candidates for every key, so each value is wrapped in a one-element list.
single_candidates = {
    'objective': 'binary',
    'metric': 'amex_metric',
    'boosting': 'dart',
    'seed': 42,
    'num_leaves': 31,
    'learning_rate': 0.1,
    'drop_rate': 0.1,
    'feature_fraction': 1.0,
    'bagging_freq': 0,
    'bagging_fraction': 1.0,
    'n_jobs': -1,
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,
    'min_data_in_leaf': 20,
}
params = {name: [value] for name, value in single_candidates.items()}

In [None]:
# Expand the parameter grid and run the full cross-validated training once per
# parameter combination, printing a separator and the combination first.
grid = list(ParameterGrid(params))
len_grid = len(grid)
separator = '-' * 50
for run, parameters in enumerate(grid):
    print(separator)
    print(run, len_grid, parameters)
    train_and_evaluate(data, parameters)

### Prediction

In [None]:
import pandas as pd
import lightgbm as lgb
import joblib

In [None]:
# Load the test feature table and append the same row-wise summary statistics
# that were added to the training table.
test = pd.read_csv('Output/test_DART_folds.csv', index_col='customer_ID')
# Note: this rebinds `columns` to the test table's original columns.
columns = test.columns

test_base_features = test[columns]
for stat_name in ('mean', 'std', 'min', 'max', 'median', 'skew', 'kurtosis'):
    # Each name is a DataFrame reduction method; axis=1 reduces across columns.
    test[stat_name] = getattr(test_base_features, stat_name)(axis=1)
del test_base_features

test.shape

In [1]:
# Predict the test set with every saved fold checkpoint, in directory-listing
# order. joblib.load unpickles the file, so only trusted checkpoints belong in
# this directory.
path = 'Ensemble_DART/'
pred_list = []
fold_checkpoints = [fname for fname in os.listdir(path) if fname.startswith("fold_")]
for fname in fold_checkpoints:
    print(fname)
    model = joblib.load(path + fname)
    pred_list.append(model.predict(test))

fold_1_iter_194_score_0.99999.pkl
fold_4_iter_20_score_1.00000.pkl
fold_3_iter_247_score_0.99998.pkl
fold_2_iter_248_score_0.99990.pkl
fold_0_iter_172_score_0.99988.pkl


In [None]:
# Column labels for the per-fold predictions. They must be listed in the same
# order in which the checkpoints were loaded into `pred_list` (the order of
# the file names printed by the prediction loop).
column_name = [
    'fold_1',
    'fold_4',
    'fold_3',
    'fold_2',
    'fold_0',
]

# pred_list holds one prediction array per model; transpose so that each row
# is a test customer and each column is a fold model.
pred_df = pd.DataFrame(pred_list).T
pred_df.columns = column_name
# Fix: the predictions were computed on `test`, so they must be indexed by the
# test customer IDs, not by the training frame's index (`data.index`).
pred_df.index = test.index