## Feature Engineering and CV based on 5-th Place Solutions

New in this notebook:
- Create training data for all months

In [1]:
from santander_helper import *

Create all months' data

In [7]:
fixed_lag = 6
for i, m in tqdm.tqdm_notebook(enumerate(month_list), total=len(month_list)):
    print(m)
    if m in ['2015-01-28', '2016-06-28']:
        continue
    x_train, y_train, w_train = create_train(m, max_lag=i, fixed_lag=fixed_lag, pattern_flag=True)
    print('-'*60)
del x_train, y_train, w_train
gc.collect()

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))

2015-01-28
2015-02-28
------------------------------------------------------------
2015-03-28
------------------------------------------------------------
2015-04-28
------------------------------------------------------------
2015-05-28
------------------------------------------------------------
2015-06-28
------------------------------------------------------------
2015-07-28
------------------------------------------------------------
2015-08-28
------------------------------------------------------------
2015-09-28
------------------------------------------------------------
2015-10-28
------------------------------------------------------------
2015-11-28
------------------------------------------------------------
2015-12-28
------------------------------------------------------------
2016-01-28
------------------------------------------------------------
2016-02-28
------------------------------------------------------------
2016-03-28
------------------------------------------

307

Load all months' data

In [2]:
x_train = []
y_train = []
w_train = []
fixed_lag = 6
for i, m in tqdm.tqdm_notebook(enumerate(month_list), total=len(month_list)):
    if m in ['2015-01-28', '2016-06-28']:
        continue
    x_tmp, y_tmp, w_tmp = create_train(m, max_lag=i, fixed_lag=fixed_lag, pattern_flag=True)
    x_train.append(x_tmp)
    y_train.append(y_tmp)
    w_train.append(w_tmp)
del x_tmp, y_tmp, w_tmp
gc.collect()

x_val = x_train[-1]
y_val = y_train[-1]
w_val = w_train[-1]

x_train = pd.concat(x_train[:-1], axis=0, ignore_index=True, sort=False)
y_train = pd.concat(y_train[:-1], axis=0, ignore_index=True, sort=False)
w_train = pd.concat(w_train[:-1], axis=0, ignore_index=True, sort=False)

gc.collect()

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))




In [24]:
def cv_xgb_skfrm(params, x_train, y_train, w_train, n_features=350, num_boost_round=3, n_splits=2, 
                           n_repeats=2, random_state=0, verbose_eval=False):
    '''
    CV of xgb using Stratified KFold Repeated Models (SKFRM)
    verbose_eval is the same as in xgb.train
    '''
    cv_results = {}
    clfs = {}
    running_time = {}
    
    eval_metric = 'mlogloss'
    
    # Select features
    if n_features is not None:
        fi = pd.read_csv('feature_importance.csv', )
        fi = fi.iloc[:min(n_features, fi.shape[0]), 0].values.tolist()
        fi = list(set(fi) | set(target_cols) | set(cat_cols))
        x_train = x_train[fi]
    
    np.random.seed(random_state)
    skf = StratifiedKFold(n_splits=n_splits, random_state=np.random.randint(10**6), shuffle=True)
    
    for m in range(n_repeats):
        for n, (train_index, val_index) in enumerate(skf.split(x_train, y_train)):
            
            start_time = time.time()
            
            # Construct DMatrix
            x_train_tmp = x_train.iloc[train_index]
            y_train_tmp = y_train.iloc[train_index]
            w_train_tmp = w_train.iloc[train_index]
            x_val_tmp = x_train.iloc[val_index]
            y_val_tmp = y_train.iloc[val_index]
            w_val_tmp = w_train.iloc[val_index]
            
            dtrain = xgb.DMatrix(x_train_tmp, label=y_train_tmp, weight=w_train_tmp)
            dval = xgb.DMatrix(x_train_tmp, label=y_train_tmp, weight=w_train_tmp)
            
            # Ground truth parameters for evaluation of MAP@7
#             gt_train = prep_map(x_train_tmp, y_train_tmp)
#             gt_val = prep_map(x_val_tmp, y_val_tmp)
#             ground_truth = {'train': gt_train, 'val': gt_val}
#             data_len = {'train': len(dtrain.get_label()), 'val': len(dval.get_label())}
                        
            # Placeholder for evals_result
            cv_results[m, n] = {}
            params['seed'] = np.random.randint(10**6)
            clfs[m, n] = xgb.train(params, dtrain, 
                num_boost_round=num_boost_round,
                evals=[(dtrain, 'train'), (dval, 'val')],
                evals_result=cv_results[m, n], 
                verbose_eval=verbose_eval)
        
            running_time[m, n] = time.time() - start_time
            
            print('Repeat {}, split {}, validate score = {:.3f}, running time = {:.3f} min'.format(m, n, 
                cv_results[m, n]['val'][eval_metric][-1], running_time[m, n]/60))
        
    # Post-process cv_results
    cv_results_final = {}
    for m in range(n_repeats):
        for n in range(n_splits):
            cv_results_final['train', m, n] = cv_results[m, n]['train'][eval_metric]
            cv_results_final['val', m, n] = cv_results[m, n]['val'][eval_metric]
    
    df = pd.DataFrame.from_dict(cv_results_final)
    df.index.name = 'iteration'
    df.columns.names = ['dataset', 'repeat', 'split']

    print('Score mean = {:.3f}, std = {:.3f}'.format(df['val'].iloc[-1].mean(), df['val'].iloc[-1].std()))
    
    return df, clfs, running_time


In [25]:
param = {'objective': 'multi:softprob', 
         'eta': 0.05, 
         'max_depth': 8,
         'silent': 1, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 0}
df, clfs, running_time = cv_xgb_skfrm(param, x_train, y_train, w_train, n_features=350, num_boost_round=3, n_splits=2, 
    n_repeats=2, random_state=0, verbose_eval=True)

[0]	train-mlogloss:2.66062	val-mlogloss:2.66062
[1]	train-mlogloss:2.46403	val-mlogloss:2.46403
[2]	train-mlogloss:2.31184	val-mlogloss:2.31184
Repeat 0, split 0, validate score = 2.312, running time = 2.060 min
[0]	train-mlogloss:2.65935	val-mlogloss:2.65935
[1]	train-mlogloss:2.46327	val-mlogloss:2.46327
[2]	train-mlogloss:2.31	val-mlogloss:2.31
Repeat 0, split 1, validate score = 2.310, running time = 2.076 min
[0]	train-mlogloss:2.65929	val-mlogloss:2.65929
[1]	train-mlogloss:2.4643	val-mlogloss:2.4643
[2]	train-mlogloss:2.31139	val-mlogloss:2.31139
Repeat 1, split 0, validate score = 2.311, running time = 2.131 min
[0]	train-mlogloss:2.66049	val-mlogloss:2.66049
[1]	train-mlogloss:2.4641	val-mlogloss:2.4641
[2]	train-mlogloss:2.31161	val-mlogloss:2.31161
Repeat 1, split 1, validate score = 2.312, running time = 2.084 min
Score mean = 2.311, std = 0.001


In [14]:
# Train, validation, and prediction
simulation_name = 'p1'
param = {'objective': 'multi:softprob', 
         'eta': 0.05, 
         'max_depth': 8,
         'silent': 1, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 0}
num_rounds = 100
n_repeat = 5

# history, model_dict, y_pred, y_sub = \
#     train_test_month(param, num_rounds, '2015-06-28', '2016-05-28', 
#     'eda_4_32_{}.csv.gz'.format(simulation_name), n_repeat=n_repeat, random_seed=54, n_features=350)



# History and learning curve
history_train = history.xs(axis=1, level=[1, 2], key=['train', 'MAP@7']).copy()
history_train['mean'] = history_train.iloc[:, :n_repeat].mean(axis=1)
history_train['std'] = history_train.iloc[:, :n_repeat].std(axis=1)

history_val = history.xs(axis=1, level=[1, 2], key=['val', 'MAP@7']).copy()
history_val['mean'] = history_val.iloc[:, :n_repeat].mean(axis=1)
history_val['std'] = history_val.iloc[:, :n_repeat].std(axis=1)

plt.figure(figsize=(16, 9))
# plt.plot(history_train['mean'])
# plt.fill_between(history_train.index, history_train['mean']+history_train['std'], history_train['mean']-history_train['std'], alpha=0.3)

plt.plot(history_val['mean'])
plt.fill_between(history_val.index, history_val['mean']+history_val['std'], history_val['mean']-history_val['std'], alpha=0.3)

plt.grid()

# Feature importance
fi = pd.DataFrame({i: model_dict[i].get_score(importance_type='gain') for i in range(n_repeat)})
fi['mean'] = fi.iloc[:, :n_repeat].mean(axis=1)
fi['std'] = fi.iloc[:, :n_repeat].std(axis=1)
fi.sort_values(by=['mean'], inplace=True, ascending=False)
# fi.reset_index(inplace=True)
#fi.to_csv('feature_importance_{}.csv'.format(simulation_name))

plt.rcParams.update({'figure.figsize': '16, 240'})
plt.rcParams.update({'font.size': '22'})
fig, ax = plt.subplots()
ax.barh(fi.index, fi['mean'].values, log=True, xerr=fi['std'].values)
ax.grid()
ax.tick_params(labelbottom=True,labeltop=True)
ax.set_ylim(fi.shape[0], -0.5)

plt.figure(figsize=(16, 9))
plt.plot(fi.values[:, n_repeat])
plt.grid()
plt.yscale('log')

# Save data
save_pickle('parameter_tune_eda_4_32_{}.pickle'.format(simulation_name), (history, model_dict, y_pred, y_sub, fi))

KeyboardInterrupt: 