## Feature Engineering and CV based Winners' Solutions

Continued from `eda_4_26`.

New in this notebook:
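- Trained on 2015-06-28, validated on 2015-12-28 (note: the training cell below passes `'2016-05-28'` as the second date to `train_test_month`; confirm which date is intended)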
- `min_child_weight=10` is much better than `min_child_weight=1`; try more trees to find the optimal number of iterations.

In [1]:
from santander_helper import *

In [None]:
# Train, validate, and predict.
# This cell runs `n_repeat` repetitions of `train_test_month`, then summarises the
# MAP@7 history and the per-model feature importances across repetitions, plots
# them, and pickles everything for later inspection.
simulation_name = 'p2'
param = {'objective': 'multi:softprob',
         'eta': 0.05,
         'max_depth': 8,
         'silent': 1,
         'num_class': len(target_cols),
         'eval_metric': 'merror',
         'min_child_weight': 10,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 0}
num_rounds = 150
n_repeat = 5

history, model_dict, y_pred, y_sub = \
    train_test_month(param, num_rounds,
    '2015-06-28', '2016-05-28',
    sub_name='eda_4_32_{}.csv.gz'.format(simulation_name),
    n_repeat=n_repeat, random_seed=54, n_features=350,
    eval_train_flag=False)

# History and learning curve
# NOTE(review): `history` is assumed to have 3-level columns where level 1 is the
# dataset ('train'/'val') and level 2 the metric name, leaving one column per
# repetition after the cross-section -- confirm against `train_test_month`.
# `key` must be a tuple: pandas deprecated list keys for `xs` and removed them in 2.0.
history_train = history.xs(axis=1, level=[1, 2], key=('train', 'MAP@7')).copy()
history_train['mean'] = history_train.iloc[:, :n_repeat].mean(axis=1)
history_train['std'] = history_train.iloc[:, :n_repeat].std(axis=1)

history_val = history.xs(axis=1, level=[1, 2], key=('val', 'MAP@7')).copy()
history_val['mean'] = history_val.iloc[:, :n_repeat].mean(axis=1)
history_val['std'] = history_val.iloc[:, :n_repeat].std(axis=1)

plt.figure(figsize=(16, 9))
# The training curve is left disabled: the training log for this run reports
# train-MAP@7 as 0, so there is nothing informative to draw.
# plt.plot(history_train['mean'])
# plt.fill_between(history_train.index, history_train['mean']+history_train['std'], history_train['mean']-history_train['std'], alpha=0.3)

# Mean validation MAP@7 with a +/- one standard deviation band across repetitions.
plt.plot(history_val['mean'])
plt.fill_between(history_val.index, history_val['mean']+history_val['std'], history_val['mean']-history_val['std'], alpha=0.3)
plt.title('Validation MAP@7 (mean +/- std over repetitions)')

plt.grid()

# Feature importance
# One column per repetition; a feature missing from a model's score dict shows up
# as NaN in that column, which the pandas mean/std reductions skip by default.
fi = pd.DataFrame({i: model_dict[i].get_score(importance_type='gain') for i in range(n_repeat)})
fi['mean'] = fi.iloc[:, :n_repeat].mean(axis=1)
fi['std'] = fi.iloc[:, :n_repeat].std(axis=1)
fi = fi.sort_values(by=['mean'], ascending=False)
# fi.reset_index(inplace=True)
#fi.to_csv('feature_importance_{}.csv'.format(simulation_name))

# These rcParams changes are global and persist for every later figure in the
# kernel; the very tall figure gives each feature its own readable bar.
plt.rcParams.update({'figure.figsize': '16, 240'})
plt.rcParams.update({'font.size': '22'})
fig, ax = plt.subplots()
ax.barh(fi.index, fi['mean'].values, log=True, xerr=fi['std'].values)
ax.grid()
ax.tick_params(labelbottom=True,labeltop=True)
# Reversed y-limits put the first (highest mean gain) row at the top.
ax.set_ylim(fi.shape[0], -0.5)
ax.set_xlabel('gain (mean over repetitions)')

# Sorted mean gain on a log scale, to see how quickly importance decays.
plt.figure(figsize=(16, 9))
plt.plot(fi['mean'].values)
plt.title('Mean feature gain, sorted in descending order')
plt.grid()
plt.yscale('log')

# Save data
save_pickle('parameter_tune_eda_4_32_{}.pickle'.format(simulation_name), (history, model_dict, y_pred, y_sub, fi, param))

Train, validate, and predict, repetition 0 of 5
[0]	train-merror:0.294974	val-merror:0.347556	train-MAP@7:0	val-MAP@7:0.827417
[1]	train-merror:0.290465	val-merror:0.341639	train-MAP@7:0	val-MAP@7:0.831906
[2]	train-merror:0.286089	val-merror:0.338903	train-MAP@7:0	val-MAP@7:0.83558
[3]	train-merror:0.285696	val-merror:0.335542	train-MAP@7:0	val-MAP@7:0.838663
[4]	train-merror:0.284739	val-merror:0.33582	train-MAP@7:0	val-MAP@7:0.838362
[5]	train-merror:0.284077	val-merror:0.333251	train-MAP@7:0	val-MAP@7:0.841912
[6]	train-merror:0.283449	val-merror:0.333039	train-MAP@7:0	val-MAP@7:0.843281
[7]	train-merror:0.283122	val-merror:0.332111	train-MAP@7:0	val-MAP@7:0.844928
[8]	train-merror:0.282772	val-merror:0.330297	train-MAP@7:0	val-MAP@7:0.847062
[9]	train-merror:0.281847	val-merror:0.33012	train-MAP@7:0	val-MAP@7:0.848437
[10]	train-merror:0.281516	val-merror:0.329214	train-MAP@7:0	val-MAP@7:0.849377
[11]	train-merror:0.281154	val-merror:0.329014	train-MAP@7:0	val-MAP@7:0.850162
[12]	

[103]	train-merror:0.259852	val-merror:0.322081	train-MAP@7:0	val-MAP@7:0.860357
[104]	train-merror:0.25968	val-merror:0.322005	train-MAP@7:0	val-MAP@7:0.860519
[105]	train-merror:0.259581	val-merror:0.322005	train-MAP@7:0	val-MAP@7:0.860511
[106]	train-merror:0.259409	val-merror:0.322075	train-MAP@7:0	val-MAP@7:0.860545
[107]	train-merror:0.259485	val-merror:0.32198	train-MAP@7:0	val-MAP@7:0.860629
[108]	train-merror:0.259266	val-merror:0.321993	train-MAP@7:0	val-MAP@7:0.860625
[109]	train-merror:0.259201	val-merror:0.322008	train-MAP@7:0	val-MAP@7:0.860635
[110]	train-merror:0.259003	val-merror:0.321824	train-MAP@7:0	val-MAP@7:0.860718
[111]	train-merror:0.258776	val-merror:0.321951	train-MAP@7:0	val-MAP@7:0.860631
[112]	train-merror:0.258717	val-merror:0.321789	train-MAP@7:0	val-MAP@7:0.860789
[113]	train-merror:0.258622	val-merror:0.322052	train-MAP@7:0	val-MAP@7:0.860667
[114]	train-merror:0.258351	val-merror:0.322049	train-MAP@7:0	val-MAP@7:0.860639
[115]	train-merror:0.258231	va

In [None]:
# Display the validation MAP@7 history built in the training cell: the cross-section
# columns from `history` plus the added 'mean' and 'std' columns.
history_val