## Feature Engineering and CV based Winners' Solutions

Continued from `eda_4_26`.

New in this notebook:
- Trained@2015-06-28, validated@2015-12-28
- `min_child_weight=10` performs much better than `min_child_weight=1`; train with more trees to find the optimal number of iterations

In [1]:
from santander_helper import *

In [2]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Tag embedded in every file name written by this cell.
simulation_name = 'p2'

# Booster parameters handed to train_test_month.
# `target_cols` is not defined in this cell; presumably it is provided by the
# star import from santander_helper.
# NOTE(review): the saved output of this cell ends with `KeyError: 'mlogloss'`
# while 'eval_metric' below is 'merror'. Nothing in this cell looks up
# 'mlogloss', so presumably train_test_month does -- confirm before rerunning.
param = {
    'objective': 'multi:softprob',
    'eta': 0.05,
    'max_depth': 8,
    'silent': 1,
    'num_class': len(target_cols),
    'eval_metric': 'merror',
    'min_child_weight': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 0,
}
num_rounds = 200
n_repeat = 5


def _append_mean_std(frame, n_runs):
    """Add row-wise 'mean' and 'std' columns over the first `n_runs` columns.

    The frame is modified in place and returned for convenience.
    """
    frame['mean'] = frame.iloc[:, :n_runs].mean(axis=1)
    frame['std'] = frame.iloc[:, :n_runs].std(axis=1)
    return frame


# ---------------------------------------------------------------------------
# Train, validate, and predict
# ---------------------------------------------------------------------------
# NOTE(review): the notebook intro mentions validation at 2015-12-28, but the
# second date passed here is '2016-05-28' -- confirm which one is intended.
history, model_dict, y_pred, y_sub = train_test_month(
    param, num_rounds,
    '2015-06-28', '2016-05-28',
    sub_name='eda_4_32_{}.csv.gz'.format(simulation_name),
    n_repeat=n_repeat, random_seed=54, n_features=350,
    eval_train_flag=False,
)

# ---------------------------------------------------------------------------
# History and learning curve
# ---------------------------------------------------------------------------
# Pull the MAP@7 columns for each data set out of the column MultiIndex and
# summarise them across repetitions.
history_train = _append_mean_std(
    history.xs(key=['train', 'MAP@7'], axis=1, level=[1, 2]).copy(),
    n_repeat,
)
history_val = _append_mean_std(
    history.xs(key=['val', 'MAP@7'], axis=1, level=[1, 2]).copy(),
    n_repeat,
)

# Only the validation curve is drawn (mean line with a +/- one std band); the
# saved log for this run reports train-MAP@7 as 0, so the train curve is omitted.
plt.figure(figsize=(16, 9))
plt.plot(history_val['mean'])
plt.fill_between(
    history_val.index,
    history_val['mean'] + history_val['std'],
    history_val['mean'] - history_val['std'],
    alpha=0.3,
)
plt.grid()

# ---------------------------------------------------------------------------
# Feature importance
# ---------------------------------------------------------------------------
# One column of gain scores per repetition, then mean/std across repetitions,
# ordered from the largest mean gain to the smallest.
fi = pd.DataFrame({
    rep: model_dict[rep].get_score(importance_type='gain')
    for rep in range(n_repeat)
})
fi = _append_mean_std(fi, n_repeat)
fi = fi.sort_values(by='mean', ascending=False)
# (Resetting the index and exporting the table to CSV are left disabled.)

# These rcParams changes are global and stay in effect for later figures.
plt.rcParams.update({'figure.figsize': '16, 240', 'font.size': '22'})
fig, ax = plt.subplots()
ax.barh(fi.index, fi['mean'].values, xerr=fi['std'].values, log=True)
ax.grid()
ax.tick_params(labelbottom=True, labeltop=True)
# Passing the larger limit first flips the y axis, so the first row of the
# sorted table ends up at the top of the chart.
ax.set_ylim(fi.shape[0], -0.5)

# Mean gain by rank on a log scale.
plt.figure(figsize=(16, 9))
plt.plot(fi['mean'].values)
plt.grid()
plt.yscale('log')

# ---------------------------------------------------------------------------
# Save data
# ---------------------------------------------------------------------------
save_pickle(
    'parameter_tune_eda_4_32_{}.pickle'.format(simulation_name),
    (history, model_dict, y_pred, y_sub, fi, param),
)

Train, validate, and predict, repetition 0 of 5
[0]	train-merror:0.293453	val-merror:0.344831	train-MAP@7:0	val-MAP@7:0.830413
[1]	train-merror:0.288389	val-merror:0.341441	train-MAP@7:0	val-MAP@7:0.832433
[2]	train-merror:0.287414	val-merror:0.339641	train-MAP@7:0	val-MAP@7:0.834673
[3]	train-merror:0.286695	val-merror:0.335162	train-MAP@7:0	val-MAP@7:0.840689
[4]	train-merror:0.285279	val-merror:0.333399	train-MAP@7:0	val-MAP@7:0.843016
[5]	train-merror:0.283951	val-merror:0.331303	train-MAP@7:0	val-MAP@7:0.845455
[6]	train-merror:0.283761	val-merror:0.330813	train-MAP@7:0	val-MAP@7:0.846932
[7]	train-merror:0.282979	val-merror:0.329547	train-MAP@7:0	val-MAP@7:0.848581
[8]	train-merror:0.281821	val-merror:0.329157	train-MAP@7:0	val-MAP@7:0.849012
[9]	train-merror:0.281864	val-merror:0.328673	train-MAP@7:0	val-MAP@7:0.849967
[10]	train-merror:0.281335	val-merror:0.328147	train-MAP@7:0	val-MAP@7:0.850769
[11]	train-merror:0.280936	val-merror:0.328137	train-MAP@7:0	val-MAP@7:0.851379
[1

[103]	train-merror:0.260079	val-merror:0.322253	train-MAP@7:0	val-MAP@7:0.86023
[104]	train-merror:0.259891	val-merror:0.322133	train-MAP@7:0	val-MAP@7:0.860355
[105]	train-merror:0.259787	val-merror:0.322228	train-MAP@7:0	val-MAP@7:0.86035
[106]	train-merror:0.259602	val-merror:0.322307	train-MAP@7:0	val-MAP@7:0.860314
[107]	train-merror:0.259336	val-merror:0.322351	train-MAP@7:0	val-MAP@7:0.860347
[108]	train-merror:0.259336	val-merror:0.322389	train-MAP@7:0	val-MAP@7:0.860419
[109]	train-merror:0.259228	val-merror:0.322396	train-MAP@7:0	val-MAP@7:0.860408
[110]	train-merror:0.259151	val-merror:0.32218	train-MAP@7:0	val-MAP@7:0.860515
[111]	train-merror:0.259021	val-merror:0.322129	train-MAP@7:0	val-MAP@7:0.860547
[112]	train-merror:0.258728	val-merror:0.322149	train-MAP@7:0	val-MAP@7:0.860536
[113]	train-merror:0.258483	val-merror:0.321968	train-MAP@7:0	val-MAP@7:0.860636
[114]	train-merror:0.258339	val-merror:0.321932	train-MAP@7:0	val-MAP@7:0.860636
[115]	train-merror:0.258154	val

KeyError: 'mlogloss'

In [None]:
# Display the validation MAP@7 table built in the previous cell: the columns
# selected from `history` plus the added 'mean' and 'std' columns.
history_val