## Feature Engineering and CV Based on the 5th-Place Solution

```python
param = {'objective': 'multi:softprob', 
         'eta': 0.05, 
         'max_depth': 8,
         'silent': 1, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 0}
```
         
```python
n_repeats = 2
n_trees = 150
```

Train on the last three months

New in this notebook:
- Create training data for all months (2015-02-28 to 2016-04-28) and validate on 2016-05-28

In [1]:
from santander_helper import *

Load all months' data

In [2]:
# Build one (features, labels, weights) triple per month. The three lists are
# filled in lockstep, so position k in each list refers to the same month.
x_train_list, y_train_list, w_train_list = [], [], []
fixed_lag = 6
for i, m in tqdm.tqdm_notebook(enumerate(month_list), total=len(month_list)):
    # '2015-01-28' and '2016-06-28' get no training set; the latter is the
    # month handed to create_test later in the notebook.
    if m not in ('2015-01-28', '2016-06-28'):
        # max_lag is the month's position in month_list.
        x_tmp, y_tmp, w_tmp = create_train(m, max_lag=i, fixed_lag=fixed_lag, pattern_flag=True)
        x_train_list.append(x_tmp)
        y_train_list.append(y_tmp)
        w_train_list.append(w_tmp)
        # Progress line: running count of collected months, then the month.
        print(len(x_train_list), m)
# Drop the loop temporaries (the lists keep their own references) and
# trigger a garbage-collection pass.
del x_tmp, y_tmp, w_tmp
gc.collect()

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))

1 2015-02-28
2 2015-03-28
3 2015-04-28
4 2015-05-28
5 2015-06-28
6 2015-07-28
7 2015-08-28
8 2015-09-28
9 2015-10-28
10 2015-11-28
11 2015-12-28
12 2016-01-28
13 2016-02-28
14 2016-03-28
15 2016-04-28
16 2016-05-28



594

Prepare the train and validation sets: validation is the last loaded month (2016-05-28); training is `x_train_list[10]` (2015-12-28)

In [3]:
# Prepare for train and validation
x_val = x_train_list[-1]
y_val = y_train_list[-1]
w_val = w_train_list[-1]

x_train = x_train_list[10]
y_train = y_train_list[10]
w_train = w_train_list[10]

gc.collect()

0

In [4]:
# Booster settings handed to cv_all_month (multi-class soft-probability
# objective, evaluated with 'merror').
param = {
    'objective': 'multi:softprob',
    'eta': 0.1,
    'max_depth': 8,
    'silent': 1,
    'num_class': len(target_cols),
    'eval_metric': 'merror',
    'min_child_weight': 10,
    'lambda': 5,
    'max_delta_step': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 0,
}

# Run controls.
n_rows = None  # row cap for quick experiments; keep None for real runs
n_repeats = 3
n_trees = 100

# One row window shared by both splits; slice(None, None) selects every row.
row_window = slice(None, n_rows)
train = dict(x=x_train.iloc[row_window, :], y=y_train.iloc[row_window], w=w_train.iloc[row_window])
val = dict(x=x_val.iloc[row_window, :], y=y_val.iloc[row_window], w=w_val.iloc[row_window])

# With a validation set supplied, the helper's result is unpacked into three
# values here.
df, clfs, running_time = cv_all_month(
    param, train, val,
    n_features=350,
    num_boost_round=n_trees,
    n_repeats=n_repeats,
    random_state=0,
    verbose_eval=True,
)

[0]	train-merror:0.312491	val-merror:0.339377	train-MAP@7:0	val-MAP@7:0.851934
[1]	train-merror:0.305138	val-merror:0.324558	train-MAP@7:0	val-MAP@7:0.861536
[2]	train-merror:0.303903	val-merror:0.318562	train-MAP@7:0	val-MAP@7:0.86582
[3]	train-merror:0.301684	val-merror:0.316656	train-MAP@7:0	val-MAP@7:0.867803
[4]	train-merror:0.299247	val-merror:0.315222	train-MAP@7:0	val-MAP@7:0.869263
[5]	train-merror:0.297541	val-merror:0.314349	train-MAP@7:0	val-MAP@7:0.870287
[6]	train-merror:0.295548	val-merror:0.313056	train-MAP@7:0	val-MAP@7:0.871173
[7]	train-merror:0.294321	val-merror:0.312821	train-MAP@7:0	val-MAP@7:0.871087
[8]	train-merror:0.293149	val-merror:0.313054	train-MAP@7:0	val-MAP@7:0.871021
[9]	train-merror:0.291666	val-merror:0.311771	train-MAP@7:0	val-MAP@7:0.871666
[10]	train-merror:0.29114	val-merror:0.311513	train-MAP@7:0	val-MAP@7:0.871855
[11]	train-merror:0.290505	val-merror:0.311136	train-MAP@7:0	val-MAP@7:0.872267
[12]	train-merror:0.289768	val-merror:0.310737	train

[3]	train-merror:0.299699	val-merror:0.320436	train-MAP@7:0	val-MAP@7:0.865797
[4]	train-merror:0.29865	val-merror:0.323856	train-MAP@7:0	val-MAP@7:0.864453
[5]	train-merror:0.297402	val-merror:0.320517	train-MAP@7:0	val-MAP@7:0.866402
[6]	train-merror:0.296113	val-merror:0.32266	train-MAP@7:0	val-MAP@7:0.865445
[7]	train-merror:0.294827	val-merror:0.320419	train-MAP@7:0	val-MAP@7:0.867153
[8]	train-merror:0.293762	val-merror:0.320372	train-MAP@7:0	val-MAP@7:0.867229
[9]	train-merror:0.292613	val-merror:0.322283	train-MAP@7:0	val-MAP@7:0.86616
[10]	train-merror:0.291578	val-merror:0.321804	train-MAP@7:0	val-MAP@7:0.866615
[11]	train-merror:0.290955	val-merror:0.322983	train-MAP@7:0	val-MAP@7:0.86588
[12]	train-merror:0.290007	val-merror:0.323795	train-MAP@7:0	val-MAP@7:0.86545
[13]	train-merror:0.289183	val-merror:0.322163	train-MAP@7:0	val-MAP@7:0.866424
[14]	train-merror:0.287877	val-merror:0.324292	train-MAP@7:0	val-MAP@7:0.865413
[15]	train-merror:0.286743	val-merror:0.323299	train

[6]	train-merror:0.2948	val-merror:0.31266	train-MAP@7:0	val-MAP@7:0.870854
[7]	train-merror:0.293287	val-merror:0.311512	train-MAP@7:0	val-MAP@7:0.871914
[8]	train-merror:0.292662	val-merror:0.311374	train-MAP@7:0	val-MAP@7:0.872061
[9]	train-merror:0.291555	val-merror:0.312449	train-MAP@7:0	val-MAP@7:0.871696
[10]	train-merror:0.290807	val-merror:0.315023	train-MAP@7:0	val-MAP@7:0.870127
[11]	train-merror:0.29021	val-merror:0.314637	train-MAP@7:0	val-MAP@7:0.870611
[12]	train-merror:0.289512	val-merror:0.314732	train-MAP@7:0	val-MAP@7:0.870731
[13]	train-merror:0.28868	val-merror:0.315265	train-MAP@7:0	val-MAP@7:0.87039
[14]	train-merror:0.288028	val-merror:0.314462	train-MAP@7:0	val-MAP@7:0.871094
[15]	train-merror:0.286902	val-merror:0.315432	train-MAP@7:0	val-MAP@7:0.870646
[16]	train-merror:0.28557	val-merror:0.316562	train-MAP@7:0	val-MAP@7:0.869974
[17]	train-merror:0.284458	val-merror:0.31943	train-MAP@7:0	val-MAP@7:0.868349
[18]	train-merror:0.283238	val-merror:0.316427	train

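Train on 1603-1605 (note: the cell below fits on `x_train`, which cell 3 sets to `x_train_list[10]`, i.e. 2015-12-28, so the heading and the code disagree)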

In [5]:
# Final fit: reuses `param` from the tuning cell, with a single repeat, fewer
# boosting rounds and a different random_state.
n_rows = None  # row cap for quick experiments; keep None for real runs
n_repeats = 1
n_trees = 70

row_window = slice(None, n_rows)
train = dict(x=x_train.iloc[row_window, :], y=y_train.iloc[row_window], w=w_train.iloc[row_window])

# No validation set is passed in this run, and the result is unpacked into two
# values (three in the earlier call that does pass one).
clfs, running_time = cv_all_month(
    param, train,
    n_features=350,
    num_boost_round=n_trees,
    n_repeats=n_repeats,
    random_state=47,
    verbose_eval=True,
)

[0]	train-merror:0.312748	train-MAP@7:0
[1]	train-merror:0.304609	train-MAP@7:0
[2]	train-merror:0.302523	train-MAP@7:0
[3]	train-merror:0.300725	train-MAP@7:0
[4]	train-merror:0.298701	train-MAP@7:0
[5]	train-merror:0.29654	train-MAP@7:0
[6]	train-merror:0.295884	train-MAP@7:0
[7]	train-merror:0.29399	train-MAP@7:0
[8]	train-merror:0.292819	train-MAP@7:0
[9]	train-merror:0.291364	train-MAP@7:0
[10]	train-merror:0.290138	train-MAP@7:0
[11]	train-merror:0.288248	train-MAP@7:0
[12]	train-merror:0.287471	train-MAP@7:0
[13]	train-merror:0.287412	train-MAP@7:0
[14]	train-merror:0.286179	train-MAP@7:0
[15]	train-merror:0.285771	train-MAP@7:0
[16]	train-merror:0.285542	train-MAP@7:0
[17]	train-merror:0.284604	train-MAP@7:0
[18]	train-merror:0.284098	train-MAP@7:0
[19]	train-merror:0.28329	train-MAP@7:0
[20]	train-merror:0.282729	train-MAP@7:0
[21]	train-merror:0.2819	train-MAP@7:0
[22]	train-merror:0.281081	train-MAP@7:0
[23]	train-merror:0.280782	train-MAP@7:0
[24]	train-merror:0.280315	trai

Predict on 1606

In [6]:
# Build test features for 2016-06-28. `fixed_lag` reuses the variable set in
# the data-loading cell (6), so train and test features are always generated
# with the same lag setting instead of a second hard-coded copy of it.
# NOTE(review): max_lag=17 presumably equals the position of '2016-06-28' in
# month_list (training uses max_lag=i, the month's index) — confirm.
x_test = create_test(month='2016-06-28', max_lag=17, fixed_lag=fixed_lag, pattern_flag=True)

In [7]:
# Tag used in every artefact name written by this run.
simulation_name = 'p2'
submission_file = 'eda_4_35_{}.csv.gz'.format(simulation_name)
results_file = 'eda_4_35_results_{}.pickle'.format(simulation_name)

# Score the test features with the fitted models; sub_name is the submission
# file name given to the helper.
y_pred, y_sub = predict_all_month(
    model_dict=clfs,
    x_test=x_test,
    sub_name=submission_file,
    n_features=350,
    n_trees=n_trees,
)

# Persist models, timing and predictions as one tuple; y_pred sits at index 2.
save_pickle(results_file, (clfs, running_time, y_pred, y_sub))

Load predictions based on 1506 and all months

In [8]:
# Pull the prediction arrays out of result tuples saved by earlier notebooks.
# NOTE(review): load_pickle presumably unpickles the file — only point it at
# trusted, locally produced artefacts.

# NOTE(review): the layout of this tuple is not visible here; element -4 is
# taken to be the prediction array — confirm.
y_pred2 = load_pickle('parameter_tune_eda_4_32_p4.pickle')[-4]

# If these files share the (clfs, running_time, y_pred, y_sub) layout written
# by this notebook's save_pickle call, index 2 is y_pred — TODO confirm.
y_pred3 = load_pickle('eda_4_33_results_p2.pickle')[2]

y_pred4 = load_pickle('eda_4_34_results_p1.pickle')[2]

Combine three models

In [9]:
# Stack the prediction arrays of this run and two earlier runs along axis 0.
# NOTE(review): y_pred4 is loaded in the previous cell but is not part of this
# blend, while the submission file name written at the end mentions an
# eda_4_34 model — confirm whether it should be included.
y_pred_final = np.concatenate([y_pred, y_pred2, y_pred3], axis=0)

In [10]:
# Average over axis 0 of the stacked predictions (the axis they were
# concatenated along). The name is rebound to the result, so running this cell
# a second time without re-running the previous one would average again over
# what is then axis 0.
y_pred_final = y_pred_final.mean(axis=0)

In [11]:
# Order the column indices of every row by blended score, highest first
# (ascending argsort with the columns reversed), and keep the first seven.
ranked_cols = np.argsort(y_pred_final, axis=1)[:, ::-1]
top7_cols = ranked_cols[:, :7]

# Prepare submission
test_id = x_test.loc[:, 'ncodpers'].values
# One space-separated string of target column names per test row.
added_products = [' '.join(target_cols[k] for k in row) for row in top7_cols]
y_sub = pd.DataFrame({'ncodpers': test_id, 'added_products': added_products})

submission_path = 'eda_4_35_{}+eda_4_34_p3+eda_4_33_p2+eda_4_32_p4.csv.gz'.format(simulation_name)
y_sub.to_csv(submission_path, compression='gzip', index=False)