## Feature Engineering and CV Based on the 5th-Place Solution

`param = {'objective': 'multi:softprob', 
         'eta': 0.05, 
         'max_depth': 8,
         'silent': 1, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 0}`
         
`n_repeats=2
n_trees = 150`

Train on the last three months

New in this notebook:
- Create training data for all months (2015-02-28 to 2016-04-28) and validate on 2016-05-28

In [1]:
from santander_helper import *

Load all months' data

In [2]:
# Build one (features, labels, weights) triple per month and keep the pieces
# in three parallel lists, in the same order as month_list.
fixed_lag = 6
# Months for which no training set is created. '2016-06-28' is the month the
# test set is built for further down; '2015-01-28' is presumably skipped
# because it has no earlier month to lag against -- TODO confirm.
excluded_months = ('2015-01-28', '2016-06-28')
x_train_list, y_train_list, w_train_list = [], [], []

for month_idx, month in tqdm.tqdm_notebook(enumerate(month_list), total=len(month_list)):
    if month not in excluded_months:
        # max_lag is the month's position in month_list.
        x_part, y_part, w_part = create_train(
            month, max_lag=month_idx, fixed_lag=fixed_lag, pattern_flag=True)
        x_train_list.append(x_part)
        y_train_list.append(y_part)
        w_train_list.append(w_part)

# Remove the loop temporaries from the notebook namespace.
del x_part, y_part, w_part
gc.collect()

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))




528

Use the last month (2016-05-28) for validation and the three months before it for training

In [3]:
# Hold out the most recent month as the validation set and stack the three
# months immediately before it into a single training set.
monthly_parts = (x_train_list, y_train_list, w_train_list)

x_val, y_val, w_val = (parts[-1] for parts in monthly_parts)

x_train, y_train, w_train = (
    pd.concat(parts[-4:-1], axis=0, ignore_index=True, sort=False)
    for parts in monthly_parts
)

gc.collect()

0

In [4]:
# Booster parameters passed to cv_all_month for the validation run.
param = {
    'objective': 'multi:softprob',
    'eta': 0.1,
    'max_depth': 8,
    'silent': 1,
    'num_class': len(target_cols),  # one class per target column
    'eval_metric': 'merror',
    'min_child_weight': 10,
    'lambda': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 0,
}

# Row cap for quick smoke tests; None keeps every row and is the setting to
# use for real runs.
n_rows = None
n_repeats = 3
n_trees = 100

# Package features / labels / weights under the 'x' / 'y' / 'w' keys that
# cv_all_month is called with, truncated to the first n_rows rows.
train = {key: part.iloc[:n_rows]
         for key, part in zip(('x', 'y', 'w'), (x_train, y_train, w_train))}
val = {key: part.iloc[:n_rows]
       for key, part in zip(('x', 'y', 'w'), (x_val, y_val, w_val))}

df, clfs, running_time = cv_all_month(
    param, train, val,
    n_features=350,
    num_boost_round=n_trees,
    n_repeats=n_repeats,
    random_state=0,
    verbose_eval=True,
)

[0]	train-merror:0.285485	val-merror:0.283159	train-MAP@7:0	val-MAP@7:0.893973
[1]	train-merror:0.281807	val-merror:0.279465	train-MAP@7:0	val-MAP@7:0.897743
[2]	train-merror:0.279786	val-merror:0.275982	train-MAP@7:0	val-MAP@7:0.900998
[3]	train-merror:0.278102	val-merror:0.274748	train-MAP@7:0	val-MAP@7:0.902293
[4]	train-merror:0.276841	val-merror:0.273686	train-MAP@7:0	val-MAP@7:0.903141
[5]	train-merror:0.276152	val-merror:0.273113	train-MAP@7:0	val-MAP@7:0.903613
[6]	train-merror:0.275092	val-merror:0.272508	train-MAP@7:0	val-MAP@7:0.90405
[7]	train-merror:0.274167	val-merror:0.271913	train-MAP@7:0	val-MAP@7:0.90456
[8]	train-merror:0.273757	val-merror:0.2711	train-MAP@7:0	val-MAP@7:0.905123
[9]	train-merror:0.273272	val-merror:0.270642	train-MAP@7:0	val-MAP@7:0.905572
[10]	train-merror:0.272696	val-merror:0.270626	train-MAP@7:0	val-MAP@7:0.905756
[11]	train-merror:0.272252	val-merror:0.270759	train-MAP@7:0	val-MAP@7:0.905786
[12]	train-merror:0.271783	val-merror:0.270383	train-M

[3]	train-merror:0.27787	val-merror:0.275673	train-MAP@7:0	val-MAP@7:0.901483
[4]	train-merror:0.27665	val-merror:0.274959	train-MAP@7:0	val-MAP@7:0.902071
[5]	train-merror:0.275521	val-merror:0.273831	train-MAP@7:0	val-MAP@7:0.902784
[6]	train-merror:0.274922	val-merror:0.273092	train-MAP@7:0	val-MAP@7:0.903489
[7]	train-merror:0.274181	val-merror:0.272784	train-MAP@7:0	val-MAP@7:0.903837
[8]	train-merror:0.273649	val-merror:0.272304	train-MAP@7:0	val-MAP@7:0.904237
[9]	train-merror:0.27309	val-merror:0.271359	train-MAP@7:0	val-MAP@7:0.904928
[10]	train-merror:0.272444	val-merror:0.271606	train-MAP@7:0	val-MAP@7:0.904902
[11]	train-merror:0.271952	val-merror:0.271264	train-MAP@7:0	val-MAP@7:0.905186
[12]	train-merror:0.271075	val-merror:0.271319	train-MAP@7:0	val-MAP@7:0.905401
[13]	train-merror:0.270516	val-merror:0.271276	train-MAP@7:0	val-MAP@7:0.905492
[14]	train-merror:0.270192	val-merror:0.271263	train-MAP@7:0	val-MAP@7:0.905532
[15]	train-merror:0.269642	val-merror:0.271104	tra

[6]	train-merror:0.275573	val-merror:0.274509	train-MAP@7:0	val-MAP@7:0.902925
[7]	train-merror:0.274529	val-merror:0.274107	train-MAP@7:0	val-MAP@7:0.903421
[8]	train-merror:0.273943	val-merror:0.273429	train-MAP@7:0	val-MAP@7:0.903893
[9]	train-merror:0.272835	val-merror:0.272696	train-MAP@7:0	val-MAP@7:0.904387
[10]	train-merror:0.272133	val-merror:0.272025	train-MAP@7:0	val-MAP@7:0.904896
[11]	train-merror:0.271476	val-merror:0.272158	train-MAP@7:0	val-MAP@7:0.904864
[12]	train-merror:0.271156	val-merror:0.271402	train-MAP@7:0	val-MAP@7:0.905429
[13]	train-merror:0.270651	val-merror:0.271533	train-MAP@7:0	val-MAP@7:0.90548
[14]	train-merror:0.269933	val-merror:0.270914	train-MAP@7:0	val-MAP@7:0.905872
[15]	train-merror:0.269429	val-merror:0.270652	train-MAP@7:0	val-MAP@7:0.906131
[16]	train-merror:0.269222	val-merror:0.270715	train-MAP@7:0	val-MAP@7:0.906211
[17]	train-merror:0.268937	val-merror:0.270217	train-MAP@7:0	val-MAP@7:0.90651
[18]	train-merror:0.268531	val-merror:0.269839

Retrain on 2016-03 through 2016-05 (the last three months, now including the former validation month)

In [5]:
# Rebuild the training set from the three most recent months; the month that
# served as validation above is now part of the training data.
x_train, y_train, w_train = (
    pd.concat(parts[-3:], axis=0, ignore_index=True, sort=False)
    for parts in (x_train_list, y_train_list, w_train_list)
)

In [6]:
# Final fit on the retraining set. No validation set is passed here, and the
# call is unpacked into two values instead of the three returned when a
# validation set was supplied.
n_rows = None  # row cap for quick smoke tests; keep None for real runs
n_repeats = 1
n_trees = 70

train = {
    'x': x_train.iloc[:n_rows, :],
    'y': y_train.iloc[:n_rows],
    'w': w_train.iloc[:n_rows],
}

clfs, running_time = cv_all_month(
    param, train,
    n_features=350,
    num_boost_round=n_trees,
    n_repeats=n_repeats,
    random_state=47,
    verbose_eval=True,
)

[0]	train-merror:0.271463	train-MAP@7:0
[1]	train-merror:0.266817	train-MAP@7:0
[2]	train-merror:0.26531	train-MAP@7:0
[3]	train-merror:0.263486	train-MAP@7:0
[4]	train-merror:0.262397	train-MAP@7:0
[5]	train-merror:0.261808	train-MAP@7:0
[6]	train-merror:0.261232	train-MAP@7:0
[7]	train-merror:0.26048	train-MAP@7:0
[8]	train-merror:0.259835	train-MAP@7:0
[9]	train-merror:0.25937	train-MAP@7:0
[10]	train-merror:0.258757	train-MAP@7:0
[11]	train-merror:0.25787	train-MAP@7:0
[12]	train-merror:0.256897	train-MAP@7:0
[13]	train-merror:0.256606	train-MAP@7:0
[14]	train-merror:0.255729	train-MAP@7:0
[15]	train-merror:0.2553	train-MAP@7:0
[16]	train-merror:0.25512	train-MAP@7:0
[17]	train-merror:0.254259	train-MAP@7:0
[18]	train-merror:0.253837	train-MAP@7:0
[19]	train-merror:0.253283	train-MAP@7:0
[20]	train-merror:0.25244	train-MAP@7:0
[21]	train-merror:0.252185	train-MAP@7:0
[22]	train-merror:0.251678	train-MAP@7:0
[23]	train-merror:0.25132	train-MAP@7:0
[24]	train-merror:0.250985	train-MA

Predict on the test month (2016-06-28)

In [7]:
# Build the test features for the prediction month. Reuse the notebook-level
# `fixed_lag` that the training sets were created with, rather than repeating
# the literal, so train and test features cannot drift apart if it is changed.
# max_lag=17 follows the training convention (max_lag = position of the month
# in month_list) assuming '2016-06-28' is the 18th and last entry -- TODO confirm.
x_test = create_test(month='2016-06-28', max_lag=17, fixed_lag=fixed_lag, pattern_flag=True)

In [8]:
# Score the test set with the fitted models, write the submission file, and
# pickle everything needed to reuse this run in a later blend.
simulation_name = 'p1'
submission_file = 'eda_4_34_{}.csv.gz'.format(simulation_name)
results_file = 'eda_4_34_results_{}.pickle'.format(simulation_name)

y_pred, y_sub = predict_all_month(
    model_dict=clfs,
    x_test=x_test,
    sub_name=submission_file,
    n_features=350,
    n_trees=n_trees,
)
save_pickle(results_file, (clfs, running_time, y_pred, y_sub))

In [9]:
# Pull the stored test predictions of two earlier experiments for blending.
# NOTE(review): the positions below depend on how each pickle was written.
# Index -4 of the eda_4_32 tuple is taken as its prediction array -- TODO
# confirm against the notebook that saved it.
y_pred2 = load_pickle('parameter_tune_eda_4_32_p4.pickle')[-4]

# If eda_4_33 was saved in the same (clfs, running_time, y_pred, y_sub) layout
# that this notebook uses for its own results, index 2 is y_pred -- TODO confirm.
y_pred3 = load_pickle('eda_4_33_results_p2.pickle')[2]

In [10]:
# Join the three prediction arrays along their first axis (the axis that is
# averaged over in the next step).
y_pred_final = np.concatenate([y_pred, y_pred2, y_pred3], axis=0)

In [11]:
# Average over the first axis of the concatenated predictions.
# WARNING: this overwrites its own input, so the cell is not idempotent --
# re-run the concatenate cell before running this one a second time.
y_pred_final = y_pred_final.mean(axis=0)

In [13]:
# Order the columns of each row by averaged score, highest first, and keep
# the first seven.
ranked_idx = np.argsort(y_pred_final, axis=1)[:, ::-1]
top7_idx = ranked_idx[:, :7]

# Prepare submission: one row per test id with the space-separated names of
# its seven top-ranked target columns.
test_id = x_test.loc[:, 'ncodpers'].values
added_products = [' '.join(target_cols[k] for k in row) for row in top7_idx]
y_sub = pd.DataFrame({'ncodpers': test_id, 'added_products': added_products})
y_sub.to_csv('eda_4_34_p1+eda_4_33_p2+eda_4_32_p4.csv.gz', compression='gzip', index=False)