## Feature Engineering and CV based on 5th Place Solutions

`param = {'objective': 'multi:softprob', 
         'eta': 0.05, 
         'max_depth': 8,
         'silent': 1, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 0}`
         
`n_repeats=2
n_trees = 150`

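val-MAP@7:0.89935, private LB: 0.0266884, public LB: 0.0264044

Note: the training cell below uses different settings (`eta` 0.1, `eval_metric` `merror`, `min_child_weight` 10, `lambda` 5, `n_repeats` 1, `n_trees` 50), so the scores above presumably belong to a run with the parameters listed in this header rather than to the training log shown below.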

New in this notebook:
- Create training data for all months (2015-02-28 to 2016-04-28) and validate on 2016-05-28

In [1]:
from santander_helper import *

Create all months' data

In [2]:
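# NOTE: one-off generation pass, kept commented out for reference. It makes the same
# create_train call as the loading cell below and then discards the results, so it
# presumably exists only to populate create_train's cache (TODO confirm in santander_helper).
# fixed_lag = 6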
# for i, m in tqdm.tqdm_notebook(enumerate(month_list), total=len(month_list)):
#     print(m)
#     if m in ['2015-01-28', '2016-06-28']:
#         continue
#     x_train, y_train, w_train = create_train(m, max_lag=i, fixed_lag=fixed_lag, pattern_flag=True)
#     print('-'*60)
# del x_train, y_train, w_train
# gc.collect()

Load all months' data

In [2]:
# Build one (features, targets, weights) triple per month, hold out the last
# month that was built as the validation set, and stack the rest for training.
fixed_lag = 6
# '2016-06-28' is the month passed to create_test further down; '2015-01-28' is
# presumably skipped because no earlier month exists to lag against (TODO confirm).
skipped_months = ('2015-01-28', '2016-06-28')

x_parts, y_parts, w_parts = [], [], []
for lag, month in tqdm.tqdm_notebook(enumerate(month_list), total=len(month_list)):
    if month not in skipped_months:
        # The position of the month in month_list is passed as max_lag.
        x_month, y_month, w_month = create_train(month, max_lag=lag, fixed_lag=fixed_lag, pattern_flag=True)
        x_parts.append(x_month)
        y_parts.append(y_month)
        w_parts.append(w_month)
del x_month, y_month, w_month
gc.collect()

# Prepare for train and validation: the last month built is the hold-out.
x_val, y_val, w_val = x_parts[-1], y_parts[-1], w_parts[-1]

x_train = pd.concat(x_parts[:-1], axis=0, ignore_index=True, sort=False)
y_train = pd.concat(y_parts[:-1], axis=0, ignore_index=True, sort=False)
w_train = pd.concat(w_parts[:-1], axis=0, ignore_index=True, sort=False)

# Drop the per-month lists so only the stacked frames and the hold-out stay referenced.
del x_parts, y_parts, w_parts
gc.collect()

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))




203

In [3]:
# Run-size knobs.
n_rows = None  # row cap applied to the train and val slices below, for quick tests; keep None for a real run
n_repeats = 1
n_trees = 50

# Booster parameters handed to cv_all_month.
param = {
    'objective': 'multi:softprob',
    'eta': 0.1,
    'max_depth': 8,
    'silent': 1,
    'num_class': len(target_cols),
    'eval_metric': 'merror',
    'min_child_weight': 10,
    'lambda': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 0,
}

# Bundle features / targets / weights under the keys 'x', 'y', 'w',
# keeping at most n_rows leading rows of each (all rows when n_rows is None).
bundle_keys = ('x', 'y', 'w')
train = {key: frame.iloc[:n_rows] for key, frame in zip(bundle_keys, (x_train, y_train, w_train))}
val = {key: frame.iloc[:n_rows] for key, frame in zip(bundle_keys, (x_val, y_val, w_val))}

df, clfs, running_time = cv_all_month(
    param, train, val,
    n_features=350, num_boost_round=n_trees, n_repeats=n_repeats,
    random_state=0, verbose_eval=True)

[0]	train-merror:0.264369	val-merror:0.280592	train-MAP@7:0	val-MAP@7:0.89426
[1]	train-merror:0.261121	val-merror:0.276471	train-MAP@7:0	val-MAP@7:0.898705
[2]	train-merror:0.259361	val-merror:0.273682	train-MAP@7:0	val-MAP@7:0.901197
[3]	train-merror:0.25834	val-merror:0.272487	train-MAP@7:0	val-MAP@7:0.902048
[4]	train-merror:0.25738	val-merror:0.271646	train-MAP@7:0	val-MAP@7:0.902625
[5]	train-merror:0.256701	val-merror:0.271411	train-MAP@7:0	val-MAP@7:0.903462
[6]	train-merror:0.256367	val-merror:0.271032	train-MAP@7:0	val-MAP@7:0.903767
[7]	train-merror:0.255809	val-merror:0.270889	train-MAP@7:0	val-MAP@7:0.904071
[8]	train-merror:0.255355	val-merror:0.270498	train-MAP@7:0	val-MAP@7:0.904665
[9]	train-merror:0.254994	val-merror:0.270451	train-MAP@7:0	val-MAP@7:0.904784
[10]	train-merror:0.254642	val-merror:0.270532	train-MAP@7:0	val-MAP@7:0.904831
[11]	train-merror:0.254294	val-merror:0.270396	train-MAP@7:0	val-MAP@7:0.904902
[12]	train-merror:0.254006	val-merror:0.270291	train-

In [5]:
# Build the features for the month to be predicted. fixed_lag=6 repeats the value
# used when building the training months; max_lag=17 is presumably the position of
# '2016-06-28' in month_list, since training passes the enumerate index as max_lag
# and the recorded progress bar reports 18 months — TODO confirm.
x_test = create_test(month='2016-06-28', max_lag=17, fixed_lag=6, pattern_flag=True)

In [6]:
# Tag inserted into this run's submission file name.
simulation_name = 'p1'
# Score x_test with the models returned by cv_all_month; n_features and n_trees
# repeat the values given to cv_all_month. Presumably predict_all_month also writes
# the submission file named by sub_name — TODO confirm in santander_helper.
y_pred, y_sub = predict_all_month(model_dict=clfs, x_test=x_test, 
    sub_name='eda_4_33_{}.csv.gz'.format(simulation_name), n_features=350, n_trees=n_trees)

In [7]:
# Read a stored object through the load_pickle helper; judging by the file name it
# presumably holds predictions from an earlier parameter-tuning run (TODO confirm).
# NOTE(review): if load_pickle wraps pickle.load, only point it at files you produced
# yourself — unpickling can execute arbitrary code.
y_pred2 = load_pickle('parameter_tune_eda_4_32_p4.pickle')

In [15]:
# Keep only the fourth-from-last entry of the loaded object.
# NOTE(review): this rebinds y_pred2 to a piece of itself, so running this cell a
# second time without re-running the load cell indexes into the already-selected entry.
y_pred2 = y_pred2[-4]

In [16]:
# Shape check before blending; the recorded output is (5, 929615, 19).
y_pred2.shape

(5, 929615, 19)

In [17]:
# Shape check before blending; the recorded output is (1, 929615, 19).
y_pred.shape

(1, 929615, 19)

In [18]:
# Join both prediction stacks end to end along the leading axis.
y_pred_final = np.concatenate([y_pred, y_pred2], axis=0)

In [20]:
# Average over the leading axis; every slice along that axis gets equal weight.
# NOTE(review): y_pred_final is rebound to its own mean, so re-running this cell
# alone would average a second time over what is by then the row axis.
y_pred_final = y_pred_final.mean(axis=0)

In [21]:
# Turn the averaged predictions into a submission file.

# Column indices per row ordered by descending score (ascending argsort, flipped
# left-to-right), truncated to the 7 best.
top7_idx = np.fliplr(np.argsort(y_pred_final, axis=1))[:, :7]

# Prepare submission
test_id = x_test.loc[:, 'ncodpers'].values
# One space-separated string of target column names per row, best first.
added_products = [' '.join([target_cols[k] for k in pred]) for pred in top7_idx]
# columns= pins the column order regardless of how the dict is iterated.
y_sub = pd.DataFrame({'ncodpers': test_id, 'added_products': added_products},
                     columns=['ncodpers', 'added_products'])

# sub_name is not assigned anywhere in the visible notebook (it only appears as a
# keyword argument to predict_all_month), so the original to_csv call would raise
# NameError. Define it here; the '_blend' suffix keeps this file from overwriting
# the one named in the predict_all_month call.
sub_name = 'eda_4_33_{}_blend.csv.gz'.format(simulation_name)
y_sub.to_csv(sub_name, compression='gzip', index=False)

(929615, 19)