# MAP

Goal: improve the speed of the MAP evaluation.

Timings for 10 runs:
- 17.7s: original
- 18.05s: remove `apks`
- 18.01s: remove `enumerate`
- 17.04s: remove `y_prob = {}`

Optimizations:
- Convert `dict` to `np.array`.
- `enumerate` is quite efficient, so keep it unless a better alternative turns up.
- The `for` loop limits the speed, but with so many customers each having very few products it is hard to optimize.
- `np.mean` and `np.array` seem more efficient than `a += 1` and `a /= n`, but this is not confirmed.
- Use `numpy` wherever possible.

In [1]:
from santander_helper import *
%matplotlib inline

In [2]:
# Months used to build the training and validation sets.
month_train = '2015-06-28'
month_val = '2016-05-28'

# XGBoost configuration: multi-class objective with per-class probabilities.
# NOTE(review): rate_drop / skip_drop / normalize_type / sample_type are
# documented as DART-booster options; with booster='gbtree' they presumably
# have no effect -- confirm whether booster='dart' was intended.
param = dict(
    objective='multi:softprob',
    eta=0.05,
    max_depth=8,
    silent=1,
    num_class=len(target_cols),  # one class per target column
    eval_metric='mlogloss',
    min_child_weight=1,
    subsample=0.7,
    colsample_bytree=0.7,
    seed=0,
    booster='gbtree',
    rate_drop=0.1,
    skip_drop=0.5,
    normalize_type='tree',
    sample_type='uniform',
)

# Number of boosting rounds used when training.
num_rounds = 10

In [3]:
# Build the inputs for each month; `create_train` returns the features, the
# labels and a weight table (one candidate weighting per column).
x_train, y_train, weight_train = create_train(month_train, pattern_flag=True)
x_val, y_val, weight_val = create_train(month_val, pattern_flag=True)

# Ground truth for the custom MAP metric, keyed by dataset name.
gt_train = prep_map(x_train, y_train)
gt_val = prep_map(x_val, y_val)
ground_truth = dict(train=gt_train, val=gt_val)

dtrain = xgb.DMatrix(x_train, y_train)
dval = xgb.DMatrix(x_val, y_val)

# Number of labelled rows per dataset, passed to the metric as `ts`.
data_len = dict(train=len(dtrain.get_label()), val=len(dval.get_label()))

# Column of the weight tables to use as per-row instance weights.
weight_index = 0
dtrain.set_weight(weight_train.values[:, weight_index])
dval.set_weight(weight_val.values[:, weight_index])

# Draw a fresh seed for this run; this overrides the seed stored in `param`.
param['seed'] = np.random.randint(10**6)

# Train while reporting the built-in metric and `eval_map` on both datasets.
# (Variant without the custom metric: xgb.train(param, dtrain, num_rounds).)
# NOTE(review): `gt` and `ts` are not arguments of the stock `xgb.train`;
# presumably a patched xgboost forwards them to `feval` -- confirm.
model = xgb.train(
    param,
    dtrain,
    num_rounds,
    feval=eval_map,
    verbose_eval=True,
    evals=[(dtrain, 'train'), (dval, 'val')],
    gt=ground_truth,
    ts=data_len,
)

[0]	train-mlogloss:2.6041	val-mlogloss:2.62764	train-MAP@7:0.964656	val-MAP@7:0.961384
[1]	train-mlogloss:2.37108	val-mlogloss:2.40365	train-MAP@7:0.968099	val-MAP@7:0.964917
[2]	train-mlogloss:2.19067	val-mlogloss:2.23665	train-MAP@7:0.968956	val-MAP@7:0.964644
[3]	train-mlogloss:2.04277	val-mlogloss:2.09845	train-MAP@7:0.969859	val-MAP@7:0.964905
[4]	train-mlogloss:1.9172	val-mlogloss:1.97772	train-MAP@7:0.970351	val-MAP@7:0.967903
[5]	train-mlogloss:1.8084	val-mlogloss:1.87093	train-MAP@7:0.970813	val-MAP@7:0.968156
[6]	train-mlogloss:1.71241	val-mlogloss:1.78006	train-MAP@7:0.971355	val-MAP@7:0.96858
[7]	train-mlogloss:1.62667	val-mlogloss:1.69745	train-MAP@7:0.971629	val-MAP@7:0.968606
[8]	train-mlogloss:1.54907	val-mlogloss:1.62386	train-MAP@7:0.971987	val-MAP@7:0.968737
[9]	train-mlogloss:1.47882	val-mlogloss:1.55731	train-MAP@7:0.972364	val-MAP@7:0.969047


In [4]:
# Sanity check of the MAP metric: score a "perfect" prediction built from the
# true labels. Per the original note, a correct implementation should give 1.
y_train_prob = model.predict(dtrain)
y_val_prob = model.predict(dval)

# One-hot encode the true class of every training row.
y_prob = np.zeros((len(y_train), len(target_cols)))
y_prob[np.arange(y_prob.shape[0]), y_train] = 1

eval_map(y_prob, dtrain, gt=ground_truth, ts=data_len)

NameError: name 'eval_map1' is not defined

In [None]:
import timeit

Time of train+evaluation with `eval_map`

In [None]:
# Time five runs of a single boosting round plus `eval_map` evaluation on both
# datasets. `ts` must be the `data_len` dict built in the training cell; the
# previous `data_hash` is not defined in this notebook and raised NameError.
timeit.timeit(
    "model = xgb.train(param, dtrain, num_boost_round=1, feval=eval_map, "
    "evals=[(dtrain, 'train'), (dval, 'val')], gt=ground_truth, ts=data_len)",
    globals=globals(),  # run the statement against the notebook's namespace
    number=5,
)