# MAP

Improve MAP evaluation speed.
Time for 10 runs:
- 17.7s: original
- 18.05s: remove `apks`
- 18.01s: remove `enumerate`
- 17.04s: remove `y_prob = {}`

Optimizations:
- convert `dict` to `np.array`
- `enumerate` is already quite efficient, so keep it unless a faster alternative is found
- the `for` loop limits the speed, but with so many customers that each have very few products it is hard to optimize
- `np.mean` over an `np.array` seems more efficient than accumulating with `a += 1` and `a /= n`, but this is not confirmed
- use `numpy` whenever possible
- train+evaluation time is reduced by about 35%

In [1]:
from santander_helper import *
%matplotlib inline

In [2]:
@jit
def apk1(actual, predicted, k=7, default=0.0):
    '''
    Average precision at k for one pair of label arrays.

    Only the first `k` entries of `predicted` are scored. A predicted label
    counts as a hit when it occurs in `actual` and has not already appeared
    at an earlier position of `predicted`. Returns `default` when `actual`
    is empty, otherwise the summed precision divided by min(actual.size, k).
    '''
    n_actual = actual.size
    if n_actual == 0:
        return default

    top = predicted
    if top.size > k:
        top = top[:k]

    hits = 0.0
    precision_sum = 0.0
    for rank in range(len(top)):
        label = top[rank]
        # Skip misses and labels already seen at an earlier rank
        if label not in actual or label in top[:rank]:
            continue
        hits += 1.0
        precision_sum += hits / (rank + 1.0)

    return precision_sum / min(n_actual, k)

In [3]:
@jit
def eval_map1(y_prob, dtrain, gt={}, ts={}):
    '''
    Evaluate MAP@7 for the train or the validation set.

    y_prob: predicted scores, indexed as y_prob[rows, :].
    dtrain: object exposing get_label(); its label count selects the set.
    gt: dict with keys 'train' and 'val'; each value holds, per customer,
        the pair (true labels, row indices into y_prob).
    ts: dict with keys 'train' and 'val' holding the label count of each set.

    Returns the tuple ('MAP@7', score).
    Raises ValueError when the label count matches neither entry of `ts`.

    Note: the set is recognised by size only, so if both sets have the same
    number of labels the 'train' ground truth is always used.
    '''
    # The default dicts are only read, never mutated
    n_labels = len(dtrain.get_label())
    if n_labels == ts['train']:
        glist = gt['train']
    elif n_labels == ts['val']:
        glist = gt['val']
    else:
        # Fail loudly instead of hitting an unbound `glist` further down
        raise ValueError('eval_map1: number of labels matches neither ts["train"] nor ts["val"]')

    n = len(glist)
    score = np.zeros(n)
    for i in range(n):
        # Average the predicted scores over all rows of this customer
        tmp = np.mean(y_prob[glist[i][1], :], axis=0)
        # Indices of the 7 highest averaged scores, best first
        tmp = np.argsort(tmp)[:-8:-1]
        score[i] = apk1(glist[i][0], tmp)
    score = np.mean(score)

    return 'MAP@7', score

In [4]:
def prep_map1(x_train, y_train):
    '''
    Prepare ground truth values and row indices for MAP evaluation.

    x_train: DataFrame with an 'ncodpers' (customer id) column.
    y_train: one target label per row of x_train (Series, single-column
        DataFrame or 1-D array), in the same row order as x_train.

    Returns an object array of shape (n_customers, 2), ordered by customer
    id. Element [i][0] is the array of target labels of customer i and
    element [i][1] is the array of positional row numbers of that customer.

    Raises ValueError when x_train and y_train disagree on the row count.
    '''
    customers = np.asarray(x_train.loc[:, 'ncodpers'])
    targets = np.asarray(y_train).reshape(-1)
    if len(targets) != len(customers):
        raise ValueError('prep_map1: x_train and y_train must have the same number of rows')

    # Positional row numbers per customer. Positions (not index labels) are
    # what the evaluation uses to slice the prediction matrix, so this stays
    # correct even when x_train does not carry a default RangeIndex.
    rows_by_customer = pd.DataFrame({'ncodpers': customers}).groupby('ncodpers').indices
    customer_ids = sorted(rows_by_customer)

    # Per-customer arrays have different lengths, so the container must be an
    # explicit object array; np.array() on ragged nested lists raises on
    # recent NumPy versions.
    gt = np.empty((len(customer_ids), 2), dtype=object)
    for pos, customer in enumerate(customer_ids):
        rows = rows_by_customer[customer]
        gt[pos, 0] = targets[rows]
        gt[pos, 1] = rows

    return gt

In [5]:
# Months passed to `create_train` for the training and validation sets
month_train = '2015-06-28'
month_val = '2016-05-28'

# XGBoost training parameters (one class per entry of `target_cols`)
param = dict(
    objective='multi:softprob',
    eta=0.05,
    max_depth=8,
    silent=1,
    num_class=len(target_cols),
    eval_metric='mlogloss',
    min_child_weight=1,
    subsample=0.7,
    colsample_bytree=0.7,
    seed=0,
    booster='gbtree',
    rate_drop=0.1,
    skip_drop=0.5,
    normalize_type='tree',
    sample_type='uniform',
)
# Number of boosting rounds
num_rounds = 10

In [20]:
# Build features, targets and sample weights for the training and validation months
x_train, y_train, weight_train = create_train(month_train, pattern_flag=True)
x_val, y_val, weight_val = create_train(month_val, pattern_flag=True)

# Ground truth handed to `eval_map` through the `gt` argument
gt_train = prep_map(x_train, y_train)
gt_val = prep_map(x_val, y_val)
ground_truth = {'train': gt_train, 'val': gt_val}

dtrain = xgb.DMatrix(x_train, y_train)
dval = xgb.DMatrix(x_val, y_val)

# Two fingerprints of each set: a hash of the raw label bytes and the label count.
# `tobytes` replaces the deprecated `tostring` alias, which NumPy 2.0 removed.
data_hash = {'train': hash(dtrain.get_label().tobytes()), 'val': hash(dval.get_label().tobytes())}
data_len = {'train': len(dtrain.get_label()), 'val': len(dval.get_label())}

# Column of the weight frames used as sample weight
weight_index = 0

dtrain.set_weight(weight_train.values[:, weight_index])
dval.set_weight(weight_val.values[:, weight_index])

# A fresh random seed per run, so the logged metrics vary between executions
param['seed'] = np.random.randint(10**6)

# NOTE(review): stock `xgb.train` has no `gt`/`ts` keyword arguments; presumably a
# locally patched xgboost forwards them to `feval` -- confirm before re-running.
model = xgb.train(param, dtrain, num_rounds, feval=eval_map, verbose_eval=True, evals=[(dtrain, 'train'), (dval, 'val')],
                 gt=ground_truth, ts=data_hash)

[0]	train-mlogloss:2.60259	val-mlogloss:2.62607	train-MAP@7:0.96534	val-MAP@7:0.965859
[1]	train-mlogloss:2.37044	val-mlogloss:2.41404	train-MAP@7:0.968539	val-MAP@7:0.962926
[2]	train-mlogloss:2.19071	val-mlogloss:2.24495	train-MAP@7:0.969591	val-MAP@7:0.964
[3]	train-mlogloss:2.043	val-mlogloss:2.10607	train-MAP@7:0.970032	val-MAP@7:0.965531
[4]	train-mlogloss:1.9175	val-mlogloss:1.98782	train-MAP@7:0.970235	val-MAP@7:0.966833
[5]	train-mlogloss:1.80862	val-mlogloss:1.88123	train-MAP@7:0.970676	val-MAP@7:0.969043
[6]	train-mlogloss:1.71258	val-mlogloss:1.79014	train-MAP@7:0.971184	val-MAP@7:0.966674
[7]	train-mlogloss:1.62668	val-mlogloss:1.7092	train-MAP@7:0.971655	val-MAP@7:0.968602
[8]	train-mlogloss:1.54919	val-mlogloss:1.63335	train-MAP@7:0.972032	val-MAP@7:0.968778
[9]	train-mlogloss:1.47879	val-mlogloss:1.56767	train-MAP@7:0.972389	val-MAP@7:0.968722


In [None]:
# Build features, targets and sample weights for the training and validation months
x_train, y_train, weight_train = create_train(month_train, pattern_flag=True)
x_val, y_val, weight_val = create_train(month_val, pattern_flag=True)

# Ground truth in the array layout that `eval_map1` indexes
gt_train = prep_map1(x_train, y_train)
gt_val = prep_map1(x_val, y_val)
ground_truth = {'train': gt_train, 'val': gt_val}

dtrain = xgb.DMatrix(x_train, y_train)
dval = xgb.DMatrix(x_val, y_val)

# Two fingerprints of each set: a hash of the raw label bytes and the label count.
# `tobytes` replaces the deprecated `tostring` alias, which NumPy 2.0 removed.
data_hash = {'train': hash(dtrain.get_label().tobytes()), 'val': hash(dval.get_label().tobytes())}
# `eval_map1` tells the two sets apart by these label counts
data_len = {'train': len(dtrain.get_label()), 'val': len(dval.get_label())}

# Column of the weight frames used as sample weight
weight_index = 0

dtrain.set_weight(weight_train.values[:, weight_index])
dval.set_weight(weight_val.values[:, weight_index])

# A fresh random seed per run, so the logged metrics vary between executions
param['seed'] = np.random.randint(10**6)

# NOTE(review): stock `xgb.train` has no `gt`/`ts` keyword arguments; presumably a
# locally patched xgboost forwards them to `feval` -- confirm before re-running.
model = xgb.train(param, dtrain, num_rounds, feval=eval_map1, verbose_eval=True, evals=[(dtrain, 'train'), (dval, 'val')],
                 gt=ground_truth, ts=data_len)

In [8]:
# Sanity check of the MAP implementation: scoring a one-hot encoding of the
# true training labels as if it were the prediction should give MAP@7 = 1.
# Model predictions for both sets (not used by the check below)
y_train_prob = model.predict(dtrain)
y_val_prob = model.predict(dval)

# One row per training sample, one column per entry of `target_cols`
y_prob = np.zeros((y_train.shape[0], len(target_cols)))

# Put a 1 in the column of each row's true label
y_prob[np.arange(len(y_train)), y_train] = 1

# `dtrain` has the training label count, so the 'train' ground truth is used
eval_map1(y_prob, dtrain, gt=ground_truth, ts=data_len)

('MAP@7', 1.0)

In [9]:
import timeit

Time of evaluating

In [25]:
# Rebuild the ground truth with `prep_map1`, the layout `eval_map1` indexes
gt_train = prep_map1(x_train, y_train)
gt_val = prep_map1(x_val, y_val)
ground_truth = {'train': gt_train, 'val': gt_val}
# Total time in seconds of 5 evaluation-only calls
timeit.timeit('eval_map1(y_prob, dtrain, gt=ground_truth, ts=data_len)', globals=globals(), number=5)

20.170260999999982

Time of training

In [16]:
# Baseline: total time in seconds of 5 one-round trainings without any custom metric
timeit.timeit('model = xgb.train(param, dtrain, num_boost_round=1)', globals=globals(), number=5)

18.318205699999993

Time of training+evaluating with `eval_map1`

In [17]:
# Total time in seconds of 5 one-round trainings evaluated with `eval_map1` on both sets
# (expects `ground_truth` built with `prep_map1` and set identification by `data_len`)
timeit.timeit("model = xgb.train(param, dtrain, num_boost_round=1, feval=eval_map1, evals=[(dtrain, 'train'), (dval, 'val')], gt=ground_truth, ts=data_len, )", globals=globals(), number=5)

[0]	train-mlogloss:2.6029	val-mlogloss:2.62639	train-MAP@7:0.965442	val-MAP@7:0.960371
[0]	train-mlogloss:2.6029	val-mlogloss:2.62639	train-MAP@7:0.965442	val-MAP@7:0.960371
[0]	train-mlogloss:2.6029	val-mlogloss:2.62639	train-MAP@7:0.965442	val-MAP@7:0.960371
[0]	train-mlogloss:2.6029	val-mlogloss:2.62639	train-MAP@7:0.965442	val-MAP@7:0.960371
[0]	train-mlogloss:2.6029	val-mlogloss:2.62639	train-MAP@7:0.965442	val-MAP@7:0.960371


56.101055599999995

Time of training+evaluating with `eval_map`

In [21]:
# Total time in seconds of 5 one-round trainings evaluated with the original `eval_map`
# NOTE(review): `ground_truth` must have been built with `prep_map` at this point
# (execution count 21 follows cell 20, which does that) -- confirm on a fresh run
timeit.timeit("model = xgb.train(param, dtrain, num_boost_round=1, feval=eval_map, evals=[(dtrain, 'train'), (dval, 'val')], gt=ground_truth, ts=data_hash)", globals=globals(), number=5)

[0]	train-mlogloss:2.60259	val-mlogloss:2.62607	train-MAP@7:0.96534	val-MAP@7:0.965859
[0]	train-mlogloss:2.60259	val-mlogloss:2.62607	train-MAP@7:0.96534	val-MAP@7:0.965859
[0]	train-mlogloss:2.60259	val-mlogloss:2.62607	train-MAP@7:0.96534	val-MAP@7:0.965859
[0]	train-mlogloss:2.60259	val-mlogloss:2.62607	train-MAP@7:0.96534	val-MAP@7:0.965859
[0]	train-mlogloss:2.60259	val-mlogloss:2.62607	train-MAP@7:0.96534	val-MAP@7:0.965859


85.21862490000001

The two methods should have the same evaluation results

In [22]:
# Reference run: ground truth from `prep_map`, metric from `eval_map`,
# sets identified through `data_hash`
gt_train = prep_map(x_train, y_train)
gt_val = prep_map(x_val, y_val)
ground_truth = {'train': gt_train, 'val': gt_val}
# `param['seed']` is left unchanged here, so this run can be compared with other runs using the same `param`
model = xgb.train(param, dtrain, num_boost_round=1, feval=eval_map, evals=[(dtrain, 'train'), (dval, 'val')], gt=ground_truth, ts=data_hash)

[0]	train-mlogloss:2.60259	val-mlogloss:2.62607	train-MAP@7:0.96534	val-MAP@7:0.965859


In [24]:
# Optimized run: ground truth from `prep_map1`, metric from `eval_map1`,
# sets identified through `data_len`; the logged MAP@7 should equal the `eval_map` run
gt_train = prep_map1(x_train, y_train)
gt_val = prep_map1(x_val, y_val)
ground_truth = {'train': gt_train, 'val': gt_val}
model = xgb.train(param, dtrain, num_boost_round=1, feval=eval_map1, evals=[(dtrain, 'train'), (dval, 'val')], gt=ground_truth, ts=data_len)

[0]	train-mlogloss:2.60259	val-mlogloss:2.62607	train-MAP@7:0.96534	val-MAP@7:0.965859
