# MAP

Improve the speed of MAP evaluation.
Timings over 10 runs:
- 17.70s: original
- 18.05s: remove `apks`
- 18.01s: remove `enumerate`
- 17.04s: remove `y_prob = {}`

In [1]:
from santander_helper import *
%matplotlib inline

In [2]:
@jit
def apk1(actual, predicted, k=7, default=0.0):
    '''
    Average precision at k for a single ranking.

    actual:    array of relevant items (only `.size` and membership are used)
    predicted: array of ranked predictions, best first; only the first k count
    k:         cut-off rank
    default:   value returned when `actual` is empty

    Returns the sum of precision-at-rank over every rank that holds a relevant
    item not already seen earlier in the ranking, divided by min(actual.size, k).
    '''
    n_actual = actual.size
    if n_actual == 0:
        return default

    # Only the k best-ranked predictions take part in the score
    top_k = predicted[:k] if predicted.size > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for position, candidate in enumerate(top_k, start=1):
        # Skip irrelevant items and repeats of an earlier prediction
        if candidate not in actual or candidate in top_k[:position - 1]:
            continue
        hits += 1.0
        precision_sum += hits / position

    return precision_sum / min(n_actual, k)

In [3]:
@jit
def eval_map1(y_prob, dtrain, gt=None, ts=None):
    '''
    Evaluate MAP@7 for the train or validation set.

    y_prob: predicted class scores, indexed as y_prob[rows, :] (one row per sample)
    dtrain: object exposing get_label(); only the number of labels is used,
            to decide which data set is being evaluated
    gt:     dict with keys 'train' and 'val'; each value is a sequence of
            (bought_items, row_indices) pairs, one pair per customer
    ts:     dict with keys 'train' and 'val' holding the number of rows of
            each data set

    Returns the tuple ('MAP@7', mean average precision over all customers).

    Raises ValueError when the number of labels in `dtrain` matches neither
    ts['train'] nor ts['val'].

    NOTE: the data set is identified by its length only, so if both sets have
    the same number of rows the 'train' ground truth is always used.
    '''
    gt = {} if gt is None else gt
    ts = {} if ts is None else ts

    # Check which set is it? (fetch the labels once instead of once per branch)
    n_rows = len(dtrain.get_label())
    if n_rows == ts.get('train'):
        glist = gt['train']
    elif n_rows == ts.get('val'):
        glist = gt['val']
    else:
        # Previously this fell through and failed later on an unbound `glist`
        raise ValueError(
            'eval_map1: data set with {} rows matches neither the train nor '
            'the val size given in ts'.format(n_rows))

    n = len(glist)
    score = np.zeros(n)
    for i in range(n):
        # Average the predicted scores over all rows belonging to this customer
        tmp = np.mean(y_prob[glist[i][1], :], axis=0)
        # Indices of the 7 highest averaged scores, best first
        tmp = np.argsort(tmp)[:-8:-1]
        score[i] = apk1(glist[i][0], tmp)
    score = np.mean(score)

    return 'MAP@7', score

In [4]:
def prep_map1(x_train, y_train):
    '''
    Prepare ground truth values and row indices for MAP evaluation.

    x_train: DataFrame with an 'ncodpers' (customer id) column
    y_train: target values aligned with x_train's index

    Returns an object array of shape (n_customers, 2). For each customer,
    column 0 holds the array of its target values and column 1 the array of
    its index labels in x_train.

    NOTE(review): the index labels are later used to select rows of the
    prediction matrix, which presumably requires x_train to have a default
    0..n-1 index -- confirm against create_train.
    '''
    customer_ids = x_train.loc[:, 'ncodpers']

    # Ground truth value: MAP needs to know the products bought by each customers
    gtv = pd.concat((pd.DataFrame(customer_ids.copy()), y_train), axis=1, ignore_index=True)
    gtv.columns = ['ncodpers', 'target']
    gtv = gtv.groupby('ncodpers')['target'].apply(lambda x: x.values).to_dict()
    # Ground truth index: MAP needs to know for each customer which rows are its corresponding data
    gti = pd.DataFrame(customer_ids).reset_index()
    gti = gti.groupby('ncodpers')['index'].apply(lambda x: x.values).to_dict()

    # Customers have different numbers of rows, so the pairs are ragged.
    # Fill an explicit object array: np.array() on ragged input raises on
    # recent NumPy versions and warns on older ones.
    gt = np.empty((len(gtv), 2), dtype=object)
    for row, customer in enumerate(gtv):
        gt[row, 0] = gtv[customer]
        gt[row, 1] = gti[customer]

    return gt

In [5]:
def cv_month1(param, num_rounds, month_train, month_val, n_repeat=2, random_seed=0,
                    lag_train=5, lag_val=5, weight_set=(1,), verbose_eval=True):
    '''
    Train on one month and validate on another.

    param:        dict of xgboost parameters; it is copied, not modified
    num_rounds:   number of boosting rounds
    month_train:  month passed to create_train for the training set
    month_val:    month passed to create_train for the validation set
    n_repeat:     number of trainings per weight column, each with its own seed
    random_seed:  seed of the generator that draws the per-run xgboost seeds;
                  pass None for non-reproducible seeds
    lag_train:    max_lag passed to create_train for the training set
    lag_val:      max_lag passed to create_train for the validation set
    weight_set:   column index, or iterable of column indices, of the weight
                  tables returned by create_train
    verbose_eval: forwarded to xgb.train

    Returns (history, model_dict). history is a DataFrame whose columns are
    (weight_index, repetition, data_set, metrics); model_dict maps each weight
    index to the list of models trained with it.

    NOTE(review): prep_map and eval_map are not defined in this notebook and
    are presumably provided by santander_helper. The gt/ts keyword arguments
    of xgb.train are not part of the stock xgboost API as far as this file
    shows -- confirm the installed version accepts them.
    '''
    # The old default `(1)` was the int 1, which cannot be iterated; accept a
    # bare index as well as any iterable of indices.
    if np.isscalar(weight_set):
        weight_set = (weight_set,)
    else:
        # Materialise once: the indices are iterated again when building history
        weight_set = tuple(weight_set)

    # Dedicated generator so random_seed makes the per-run seeds reproducible
    # without touching the global NumPy random state.
    seed_rng = np.random.RandomState(random_seed)

    history = {}
    model_dict = {}

    x_train, y_train, weight_train = create_train(month_train, max_lag=lag_train, pattern_flag=True)
    x_val, y_val, weight_val = create_train(month_val, max_lag=lag_val, pattern_flag=True)

    gt_train = prep_map(x_train, y_train)
    gt_val = prep_map(x_val, y_val)

    dtrain = xgb.DMatrix(x_train, y_train)
    dval = xgb.DMatrix(x_val, y_val)

    ground_truth = {'train': gt_train, 'val': gt_val}
    # Hash of the raw label bytes identifies each data set inside the metric;
    # tobytes() replaces the deprecated tostring() and returns the same bytes.
    data_hash = {'train': hash(dtrain.get_label().tobytes()), 'val': hash(dval.get_label().tobytes())}

    for weight_index in weight_set:
        history[weight_index] = {}
        model_dict[weight_index] = []

        dtrain.set_weight(weight_train.values[:, weight_index])
        dval.set_weight(weight_val.values[:, weight_index])

        for n in range(n_repeat):
            history[weight_index][n] = {}

            # Work on a copy so the caller's parameter dict keeps its own seed
            run_param = dict(param, seed=seed_rng.randint(10**6))

            time_start = time.time()
            print('Train with weight {}, repetition {} of {}'.format(weight_index, n, n_repeat))
            model = xgb.train(run_param, dtrain, num_rounds, evals=[(dtrain, 'train'), (dval, 'val')],
                verbose_eval=verbose_eval, feval=eval_map, evals_result=history[weight_index][n],
                gt=ground_truth, ts=data_hash)
            model_dict[weight_index].append(model)
            time_end = time.time()
            print('Validate logloss = {:.5f}, MAP@7 = {:.5f}, time = {:.2f} min'.format(
                history[weight_index][n]['val']['mlogloss'][-1],
                history[weight_index][n]['val']['MAP@7'][-1], (time_end-time_start)/60))
            print('-'*50)
            print('')
        print('')

    # Flatten the nested result dicts into one column per
    # (weight, repetition, data set, metric) combination
    history = {(w, n, d, m): history[w][n][d][m]
               for w in weight_set
               for n in range(n_repeat)
               for d in ['train', 'val']
               for m in ['mlogloss', 'MAP@7']}
    history = pd.DataFrame(history)
    history.columns.names = ['weight_index', 'repetition', 'data_set', 'metrics']

    return history, model_dict

In [6]:
# Months used in this experiment: one for training, one for validation
month_train = '2015-06-28'
month_val = '2016-05-28'

# xgboost parameters: multi-class soft-probability objective with one class per
# entry of target_cols, monitored with multi-class log loss.
# NOTE(review): rate_drop, skip_drop, normalize_type and sample_type are
# documented as DART booster options; with booster='gbtree' they presumably
# have no effect -- confirm whether 'dart' was intended.
param = {'objective': 'multi:softprob', 
         'eta': 0.05, 
         'max_depth': 8, 
         'silent': 1, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 0,
         'booster': 'gbtree', 
         'rate_drop': 0.1, 
         'skip_drop': 0.5,
         'normalize_type': 'tree', 
         'sample_type': 'uniform'}
# Number of boosting rounds
num_rounds = 100

In [None]:
# Build features, targets and sample weights for both months
x_train, y_train, weight_train = create_train(month_train, pattern_flag=True)
x_val, y_val, weight_val = create_train(month_val, pattern_flag=True)

# Per-customer ground truth (targets and row indices) used by the MAP metric
gt_train = prep_map1(x_train, y_train)
gt_val = prep_map1(x_val, y_val)

dtrain = xgb.DMatrix(x_train, y_train)
dval = xgb.DMatrix(x_val, y_val)

ground_truth = {'train': gt_train, 'val': gt_val}
# data_hash = {'train': hash(dtrain.get_label().tostring()), 'val': hash(dval.get_label().tostring())}
# eval_map1 tells the two data sets apart by their number of rows
data_len = {'train': len(dtrain.get_label()), 'val': len(dval.get_label())}

# Column of the weight tables to use as sample weights
weight_index = 0

dtrain.set_weight(weight_train.values[:, weight_index])
dval.set_weight(weight_val.values[:, weight_index])

# Draw a fresh xgboost seed (overwrites the 'seed' entry of param)
param['seed'] = np.random.randint(10**6)
#model = xgb.train(param, dtrain, num_rounds)

In [None]:
# model = xgb.train(param, dtrain, num_rounds, feval=eval_map1, verbose_eval=True, evals=[(dtrain, 'train'), (dval, 'val')], 
#                  gt=ground_truth, ts=data_len)

In [14]:
# Predictions of the trained model on both data sets.
# NOTE(review): the training calls visible in this notebook are commented out,
# so `model` presumably comes from earlier kernel state -- confirm.
y_train_prob = model.predict(dtrain)
y_val_prob = model.predict(dval)

In [44]:
# Build a "perfect" prediction matrix: one row per training sample and one
# column per target class, initially all zeros
y_prob = np.zeros((y_train.shape[0], len(target_cols)))

# Set the entry of each row's true class (y_train used as column index) to 1
y_prob[np.arange(len(y_train)), y_train] = 1

In [61]:
# Sanity check: scoring the one-hot true labels on the training set
eval_map1(y_prob, dtrain, gt=ground_truth, ts=data_len)

('MAP@7', 1.0)

In [48]:
import timeit

In [62]:
# Time one call (number=1) of eval_map1; globals=globals() lets the timed
# statement see the notebook's variables
timeit.timeit('eval_map1(y_prob, dtrain, gt=ground_truth, ts=data_len)', globals=globals(), number=1)

7.217208700000015

In [16]:
# Same single-call timing of eval_map1, recorded under a different execution
# count (and so possibly against a different version of the function)
timeit.timeit('eval_map1(y_prob, dtrain, gt=ground_truth, ts=data_len)', globals=globals(), number=1)

4.38330179999997

In [18]:
# Baseline: total time of 5 training runs without a custom metric
timeit.timeit('model = xgb.train(param, dtrain, num_rounds)', globals=globals(), number=5)

65.5014271

In [22]:
# Total time of 5 training runs with eval_map1 passed as the custom metric.
# NOTE(review): no `evals` list is given, so the metric is presumably never
# invoked and this timing would not include any MAP cost -- confirm.
timeit.timeit('model = xgb.train(param, dtrain, num_rounds, feval=eval_map1)', globals=globals(), number=5)

64.29673359999992