# MAP

Improve the evaluation speed of the MAP@7 (mean average precision at 7) metric used as a custom xgboost evaluation function.

In [10]:
import os
import time
import timeit

import numpy as np
import pandas as pd
from numba import jit

if os.name == 'nt':
    # On Windows, put the MinGW-w64 bin directory at the front of PATH before
    # xgboost is imported further down.
    # NOTE(review): presumably xgboost's native library needs the MinGW runtime
    # DLLs from this directory -- confirm on the target machine.
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # Read PATH with .get() so a missing variable is handled explicitly instead
    # of being hidden by a bare `except: pass`.
    current_path = os.environ.get('PATH', '')
    if current_path:
        os.environ['PATH'] = mingw_path + os.pathsep + current_path
    else:
        os.environ['PATH'] = mingw_path
    
import xgboost as xgb

In [22]:
@jit
def apk(actual, predicted, k=7, default=0.0):
    '''Average precision at k for one ranked prediction array.

    actual:    array of relevant items (its `.size` is read).
    predicted: array of predicted items, best first; only the first k are scored.
    k:         cut-off rank.
    default:   value returned when `actual` is empty.

    A predicted item scores only the first time it appears in the top k.
    '''
    top_k = predicted[:k] if predicted.size > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank in range(len(top_k)):
        item = top_k[rank]
        # Repeated predictions must not be rewarded twice.
        already_seen = item in top_k[:rank]
        if item in actual and not already_seen:
            hits += 1.0
            precision_sum += hits / (rank + 1.0)

    n_actual = actual.size
    if n_actual == 0:
        return default

    # Normalise by the best achievable number of hits within the cut-off.
    return precision_sum / min(n_actual, k)

In [3]:
@jit
def eval_map(y_prob, dtrain, gt=None, ts=None):
    '''Evaluate MAP@7 for the train or validation set.

    y_prob: 2-D array of predicted scores, one row per sample (indexed as
            `y_prob[idx, :]`).
    dtrain: object exposing `get_label()`; the hash of its label bytes is used
            to tell which data set is being evaluated.
    gt:     dict with 'train' and 'val' entries, each holding an 'index' dict
            (customer id -> row indices) and a 'value' dict
            (customer id -> ground-truth items).
    ts:     dict mapping 'train' / 'val' to the hash of that set's label bytes.

    Returns the tuple ('MAP@7', score).
    Raises ValueError when the labels of `dtrain` match neither entry of `ts`.
    '''
    # None defaults avoid sharing one mutable dict between calls.
    gt = {} if gt is None else gt
    ts = {} if ts is None else ts

    # Serialise and hash the labels once; tobytes() replaces the deprecated
    # tostring() and yields the same bytes.
    label_hash = hash(dtrain.get_label().tobytes())

    matched = None
    for set_name in ('train', 'val'):
        if matched is None and set_name in ts and ts[set_name] == label_hash:
            matched = set_name
    if matched is None:
        raise ValueError('eval_map: labels of the given DMatrix match neither the '
                         'train nor the val hash in `ts`')

    gti = gt[matched]['index']
    gtv = gt[matched]['value']

    apks = np.zeros(len(gti))
    for i, (cust_id, idx) in enumerate(gti.items()):
        # Average the scores of all rows of this customer, then keep the
        # 7 highest-scoring classes in descending order.
        mean_prob = np.mean(y_prob[idx, :], axis=0)
        top7 = np.argsort(mean_prob)[:-8:-1]
        apks[i] = apk(gtv[cust_id], top7)
    score = np.mean(apks)

    return 'MAP@7', score

In [23]:
def prep_map(x_train, y_train):
    '''Prepare the ground-truth values and row indices needed for MAP evaluation.

    x_train: DataFrame with an 'ncodpers' (customer id) column.
    y_train: targets, concatenated column-wise with the customer ids (so it is
             aligned on the index of `x_train`).

    Returns {'value': {customer id -> array of targets},
             'index': {customer id -> array of index labels of its rows}}.

    NOTE(review): 'index' stores index *labels* of `x_train`; eval_map uses them
    to index a NumPy array positionally, which only agrees when the index is
    0..n-1 -- confirm against create_train.
    '''
    cust_ids = x_train.loc[:, 'ncodpers']

    # Ground truth value: MAP needs to know the products bought by each customer
    gtv = pd.concat((pd.DataFrame(cust_ids.copy()), y_train), axis=1, ignore_index=True)
    gtv.columns = ['ncodpers', 'target']
    gtv = gtv.groupby('ncodpers')['target'].apply(lambda x: x.values).to_dict()

    # Ground truth index: MAP needs to know which rows belong to each customer.
    # Build the frame explicitly: reset_index() only names the new column
    # 'index' when the index is unnamed, so a named index raised KeyError.
    gti = pd.DataFrame({'ncodpers': cust_ids.values, 'index': x_train.index.values})
    gti = gti.groupby('ncodpers')['index'].apply(lambda x: x.values).to_dict()

    return {'value': gtv, 'index': gti}

In [24]:
def cv_month(param, num_rounds, month_train, month_val, n_repeat=2, random_seed=0,
             lag_train=5, lag_val=5, weight_set=(1,), verbose_eval=True):
    '''Train on one month and validate on another.

    param:        xgboost parameter dict; it is copied, the caller's dict is
                  not modified.
    num_rounds:   number of boosting rounds passed to xgb.train.
    month_train:  month passed to create_train for the training set.
    month_val:    month passed to create_train for the validation set.
    n_repeat:     number of repetitions (each with a different xgboost seed)
                  per weight index.
    random_seed:  seed of the local generator that draws the per-repetition
                  xgboost seeds, so runs are reproducible.
    lag_train, lag_val: `max_lag` passed to create_train for each set.
    weight_set:   iterable of column positions in the weight frames returned by
                  create_train; a single scalar is also accepted.
    verbose_eval: forwarded to xgb.train.

    Returns (history, model_dict): `history` is a DataFrame whose columns are
    (weight_index, repetition, data_set, metrics); `model_dict` maps each
    weight index to the list of trained models.
    '''
    # `(1)` is just the int 1; accept a scalar and freeze any iterable so it can
    # be traversed twice (training loop and history flattening).
    weight_set = (weight_set,) if np.isscalar(weight_set) else tuple(weight_set)
    # Work on a copy so that setting 'seed' does not leak into the caller's dict.
    param = dict(param)
    # Local generator: reproducible seeds without touching the global NumPy state.
    rng = np.random.RandomState(random_seed)

    history = {}
    model_dict = {}

    x_train, y_train, weight_train = create_train(month_train, max_lag=lag_train, pattern_flag=True)
    x_val, y_val, weight_val = create_train(month_val, max_lag=lag_val, pattern_flag=True)

    gt_train = prep_map(x_train, y_train)
    gt_val = prep_map(x_val, y_val)

    dtrain = xgb.DMatrix(x_train, y_train)
    dval = xgb.DMatrix(x_val, y_val)

    ground_truth = {'train': gt_train, 'val': gt_val}
    # tobytes() replaces the deprecated tostring(); both produce the same bytes.
    data_hash = {'train': hash(dtrain.get_label().tobytes()),
                 'val': hash(dval.get_label().tobytes())}

    def feval(y_prob, dmat):
        # xgb.train has no gt/ts keywords, so bind them here.
        return eval_map(y_prob, dmat, gt=ground_truth, ts=data_hash)

    for weight_index in weight_set:
        history[weight_index] = {}
        model_dict[weight_index] = []

        dtrain.set_weight(weight_train.values[:, weight_index])
        dval.set_weight(weight_val.values[:, weight_index])

        print('Start weight index {}'.format(weight_index))
        print('#'*50)

        for n in range(n_repeat):
            history[weight_index][n] = {}

            param['seed'] = int(rng.randint(10**6))

            # Monotonic clock: elapsed time is unaffected by wall-clock changes.
            time_start = time.perf_counter()
            print('Train with weight {}, repetition {} of {}'.format(weight_index, n, n_repeat))
            model = xgb.train(param, dtrain, num_rounds, evals=[(dtrain, 'train'), (dval, 'val')],
                              verbose_eval=verbose_eval, feval=feval,
                              evals_result=history[weight_index][n])
            model_dict[weight_index].append(model)
            time_end = time.perf_counter()
            print('Validate logloss = {:.5f}, MAP@7 = {:.5f}, time = {:.2f} min'.format(
                history[weight_index][n]['val']['mlogloss'][-1],
                history[weight_index][n]['val']['MAP@7'][-1], (time_end-time_start)/60))
            print('-'*50)
            print('')
        print('')

    # Flatten the nested result dicts into one column per
    # (weight, repetition, data set, metric) combination.
    history = {(w, n, d, m): history[w][n][d][m]
               for w in weight_set
               for n in range(n_repeat)
               for d in ['train', 'val']
               for m in ['mlogloss', 'MAP@7']}
    history = pd.DataFrame(history)
    history.columns.names = ['weight_index', 'repetition', 'data_set', 'metrics']

    return history, model_dict

In [6]:
# Synthetic benchmark data: random class scores plus a DMatrix of random
# features with random integer labels.
n_row, n_col, n_class = 50000, 500, 20

y_prob = np.random.random(size=(n_row, n_class))
dtrain = xgb.DMatrix(
    np.random.random(size=(n_row, n_col)),
    label=np.random.randint(0, n_class, size=(n_row,)),
)

In [9]:
# Sanity check: the label vector holds one entry per generated row.
dtrain.get_label().shape

(50000,)

In [12]:
# Sanity check: one row per sample and one column per class.
y_prob.shape

(50000, 20)

In [20]:
# Time 1000 rounds of serialising y_prob to bytes and hashing the result.
# tobytes() replaces tostring(), which is deprecated and removed in NumPy 2.0;
# both return the same bytes.
timeit.timeit('hash(y_prob.tobytes())==0', globals=globals(), number=1000)

7.949219599999992

In [21]:
# Small scratch dict; no other visible cell reads it.
a = dict(a=1, b=2)

In [None]:
np.