In [3]:
import math
import warnings

import numpy as np
import pandas as pd
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas SettingWithCopyWarning / FutureWarning
# messages raised by the cells below - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [4]:
# Load the recommendations and number each user's rows 0, 1, 2, ... in file order.
# NOTE(review): presumably the file is already sorted by descending score within
# each user, so that rnk is the recommendation rank - verify against the CSV.
data = pd.read_csv('recs_synthetic.csv')
data = data.assign(rnk=data.groupby('user_id').cumcount())
data.head()

Unnamed: 0,user_id,score,target,total_actions,rnk
0,0,0.993668,1,17,0
1,0,0.896899,0,17,1
2,0,0.875116,1,17,2
3,0,0.844549,0,17,3
4,0,0.837955,0,17,4


In [5]:
def precision_at_k(df, k):
    """Precision@k pooled over all users (micro-average).

    Every row with ``rnk < k`` is treated as a predicted positive, so such a
    row is a true positive when ``target == 1`` and a false positive otherwise.
    The input frame is not modified.

    Args:
        df: DataFrame with ``rnk`` and ``target`` columns.
            Assumes ``rnk`` is the 0-based position of the item in the user's
            list and ``target`` is a 0/1 relevance flag.
        k: Cutoff; only rows with ``rnk < k`` are considered.

    Returns:
        ``tp / (tp + fp)`` over all considered rows, or ``nan`` when no row
        has ``rnk < k``.
    """
    top_k_target = df.loc[df['rnk'] < k, 'target']
    if top_k_target.empty:
        # Nothing was recommended within the cutoff: precision is undefined.
        return float('nan')
    tp = (top_k_target == 1).sum()
    fp = (top_k_target != 1).sum()
    return tp / (tp + fp)

def recall_at_k(df, k):
    """Mean per-user recall@k.

    For each user, counts the rows with ``rnk < k`` whose ``target`` is set and
    divides by that user's ``total_actions``; the per-user ratios are then
    averaged. The input frame is not modified.

    Args:
        df: DataFrame with ``user_id``, ``rnk``, ``target`` and
            ``total_actions`` columns. Assumes ``target`` is a 0/1 flag and
            ``total_actions`` is constant within a user (the per-user maximum
            is used).
        k: Cutoff; rows with ``rnk < k`` count as predicted positives.

    Returns:
        The mean of ``tp / total_actions`` over users.
    """
    in_top_k = (df['rnk'] < k).astype(int)
    # Build the working frame separately so the caller's DataFrame keeps its columns.
    scored = df[['user_id', 'total_actions']].assign(tp=df['target'] & in_top_k)
    per_user = scored.groupby('user_id').agg({'tp': 'sum', 'total_actions': 'max'})
    return (per_user['tp'] / per_user['total_actions']).mean()

In [6]:
# Work on a copy so the loaded frame stays untouched.
df = data.copy()
# Flag the rows with rnk < 8 as predicted positives (1) and the rest as 0.
df['pred'] = (df['rnk'] < 8).astype(int)

In [5]:
%%time
df['tp'] = df[['target', 'pred']].apply(lambda x: 1 if x[0]==x[1]==1 else 0, axis=1)

CPU times: user 817 ms, sys: 2.96 ms, total: 820 ms
Wall time: 820 ms


In [6]:
%%time
df['tp'] = df['target'] & df['pred'] # element-wise logical AND of the two columns
# this comes out roughly 500x faster than the row-wise apply above

CPU times: user 1.45 ms, sys: 676 µs, total: 2.13 ms
Wall time: 1.43 ms


In [7]:
def map_at_k(df, k):
    """Mean average precision over the rows with ``rnk < k``.

    For every row with ``target == 1`` inside the cutoff, the running count of
    hits for that user is divided by ``rnk + 1``. These values are averaged per
    user (sum / count), and the per-user averages are averaged again.
    Users with no ``target == 1`` row inside the cutoff do not contribute.

    Args:
        df: DataFrame with ``user_id``, ``rnk`` and ``target`` columns.
        k: Cutoff; only rows with ``rnk < k`` are used.

    Returns:
        The mean of the per-user average precision values.
    """
    top_k = df[df['rnk'] < k]
    hits_so_far = top_k.groupby('user_id')['target'].cumsum()
    scored = top_k.assign(precision_at_hit=hits_so_far / (top_k['rnk'] + 1))
    # Only positions that hold a relevant item enter the average.
    per_user = scored.loc[scored['target'] == 1].groupby('user_id')['precision_at_hit']
    average_precision = per_user.sum() / per_user.count()
    return average_precision.mean()

In [12]:
def ndcg_at_k(df, k):
    """Mean per-user NDCG over the rows with ``rnk < k``.

    Gain of a row is ``2**target - 1``. DCG discounts it by
    ``log2(rnk + 2)``; the ideal DCG discounts the same gains by the position
    each row would take if the user's rows inside the cutoff were ordered by
    descending ``target``. Only rows inside the cutoff take part in the ideal
    ordering. Users whose ideal DCG is 0 produce ``nan`` and are skipped by the
    final mean. The input frame is not modified.

    Args:
        df: DataFrame with ``user_id``, ``rnk`` and ``target`` columns.
            Assumes ``rnk`` is the 0-based position within the user's list.
        k: Cutoff; only rows with ``rnk < k`` are used.

    Returns:
        The mean of ``dcg / idcg`` over users.
    """
    top_k = df[df['rnk'] < k]
    gain = 2 ** top_k['target'] - 1
    # 1-based position of each row in the per-user ordering by descending target;
    # ties share the same gain, so how they are broken does not affect the sum.
    ideal_position = top_k.groupby('user_id')['target'].rank(method='first', ascending=False)
    scored = top_k.assign(
        dcg_i=gain / np.log2(top_k['rnk'] + 2),
        idcg_i=gain / np.log2(ideal_position + 1),
    )
    per_user = scored[['user_id', 'dcg_i', 'idcg_i']].groupby('user_id').sum()
    return (per_user['dcg_i'] / per_user['idcg_i']).mean()

In [9]:
def mrr(df):
    """Mean reciprocal rank of the first ``target == 1`` row per user.

    For each user that has at least one row with ``target == 1``, takes the
    smallest ``rnk`` among those rows and scores it as ``1 / (rnk + 1)``.
    Users without such a row are not counted.

    Args:
        df: DataFrame with ``user_id``, ``rnk`` and ``target`` columns.

    Returns:
        The sum of the reciprocal ranks divided by the number of scored users.
    """
    first_hit_rank = df.loc[df['target'] == 1].groupby('user_id')['rnk'].min()
    reciprocal_rank = 1 / (first_hit_rank + 1)
    return reciprocal_rank.sum() / len(first_hit_rank)

In [2]:
# Precision at the two cutoffs, one per line.
print(
    f'precision@8: {precision_at_k(df=data.copy(), k=8)}',
    f'precision@30: {precision_at_k(df=data.copy(), k=30)}',
    sep='\n',
)

NameError: name 'data' is not defined

In [11]:
# Recall at the two cutoffs, one per line.
print(
    f'recall@8: {recall_at_k(df=data.copy(), k=8)}',
    f'recall@30: {recall_at_k(df=data.copy(), k=30)}',
    sep='\n',
)

recall@8: 0.1521432894570625
recall@30: 0.5879062358080964


In [12]:
# MAP at the two cutoffs, one per line.
print(
    f'MAP@8: {map_at_k(df=data.copy(), k=8)}',
    f'MAP@30: {map_at_k(df=data.copy(), k=30)}',
    sep='\n',
)

MAP@8: 0.49745983037797686
MAP@30: 0.39767171242806565


In [13]:
# MRR has no cutoff: it is computed over every row of the frame.
print(f'MRR: {mrr(data.copy())}')

MRR: 0.5359987465750623


In [13]:
# NDCG at cutoffs 8 and 30, displayed as a (ndcg@8, ndcg@30) tuple.
tuple(ndcg_at_k(df=data.copy(), k=cutoff) for cutoff in (8, 30))

(0.6563167070312262, 0.662153051871711)