# Course project


**Основное**
- Дедлайн - 19 февраля 23:59
- Целевая метрика precision@5
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. В решении должны быть отчетливо видна метрика на новом тестовом сете из файла retail_test1.csv, то есть вам нужно для всех юзеров из этого файла выдать выши рекомендации, и посчитать на actual покупках precision@5. 

**!! Мы не рассматриваем холодный старт для пользователя, все наши пользователя одинаковы во всех сетах, поэтому нужно позаботиться об их исключении из теста.**


**Hints:** 

Сначала просто попробуйте разные параметры MainRecommender:  
- N в топ-N товарах при формировании user-item матирцы (сейчас топ-5000)  
- Различные веса в user-item матрице (0/1, кол-во покупок, log(кол-во покупок + 1), сумма покупки, ...)  
- Разные взвешивания матрицы (TF-IDF, BM25 - у него есть параметры)  
- Разные смешивания рекомендаций (обратите внимание на бейзлайн - прошлые покупки юзера)  

Сделайте MVP - минимально рабочий продукт - (пусть даже top-popular), а потом его улучшайте

Если вы делаете двухуровневую модель - следите за валидацией 

# Import libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

## Read data

In [2]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

In [3]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


# Set global const

In [4]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 100 

# Process features dataset

In [5]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [6]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 7 недель -- | -- 4 недель -- 
# подобрать размер 2-ого датасета (7 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
VAL_MATCHER_WEEKS = 9
VAL_RANKER_WEEKS = 3

In [7]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [8]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])
#df_join_train_matcher = data_train_matcher

In [9]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [10]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2024256, 12) Users: 2498 Items: 82059
val_matcher
Shape: (254234, 12) Users: 2258 Items: 32634
train_ranker
Shape: (254234, 12) Users: 2258 Items: 32634
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [11]:
# выше видим разброс по пользователям и товарам и дальше мы перейдем к warm-start (только известные пользователи)

In [12]:
data_val_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2019698,790,40341753006,573,1110624,1,3.59,31782,0.0,19,83,0.0,0.0
2019699,790,40341753006,573,5570406,1,1.67,31782,-1.92,19,83,0.0,0.0


# Prefilter items

In [13]:
n_items_before = data_train_matcher['item_id'].nunique()

n_popular = 2100

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=n_popular)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 82059 to 2101


# Make cold-start to warm-start

In [14]:
# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))

#выбираем товары из подготовленного набора
prefiltered_items = list(set(data_train_matcher['item_id']))

# оставляем общих пользователей
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

#оставляем подготовленные товары по всем массивам данных
#data_val_matcher = data_val_matcher[data_val_matcher.item_id.isin(prefiltered_items)]
#data_train_ranker = data_train_ranker[data_train_ranker.item_id.isin(prefiltered_items)]
#data_val_ranker = data_val_ranker[data_val_ranker.item_id.isin(prefiltered_items)]


print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (759224, 13) Users: 1966 Items: 2101
val_matcher
Shape: (244305, 12) Users: 1966 Items: 32059
train_ranker
Shape: (244305, 12) Users: 1966 Items: 32059
val_ranker
Shape: (116866, 12) Users: 1966 Items: 24195


In [15]:
data_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99
11,1364,26984896261,1,999999,1,2.19,31742,0.0,1520,1,0.0,0.0,2.19
12,1364,26984896261,1,999999,1,2.99,31742,-0.4,1520,1,0.0,0.0,2.99
13,1364,26984896261,1,999999,1,3.09,31742,0.0,1520,1,0.0,0.0,3.09
14,1364,26984896261,1,999999,1,2.5,31742,-0.99,1520,1,0.0,0.0,2.5


# Init/train recommender

In [16]:
def recommended_items(calc_type, users_list, number_items, function):
    if calc_type == 0:
        return [function(user, number_items) for user in users_list]
    elif calc_type == 1:
        return users_list.apply(lambda x: function(x, N=number_items))
    else:
        return list(map(lambda x: function(x, N=number_items), users_list))

In [17]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [18]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [19]:
recommender = MainRecommender(data_train_matcher, weighting='bm25', B=0.895, 
                              n_factors=200, regularization=0.001, iterations=35, num_threads=4, 
                              K=1)



  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

In [20]:
from hyperopt import fmin, tpe, hp, Trials
n_iter = 50
random_st = 12

def score_func(params, data=data_train_matcher, TOPK_PRECISION = 5):
    
    #print(f'{params}')
    
    # the function gets a set of variable parameters in "param"
    model_params = {'weighting': params['weighting'],
                    'B': params['B'],
                    'K1': int(params['K1']),
                    'n_factors': int(params['n_factors']),
                    'regularization': params['regularization'],
                    'iterations': int(params['iterations']),
                    'num_threads': params['num_threads'],
                    'K': int(params['K'])
                   }
    recommendation_model = params['model']
    
    # we use this params to create a new CatBoost model
    recommender = MainRecommender(data, **model_params)

    result_eval = data.groupby(USER_COL)[ITEM_COL].unique().reset_index()
    result_eval.columns=[USER_COL, ACTUAL_COL]
    
    if recommendation_model == 'own_rec':
        result_eval['own_rec'] = recommended_items(0, result_eval[USER_COL], N_PREDICT, recommender.get_own_recommendations)
    else:
        result_eval['als_rec'] = recommended_items(0, result_eval[USER_COL], N_PREDICT, recommender.get_als_recommendations)
    TOPK_PRECISION = 5
    TOPK_RECALL = 50
    # precision
    current_precision = sorted(calc_precision(result_eval, TOPK_PRECISION), 
                               key=lambda x: x[1], reverse=True)[0][1]

    if current_precision > 0:
        score = 1/current_precision
    else:
        score = 1e100
    
    return score

In [21]:
"""param={'weighting': hp.choice('weighting', ['bm25','tfidf']),
       'B': hp.uniform('B', 0.8, 0.9),
       'K1': hp.quniform('iterations', 50, 200, 10),
       'n_factors': hp.quniform('n_factors', 100, 300, 4),
       'regularization': hp.uniform('regularization', 0.001, 0.005),
       'iterations': hp.quniform('iterations', 30, 60, 2),
       'num_threads': 4,
       'K': hp.choice('K', [1, 2, 3]),
       'model': hp.choice('model', ['own_rec', 'als_rec'])
      }"""
"""param={'weighting': hp.choice('weighting', ['bm25','tfidf']),
       'B': hp.uniform('B', 0.87, 0.9),
       'K1': hp.quniform('K1', 130, 180, 10),
       'n_factors': hp.quniform('n_factors', 300, 340, 2),
       'regularization': hp.uniform('regularization', 0.0044, 0.005),
       'iterations': hp.quniform('iterations', 42, 50, 1),
       'num_threads': 4,
       'K': hp.choice('K', [1, 2]),
       'model': hp.choice('model', ['own_rec', 'als_rec'])
      }"""
"""param={'weighting': hp.choice('weighting', ['bm25']),
       'B': hp.uniform('B', 0.89, 0.9),
       'K1': hp.quniform('K1', 130, 150, 10),
       'n_factors': hp.quniform('n_factors', 304, 312, 2),
       'regularization': hp.uniform('regularization', 0.0047, 0.0048),
       'iterations': hp.quniform('iterations', 42, 45, 1),
       'num_threads': 4,
       'K': hp.choice('K', [1]),
       'model': hp.choice('model', ['als_rec'])
      }"""
"""param={'weighting': hp.choice('weighting', ['bm25', 'tfidf']),
       'B': 0.8916824806796836,
       'K1': 140,
       'K': 1,
       'n_factors': hp.quniform('n_factors', 290, 350, 2),
       'regularization': hp.uniform('regularization', 0.0044, 0.0052),
       'iterations': hp.quniform('iterations', 42, 46, 1),
       'num_threads': 4,
       'model': hp.choice('model', ['als_rec'])
      }"""
param={'weighting': hp.choice('weighting', ['bm25']),
       'B': hp.uniform('B', 0.89, 0.9),
       'K1': 150,
       'K': 1,
       'n_factors': hp.quniform('n_factors', 190, 350, 2),
       'regularization': hp.uniform('regularization', 0.001, 0.0052),
       'iterations': hp.quniform('iterations', 30, 46, 1),
       'num_threads': 4,
       'model': hp.choice('model', ['als_rec'])
      }

"""best=fmin(score_func, # function to optimize
          space=param, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
         )
best"""

'best=fmin(score_func, # function to optimize\n          space=param, \n          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically\n          max_evals=n_iter, # maximum number of iterations\n         )\nbest'

In [22]:
"""{'B': 0.6521514604522011,
 'K': 1,
 'iterations': 43.0,
 'model': 1,
 'n_factors': 299.0,
 'regularization': 0.00385482503376726,
 'weighting': 1}
 
 {'B': 0.8916824806796836,
 'K': 1,
 'K1': 140.0,
 'iterations': 44.0,
 'model': 1,
 'n_factors': 310.0,
 'regularization': 0.0047527371557352415,
 'weighting': 1}
 
 {'B': 0.8871774904286023,
 'K': 1,
 'iteration_n': 44.0,
 'iterations': 150.0,
 'model': 'als_rec',
 'n_factors': 316.0,
 'regularization': 0.004702184365679667,
 'weighting': 'tfidf'}
 
 {'B': 0.8911317242721445,
 'K': 0,
 'K1': 150.0,
 'iterations': 44.0,
 'model': 0,
 'n_factors': 306.0,
 'regularization': 0.004749095065273229,
 'weighting': 0}
 
 
 {'B': 0.8965492275035548,
 'iterations': 42.0,
 'model': 0,
 'n_factors': 190.0,
 'regularization': 0.0011024214998912698,
 'weighting': 0}
"""

"{'B': 0.6521514604522011,\n 'K': 1,\n 'iterations': 43.0,\n 'model': 1,\n 'n_factors': 299.0,\n 'regularization': 0.00385482503376726,\n 'weighting': 1}\n \n {'B': 0.8916824806796836,\n 'K': 1,\n 'K1': 140.0,\n 'iterations': 44.0,\n 'model': 1,\n 'n_factors': 310.0,\n 'regularization': 0.0047527371557352415,\n 'weighting': 1}\n \n {'B': 0.8871774904286023,\n 'K': 1,\n 'iteration_n': 44.0,\n 'iterations': 150.0,\n 'model': 'als_rec',\n 'n_factors': 316.0,\n 'regularization': 0.004702184365679667,\n 'weighting': 'tfidf'}\n \n {'B': 0.8911317242721445,\n 'K': 0,\n 'K1': 150.0,\n 'iterations': 44.0,\n 'model': 0,\n 'n_factors': 306.0,\n 'regularization': 0.004749095065273229,\n 'weighting': 0}\n \n \n {'B': 0.8965492275035548,\n 'iterations': 42.0,\n 'model': 0,\n 'n_factors': 190.0,\n 'regularization': 0.0011024214998912698,\n 'weighting': 0}\n"

In [23]:
best = {'B': 0.8965492275035548,
        'iterations': 42.0,
        'model': 'als_rec',
        'n_factors': 190.0,
        'regularization': 0.0011024214998912698,
        'weighting': 'bm25',
        'K': 1
       }
 

In [24]:
"""recommender = MainRecommender(data_train_matcher, weighting=best['weighting'],
                              B=best['B'], n_factors=int(best['n_factors']),
                              regularization=best['regularization'], iterations=int(best['iterations']),
                             num_threads=4, K=int(best['K'])
                             )"""

"recommender = MainRecommender(data_train_matcher, weighting=best['weighting'],\n                              B=best['B'], n_factors=int(best['n_factors']),\n                              regularization=best['regularization'], iterations=int(best['iterations']),\n                             num_threads=4, K=int(best['K'])\n                             )"

In [25]:
best['weighting'], param['weighting'][1]

('bm25', <hyperopt.pyll.base.Apply at 0x7f81d2088eb0>)

### Варианты, как получить кандидатов

Можно потом все эти варианты соединить в один

(!) Если модель рекомендует < N товаров, то рекомендации дополняются топ-популярными товарами до N

# Eval recall of matching

### Измеряем recall@k

In [26]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[836548, 856942, 877391, 933913, 948420, 10368..."
1,3,"[946839, 953476, 1053690]"


In [27]:
#result_eval_matcher['own_rec'] = recommended_items(0, result_eval_matcher[USER_COL], N_PREDICT, recommender.get_own_recommendations)

### Пример оборачивания

In [28]:
def evalRecall(df_result, target_col_name, recommend_model, result_col_name='result', 
               actual_col = 'actual', N_PREDICT=50, inplace = False):
    df_result[result_col_name] = recommended_items(0, df_result[target_col_name], N_PREDICT, recommend_model)
    result = np.mean([recall_at_k(a, b, k=N_PREDICT) for a,b in zip(df_result[result_col_name], df_result[actual_col])])
    if not(inplace):
        df_result.drop(columns=result_col_name, inplace=True)
    return result

In [29]:
def evalPrecision(df_result, target_col_name, recommend_model, result_col_name='result', 
                  actual_col = 'actual', N_PREDICT=50, inplace = False):
    df_result[result_col_name] = recommended_items(0, df_result[target_col_name], N_PREDICT, recommend_model)
    result = np.mean([precision_at_k(a, b, k=N_PREDICT) for a,b in zip(df_result[result_col_name], df_result[actual_col])])
    if not(inplace):
        df_result.drop(columns=result_col_name, inplace=True)
    return result

In [30]:
# evalRecall(result_eval_matcher, USER_COL, recommender.get_own_recommendations)

### Recall@50 of matching

In [31]:
TOPK_RECALL = 50

In [32]:
#sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

### Precision@5 of matching

In [33]:
TOPK_PRECISION = 5

In [34]:
#sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

# Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_ranking
- Обучаем *только* на выбранных кандидатах

In [35]:
# -- давние покупки -- | -- 7 недель -- | -- 4 недель -- 

## Подготовка данных для трейна

In [36]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [37]:
# собираем кандитатов с первого этапа (matcher)
"""if best['model']=='own_rec':
    df_match_candidates['candidates'] = recommended_items(0, df_match_candidates[USER_COL], N_PREDICT, recommender.get_own_recommendations)
else:"""
df_match_candidates['candidates'] = recommended_items(0, df_match_candidates[USER_COL], N_PREDICT, recommender.get_als_recommendations)

In [38]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,790,"[1112238, 1029743, 878996, 5569230, 5574377, 5..."
1,1795,"[986912, 1105488, 1127179, 5568489, 838186, 82..."


In [39]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [40]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [41]:
#df_match_candidates.head(4)

### Check warm start

In [42]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (196600, 2) Users: 1966 Items: 298


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [43]:
data_train_ranker.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2019698,790,40341753006,573,1110624,1,3.59,31782,0.0,19,83,0.0,0.0
2019699,790,40341753006,573,5570406,1,1.67,31782,-1.92,19,83,0.0,0.0
2019700,790,40341753006,573,5995101,1,2.5,31782,-0.33,19,83,0.0,0.0
2019701,790,40341753006,573,5995177,1,2.5,31782,-0.33,19,83,0.0,0.0
2019702,1795,40341753025,573,890536,1,0.69,31782,0.0,100,83,0.0,0.0


In [44]:
def df_preparation(data, keep_columns=[USER_COL, ITEM_COL, 'quantity', 'sales_value', 'retail_disc', 
                                       'week_no', 'store_id']):
    data = data[keep_columns].copy()
    return data

In [45]:
#df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL, 'basket_id', 'quantity', 'sales_value', 'retail_disc', 'week_no', 'store_id']].copy()
#df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL, 'basket_id', 'quantity', 'sales_value', 'retail_disc', 'week_no', 'store_id']].copy()
columns = [USER_COL, ITEM_COL, 'quantity', 'sales_value', 'retail_disc', 'week_no', 'store_id']

df_ranker_train = df_preparation(data_train_ranker, keep_columns=columns)
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')
df_ranker_train['target'].fillna(0, inplace= True)

In [46]:
df_ranker_train.target.value_counts()

0.0    185509
1.0     18946
Name: target, dtype: int64

(!) На каждого юзера 50 item_id-кандидатов

In [47]:
df_ranker_train['target'].mean()

0.09266586779486928

## Подготавливаем фичи для обучения модели

### Описательные фичи

In [48]:
#item_features.head(2)
#user_features.head(2)

In [49]:
def df_merge_features(data, item_features, user_features):
    data = data.merge(item_features, on='item_id', how='left')
    data = data.merge(user_features, on='user_id', how='left')
    return data

In [50]:
df_ranker_train = df_merge_features(df_ranker_train, item_features, user_features)


**Фичи user_id:**
    - Средний чек
    - Средняя сумма покупки 1 товара в каждой категории
    - Кол-во покупок в каждой категории
    - Частотность покупок раз/месяц
    - Долю покупок в выходные
    - Долю покупок утром/днем/вечером

**Фичи item_id**:
    - Кол-во покупок в неделю
    - Среднее ол-во покупок 1 товара в категории в неделю
    - (Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю)
    - Цена (Можно посчитать из retil_train.csv)
    - Цена / Средняя цена товара в категории
    
**Фичи пары user_id - item_id**
    - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
    - (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
    - (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

### Поведенческие фичи

##### Чтобы считать поведенческие фичи, нужно учесть все данные что были до data_val_ranker

In [51]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,quantity,sales_value,retail_disc,week_no,store_id,target,manufacturer,department,...,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,790,1112238,,,,,,0.0,3173,PRODUCE,...,VALUE ADDED FRUIT,INSTORE CUT FRUIT,,45-54,U,50-74K,Homeowner,Unknown,1,None/Unknown
1,790,1029743,,,,,,0.0,69,GROCERY,...,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Homeowner,Unknown,1,None/Unknown
2,790,878996,,,,,,0.0,2,PRODUCE,...,GRAPES,GRAPES RED,18 LB,45-54,U,50-74K,Homeowner,Unknown,1,None/Unknown
3,790,5569230,1.0,4.19,-0.5,85.0,31782.0,1.0,1208,GROCERY,...,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,U,50-74K,Homeowner,Unknown,1,None/Unknown
4,790,5574377,,,,,,0.0,1208,GROCERY,...,SOFT DRINKS,SFT DRNK MLT-PK BTL CARB (EXCP,24 OZ,45-54,U,50-74K,Homeowner,Unknown,1,None/Unknown


## !!! Пока выполните нотбук без этих строк, потом вернитесь и запустите их, обучите ранкер и посмотрите на метрики с ранжированием

In [52]:
def df_add_features(data, initial_df, ITEM_COL='item_id', USER_COL='user_id'):
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)
    #data = data.merge(initial_df.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)
    #data = data.merge(initial_df.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/initial_df.week_no.nunique(), how='left',on=ITEM_COL)
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_user')/initial_df.user_id.nunique(), how='left',on=ITEM_COL)
    data = data.merge(initial_df.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/initial_df.week_no.nunique(), how='left',on=USER_COL)
    
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/initial_df.basket_id.nunique(), how='left',on=ITEM_COL)
    #data = data.merge(initial_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/initial_df.basket_id.nunique(), how='left',on=USER_COL)
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/initial_df.basket_id.nunique(), how='left',on=ITEM_COL)
    data = data.merge(initial_df.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/initial_df.basket_id.nunique(), how='left',on=USER_COL)
    return data

In [53]:
#df_ranker_train = df_add_features(df_ranker_train, df_join_train_matcher, ITEM_COL=ITEM_COL, USER_COL=USER_COL)
df_ranker_train = df_add_features(df_ranker_train, data_train_matcher, ITEM_COL=ITEM_COL, USER_COL=USER_COL)

In [54]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [55]:
def train_data_preparation(data):
    if 'quantity' in data.columns:
        data['quantity'].fillna(value=0, inplace=True)
    if 'sales_value' in data.columns:
        data['sales_value'].fillna(value=0, inplace=True)
    if 'retail_disc' in data.columns:
        data['retail_disc'].fillna(value=0, inplace=True)
    if 'age_desc' in data.columns:
        data['age_desc'].fillna(value='19-65+', inplace=True)
    if 'marital_status_code' in data.columns:
        data['marital_status_code'].fillna( value='C', inplace=True)
    if 'income_desc' in data.columns:
        data['income_desc'].fillna( value='0-250K+', inplace=True)
    if 'homeowner_desc' in data.columns:
        data['homeowner_desc'].fillna( value='Unknown', inplace=True)
    if 'hh_comp_desc' in data.columns:
        data['hh_comp_desc'].fillna( value='Unknown', inplace=True)
    if 'household_size_desc' in data.columns:
        data['household_size_desc'].fillna( value='Unknown', inplace=True)
    if 'kid_category_desc' in data.columns:
        data['kid_category_desc'].fillna( value='None/Unknown', inplace=True)
    #if 'store_id' in data.columns:
        #data['store_id'] = X_train['store_id'].astype(str)
    #if 'basket_id' in data.columns:
        #data['basket_id'] = X_train['basket_id'].astype(str)
    
    data.replace('nan', 'U', inplace=True)
    #if 'week_no' in data.columns:
        #data['week_no'].fillna(value=0, inplace=True)
        #data['week_no'] = data['week_no'].astype(int)
    
    return data

In [56]:
X_train = train_data_preparation(X_train)

In [57]:
#unimportant_features = ['brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 
#                        'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 
#                        'household_size_desc', 'kid_category_desc', 'item_quantity_per_week'
#                       ]
"""unimportant_features = ['basket_id']
X_train.drop(columns = unimportant_features, inplace=True)"""

"unimportant_features = ['basket_id']\nX_train.drop(columns = unimportant_features, inplace=True)"

In [58]:
num_feats = [col for col in X_train.columns if (X_train[col].dtype == 'int64') or (X_train[col].dtype == 'float64')]
num_feats = set(num_feats) - set(['manufacturer','user_id', 'item_id'])
#num_feats

In [59]:
cat_feats = X_train.columns[0:].tolist()
cat_feats = list(set(cat_feats) - set(num_feats))
#cat_feats

In [60]:
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [61]:
y_train.mean()

target    0.092666
dtype: float64

In [62]:
#X_train.info()

## Обучение модели ранжирования

In [63]:
# week for validation
"""data = X_train
data['target'] = y_train

X_val = data[data['week_no'] >= data['week_no'].max() - 1]
X_train = data[data['week_no'] < data['week_no'].max() - 1]

y_train = X_train[['target']]
X_train = X_train.drop('target', axis=1)

y_val = X_val[['target']]
X_val = X_val.drop('target', axis=1)"""

"data = X_train\ndata['target'] = y_train\n\nX_val = data[data['week_no'] >= data['week_no'].max() - 1]\nX_train = data[data['week_no'] < data['week_no'].max() - 1]\n\ny_train = X_train[['target']]\nX_train = X_train.drop('target', axis=1)\n\ny_val = X_val[['target']]\nX_val = X_val.drop('target', axis=1)"

In [64]:
#feature_weight = {"week_no":0.005,"store_id":0.01, "quantity":0.01, "sales_value":0.01, 
#                  "retail_disc":0.01, "basket_id":0.11, "retail_disc":0.005, "item_id":0.1, "user_id":0.1}
"""feature_weight = {"week_no":0.08, "retail_disc":0.5}

#feature_weight = {"quantity":1}

model = CatBoostClassifier(
    iterations=70,
    learning_rate=0.05,
    depth=10,
    random_seed=12,
    logging_level='Silent',
    cat_features=cat_feats,
    feature_weights = feature_weight,
    custom_metric=['Logloss', 'Precision', 'F1', 'Recall']
)
#use_best_model=True

model.fit(
    X_train, y_train,
    plot=True
)
#eval_set=(X_val, y_val),"""

'feature_weight = {"week_no":0.08, "retail_disc":0.5}\n\n#feature_weight = {"quantity":1}\n\nmodel = CatBoostClassifier(\n    iterations=70,\n    learning_rate=0.05,\n    depth=10,\n    random_seed=12,\n    logging_level=\'Silent\',\n    cat_features=cat_feats,\n    feature_weights = feature_weight,\n    custom_metric=[\'Logloss\', \'Precision\', \'F1\', \'Recall\']\n)\n#use_best_model=True\n\nmodel.fit(\n    X_train, y_train,\n    plot=True\n)\n#eval_set=(X_val, y_val),'

In [65]:
#model.get_feature_importance(prettified = True)

In [66]:
def feature_weights_transf(params, columns):
    feature_weights_str = ""
    for item in columns:
        if item in params.keys():
            if feature_weights_str == '':
                feature_weights_str = feature_weights_str + f"{item}:{params[item]}"
            else:
                feature_weights_str = feature_weights_str + f",{item}:{params[item]}"
    return feature_weights_str

In [67]:
from hyperopt import fmin, tpe, hp, Trials
n_iter = 10
random_st = 12

def score_func(params, data=df_ranker_train, X_train=X_train,
               y_train=y_train, #X_val=X_val, y_val=y_val,
               random_state_val=random_st,
               TOPK_PRECISION = 5
              ):
    feature_weights_str = feature_weights_transf(params, X_train.columns)

    #feature_weights_str = f'({",".join([str(item) for item in params['feature_weights']])})
    # the function gets a set of variable parameters in "param"
    model_params = {'iterations': int(params['iterations']),
                    'learning_rate': params['learning_rate'],
                    'depth': int(params['depth']),
                    'logging_level': params['logging_level'],
                    'cat_features': params['cat_features'],
                    'feature_weights': feature_weights_str
                   }
    
    # we use this params to create a new CatBoost model
    model = CatBoostClassifier(random_seed=random_state_val, **model_params)
    
    #use_best_model=True
    model.fit(X=X_train, y=y_train)
    
    train_preds = model.predict_proba(X_train)
    
    data['proba_item_purchase'] = train_preds[:,1]
    result_eval = data.groupby(USER_COL)[ITEM_COL].unique().reset_index()
    result_eval.columns=[USER_COL, ACTUAL_COL]
    
    def rerank(user_id):
        return data[data[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()
    
    result_eval['reranked_als_rec'] = result_eval[USER_COL].apply(lambda user_id: rerank(user_id))
    current_precision = sorted(calc_precision(result_eval, TOPK_PRECISION), key=lambda x: x[1], reverse=True)[0][1]

    # precision
    if current_precision > 0:
        score = 1/current_precision
    else:
        score = 1e100
    
    return score

In [68]:
param={'iterations': hp.uniform('iterations', 100, 150),
       'learning_rate': hp.uniform('learning_rate', 0.07, 0.1),
       'depth': hp.uniform('depth', 8, 16),
       'logging_level': 'Silent',
       'cat_features': cat_feats,
       'random_state_val': 12
      }

"""    elif item == 'sales_value':
        param.update({item:hp.uniform(item, 0.07, 0.09)})
    elif item == 'quantity':
        param.update({item:hp.uniform(item, 0.07, 0.09)})
    elif item == 'store_id':
        param.update({item:hp.uniform(item, 0.07, 0.09)})
    elif item == 'retail_disc':
        param.update({item:hp.uniform(item, 0.3, 0.9)})
    """
"""feature_weight = {"week_no":0.08, "sales_value":0.083, "quantity":0.084, "store_id":0.08,
                  "retail_disc":0.5}"""
param.update({'week_no':hp.uniform('week_no', 0.074,0.08)})
param.update({'retail_disc':hp.uniform('retail_disc', 0.6,0.94)})
param.update({'sales_value':hp.uniform('sales_value', 0.08,0.09)})
param.update({'quantity':hp.uniform('quantity', 0.076,0.09)})
param.update({'store_id':hp.uniform('store_id', 0.085,0.1)})
"""for item in X_train.columns:
    if item == 'week_no':
        param.update({item:hp.uniform(item, 0.07, 0.9)})
    elif item == 'brand':
        param.update({item:hp.uniform(item, 0.3, 0.5)})
    elif item == 'commodity_desc':
        param.update({item:hp.uniform(item, 0.6, 1)})
    elif item == 'age_desc':
        param.update({item:hp.uniform(item, 0.6, 1)})
    else:
        param.update({item:hp.uniform(item, 0.0001, 1)})"""

print(f'{param}')

{'iterations': <hyperopt.pyll.base.Apply object at 0x7f81d4063760>, 'learning_rate': <hyperopt.pyll.base.Apply object at 0x7f81d4063640>, 'depth': <hyperopt.pyll.base.Apply object at 0x7f81d4063460>, 'logging_level': 'Silent', 'cat_features': ['age_desc', 'income_desc', 'homeowner_desc', 'household_size_desc', 'hh_comp_desc', 'department', 'marital_status_code', 'manufacturer', 'brand', 'kid_category_desc', 'curr_size_of_product', 'user_id', 'sub_commodity_desc', 'commodity_desc', 'item_id'], 'random_state_val': 12, 'week_no': <hyperopt.pyll.base.Apply object at 0x7f81d1f731c0>, 'retail_disc': <hyperopt.pyll.base.Apply object at 0x7f81dd1f4c10>, 'sales_value': <hyperopt.pyll.base.Apply object at 0x7f81dd1f4e20>, 'quantity': <hyperopt.pyll.base.Apply object at 0x7f81dd1f4d90>, 'store_id': <hyperopt.pyll.base.Apply object at 0x7f81dd1f4c40>}


In [69]:
best=fmin(score_func, # function to optimize
          space=param, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
         )
best

100%|████████| 10/10 [39:30<00:00, 237.01s/trial, best loss: 1.1634512960113756]


{'depth': 9.822802531494748,
 'iterations': 124.01986043075092,
 'learning_rate': 0.08442849640174455,
 'quantity': 0.07628255066156421,
 'retail_disc': 0.7872924670058793,
 'sales_value': 0.08757648685168383,
 'store_id': 0.08713841860786548,
 'week_no': 0.07587184219443838}

In [70]:
"""
{'age_desc': 0.7123073636565711,
 'brand': 0.3514451199790951,
 'commodity_desc': 0.7502414842684827,
 'curr_size_of_product': 0.9980096123334959,
 'department': 0.2381266386575291,
 'depth': 8.891231078146838,
 'hh_comp_desc': 0.06150869801147496,
 'homeowner_desc': 0.4697454898882302,
 'household_size_desc': 0.1311161495203425,
 'income_desc': 0.5208016708622205,
 'item_freq': 0.8159060191832026,
 'item_id': 0.1005737038414557,
 'item_quantity_per_user': 0.619439124360935,
 'item_quantity_per_week': 0.4043988417784879,
 'iterations': 32.8740719606796,
 'kid_category_desc': 0.568428941905317,
 'learning_rate': 0.018901095632145003,
 'manufacturer': 0.7206477482942951,
 'marital_status_code': 0.653818371182886,
 'retail_disc': 0.7263458604417883,
 'sub_commodity_desc': 0.43104609520738457,
 'total_item_sales_value': 0.9927342971058644,
 'total_quantity_value': 0.7292289194190094,
 'user_id': 0.30236219241627343,
 'week_no': 0.997843855630844}
"""

"\n{'age_desc': 0.7123073636565711,\n 'brand': 0.3514451199790951,\n 'commodity_desc': 0.7502414842684827,\n 'curr_size_of_product': 0.9980096123334959,\n 'department': 0.2381266386575291,\n 'depth': 8.891231078146838,\n 'hh_comp_desc': 0.06150869801147496,\n 'homeowner_desc': 0.4697454898882302,\n 'household_size_desc': 0.1311161495203425,\n 'income_desc': 0.5208016708622205,\n 'item_freq': 0.8159060191832026,\n 'item_id': 0.1005737038414557,\n 'item_quantity_per_user': 0.619439124360935,\n 'item_quantity_per_week': 0.4043988417784879,\n 'iterations': 32.8740719606796,\n 'kid_category_desc': 0.568428941905317,\n 'learning_rate': 0.018901095632145003,\n 'manufacturer': 0.7206477482942951,\n 'marital_status_code': 0.653818371182886,\n 'retail_disc': 0.7263458604417883,\n 'sub_commodity_desc': 0.43104609520738457,\n 'total_item_sales_value': 0.9927342971058644,\n 'total_quantity_value': 0.7292289194190094,\n 'user_id': 0.30236219241627343,\n 'week_no': 0.997843855630844}\n"

In [71]:
feature_weights_str = feature_weights_transf(best, X_train.columns)

model_params = {'iterations': int(best['iterations']),
                'learning_rate': best['learning_rate'],
                'depth': int(best['depth']),
                'logging_level': 'Silent',
                'cat_features': cat_feats,
                'feature_weights': feature_weights_str
                }

"""y_train = data['target']
X_train = data.drop('target', axis=1)"""

"y_train = data['target']\nX_train = data.drop('target', axis=1)"

In [72]:
# computing the score on the test set
model = CatBoostClassifier(random_seed=12, **model_params)
model.fit(
    X_train, y_train,
    plot=False
)

<catboost.core.CatBoostClassifier at 0x7f81d4073eb0>

In [73]:
model.get_feature_importance(prettified = True)

Unnamed: 0,Feature Id,Importances
0,store_id,34.50912
1,sales_value,22.77168
2,retail_disc,14.00539
3,week_no,7.49801
4,curr_size_of_product,2.249575
5,user_quantity_per_week,1.990408
6,user_freq_per_basket,1.671887
7,item_freq,1.636524
8,item_freq_per_basket,1.568332
9,item_id,1.459561


In [74]:
#train_preds = model.predict_proba(data.drop('target', axis=1))
train_preds = model.predict_proba(X_train)
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]
#df_ranker_predict.head(12)

## Подведем итоги

    Мы обучили модель ранжирования на покупках из сета data_train_ranker и на кандитатах от als_recommendations, что является тренировочным сетом, и теперь наша задача предсказать и оценить именно на тестовом сете.

# Evaluation on test dataset

In [75]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


## Eval matching on test dataset

In [76]:
%%time
result_eval_ranker['own_rec'] = recommended_items(0, result_eval_ranker[USER_COL], N_PREDICT, recommender.get_own_recommendations)
result_eval_ranker['als_rec'] = recommended_items(0, result_eval_ranker[USER_COL], N_PREDICT, recommender.get_als_recommendations)

CPU times: user 54.7 s, sys: 2.82 s, total: 57.5 s
Wall time: 1min 2s


In [77]:
f'TOPK_PRECISION = {TOPK_PRECISION}'

'TOPK_PRECISION = 5'

In [78]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

#sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

## Eval re-ranked matched result on test dataset
    Вспомним df_match_candidates сет, который был получен own_recommendations на юзерах, набор пользователей мы фиксировали и он одинаков, значи и прогноз одинаков, поэтому мы можем использовать этот датафрейм для переранжирования.
    

In [79]:
def rerank(user_id, df = df_ranker_predict):
    return df[df[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [80]:
"""data_val_ranker = df_preparation(data_val_ranker, keep_columns=columns)
data_val_ranker = df_merge_features(data_val_ranker, item_features, user_features)
data_val_ranker = df_add_features(data_val_ranker, data_train_matcher, ITEM_COL=ITEM_COL, USER_COL=USER_COL)
data_val_ranker = train_data_preparation(data_val_ranker)

train_preds = model.predict_proba(data_val_ranker)
data_val_ranker['proba_item_purchase'] = train_preds[:,1]"""

result_eval_ranker['reranked_als_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True),  f'TOPK_PRECISION = {TOPK_PRECISION}', sep='\n')

('reranked_als_rec', 0.16632756866734272)
('als_rec', 0.09491353001017258)
('own_rec', 0.03234994913530021)
TOPK_PRECISION = 5


In [81]:
#precision@5 >= 0.25

In [82]:
# смотрим на метрики выше и сравниваем что с ранжированием и без, добавляем фичи и то же смотрим

# Оценка на тесте для выполнения курсового проекта

In [83]:
TOPK_PRECISION = 5

df_test = pd.read_csv('retail_test1.csv')
#df_transactions = pd.read_csv('retail_train.csv')

In [84]:
#df_test = df_test[df_test.user_id.isin(common_users)]

In [85]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [86]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [87]:
"""df_test = df_preparation(df_test, keep_columns=columns)
df_test = df_merge_features(df_test, item_features, user_features)
df_test = df_add_features(df_test, data_train_matcher, ITEM_COL=ITEM_COL, USER_COL=USER_COL)
df_test = train_data_preparation(df_test)

train_preds = model.predict_proba(df_test)
df_test['proba_item_purchase'] = train_preds[:,1]"""

"df_test = df_preparation(df_test, keep_columns=columns)\ndf_test = df_merge_features(df_test, item_features, user_features)\ndf_test = df_add_features(df_test, data_train_matcher, ITEM_COL=ITEM_COL, USER_COL=USER_COL)\ndf_test = train_data_preparation(df_test)\n\ntrain_preds = model.predict_proba(df_test)\ndf_test['proba_item_purchase'] = train_preds[:,1]"

In [88]:
#sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

In [89]:
result_test['reranked_als_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))

In [90]:
print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True),  f'TOPK_PRECISION = {TOPK_PRECISION}', sep='\n')

('reranked_als_rec', 0.13549911399881784)
TOPK_PRECISION = 5


  return flags.sum() / len(recommended_list)
