# #4 Content-based recommendations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
USER_COL = 'user_id'
ITEM_COL = 'item_id'

In [2]:
from src.recommenders import MainRecommender
from src.metrics import precision_at_k, recall_at_k
from src.utils import  prefilter_items
print('Ok')

Ok


### Prepare data

In [3]:
data = pd.read_csv('../2_baselines_implicit/retail_train.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)


test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

item_features = pd.read_csv('../3_collaborative_filtering/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

In [4]:
data_train = prefilter_items(data_train, item_features)

== Starting prefilter info ==
shape: (2278490, 12)
# users: 2499
# items: 86865
Sparsity: 1.050%
== Ending prefilter info ==
shape: (641574, 13)
# users: 2474
# items: 5000
Sparsity: 5.187%
new_columns: {'price'}


### Train recommender

In [5]:
mr = MainRecommender(data_train, weighting=True)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

### Get recommendations

In [6]:
sim_users = mr.model.similar_users(0, N=5)
recs = [(mr.id_to_userid[uid], sim) for (uid, sim) in zip(sim_users[0], sim_users[1])]
print('user_id  similarity')
for rec in recs:
    print(str(rec[0]).ljust(8), rec[1])

user_id  similarity
1        1.0
1900     0.77986956
993      0.72640586
712      0.7074581
2441     0.67684877


In [7]:
# recommend items by similar_users
user_to_recommend = mr.id_to_userid[0]
mr.get_similar_users_recommendation(user_to_recommend, N=5)


[954486, 9487420, 889618, 1037332, 9245479]

In [8]:
data[(data.user_id == 1900) & (data.item_id == 954486)]

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
1501453,1900,33472696781,440,954486,1,0.89,329,0.0,1458,64,0.0,0.0
1509496,1900,33493601358,443,954486,2,1.78,384,0.0,1410,64,0.0,0.0
1526711,1900,33655811672,447,954486,2,1.78,329,0.0,1421,65,0.0,0.0


In [9]:
item_id = mr.id_to_itemid[2]
recs = mr._get_similar_items(item_id, N=5)


recs = [(mr.id_to_itemid[iid], sim) for (iid, sim) in zip(recs[0], recs[1])]
print('item_id  similarity')
for rec in recs:
    print(str(rec[0]).ljust(8), rec[1])

item_id  similarity
201704   0.99999994
43871    0.94874406
28897    0.94142014
10456575 0.6790069
10456371 0.6297973


In [13]:
sim_item_recs = mr.get_similar_items_recommendation(user_to_recommend)
print(sim_item_recs)

[7166839, 855325, 1058997, 1023720, 966103]


In [21]:
bought = data_train[data_train['user_id'] == user_to_recommend].item_id.unique()

In [25]:
recall_at_k(recommended_list=sim_item_recs, bought_list=bought[:5])

0.0

In [31]:
np.mean([1, 2, 3])

2.0

In [101]:
def print_eval_stats(name, recs, bought):
    print('{:*^21}'.format(name))
    p = precision_at_k(recommended_list=recs, bought_list=bought, k=5)
    average_precisions[name].append(p)
    print('precision@5:', p)
    r = recall_at_k(recommended_list=recs, bought_list=bought, k=50)
    average_recalls[name].append(r)
    print('recall@50:', r)

In [102]:
N=50

model_names =  ('similar users', 'similar items', 'ALS', 'own recommendations')
average_precisions = {k: [] for k in model_names}
average_recalls = {k: [] for k in model_names}

for user in data_test.user_id.unique()[:10]:
    print('USER: {}'.format(user))
    actual = data_test[data_test.user_id==user].item_id.unique().tolist()
    
    sim_user_rec = mr.get_similar_users_recommendation(user, N=N)
    sim_item_rec = mr.get_similar_items_recommendation(user, N=N)
    als_rec = mr.get_als_recommendations(user, N=N)
    own_rec = mr.get_own_recommendations(user, N=N)
    
    for rec, name in ((sim_user_rec, 'similar users'), (sim_item_rec, 'similar items'), (als_rec, 'ALS'), (own_rec, 'own recommendations')):
        print_eval_stats(name, rec, actual)
    print()

USER: 338
****similar users****
precision@5: 0.0
recall@50: 0.0380952380952381
****similar items****
precision@5: 0.2
recall@50: 0.05714285714285714
*********ALS*********
precision@5: 0.6
recall@50: 0.14285714285714285
*own recommendations*
precision@5: 0.0
recall@50: 0.0380952380952381

USER: 2120
****similar users****
precision@5: 0.0
recall@50: 0.0
****similar items****
precision@5: 0.0
recall@50: 0.0
*********ALS*********
precision@5: 0.0
recall@50: 0.0
*own recommendations*
precision@5: 0.0
recall@50: 0.0

USER: 2324
****similar users****
precision@5: 0.0
recall@50: 0.015384615384615385
****similar items****
precision@5: 0.2
recall@50: 0.03076923076923077
*********ALS*********
precision@5: 0.4
recall@50: 0.1076923076923077
*own recommendations*
precision@5: 0.0
recall@50: 0.12307692307692308

USER: 514
****similar users****
precision@5: 0.2
recall@50: 0.07692307692307693
****similar items****
precision@5: 0.0
recall@50: 0.07692307692307693
*********ALS*********
precision@5: 0.4
re

In [122]:
print('average precision@5 by 10 test users'.rjust(58))
for key, value in average_precisions.items():
    print('{}  {:.6f}'.format(key.rjust(20), np.mean(value)))

                      average precision@5 by 10 test users
       similar users  0.060000
       similar items  0.180000
                 ALS  0.320000
 own recommendations  0.060000


In [118]:
print('average recall@50 by 10 test users'.rjust(56))
for key, value in average_recalls.items():
    print('{}  {:.6f}'.format(key.rjust(20), np.mean(value)))

                      average recall@50 by 10 test users
       similar users  0.020883
       similar items  0.043361
                 ALS  0.081609
 own recommendations  0.075305
