# #4 Content-based recommendations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
USER_COL = 'user_id'
ITEM_COL = 'item_id'

In [2]:
from src.recommenders import MainRecommender
from src.metrics import precision_at_k, recall_at_k
from src.utils import  prefilter_items
print('Ok')

Ok


In [3]:
class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, weighting=None,fake_id=99999):
        
        self.fake_id = fake_id
        # Топ покупок каждого юзера
        self.top_purchases = data.groupby([USER_COL, ITEM_COL])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases[ITEM_COL] != self.fake_id]

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby(ITEM_COL)['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases[ITEM_COL] != self.fake_id]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
        self.fitted = True

    @staticmethod
    def _prepare_matrix(data):
        """Готовит user-item матрицу"""
        user_item_matrix = pd.pivot_table(data,
                                          index=USER_COL, columns=ITEM_COL,
                                          values='price',
                                          aggfunc='sum',
                                          fill_value=0
                                          )

        user_item_matrix = user_item_matrix.astype(float)  # необходимый тип матрицы для implicit

        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=20, regularization=0.002, iterations=15, num_threads=4):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).tocsr())

        return model

    def _update_dict(self, user_id):
        """Если появился новый user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():
            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})
            print('new user: %d\tusers count: %d' % (user_id, len(list(self.userid_to_id.values()))))

    def _get_similar_items(self, item_id, N):
        """Находит товар, похожий на item_id"""
        recs = self.model.similar_items(self.itemid_to_id[item_id], N)
        return recs

    def _extend_with_top_popular(self, recommendations, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]
        recommendations = recommendations[:N]
        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        userid = self.userid_to_id[user]
        user_item_matrix = csr_matrix(self.user_item_matrix).tocsr()

        res = [self.id_to_itemid[rec] for rec in model.recommend(userid=userid,
                                    user_items=user_item_matrix[userid],
                                    N=N,
                                    filter_already_liked_items=False,
                                    filter_items=[self.itemid_to_id[self.fake_id]],
                                    recalculate_user=True)[0]]

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model, N=N)

    def get_own_recommendations(self, user, N=5):
        """Рекомендуем товары среди тех, которые юзер уже купил"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.own_recommender, N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""
        self._update_dict(user_id=user)
        
        top_users_purchases = self.top_purchases[self.top_purchases[USER_COL] == user].head(N)

        res = top_users_purchases[ITEM_COL].apply(lambda x: self.id_to_itemid[
                                                            self._get_similar_items(x, N=2)[0][1]
                                                            ]).tolist()
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res
    
    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""
        self._update_dict(user_id=user)
        
        res = []

        # Находим топ-N похожих пользователей
        similar_users = self.model.similar_users(self.userid_to_id[user], N=N+1)
        similar_users = [pair for pair in similar_users[0]]
        similar_users = similar_users[1:]   # удалим юзера из запроса
        
        for userid in similar_users:
            res.extend(self.get_own_recommendations(self.id_to_userid[userid], N=N))
            
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

### Prepare data

In [4]:
data = pd.read_csv('../data/retail_train.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)

test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

item_features = pd.read_csv('../features_data/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

In [5]:
data_train = prefilter_items(data_train, item_features)

== Starting prefilter info ==
shape: (2278490, 12)
# users: 2499
# items: 86865
Sparsity: 1.050%
== Ending prefilter info ==
shape: (641574, 13)
# users: 2474
# items: 5000
Sparsity: 5.187%
[1mnew_columns:[0m {'price'}


In [6]:
data_train.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1.39
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0,1.5
6,2375,26984851516,1,99999,1,1.57,364,-0.68,1642,1,0.0,0.0,1.57
8,2375,26984851516,1,1102651,1,1.89,364,0.0,1642,1,0.0,0.0,1.89
19,1130,26984905972,1,1048462,1,1.19,31642,-0.8,1340,1,0.0,0.0,1.19


In [7]:
data_test = data_test.loc[data_test.user_id.isin(data_train.user_id.unique())]

### Train recommender

In [8]:
mr = MainRecommender(data_train)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

### Get recommendations

In [9]:
user_to_recommend = 1

In [10]:
sim_users = mr.model.similar_users(mr.userid_to_id[user_to_recommend], N=5)
recs = [(mr.id_to_userid[uid], sim) for uid, sim in zip(sim_users[0], sim_users[1])]
print('user_id    similarity')
for rec in recs:
    print(str(int(rec[0])).ljust(10), rec[1])

user_id    similarity
1          0.99999994
367        0.8556664
1234       0.8379306
1121       0.83521575
1376       0.83106863


In [11]:
# recommend items by similar_users with extend popular
mr.get_similar_users_recommendation(user_to_recommend, N=5)

[904360, 883404, 1133018, 951590, 961554]

In [12]:
item_to_recommend = 0
recs = mr.model.similar_items(item_to_recommend, N=5)


recs = [(mr.id_to_itemid[iid], sim) for (iid, sim) in zip(recs[0], recs[1])]
print('item_id    similarity')
for rec in recs:
    print(str(int(rec[0])).ljust(10), rec[1])

item_id    similarity
28897      0.9999999
43871      0.7217138
201704     0.71223605
99999      0.4713153
938141     0.461287


In [13]:
sim_item_recs = mr.get_similar_items_recommendation(user_to_recommend)
print(sim_item_recs)

[1006546, 894968, 1049897, 927498, 1082212]


In [14]:
recs = mr.get_als_recommendations(user_to_recommend, 50)
bought = data_train[data_train['user_id'] == user_to_recommend].item_id.unique()

In [15]:
recall_at_k(recommended_list=recs, bought_list=bought), precision_at_k(recommended_list=recs, bought_list=bought)

(0.013986013986013986, 0.4)

In [16]:
def print_eval_stats(name, recs, bought):
    print('{:*^21}'.format(name))
    p = precision_at_k(recommended_list=recs, bought_list=bought, k=5)
    average_precisions[name].append(p)
    print('precision@5: {:.3f}'.format(p))
    r = recall_at_k(recommended_list=recs, bought_list=bought, k=50)
    average_recalls[name].append(r)
    print('recall@50: {:.3f}'.format(r))

In [17]:
%%time
N=50

# i want to store evaluation data
model_names =  ('similar users', 'similar items', 'ALS', 'own recommendations')
average_precisions = {k: [] for k in model_names}
average_recalls = {k: [] for k in model_names}

for user in data_test.user_id.unique():
    actual = data_test[data_test.user_id==user].item_id.unique().tolist()
    
    sim_user_rec = mr.get_similar_users_recommendation(user, N=N)
    sim_item_rec = mr.get_similar_items_recommendation(user, N=N)
    als_rec = mr.get_als_recommendations(user, N=N)
    own_rec = mr.get_own_recommendations(user, N=N)
    
#     print('USER: {}'.format(user))    
    for rec, name in ((sim_user_rec, 'similar users'), (sim_item_rec, 'similar items'), (als_rec, 'ALS'), (own_rec, 'own recommendations')):
        average_precisions[name].append(precision_at_k(rec, actual, k=5))
        average_recalls[name].append(recall_at_k(rec, actual, k=50))

CPU times: user 6h 15min 31s, sys: 22min 5s, total: 6h 37min 36s
Wall time: 6h 21min 8s


In [18]:
print('average precision@5 by 10 test users'.rjust(58))
for key, value in average_precisions.items():
    print('{}  {:.6f}'.format(key.rjust(20), np.mean(value)))

                      average precision@5 by 10 test users
       similar users  0.082946
       similar items  0.075095
                 ALS  0.132674
 own recommendations  0.082946


In [19]:
print('average recall@50 by 10 test users'.rjust(56))
for key, value in average_recalls.items():
    print('{}  {:.6f}'.format(key.rjust(20), np.mean(value)))

                      average recall@50 by 10 test users
       similar users  0.058604
       similar items  0.038477
                 ALS  0.074018
 own recommendations  0.058584
