# #4 Content-based recommendations

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [100]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight


class MainRecommender:
    """Рекоммендации, которые можно получить из ALS
    
    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """
    
    def __init__(self, data, weighting=True):
        
        # your_code. Это не обязательная часть. Но если вам удобно что-либо посчитать тут - можно это сделать
        
        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix)
        
        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T 
        
        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
     
    @staticmethod
    def prepare_matrix(data):
        
        user_item_matrix = pd.pivot_table(data, 
                                  index='user_id', 
                                  columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='sum', 
                                  fill_value=0
                                 ).astype(float) # необходимый тип матрицы для implicit

        
        return user_item_matrix
    
    @staticmethod
    def prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""
        
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))
        
        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id
     
    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""
    
        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())
        
        return own_recommender
    
    @staticmethod
    def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""
        
        model = AlternatingLeastSquares(factors=n_factors, 
                                             regularization=regularization,
                                             iterations=iterations,  
                                             num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())
        
        return model

    def get_similar_items_recommendation(self, item, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        recs = self.model.similar_items(self.itemid_to_id[item], N=N)


        assert len(recs) == N, 'Количество рекомендаций != {}'.format(N)
        return recs
    
    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""
    
        recs = self.model.similar_users(self.userid_to_id[user], N=N)

        assert len(recs) == N, 'Количество рекомендаций != {}'.format(N)
        return recs

In [98]:
# utils.py

def prefilter_items(data, item_features, take_n_popular=5000):

    print('== Starting prefilter info ==')
    n_users = data.user_id.nunique()
    n_items = data.item_id.nunique()
    sparsity = float(data.shape[0]) / float(n_users*n_items) * 100
    print('shape: {}'.format(data.shape))
    print('# users: {}'.format(n_users))
    print('# items: {}'.format(n_items))
    print('Sparsity: {:4.3f}%'.format(sparsity))
    data_train = data.copy()

    # do not use top popular items (they'd be bought anyway)
    popularity = data.groupby('item_id')['user_id'].nunique().reset_index()
    popularity['user_id'] = popularity['user_id'] / data_train.user_id.nunique()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
    top_popular = popularity[popularity['share_unique_users'] > .5].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]

    print('before top_not_popular:', data.shape)
    # do not use top not popular
    top_not_popular = popularity[popularity.share_unique_users < .0009]
    data = data[~data.item_id.isin(top_not_popular.item_id.tolist())]
    print('after top_not_popular:', data.shape)
    # do not use items that have not been sold in the last 12 month
    num_weeks = 12*4
    start_week = data_train.week_no.max() - num_weeks
    items_sold_last_year = data[data.week_no >= start_week].item_id.tolist()
    data = data[data.item_id.isin(items_sold_last_year)]
    print('-----', data.shape)

    # do not use not popular departments
    not_popular_departments = item_features[item_features.item_id.isin(top_not_popular.sort_values('share_unique_users')[:1000].item_id.tolist())].department.unique()
    not_popular_departments_items = item_features[
        item_features.department.isin(not_popular_departments)].item_id.tolist()
    print(not_popular_departments)
    data = data[~data.item_id.isin(not_popular_departments_items)]
    print('----', data.shape)
    # do not use too expensive and too cheap items
    high_cost_threshold = data_train.sales_value.quantile(.99995)
    low_cost_threshold = data_train.sales_value.quantile(.11)
    data = data[
        (data.sales_value < high_cost_threshold)
        &
        (data.sales_value > low_cost_threshold)
        ]    
    print()
    # do not use too popular stores
    # store_df = data.groupby('store_id')['user_id'].nunique().reset_index()
    # data = data[~data.store_id.isin(
    #     store_df[store_df.user_id > store_df.user_id.quantile(.985)].store_id.tolist()
    # )]
    if take_n_popular:
        popularity = data.groupby('item_id')['user_id'].nunique().reset_index().sort_values('user_id', ascending=False).item_id.tolist()
        data = data[data.item_id.isin(popularity[:take_n_popular])]
    print('== Ending prefilter info ==')
    print('shape: {}'.format(data.shape))
    n_users = data.user_id.nunique()
    n_items = data.item_id.nunique()
    sparsity = float(data.shape[0]) / float(n_users*n_items) * 100
    print('# users: {}'.format(n_users))
    print('# items: {}'.format(n_items))
    print('Sparsity: {:4.3f}%'.format(sparsity))
    return data

In [24]:
from src.recommenders import MainRecommender
from src.metrics import precision_at_k, recall_at_k
from src.utils import  prefilter_items

In [94]:
data = pd.read_csv('../2_baselines_implicit/retail_train.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)


test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

item_features = pd.read_csv('../3_collaborative_filtering/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']


In [99]:
prefilter_items(data_train, item_features)

== Starting prefilter info ==
shape: (2278490, 12)
# users: 2499
# items: 86865
Sparsity: 1.050%
before top_not_popular: (2183574, 12)
after top_not_popular: (2115456, 12)
----- (2086940, 12)
['MISC. TRANS.' 'COSMETICS' 'DRUG GM' 'SEAFOOD-PCKGD' 'GROCERY' 'PASTRY'
 'PRODUCE' 'DELI' 'MEAT-PCKGD' 'FLORAL' 'MEAT' 'NUTRITION' 'SPIRITS'
 'SEAFOOD' 'GARDEN CENTER' 'TRAVEL & LEISUR' 'RESTAURANT' ' ']
---- (17181, 12)

== Ending prefilter info ==
shape: (15617, 12)
# users: 1855
# items: 106
Sparsity: 7.942%


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
335,1617,27008755658,3,882305,1,1.00,440,0.00,1033,1,0.0,0.0
553,1345,27008843374,3,1117393,1,3.80,32004,0.00,2110,1,0.0,0.0
555,1345,27008843374,3,1133378,1,4.99,32004,0.00,2110,1,0.0,0.0
1501,216,27021568529,4,1110843,1,4.13,324,0.00,1804,1,0.0,0.0
1918,1172,27045462475,6,1007136,1,10.35,32004,0.00,2013,2,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2278039,708,41260647860,635,882305,1,8.99,402,0.00,1844,91,0.0,0.0
2278158,1740,41266671516,633,397896,8349,18.94,3242,-0.25,1319,91,0.0,0.0
2279805,1074,41297460791,635,12484608,7,85.00,292,0.00,1214,91,0.0,0.0
2279929,1211,41297471413,635,6534166,15750,34.95,319,-1.57,1315,91,0.0,0.0


In [85]:

data_train = prefilter_items(data_train, item_features)

mr = MainRecommender(data_train, weighting=True)


== Starting prefilter info ==
shape: (2278490, 12)
# users: 2499
# items: 86865
Sparsity: 1.050%
before top_not_popular: (2183574, 12)
after top_not_popular: (2183574, 12)
----- (2132347, 12)
[] 0
---- (2132347, 12)

== Ending prefilter info ==
shape: (1195284, 12)
# users: 2482
# items: 5000
Sparsity: 9.632%


100%|██████████| 15/15 [00:03<00:00,  3.99it/s]
100%|██████████| 5000/5000 [00:01<00:00, 4231.91it/s]


In [115]:
data_train = data.copy()

merged_data_departments = data_train[['user_id', 'item_id', 'quantity']].merge(item_features[['item_id', 'department']], how='left')
quantity_by_department = merged_data_departments.groupby('department')['quantity'].sum().reset_index()
quantity_coef_by_department = quantity_by_department.copy()
quantity_coef_by_department['quantity'] = quantity_by_department.quantity / quantity_by_department.quantity.sum()
top_not_popular_department = quantity_coef_by_department[quantity_coef_by_department.quantity < ]

# not_popular_departments_items = item_features[
#     item_features.department.isin(not_popular_departments)].item_id.tolist()
# print(not_popular_departments)
# data = data[~data.item_id.isin(not_popular_departments_items)]

In [123]:
tmp.quantity.describe()

count    4.400000e+01
mean     2.272727e-02
std      1.291025e-01
min      0.000000e+00
25%      2.909606e-08
50%      1.481821e-06
75%      4.398909e-05
max      8.486266e-01
Name: quantity, dtype: float64

In [110]:
merged_data_departments.groupby('department')['user_id'].nunique().reset_index().sort_values('user_id', ascending=True)

Unnamed: 0,department,user_id
16,GRO BAKERY,1
42,VIDEO,1
19,HOUSEWARES,1
11,ELECT &PLUMBING,1
30,PORK,2
2,CHARITABLE CONT,2
32,PROD-WHS SALES,2
40,TOYS,3
43,VIDEO RENTAL,3
28,PHARMACY SUPPLY,4


In [56]:
user_to_recommend = mr.id_to_userid[0]
mr.get_similar_users_recommendation(user_to_recommend)


[(28, 1.0), (6, 1.0), (31, 1.0), (0, 0.9999999), (8, 0.9999999)]

In [58]:
item_to_recommend = mr.id_to_itemid[2]
for i in range(len(mr.id_to_itemid)):
    # print(item)
    print(mr.get_similar_items_recommendation(mr.id_to_itemid[i]))


[(0, 1.0), (2, 0.011095172), (1, -0.001978551), (4, -0.0034034785), (5, -0.012541463)]
[(1, 1.0), (2, 0.006971217), (0, -0.001978551), (5, -0.02770517), (3, -0.03757184)]
[(2, 0.9999999), (4, 0.013571143), (0, 0.011095172), (1, 0.006971217), (5, -0.019656051)]
[(3, 1.0), (5, 0.038335435), (4, 0.018111914), (0, -0.023345383), (2, -0.027634539)]
[(4, 1.0000001), (3, 0.018111914), (2, 0.013571131), (5, 0.004725494), (0, -0.0034034692)]
[(5, 0.99999994), (3, 0.038335435), (4, 0.0047254874), (0, -0.012541466), (2, -0.019656083)]
