# ALS Based Recommender Systems

---

## prefilter_items function

In [1]:
# Demo func at src/utils.py
# -------------------------


# def prefilter_items_(data, take_n_popular=5000, margin_slice_rate=0.9):
    
#     """Предфильтрация товаров"""
    
#     # расчет цены единицы товара
#     data['price'] = data['sales_value'] / data['quantity']
    
#     # 1. Удаление товаров, со средней ценой < 1$
#     data = data[data['price'] > 1]
    
#     # 2. Удаление товаров со средней ценой > 30$
#     data = data[data['price'] < 30]
    
#     # 3. Удаление 10% товаров c наименьшей выручкой (сдвигает минимум выручки с 1.1$ до 94.8$ для unsplitted data)
#     marginality = data.groupby('item_id')['sales_value'].sum().reset_index()
#     ten_percent_slice_idx = int(marginality.shape[0] * margin_slice_rate)

#     top_margin = marginality.sort_values('sales_value', ascending=False)[:ten_percent_slice_idx].item_id.tolist()
#     data = data[data['item_id'].isin(top_margin)]
    
#     # 4. Выбор топ-N самых популярных товаров (N = take_n_popular)
#     popularity = data.groupby('item_id')['quantity'].sum().reset_index()
#     top_popular = popularity.sort_values('quantity', ascending=False)[:take_n_popular].item_id.tolist()
#     data = data[data['item_id'].isin(top_popular)]
    
#     return data
    

In [2]:
###############################
# Import libs 
###############################

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# from implicit.bpr import BayesianPersonalizedRanking

import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.metrics import precision_at_k, recall_at_k
from src.utils import *  # prefilter_items

In [3]:
###############################
# Settings && Constants 
###############################

# Transactions CSV; passed to get_data_splits in the data-preparation cell
RAW_DATA_PATH = './data/retail_train.csv'
# Product features CSV; passed to get_features in the data-preparation cell
RAW_FEATURES_PATH = './data/product.csv'
# Passed as test_weeks_split when splitting the transactions into train / test
TEST_SIZE_WEEKS = 3
# Passed as take_n_popular to prefilter_items
N_POPULAR_ITEMS = 5000
# NOTE(review): not referenced in the visible cells — presumably meant as the
# num_threads value for the implicit models; confirm before relying on it
NUM_THREADS = 8


In [4]:

def get_data_splits(data_path, test_weeks_split):

    """Load the transactions CSV and split it into train / test parts by week.

    Expected base columns of the file:

    ['user_id', 'basket_id', 'day',
    'item_id', 'quantity', 'sales_value',
    'store_id', 'retail_disc', 'trans_time',
    'week_no', 'coupon_disc','coupon_match_disc']

    Parameters
    ----------
    data_path : str
        Path of the CSV file with transactions.
    test_weeks_split : int
        Rows with ``week_no >= max(week_no) - test_weeks_split`` go to the
        test part, all rows with a smaller ``week_no`` go to the train part.

    Returns
    -------
    (data_train, data_test) : tuple of pd.DataFrame
        The test part is restricted to users and items that also occur in
        the train part (warm start).
    """
    print("Preparing raw data...", end='')
    # Read the file the caller asked for; the argument used to be ignored in
    # favour of the module-level RAW_DATA_PATH constant.
    data = pd.read_csv(data_path)

    split_week = data['week_no'].max() - test_weeks_split
    data_train = data[data['week_no'] < split_week]
    data_test = data[data['week_no'] >= split_week]

    # Warm start: keep only test rows whose user AND item were seen in train
    train_items = data_train['item_id'].unique()
    train_users = data_train['user_id'].unique()

    seen_in_train = (data_test['item_id'].isin(train_items)
                     & data_test['user_id'].isin(train_users))
    data_test = data_test[seen_in_train]

    print('\033[94mDone\033[0m')
    return data_train, data_test


def get_features(features_path):

    """Read the item features CSV and normalise its column names.

    All column names are lower-cased and ``product_id`` is renamed to
    ``item_id``. Expected base columns after normalisation:

    ['item_id', 'manufacturer', 'department', 'brand',
    'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product']

    Parameters
    ----------
    features_path : str
        Path of the CSV file with item features.

    Returns
    -------
    pd.DataFrame
        The features table with normalised column names.
    """

    print("Preparing raw features...", end='')
    raw_features = pd.read_csv(features_path)

    # Lower-case first, so the product_id -> item_id rename matches any casing
    lower_cased = raw_features.rename(columns=lambda name: name.lower())
    normalised = lower_cased.rename(columns={'product_id': 'item_id'})

    print('\033[94mDone\033[0m')
    return normalised


####  TODO migrate THIS to main class
def get_user_item_matrix(df, value_pivot='quantity', agg='count'):

    """Build a dense user x item table from transaction rows.

    Parameters
    ----------
    df : pd.DataFrame
        Transactions with ``user_id``, ``item_id`` and the ``value_pivot`` column.
    value_pivot : str
        Column that is aggregated into the cells of the table.
    agg : str or callable
        Aggregation applied to ``value_pivot`` for every (user, item) pair.

    Returns
    -------
    pd.DataFrame
        Rows are user ids, columns are item ids, missing pairs are 0;
        values are cast to float, the dtype the implicit library works with.
    """

    print("Preparing ui matrix...", end='')
    pivoted = df.pivot_table(index='user_id',
                             columns='item_id',
                             values=value_pivot,
                             aggfunc=agg,
                             fill_value=0)

    as_float = pivoted.astype(float)  # dtype required by implicit
    print('\033[94mDone\033[0m')

    return as_float

In [5]:
class MainRecommender_:

    """ALS recommender built on top of the ``implicit`` library.

    Parameters
    ----------
    data : pd.DataFrame
        Transactions with at least ``user_id``, ``item_id`` and ``quantity``.
    features : pd.DataFrame
        Item features with at least ``item_id`` and ``brand``.
    weighting : bool
        When true, the user-item matrix is BM25-weighted before ALS is fitted.

    Attributes
    ----------
    top_purchases : pd.DataFrame
        Purchase-row counts per (user_id, item_id), most frequent first.
    overall_top_purchases : list
        Item ids ordered by purchase-row count over the whole dataset.
    user_item_matrix
        users x items table; a pd.DataFrame, or the BM25-weighted sparse
        matrix when ``weighting`` is true.
    sparse_user_item : csr_matrix
        users x items CSR matrix passed to ``recommend`` as ``user_items``.
    ctm : np.ndarray
        Ids of the items whose brand is 'Private' (CTM = own brand).
    ctm_itemid_to_id : dict
        item id -> matrix column index, restricted to CTM items.
    own_recommender, model
        The fitted ItemItemRecommender (K=1) and ALS models.
    """

    def __init__(self, data, features, weighting=True):

        # Top purchases of every user
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
#         self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Top purchases over the whole dataset
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
#         self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        (self.id_to_itemid, self.id_to_userid,
         self.itemid_to_id, self.userid_to_id) = self.prepare_dicts(self.user_item_matrix)

        # Ids of own-brand (CTM) items; a set makes the membership test O(1)
        self.ctm = self.get_ctm(features)
        ctm_lookup = set(self.ctm)
        self.ctm_itemid_to_id = {k: v for k, v in self.itemid_to_id.items() if k in ctm_lookup}

        # The own recommender is fitted BEFORE the matrix is weighted
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        # users x items CSR matrix: `recommend` expects this orientation for
        # `user_items`, while `fit` receives the transposed (items x users) one.
        self.sparse_user_item = csr_matrix(self.user_item_matrix)

        self.model = self.fit(self.user_item_matrix)

    @staticmethod
    def prepare_matrix(data, value_pivot='quantity', agg='count'):
        """Build a dense user x item table from transaction rows.

        ``value_pivot`` is aggregated with ``agg`` for every (user, item)
        pair, missing pairs become 0 and the result is cast to float
        (the dtype the implicit library works with).
        """

        print("Preparing ui matrix...", end='')
        ui_matrix = pd.pivot_table(data,
                                   index='user_id', columns='item_id',
                                   values=value_pivot,
                                   aggfunc=agg,
                                   fill_value=0)

        ui_matrix = ui_matrix.astype(float)  # dtype required by implicit
        print('\033[94mDone\033[0m')

        return ui_matrix

    @staticmethod
    def prepare_dicts(user_item_matrix):
        """Build the lookup dicts between real ids and matrix positions.

        Returns
        -------
        (id_to_itemid, id_to_userid, itemid_to_id, userid_to_id)
            ``id`` is the positional index in the matrix, ``itemid`` /
            ``userid`` are the labels of its columns / rows.
        """

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def get_ctm(item_features):
        """Return the unique ids of the items whose brand is 'Private'."""

        ctm_ids = item_features[item_features['brand'] == 'Private']['item_id'].unique()

        return ctm_ids

    @staticmethod
    def fit_own_recommender(user_item_matrix, num_threads=4):
        """Fit the model that recommends among items the user already bought.

        An ItemItemRecommender with K=1 is fitted on the items x users matrix.
        """

        own_recommender = ItemItemRecommender(K=1, num_threads=num_threads)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=32, regularization=0.001, iterations=15, num_threads=8):
        """Fit ALS on the items x users matrix and return the model."""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model

    def _extend_with_top_popular(self, recommendations, N=5):
        """Pad ``recommendations`` with overall top-popular items up to N.

        Lists that already hold N or more items are returned unchanged.
        Otherwise a new list is returned (the argument is not mutated) and
        only items that are not yet in the list are appended, so padding
        never introduces duplicates.
        """

        if len(recommendations) >= N:
            return recommendations

        padded = list(recommendations)
        already_there = set(padded)
        for item in self.overall_top_purchases:
            if len(padded) >= N:
                break
            if item not in already_there:
                padded.append(item)
                already_there.add(item)
        return padded

    def _recommend(self, model, user, N=5, filter_=None):
        """Run ``model.recommend`` for ``user`` and map the result to item ids.

        Raises KeyError when ``user`` is not present in the training matrix.
        """

        recs = model.recommend(userid=self.userid_to_id[user],
                               user_items=self.sparse_user_item,  # users x items
                               N=N,
                               filter_already_liked_items=False,
                               filter_items=filter_,
                               recalculate_user=True)
        return [self.id_to_itemid[rec[0]] for rec in recs]

    def get_similar_ctm_item(self, item_id):
        """Find the own-brand (CTM) item most similar to ``item_id``.

        The ALS neighbours are scanned from the closest one; the item itself
        is skipped. The first neighbour that is a CTM item is returned. When
        none of the neighbours is a CTM item, the closest neighbour is
        returned instead; ``None`` is returned only if there are no
        neighbours at all.
        """

        own_idx = self.itemid_to_id[item_id]
        neighbours = self.model.similar_items(own_idx, N=100)

        closest = None
        for rec in neighbours:
            neighbour_idx = rec[0]
            if neighbour_idx == own_idx:
                # An item is always similar to itself
                continue
            candidate = self.id_to_itemid[neighbour_idx]
            if closest is None:
                closest = candidate
            # Compare real item ids, not matrix indices, against the CTM ids
            if candidate in self.ctm_itemid_to_id:
                return candidate

        return closest

    def get_recommendations(self, user, N=5, filter_=None):
        """Return the top-N ALS recommendations (item ids) for ``user``.

        ``filter_`` is an optional list of matrix item indices to exclude.
        Items the user already bought are NOT filtered out.
        """

        return self._recommend(self.model, user, N=N, filter_=filter_)

    def get_own_recommendations(self, user, N=5):
        """Recommend N items among those the user has already bought."""

        return self._recommend(self.own_recommender, user, N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Recommend items similar to the user's top-N purchased items."""

        # 1. Top-N non-CTM purchases of the user (top_purchases is already
        #    sorted by count); one combined mask avoids chained indexing
        purchases = self.top_purchases
        wanted = (purchases['user_id'] == user) & ~purchases['item_id'].isin(self.ctm)
        top_user_nonctm_items = purchases.loc[wanted, 'item_id'].head(N)

        # 2. For every such item find the closest CTM item by embeddings
        return [self.get_similar_ctm_item(item) for item in top_user_nonctm_items]

    def get_similar_users_recommendation(self, user, N=5):
        """Recommend top-N items among those bought by similar users.

        One own recommendation is taken from each of the N most similar
        users; duplicates are dropped and the list is padded with overall
        top-popular items when fewer than N remain.
        """

        own_idx = self.userid_to_id[user]

        # Top-N similar users; one extra is requested because the user is
        # the closest neighbour of themselves
        neighbours = self.model.similar_users(own_idx, N=N + 1)
        neighbour_ids = [rec[0] for rec in neighbours if rec[0] != own_idx][:N]

        res = []
        for neighbour_idx in neighbour_ids:
            # similar_users yields matrix row indices; map back to user ids
            neighbour = self.id_to_userid[neighbour_idx]
            res.extend(self.get_own_recommendations(neighbour, N=1))

        res = list(dict.fromkeys(res))  # drop duplicates, keep the order
        res = self._extend_with_top_popular(res, N=N)

        return res

----

In [6]:
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender



In [7]:
%%time

# Load the transactions and split them into train / test parts by week
data_train, data_test = get_data_splits(RAW_DATA_PATH, TEST_SIZE_WEEKS)
# Load the item features table (column names normalised by get_features)
item_features = get_features(RAW_FEATURES_PATH)

# n_items_before = data_train['item_id'].nunique()
# Prefilter the training transactions; prefilter_items comes from src/utils.py —
# presumably it keeps the N_POPULAR_ITEMS most popular items, verify there
data_train = prefilter_items(data_train, take_n_popular=N_POPULAR_ITEMS) # Prefilter routine

Preparing raw data...[94mDone[0m
Preparing raw features...[94mDone[0m
Prefilter items...Data reduced from: [91m86865[0m to: [91m5000[0m samples...[94mDone[0m
Wall time: 3.44 s


In [8]:
%%time

rec = MainRecommender(data_train, item_features)

Preparing ui matrix...[94mDone[0m


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Wall time: 5.42 s


In [11]:
rec.model.use_gpu

True

In [9]:
rec.get_similar_users_recommendation(user=2000)[:5]

[1128812, 854852, 1004906, 981760, 961979]

In [10]:
rec.ctm

array([   26093,    26190,    26355, ..., 18244317, 18244391, 18253088],
      dtype=int64)