# ALS Based Recommender Systems

---

## prefilter_items function

In [128]:
# Demo func at src/utils.py
# -------------------------


# def prefilter_items_(data, take_n_popular=5000, margin_slice_rate=0.9):
    
#     """Предфильтрация товаров"""
    
#     # расчет цены единицы товара
#     data['price'] = data['sales_value'] / data['quantity']
    
#     # 1. Удаление товаров, со средней ценой < 1$
#     data = data[data['price'] > 1]
    
#     # 2. Удаление товаров со средней ценой > 30$
#     data = data[data['price'] < 30]
    
#     # 3. Удаление 10% товаров c наименьшей выручкой (сдвигает минимум выручки с 1.1$ до 94.8$ для unsplitted data)
#     marginality = data.groupby('item_id')['sales_value'].sum().reset_index()
#     ten_percent_slice_idx = int(marginality.shape[0] * margin_slice_rate)

#     top_margin = marginality.sort_values('sales_value', ascending=False)[:ten_percent_slice_idx].item_id.tolist()
#     data = data[data['item_id'].isin(top_margin)]
    
#     # 4. Выбор топ-N самых популярных товаров (N = take_n_popular)
#     popularity = data.groupby('item_id')['quantity'].sum().reset_index()
#     top_popular = popularity.sort_values('quantity', ascending=False)[:take_n_popular].item_id.tolist()
#     data = data[data['item_id'].isin(top_popular)]
    
#     return data
    

In [13]:
###############################
# Import libraries 
###############################

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# from implicit.bpr import BayesianPersonalizedRanking

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items

In [30]:
###############################
# Constants 
###############################

# Path of the transactions CSV handed to get_data_splits.
RAW_DATA_PATH = './data/retail_train.csv'
# Path of the product attributes CSV handed to get_features.
RAW_FEATURES_PATH = './data/product.csv'
# Handed to get_data_splits as test_weeks_split: offset (in week numbers)
# from the last observed week at which the test period starts.
TEST_SIZE_WEEKS = 3


In [107]:
def get_data_splits(data_path, test_weeks_split):
    """Load the transactions CSV and split it into train and test by week.

    Expected base columns of the file:

    ['user_id', 'basket_id', 'day',
    'item_id', 'quantity', 'sales_value',
    'store_id', 'retail_disc', 'trans_time',
    'week_no', 'coupon_disc','coupon_match_disc']

    Parameters
    ----------
    data_path : str
        Path of the transactions CSV file to read.
    test_weeks_split : int
        Offset from the largest ``week_no``; rows with
        ``week_no >= max(week_no) - test_weeks_split`` go to the test part.
        The boundary week itself is part of the test set, so the test part
        spans ``test_weeks_split + 1`` distinct week numbers when weeks are
        consecutive.

    Returns
    -------
    (data_train, data_test) : tuple of pd.DataFrame
        ``data_test`` only keeps rows whose user and item both occur in
        ``data_train`` (warm start).
    """

    # Read from the path the caller passed in, not from a module-level constant.
    data = pd.read_csv(data_path)

    split_week = data['week_no'].max() - test_weeks_split

    data_train = data[data['week_no'] < split_week]
    data_test = data[data['week_no'] >= split_week]

    # Warm start: drop test rows for users or items never seen in training.
    train_items = data_train['item_id'].unique()
    train_users = data_train['user_id'].unique()

    is_warm = data_test['item_id'].isin(train_items) & data_test['user_id'].isin(train_users)
    data_test = data_test[is_warm]

    return data_train, data_test


def get_features(features_path):
    """Read the product attributes CSV and normalise its column names.

    All column names are lower-cased and ``product_id`` is renamed to
    ``item_id``. Expected columns after normalisation:

    ['item_id', 'manufacturer', 'department', 'brand',
    'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product']
    """

    raw_features = pd.read_csv(features_path)

    # Lower-case first so the product_id -> item_id rename matches any casing.
    lowered = raw_features.rename(columns=str.lower)

    return lowered.rename(columns={'product_id': 'item_id'})

In [111]:
# Train/test split of the transactions, by week number.
data_train, data_test = get_data_splits(RAW_DATA_PATH, TEST_SIZE_WEEKS)

# Product attributes with normalised column names (see get_features).
item_features = get_features(RAW_FEATURES_PATH)

In [37]:
# Preview the first rows of the product attributes table.
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [40]:
# Ground truth for evaluation: the distinct items each user bought in the test period.
result = (
    data_test.groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [41]:
# Number of distinct items before filtering, kept only for the report below.
n_items_before = data_train['item_id'].nunique()

# Rebinds data_train to the filtered frame; later cells use the filtered data.
data_train = prefilter_items(data_train, take_n_popular=5000)

# NOTE(review): the recorded run printed 5001 items for take_n_popular=5000 —
# presumably src/utils.py keeps one extra placeholder item id; confirm there.
n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


In [42]:
# Interaction matrix: one row per user, one column per item; each cell counts
# the purchase rows of that item by that user (0 when there are none).
# Other values / aggregations are worth experimenting with here.
user_item_matrix = pd.pivot_table(
    data_train,
    values='quantity',
    index='user_id',
    columns='item_id',
    aggfunc='count',
    fill_value=0,
).astype(float)  # implicit needs a float matrix

user_item_matrix.head(3)

item_id,397896,818980,818981,819063,819255,819304,819308,819330,819518,819594,...,15596279,15596488,15596515,15831255,15926712,15926775,15926844,15926886,15927403,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Original ids, in the row / column order of the interaction matrix.
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

# Positional indices of the matrix rows / columns.
matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

# Original id -> matrix position.
userid_to_id = {user_id: pos for user_id, pos in zip(userids, matrix_userids)}
itemid_to_id = {item_id: pos for item_id, pos in zip(itemids, matrix_itemids)}

# Matrix position -> original id.
id_to_userid = {pos: user_id for pos, user_id in zip(matrix_userids, userids)}
id_to_itemid = {pos: item_id for pos, item_id in zip(matrix_itemids, itemids)}

In [44]:
# bm25_weight is applied to the item-user matrix, hence the transpose before and after.
user_item_matrix = bm25_weight(user_item_matrix.T).T  # weighting works on item-user orientation

In [45]:
%%time

# NOTE: in the recorded run the library printed that GPU training needs a factor
# size that is a multiple of 32 and raised factors from 20 to 32.
model = AlternatingLeastSquares(factors=20, 
                                regularization=0.001,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # fit expects the item-user matrix
          show_progress=True)

GPU training requires factor size to be a multiple of 32. Increasing factors from 20 to 32.


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Wall time: 3.05 s


-----

# Домашнее задание

1. Перенесите метрики из ДЗ 1 в src/metrics.py
2. Создайте модуль src/recommenders.py. Напишите код для класса ниже 
(задание обсуждали на вебинаре, для первой функции практически сделали) и положите его в src/recommenders.py
3. Проверьте, что все модули корректно импортируются

In [None]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight


class MainRecommender:
    """Recommendations that can be derived from an ALS model.

    The class is written against the implicit API in which ``fit`` receives
    an item-user matrix and ``similar_items`` / ``similar_users`` return a
    list of ``(index, score)`` pairs.

    Parameters
    ----------
    data : pd.DataFrame
        Purchase log with at least the ``user_id``, ``item_id`` and
        ``quantity`` columns.
    weighting : bool, default True
        Apply BM25 weighting to the interaction matrix before training ALS.
    item_features : pd.DataFrame, optional
        Item attributes with ``item_id`` and ``brand`` columns, used to flag
        own-brand (CTM) items. Without it ``item_id_to_ctm`` is empty and the
        own-brand filter of ``get_similar_items_recommendation`` is skipped.
    """

    # NOTE(review): presumably the placeholder id that the prefiltering step
    # assigns to discarded items; it is never recommended. Confirm in src/utils.py.
    FAKE_ITEM_ID = 999999

    # NOTE(review): assumes brand == 'Private' marks own-brand items — confirm.
    CTM_BRAND = 'Private'

    def __init__(self, data, weighting=True, item_features=None):

        # Purchase counts per (user, item), most frequent first.
        top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        top_purchases = top_purchases.sort_values('quantity', ascending=False)
        self.top_purchases = top_purchases[top_purchases['item_id'] != self.FAKE_ITEM_ID]

        # Item ids ordered by purchase count over the whole dataset.
        overall = data.groupby('item_id')['quantity'].count().reset_index()
        overall = overall.sort_values('quantity', ascending=False)
        overall = overall[overall['item_id'] != self.FAKE_ITEM_ID]
        self.overall_top_purchases = overall['item_id'].tolist()

        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        (self.id_to_itemid, self.id_to_userid,
         self.itemid_to_id, self.userid_to_id) = self.prepare_dicts(self.user_item_matrix)

        # {item_id: 0/1}; 1 means the item belongs to the own brand (CTM).
        self.item_id_to_ctm = self._prepare_ctm_dict(item_features, self.CTM_BRAND)

        # The own recommender is trained before the matrix is weighted.
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

        if weighting:
            # bm25_weight works on the item-user orientation, hence the transposes.
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix)

    @staticmethod
    def prepare_matrix(data):
        """Build the user-item interaction matrix from the purchase log.

        Rows are ``user_id``, columns are ``item_id``; each cell is the number
        of purchase rows of that item by that user (0 when there are none).
        The result is cast to float, the dtype implicit needs.
        """

        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values='quantity',
                                          aggfunc='count',
                                          fill_value=0)

        return user_item_matrix.astype(float)

    @staticmethod
    def prepare_dicts(user_item_matrix):
        """Build the lookups between original ids and matrix positions.

        Returns
        -------
        (id_to_itemid, id_to_userid, itemid_to_id, userid_to_id) : tuple of dict
        """

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def _prepare_ctm_dict(item_features, ctm_brand='Private'):
        """Map every item id in ``item_features`` to 1 (own brand) or 0."""

        if item_features is None:
            return {}

        is_ctm = (item_features['brand'] == ctm_brand).astype(int)

        return dict(zip(item_features['item_id'].tolist(), is_ctm.tolist()))

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Train the model that recommends among the items a user already bought."""

        # With K=1 the only neighbour kept for every item is the item itself.
        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Train ALS on the given user-item matrix and return the fitted model."""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
        # This is a staticmethod: use the argument, there is no ``self`` here.
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model

    def _pad_with_popular(self, recommendations, N):
        """Deduplicate ``recommendations`` and top them up to ``N`` items
        with the overall most purchased items."""

        res = list(dict.fromkeys(recommendations))[:N]

        if len(res) < N:
            seen = set(res)
            for item_id in self.overall_top_purchases:
                if item_id in seen:
                    continue
                res.append(item_id)
                seen.add(item_id)
                if len(res) == N:
                    break

        return res

    def _find_similar_item(self, item_id, filter_ctm, n_candidates=50):
        """Return the closest ALS neighbour of ``item_id`` that is a different,
        non-placeholder item (and own-brand when ``filter_ctm``), or None."""

        item_idx = self.itemid_to_id.get(item_id)
        if item_idx is None:
            return None

        n_candidates = min(n_candidates, len(self.itemid_to_id))

        for candidate_idx, _score in self.model.similar_items(item_idx, N=n_candidates):
            candidate_id = self.id_to_itemid[candidate_idx]
            if candidate_id == item_id or candidate_id == self.FAKE_ITEM_ID:
                continue
            if filter_ctm and not self.item_id_to_ctm.get(candidate_id, 0):
                continue
            return candidate_id

        return None

    def get_similar_items_recommendation(self, user, filter_ctm=True, N=5):
        """Recommend items similar to the user's top-N purchased items.

        For each of the user's N most purchased items the closest ALS
        neighbour is taken. If fewer than N distinct items are found, the
        list is topped up with the overall most purchased items.

        NOTE(review): ``filter_ctm=True`` is interpreted as "only own-brand
        (CTM) items may be recommended"; it has an effect only when item
        features were supplied to the constructor. Confirm the intended
        meaning of the flag.
        """

        apply_ctm_filter = filter_ctm and bool(self.item_id_to_ctm)

        is_user = self.top_purchases['user_id'] == user
        # top_purchases is sorted by purchase count, so head(N) is the user's top-N.
        user_top_items = self.top_purchases.loc[is_user, 'item_id'].head(N)

        res = []
        for item_id in user_top_items:
            similar_id = self._find_similar_item(item_id, apply_ctm_filter)
            if similar_id is not None:
                res.append(similar_id)

        return self._pad_with_popular(res, N)

    def get_similar_users_recommendation(self, user, N=5):
        """Recommend top-N items among those bought by similar users.

        Each of the N most similar users contributes their most purchased
        item that is not in the list yet. Unknown users, and lists shorter
        than N, are served from the overall most purchased items.
        """

        user_idx = self.userid_to_id.get(user)
        if user_idx is None:
            return self._pad_with_popular([], N)

        # One extra neighbour is requested because the user is expected to
        # appear among their own neighbours; that entry is skipped below.
        n_neighbours = min(N + 1, len(self.userid_to_id))

        res = []
        for neighbour_idx, _score in self.model.similar_users(user_idx, N=n_neighbours):
            if neighbour_idx == user_idx:
                continue

            neighbour_id = self.id_to_userid[neighbour_idx]
            is_neighbour = self.top_purchases['user_id'] == neighbour_id

            # Sorted by purchase count: the first unseen item is the neighbour's favourite.
            for item_id in self.top_purchases.loc[is_neighbour, 'item_id']:
                if item_id not in res:
                    res.append(item_id)
                    break

        return self._pad_with_popular(res, N)

----

Проверка, что все работает

In [49]:
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

