# Курсовой проект. Рекомендательные системы

## Библиотеки

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender
from xgboost import XGBClassifier

## Константы

In [2]:
PATH = '../data/'

ITEM_ID = 'item_id'
USER_ID = 'user_id'
ACTUAL = 'actual'
CANDIDATE = 'candadate'

TEST_MATCHER_WEEKS = 6
TEST_RANKER_WEEKS = 3

N_PREDICT = 50
N_RECALL = 50
N_PRECISION = 5
FICT = 999999

## Функции

In [3]:
def print_stat_data(df, name):
    print(name)
    print(f'Shape: {df.shape} Users: {df[USER_ID].nunique()} Items: {df[ITEM_ID].nunique()}')
    
def common_stat_df():
    print_stat_data(train_matcher,'train_matcher')
    print_stat_data(test_matcher,'test_matcher')
    print_stat_data(train_ranker,'train_ranker')
    print_stat_data(test_ranker,'test_ranker')

def prefilter_item(data, take_n_popular=5000, item_features=None):
    # Уберем не интересные для рекоммендаций категории (department)
#     if item_features is not None:
#         department_size = pd.DataFrame(item_features. \
#                                        groupby('department')['item_id'].nunique(). \
#                                        sort_values(ascending=False)).reset_index()

#         department_size.columns = ['department', 'n_items']
#         rare_departments = department_size[department_size['n_items'] < 150].department.tolist()
#         items_in_rare_departments = item_features[
#             item_features['department'].isin(rare_departments)].item_id.unique().tolist()

#         data = data[~data['item_id'].isin(items_in_rare_departments)]

#     # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб.
#     data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))
#     data = data[data['price'] > 2]

#     # Уберем слишком дорогие товарыs
#     data = data[data['price'] < 50]

    # Возбмем топ по популярности
    popularity = data.groupby('item_id')['quantity'].sum().reset_index()
    popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

    top = popularity.sort_values('n_sold', ascending=False).head(take_n_popular).item_id.tolist()

    # Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
    data.loc[~data['item_id'].isin(top), 'item_id'] = FICT
    return data

def recall(recommended_list, bought_list):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    flags = np.isin(bought_list, recommended_list)
    return flags.sum() / len(bought_list)

def recall_at_k(recommended_list, bought_list, k=5):
    return recall(recommended_list[:k], bought_list)

def precision(recommended_list, bought_list):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    flags = np.isin(bought_list, recommended_list)
    return flags.sum() / len(recommended_list)

def precision_at_k(recommended_list, bought_list, k=5):
    return precision(recommended_list[:k], bought_list)

def calc_recall(df, top):
    for col in df.columns[2:]:
        yield col, df.apply(lambda row: recall_at_k(row[col], row[ACTUAL], k=top), axis=1).mean()

def calc_precision(df, top):
    for col in df.columns[2:]:
        yield col, df.apply(lambda row: precision_at_k(row[col], row[ACTUAL], k=top), axis=1).mean()
        
def rerank(df, user_id):
    return df[df[USER_ID]==user_id].sort_values('proba_item', ascending=False).item_id.tolist()        

In [4]:
class MainRecommender:
    """Рекоммендации, которые можно получить из ALS
    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data: pd.DataFrame, weighting: bool = True):
#         # Топ покупок каждого юзера
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != FICT]

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != FICT]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self._prepare_dict(self.user_item_matrix)

#         if weighting:
#             self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    @staticmethod
    def _prepare_matrix(data: pd.DataFrame):
        """Готовит user-item матрицу"""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id',
                                          columns='item_id',
                                          values='quantity',  # Можно пробовать другие варианты
                                          aggfunc='count',
                                          fill_value=0
                                          )

        user_item_matrix = user_item_matrix.astype(float)  # необходимый тип матрицы для implicit
        return user_item_matrix

    @staticmethod
    def _prepare_dict(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))
        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())
        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())
        return model

#     def _update_dict(self, user_id):
#         """Если появился новыю user / item, то нужно обновить словари"""

#         if user_id not in self.userid_to_id.keys():
#             max_id = max(list(self.userid_to_id.values()))
#             max_id += 1

#             self.userid_to_id.update({user_id: max_id})
#             self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Находит товар, похожий на item_id"""
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)  # Товар похож на себя -> рекомендуем 2 товара
        top_rec = recs[1][0]  # И берем второй (не товар из аргумента метода)
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]
        return recommendations

    def _get_recommendation(self, user, model, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        # self._update_dict(user_id=user)
        res = [self.id_to_itemid[rec[0]] for rec in model.recommend(userid=self.userid_to_id[user],
                                                                    user_items=csr_matrix(
                                                                        self.user_item_matrix).tocsr(),
                                                                    N=N,
                                                                    filter_already_liked_items=False,
                                                                    filter_items=[self.itemid_to_id[FICT]],
                                                                    recalculate_user=True)]
        try:
            res.remove(FICT)
        except:
            pass
        res = self._extend_with_top_popular(res, N=N)
        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendation(self, user, N=5):
        """Рекомендации через стардартные библиотеки implicit"""
        # self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model, N=N)

    def get_own_recommendation(self, user, N=5):
        """Рекомендуем товары среди тех, которые юзер уже купил"""
        # self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.own_recommender, N=N)

    def get_similar_items_recommendation(self, user_id, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""
        top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user_id].head(N)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        try:
            res.remove(FICT)
        except:
            pass
        res = self._extend_with_top_popular(res, N=N)
        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

#     def get_similar_users_recommendation(self, user_id, N=5):
#         """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""

#         res = []

#         # Находим топ-N похожих пользователей
#         similar_users = self.model.similar_users(self.userid_to_id[user_id], N=N + 1)
#         similar_users = [self.id_to_userid[rec[0]] for rec in similar_users]
#         similar_users = similar_users[1:]  # удалим юзера из запроса

#         for _user_id in similar_users:
#             res.extend(self.get_own_recommendations(_user_id, N=1))

#         res = self._extend_with_top_popular(res, N=N)

#         assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
#         return res


## Исходные данные

In [5]:
df = pd.read_csv(PATH + 'retail_train.csv')
item_df = pd.read_csv(PATH + 'product.csv')
user_df = pd.read_csv(PATH + 'hh_demographic.csv')

In [6]:
df[[USER_ID, ITEM_ID, 'basket_id', 'day', 'quantity', 'sales_value','store_id', 'retail_disc', 
    'trans_time', 'week_no', 'coupon_disc','coupon_match_disc']]

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,1004906,26984851472,1,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,1033142,26984851472,1,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,1036325,26984851472,1,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,1082185,26984851472,1,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,8160430,26984851472,1,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2396799,1613,16102849,41655820646,663,1,2.00,3262,-1.15,1231,95,0.0,0.0
2396800,1001,13217063,41655829421,663,1,1.69,3131,0.00,2231,95,0.0,0.0
2396801,1001,13217800,41655829421,663,1,1.69,3131,0.00,2231,95,0.0,0.0
2396802,1167,6410462,41656790510,663,22451,43.98,3385,-0.65,1059,95,0.0,0.0


In [7]:
item_df.columns = [col.lower() for col in item_df.columns]
item_df.rename(columns={'product_id': ITEM_ID}, inplace=True)
item_df

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ
...,...,...,...,...,...,...,...
92348,18293142,6384,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,
92349,18293439,6393,DRUG GM,National,BOOKSTORE,CHILDRENS LOW END,
92350,18293696,6406,DRUG GM,National,BOOKSTORE,PAPERBACK BEST SELLER,
92351,18294080,6442,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,


In [8]:
user_df.columns = [col.lower() for col in user_df.columns]
user_df.rename(columns={'household_key': USER_ID}, inplace=True)
user_df[[USER_ID, 'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc','hh_comp_desc', 
         'household_size_desc', 'kid_category_desc']]

Unnamed: 0,user_id,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
1,7,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown
2,8,25-34,U,25-34K,Unknown,2 Adults Kids,3,1
3,13,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2
4,16,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown
...,...,...,...,...,...,...,...,...
796,2494,35-44,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown
797,2496,45-54,A,75-99K,Homeowner,Unknown,3,1
798,2497,45-54,U,35-49K,Unknown,Single Male,1,None/Unknown
799,2498,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown


## Модель первого уровня

### Разбиение на трейн/тест

In [9]:
# берем данные для тренировки matching модели
train_matcher = df[df['week_no'] < df['week_no'].max() - (TEST_MATCHER_WEEKS + TEST_RANKER_WEEKS)]

# берем данные для валидации matching модели
test_matcher = df[(df['week_no'] >= df['week_no'].max() - (TEST_MATCHER_WEEKS + TEST_RANKER_WEEKS)) &
                      (df['week_no'] < df['week_no'].max() - (TEST_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
train_ranker = test_matcher.copy()

# берем данные для теста ranking, matching модели
test_ranker = df[df['week_no'] >= df['week_no'].max() - TEST_RANKER_WEEKS]

common_stat_df()

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
test_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
test_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


### Префильтр айтемов

In [10]:
n_items_before = train_matcher[ITEM_ID].nunique()

train_matcher = prefilter_item(train_matcher, item_features=item_df, take_n_popular=5000)

n_items_after = train_matcher[ITEM_ID].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


### Убираю холодных юзеров

In [11]:
# ищем общих пользователей
common_user = train_matcher.user_id.values

test_matcher = test_matcher[test_matcher.user_id.isin(common_user)]
train_ranker = train_ranker[train_ranker.user_id.isin(common_user)]
test_ranker = test_ranker[test_ranker.user_id.isin(common_user)]

common_stat_df()

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 5001
test_matcher
Shape: (169707, 12) Users: 2153 Items: 27649
train_ranker
Shape: (169707, 12) Users: 2153 Items: 27649
test_ranker
Shape: (118303, 12) Users: 2041 Items: 24326


### Получаю кандидатов

In [12]:
recommender = MainRecommender(train_matcher)
result_test_matcher = test_matcher.groupby(USER_ID)[ITEM_ID].unique().reset_index()
result_test_matcher.columns=[USER_ID, ACTUAL]



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [13]:
# %%time
# result_test_matcher['als'] = result_test_matcher[USER_ID].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

In [14]:
# %%time
# result_test_matcher['own'] = result_test_matcher[USER_ID].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [15]:
%%time
result_test_matcher['sim_item'] = result_test_matcher[USER_ID].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))

CPU times: user 47.4 s, sys: 1.74 s, total: 49.1 s
Wall time: 12.3 s


In [16]:
result_test_matcher

Unnamed: 0,user_id,actual,sim_item
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[969977, 1029743, 9527494, 6703742, 898121, 11..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1137346, 1133018, 1021324, 1106523, 979707, 8..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[835530, 944599, 9526410, 1111786, 846764, 880..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[904360, 874149, 845208, 948650, 1021324, 1116..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[1019247, 1015247, 1094955, 821565, 1133018, 1..."
...,...,...,...
2148,2496,"[831509, 867188, 1013623, 1048851, 5592734, 16...","[904360, 979707, 1044078, 1021324, 1077555, 74..."
2149,2497,"[820291, 824759, 838797, 859010, 859075, 86077...","[1127831, 860776, 962609, 855279, 819255, 8255..."
2150,2498,"[865511, 962991, 1076374, 1102358, 5564901, 15...","[1058997, 862349, 6034857, 1092026, 1098066, 8..."
2151,2499,"[861282, 921744, 1050968, 13842089, 828837, 86...","[862349, 1098066, 13945244, 937791, 1062966, 5..."


In [17]:
sorted(calc_precision(result_test_matcher, N_PRECISION), key=lambda x: x[1],reverse=True)

[('sim_item', 0.1365536460752426)]

## Модель второго уровня

### Подготовка данных для трейна

In [18]:
%%time
match_candidates = pd.DataFrame(train_ranker[USER_ID].unique())
match_candidates.columns = [USER_ID]
match_candidates[CANDIDATE] = match_candidates[USER_ID].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))

CPU times: user 47.4 s, sys: 1.83 s, total: 49.2 s
Wall time: 12.3 s


In [19]:
match_candidates

Unnamed: 0,user_id,candadate
0,2070,"[879755, 1069175, 879948, 1124729, 9527158, 10..."
1,2021,"[960318, 846550, 904360, 878710, 1004906, 1118..."
2,1753,"[986394, 9837842, 861445, 1138189, 995242, 100..."
3,2120,"[397896, 1082185, 6534178, 1029743, 995242, 11..."
4,1346,"[1127831, 878996, 9802981, 866540, 947146, 879..."
...,...,...
2148,1446,"[1106523, 983936, 967144, 6979427, 13038913, 1..."
2149,1784,"[1033142, 1127831, 5568729, 999965, 909894, 11..."
2150,436,"[1006184, 995242, 1075170, 9526411, 1133018, 9..."
2151,1697,"[995242, 883404, 9527160, 926905, 1119546, 103..."


In [20]:
item = match_candidates.apply(lambda x: pd.Series(x[CANDIDATE]), axis=1).stack().reset_index(level=1, drop=True)
item.name = ITEM_ID
match_candidates = match_candidates.drop(CANDIDATE, axis=1).join(item)
match_candidates

Unnamed: 0,user_id,item_id
0,2070,879755
0,2070,1069175
0,2070,879948
0,2070,1124729
0,2070,9527158
...,...,...
2152,1745,968146
2152,1745,997907
2152,1745,6602697
2152,1745,1128665


In [21]:
print_stat_data(match_candidates, 'match_candidates')

match_candidates
Shape: (107650, 2) Users: 2153 Items: 3022


In [22]:
ranker_train = train_ranker[[USER_ID, ITEM_ID]].copy()
ranker_train['target'] = 1
ranker_train

Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1
...,...,...,...
2282320,222,1120741,1
2282321,462,993339,1
2282322,462,995242,1
2282323,462,10180324,1


In [23]:
ranker_train = match_candidates.merge(ranker_train, on=[USER_ID, ITEM_ID], how='left')
ranker_train = ranker_train.drop_duplicates(subset=[USER_ID, ITEM_ID])
ranker_train['target'].fillna(0, inplace= True)
df_agg = ranker_train.groupby(by=[USER_ID, ITEM_ID]).count()
len(df_agg[df_agg.target > 1])

0

In [24]:
ranker_train.target.value_counts()

0.0    94949
1.0     7780
Name: target, dtype: int64

In [25]:
ranker_train['target'].mean()

0.07573323988357718

### Фичи для обучения

In [26]:
ranker_train = ranker_train.merge(item_df, on=ITEM_ID, how='left')
ranker_train = ranker_train.merge(user_df, on=USER_ID, how='left')
ranker_train

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,879755,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1069175,1.0,2224,GROCERY,National,SOFT DRINKS,SOFT DRINK BOTTLE NON-CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,879948,0.0,531,DRUG GM,National,CANDY - CHECKLANE,CANDY BARS (SINGLES)(INCLUDING,1.55 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,1124729,0.0,857,DRUG GM,National,CANDY - CHECKLANE,CANDY BARS (SINGLES)(INCLUDING,1 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,9527158,0.0,544,GROCERY,National,BAG SNACKS,SGL SV/VEND MACH SNACKS CHIP/P,1.0 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102724,1745,1074333,0.0,1646,PRODUCE,National,SALAD MIX,REGULAR GARDEN,2 LB,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown
102725,1745,968146,0.0,69,GROCERY,Private,COLD CEREAL,KIDS CEREAL,14.5 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown
102726,1745,997907,0.0,69,GROCERY,Private,CHEESE,SHREDDED CHEESE,8 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown
102727,1745,6602697,0.0,1769,MEAT,National,TURKEY,GROUND TURKEY,1.3LB,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown


In [27]:
X_train = ranker_train.drop('target', axis=1)
y_train = ranker_train[['target']]
cat_feat = X_train.columns[2:].tolist()
X_train = pd.get_dummies(X_train, columns=cat_feat)
X_train

Unnamed: 0,user_id,item_id,manufacturer_2,manufacturer_5,manufacturer_16,manufacturer_35,manufacturer_36,manufacturer_42,manufacturer_58,manufacturer_61,...,hh_comp_desc_Unknown,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
0,2070,879755,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
1,2070,1069175,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
2,2070,879948,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
3,2070,1124729,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
4,2070,9527158,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102724,1745,1074333,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
102725,1745,968146,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
102726,1745,997907,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
102727,1745,6602697,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


### Обучение

In [28]:
%%time
xgb = XGBClassifier(n_jobs=4)
xgb.fit(X_train, y_train)

CPU times: user 11min 9s, sys: 1.86 s, total: 11min 10s
Wall time: 2min 48s


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [29]:
train_pred = xgb.predict_proba(X_train)
ranker_pred = ranker_train.copy()
ranker_pred['proba_item'] = train_pred[:,1]

In [30]:
result_test_ranker = test_ranker.groupby(USER_ID)[ITEM_ID].unique().reset_index()
result_test_ranker.columns=[USER_ID, ACTUAL]
result_test_ranker

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."
...,...,...
2036,2496,[6534178]
2037,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ..."
2038,2498,"[15716530, 834484, 901776, 914190, 958382, 972..."
2039,2499,"[867188, 877580, 902396, 914190, 951590, 95813..."


In [31]:
%%time
result_test_ranker['sim_item'] = result_test_ranker[USER_ID].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))

CPU times: user 44.9 s, sys: 1.82 s, total: 46.7 s
Wall time: 11.7 s


### Оценка

In [32]:
sorted(calc_precision(result_test_ranker, N_PRECISION), key=lambda x: x[1], reverse=True)

[('sim_item', 0.10690837824595724)]

In [33]:
result_test_ranker['reranked'] = result_test_ranker[USER_ID].apply(lambda user_id: rerank(ranker_pred,user_id))

In [34]:
sorted(calc_precision(result_test_ranker, N_PRECISION), key=lambda x: x[1], reverse=True)

  return flags.sum() / len(recommended_list)


[('reranked', 0.2326892950391621), ('sim_item', 0.10690837824595724)]

## Оценка на тесте

In [35]:
df_test = pd.read_csv(PATH + 'retail_test1.csv')
df_test

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.10,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
88729,98,41944918665,684,945779,2,2.00,421,0.0,1705,98,0.0,0.0
88730,98,41944918665,684,993617,2,2.00,421,0.0,1705,98,0.0,0.0
88731,98,41944918665,684,1128647,2,2.00,421,0.0,1705,98,0.0,0.0
88732,98,41944918665,684,9526886,2,0.60,421,0.0,1705,98,0.0,0.0


In [36]:
result_test = df_test.groupby(USER_ID)[ITEM_ID].unique().reset_index()
result_test.columns=[USER_ID, ACTUAL]
result_test

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109..."
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84..."
4,7,"[847270, 855557, 859987, 863407, 895454, 90663..."
...,...,...
1880,2496,"[829291, 862139, 912704, 933067, 933835, 95537..."
1881,2497,[6534178]
1882,2498,"[1053690, 1076875, 12386123, 858303, 920109, 1..."
1883,2499,"[826249, 895327, 9858944, 820321, 829291, 8323..."


In [42]:
X_test = df_test[[USER_ID, ITEM_ID]].copy()
X_test = X_test.merge(item_df, on=ITEM_ID, how='left')
X_test = X_test.merge(user_df, on=USER_ID, how='left')
X_test = pd.get_dummies(X_test, columns=cat_feat)

train_col = set(X_train.columns.to_list())
test_col = set(X_test.columns.to_list())
drop_col = test_col - train_col
zero_col = train_col - test_col
X_test = X_test.drop(list(drop_col), axis=1)
X_test[list(zero_col)] = 0
X_test = X_test[xgb.feature_names_in_]

test_pred = xgb.predict_proba(X_test)
pred = df_test[[USER_ID, ITEM_ID]].copy()
pred['proba_item'] = test_pred[:,1]

result_test['your_prediction'] = result_test[USER_ID].apply(lambda user_id: rerank(pred,user_id))
result_test

Unnamed: 0,user_id,actual,your_prediction
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[962568, 979707, 13877012, 1112333, 1004906, 1..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[1133018, 1003421, 903674, 866950, 1044759, 10..."
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...","[13842214, 1133018, 908531, 1130858, 827683, 1..."
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84...","[995242, 840361, 1055646, 849843, 994928, 9949..."
4,7,"[847270, 855557, 859987, 863407, 895454, 90663...","[1082185, 1082185, 938700, 938700, 1133018, 11..."
...,...,...,...
1880,2496,"[829291, 862139, 912704, 933067, 933835, 95537...","[1106523, 10285149, 15452677, 15452531, 110951..."
1881,2497,[6534178],"[6534178, 6534178]"
1882,2498,"[1053690, 1076875, 12386123, 858303, 920109, 1...","[858303, 858303, 963719, 1076875, 1030455, 105..."
1883,2499,"[826249, 895327, 9858944, 820321, 829291, 8323...","[1070820, 5569327, 5568378, 826249, 913210, 83..."


In [43]:
sorted(calc_precision(result_test, N_PRECISION), key=lambda x: x[1], reverse=True)

[('your_prediction', 0.8676392572944218)]