# Course project


**Основное**
- Дедлайн - 27 декабря 23:59
- Целевая метрика precision@5
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. В решении должны быть отчетливо видна метрика на новом тестовом сете из файла retail_test1.csv, то есть вам нужно для всех юзеров из этого файла выдать выши рекомендации, и посчитать на actual покупках precision@5. 

**!! Мы не рассматриваем холодный старт для пользователя, все наши пользователя одинаковы во всех сетах, поэтому нужно позаботиться об их исключении из теста.**


**Hints:** 

Сначала просто попробуйте разные параметры MainRecommender:  
- N в топ-N товарах при формировании user-item матирцы (сейчас топ-5000)  
- Различные веса в user-item матрице (0/1, кол-во покупок, log(кол-во покупок + 1), сумма покупки, ...)  
- Разные взвешивания матрицы (TF-IDF, BM25 - у него есть параметры)  
- Разные смешивания рекомендаций (обратите внимание на бейзлайн - прошлые покупки юзера)  

Сделайте MVP - минимально рабочий продукт - (пусть даже top-popular), а потом его улучшайте

Если вы делаете двухуровневую модель - следите за валидацией 

# Import libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

import warnings
warnings.filterwarnings('ignore')

## Read data

In [2]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

In [3]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


# Set global const

In [4]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50 

# Process features dataset

In [5]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [6]:
# Схема обучения и валидации
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [7]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [8]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [9]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [10]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [11]:
# выше видим разброс по пользователям и товарам и дальше мы перейдем к warm-start (только известные пользователи)

In [12]:
data_val_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


# Prefilter items

In [13]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


# Make cold-start to warm-start

In [14]:
# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))

# оставляем общих пользователей
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (784420, 13) Users: 1915 Items: 4999
val_matcher
Shape: (163261, 12) Users: 1915 Items: 27118
train_ranker
Shape: (163261, 12) Users: 1915 Items: 27118
val_ranker
Shape: (115989, 12) Users: 1915 Items: 24042


# Init/train recommender

In [15]:
recommender = MainRecommender(data_train_matcher)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4999.0), HTML(value='')))




# Eval precision of matching

In [16]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."


In [17]:
%%time
# для понятности расписано все в строчку, без функций, ваша задача уметь оборачивать все это в функции
result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_matcher['sim_item_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))
result_eval_matcher['als_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

Wall time: 1min 2s


In [18]:
%%time
# result_eval_matcher['sim_user_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_users_recommendation(x, N=50))

Wall time: 0 ns


### Precision@5 of matching

In [19]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [20]:
TOPK_PRECISION = 5

In [21]:
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('own_rec', 0.18872062663185182),
 ('als_rec', 0.1266840731070487),
 ('sim_item_rec', 0.060887728459530444)]

# Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_ranking
- Обучаем *только* на выбранных кандидатах
- Я *для примера* сгенерирую топ-50 кадидиатов через get_own_recommendations
- (!) Если юзер купил < 50 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

In [22]:
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

## Подготовка данных для трейна

In [23]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [24]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [25]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1097398, 101..."


In [26]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [27]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [28]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,1105426
0,2070,1097350
0,2070,879194
0,2070,948640


### Check warm start

In [29]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (95750, 2) Users: 1915 Items: 4437


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [30]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

df_ranker_train['target'].fillna(0, inplace= True)

In [31]:
df_ranker_train.target.value_counts()

0.0    88346
1.0    11053
Name: target, dtype: int64

In [32]:
df_ranker_train.head(9)

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1097350,0.0
2,2070,879194,0.0
3,2070,948640,0.0
4,2070,928263,0.0
5,2070,944588,0.0
6,2070,1032703,0.0
7,2070,1138596,0.0
8,2070,1092937,1.0


(!) На каждого юзера 50 item_id-кандидатов

In [33]:
df_ranker_train['target'].mean()

0.11119830179378062

## Подготавливаем фичи для обучения модели

### Описательные фичи

In [34]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [35]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [36]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


### Поведенческие фичи

##### Чтобы считать поведенческие фичи, нужно учесть все данные что были до data_val_ranker

In [37]:
df_join_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [38]:
%%time
N_PREDICT = 50 
result_eval_matcher['own_rec_' + str(N_PREDICT)] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 14.7 s


In [39]:
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


In [40]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,113,99,1996,5754.86,1.241758,1218.32967,0.000461,0.452137,0.000404,0.00814
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,54,51,1996,5754.86,0.593407,1218.32967,0.00022,0.452137,0.000208,0.00814
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,...,54,46,1996,5754.86,0.593407,1218.32967,0.00022,0.452137,0.000188,0.00814
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,...,49,44,1996,5754.86,0.538462,1218.32967,0.0002,0.452137,0.000179,0.00814
4,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,59,53,1996,5754.86,0.648352,1218.32967,0.000241,0.452137,0.000216,0.00814


In [41]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [42]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'total_item_sales_value',
 'total_quantity_value',
 'item_freq',
 'user_freq',
 'total_user_sales_value',
 'item_quantity_per_week',
 'user_quantity_per_week',
 'item_quantity_per_basket',
 'user_quantity_per_baskter',
 'item_freq_per_basket',
 'user_freq_per_basket']

In [43]:
retail_data = pd.read_csv('retail_train.csv')
df_retail = df_ranker_train.merge(retail_data, on=[USER_COL, ITEM_COL], how='left')

In [44]:
# средний чек пользователя
df_for_mean_sales = df_retail.groupby(USER_COL)[['sales_value', 'quantity']].sum().reset_index()
df_for_mean_sales['mean_sales'] = df_for_mean_sales['sales_value'] / df_for_mean_sales['quantity']
df_for_mean_sales = df_for_mean_sales[['user_id', 'mean_sales']]
df_ranker_train = df_ranker_train.merge(df_for_mean_sales, on=[USER_COL], how='left')

In [45]:
# доля покупок для данной категории среди всех покупок пользователя
group_category = df_retail.groupby(USER_COL)[['department']].count().reset_index()
list_with_share_category = []
for index, row in df_ranker_train.iterrows():
    quantity_in_category = df_retail[df_retail[USER_COL] == row[USER_COL]]['department'].value_counts()[row['department']]
    quantity_all_category = group_category[group_category[USER_COL] == row[USER_COL]]['department'].values[0]
    list_with_share_category.append(quantity_in_category / quantity_all_category)
df_ranker_train['share_category'] = list_with_share_category

In [46]:
#цена товара
df_for_price = df_retail.groupby('item_id')[['quantity', 'sales_value']].sum().reset_index()
df_for_price['price_item'] =  df_for_price['sales_value'] / df_for_price['quantity']
df_for_price = df_for_price[['item_id', 'price_item']]
df_ranker_train = df_ranker_train.merge(df_for_price, on=['item_id'], how='left')

In [47]:
# отношение цены к средней цене товара в категории 
df_avg_price = df_retail.groupby('department')[['quantity', 'sales_value']].sum().reset_index()
df_avg_price['avg_price'] = df_avg_price['sales_value'] / df_avg_price['quantity']
df_avg_price = df_avg_price[['department', 'avg_price']]
list_with_avg_price = []
for index, row in df_ranker_train.iterrows():
    price = df_for_price[df_for_price['item_id'] == row['item_id']]['price_item'].values[0]
    average_price_in_category = df_avg_price[df_avg_price['department'] == row['department']]['avg_price'].values[0]
    list_with_avg_price.append(price / average_price_in_category)
df_ranker_train['price_to_average'] = list_with_avg_price

In [48]:
# Среднее количество покупок всеми юзерами конкретной категории в неделю
num_week = max(df_retail['week_no'])
df_avg_for_category_on_week = df_retail.groupby('department')['quantity'].count().reset_index()
df_avg_for_category_on_week['avg_quantity_in_week'] = df_avg_for_category_on_week['quantity'].values / num_week
df_avg_for_category_on_week['avg_quantity_buy_in_week'] = df_avg_for_category_on_week['avg_quantity_in_week'].apply(lambda x: round(x))
df_avg_for_category_on_week = df_avg_for_category_on_week[['department', 'avg_quantity_buy_in_week']]
df_ranker_train = df_ranker_train.merge(df_avg_for_category_on_week, on=['department'], how='left')

In [49]:
# Среднее отношение покупок в категории за неделю к общему количеству покупок за неделю
df_sum_for_category_on_week = df_retail.groupby(['week_no','department'])['quantity'].count().reset_index()
df_sum_all_category_on_week = df_sum_for_category_on_week.groupby(['week_no'])['quantity'].sum().reset_index()
df_sum_for_category_on_week = df_sum_for_category_on_week.merge(df_sum_all_category_on_week, on=['week_no'], how='left')
df_sum_for_category_on_week['ratio_in_category_to_total_in_week'] = df_sum_for_category_on_week['quantity_x'] / df_sum_for_category_on_week['quantity_y']
df_sum_for_category_on_week = df_sum_for_category_on_week[['week_no', 'department', 'ratio_in_category_to_total_in_week']]
df_sum_for_category_on_week = df_sum_for_category_on_week.groupby(['department'])['ratio_in_category_to_total_in_week'].mean().reset_index()
df_ranker_train = df_ranker_train.merge(df_sum_for_category_on_week, on=['department'], how='left')

In [50]:
recommender = MainRecommender(data_train_matcher)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4999.0), HTML(value='')))




In [51]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,mean_sales,share_category,price_item,price_to_average,avg_quantity_buy_in_week,ratio_in_category_to_total_in_week
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,0.000461,0.452137,0.000404,0.00814,3.437661,0.107843,3.96184,1.055393,184,0.052741
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,0.00022,0.452137,0.000208,0.00814,3.437661,0.54902,11.471481,3.328048,2248,0.644194
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,...,0.00022,0.452137,0.000188,0.00814,3.437661,0.078431,7.237222,1.414057,344,0.095613
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,...,0.0002,0.452137,0.000179,0.00814,3.437661,0.078431,6.596122,1.288794,344,0.095613
4,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,0.000241,0.452137,0.000216,0.00814,3.437661,0.078431,7.650973,1.494898,344,0.095613


In [52]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [53]:
cat_feats = ['manufacturer', 'department', 'brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'age_desc', 
             'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 'household_size_desc', 'kid_category_desc']
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

## Обучение модели ранжирования

In [54]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=500,
                     n_estimators=10000,
                     learning_rate=0.1,
                     categorical_column=cat_feats)


lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

In [55]:
df_ranker_predict = df_ranker_train.copy()

In [56]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [57]:
df_ranker_predict.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,mean_sales,share_category,price_item,price_to_average,avg_quantity_buy_in_week,ratio_in_category_to_total_in_week,proba_item_purchase
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,0.452137,0.000404,0.00814,3.437661,0.107843,3.96184,1.055393,184,0.052741,6.691577e-06
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,0.452137,0.000208,0.00814,3.437661,0.54902,11.471481,3.328048,2248,0.644194,2.888811e-06
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,...,0.452137,0.000188,0.00814,3.437661,0.078431,7.237222,1.414057,344,0.095613,2.7633e-06
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,...,0.452137,0.000179,0.00814,3.437661,0.078431,6.596122,1.288794,344,0.095613,2.162131e-07
4,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,0.452137,0.000216,0.00814,3.437661,0.078431,7.650973,1.494898,344,0.095613,9.961481e-06


# Evaluation on test dataset

In [58]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


## Eval matching on test dataset

In [59]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 14.9 s


In [60]:
# померяем precision только модели матчинга, чтобы понимать влияние ранжирования на метрики
TOPK_PRECISION = 5
sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.1462140992167092)]

In [61]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [62]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [63]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.1538381201044369)
('own_rec', 0.1462140992167092)


# Оценка на тесте для выполнения курсового проекта

In [64]:
df_test = pd.read_csv('retail_test1.csv')
# df_transactions = pd.read_csv('retail_train.csv')

In [65]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [66]:
df_test = df_test[df_test.user_id.isin(common_users)]

In [67]:
df_retail = df_test.merge(retail_data, on=[USER_COL, ITEM_COL], how='left')

In [68]:
df_retail.head(3)

Unnamed: 0,user_id,basket_id_x,day_x,item_id,quantity_x,sales_value_x,store_id_x,retail_disc_x,trans_time_x,week_no_x,...,basket_id_y,day_y,quantity_y,sales_value_y,store_id_y,retail_disc_y,trans_time_y,week_no_y,coupon_disc_y,coupon_match_disc_y
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,...,,,,,,,,,,
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,...,,,,,,,,,,
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,...,31502510000.0,305.0,6.0,11.34,311.0,-0.6,44.0,44.0,0.0,0.0


In [69]:
df_test

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.10,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
88729,98,41944918665,684,945779,2,2.00,421,0.0,1705,98,0.0,0.0
88730,98,41944918665,684,993617,2,2.00,421,0.0,1705,98,0.0,0.0
88731,98,41944918665,684,1128647,2,2.00,421,0.0,1705,98,0.0,0.0
88732,98,41944918665,684,9526886,2,0.60,421,0.0,1705,98,0.0,0.0


In [70]:
df_for_preds = df_test.copy()
df_for_preds = df_for_preds.merge(item_features, on='item_id', how='left')
df_for_preds = df_for_preds.merge(user_features, on='user_id', how='left')

In [71]:
df_for_preds

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,...,BEERS/ALES,BEERALEMALT LIQUORS,12 OZ,,,,,,,
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,...,BROOMS AND MOPS,MOPS: SPONGE DECK DUST,,35-44,A,50-74K,Homeowner,2 Adults Kids,5+,3+
2,2070,41652857291,664,995242,5,9.10,311,-0.6,46,96,...,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,...,BEERS/ALES,BEERALEMALT LIQUORS,12 OZ,,,,,,,
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,...,SEAFOOD - SHELF STABLE,TUNA,6 OZ,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83651,98,41944918665,684,945779,2,2.00,421,0.0,1705,98,...,FRZN MEAT/MEAT DINNERS,SS ECONOMY ENTREES/DINNERS ALL,7.10 OZ,35-44,U,35-49K,Unknown,1 Adult Kids,2,1
83652,98,41944918665,684,993617,2,2.00,421,0.0,1705,98,...,FRZN MEAT/MEAT DINNERS,SS ECONOMY ENTREES/DINNERS ALL,9.25 OZ,35-44,U,35-49K,Unknown,1 Adult Kids,2,1
83653,98,41944918665,684,1128647,2,2.00,421,0.0,1705,98,...,FRZN MEAT/MEAT DINNERS,SS ECONOMY ENTREES/DINNERS ALL,10.5 OZ,35-44,U,35-49K,Unknown,1 Adult Kids,2,1
83654,98,41944918665,684,9526886,2,0.60,421,0.0,1705,98,...,BAG SNACKS,SGL SV/VEND MACH SNACKS CHIP/P,1 OZ,35-44,U,35-49K,Unknown,1 Adult Kids,2,1


In [72]:
df_for_preds = df_for_preds.merge(df_ranker_predict[['user_id', 'mean_sales']], on=['user_id'], how='left').drop_duplicates()
df_for_preds = df_for_preds.merge(df_ranker_predict[['item_id', 'price_item']], on='item_id', how='left').drop_duplicates()
df_for_preds = df_for_preds.merge(df_ranker_predict[['user_id', 'department', 'share_category']], on=['user_id', 'department'], how='left').drop_duplicates()
df_for_preds = df_for_preds.merge(df_ranker_predict[['item_id', 'department', 'price_to_average']], on=['item_id', 'department'], how='left').drop_duplicates()
df_for_preds = df_for_preds.merge(df_avg_for_category_on_week, on=['department'], how='left').drop_duplicates()
df_for_preds = df_for_preds.merge(df_sum_for_category_on_week, on=['department'], how='left').drop_duplicates()

In [73]:
result_test = df_for_preds.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,6,"[956902, 960791, 1037863, 1119051, 1137688, 84..."


In [74]:
%%time
N_PREDICT = 500
result_test['rec'] = result_test[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 15 s


In [75]:
result_test.head()

Unnamed: 0,user_id,actual,rec
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[856942, 9297615, 5577022, 877391, 9655212, 10..."
1,6,"[956902, 960791, 1037863, 1119051, 1137688, 84...","[13003092, 995598, 923600, 972416, 1084036, 11..."
2,7,"[847270, 855557, 859987, 863407, 895454, 90663...","[998519, 894360, 7147142, 9338009, 896666, 939..."
3,8,"[846334, 850834, 857503, 862139, 865891, 87829...","[12808385, 939860, 981660, 7410201, 5577022, 6..."
4,9,"[883404, 995242, 1056005, 889692, 911140, 918046]","[872146, 918046, 9655676, 985622, 1056005, 109..."


In [76]:
TOPK_PRECISION = 5
sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('rec', 0.12928442573661988)]

In [77]:
final_result_test = result_test[['user_id', 'rec']]

In [78]:
final_result_test.head()

Unnamed: 0,user_id,rec
0,1,"[856942, 9297615, 5577022, 877391, 9655212, 10..."
1,6,"[13003092, 995598, 923600, 972416, 1084036, 11..."
2,7,"[998519, 894360, 7147142, 9338009, 896666, 939..."
3,8,"[12808385, 939860, 981660, 7410201, 5577022, 6..."
4,9,"[872146, 918046, 9655676, 985622, 1056005, 109..."


In [79]:
final_result_test.reset_index(drop=True).to_csv('Klimova_prediction.csv')