In [39]:
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items

import pandas as pd

In [40]:
from src.recommenders import MainRecommender

In [41]:
# Загрузим данные о транзакциях
data = pd.read_csv('./data/transaction_data.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [42]:
# Загрузим данные о продуктах - они нам потребуются для предобработки
item_features = pd.read_csv('./data/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [43]:
# Выполним предобработку данных с использованием ранее подготовленной функции prefilter_items из src.utils
n_items_before = data['item_id'].nunique()

data = prefilter_items(data, take_n_popular=5000, item_features=item_features)

n_items_after = data['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 92339 to 5001


In [44]:
# Разделим данные на train и test выборки
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99
11,1364,26984896261,1,999999,1,2.19,31742,0.0,1520,1,0.0,0.0,2.19


In [45]:
# Подготовим таблицу для результатов рекомендаций
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[999999, 1115576, 1124029, 6514011, 13115375, ..."
1,3,"[823704, 999999, 1008714, 1123434, 15926712]"


In [46]:
# Создадим класс и обучим модель на train
recommender = MainRecommender(data_train)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [47]:
%%time
# Метод - получение топ-N товаров среди купленных похожими юзерами
# Предскажем результаты на test и рассчитаем метрику precision_at_k
result['from_similar_users'] = result['user_id'].apply(lambda x: recommender.get_similar_users_recommendation(x))
result.apply(lambda row: precision_at_k(row['from_similar_users'], row['actual']), axis=1).mean()

  precision = flags.sum() / len(recommended_list)


Wall time: 2min 40s


0.07110879750130183

In [48]:
%%time
# Метод - получение товаров, похожих на топ-N купленных юзером товаров
# Предскажем результаты на test и рассчитаем метрику precision_at_k
result['from_similar_items'] = result['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x))
result.apply(lambda row: precision_at_k(row['from_similar_items'], row['actual']), axis=1).mean()

Wall time: 2min 9s


0.05590838105153606

In [49]:
result.head(5)

Unnamed: 0,user_id,actual,from_similar_users,from_similar_items
0,1,"[999999, 1115576, 1124029, 6514011, 13115375, ...","[947412, 857503, 1037332, 1062002, 1029743]","[842762, 1007512, 9297615, 5577022, 1132231]"
1,3,"[823704, 999999, 1008714, 1123434, 15926712]","[8090521, 1106523, 1029743, 1037840, 8090537]","[1044078, 1024051, 832678, 839419, 833940]"
2,5,[999999],"[5569845, 1122358, 8090536, 1029743, 964521]","[865156, 946839, 6391152, 7409969, 1070702]"
3,6,"[825541, 1067606, 1082627, 999999, 1008032, 95...","[1026118, 1025650, 1106523, 1126899, 1024306]","[948650, 5569845, 8357613, 1105488, 941361]"
4,7,"[999999, 1054509, 849505, 993638, 1083111, 110...","[6552318, 9653535, 909268, 5585510, 1062002]","[917384, 1044078, 872177, 7147145, 8293385]"


Как видим, при выборе топ-N товаров среди купленных похожими юзерами результат получился лучше, чем при выборе товаров, похожих на топ-N купленных юзером товаров.