In [2]:
!pip install implicit



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [4]:
data = pd.read_csv('/content/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
#data.sort_values('quantity')

In [5]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [6]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [46]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [8]:
def popular_weights(data):
  popular_weight = data.groupby('item_id')['sales_value'].sum().reset_index()
  popular_weight['weight']=popular_weight['sales_value']/popular_weight['sales_value'].sum()
  return popular_weight[['item_id', 'weight']] #.sort_values('weight', ascending=False)

In [9]:
def weighted_random_recommendation(items_weights, n=5):
    #Случайные рекоммендации

    recs = np.random.choice(items_weights['item_id'], size=n, replace=False)
    recs_sort= items_weights.loc[items_weights['item_id'].isin(recs.tolist())].sort_values('weight', ascending=False)['item_id']
    return recs_sort.to_list()

In [10]:
%%time


items_weights = popular_weights(data_train)

result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5))
# your_code

CPU times: user 9.62 s, sys: 50.6 ms, total: 9.67 s
Wall time: 9.64 s


In [12]:
result

Unnamed: 0,user_id,actual,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[7123871, 957010, 15972491, 5571672, 13506527]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1120791, 2017058, 8091100, 12646245, 858391]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1052294, 1056746, 9803483, 828015, 105729]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[914190, 1134152, 943275, 9858963, 1432456]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[13911330, 823569, 912125, 10355911, 1755861]"
...,...,...,...
2037,2496,[6534178],"[12263682, 6602328, 6556386, 1280208, 1569331]"
2038,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[15737492, 12263968, 13095623, 2042859, 2503277]"
2039,2498,"[15716530, 834484, 901776, 914190, 958382, 972...","[1067271, 1015872, 12949895, 1115691, 414430]"
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[946344, 13189859, 1460994, 1479873, 98382]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [13]:
scors = pd.read_excel('/content/predictions_basic.xlsx')
scors

Unnamed: 0,scor,algoritms
0,1.0,actual
1,0.0005,random_recommendation
2,0.1552,popular_recommendation
3,0.1369,itemitem
4,0.1329,cosine
5,0.139,tfidf
6,0.2019,own_purchases


In [14]:
def precision(recommended_list, bought_list):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    flags = np.isin(bought_list, recommended_list)
    return flags.sum() / len(recommended_list)

def precision_at_k(recommended_list, bought_list, k=5):
    return precision(recommended_list[:k], bought_list)


In [15]:
s,n = round(result.apply(lambda row: precision_at_k(row['weighted_random_recommendation'], row['actual']), axis=1).mean(),4), 'weighted_random_recommendation'
scors.append(pd.DataFrame([s, n], ['scor','algoritms']).T)


Unnamed: 0,scor,algoritms
0,1.0,actual
1,0.0005,random_recommendation
2,0.1552,popular_recommendation
3,0.1369,itemitem
4,0.1329,cosine
5,0.139,tfidf
6,0.2019,own_purchases
0,0.0008,weighted_random_recommendation


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [16]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popular_weight = data.groupby('item_id')['sales_value'].sum().reset_index()
popular_weight.rename(columns={'sales_value': 'val_sold'}, inplace=True)

top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000)
top_5000_weight=popular_weight.sort_values('val_sold', ascending=False).head(5000)

In [40]:
#data_train_top5000=data_train['item_id'].isin(top_5000['item_id'])
data_train_top5000=data_train.loc[data_train['item_id'].isin(top_5000['item_id'])]
data_train_top5000

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2282319,222,41297772783,635,926804,1,0.59,304,0.00,1716,91,0.0,0.0
2282320,222,41297772783,635,1120741,1,0.59,304,0.00,1716,91,0.0,0.0
2282322,462,41297773713,635,995242,1,1.00,304,-0.89,2040,91,0.0,0.0
2282323,462,41297773713,635,10180324,1,3.00,304,-0.29,2040,91,0.0,0.0


In [17]:
merdge_df=pd.merge(top_5000, top_5000_weight, on='item_id', how='outer')

In [18]:
merdge_df["n_sold"] = merdge_df["n_sold"].fillna(0)

In [19]:
merdge_df['n/val']=merdge_df["n_sold"]/merdge_df["val_sold"] 
merdge_df.sort_values('n/val', ascending=False, inplace=True)
merdge_df

Unnamed: 0,item_id,n_sold,val_sold,n/val
9,5668996,295610.0,690.63,428.029480
5,397896,1214994.0,2932.59,414.307489
8,480014,371107.0,912.63,406.634671
0,6534178,190227964.0,467993.62,406.475550
4,1404121,1562004.0,3890.00,401.543445
...,...,...,...,...
4991,1018278,120.0,,
4992,845522,120.0,,
4993,981874,120.0,,
4997,838842,119.0,,


In [20]:
def popularity_recommendation_new(data, n=5):
    """Топ-n популярных товаров по продажам"""
    
    popular = data.groupby('item_id')['weight'].sum().reset_index()
    popular.sort_values('weight', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [21]:
# Можно так делать, так как рекомендация не зависит от юзера
popular_weight = popularity_recommendation_new(items_weights, n=5)

result['popular_recommendation_new'] = result['user_id'].apply(lambda x: popular_weight)
result.head(2)

Unnamed: 0,user_id,actual,weighted_random_recommendation,popular_recommendation_new
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[7123871, 957010, 15972491, 5571672, 13506527]","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1120791, 2017058, 8091100, 12646245, 858391]","[6534178, 6533889, 1029743, 6534166, 1082185]"


In [47]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат sparse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

In [49]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [59]:
%time

for k_ in range(1,11):
  model = ItemItemRecommender(K=k_, num_threads=4) # K - кол-во билжайших соседей

  model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
            show_progress=True)
  
  result['ItemItem_'+str(k_)] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    #filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)])

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.54 µs


HBox(children=(FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=86865.0), HTML(value='')))




In [60]:
result

Unnamed: 0,user_id,actual,weighted_random_recommendation,popular_recommendation_new,ItemItem_1,ItemItem_2,ItemItem_3,ItemItem_4,ItemItem_5,ItemItem_6,ItemItem_7,ItemItem_8,ItemItem_9,ItemItem_10
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[7123871, 957010, 15972491, 5571672, 13506527]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 995242, 1029743, 840361, 904360]","[1082185, 981760, 995242, 1029743, 840361]","[1082185, 981760, 1127831, 995242, 840361]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 995242, 840361]","[1082185, 981760, 1127831, 995242, 840361]","[1082185, 981760, 1127831, 995242, 840361]","[1082185, 981760, 995242, 1127831, 840361]","[1082185, 981760, 995242, 1127831, 840361]","[1082185, 981760, 995242, 1127831, 840361]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1120791, 2017058, 8091100, 12646245, 858391]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 1098066, 6534178, 826249, 1127831]","[1082185, 981760, 1098066, 1127831, 826249]","[1082185, 981760, 1098066, 826249, 1127831]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1052294, 1056746, 9803483, 828015, 105729]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1029743, 1127831]","[1082185, 981760, 1127831, 995242, 6534178]","[1082185, 981760, 1127831, 995242, 995785]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 995242, 1127831, 1098066]","[1082185, 981760, 995242, 1127831, 1098066]","[1082185, 981760, 995242, 1127831, 840361]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 840361, 995242, 1127831]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[914190, 1134152, 943275, 9858963, 1432456]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 995242, 1029743, 1127831, 826249]","[1082185, 981760, 995242, 1127831, 1029743]","[1082185, 981760, 1127831, 995242, 961554]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1098066, 995242, 1127831]","[1082185, 981760, 995242, 1098066, 826249]","[1082185, 981760, 995242, 1098066, 840361]","[1082185, 981760, 995242, 840361, 1098066]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[13911330, 823569, 912125, 10355911, 1755861]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1029743, 1098066]","[1082185, 981760, 6534178, 995242, 1127831]","[1082185, 981760, 1127831, 1098066, 995242]","[1082185, 981760, 1127831, 1098066, 995242]","[1082185, 981760, 1098066, 995242, 1127831]","[1082185, 981760, 1098066, 995242, 826249]","[1082185, 981760, 1098066, 995242, 826249]","[1082185, 981760, 1098066, 995242, 840361]","[1082185, 981760, 1098066, 995242, 840361]","[1082185, 981760, 995242, 1098066, 840361]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2037,2496,[6534178],"[12263682, 6602328, 6556386, 1280208, 1569331]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1098066, 6534178, 840361]","[1082185, 981760, 6534178, 1098066, 840361]","[1082185, 981760, 1098066, 1127831, 826249]","[1082185, 981760, 1098066, 1127831, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 840361]","[1082185, 981760, 1098066, 840361, 826249]","[1082185, 981760, 1098066, 840361, 995242]","[1082185, 981760, 1098066, 840361, 995242]"
2038,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[15737492, 12263968, 13095623, 2042859, 2503277]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1029743, 6534178]","[1082185, 981760, 6534178, 1098066, 995242]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1098066, 1127831, 995242]","[1082185, 981760, 1098066, 995242, 826249]","[1082185, 981760, 1098066, 995242, 826249]","[1082185, 981760, 1098066, 995242, 826249]","[1082185, 981760, 1098066, 995242, 826249]","[1082185, 981760, 1098066, 995242, 840361]","[1082185, 981760, 995242, 1098066, 840361]"
2039,2498,"[15716530, 834484, 901776, 914190, 958382, 972...","[1067271, 1015872, 12949895, 1115691, 414430]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 1029743, 840361, 826249, 961554]","[1082185, 981760, 1029743, 826249, 840361]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 995242]","[1082185, 981760, 1098066, 1127831, 995242]","[1082185, 981760, 1098066, 995242, 826249]","[1082185, 981760, 1098066, 995242, 826249]","[1082185, 981760, 1098066, 995242, 840361]","[1082185, 981760, 840361, 1098066, 995242]","[1082185, 981760, 840361, 995242, 1098066]"
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[946344, 13189859, 1460994, 1479873, 98382]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 6534178, 1098066, 826249]","[1082185, 981760, 1098066, 6534178, 826249]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]","[1082185, 981760, 1098066, 826249, 995242]"


In [96]:
for k_ in range(1,11):
#k_=10
  s,n = round(result.apply(lambda row: precision_at_k(row['ItemItem_'+str(k_)], row['actual']), axis=1).mean(),4), 'ItemItem_'+str(k_)
  scors=scors.append(pd.DataFrame([s, n], ['scor','algoritms']).T, ignore_index = True)

In [97]:
scors

Unnamed: 0,scor,algoritms
0,1.0,actual
1,0.0005,random_recommendation
2,0.1552,popular_recommendation
3,0.1369,itemitem
4,0.1329,cosine
5,0.139,tfidf
6,0.2019,own_purchases
7,0.2195,ItemItem_1
8,0.2063,ItemItem_2
9,0.17,ItemItem_3
