In [1]:
!pip install implicit



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [3]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [5]:
items_weights = data.groupby('item_id')['quantity'].sum().reset_index()
items_weights.sort_values('quantity', ascending=False, inplace=True)
items_weights['weights']=items_weights['quantity']/sum(items_weights['quantity'])
items_weights.drop(['quantity'],  axis=1, inplace=True)
#items_weights

In [6]:
def weighted_random_recommendation(items_weights, n=5):
    items = items_weights.item_id.unique()
    rng = np.random.default_rng()
    recs = rng.choice(items, 5, p=items_weights['weights'].values, replace = False)
    
    return recs.tolist()

In [7]:
weighted_random_recommendation(items_weights ,5 )

[6534178, 6544236, 1426702, 6534166, 1096343]

### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [8]:
result = pd.read_csv('predictions_basic.csv')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[12695726, 15717063, 858903, 12984698, 1066657]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[1082185, 1029743, 995785, 1004906, 1081177]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[1016578, 2260500, 1086096, 1060985, 1043534]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[1082185, 1098066, 6534178, 1127831, 1068719]"


In [9]:
%%time

# Можно так делать, так как рекомендация не зависит от юзера
weighted_random_rec = weighted_random_recommendation(items_weights ,5 )

result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_rec)

Wall time: 6 ms


In [10]:
result.head(5)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation
0,1,[ 821867 834484 856942 865456 889248 ...,"[12695726, 15717063, 858903, 12984698, 1066657]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[1082185, 1029743, 995785, 1004906, 1081177]","[6534178, 6534166, 397896, 6544236, 6533889]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[1016578, 2260500, 1086096, 1060985, 1043534]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[1082185, 1098066, 6534178, 1127831, 1068719]","[6534178, 6534166, 397896, 6544236, 6533889]"
2,6,[ 920308 926804 946489 1006718 1017061 ...,"[949954, 2702042, 14111721, 2017119, 878462]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[1082185, 1029743, 6534178, 1127831, 995785]","[6534178, 6534166, 397896, 6544236, 6533889]"
3,7,[ 840386 889774 898068 909714 929067 ...,"[2004835, 1018669, 12946257, 9885337, 9707530]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 981760, 999999, 1127831, 961554]","[1082185, 981760, 1127831, 999999, 961554]","[1082185, 1029743, 1127831, 995785, 1044078]","[6534178, 6534166, 397896, 6544236, 6533889]"
4,8,[ 835098 872137 910439 924610 992977 ...,"[922896, 1315743, 6554209, 6773073, 1011790]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 1098066]","[1082185, 981760, 999999, 1098066, 826249]","[1082185, 981760, 999999, 1098066, 826249]","[1082185, 1029743, 1098066, 6534178, 1127831]","[6534178, 6534166, 397896, 6544236, 6533889]"


In [11]:
# почему то сохранение таблицы произошло с изменением поле actual. при загрузке не хватает заяптых и поля преобразовынны
# в другие значени. Нормализуем
res=result.copy()

for i in range(len(res['actual'])):
    res['actual'][i]=res['actual'][i].replace('  ',' ').replace('\n ', ' ').replace('  ',' ').replace('[ ','[').replace(' ',', ')
    
res.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res['actual'][i]=res['actual'][i].replace('  ',' ').replace('\n ', ' ').replace('  ',' ').replace('[ ','[').replace(' ',', ')


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[12695726, 15717063, 858903, 12984698, 1066657]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[1082185, 1029743, 995785, 1004906, 1081177]","[6534178, 6534166, 397896, 6544236, 6533889]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1016578, 2260500, 1086096, 1060985, 1043534]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[1082185, 1098066, 6534178, 1127831, 1068719]","[6534178, 6534166, 397896, 6544236, 6533889]"


In [12]:
import ast

In [13]:
for name_col in res.columns[1:-1]:
    for n in range(len(res[name_col])):
        res[name_col][n]=ast.literal_eval(res[name_col][n])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res[name_col][n]=ast.literal_eval(res[name_col][n])


In [14]:
import os, sys
from metrics import precision_at_k, recall_at_k

In [15]:
result=res.copy()

In [16]:
for name_col in result.columns[1:]:
    print(f"{round(result.apply(lambda row: precision_at_k(row[name_col], row['actual']), axis=1).mean(),4)}:{name_col}")

1.0:actual
0.0008:random_recommendation
0.1552:popular_recommendation
0.1369:itemitem
0.1329:cosine
0.139:tfidf
0.2019:own_purchases
0.0461:weighted_random_recommendation


  return flags.sum() / len(recommended_list)


ВЫВОД:

weighted_random_recommendation похоже по составу на топ-n товаров т.к. они имеют больший вес и соответвенно рандомно выбираются чаще. Метрика же показывает что он хуже просто популярных товаров.

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [None]:
# your_code