In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
# Load the raw transaction log (path is relative to the notebook's working directory).
data = pd.read_csv('retail_train.csv')
# Preview the first two rows to check the column layout.
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# Time-based split: the most recent weeks are held out for evaluation,
# everything before them is used for training.
test_size_weeks = 3

# First week number that goes to the test split. The bound is inclusive (>=),
# so the test split covers week numbers first_test_week .. max, i.e. up to
# test_size_weeks + 1 distinct week numbers.
first_test_week = data['week_no'].max() - test_size_weeks

data_train = data[data['week_no'] < first_test_week]
data_test = data[data['week_no'] >= first_test_week]

In [4]:
# Summary statistics of the training split, as a quick sanity check of value ranges.
data_train.describe()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
count,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0
mean,1271.764,32945260000.0,349.1402,2791955.0,100.6171,3.09511,2992.061,-0.5393603,1562.467,50.56328,-0.01646478,-0.002915685
std,726.9816,3964679000.0,167.6271,3673791.0,1153.002,4.196106,8693.638,1.23608,402.5741,23.94798,0.2179563,0.03995998
min,1.0,26984850000.0,1.0,25671.0,0.0,0.0,1.0,-130.02,0.0,1.0,-55.93,-7.7
25%,654.0,30035460000.0,208.0,916767.0,1.0,1.27,330.0,-0.69,1306.0,30.0,0.0,0.0
50%,1271.0,32149760000.0,351.0,1027068.0,1.0,2.0,370.0,-0.02,1615.0,51.0,0.0,0.0
75%,1914.0,34338250000.0,494.0,1131351.0,1.0,3.49,422.0,0.0,1846.0,71.0,0.0,0.0
max,2500.0,41297770000.0,635.0,17829230.0,89638.0,840.0,34280.0,3.99,2359.0,91.0,0.0,0.0


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямо пропорционально популярности. Вес = log(sales_sum товара)

In [5]:
def weighted_random_recommendation(items_weights, n=5, random_state=None):
    """Sample up to `n` distinct items at random, with probability proportional to weight.

    Parameters
    ----------
    items_weights : pd.DataFrame
        Frame with the columns ``item_id`` and ``weight``. Weights must be finite
        and non-negative. They are rescaled to sum to 1 before sampling, so
        weights that are already normalised behave as before.
    n : int, default 5
        Number of recommendations requested. Sampling is done without
        replacement, so if fewer than ``n`` items have a positive weight only
        that many items are returned.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. ``None`` draws from
        NumPy's global random state (the original behaviour).

    Returns
    -------
    list
        Sampled item ids, without repetition. Empty if no item can be sampled.

    Raises
    ------
    ValueError
        If any weight is negative, NaN or infinite.
    """
    items = np.asarray(items_weights['item_id'])
    weights = np.asarray(items_weights['weight'], dtype=float)

    if not np.all(np.isfinite(weights)) or np.any(weights < 0):
        raise ValueError('weights must be finite and non-negative')

    # Without replacement, only items with positive probability can be drawn;
    # asking numpy for more than that raises, so cap the sample size instead.
    n_available = int(np.count_nonzero(weights > 0))
    size = min(int(n), n_available)
    if size <= 0:
        return []

    # Rescale so callers may pass any non-negative weights, not only a distribution.
    probabilities = weights / weights.sum()

    sampler = np.random if random_state is None else np.random.RandomState(random_state)
    recs = sampler.choice(items, size=size, replace=False, p=probabilities)

    return recs.tolist()

In [6]:
%%time

# Weight of an item = its share of the total sales_value in the training split.
# NOTE(review): the task statement asks for weights proportional to
# log(sales_sum); this cell uses the raw sales share. Confirm which one is
# intended before comparing the resulting metric with other baselines.
items_weights = data_train.groupby('item_id')['sales_value'].sum().reset_index()
items_weights['weight'] = items_weights['sales_value'] / items_weights['sales_value'].sum()
# Keyword form: passing the axis positionally to DataFrame.drop is no longer
# accepted as of pandas 2.0.
items_weights = items_weights.drop(columns='sales_value')

wrandom_recs = weighted_random_recommendation(items_weights)

print('Рекомендации: ', wrandom_recs)
print('Weight sum:' , items_weights['weight'].sum())

Рекомендации:  [983646, 874585, 856119, 860703, 6534178]
Weight sum: 0.9999999999999999
Wall time: 90.8 ms


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [48]:
# Load the stored per-user predictions (path is relative to the notebook's working directory).
result = pd.read_csv('predictions_basic.csv')
# Preview the first two rows to check the column layout.
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [49]:
# Look at the raw 'actual' value of the first row to see how it is stored in the CSV.
result.iloc[0]['actual']

'[  821867   834484   856942   865456   889248   907957   914190   943316\n   951954   954486   958046   962568   969231   971585   979707   986947\n   990656   995242  1004906  1005186  1042083  1050310  1060819  1062002\n  1064441  1069733  1074612  1082185  1131115  1132771  6534544 13876341\n 15971874 17178953   883616   917704   931860   961554  1002032  1031190\n  8090541  8293439  9297615  9527329 15926712  1049998   861272   869465\n   877373   908213   933913   940947   945809   959316   978974  1031697\n  1041796  1048918  1081189  1101422  1115576  1122428  1132231  1132814\n  5577022  8091601  9296986  9677939 10356149 13417048 15741823 15830875]'

In [50]:
# Convert 'actual' from its CSV text form (a bracketed, whitespace-separated
# list of ids that may contain embedded newlines) into a list of ints.
# str.split() without a separator splits on any whitespace, newlines included,
# so no separate newline removal is needed.
# Values that are not strings are passed through as lists, which keeps the cell
# safe to re-run after the column has already been converted.
result['actual'] = result['actual'].apply(
    lambda raw: [int(token) for token in raw.strip('[]').split()]
    if isinstance(raw, str) else list(raw)
)

In [52]:
# Check the Python type of the first row's 'actual' value after parsing.
type(result.iloc[0]['actual'])

list

In [53]:
# Metric functions from webinar 1
import os, sys

# Put the parent of the current working directory on sys.path so that the
# `src` package (presumably located there) can be imported.
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# NOTE: this rebinds `precision_at_k`, which the first cell imported from
# implicit.evaluation; the cells below therefore call the src.metrics version.
from src.metrics import precision_at_k, recall_at_k

In [54]:
%%time
# One independent weighted-random draw per row of `result`; the row contents
# themselves are not used by the sampler.
weighted_recs = [weighted_random_recommendation(items_weights) for _ in range(len(result))]
result['weighted_random_recommendation'] = pd.Series(weighted_recs, index=result.index, dtype=object)
result.head(2)

Wall time: 1.66 s


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]","[12810393, 840361, 1054814, 9884698, 1138443]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]","[15716445, 866548, 979468, 1136257, 946793]"


In [55]:
# Spot-check the recommendations generated for the first row.
result.iloc[0]['weighted_random_recommendation']

[12810393, 840361, 1054814, 9884698, 1138443]

In [56]:
# precision_at_k of the weighted-random recommendations against the actual
# purchases, computed per row and averaged over all rows.
pd.Series(
    [
        precision_at_k(recommended, bought)
        for recommended, bought in zip(result['weighted_random_recommendation'], result['actual'])
    ]
).mean()

0.022918707149853147

Сравним с результатами, полученными на уроке:

result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()
* 0.0002938295788442704

result.apply(lambda row: precision_at_k(row['popular_recommendation'], row['actual']), axis=1).mean()
* 0.15523996082272082

result.apply(lambda row: precision_at_k(row['itemitem'], row['actual']), axis=1).mean()
* 0.13692458374142857

result.apply(lambda row: precision_at_k(row['cosine'], row['actual']), axis=1).mean()
* 0.13290891283055686

result.apply(lambda row: precision_at_k(row['tfidf'], row['actual']), axis=1).mean()
* 0.1389813907933383

result.apply(lambda row: precision_at_k(row['own_purchases'], row['actual']), axis=1).mean()
* 0.17969311132876015

#### Вывод: получается гораздо лучше, чем случайные рекомендации, но хуже, чем любая из остальных рассмотренных моделей

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [13]:
# your_code