In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

from metrics import hit_rate_at_k, recall_at_k, precision_at_k
# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

In [2]:
# Load the raw retail transactions; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../Manuals/webinar_2/retail_train.csv')
# Peek at the first two rows to check that the columns were parsed as expected.
data.head(n=2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# Number of trailing weeks held out for evaluation.
test_size_weeks = 3

# Week number at which the test part begins (inclusive).
# NOTE(review): because the boundary week itself goes to the test part, the test
# part seems to cover test_size_weeks + 1 distinct week numbers when week_no is a
# gap-free integer counter — confirm this is intended before changing it.
first_test_week = data['week_no'].max() - test_size_weeks

data_train = data[data['week_no'] < first_test_week]
data_test = data[data['week_no'] >= first_test_week]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямо пропорционально популярности. Вес = log(sales_sum товара)

In [4]:
# Total sales value of every item over the training period.
total_sales = data_train.groupby('item_id')['sales_value'].sum()

# The task asks for weight ~ log(sales_sum). log1p (= log(1 + x)) is used instead of
# a plain log so that items with a sales sum below 1 do not get a negative weight;
# the clip guards against a negative sum, which would otherwise produce NaN.
log_sales = np.log1p(total_sales.clip(lower=0))

# Normalise so that the weights sum to 1, as weighted_random_recommendation() documents.
items_weights = pd.DataFrame({
    'item_id': log_sales.index.to_numpy(),
    'weights': (log_sales / log_sales.sum()).to_numpy(),
})

In [5]:
def weighted_random_recommendation(items_weights_, n=5, random_state=None):
    """Recommend random items, sampled proportionally to a per-item weight.

    Items are drawn without replacement, so the returned list has no duplicates.

    Parameters
    ----------
    items_weights_ : pd.DataFrame
        Frame with the columns ``item_id`` and ``weights``. The weights are
        expected to sum to 1 over all items.
    n : int, default 5
        Number of items to recommend.
    random_state : int, numpy.random.RandomState or None, default None
        Seed / generator forwarded to ``Series.sample``. Pass a value to make
        the recommendations reproducible; ``None`` keeps the previous
        non-deterministic behaviour.

    Returns
    -------
    list
        ``n`` sampled ``item_id`` values.

    Raises
    ------
    ValueError
        Raised by pandas when ``n`` exceeds the number of available items
        (sampling is done without replacement).
    """
    sampled_items = items_weights_['item_id'].sample(
        n,
        weights=items_weights_['weights'],
        random_state=random_state,
    )
    return sampled_items.tolist()

In [6]:
# Smoke check: draw five weighted-random item ids (the output differs between runs).
weighted_random_recommendation(items_weights, n=5)

[12263172, 962721, 867547, 985709, 995242]

### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [7]:
# Load the precomputed baseline predictions (comma-separated, which is the read_csv default).
result = pd.read_csv('../Manuals/webinar_2/predictions_basic.csv')
# Show the first three rows; the list-like columns are still plain strings here.
result.head(n=3)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[689420, 5571734, 598152, 15596518, 6904409]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 6666, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 6666, 1098066]","[1082185, 995242, 1029743, 840361, 904360]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[1371922, 13039690, 973340, 1311202, 1099332]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 6666, 826249]","[1082185, 981760, 1098066, 826249, 6666]","[1082185, 1098066, 6534178, 826249, 1127831]"
2,6,[ 920308 926804 946489 1006718 1017061 ...,"[12132657, 978937, 1085229, 875030, 1021438]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 6666, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 6666, 878996]","[1082185, 981760, 995242, 1029743, 840361]"


In [8]:
# The list-like columns are read from the CSV as strings; turn them into numeric arrays.
# 'actual' is whitespace-separated inside the brackets ...
actual_tokens = result.actual.str.strip('[]').str.split()
result.actual = np.array(actual_tokens.apply(pd.to_numeric))

# ... while every column from the third one onwards is comma-separated.
for rec_col in result.columns.values[2:]:
    rec_tokens = result[rec_col].str.strip('[]').str.split(',')
    result[rec_col] = np.array(rec_tokens.apply(pd.to_numeric))

In [9]:
# Metric label -> metric function; each is called as func(recommended, actual).
metrics = {'Hit Rate @ 5': hit_rate_at_k, 'Precision @ 5': precision_at_k, 'Recall @ 5': recall_at_k}

# Evaluate every column from the third one onwards (the recommendation columns)
# against 'actual', averaging the per-user metric and rounding to 4 decimals.
# The previous version wrote `temp[col] = data=round(...)`, a chained assignment
# that silently replaced the global `data` DataFrame with a float.
metric_rows = {}
for name, func in metrics.items():
    metric_rows[name] = {
        col: round(result.apply(lambda row: func(row[col], row['actual']), axis=1).mean(), 4)
        for col in result.columns[2:]
    }

# Build the table in one go instead of DataFrame.append in a loop (removed in pandas 2.0).
# Rows are metrics, columns are algorithms; columns are sorted alphabetically to keep
# the layout the append-based version produced.
res_metrics = pd.DataFrame.from_dict(metric_rows, orient='index').sort_index(axis=1)

In [10]:
# Display the metrics table: one row per metric, one column per algorithm.
res_metrics

Unnamed: 0,cosine,itemitem,own_purchases,popular_recommendation,random_recommendation,tfidf
Hit Rate @ 5,0.4765,0.4868,0.6126,0.5313,0.002,0.4853
Precision @ 5,0.1329,0.1368,0.2193,0.1552,0.0004,0.139
Recall @ 5,0.0148,0.0157,0.0289,0.025,0.0,0.0154


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [11]:
? ItemItemRecommender

[0;31mInit signature:[0m  [0mItemItemRecommender[0m[0;34m([0m[0mK[0m[0;34m=[0m[0;36m20[0m[0;34m,[0m [0mnum_threads[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Base class for Item-Item Nearest Neighbour recommender models
here.

Parameters
----------
K : int, optional
    The number of neighbours to include when calculating the item-item
    similarity matrix
num_threads : int, optional
    The number of threads to use for fitting the model. Specifying 0
    means to default to the number of cores on the machine.
[0;31mFile:[0m           ~/.local/lib/python3.8/site-packages/implicit/nearest_neighbours.py
[0;31mType:[0m           ABCMeta
[0;31mSubclasses:[0m     CosineRecommender, TFIDFRecommender, BM25Recommender


In [12]:
# your_code