In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [4]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [5]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    items = np.array(items_weights["item_id"])
    weights = np.array(items_weights["weights"])
    
    recs = np.random.choice(items, size=n, replace=False, p=weights)
    
    return recs.tolist()

In [6]:
data_train

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2282320,222,41297772783,635,1120741,1,0.59,304,0.00,1716,91,0.0,0.0
2282321,462,41297773713,635,993339,1,1.99,304,0.00,2040,91,0.0,0.0
2282322,462,41297773713,635,995242,1,1.00,304,-0.89,2040,91,0.0,0.0
2282323,462,41297773713,635,10180324,1,3.00,304,-0.29,2040,91,0.0,0.0


In [7]:
# find weights by weight_function
weight_function = lambda x: np.log(x+1)

In [8]:
def get_weights(data):
    items_weights = data.groupby("item_id")["sales_value"].sum().reset_index()
    items_weights["sales_value"] = items_weights["sales_value"].apply(weight_function)
    total_weight = items_weights["sales_value"].sum()
    items_weights = items_weights.rename(columns={"sales_value": "weights"})
    items_weights["weights"] = items_weights["weights"].apply(lambda x: x / total_weight)
    return items_weights

In [9]:
items_weights = get_weights(data_train)

In [10]:
# check sum
items_weights["weights"].sum()

1.0

In [11]:
#test
n = 10
random_res = random_recommendation(items=data_test["item_id"], n=n)
items_weights = get_weights(data_test)
weight_random_res = weighted_random_recommendation(items_weights=items_weights, n=n)

In [12]:
random_res, weight_random_res

([1056162,
  981760,
  1098161,
  854373,
  13007846,
  1125904,
  1060005,
  914190,
  834662,
  1035321],
 [12695858,
  960791,
  1127520,
  12523757,
  1054917,
  9836519,
  7467039,
  921277,
  827047,
  10356789])

In [13]:
result = data_test.groupby("user_id")["item_id"].unique().reset_index()
result["random_sampler"] = result["user_id"].apply(lambda x: random_recommendation(data_test["item_id"], n=n))
result["weight_random_sampler"] = result["user_id"].apply(lambda x: weighted_random_recommendation(items_weights, n=n))
result.head(5)

Unnamed: 0,user_id,item_id,random_sampler,weight_random_sampler
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[914754, 1025611, 951746, 963727, 864705, 1082...","[972569, 866140, 977330, 1004568, 987650, 9568..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1082185, 5585735, 1026334, 1016314, 1005274, ...","[13877124, 1098025, 1121367, 17106323, 868075,..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[5978656, 5995609, 15971883, 1010578, 835098, ...","[1319520, 16219317, 1044127, 939189, 8181119, ..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[12384694, 1100140, 844179, 13945288, 1101706,...","[13158992, 1043279, 7409599, 995816, 5575652, ..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[845208, 866140, 902172, 983096, 6534178, 9115...","[1134956, 1011459, 6772875, 15716427, 1017896,..."


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [14]:
def indicate_at_k(recommended_list: list, bought_list: list, k=-1):
    recommended_list = np.asarray(recommended_list) if k == -1 else np.asarray(recommended_list)[:k]
    bought_list = np.asarray(bought_list)
    
    return np.isin(recommended_list, bought_list)

In [15]:
def preccision_at_k(recommended_list, bought_list, k=-1):
    indication = indicate_at_k(recommended_list, bought_list, k=k)
    if k != -1:
        recommended_list = recommended_list[:k]
        
    precision = indication.sum() / len(recommended_list)
    
    return precision

In [17]:
for k in range(1, 5):
    result[f"random_p@{k}"] = result.apply(lambda row: preccision_at_k(row["random_sampler"], row["item_id"], k=k), axis =1)
    result[f"weight_random_p@{k}"] = result.apply(lambda row: preccision_at_k(row["weight_random_sampler"], row["item_id"], k=k), axis=1)

In [18]:
result.describe()

Unnamed: 0,user_id,random_p@1,weight_random_p@1,random_p@2,weight_random_p@2,random_p@3,weight_random_p@3,random_p@4,weight_random_p@4
count,2042.0,2042.0,2042.0,2042.0,2042.0,2042.0,2042.0,2042.0,2042.0
mean,1257.93095,0.019589,0.003428,0.018609,0.002693,0.019589,0.003265,0.019344,0.003428
std,718.052041,0.138616,0.058463,0.098477,0.036608,0.082475,0.034453,0.070384,0.030115
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,648.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1260.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1879.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2500.0,1.0,1.0,1.0,0.5,0.666667,0.666667,0.5,0.5


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [19]:
n = 5000
result = data_test.groupby("user_id")["item_id"].unique().reset_index()
result["random_sampler"] = result["user_id"].apply(lambda x: random_recommendation(data_test["item_id"], n=n))
result["weight_random_sampler"] = result["user_id"].apply(lambda x: weighted_random_recommendation(items_weights, n=n))
result.head(5)

Unnamed: 0,user_id,item_id,random_sampler,weight_random_sampler
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[12731683, 1135971, 844498, 1057855, 1013149, ...","[1116476, 12757377, 835243, 854056, 7442880, 1..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1055863, 1075918, 874972, 866778, 1014206, 92...","[828896, 9655482, 1081177, 872137, 870217, 155..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1046201, 1096343, 13115524, 955747, 1125497, ...","[824819, 830920, 15596128, 883107, 1054545, 92..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1082185, 838261, 1106523, 8090546, 913210, 15...","[9835509, 980353, 7142861, 9445502, 1002240, 8..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[15716719, 921852, 842707, 1068292, 1000542, 1...","[1119993, 987237, 17106166, 12301839, 1033187,..."


In [20]:
for k in range(5000, 5001):
    result[f"random_p@{k}"] = result.apply(lambda row: preccision_at_k(row["random_sampler"], row["item_id"], k=k), axis =1)
    result[f"weight_random_p@{k}"] = result.apply(lambda row: preccision_at_k(row["weight_random_sampler"], row["item_id"], k=k), axis=1)

In [21]:
result.describe()

Unnamed: 0,user_id,random_p@5000,weight_random_p@5000
count,2042.0,2042.0,2042.0
mean,1257.93095,0.020677,0.003105
std,718.052041,0.018218,0.002986
min,1.0,0.0,0.0
25%,648.5,0.0056,0.0008
50%,1260.5,0.0166,0.0022
75%,1879.75,0.0314,0.0044
max,2500.0,0.122,0.0212
