In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [51]:
data = pd.read_csv('data/retail_train.csv')
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [52]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2396804 entries, 0 to 2396803
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   user_id            int64  
 1   basket_id          int64  
 2   day                int64  
 3   item_id            int64  
 4   quantity           int64  
 5   sales_value        float64
 6   store_id           int64  
 7   retail_disc        float64
 8   trans_time         int64  
 9   week_no            int64  
 10  coupon_disc        float64
 11  coupon_match_disc  float64
dtypes: float64(4), int64(8)
memory usage: 219.4 MB


In [53]:
data.describe()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
count,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0
mean,1271.904,33366430000.0,363.977,2827247.0,100.3763,3.100821,3048.227,-0.5400708,1561.714,52.68156,-0.01638696,-0.002897905
std,726.5644,4284798000.0,175.9385,3732798.0,1152.379,4.210229,8785.542,1.245824,401.5691,25.1331,0.2168615,0.03974618
min,1.0,26984850000.0,1.0,25671.0,0.0,0.0,1.0,-130.02,0.0,1.0,-55.93,-7.7
25%,655.0,30087140000.0,216.0,916993.0,1.0,1.29,330.0,-0.69,1307.0,32.0,0.0,0.0
50%,1271.0,32419980000.0,366.0,1027569.0,1.0,2.0,370.0,-0.02,1614.0,53.0,0.0,0.0
75%,1914.0,35145800000.0,515.0,1132178.0,1.0,3.49,422.0,0.0,1844.0,74.0,0.0,0.0
max,2500.0,41656790000.0,663.0,18024560.0,89638.0,840.0,34280.0,3.99,2359.0,95.0,0.0,0.0


In [7]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [61]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code
    items = np.array(items_weights['item_id'])
    weights = np.array(items_weights['weights'])
    recs = np.random.choice(items, size=n, p=weights, replace=False)   
    
    return recs.tolist()

In [72]:
#Группируем товары по item_id и суммируем их sales, затем нормируем, получая веса
items_weights = data_train.groupby('item_id').agg(sales_sum=('sales_value', 'sum')).reset_index()
items_weights['weights'] = items_weights[['sales_sum']].apply(lambda x: x / np.sum(x))

In [73]:
items_weights

Unnamed: 0,item_id,sales_sum,weights
0,25671,20.94,2.969296e-06
1,26081,0.99,1.403822e-07
2,26093,1.59,2.254623e-07
3,26190,1.54,2.183723e-07
4,26355,1.98,2.807644e-07
...,...,...,...
86860,17381856,0.00,0.000000e+00
86861,17382205,7.99,1.132983e-06
86862,17383227,4.49,6.366828e-07
86863,17827644,2.50,3.545005e-07


In [63]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actuality']
result.head()

Unnamed: 0,user_id,actuality
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


In [64]:
%%time
result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5))
result.head(5)

CPU times: user 3.61 s, sys: 636 ms, total: 4.25 s
Wall time: 4.37 s


Unnamed: 0,user_id,actuality,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1050851, 884145, 1032060, 1108940, 826666]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1121213, 12301405, 965267, 1136654, 1040807]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[13008461, 860439, 932456, 13008328, 9677109]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[933637, 1085604, 832678, 872137, 939752]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[871756, 868386, 9837501, 892317, 906643]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [108]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    
    flags = np.isin(bought_list, recommended_list)
    return flags.sum() / len(recommended_list) 

In [109]:
result = pd.read_csv('output/predictions_basic.csv')
result['actual'] = result['actual'].apply(lambda x: np.fromstring(x.replace('[','')
                                                                        .replace(']',''),sep=' ')).apply(lambda x: np.array(x.astype('int')))
for col in result.columns[2:]:
    result[col] = result[col].apply(eval).apply(np.array)

result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5)).apply(np.array)
result.head()

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[9485371, 17104661, 9837998, 12171628, 13777104]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]","[8091006, 1072438, 898068, 1042942, 8119123]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[13910038, 48728, 13416521, 12263605, 828300]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]","[1127624, 1817725, 6534178, 13876877, 1144581]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1109352, 10150064, 9878845, 893398, 6554122]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[999999, 1082185, 1029743, 6534178, 1127831]","[13158494, 878715, 1139651, 1135768, 13115700]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[966390, 12781773, 891659, 1107651, 6442464]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 981760, 999999, 1127831, 961554]","[1082185, 981760, 1127831, 999999, 961554]","[999999, 1082185, 1029743, 1127831, 995785]","[6534178, 917772, 1085604, 1095486, 8354399]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[52240, 1972104, 978851, 10144585, 921383]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 1098066]","[1082185, 981760, 999999, 1098066, 826249]","[1082185, 981760, 999999, 1098066, 826249]","[999999, 1082185, 1029743, 1098066, 6534178]","[914190, 926320, 1108168, 5584368, 1007484]"


In [110]:
result.columns[2:]

Index(['random_recommendation', 'popular_recommendation', 'itemitem', 'cosine',
       'tfidf', 'own_purchases', 'weighted_random_recommendation'],
      dtype='object')

In [114]:
# your_code
for col in result.columns[2:]:
    print(f"{round(result.apply(lambda row: precision_at_k(row[col], row['actual']), axis=1).mean(), 4):<6} - {col}")

0.0003 - random_recommendation
0.1552 - popular_recommendation
0.1369 - itemitem
0.1329 - cosine
0.139  - tfidf
0.1797 - own_purchases
0.024  - weighted_random_recommendation


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [None]:
# your_code