In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
# Load the training CSV; the path is relative to the notebook's working directory.
# Judging by the preview below, each row is one purchased item within a basket.
data = pd.read_csv('./data/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# Size of the hold-out period, counted back from the last week in the data.
test_size_weeks = 3

# Compute the boundary once and use it for both halves of the split.
# NOTE(review): rows with week_no >= max - test_size_weeks go to test, so the
# test part covers weeks (max - 3) .. max inclusive, i.e. up to four distinct
# week numbers rather than three. Kept as is so the split stays unchanged.
split_week = data['week_no'].max() - test_size_weeks

data_train = data[data['week_no'] < split_week]
data_test = data[data['week_no'] >= split_week]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямо пропорционально популярности. Вес = log(sales_sum товара)

In [4]:
def weighted_random_recommendation(items_weights, n=5):
    """Sample ``n`` distinct items at random, with probability proportional to weight.

    Input
    -----
    items_weights: pd.DataFrame
        Frame with columns ``item_id`` and ``weight``. Weights are rescaled
        here to sum to 1, so both normalised and raw (unnormalised) weights
        are accepted.
    n: int
        Number of items to draw, without replacement.

    Output
    ------
    list
        The sampled item ids.

    Raises
    ------
    ValueError
        If the weights do not have a positive, finite sum, or if
        ``np.random.choice`` rejects the request (e.g. negative weights, or
        ``n`` larger than the number of items with a non-zero weight).
    """
    items = np.array(items_weights['item_id'])
    weights = np.array(items_weights['weight'], dtype=float)

    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError('weights must have a positive, finite sum')

    # np.random.choice insists on probabilities that sum to 1; normalising here
    # removes the need for every caller to do it (and is a no-op if they did).
    probabilities = weights / total

    recs = np.random.choice(items, size=n, replace=False, p=probabilities)

    return recs.tolist()

In [5]:
def clean(seq_string):
    """Parse a stringified list of integers, e.g. ``'[1, 2, 3]'``, into a list of ints.

    Used as a ``pd.read_csv`` converter for columns that store lists of item ids.

    Input
    -----
    seq_string: str
        Text of the form ``'[a, b, c]'``. Surrounding whitespace is ignored.
        ``'[]'``, ``'[ ]'`` and the empty string all yield an empty list.

    Output
    ------
    list of int

    Raises
    ------
    ValueError
        If any comma-separated token is not an integer literal.
    """
    text = seq_string.strip()
    # Drop the first and last characters (the brackets) and see what is left.
    inner = text[1:-1].strip()
    if not inner:
        return []
    # int() tolerates whitespace around each token, so ' 2' parses fine.
    return [int(token) for token in inner.split(',')]

# The listed columns hold stringified lists of item ids in the CSV, so each of
# them is routed through `clean` while reading.
result = pd.read_csv(
    './data/predictions_basic.csv',
    converters=dict.fromkeys(
        ('actual', 'random_recommendation', 'popular_recommendation',
         'itemitem', 'cosine', 'tfidf', 'own_purchases'),
        clean,
    ),
)
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[869477, 9420142, 1044193, 836286, 1134199]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[1082185, 1029743, 995785, 1004906, 1081177]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[915797, 1094625, 1128173, 1126929, 927011]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[1082185, 1098066, 6534178, 1127831, 1068719]"


In [6]:
%%time

# Weight of an item = log(1 + total sales value in the training period),
# rescaled so that the weights of all items sum to 1.
# 'sum' is passed as a string: handing the builtin `sum` to agg() triggers a
# FutureWarning in recent pandas versions.
items_weights = (
    data_train.groupby('item_id', as_index=False)
    .agg({'sales_value': 'sum'})
    .rename(columns={'sales_value': 'weight'})
)
items_weights['weight'] = np.log1p(items_weights['weight'])
items_weights['weight'] = items_weights['weight'] / items_weights['weight'].sum()

# One independent weighted draw of 5 items per user; the draw ignores user_id.
result['WRR'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5))
result.head(2)

CPU times: user 2.37 s, sys: 32.8 ms, total: 2.41 s
Wall time: 2.41 s


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,WRR
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[869477, 9420142, 1044193, 836286, 1134199]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[1082185, 1029743, 995785, 1004906, 1081177]","[902192, 947635, 825625, 895702, 13190670]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[915797, 1094625, 1128173, 1126929, 927011]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[1082185, 1098066, 6534178, 1127831, 1068719]","[1066953, 13158064, 929127, 9677551, 1019991]"


In [7]:
# Peek at the per-item sampling weights computed above.
items_weights.head(2)

Unnamed: 0,item_id,weight
0,25671,1.3e-05
1,26081,3e-06


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [8]:
# Reminder of what will be scored: `actual` plus one column per recommender.
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,WRR
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[869477, 9420142, 1044193, 836286, 1134199]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[1082185, 1029743, 995785, 1004906, 1081177]","[902192, 947635, 825625, 895702, 13190670]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[915797, 1094625, 1128173, 1126929, 927011]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[1082185, 1098066, 6534178, 1127831, 1068719]","[1066953, 13158064, 929127, 9677551, 1019991]"


In [9]:
def precision_at_k(recommended_list, bought_list, k=5):
    """Precision@k: share of the first ``k`` recommendations that were actually bought.

    NOTE: this definition shadows ``precision_at_k`` imported from
    ``implicit.evaluation`` at the top of the notebook.

    Input
    -----
    recommended_list: sequence
        Recommended item ids, best first.
    bought_list: sequence
        Item ids the user actually bought.
    k: int
        Only the first ``k`` recommendations are evaluated.

    Output
    ------
    float
        hits / number of evaluated recommendations; 0.0 if nothing was recommended.
        If fewer than ``k`` items were recommended, the denominator is the
        number of recommendations actually present.
    """
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]

    if len(recommended_list) == 0:
        return 0.0

    # Flag each *recommended* item that was bought. Testing the bought items
    # against the recommendations instead would count a repeated purchase of
    # the same item several times and could push precision above 1.
    flags = np.isin(recommended_list, bought_list)

    return flags.sum() / len(recommended_list)

### Задание 2. Измерим качество по precision@5

In [10]:
def print_scores(result, k=5):
    """Print mean precision@k for every recommendation column, in ascending order.

    Input
    ------
    result: pd.DataFrame
        Must contain an 'actual' column with the purchased items. Every column
        other than 'user_id' and 'actual' is treated as a column of
        recommendations and scored.
    k: int
        Cut-off passed to ``precision_at_k`` (default 5).
    """
    # Keep the DataFrame's column order instead of going through a set, so that
    # columns with equal scores are always printed in the same order.
    columns_to_print = [col for col in result.columns if col not in ('user_id', 'actual')]

    scores = {}
    for col in columns_to_print:
        per_user = result.apply(lambda row: precision_at_k(row[col], row['actual'], k=k), axis=1)
        scores[col] = np.mean(per_user)

    # sorted() is stable, so ties keep the column order established above.
    for col, score in sorted(scores.items(), key=lambda item: item[1]):
        print(f'{col} precision: {score}')

In [11]:
# Compare precision@5 of every recommendation column currently in `result` (printed worst to best).
print_scores(result)

random_recommendation precision: 0.0005876591576885408
WRR precision: 0.0009794319294809011
itemitem precision: 0.033202742409402554
cosine precision: 0.03525954946131244
tfidf precision: 0.036141038197845254
popular_recommendation precision: 0.15523996082272282
own_purchases precision: 0.20112634671890306


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

#### Бейзлайны

In [12]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    items = np.array(items)
    
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [13]:
def popularity_recommendation(data, n=5):
    """Top-n items by total ``sales_value``.

    ``data`` needs the columns ``item_id`` and ``sales_value``. Returns the
    item ids as a list, highest total first.
    """
    sales_by_item = data.groupby('item_id')['sales_value'].sum().reset_index()
    ranked = sales_by_item.sort_values('sales_value', ascending=False)
    return ranked.head(n).item_id.tolist()

In [14]:
# Number of distinct items in the training data, for comparison with the 5000-item shortlist.
len(data_train['item_id'].unique())

86865

In [15]:
# Candidate pool for the improved baselines: the 5000 items with the highest total sales_value in training.
top_5000 = popularity_recommendation(data_train, n=5000)

In [16]:
%%time

# Uniform random baseline restricted to the top-5000 pool; every user gets an
# independent draw (the user_id itself is not used by the sampler).
result['random_recommendation_5000'] = result['user_id'].apply(lambda x: random_recommendation(top_5000, n=5))
result.head(2)

CPU times: user 700 ms, sys: 13.4 ms, total: 713 ms
Wall time: 703 ms


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,WRR,random_recommendation_5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[869477, 9420142, 1044193, 836286, 1134199]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[1082185, 1029743, 995785, 1004906, 1081177]","[902192, 947635, 825625, 895702, 13190670]","[936753, 827578, 931579, 862070, 6979253]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[915797, 1094625, 1128173, 1126929, 927011]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[1082185, 1098066, 6534178, 1127831, 1068719]","[1066953, 13158064, 929127, 9677551, 1019991]","[870980, 1044068, 1028422, 1057260, 895348]"


In [17]:
# Same weighting as for WRR, but restricted to the 5000 items with the largest
# total sales value in the training period.
# 'sum' is passed as a string (the builtin `sum` triggers a FutureWarning in
# recent pandas versions), and the top-5000 slice is copied before columns are
# assigned into it.
items_weights = (
    data_train.groupby('item_id')
    .agg({'sales_value': 'sum'})
    .rename(columns={'sales_value': 'weight'})
    .reset_index()
    .sort_values('weight', ascending=False)
    .head(5000)
    .copy()
)

# log(1 + sales), then rescale so the weights of the kept items sum to 1.
items_weights['weight'] = np.log1p(items_weights['weight'])
items_weights['weight'] = items_weights['weight'] / items_weights['weight'].sum()

result['WRR_5000'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5))
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,WRR,random_recommendation_5000,WRR_5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[869477, 9420142, 1044193, 836286, 1134199]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[1082185, 1029743, 995785, 1004906, 1081177]","[902192, 947635, 825625, 895702, 13190670]","[936753, 827578, 931579, 862070, 6979253]","[5585510, 926523, 1065538, 884774, 873324]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[915797, 1094625, 1128173, 1126929, 927011]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[1082185, 1098066, 6534178, 1127831, 1068719]","[1066953, 13158064, 929127, 9677551, 1019991]","[870980, 1044068, 1028422, 1057260, 895348]","[1010582, 12757133, 1004906, 9523058, 1083328]"


In [18]:
# Re-score every column, now including the two top-5000 baselines added to `result`.
print_scores(result)

random_recommendation precision: 0.0005876591576885408
WRR precision: 0.0009794319294809011
random_recommendation_5000 precision: 0.005778648383937317
WRR_5000 precision: 0.0071498530852105785
itemitem precision: 0.033202742409402554
cosine precision: 0.03525954946131244
tfidf precision: 0.036141038197845254
popular_recommendation precision: 0.15523996082272282
own_purchases precision: 0.20112634671890306


#### Item-Item алгоритмы

In [19]:
# Units sold per item over the training period.
popularity = (
    data_train.groupby('item_id')['quantity']
    .sum()
    .reset_index()
    .rename(columns={'quantity': 'n_sold'})
)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [20]:
# Rebind top_5000: the 5000 items with the most units sold in training
# (the earlier top_5000 was ranked by total sales value instead).
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [21]:
# Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
# Work on an explicit copy: data_train was produced by filtering `data`, and
# assigning into that slice is what raises pandas' SettingWithCopyWarning.
data_train = data_train.copy()

# Collapse every item OUTSIDE the top-5000 into one fictitious item_id, so a user
# who bought any non-top item is recorded as having "bought" item 999999.
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train,
                                  index='user_id', columns='item_id',
                                  values='quantity',
                                  aggfunc='count',
                                  fill_value=0
                                 )

# Binarise the counts: only the fact of a purchase is kept, not how often it
# happened. The float dtype is what the original note says implicit requires.
user_item_matrix = (user_item_matrix > 0).astype(float)

# Sparse representation; csr_matrix() already returns CSR, no further conversion needed.
sparse_user_item = csr_matrix(user_item_matrix)

user_item_matrix.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# External ids in the row / column order of user_item_matrix.
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

# Positional indices 0..n-1 for the matrix rows (users) and columns (items).
matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

# Lookups: matrix position -> external id ...
id_to_itemid = {idx: item for idx, item in zip(matrix_itemids, itemids)}
id_to_userid = {idx: user for idx, user in zip(matrix_userids, userids)}

# ... and external id -> matrix position.
itemid_to_id = {item: idx for idx, item in zip(matrix_itemids, itemids)}
userid_to_id = {user: idx for idx, user in zip(matrix_userids, userids)}

In [23]:
# Значения k, по которым будем производить поиск наилучшего
k_grid = [1, 2, 5, 10, 25, 50, 100]

# Словарь моделей
models = {
    'itemitem': ItemItemRecommender,
    'cosine': CosineRecommender,
    'tfidf': TFIDFRecommender
         }

In [32]:
def calculate_score(model_type, k, result=result, user_item_matrix=user_item_matrix, id_to_itemid=id_to_itemid):
    """Fit one nearest-neighbour model, store its top-5 recommendations and score them.

    A new column named ``f'{model_type}_{k}'`` is added to ``result`` (in place)
    holding the recommended item ids per user.

    Input
    -----
    model_type: str
        Key of the global ``models`` dict: 'itemitem', 'cosine' or 'tfidf'.
    k: int
        Number of neighbours (``K``) the model is built with.
    result: pd.DataFrame
        Frame with 'user_id' and 'actual' columns. The default is bound to the
        global ``result`` at the time this function is defined.
    user_item_matrix: pd.DataFrame
        Dense user x item matrix; used both to fit the model and as the user
        history passed to ``recommend``. The default is bound at definition time.
    id_to_itemid: dict
        Matrix column index -> item_id. The default is bound at definition time.

    The mappings ``userid_to_id`` and ``itemid_to_id`` are read from the global
    namespace and must correspond to ``user_item_matrix``.

    Output
    ------
    score: float
        Mean precision@5 over all rows of ``result``.
    """
    # Build the sparse matrix from the argument, so fitting and recommending
    # always use the same data (previously recommend() read a global matrix).
    user_items = csr_matrix(user_item_matrix)

    model = models[model_type](K=k, num_threads=-1)
    model.fit(user_items.T.tocsr(), show_progress=False)

    colname = f'{model_type}_{k}'

    # Already-purchased items are filtered out for every k except k == 1.
    # NOTE(review): presumably because with one neighbour each item's nearest
    # neighbour is itself, so filtering would leave nothing - confirm.
    filter_liked = k != 1

    # Matrix index of the fictitious "everything else" item; never recommended.
    fake_item_idx = itemid_to_id[999999]

    def recommend_for(user_id):
        """Top-5 item_ids for one user."""
        recs = model.recommend(userid=userid_to_id[user_id],
                               user_items=user_items,  # user-item matrix
                               N=5,
                               filter_already_liked_items=filter_liked,
                               filter_items=[fake_item_idx],
                               recalculate_user=False)
        # Each rec is indexed as (item index, score); map the index to item_id.
        return [id_to_itemid[rec[0]] for rec in recs]

    result[colname] = result['user_id'].apply(recommend_for)

    precision_5 = np.mean(result.apply(lambda row: precision_at_k(row[colname], row['actual']), axis=1))

    return precision_5

In [33]:
# Grid search: fit every model type with every K and record its precision@5.
scores = {
    (model_type, k): calculate_score(model_type, k)
    for model_type in models.keys()
    for k in k_grid
}

# Report the configurations from worst to best.
for model, score in sorted(scores.items(), key=lambda item: item[1]):
    print(f'{model} precision: {score}')

('tfidf', 2) precision: 0.02888507998694091
('itemitem', 25) precision: 0.029480901077375124
('itemitem', 50) precision: 0.03016650342801176
('itemitem', 100) precision: 0.030362389813907934
('itemitem', 10) precision: 0.030362389813907938
('cosine', 2) precision: 0.03113777342474698
('cosine', 25) precision: 0.032713026444662105
('itemitem', 5) precision: 0.033202742409402554
('cosine', 50) precision: 0.033692458374142996
('tfidf', 50) precision: 0.03398628795298727
('tfidf', 25) precision: 0.03457394711067581
('tfidf', 100) precision: 0.0346718903036239
('cosine', 100) precision: 0.034965719882468174
('cosine', 5) precision: 0.03525954946131244
('cosine', 10) precision: 0.03525954946131244
('tfidf', 10) precision: 0.03574926542605289
('tfidf', 5) precision: 0.036141038197845254
('itemitem', 2) precision: 0.056072477962781586
('tfidf', 1) precision: 0.1302073130917401
('cosine', 1) precision: 0.17261671563826314
('itemitem', 1) precision: 0.20112634671890306


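Лучший результат дают модели, которые рекомендуют собственные покупки: `own_purchases` и `itemitem` с $K=1$ (без фильтрации уже купленных товаров) показывают одинаковый precision@5 ≈ 0.2011.
Есть различие в точности между моделями, посчитанными в этом ноутбуке, и моделями, посчитанными на уроке, с числом соседей 5. Затрудняюсь сказать, в чём причина.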