# **#2 Baselines and Deterministic item-item algorithms**

### Imports and data overview

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [3]:
data = pd.read_csv('retail_train.csv')
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [4]:
users, items, interactions = data.user_id.nunique(), data.item_id.nunique(), data.shape[0]

print('# of users: ', users)
print('# of items: ', items)
print('# of interactions: ', interactions)

# of users:  2499
# of items:  89051
# of interactions:  2396804


In [5]:
print('Description of Sales Values by Item')
popularity_item_by_sales = data.groupby('item_id')['sales_value'].sum().reset_index()
popularity_item_by_sales.sort_values(by='sales_value', ascending=False).sales_value.describe()


Description of Sales Values by Item


count     89051.000000
mean         83.458481
std        1628.715079
min           0.000000
25%           3.500000
50%          10.780000
75%          46.105000
max      467993.620000
Name: sales_value, dtype: float64

In [10]:
print('Populatiry per Item by Purchase Fact')
popularity_item_by_purchase_fact = data.groupby('item_id')['user_id'].count().reset_index()
popularity_item_by_purchase_fact.sort_values(by='user_id', ascending=False).head(5)

Populatiry per Item by Purchase Fact


Unnamed: 0,item_id,user_id
35054,1082185,27362
56233,6534178,18364
29195,1029743,13455
25333,995242,11397
37719,1106523,9175


In [7]:
# other interactions
data.loc[(data['quantity'] == 0)]

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
97,744,26985165432,1,5978648,0,0.0,31582,0.0,1119,1,0.0,0.0
128,1287,26985336468,1,5978648,0,0.0,304,0.0,1351,1,0.0,0.0
249,2305,26996870743,2,5978656,0,0.0,414,0.0,1300,1,-1.0,0.0
293,271,26997082949,2,5978656,0,0.0,329,0.0,1719,1,-2.0,0.0
694,315,27008952267,3,957951,0,0.0,327,0.0,1707,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2395898,709,41653107125,663,1076056,0,0.0,400,0.0,1559,95,0.0,0.0
2396003,338,41653143843,663,927090,0,0.0,369,0.0,1430,95,0.0,0.0
2396151,430,41653163527,663,5978656,0,0.0,433,0.0,2039,95,-1.0,0.0
2396309,1326,41653184219,663,5978656,0,0.0,318,0.0,2319,95,-1.0,0.0


### Data train-test split

In [8]:
# В рекомендательных системах корректно разбивать 
# датасет по времени, а не случайно.
## ниже берутся последние 3 недели в качестве тестовой выборки.
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.shape[0], data_test.shape[0]

(2278490, 118314)

In [9]:
# датафрейм с покупками юзеров на тестовом датасете 
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head()

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


In [11]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


### Build some Baselines
_(random_recommendations, popularity recommendations)_

[On the Difficulty of Evaluating Baselines](https://arxiv.org/pdf/1905.01395.pdf)*(pdf)*

In [12]:
# Random Recommendation

def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [13]:
%%time

items = data_train.item_id.unique()

result['random_recommendation'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))
result['random_recommendation_5k'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5000))

result.head()

CPU times: user 13.8 s, sys: 197 ms, total: 14 s
Wall time: 14.1 s


Unnamed: 0,user_id,actual,random_recommendation,random_recommendation_5k
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[131099, 10456117, 840557, 2588096, 8091644]","[895327, 1857895, 854900, 10352375, 13944765, ..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1007484, 13072895, 12649144, 1745709, 874954]","[1064386, 990851, 7442091, 1033238, 901668, 88..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[5571749, 13417334, 15800945, 1031866, 1228798]","[1075796, 1902501, 906234, 907153, 1048851, 88..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[953609, 1138474, 10285149, 1088889, 5591395]","[1017472, 9553047, 17291184, 1019709, 13382070..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[12523737, 833041, 890099, 982475, 1109976]","[5133292, 6919276, 5592931, 2160695, 1030995, ..."


In [14]:
# TopPop

def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)

    recs = popular.head(n).item_id
    return recs.tolist()

In [15]:
%%time
# Составление рекомендаций для тренировочного датасета
popular_recs = popularity_recommendation(data_train)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)
result['popular_recommendation_5k'] = result['user_id'].apply(lambda x: popularity_recommendation(data_train, n=5000))

result.head(3)

CPU times: user 6min 44s, sys: 1min 26s, total: 8min 10s
Wall time: 8min 17s


Unnamed: 0,user_id,actual,random_recommendation,random_recommendation_5k,popular_recommendation,popular_recommendation_5k
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[131099, 10456117, 840557, 2588096, 8091644]","[895327, 1857895, 854900, 10352375, 13944765, ...","[6534178, 6533889, 1029743, 6534166, 1082185]","[6534178, 6533889, 1029743, 6534166, 1082185, ..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1007484, 13072895, 12649144, 1745709, 874954]","[1064386, 990851, 7442091, 1033238, 901668, 88...","[6534178, 6533889, 1029743, 6534166, 1082185]","[6534178, 6533889, 1029743, 6534166, 1082185, ..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[5571749, 13417334, 15800945, 1031866, 1228798]","[1075796, 1902501, 906234, 907153, 1048851, 88...","[6534178, 6533889, 1029743, 6534166, 1082185]","[6534178, 6533889, 1029743, 6534166, 1082185, ..."


### Task 1. Random Weighted Recommendation

In [16]:
# Random Weighted Recommendation

def weighted_random_recommendation(items_weights, n=5):
    """Случайные взвешенные рекоммендации
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    items = items_weights.item_id
    weights = np.abs(items_weights.weight)
    recs = np.random.choice(items, p=weights, size=n, replace=False)
    
    return recs.tolist()

In [17]:
popular = data_train.groupby('item_id')['sales_value'].sum().reset_index()
print(round(popular.loc[popular.sales_value < 1].shape[0] / popular.shape[0] * 100), "% of observations have sales_value less than 1")

4 % of observations have sales_value less than 1


In [18]:
%%time

popular.sort_values('sales_value', ascending=False, inplace=True)
popular = popular.loc[popular['sales_value'] > 1]

# get weights 
p = popular['sales_value'].apply(lambda x: np.log(x) / popular.shape[0])
p /= p.sum()

# join new column by id
popular['weight'] = p
items_weights = popular[['item_id', 'weight']]

# get weighted random recommendation for each user
result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights))
result['weighted_random_recommendation_5k'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5000))

CPU times: user 16.1 s, sys: 282 ms, total: 16.4 s
Wall time: 19.8 s


In [19]:
result[['weighted_random_recommendation', 'weighted_random_recommendation_5k']].head(3)

Unnamed: 0,weighted_random_recommendation,weighted_random_recommendation_5k
0,"[1100821, 965606, 1119644, 1120970, 13008079]","[938409, 105599, 1067400, 5995246, 945800, 141..."
1,"[15800767, 1021164, 8181451, 959847, 1227722]","[895611, 1097906, 7443178, 12523855, 1108106, ..."
2,"[1109642, 6979321, 944249, 9522331, 16805946]","[942459, 13382432, 9526427, 5567506, 1100379, ..."


### Calc baselines quality

In [20]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

new_cols = result.columns[2:]
for col in new_cols:
    precision = result.apply(lambda row: precision_at_k(row[col], row['actual']), axis=1).mean()
    print(f'Precision@5 {col}: {round(precision, 4)*100}%')

Precision@5 random_recommendation: 0.06%
Precision@5 random_recommendation_5k: 0.08%
Precision@5 popular_recommendation: 15.52%
Precision@5 popular_recommendation_5k: 15.52%
Precision@5 weighted_random_recommendation: 0.2%
Precision@5 weighted_random_recommendation_5k: 0.11%


Вывод: качество предсказаний бейзлайнов не улучшается (и может ухудшиться) с семплированием наблюдений (н-р, топ 5000, как в коде выше). 
Полнота precision обратно зависима от количества рекомендаций.