[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/shashist/recsys-course/blob/master/week_02_neighbourhood_based/rs_seminar1.ipynb)

In [3]:
# !pip install -q rs_datasets
!pip install scipy

Collecting scipy
  Downloading scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Downloading scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.7/37.7 MB[0m [31m9.7 MB/s[0m  [33m0:00:03[0mm0:00:01[0m00:01[0mm
[?25hInstalling collected packages: scipy
Successfully installed scipy-1.15.3


In [4]:
from copy import deepcopy

import numpy as np
import pandas as pd
import scipy.sparse as sp
from rs_datasets import MovieLens
from scipy.sparse import csr_matrix, dok_matrix

## 0. MovieLens-1M dataset

- probably the most popular dataset in recommender systems
- `user_id` ranges from 1 to 6040
- `item_id` ranges from 1 to 3952
- 1000209 ratings available

In [5]:
%%time
# Load the MovieLens-1M dataset through rs_datasets and print a preview of
# its tables (ratings, users, items).
movielens = MovieLens('1m')
movielens.info()

5.93MB [01:28, 67.3kB/s]                              


ratings


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968



users


Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117



items


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance



CPU times: user 1.9 s, sys: 377 ms, total: 2.28 s
Wall time: 1min 28s


The dataset comes in several versions; see the [paper](http://files.grouplens.org/papers/harper-tiis2015.pdf) for details.

<img src="https://raw.githubusercontent.com/shashist/recsys-course/master/week_02_neighbourhood_based/ml_versions.png" width=700>

## 1. Validation strategy (date split)

In [7]:
# Interaction log: one row per (user_id, item_id, rating, timestamp).
log = movielens.ratings
log.head(4)

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275


In [8]:
# Timestamp at the 0.8 quantile of all interactions. interpolation='nearest'
# returns a timestamp that actually occurs in the data instead of an
# interpolated value.
# NOTE(review): `time_treshold` is a misspelling of "threshold"; the name is
# also used by the train/test split cell, so rename both together.
time_treshold = log['timestamp'].quantile(q=0.8, interpolation='nearest')
time_treshold

np.int32(975768738)

In [9]:
# Date split: interactions up to and including the threshold go to train,
# strictly later interactions go to test.
train_mask = log['timestamp'] <= time_treshold
test_mask = log['timestamp'] > time_treshold
train_log = log.loc[train_mask]
test_log = log.loc[test_mask]
print(len(train_log), len(test_log))

800168 200041


In [11]:
# Users that occur in both the test log and the train log.
users_intersection = set(test_log['user_id']).intersection(train_log['user_id'])
# Number of distinct test users vs. how many of them also have train history.
print(test_log['user_id'].nunique(), len(users_intersection))

1783 1143


In [12]:
# Sorted list of the user ids that are evaluated (sorted() accepts the set directly).
test_users = sorted(users_intersection)

More examples of splitting are available [here](https://github.com/sb-ai-lab/RePlay/blob/main/examples/04_splitters.ipynb)

## 2. Metrics

In [13]:
# Cut-off for the @K metrics: number of top recommendations evaluated per user.
K = 10

#### HitRate

$$HitRate@K(u) = \max_{j \in [1..K]}\mathbb{1}_{r_{uj}}$$


$$ HitRate@K = \frac{\sum_{u=1}^{N}HitRate@K(u)}{N} $$

$\mathbb{1}_{r_{uj}}$ is an indicator function equal to 1 if user $u$ interacted with the item recommended at position $j$, and 0 otherwise; $N$ is the number of users.

In [None]:
def user_hr(row):
    """
    Calculate the HitRate value for a single user.

    'row' is a mapping (e.g. a dataframe row) that contains
        a list of ground truth items in ``gt_list`` and
        a list of recommended items in ``pred_list``.
    Items must be hashable (e.g. integer ids).

    Return 1 if at least one recommended item is among the ground truth
    items, otherwise 0 (including when either list is empty).
    """
    # A set gives constant-time membership checks instead of scanning the
    # ground truth list once per recommended item.
    relevant = set(row['gt_list'])
    return int(any(item in relevant for item in row['pred_list']))

#### Coverage

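$$Coverage@K=\frac{\left|\bigcup\limits_{u\in U} y_u\right|}{|I|}$$

where $y_u$ is the set of top-$K$ items recommended to user $u$, $U$ is the set of users and $I$ is the set of all items in the catalogue.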


In [None]:
def coverage(pred, k, all_items=train_log['item_id']):
    """
    Share of the item catalogue that appears in the top-k recommendations.

    pred: dataframe of recommendations; it is cropped with
        ``leave_top_k(pred, k)`` and read through its 'item_id' column.
    k: number of top recommendations per user to take into account.
    all_items: series of item ids forming the catalogue (duplicates are
        ignored); defaults to the items of the train log.
    Return the fraction of distinct catalogue items recommended at least once.
    """
    recommended = set(leave_top_k(pred, k)['item_id'].values)
    catalogue = set(all_items.values)
    return len(recommended.intersection(catalogue)) / len(catalogue)

#### Wrapping

In [None]:
def metric_wrap(pred, ground_truth, k, metric_by_user):
    """
    Evaluate a per-user metric on top-k recommendations.

    Builds a dataframe indexed by 'user_id' with two columns:
    'gt_list' - list of items the user has in ``ground_truth``;
    'pred_list' - list of the user's top-k recommended items, most relevant
    first (an empty list for users that have no recommendations).
    ``metric_by_user`` is then applied to every row of that dataframe.

    Return a tuple: (mean metric value, series with the metric value of each user).
    """
    def _as_list(value):
        # Users missing from the predictions get NaN from the left join.
        return value if isinstance(value, list) else []

    top_k = leave_top_k(pred, k)
    ordered = top_k.sort_values(['user_id', 'rating'], ascending=False)
    pred_lists = ordered.groupby('user_id')['item_id'].apply(list).rename('pred_list')
    gt_lists = ground_truth.groupby('user_id')['item_id'].apply(list).rename('gt_list')

    # Ground truth drives the index, so every test user is scored.
    per_user = gt_lists.to_frame().join(pred_lists, how='left')
    per_user['pred_list'] = per_user['pred_list'].apply(_as_list)

    user_values = per_user.apply(metric_by_user, axis=1)
    return user_values.mean(), user_values

In [None]:
def leave_top_k(pred: pd.DataFrame,
                 k: int=K,
                 group_by_col: str='user_id',
                 order_by_col: str='rating') -> pd.DataFrame:
    """
    Keep at most ``k`` best rows per group (per user by default).

    Rows are ranked inside each ``group_by_col`` group by ``order_by_col`` in
    descending order; ties are broken by order of appearance.
    If no group has more than ``k`` rows, ``pred`` itself is returned;
    otherwise a filtered copy is returned and ``pred`` is left unchanged.
    """
    largest_group = pred.groupby(group_by_col)[group_by_col].count().max()
    if largest_group <= k:
        return pred
    ranks = pred.groupby(group_by_col)[order_by_col].rank(method="first", ascending=False)
    # assign() works on a copy, so the temporary 'rank' column never touches `pred`.
    ranked = pred.assign(rank=ranks)
    return ranked[ranked['rank'] <= k].drop(columns=['rank'])

In [None]:
def measure(pred, true, name, df=None, cov_items=train_log['item_id']):
    """
    Compute hit_rate@K and coverage@K for one model and store them in a results table.

    pred: dataframe of recommendations with 'user_id', 'item_id' and 'rating' columns.
    true: ground truth interactions with 'user_id' and 'item_id' columns.
    name: row label under which the metric values are stored.
    df: existing results table, updated in place; a new one is created when None.
    cov_items: series of item ids used as the catalogue for coverage
        (defaults to the train items); pass None to skip coverage.
    Return the results table.
    """
    if df is None:
        df = pd.DataFrame(columns=['hit_rate@K', 'coverage@K'])
    hit_rate, _ = metric_wrap(pred=pred, ground_truth=true, k=K, metric_by_user=user_hr)
    df.loc[name, 'hit_rate@K'] = hit_rate

    if cov_items is not None:
        # Forward the caller's catalogue so that a custom cov_items is actually used.
        df.loc[name, 'coverage@K'] = coverage(pred=pred, k=K, all_items=cov_items)
    return df

## 3. Baseline (most popular)

In [None]:
# The K most frequently rated items of the train log, most popular first
# (value_counts sorts by frequency in descending order).
popular_items = train_log['item_id'].value_counts().head(K).index

In [None]:
users = []
items = []
ratings = []

# Every test user gets the same list of popular items with a constant score.
# The repetition count follows len(popular_items) so the three lists always
# stay aligned, whatever the size of the popular list is.
n_popular = len(popular_items)
for user in test_users:
    users.extend([user] * n_popular)
    items.extend(popular_items)
    ratings.extend([1] * n_popular)

In [None]:
# Long-format recommendations: one row per (user, recommended item, score).
popular_preds = pd.DataFrame({'user_id': users, 'item_id': items, 'rating': ratings})

In [None]:
# Evaluate the popularity baseline; `measure` creates the results table here
# because no existing table is passed.
metrics = measure(popular_preds, test_log, 'PopRec')
metrics.sort_values('hit_rate@K', ascending=False)

## 4. EASE

$$r_{ui} = R_{u,\cdot}\cdot W_{\cdot, i}$$

$$P = \left(R^TR + \lambda E\right)^{-1}$$

\begin{equation*}
W_{ij} =
    \begin{cases}
      0, \text{if } i = j\\
      -\frac{P_{ij}}{P_{jj}}, \text{otherwise}\\
    \end{cases}\,
\end{equation*}

#### Get weight matrix

In [None]:
def compute_weight_matrix(rating_matrix, lambd=1000):
    """
    Compute the EASE item-to-item weight matrix in closed form.

        P = (R^T R + lambd * E)^{-1}
        W_ij = -P_ij / P_jj  for i != j,   W_ii = 0

    rating_matrix: user-item matrix R of shape (n_users, n_items); a scipy
        sparse matrix or a dense numpy array.
    lambd: L2 regularisation strength added to the diagonal of R^T R. A
        positive value keeps the matrix invertible even when some items
        have no ratings at all.
    Return a dense ``numpy.ndarray`` of shape (n_items, n_items) with a zero diagonal.
    Raises ``numpy.linalg.LinAlgError`` if the regularised Gram matrix is
    singular (possible when ``lambd`` is 0).
    """
    gram = rating_matrix.T.dot(rating_matrix)
    if sp.issparse(gram):
        gram = gram.toarray()
    # Fresh float64 array: the inversion runs in double precision and the
    # regularisation below never modifies the caller's data.
    gram = np.array(gram, dtype=np.float64)
    diag = np.diag_indices_from(gram)
    gram[diag] += lambd

    precision = np.linalg.inv(gram)
    # Broadcasting divides column j by -P_jj, i.e. W_ij = -P_ij / P_jj.
    weights = precision / (-np.diag(precision))
    weights[diag] = 0.0
    return weights

In [None]:
# Smoke test on a small random binary matrix. The upper bound `high` is
# exclusive, so it has to be 2 to draw both 0 and 1 (with high=1 the matrix
# would be all zeros). The fixed seed keeps the check reproducible.
test_data = csr_matrix(np.random.default_rng(42).integers(low=0, high=2, size=(100, 200)))
test_weight = compute_weight_matrix(test_data)
assert test_weight.shape == (test_data.shape[1], test_data.shape[1])
assert np.allclose(np.diagonal(test_weight), np.zeros(test_data.shape[1]))

In [None]:
%%time
user_num = train_log["user_id"].max() + 1
item_num = train_log["item_id"].max() + 1

rating_matrix = dok_matrix((user_num, item_num), dtype=np.float32)
for _, user, item, rating in train_log[["user_id", "item_id", "rating"]].itertuples():
    rating_matrix[user, item] = rating

In [None]:
%%time
# Item-to-item weights computed from the train rating matrix (default lambd).
weight_matrix = compute_weight_matrix(rating_matrix)
print(weight_matrix.shape)
weight_matrix

In [None]:
# The weight matrix must be square with one row/column per item index
# (max item id + 1, because ids are used directly as indices).
assert weight_matrix.shape == (train_log["item_id"].max() + 1, train_log["item_id"].max() + 1)

#### Score items

In [None]:
%%time
# Score of every item for every user: r_ui = R[u, :] . W[:, i]
scores = rating_matrix.dot(weight_matrix)
scores

Filter seen items

In [None]:
# Push already rated items to the bottom of the ranking by subtracting a
# large penalty (rating * 1e6) from their scores.
# NOTE(review): `scores` is reassigned from itself, so re-running this cell
# applies the penalty again.
scores = scores - rating_matrix * 1e6
scores

#### Prediction

In [None]:
# Keep only the score rows of the evaluated users (row index == user_id).
test_scores = scores[test_users]
test_scores

In [None]:
# Column indices (== item ids) of the K highest scores per user, best first.
# Negating the scores turns the ascending argsort into a descending one.
top_k_preds = np.argsort(-test_scores)[:, :K].tolist()
top_k_preds[-2:]

In [None]:
# The K highest scores per user in descending order (sort the negated scores
# ascending, cut to K columns, negate back); aligned with top_k_preds.
top_k_scores = (-np.sort(-test_scores)[:, :K]).tolist()
top_k_scores[-2:]

In [None]:
users = []
items = []
ratings = []

# test_users, top_k_preds and top_k_scores are aligned row by row. The user id
# is repeated once per recommended item, so the three lists stay aligned
# whatever the length of the top list is.
for user, user_items, user_scores in zip(test_users, top_k_preds, top_k_scores):
    users.extend([user] * len(user_items))
    items.extend(user_items)
    ratings.extend(user_scores)

In [None]:
# Long-format EASE recommendations: one row per (user, recommended item, score).
ease_preds = pd.DataFrame({'user_id': users, 'item_id': items, 'rating': ratings})
ease_preds

## Results

In [None]:
# Add the EASE row to the existing results table (it already holds the
# PopRec baseline) and show the models ordered by hit rate.
metrics = measure(ease_preds, test_log, 'EASE', metrics)
metrics.sort_values('hit_rate@K', ascending=False)