# Jaccard-Similarity Based Recommender

For this recommender, we assume that for every item $i$ we have the set of users $U_i$ who interacted with it, and for every user $u$ the set of items $I_u$ they interacted with.

Similarity between two items is defined as $\textrm{sim}(i, j) \doteq \textrm{Jaccard}(U_i, U_j)$, where $\textrm{Jaccard}(A, B) = \frac{|A \cap B|}{|A \cup B|}$

Similarity between two users is defined as $\textrm{sim}(u, v) \doteq \textrm{Jaccard}(I_u, I_v)$

The predicted rating of user $u$ for an item $q$ is the similarity-weighted average of the user's known ratings: $\hat{r}(u,q) = \sum_{i \in I_u} r(u,i) \cdot \frac{\textrm{sim}(q, i)}{\sum_{j \in I_u} \textrm{sim}(q,j)}$


In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
import os
from collections import defaultdict

import numpy as np
import tqdm as tqdm
from scipy.sparse import csr_matrix, lil_matrix

from utils.evaluator import Evaluator

In [2]:
def build_matrix(evaluator: Evaluator) -> np.ndarray:
    """Build the dense item-item Jaccard similarity matrix from training users.

    Every item is represented by the set of training users that list it in
    ``user.preserved_canonical_ids``. The similarity of items ``i`` and ``j``
    is ``|U_i & U_j| / |U_i | U_j|``.

    Args:
        evaluator: Supplies ``anime_mapping`` (its length is the number of
            items), ``user_mapping`` (user id -> user object) and
            ``train_indices`` (the user ids to build the matrix from).

    Returns:
        An ``(n_anime, n_anime)`` array of Jaccard similarities. Item pairs
        whose union is empty get similarity 0.
    """
    n_anime = len(evaluator.anime_mapping)
    n_users = len(evaluator.user_mapping)
    # A user's column is its position in user_mapping's iteration order.
    user_id_to_canonical_id = {
        k: i for i, (k, _) in enumerate(evaluator.user_mapping.items())
    }
    # Binary item x user interaction matrix. LIL format is used while filling
    # because it supports incremental assignment.
    interaction_matrix = lil_matrix((n_anime, n_users), dtype=np.float32)
    for user_id in tqdm.tqdm(evaluator.train_indices):
        user = evaluator.user_mapping[user_id]
        canonical_user_id = user_id_to_canonical_id[user_id]
        interaction_matrix[user.preserved_canonical_ids, canonical_user_id] = 1
    interaction_matrix = interaction_matrix.tocsr()
    # Entry (i, j) of M @ M.T counts the users shared by items i and j.
    intersection = (interaction_matrix @ interaction_matrix.T).toarray()
    # Per-item user counts as an (n_anime, 1) array.
    sum_counts = np.array(interaction_matrix.sum(axis=1))
    # |A | B| = |A| + |B| - |A & B|, broadcast over all item pairs.
    union = (sum_counts.reshape(-1, 1) + sum_counts.reshape(1, -1)) - intersection
    # An empty union implies an empty intersection, so dividing by 1 gives 0
    # instead of 0/0.
    return intersection / np.where(union == 0, 1, union)

In [3]:
def recommend(
    evaluator: "Evaluator",
    sim_matrix: np.ndarray,
    items: list[int],
    ratings: list[float],
    k: int,
    true_weighted_average=False,
):
    """Return the indices of the ``k`` highest-scoring unseen items.

    Args:
        evaluator: Only ``len(evaluator.anime_mapping)`` is read; it gives the
            total number of items.
        sim_matrix: Item-item similarity matrix, indexed ``[candidate, item]``.
            It is never modified.
        items: Indices of the items the user already has ratings for.
        ratings: The user's ratings, aligned with ``items``.
        k: Number of recommendations to return.
        true_weighted_average: When False, a candidate's score is the plain
            product of ``sim_matrix`` with the user's rating vector. When
            True, each candidate's similarities to the user's items are first
            normalised to sum to 1, so the score is a similarity-weighted
            average of the user's own ratings.

    Returns:
        A 1-D array with the (at most) ``k`` best item indices, best first.
    """
    if true_weighted_average:
        # Keep only similarities towards the items the user rated. Indexing
        # with an index sequence copies, and the division below allocates a
        # new array, so the caller's matrix is left untouched.
        weights = sim_matrix[:, items]
        totals = weights.sum(axis=1, keepdims=True)
        # A candidate with no similarity to any rated item scores 0 instead
        # of 0/0 = NaN.
        weights = weights / np.where(totals == 0, 1, totals)
        scores = weights @ np.asarray(ratings, dtype=float)
    else:
        n_items = len(evaluator.anime_mapping)
        ratings_vector = np.zeros((n_items, 1))
        ratings_vector[items, 0] = ratings
        scores = (sim_matrix @ ratings_vector).flatten()
    # Push already-rated items to the bottom of the ranking.
    # NOTE(review): this assumes real scores never fall below -1 - confirm
    # for normalised ratings.
    scores[items] = -1
    order = np.argsort(-scores).flatten()
    return order[:k]


def inference(
    evaluator: Evaluator,
    model_path: str,
    experiment: str,
    load=False,
    true_weighted_average=False,
):
    """Build or load the item similarity matrix and score the test set.

    The matrix is stored as ``<model_path>/<experiment>/item_similarity_matrix.npy``.
    With ``load=False`` it is rebuilt via ``build_matrix`` and saved there;
    with ``load=True`` it is read back from that file. The saved file always
    holds the un-normalised matrix; rows are normalised to sum to 1 only in
    memory.

    Args:
        evaluator: Supplies the test histories through
            ``start_eval_test_set()`` and receives the recommendations through
            ``end_eval_test_set()``.
        model_path: Root directory for model artifacts.
        experiment: Sub-directory name for this run.
        load: Read a previously saved matrix instead of rebuilding it.
        true_weighted_average: Forwarded unchanged to ``recommend``.
    """
    experiment_dir = os.path.join(model_path, experiment)
    os.makedirs(experiment_dir, exist_ok=True)
    matrix_file = os.path.join(experiment_dir, "item_similarity_matrix.npy")

    if load:
        sim_matrix = np.load(matrix_file)
    else:
        sim_matrix = build_matrix(evaluator)
        np.save(matrix_file, sim_matrix)

    # Row-normalise in place; all-zero rows are divided by 1 and stay zero.
    totals = sim_matrix.sum(axis=1)
    totals = np.where(totals == 0, 1, totals)
    sim_matrix /= totals[:, None]

    # One row of k recommended item indices per evaluated history.
    histories, k = evaluator.start_eval_test_set()
    n_histories = len(histories)
    top_k = np.zeros((n_histories, k), dtype=np.int32)
    for row, history in tqdm.tqdm(enumerate(histories), total=n_histories):
        # Non-zero positions are the items this history has a rating for.
        seen = history.nonzero()[0]
        top_k[row] = recommend(
            evaluator, sim_matrix, seen, history[seen], k, true_weighted_average
        )
    evaluator.end_eval_test_set(top_k)

In [6]:
# Load the dataset with normalize_unrated=False and print the number of
# entries in evaluator.test_ids.
data_path = "../data/copperunion"
evaluator = Evaluator(data_path, normalize_unrated=False)
print(len(evaluator.test_ids))

normalize_unrated=False


parsing animes...: 100%|██████████| 12294/12294 [00:01<00:00, 12055.80it/s]
parsing users...: 100%|██████████| 73515/73515 [00:20<00:00, 3544.72it/s]


Total Animes: 12294
Total Users: 54077
5409


In [5]:
# Run with normalize_unrated=False and threshold_watch_history left at the
# Evaluator default. The similarity matrix is rebuilt (load=False) and saved
# under ../models/jaccard_knn/no_imputation.
data_path = "../data/copperunion"
model_path = "../models/jaccard_knn"
experiment = "no_imputation"

evaluator = Evaluator(data_path, normalize_unrated=False)
inference(evaluator, model_path, experiment, load=False)

normalize_unrated=False


parsing animes...: 100%|██████████| 12294/12294 [00:00<00:00, 22722.70it/s]
parsing users...: 100%|██████████| 73515/73515 [00:12<00:00, 5975.19it/s]


Total Animes: 12294
Total Users: 54077


100%|██████████| 43261/43261 [00:04<00:00, 9192.07it/s] 


Percentage Zeroes: 0.41


100%|██████████| 5409/5409 [29:32<00:00,  3.05it/s]


This model took 1772.2530 seconds.
Out of an optimal score of 1.0, you scored 0.2032.


In [4]:
# Run with normalize_unrated=True and threshold_watch_history left at the
# Evaluator default. The similarity matrix is rebuilt (load=False) and saved
# under ../models/jaccard_knn/imputation.
data_path = "../data/copperunion"
model_path = "../models/jaccard_knn"
experiment = "imputation"

evaluator = Evaluator(data_path, normalize_unrated=True)
inference(evaluator, model_path, experiment, load=False)

normalize_unrated=True


parsing animes...: 100%|██████████| 12294/12294 [00:00<00:00, 20884.42it/s]
parsing users...: 100%|██████████| 73515/73515 [00:18<00:00, 3978.47it/s]


Total Animes: 12294
Total Users: 54077


100%|██████████| 43261/43261 [00:04<00:00, 10038.65it/s]


Percentage Zeroes: 0.41


100%|██████████| 5409/5409 [28:15<00:00,  3.19it/s]


This model took 1695.9278 seconds.
Out of an optimal score of 1.0, you scored 0.2284.


In [6]:
# Run with normalize_unrated=False and threshold_watch_history=10
# (presumably a minimum watch-history length per user - confirm in Evaluator).
# The similarity matrix is rebuilt (load=False) and saved under
# ../models/jaccard_knn_10/no_imputation.
data_path = "../data/copperunion"
model_path = "../models/jaccard_knn_10"
experiment = "no_imputation"

evaluator = Evaluator(data_path, normalize_unrated=False, threshold_watch_history=10)
inference(evaluator, model_path, experiment, load=False)

normalize_unrated=False


parsing animes...: 100%|██████████| 12294/12294 [00:00<00:00, 19064.81it/s]
parsing users...: 100%|██████████| 73515/73515 [00:14<00:00, 5002.43it/s]


Total Animes: 12294
Total Users: 61674


100%|██████████| 61674/61674 [00:04<00:00, 14016.84it/s]


Percentage Zeroes: 0.41


100%|██████████| 6168/6168 [15:30<00:00,  6.63it/s]


This model took 930.9927 seconds.
Out of an optimal score of 1.0, you scored 0.2052.


In [7]:
# Run with normalize_unrated=True and threshold_watch_history=10. The
# similarity matrix is rebuilt (load=False) and saved under
# ../models/jaccard_knn_10/imputation. The experiment is labelled
# "imputation", like the other normalize_unrated=True runs, so its artifacts
# do not overwrite those of the normalize_unrated=False run that shares this
# model directory.
data_path = "../data/copperunion"
model_path = "../models/jaccard_knn_10"
experiment = "imputation"

evaluator = Evaluator(data_path, normalize_unrated=True, threshold_watch_history=10)
inference(evaluator, model_path, experiment, load=False)

normalize_unrated=True


parsing animes...: 100%|██████████| 12294/12294 [00:00<00:00, 21727.35it/s]
parsing users...: 100%|██████████| 73515/73515 [00:14<00:00, 4947.96it/s]


Total Animes: 12294
Total Users: 61674


100%|██████████| 61674/61674 [00:04<00:00, 14446.55it/s]


Percentage Zeroes: 0.41


100%|██████████| 6168/6168 [15:24<00:00,  6.67it/s]


This model took 924.9411 seconds.
Out of an optimal score of 1.0, you scored 0.2335.


In [5]:
# Run with normalize_unrated=True and threshold_watch_history=100
# (presumably a minimum watch-history length per user - confirm in Evaluator).
# The similarity matrix is rebuilt (load=False) and saved under
# ../models/jaccard_knn_100/imputation.
data_path = "../data/copperunion"
model_path = "../models/jaccard_knn_100"
experiment = "imputation"

evaluator = Evaluator(data_path, normalize_unrated=True, threshold_watch_history=100)
inference(evaluator, model_path, experiment, load=False)

normalize_unrated=True


parsing animes...: 100%|██████████| 12294/12294 [00:00<00:00, 13801.28it/s]
parsing users...: 100%|██████████| 73515/73515 [00:23<00:00, 3088.39it/s]


Total Animes: 12294
Total Users: 24697


100%|██████████| 24697/24697 [00:05<00:00, 4870.91it/s]


Percentage Zeroes: 0.41


100%|██████████| 2471/2471 [20:11<00:00,  2.04it/s]


This model took 1211.6159 seconds.
Out of an optimal score of 1.0, you scored 0.1859.


In [6]:
# Run with normalize_unrated=False and threshold_watch_history=100
# (presumably a minimum watch-history length per user - confirm in Evaluator).
# The similarity matrix is rebuilt (load=False) and saved under
# ../models/jaccard_knn_100/no_imputation.
data_path = "../data/copperunion"
model_path = "../models/jaccard_knn_100"
experiment = "no_imputation"

evaluator = Evaluator(data_path, normalize_unrated=False, threshold_watch_history=100)
inference(evaluator, model_path, experiment, load=False)

normalize_unrated=False


parsing animes...: 100%|██████████| 12294/12294 [00:00<00:00, 12858.93it/s]
parsing users...: 100%|██████████| 73515/73515 [00:23<00:00, 3189.46it/s]


Total Animes: 12294
Total Users: 24697


100%|██████████| 24697/24697 [00:04<00:00, 5698.77it/s]


Percentage Zeroes: 0.41


100%|██████████| 2471/2471 [20:05<00:00,  2.05it/s]


This model took 1205.9595 seconds.
Out of an optimal score of 1.0, you scored 0.1625.
