In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import math
from itertools import product

import numpy as np
import pandas as pd
import tqdm
from IPython.display import clear_output

from jaccard_recommender.recommender import JaccardRecommender
from utils.evaluator import Evaluator

In [None]:
# Locations handed to every JaccardRecommender built in this notebook.
data_path = "../data/copperunion"
output_path = "../models/jaccard_knn"

# Grid definition: (option name, candidate values). The order of this list
# fixes the order of the values inside each key of `results`.
grid_search_parameters = [
    ("normalize_unrated", [True, False]),
    ("thresholded_watch_history", [10, 20, 100]),
    ("true_weighted_average", [True, False]),
    ("sim_matrix_sum_normalization", [True, False]),
]
parameter_names = [name for name, _ in grid_search_parameters]

# Size of the full Cartesian product, so the progress bar knows its length
# (the product iterator itself has no len()).
total = math.prod([len(choices) for _, choices in grid_search_parameters])
iterator = product(*[choices for _, choices in grid_search_parameters])

# Maps each tuple of option values -> (runtime, ndcg) returned by inference().
results = {}
for vals in tqdm.tqdm(iterator, total=total):
    options = dict(zip(parameter_names, vals))
    # save=False is passed for every grid point.
    recommender = JaccardRecommender(
        data_path,
        output_path,
        save=False,
        **options,
    )
    recommender.build_matrix()
    runtime, ndcg = recommender.inference()
    results[vals] = (runtime, ndcg)

# Drop the per-run log output once the whole grid has finished.
clear_output()

In [16]:
# One row per grid point: the option values followed by the two metrics
# stored for that point in `results`.
data = [[*option_values, *metrics] for option_values, metrics in results.items()]
metric_columns = ["runtime", "ndcg @ 10"]
df = pd.DataFrame(data, columns=parameter_names + metric_columns)
display(df)

Unnamed: 0,normalize_unrated,thresholded_watch_history,true_weighted_average,runtime,ndcg @ 10
0,True,10,True,37.234467,0.001336
1,True,10,False,24.208,0.232263
2,True,20,True,34.178117,0.001621
3,True,20,False,24.858905,0.228416
4,True,100,True,18.773455,0.002093
5,True,100,False,11.286252,0.184946
6,False,10,True,45.141935,0.001396
7,False,10,False,29.634101,0.204085
8,False,20,True,33.14971,0.001523
9,False,20,False,24.029245,0.203184


In [49]:
# First configuration: true_weighted_average off, sim_matrix_normalization="row".
options_1 = dict(
    normalize_unrated=True,
    thresholded_watch_history=20,
    true_weighted_average=False,
    sim_matrix_normalization="row",
)
recommender_1 = JaccardRecommender(data_path, output_path, **options_1)
recommender_1.build_matrix()
runtime, ndcg = recommender_1.inference()

# Second configuration: true_weighted_average on, sim_matrix_normalization="none".
options_2 = dict(
    normalize_unrated=True,
    thresholded_watch_history=20,
    true_weighted_average=True,
    sim_matrix_normalization="none",
)
recommender_2 = JaccardRecommender(data_path, output_path, **options_2)
recommender_2.build_matrix()
# NOTE: this rebinds `runtime` and `ndcg`, so only the second run's values
# remain in these two names afterwards.
runtime, ndcg = recommender_2.inference()

normalize_unrated=True thresholded_watch_history=20 true_weighted_average=False sim_matrix_normalization='row'


parsing animes...: 100%|██████████| 12294/12294 [00:00<00:00, 13798.02it/s]
parsing users...: 100%|██████████| 73515/73515 [00:20<00:00, 3662.65it/s]


Total Animes: 12294
Total Users: 54077


building interaction matrix...: 100%|██████████| 43261/43261 [00:04<00:00, 9859.89it/s] 


Percentage Zeroes: 0.41
This model took 10.6403 seconds.
Out of an optimal score of 1.0, you scored 0.2284.
normalize_unrated=True thresholded_watch_history=20 true_weighted_average=True sim_matrix_normalization='none'


parsing animes...: 100%|██████████| 12294/12294 [00:00<00:00, 13890.40it/s]
parsing users...: 100%|██████████| 73515/73515 [00:19<00:00, 3708.31it/s]


Total Animes: 12294
Total Users: 54077


building interaction matrix...: 100%|██████████| 43261/43261 [00:04<00:00, 10136.58it/s]


Percentage Zeroes: 0.41
This model took 13.5112 seconds.
Out of an optimal score of 1.0, you scored 0.0016.


In [46]:
from typing import List
import pandas as pd
from collections import Counter

# Column labels for each section of the dict returned by `summary`; the keys
# here match that dict's keys and the lists label the cells of each row.
COLUMNS = dict(
    statistics=["name", "value"],
    ranked_results=["cai", "name", "freq"],
)


def summary(k_recommended_shows: np.ndarray, evaluator: Evaluator, top_n: int = 20):
    """Summarize how often each show appears in a matrix of recommendations.

    Args:
        k_recommended_shows (np.ndarray of int): An (n_test, k) matrix of
            recommended show ids. It is flattened before counting, so only the
            multiset of ids matters.
        evaluator (Evaluator): The evaluator object. Only ``test_ids`` (its
            length is reported as ``n_users``) and ``canonical_anime_mapping``
            (indexed by show id, each entry exposing ``.name``) are read.
        top_n (int): Maximum number of most frequent shows listed in
            ``ranked_results``; expected to be non-negative. Defaults to 20.

    Returns:
        dict: Two lists of rows.
            ``"statistics"``: ``[name, value]`` rows describing the
            distribution of per-show recommendation counts. When there are no
            recommendations at all, the mean/std/quantile values are NaN.
            ``"ranked_results"``: ``[cai, name, freq]`` rows for the ``top_n``
            most recommended shows, most frequent first.
    """
    all_predictions = np.asarray(k_recommended_shows).ravel().tolist()
    pred_ct = Counter(all_predictions)
    # most_common() orders by descending count and keeps first-seen order
    # among ties, i.e. the same order as a stable sort on the count.
    pred_ct_sorted = pred_ct.most_common()
    frequencies = np.array([freq for _, freq in pred_ct_sorted])

    if frequencies.size:
        mean_freq = np.mean(frequencies)
        std_freq = np.std(frequencies)
        quantiles = np.quantile(frequencies, [0, 0.25, 0.5, 0.75, 1])
    else:
        # Nothing was recommended: report NaN rather than letting numpy
        # warn or raise on an empty array.
        mean_freq = std_freq = float("nan")
        quantiles = [float("nan")] * 5

    statistics = [
        ["n_distinct", len(pred_ct)],
        ["n_recs", len(all_predictions)],
        ["n_users", len(evaluator.test_ids)],
        ["mean_freq", mean_freq],
        ["std_freq", std_freq],
        ["min_freq", quantiles[0]],
        ["25%_freq", quantiles[1]],
        ["median", quantiles[2]],
        ["75%_freq", quantiles[3]],
        ["max_freq", quantiles[4]],
    ]
    ranked_results = [
        [cai, evaluator.canonical_anime_mapping[cai].name, freq]
        for cai, freq in pred_ct_sorted[:top_n]
    ]
    return {
        "statistics": statistics,
        "ranked_results": ranked_results,
    }


def comparison(
    prediction_matrices: List[np.ndarray], titles: List[str], evaluator: Evaluator
):
    """Display the summaries of several prediction matrices side by side.

    Each matrix is summarized with ``summary``; then, for every section in
    ``COLUMNS``, one table is displayed whose columns are grouped by title
    (a two-level header of ``(title, column name)``).

    Args:
        prediction_matrices (List[np.ndarray]): Prediction matrices, each
            passed to ``summary`` unchanged.
        titles (List[str]): One label per prediction matrix, in the same order.
        evaluator (Evaluator): The evaluator object forwarded to ``summary``.

    Raises:
        ValueError: If ``titles`` and ``prediction_matrices`` differ in length.
    """
    if len(prediction_matrices) != len(titles):
        raise ValueError(
            f"Expected one title per prediction matrix, got {len(titles)} titles "
            f"for {len(prediction_matrices)} matrices."
        )

    results = [
        summary(prediction_matrix, evaluator)
        for prediction_matrix in prediction_matrices
    ]
    for k in COLUMNS:
        print(f"=== {k} ===")
        # Row count comes from the tables themselves. (Using the length of the
        # section name here silently truncated or overran the ranked table.)
        num_rows = max((len(res[k]) for res in results), default=0)
        # Tables may differ in length (e.g. fewer distinct shows than the
        # ranking cutoff); shorter ones are padded with blank cells.
        blank_row = [None] * len(COLUMNS[k])
        data = [
            [
                cell
                for res in results
                for cell in (res[k][j] if j < len(res[k]) else blank_row)
            ]
            for j in range(num_rows)
        ]
        columns = pd.MultiIndex.from_tuples(
            [(title, col) for title in titles for col in COLUMNS[k]]
        )
        df = pd.DataFrame(data, columns=columns).style.set_properties(
            **{"text-align": "left"}
        )
        display(df)


def analysis(recommenders: List[JaccardRecommender], titles: List[str]):
    """Compare each recommender's predictions with the ground truth.

    The ground-truth matrix and the evaluator are both taken from the first
    recommender; a "Ground Truth" entry is placed ahead of the supplied titles.

    Args:
        recommenders (List[JaccardRecommender]): Recommenders whose
            ``k_recommended_shows`` are compared. Must be non-empty.
            NOTE(review): presumably all share an equivalent evaluator, since
            only the first one's is used - confirm.
        titles (List[str]): One label per recommender, in the same order.
    """
    reference_evaluator = recommenders[0].evaluator
    ground_truth = reference_evaluator.get_ground_truth_k_predictions()

    model_matrices = [rec.k_recommended_shows for rec in recommenders]
    comparison(
        [ground_truth] + model_matrices,
        ["Ground Truth"] + titles,
        reference_evaluator,
    )

In [50]:
# Compare the ground truth with the two configurations built above; the
# titles label recommender_1 and recommender_2 respectively.
analysis([recommender_1, recommender_2], ["row-normalized", "true_weighted_average"])

=== statistics ===


Unnamed: 0_level_0,Ground Truth,Ground Truth,row-normalized,row-normalized,true_weighted_average,true_weighted_average
Unnamed: 0_level_1,name,value,name,value,name,value
0,n_distinct,3838.0,n_distinct,868.0,n_distinct,1135.0
1,n_recs,54090.0,n_recs,54090.0,n_recs,54090.0
2,n_users,5409.0,n_users,5409.0,n_users,5409.0
3,mean_freq,14.093278,mean_freq,62.315668,mean_freq,47.656388
4,std_freq,30.205405,std_freq,223.917065,std_freq,212.657119
5,min_freq,1.0,min_freq,1.0,min_freq,1.0
6,25\%_freq,1.0,25\%_freq,1.0,25\%_freq,1.0
7,median,4.0,median,4.0,median,1.0
8,75\%_freq,13.0,75\%_freq,19.0,75\%_freq,5.0
9,max_freq,479.0,max_freq,1808.0,max_freq,2559.0


=== ranked_results ===


Unnamed: 0_level_0,Ground Truth,Ground Truth,Ground Truth,row-normalized,row-normalized,row-normalized,true_weighted_average,true_weighted_average,true_weighted_average
Unnamed: 0_level_1,cai,name,freq,cai,name,freq,cai,name,freq
0,40,Death Note,479,40,Death Note,1808,10720,Warui no wo Taose!! Salaryman Man,2559
1,86,Shingeki no Kyojin,415,1,Fullmetal Alchemist: Brotherhood,1803,9886,Osomatsu-kun (1988): Appare! Chibita no Onitaiji zansu,2103
2,804,Sword Art Online,378,445,Mirai Nikki (TV),1788,9887,Osomatsu-kun: Iyami wa Hitori Kaze no Naka,2096
3,841,Naruto,309,159,Angel Beats!,1777,7198,Yubi wo Nusunda Onna,1990
4,19,Code Geass: Hangyaku no Lelouch,298,19,Code Geass: Hangyaku no Lelouch,1693,7647,Tekkon Kinkreet Pilot,1988
5,760,Elfen Lied,291,804,Sword Art Online,1540,5132,Bobobo-bo Bo-bobo Recap,1889
6,1,Fullmetal Alchemist: Brotherhood,288,86,Shingeki no Kyojin,1520,10951,Cencoroll 2,1587
7,159,Angel Beats!,285,760,Elfen Lied,1487,622,Drifters,1492
8,449,Tokyo Ghoul,275,643,Ao no Exorcist,1459,8384,City Hunter: Ryou no Propose,1485
9,15,Sen to Chihiro no Kamikakushi,255,841,Naruto,1438,6846,Tang Lang Bu Chan,1399
