In [1]:
%load_ext autoreload
%autoreload 2

In [10]:
import math
from itertools import product

import os
import numpy as np
import pandas as pd
import tqdm
from IPython.display import clear_output

from jaccard_recommender.recommender import JaccardRecommender
from utils.evaluator import Evaluator

# Input data and model output locations, written relative to the working directory.
data_path = "../data/copperunion"
output_path = "../models/jaccard_knn"

# Echo the working directory so the relative paths above can be checked at a glance.
print(os.getcwd())

c:\Users\James\Desktop\projects\cmu\CMU_10718\src


# Perform Grid Search

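Grid search over the similarity threshold, the normalizing-constant clip, and the two regularization strengths (`normalizing_constant_regularization` and `score_regularization`); configurations are ranked by NDCG@10.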


In [47]:
# A single Evaluator is built once and handed to every configuration through the
# "evaluator" entry of the grid below.
evaluator = Evaluator(
    data_path,
    normalize_unrated=True,
    threshold_watch_history=10,
)

# Maps each tuple of parameter values to the (runtime, ndcg) pair it produced.
results = {}

# Each entry is (keyword argument name, candidate values). Entries with a single
# candidate are held fixed; the remaining ones are what the search sweeps.
grid_search_parameters = [
    ("true_weighted_average", [True]),
    ("sim_matrix_normalization", ["none"]),
    ("sim_threshold", [0.01, 0.1]),
    ("normalizing_constant_clip", [0.1, 0.2]),
    ("normalizing_constant_regularization", [0, 1, 5]),
    ("score_regularization", [0, 1, 5]),
    ("load", [True]),
    ("evaluator", [evaluator]),
]

parameter_names = [entry[0] for entry in grid_search_parameters]
# Size of the Cartesian product, so the progress bar knows its total.
total = math.prod(len(entry[1]) for entry in grid_search_parameters)
iterator = product(*(entry[1] for entry in grid_search_parameters))
for vals in tqdm.tqdm(iterator, total=total):
    # Pair every value with its keyword name for this configuration.
    options = dict(zip(parameter_names, vals))
    recommender = JaccardRecommender(data_path, output_path, save=False, **options)
    recommender.build_matrix()
    runtime, ndcg = recommender.inference()
    results[vals] = (runtime, ndcg)
# Wipe the accumulated per-run log output once the whole search has finished.
clear_output()

In [51]:
# One row per configuration: the parameter values followed by (runtime, ndcg).
data = [[*params, *metrics] for params, metrics in results.items()]

# Only the swept parameters and the score are kept for display and export.
choice_columns = [
    "sim_threshold",
    "normalizing_constant_clip",
    "normalizing_constant_regularization",
    "score_regularization",
    "ndcg @ 10",
]

df = (
    pd.DataFrame(data, columns=parameter_names + ["runtime", "ndcg @ 10"])[choice_columns]
    .sort_values("ndcg @ 10", ascending=False)  # best configuration first
)
display(df)
df.to_csv(f"{output_path}/grid_search.csv", index=False)

Unnamed: 0,sim_threshold,normalizing_constant_clip,normalizing_constant_regularization,score_regularization,ndcg @ 10
17,0.01,0.2,5,5,0.207959
8,0.01,0.1,5,5,0.207959
7,0.01,0.1,5,1,0.207705
16,0.01,0.2,5,1,0.207705
6,0.01,0.1,5,0,0.207574
15,0.01,0.2,5,0,0.207574
34,0.1,0.2,5,1,0.200292
25,0.1,0.1,5,1,0.200292
26,0.1,0.1,5,5,0.200282
35,0.1,0.2,5,5,0.200282


In [41]:
# Configuration 1: row-normalized similarity matrix, plain (non-"true") weighted average.
options_1 = dict(
    normalize_unrated=True,
    thresholded_watch_history=20,
    true_weighted_average=False,
    sim_matrix_normalization="row",
)
recommender_1 = JaccardRecommender(data_path, output_path, **options_1)
recommender_1.build_matrix()
runtime, ndcg = recommender_1.inference()

# Configuration 2: same data settings, but with the true weighted average and no
# similarity-matrix normalization.
options_2 = {
    **options_1,
    "true_weighted_average": True,
    "sim_matrix_normalization": "none",
}
recommender_2 = JaccardRecommender(data_path, output_path, **options_2)
recommender_2.build_matrix()
# NOTE: this overwrites the runtime/ndcg values from configuration 1.
runtime, ndcg = recommender_2.inference()

normalize_unrated=True thresholded_watch_history=20 true_weighted_average=False sim_matrix_normalization='row'


parsing animes...: 100%|██████████| 12294/12294 [00:00<00:00, 16073.86it/s]
parsing users...: 100%|██████████| 73515/73515 [00:19<00:00, 3695.90it/s]


Total Animes: 12294
Total Users: 54077


building interaction matrix...: 100%|██████████| 43261/43261 [00:06<00:00, 7005.40it/s]


Percentage Zeroes: 0.41
This model took 19.0950 seconds.
Out of an optimal score of 1.0, you scored 0.2284.
normalize_unrated=True thresholded_watch_history=20 true_weighted_average=True sim_matrix_normalization='none'


parsing animes...: 100%|██████████| 12294/12294 [00:00<00:00, 14584.64it/s]
parsing users...: 100%|██████████| 73515/73515 [00:19<00:00, 3720.96it/s]


Total Animes: 12294
Total Users: 54077


building interaction matrix...: 100%|██████████| 43261/43261 [00:04<00:00, 10645.97it/s]


Percentage Zeroes: 0.41
This model took 30.0735 seconds.
Out of an optimal score of 1.0, you scored 0.0706.


In [12]:
from typing import List
import pandas as pd
from collections import Counter

# Column headers for each table produced by `summary`, keyed by the table's name.
COLUMNS = dict(
    statistics=["name", "value"],
    ranked_results=["cai", "name", "freq"],
)


def summary(k_recommended_shows: np.ndarray[int], evaluator: Evaluator):
    """Summarize how often each show appears in a matrix of recommendations.

    Args:
        k_recommended_shows (np.ndarray[int]): An (n_test, k) matrix of recommended shows.
        evaluator (Evaluator): Supplies ``test_ids`` (for the user count) and
            ``canonical_anime_mapping`` (to look up each show's ``name``).

    Returns:
        dict: ``"statistics"`` holds ``[name, value]`` rows describing the
        distribution of recommendation frequencies; ``"ranked_results"`` holds
        ``[cai, name, freq]`` rows for at most the 20 most recommended shows,
        most frequent first.
    """
    flat_recommendations = k_recommended_shows.flatten().tolist()
    # (show, count) pairs, most frequent first; ties keep first-seen order.
    by_frequency = Counter(flat_recommendations).most_common()
    counts = np.array([count for _, count in by_frequency])
    q_min, q_25, q_50, q_75, q_max = np.quantile(counts, [0, 0.25, 0.5, 0.75, 1])

    statistics = [
        ["n_distinct", len(by_frequency)],
        ["n_recs", len(flat_recommendations)],
        ["n_users", len(evaluator.test_ids)],
        ["mean_freq", np.mean(counts)],
        ["std_freq", np.std(counts)],
        ["min_freq", q_min],
        ["25%_freq", q_25],
        ["median", q_50],
        ["75%_freq", q_75],
        ["max_freq", q_max],
    ]

    ranked_results = []
    for show_id, count in by_frequency[:20]:
        show_name = evaluator.canonical_anime_mapping[show_id].name
        ranked_results.append([show_id, show_name, count])

    return {"statistics": statistics, "ranked_results": ranked_results}


def comparison(
    prediction_matrices: List[np.ndarray[int]], titles: List[str], evaluator: Evaluator
):
    """Display the summaries of several prediction matrices side by side.

    Every matrix is summarized with ``summary``. For each table named in
    ``COLUMNS`` the per-matrix tables are then joined column-wise under a
    two-level header (title, column name) and shown with ``display``.

    Args:
        prediction_matrices (List[np.ndarray[int]]): One matrix of recommended
            shows per model being compared.
        titles (List[str]): Top-level header for each matrix, in the same order
            as ``prediction_matrices``.
        evaluator (Evaluator): The evaluator object, passed through to ``summary``.
    """
    summaries = [
        summary(prediction_matrix, evaluator)
        for prediction_matrix in prediction_matrices
    ]
    for section, column_names in COLUMNS.items():
        print(f"=== {section} ===")
        tables = [res[section] for res in summaries]
        # Take the row count from the data itself. The number of characters in
        # the section name is unrelated to it: using that cut "ranked_results"
        # off at 14 rows and raised IndexError for shorter tables.
        num_rows = max((len(table) for table in tables), default=0)
        # Tables may differ in length (a model can recommend fewer distinct
        # shows than another), so shorter ones are padded with empty cells.
        blank_row = [None] * len(column_names)
        data = [
            [
                cell
                for table in tables
                for cell in (table[j] if j < len(table) else blank_row)
            ]
            for j in range(num_rows)
        ]
        columns = pd.MultiIndex.from_tuples(
            [(title, col) for title in titles for col in column_names]
        )
        df = pd.DataFrame(data, columns=columns).style.set_properties(
            **{"text-align": "left"}
        )
        display(df)


def analysis(recommenders: List[JaccardRecommender], titles: List[str]):
    """Compare each recommender's recommendations against the ground truth.

    The ground-truth predictions and the evaluator both come from the first
    recommender; they are shown under the title "Ground Truth", followed by one
    entry per recommender.

    Args:
        recommenders (List[JaccardRecommender]): Recommenders exposing
            ``k_recommended_shows`` and ``evaluator``. Must be non-empty.
        titles (List[str]): One title per recommender, in the same order.
    """
    reference_evaluator = recommenders[0].evaluator
    matrices = [reference_evaluator.get_ground_truth_k_predictions()]
    matrices.extend(rec.k_recommended_shows for rec in recommenders)
    comparison(matrices, ["Ground Truth"] + titles, reference_evaluator)

In [39]:
def analyze_scores(score_vector, evaluator, recommender: JaccardRecommender):
    """Print diagnostics for a single vector of scores.

    Prints, in order: mean / std / min / max of the non-infinite scores; the
    indices and scores of the ten highest-scoring entries; the shape of
    ``recommender.rating_tensor``; and, for each of those ten entries, its name,
    its rating and its similarities to every entry with a nonzero rating.

    Ratings are always read from row 0 of ``recommender.rating_tensor``, so
    ``score_vector`` is presumably expected to be the scores belonging to that
    same row (TODO confirm against the recommender).

    Args:
        score_vector: 1-D array of scores, indexed the same way as
            ``evaluator.canonical_anime_mapping``.
        evaluator: Provides ``canonical_anime_mapping`` for name lookup.
        recommender (JaccardRecommender): Provides ``rating_tensor`` and ``sim_matrix``.
    """
    # Infinite entries are left out of the summary statistics only.
    non_inf_scores = score_vector[~np.isinf(score_vector)]
    print(
        np.mean(non_inf_scores),
        np.std(non_inf_scores),
        np.min(non_inf_scores),
        np.max(non_inf_scores),
    )

    # Negate so that argsort's ascending order puts the highest score first.
    ranking = np.argsort(-score_vector)
    top_ten = ranking[:10]
    print(top_ten, score_vector[top_ten])

    ratings = recommender.rating_tensor[0, :]
    print(recommender.rating_tensor.shape)
    rated_indices = np.nonzero(ratings)[0]
    for anime_index in top_ten:
        anime_name = evaluator.canonical_anime_mapping[anime_index].name
        print(anime_index, anime_name, ratings[anime_index])
        print(recommender.sim_matrix[anime_index, rated_indices])
# Column 0 of each score matrix, matching the row 0 that analyze_scores reads
# from the rating tensor.
sc1, sc2 = recommender_1.scores[:, 0], recommender_2.scores[:, 0]

analyze_scores(sc1, recommender_1.evaluator, recommender_1)
print()  # blank line between the two reports
analyze_scores(sc2, recommender_2.evaluator, recommender_2)

0.0432621590036011 0.06758185333243452 0.0 0.4107417054264078
[760 206 200  19 322 687  35 841 164  13] [0.41074171 0.40307518 0.39324779 0.39224513 0.39014014 0.38625095
 0.3849709  0.38284804 0.37789757 0.37721829]
(5409, 12294)
760 Elfen Lied 0.0
[1.77870877e-03 7.72860833e-04 2.46618502e-03 1.57954637e-03
 7.21784541e-04 1.39774720e-03 9.12516669e-04 1.37974310e-03
 2.14557280e-03 4.97060653e-04 1.99051248e-03 3.42354935e-04
 2.14310348e-04 2.00204362e-04 1.74970203e-03 1.33021781e-03
 1.53759506e-03 1.97744273e-04 3.45604174e-04 1.01448235e-03
 7.95465457e-05 9.29792586e-05 3.04833142e-04 3.88542394e-04
 1.96461729e-03 1.27559633e-03 1.14130715e-04 3.00697022e-04
 2.07736171e-04 1.85250552e-04 4.26459563e-04 3.38331563e-04
 6.89421722e-04 1.37952832e-03 4.92706953e-04 3.17550061e-04
 1.27695850e-04 1.51739165e-04 4.12439986e-04 3.81751306e-05
 1.25761042e-04 2.57350039e-04 6.70108304e-04 1.47021422e-03
 1.87602220e-03 2.84564769e-04 2.20811157e-03 9.01908090e-04
 7.33765308e-04 6.

In [13]:
analysis([recommender_1, recommender_2], ["row-normalized", "true_weighted_average"])

=== statistics ===


Unnamed: 0_level_0,Ground Truth,Ground Truth,row-normalized,row-normalized,true_weighted_average,true_weighted_average
Unnamed: 0_level_1,name,value,name,value,name,value
0,n_distinct,3838.0,n_distinct,868.0,n_distinct,1134.0
1,n_recs,54090.0,n_recs,54090.0,n_recs,54090.0
2,n_users,5409.0,n_users,5409.0,n_users,5409.0
3,mean_freq,14.093278,mean_freq,62.315668,mean_freq,47.698413
4,std_freq,30.205405,std_freq,223.917065,std_freq,212.750657
5,min_freq,1.0,min_freq,1.0,min_freq,1.0
6,25%_freq,1.0,25%_freq,1.0,25%_freq,1.0
7,median,4.0,median,4.0,median,1.0
8,75%_freq,13.0,75%_freq,19.0,75%_freq,5.0
9,max_freq,479.0,max_freq,1808.0,max_freq,2559.0


=== ranked_results ===


Unnamed: 0_level_0,Ground Truth,Ground Truth,Ground Truth,row-normalized,row-normalized,row-normalized,true_weighted_average,true_weighted_average,true_weighted_average
Unnamed: 0_level_1,cai,name,freq,cai,name,freq,cai,name,freq
0,40,Death Note,479,40,Death Note,1808,10720,Warui no wo Taose!! Salaryman Man,2559
1,86,Shingeki no Kyojin,415,1,Fullmetal Alchemist: Brotherhood,1803,9886,Osomatsu-kun (1988): Appare! Chibita no Onitaiji zansu,2104
2,804,Sword Art Online,378,445,Mirai Nikki (TV),1788,9887,Osomatsu-kun: Iyami wa Hitori Kaze no Naka,2095
3,841,Naruto,309,159,Angel Beats!,1777,7198,Yubi wo Nusunda Onna,1993
4,19,Code Geass: Hangyaku no Lelouch,298,19,Code Geass: Hangyaku no Lelouch,1693,7647,Tekkon Kinkreet Pilot,1985
5,760,Elfen Lied,291,804,Sword Art Online,1540,5132,Bobobo-bo Bo-bobo Recap,1889
6,1,Fullmetal Alchemist: Brotherhood,288,86,Shingeki no Kyojin,1520,10951,Cencoroll 2,1587
7,159,Angel Beats!,285,760,Elfen Lied,1487,622,Drifters,1493
8,449,Tokyo Ghoul,275,643,Ao no Exorcist,1459,8384,City Hunter: Ryou no Propose,1485
9,15,Sen to Chihiro no Kamikakushi,255,841,Naruto,1438,6846,Tang Lang Bu Chan,1399
