In [1]:
import pandas as pd
import os
import tqdm
import numpy as np

In [2]:
# Read the anime metadata table and the user rating table from the local data folder.
animes = pd.read_csv("../data/copperunion/anime.csv")
ratings = pd.read_csv("../data/copperunion/rating.csv")

# Preview the first rows of each table (rendered one after the other).
display(animes.head(), ratings.head())

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [3]:
# Dataset sizes: distinct users in the ratings table, distinct animes in the
# metadata table, and the raw number of rating rows.
n_users = ratings["user_id"].nunique()
n_items = animes["anime_id"].nunique()
print(f"{n_users=} {n_items=} {len(ratings)=}")

# anime_id -> row label of `animes` (reset_index exposes the label as the
# "index" column), plus the reverse lookup.
anime_to_inc_id = (
    animes
    .reset_index()
    .set_index('anime_id')
    ["index"]
    .to_dict()
)
inc_id_to_anime = dict(zip(anime_to_inc_id.values(), anime_to_inc_id.keys()))
print(f"{len(anime_to_inc_id)=} {max(anime_to_inc_id.values())=}")

# Anime ids that occur in the ratings table but have no metadata row.
all_animes = set(animes["anime_id"].unique())
all_interacted_animes = set(ratings["anime_id"].unique())
lacking_metadata_animes = set(all_interacted_animes).difference(all_animes)
print(f"{lacking_metadata_animes=}")

# Keep only ratings whose anime has metadata, then recount the users.
has_metadata = ~ratings["anime_id"].isin(lacking_metadata_animes)
ratings = ratings[has_metadata]
n_users = ratings["user_id"].nunique()
print(f"{n_users=} {len(ratings)=}")

n_users=73515 n_items=12294 len(ratings)=7813737
len(anime_to_inc_id)=12294 max(anime_to_inc_id.values())=12293
lacking_metadata_animes={np.int64(30913), np.int64(30924), np.int64(20261)}
n_users=73515 len(ratings)=7813727


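Compute the Anime-Anime Jaccard Similarity Matrix (12k x 12k)

For two animes, the Jaccard similarity is the number of users who interacted with both, divided by the number of users who interacted with at least one of them.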

In [18]:
# Debugging cap: only the first MAX_ANIMES animes (in ascending anime_id order)
# get a similarity row; every other row of the saved matrix stays all-zero.
# Set to None to compute a row for every anime.
MAX_ANIMES = 100

# One row per distinct (user, anime) pair, so a repeated rating row cannot
# inflate the set sizes that go into the Jaccard index.
ratings_interactions = ratings[["user_id", "anime_id"]].drop_duplicates()
# Loop-invariant: number of distinct users per anime (computed once, not per row).
users_per_anime = ratings_interactions.groupby("anime_id")["user_id"].count()

# Row i / column j are indexed by anime_to_inc_id; the diagonal is left at 0.
item_similarity_matrix = np.zeros((n_items, n_items))
for anime_id, canonical_anime_id in tqdm.tqdm(
    sorted(anime_to_inc_id.items())[:MAX_ANIMES]
):
    # Users who interacted with the current anime.
    anime_users = ratings_interactions.loc[
        ratings_interactions["anime_id"] == anime_id, "user_id"
    ]
    # |B| for every other anime B that has at least one interaction.
    other_counts = users_per_anime.drop(anime_id, errors="ignore")

    # |A ∩ B|: among the interactions of A's users, count rows per anime.
    # Animes sharing no user with A are filled with 0 by the reindex, which
    # also drops the current anime itself.
    shares_user = ratings_interactions["user_id"].isin(anime_users)
    intersection = (
        ratings_interactions.loc[shares_user, "anime_id"]
        .value_counts(sort=False)
        .reindex(other_counts.index, fill_value=0)
    )

    # |A ∪ B| = |A| + |B| - |A ∩ B|; never 0 because every B here has >= 1 user.
    union = len(anime_users) + other_counts - intersection
    jaccard = intersection / union

    # Scatter the similarities into a dense row using the incremental ids.
    row = np.zeros(n_items)
    row[other_counts.index.map(anime_to_inc_id).to_numpy()] = jaccard.to_numpy()
    item_similarity_matrix[canonical_anime_id] = row
    if np.all(row == 0):
        print(f"{anime_id=} {canonical_anime_id=} All zeroes")
np.save("../data/copperunion/item_similarity_matrix.npy", item_similarity_matrix)

  1%|          | 1/100 [00:00<00:33,  2.91it/s]

In [14]:
# Read the item-item similarity matrix back from its .npy file (the same path
# the matrix is saved to with np.save), so it can be used without recomputing it.
item_similarity_matrix = np.load("../data/copperunion/item_similarity_matrix.npy")

In [17]:
# True for each row of the similarity matrix whose entries are all exactly 0.
all_zeros = np.all(item_similarity_matrix == 0, axis=1)
# Cell output: number of rows that contain at least one non-zero similarity.
len(np.where(~all_zeros)[0])

99

In [None]:
def inference(user_id, items, ratings, k):
    """Recommend up to ``k`` animes from a user's rated items.

    Each candidate anime is scored by the similarity-weighted sum of the
    supplied ratings (``item_similarity_matrix @ ratings_vector``); candidates
    are returned in descending score order, excluding the input items.

    Relies on the notebook globals ``anime_to_inc_id``, ``inc_id_to_anime``,
    ``n_items`` and ``item_similarity_matrix``.

    Args:
        user_id: Not used by the computation; kept for interface compatibility.
        items: Anime ids the user has interacted with. Each must be a key of
            ``anime_to_inc_id`` (a ``KeyError`` is raised otherwise).
        ratings: Rating values aligned one-to-one with ``items``. Note that
            this parameter shadows the notebook-level ``ratings`` DataFrame.
        k: Maximum number of recommendations to return.

    Returns:
        A list of at most ``k`` anime ids, best score first.
    """
    item_indices = np.asarray(
        [anime_to_inc_id[anime_id] for anime_id in items], dtype=np.intp
    )

    # Keep everything 1-D. With an (n_items, 1) column vector, np.argsort
    # sorts along the last axis (length 1) and returns only zeros, so no
    # ranking would take place.
    ratings_vector = np.zeros(n_items)
    ratings_vector[item_indices] = ratings
    scores = item_similarity_matrix @ ratings_vector

    # Descending by score; a stable sort makes tie order deterministic
    # (lower incremental id first).
    order = np.argsort(-scores, kind="stable")
    # Never recommend something the user already interacted with.
    filtered_order = order[~np.isin(order, item_indices)]
    return [inc_id_to_anime[int(v)] for v in filtered_order[:k]]