In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# load the two source tables: the anime catalogue and the per-user scores

anime = pd.read_csv('/data/anime-dataset-2023.csv')
ratings = pd.read_csv('/data/users-score-2023.csv')

In [3]:
# data cleaning & preparation

# parse 'Scored By' as a nullable integer; entries that cannot be parsed become <NA>
anime['Scored By'] = pd.to_numeric(anime['Scored By'], errors='coerce').astype('Int64')

# content filters: exclude hentai and anime music videos
content_ok = (anime['Rating'] != 'Rx - Hentai') & (anime['Type'] != 'Music')

# exclude anime with an unknown score or an unknown ranking
stats_known = (anime['Score'] != 'UNKNOWN') & (anime['Rank'] != 'UNKNOWN')

# popularity cutoffs: scored by at least 5K users, at least 10K members,
# and at least 60 favorites
# note: all numbers for the popularity cutoffs are based on the lower average of each category
popular_enough = (
    (anime['Scored By'] >= 5000)
    & (anime['Members'] >= 10000)
    & (anime['Favorites'] >= 60)
)

anime = anime[content_ok & stats_known & popular_enough]
print(anime.shape)

(3529, 24)


In [4]:
# Keep only the identifier, title and genre columns. The explicit .copy() makes
# `anime` an independent frame, so the genre columns assigned to it afterwards
# are written to this object and never to a slice of the filtered table.
anime = anime[['anime_id', 'Name', 'Genres']].copy()

In [5]:
# split the 'Genres' column into one boolean indicator column per genre
genre_clean = [
    genre
    for genre in anime['Genres'].dropna().str.split(',').explode().str.strip().unique().tolist()
    if genre  # skip empty tokens (e.g. from a trailing comma)
]

for genre_type in genre_clean:
    # column name: lower-cased genre with hyphens removed
    genre_name = genre_type.lower().replace("-", "")
    # regex=False matches the genre text literally instead of as a regular expression;
    # na=False flags rows without a genre string as False instead of NaN
    anime[genre_name] = anime.Genres.str.contains(genre_type, regex=False, na=False)

In [6]:
# per-anime rating aggregates: mean rating and number of ratings
ratings_stats = (
    ratings
    .groupby("anime_id")["rating"]
    .agg(avg_ratings="mean", total_ratings="count")
    .reset_index()
)

# attach the aggregates to the catalogue rows (inner join on anime_id)
anime_stats = pd.merge(ratings_stats, anime, on="anime_id")

In [7]:
# one row per (user, anime) pair with the mean of that user's ratings for it,
# restricted to anime present in the cleaned catalogue (inner merge)
rated_in_catalogue = ratings.merge(anime[['anime_id', 'Name']], on='anime_id')
user_anime = (
    rated_in_catalogue
    .groupby(['user_id', 'anime_id', 'Name'])['rating']
    .mean()
    .reset_index()
)

In [8]:
# anime x user rating matrix; pairs without a rating are filled with 0
ua_matrix = pd.pivot(
    user_anime,
    index='anime_id',
    columns='user_id',
    values='rating',
).fillna(0)

----

In [9]:
# get anime recommendations based on user's most-liked genre

def genre_rec(user_id, user_fav_n=10, popularity_cutoff=0.9, n_to_suggest=10, ratings=ratings, anime_data=anime, anime_stats_data=None):
    """Recommend anime from the genre that dominates a user's top-rated titles.

    Parameters
    ----------
    user_id : identifier matched against ``ratings.user_id``.
    user_fav_n : number of the user's highest-rated rows used to pick the genre.
    popularity_cutoff : quantile of ``total_ratings`` a candidate must exceed.
    n_to_suggest : maximum number of titles returned.
    ratings : frame with ``user_id``, ``anime_id`` and ``rating`` columns.
    anime_data : frame with ``anime_id`` plus one indicator column per genre.
    anime_stats_data : frame with the genre columns plus ``anime_id``, ``Name``,
        ``avg_ratings`` and ``total_ratings``. Defaults to the module-level
        ``anime_stats`` table, which is what this function always used before.

    Returns
    -------
    list of anime names, best average rating first.

    Raises
    ------
    ValueError
        If the user has no ratings, or none of their top-rated anime appear in
        ``anime_data`` (so no favourite genre can be derived).
    """
    if anime_stats_data is None:
        anime_stats_data = anime_stats

    single_user = ratings[ratings.user_id == user_id]
    if len(single_user) == 0:
        raise ValueError('User Does not exist')

    topn = single_user.sort_values('rating', ascending=False).head(user_fav_n)
    top_with_genres = topn.merge(anime_data, on=['anime_id'])
    if top_with_genres.empty:
        # Without this guard the column rename below fails with an opaque
        # pandas "Length mismatch" ValueError.
        raise ValueError("None of the user's top-rated anime are in the anime data")

    # Sum the genre indicator columns over the user's favourites; the id and
    # rating columns are numeric too, so they are dropped explicitly.
    top_genres = (
        top_with_genres
        .groupby('user_id')
        .sum(numeric_only=True)
        .drop(['anime_id', 'rating'], axis=1)
        .T
        .reset_index()
    )
    top_genres.columns = ['genre', 'n_times_ranked']
    favorite_genre = top_genres.sort_values('n_times_ranked', ascending=False).head(1)['genre'].values[0]

    target_anime = anime_stats_data[anime_stats_data[favorite_genre] == True]
    omit_seen = target_anime[~target_anime.anime_id.isin(single_user.anime_id)]
    # keep only candidates whose rating count is above the requested quantile
    popular_anime = omit_seen[omit_seen.total_ratings > omit_seen.total_ratings.quantile(popularity_cutoff)]
    top_ranked = popular_anime.sort_values('avg_ratings', ascending=False).head(n_to_suggest)

    return top_ranked['Name'].values.tolist()

In [10]:
# example: genre-based recommendations for user 4
genre_rec(user_id=4)

['Hunter x Hunter (2011)',
 'Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen',
 'Mononoke Hime',
 'Tengen Toppa Gurren Lagann',
 'Fate/Zero 2nd Season',
 'Kara no Kyoukai Movie 5: Mujun Rasen',
 'Gintama',
 'One Punch Man',
 'One Piece',
 'Hellsing Ultimate']

---

In [11]:
# k-nearest neighbors-based recommendation system

# store the anime x user rating matrix in compressed sparse row form
sparse_data = csr_matrix(ua_matrix.to_numpy())

In [12]:
# fit a nearest-neighbour index over the anime rows using cosine distance
knn_model = NearestNeighbors(algorithm='auto', metric='cosine')
knn_model.fit(sparse_data)

In [13]:
# get similar anime recommendation with knn

# anime_id -> title
title_lkp = dict(zip(anime.anime_id.values, anime.Name.values))
# Matrix row position -> anime_id. This must be derived from ua_matrix.index
# (the rows the model was fitted on): ua_matrix only has rows for anime that
# appear in user_anime, so pairing positions with `anime.anime_id` order is
# not guaranteed to line up.
index_lkp = dict(enumerate(ua_matrix.index))
# anime_id -> matrix row position
inverse_index_lkp = {anime_id: position for position, anime_id in index_lkp.items()}

def get_similar_anime(anime, data, index_lkp, model, title_lkp = None, n = 10):
    """Return the n+1 nearest rows to ``anime`` (the query itself included).

    NOTE: a later cell defines ``get_similar_anime`` again; after running the
    notebook top to bottom that later definition is the one in effect.

    Parameters
    ----------
    anime : an anime id, or a title string (resolved through ``title_lkp``).
    data : DataFrame indexed by anime id; the matching row is the query vector.
    index_lkp : mapping from row position (as returned by the model) to anime id.
    model : fitted object exposing ``kneighbors(X, n_neighbors=...)``.
    title_lkp : optional mapping from anime id to title; when given, titles
        are returned instead of ids.
    n : number of neighbours requested in addition to the query row.

    Raises
    ------
    ValueError
        If a title is given without ``title_lkp``, the title is unknown, or
        the id has no row in ``data``.
    """
    if isinstance(anime, str):
        if title_lkp is None:
            raise ValueError('title_lkp is required to look up an anime by title')
        matching_ids = [key for key, value in title_lkp.items() if value == anime]
        if not matching_ids:
            raise ValueError(f'Unknown anime title: {anime!r}')
        anime_id = matching_ids[0]  # first match wins when titles repeat
    else:
        anime_id = anime

    query = data[data.index == anime_id].to_numpy()
    if query.shape[0] == 0:
        raise ValueError(f'Anime id {anime_id!r} is not present in data')

    # n + 1 because the closest row to the query is normally the query itself
    _, indices = model.kneighbors(query.reshape(1, -1), n_neighbors=n + 1)
    results = [index_lkp[position] for position in indices[0]]
    if title_lkp is not None:
        results = [title_lkp[anime_key] for anime_key in results]

    return results

In [14]:
def get_similar_anime(anime, data, index_lkp, model, title_lkp=None, n=10):
    """Return up to ``n`` anime most similar to ``anime``, excluding the query.

    Parameters
    ----------
    anime : an anime id, or a title string (resolved through ``title_lkp``).
    data : DataFrame indexed by anime id; the matching row is the query vector.
    index_lkp : mapping from row position (as returned by the model) to anime id.
    model : fitted object exposing ``kneighbors(X, n_neighbors=...)``.
    title_lkp : optional mapping from anime id to title; when given, titles
        are returned instead of ids.
    n : maximum number of neighbours to return.

    Returns
    -------
    list of titles (if ``title_lkp`` is given) or ids, nearest first.

    Raises
    ------
    ValueError
        If a title is given without ``title_lkp``, the title is unknown, or
        the id has no row in ``data``.
    """
    if isinstance(anime, str):
        if title_lkp is None:
            raise ValueError('title_lkp is required to look up an anime by title')
        matching_ids = [key for key, value in title_lkp.items() if value == anime]
        if not matching_ids:
            raise ValueError(f'Unknown anime title: {anime!r}')
        anime_id = matching_ids[0]  # first match wins when titles repeat
    else:
        # ids are used as-is (previously this path left anime_id unbound)
        anime_id = anime

    query = data[data.index == anime_id].to_numpy()
    if query.shape[0] == 0:
        raise ValueError(f'Anime id {anime_id!r} is not present in data')

    # n + 1 because the closest row to the query is normally the query itself
    _, indices = model.kneighbors(query.reshape(1, -1), n_neighbors=n + 1)
    neighbour_ids = [index_lkp[position] for position in indices[0]]
    # Drop the query by id (works with or without title_lkp) and cap at n in
    # case the query row was not among its own nearest neighbours.
    neighbour_ids = [candidate for candidate in neighbour_ids if candidate != anime_id][:n]

    if title_lkp is not None:
        return [title_lkp[candidate] for candidate in neighbour_ids]
    return neighbour_ids


In [15]:
# example: titles closest to 'One Piece' according to the k-NN model
get_similar_anime(
    'One Piece',
    data=ua_matrix,
    index_lkp=index_lkp,
    model=knn_model,
    title_lkp=title_lkp,
)

['Bleach',
 'Naruto',
 'Naruto: Shippuuden',
 'Fairy Tail',
 'Death Note',
 'Fullmetal Alchemist',
 'Fullmetal Alchemist: Brotherhood',
 'Code Geass: Hangyaku no Lelouch',
 'One Piece Film: Strong World',
 'Dragon Ball Z']

---

In [16]:
# combining genre-based and knn recommendation systems

def anime_rec(user_id):
    """Combine the genre-based and k-NN recommenders for one user.

    For every title suggested by ``genre_rec`` the single nearest neighbour
    from ``get_similar_anime`` is taken; duplicates and titles the user has
    already rated (per ``user_anime``) are removed.

    Returns
    -------
    list of anime titles, in the order their seed titles were suggested.
    Errors raised by ``genre_rec`` (e.g. unknown user) propagate unchanged.
    """
    genre_recs = genre_rec(user_id)

    # Titles the user already rated; built once, as a set, for fast lookups.
    watched = set(user_anime.loc[user_anime.user_id == user_id, 'Name'])

    # A dict keeps first-seen order while de-duplicating, so the result is
    # deterministic (a set would make the order vary between runs).
    knn_recs = {}
    for anime_name in genre_recs:
        top_k = get_similar_anime(anime_name, ua_matrix, index_lkp, knn_model, title_lkp)
        if not top_k:
            continue  # no neighbour available for this seed title
        nearest = top_k[0]
        if nearest not in watched:
            knn_recs[nearest] = None

    return list(knn_recs)

In [17]:
# example: combined genre + k-NN recommendations for user 4
anime_rec(user_id=4)

['Bleach',
 'Kara no Kyoukai Movie 4: Garan no Dou',
 'Rurouni Kenshin: Meiji Kenkaku Romantan - Seisou-hen',
 'Fate/Zero',
 'Sen to Chihiro no Kamikakushi',
 "Gintama'",
 'Mob Psycho 100']