# Anime Recommendation System Project

## Imported Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import random
from random import randint

In [2]:
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
#from sklearn.preprocessing import MinMaxScalerort sklearn as skl

from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds

In [3]:

from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.prediction_algorithms import knns
from surprise.prediction_algorithms import SVD
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise import Reader
from surprise import Dataset

In [None]:

from pyspark import SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession ,Row
from pyspark.sql.functions import col
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType,StructField,IntegerType

from pyspark.ml.recommendation import ALS

## Datasets

#### Main Anime Dataset (anime.csv)

In [4]:
anime_main = pd.read_csv("data/anime.csv")

#### Anime Ratings Dataset (rating_complete.csv)

In [5]:
anime_ratings = pd.read_csv("data/rating_complete.csv")

In [None]:
print(anime_main.shape)
print(anime_main.columns.unique())
anime_main.head()

In [None]:
print(anime_ratings.shape)
print(anime_ratings.columns.unique())
anime_ratings.head()

## Dataset Cleaning and Merging

In [6]:
# Clean main anime dataset and keep necessary features
anime_main = anime_main[['MAL_ID', 'Name', 'Score', 'Genres', 'Type', 'Episodes']]
anime_main.dropna(inplace=True)

print(anime_main.shape)
print(anime_main.columns)
anime_main.head()

(17562, 6)
Index(['MAL_ID', 'Name', 'Score', 'Genres', 'Type', 'Episodes'], dtype='object')


Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",TV,52


In [7]:
# Rename columns and merge datasets
anime_main = anime_main.rename(columns={'MAL_ID': 'Anime ID'})
anime_ratings = anime_ratings.rename(columns={'user_id': 'User ID', 'anime_id': 'Anime ID', 'rating': 'Rating'})
anime_df = pd.merge(anime_main, anime_ratings, on='Anime ID')

In [None]:
print(anime_df.shape)
print(anime_df.columns)
anime_df.head()

### Sampling User Data

In [16]:
anime_df.head(5)

Unnamed: 0,Anime ID,Name,Score,Genres,Type,Episodes,User ID,Rating
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,3,9
1,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,6,6
2,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,14,9
3,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,19,8
4,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,22,9


CSV for mapping Name to ID (app.py use)

In [8]:
# Create csv for mapping name to ID
animeName_id_df = anime_main[['Anime ID', 'Name']]
animeName_id_df.to_csv('data/id_to_name.csv', index=False)

## Data Visualizations

In [None]:
sns.countplot(x='Type', data=anime_main, color='cyan')

OVA = Original Video Animation, ONA = Original Net Animation

In [None]:
sns.countplot(x='Rating', data=anime_ratings, color='red')

In [8]:
# unique Genre values
genres = anime_main['Genres']
genres = pd.DataFrame([sub.split(",") for sub in genres])
pd.unique(genres.values.ravel('K'))

array(['Action', 'Adventure', 'Comedy', 'Slice of Life', 'Drama',
       'Sci-Fi', 'Samurai', 'Game', 'Harem', 'Military', 'Space', 'Music',
       'Mecha', 'Supernatural', 'Historical', 'Mystery', 'School',
       'Hentai', 'Fantasy', 'Ecchi', 'Horror', 'Kids', 'Sports',
       'Dementia', 'Magic', 'Romance', 'Police', 'Psychological', 'Cars',
       'Shounen', 'Demons', 'Parody', 'Shoujo', 'Super Power', 'Vampire',
       'Martial Arts', 'Seinen', 'Yaoi', 'Thriller', 'Josei', 'Unknown',
       'Shounen Ai', ' Adventure', ' Drama', ' Sci-Fi', ' Mystery',
       ' Fantasy', ' Sports', ' Comedy', ' Cars', ' Horror', ' Shounen',
       ' Romance', ' Supernatural', ' Military', ' Mecha', ' Dementia',
       ' Historical', ' Magic', ' Slice of Life', ' Demons', ' Harem',
       ' School', ' Ecchi', ' Psychological', ' Game', ' Super Power',
       ' Hentai', None, ' Parody', ' Music', ' Space', ' Shoujo',
       ' Josei', ' Seinen', ' Samurai', ' Martial Arts', ' Police',
       ' Kids', '

In [9]:
# Create ID to name dictionary
def read_item_names():
    file_name = "data/anime.csv"
    rid_to_name = {}
    name_to_rid = {}
    with open(file_name, encoding="ISO-8859-1") as f:
        # skip header line
        next(f)
        for line in f:
            line = line.split(",")
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid

rid_to_name, name_to_rid = read_item_names()

In [10]:
# From a dataframe sample all the ratings from N random users, using 'User ID'
def sample_n_user_ratings(source_df, n):
    users = source_df['User ID'].unique()
    user_id_set = set(np.random.choice(users, size=n, replace=False))

    result_df = source_df[source_df['User ID'].isin(user_id_set)].copy()
    
    return result_df


## Grid Search / SVD

Data Setup

In [25]:
# Sample subset of data
svd_anime_sample = sample_n_user_ratings(anime_df,2000)
svd_anime_sample = svd_anime_sample[['User ID', 'Anime ID', 'Rating']]

In [26]:
# Save as file for frontend usage
svd_anime_sample.to_csv('data/frontend_svd_sample.csv', index=False)

In [192]:
print(len(svd_anime_sample['User ID'].unique()))

2000


In [12]:
def create_dataset_from_df(df):
    reader = Reader(rating_scale=(1,10))
    return Dataset.load_from_df(df, reader)

# Read into Suprise dataset
svd_ratings_dataset =create_dataset_from_df(svd_anime_sample)

Grid Search

In [27]:
# Grid search for optimal hyperparameters
param_grid = {
    'lr_all' : [.01],
    'n_factors' : [50, 75, 100],
    'reg_all': [.1],
    'n_epochs' : [50, 100, 150]
}
gridsearch_svd = GridSearchCV(
    SVD, 
    param_grid = param_grid, 
    n_jobs = -1, 
    joblib_verbose = 3)

gridsearch_svd.fit(svd_ratings_dataset)

print(gridsearch_svd.best_score)
print(gridsearch_svd.best_params)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.3min


{'rmse': 1.1934692448208541, 'mae': 0.8969901616983795}
{'rmse': {'lr_all': 0.01, 'n_factors': 100, 'reg_all': 0.1, 'n_epochs': 50}, 'mae': {'lr_all': 0.01, 'n_factors': 100, 'reg_all': 0.1, 'n_epochs': 50}}


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  4.1min finished


Training Model Instance

In [34]:
# Create model instance from grid search optimal hyperparameters
svd_instance = gridsearch_svd.best_estimator["rmse"]
# Print Params
print(svd_instance.__dict__)

{'n_factors': 100, 'n_epochs': 50, 'biased': True, 'init_mean': 0, 'init_std_dev': 0.1, 'lr_bu': 0.01, 'lr_bi': 0.01, 'lr_pu': 0.01, 'lr_qi': 0.01, 'reg_bu': 0.1, 'reg_bi': 0.1, 'reg_pu': 0.1, 'reg_qi': 0.1, 'random_state': 1, 'verbose': False, 'bsl_options': {}, 'sim_options': {'user_based': True}, 'trainset': <surprise.trainset.Trainset object at 0x0000027B5C7BF190>, 'bu': array([-0.3863042 ,  0.4155742 ,  0.69548226, ..., -0.10589101,
        1.11857463,  0.09056355]), 'bi': array([ 0.95782517,  0.57394875,  0.60300892, ..., -0.69757199,
       -1.80394643, -0.52963547]), 'pu': array([[ 0.41451774,  0.20069583, -0.08855935, ..., -0.29901424,
        -0.01753678,  0.22015775],
       [-0.08117853,  0.14210263, -0.16282617, ...,  0.1262382 ,
        -0.07117056, -0.15915492],
       [-0.09291261, -0.07889247,  0.05056368, ..., -0.28307252,
        -0.06643822, -0.0373215 ],
       ...,
       [-0.05159044, -0.0496819 ,  0.09041949, ..., -0.0546013 ,
         0.06787956,  0.15260583],


Basic SVD Rating Prediction Recommendation System

In [20]:
# Appends a new user to the df for refitting and predicting
# We need to do this because suprise does not support iterative training with SVD
def create_predict_dataset(base_df, anime_ids, ratings):
    predictor_df = base_df.copy()
    for i in range(len(anime_ids)):
        predictor_df.loc[len(predictor_df)] = [-1,anime_ids[i], ratings[i]]
    
    return create_dataset_from_df(predictor_df)


In [21]:

# Displays predictions for some user ID
# TODO: Should display name of anime
def show_predictions(model_instance, user_id, anime_ids, ratings_df):
    for anime_id in anime_ids:
        condition = (ratings_df['User ID'] == user_id) & (ratings_df['Anime ID'] == anime_id)
        model_instance.predict(
            user_id, 
            anime_id, 
            #ratings_df.loc[condition, 'Rating'],
            verbose = True)

# Creates a dataframe for the predictions
def get_predictions(model_instance, user_id, anime_ids):

    predict_ratings = pd.DataFrame(columns=['Anime ID', 'Rating'])

    # Use suprise model predict method to get predictions
    # This only works on userIDs that were in the training set
    for anime_id in anime_ids:
        prediction = model_instance.predict(
            user_id, 
            anime_id)

        predict_ratings.loc[len(predict_ratings)] = [anime_id, prediction.est]

    return predict_ratings

In [22]:
# print(anime_ratings[anime_ratings['User ID'] == 35])
# show_predictions(svd_instance, 35, [64, 6707, 6547, 4898], svd_anime_sample)

In [23]:
# Create predictions for a mew user provided their anime ratings
# This refits the entire model with the new user appended onto the base df with a user ID of -1

def create_predictions_for_user(model_instance, base_df, anime_ids, ratings):

    # Create suprise dataset with new user
    predict_dataset = create_predict_dataset(
        base_df, 
        anime_ids,
        ratings)

    # Train on entire dataset
    # TODO: Should we do this?
    model_instance.fit(predict_dataset.build_full_trainset())
    # model_instance.fit(full_trainset)

    # Create a series of all the anime IDs that want to be predicted (all of them, more or less)
    predict_anime_ids = base_df['Anime ID'];
    predict_anime_ids = predict_anime_ids.append(pd.Series(anime_ids)).unique()
    
    # Show predictions for the known ratings
    # show_predictions(model_instance, -1, anime_ids, svd_anime_sample)

    # Generate and return predictions for all the anime
    return get_predictions(model_instance, -1, predict_anime_ids)

# Print out information for top N predictions
def display_top_n(predictions, n):
    # Sort descending
    predictions = predictions.sort_values('Rating', ascending=False)
    
    print(predictions)
    print("Top {} predicted scores".format(n))
    # Print information about top n
    for index, row in predictions.head(n).iterrows():
        anime_id = row['Anime ID']
        rating = row['Rating']
        name = rid_to_name[str(int(anime_id))]
        print("Anime: {} Rating: {} Name: {}".format(anime_id, rating, name))

In [29]:
# User 1 
# Drama / romance

user_1_anime_ids = [4224, 23273, 1723, 32281, 37450, 2167, 121]
user_1_ratings = [10, 8, 9, 9, 10, 9, 4]

# No randomness between each fit
svd_instance.random_state = 1

user_1_predictions = create_predictions_for_user(
    svd_instance,
    svd_anime_sample,
    user_1_anime_ids,
    user_1_ratings
    )

display_top_n(user_1_predictions, 25)

  predict_anime_ids = predict_anime_ids.append(pd.Series(anime_ids)).unique()


user: -1         item: 4224       r_ui = None   est = 8.92   {'was_impossible': False}
user: -1         item: 23273      r_ui = None   est = 8.48   {'was_impossible': False}
user: -1         item: 1723       r_ui = None   est = 8.57   {'was_impossible': False}
user: -1         item: 32281      r_ui = None   est = 8.85   {'was_impossible': False}
user: -1         item: 37450      r_ui = None   est = 9.13   {'was_impossible': False}
user: -1         item: 2167       r_ui = None   est = 8.87   {'was_impossible': False}
user: -1         item: 121        r_ui = None   est = 6.04   {'was_impossible': False}
       Anime ID    Rating
2474     2904.0  9.250839
3166     4181.0  9.143726
9892    37450.0  9.125629
1389     1575.0  9.063367
10209   38329.0  9.014899
...         ...       ...
3190     4250.0  4.567203
1429     1622.0  4.556024
2700     3287.0  4.447982
383       413.0  4.362190
7636    29949.0  4.342173

[11138 rows x 2 columns]
Top 25 predicted scores
Anime: 2904.0 Rating: 9.25083

In [196]:
# User 2 
# Action / Adventure
user_2_pred = create_predictions_for_user(
    svd_instance,
    svd_anime_sample,
    [114, 31964, 32051, 34134, 38000],
    [9, 10, 8, 9, 9]
    )

display_top_n(user_2_pred, 25)

  predict_anime_ids = predict_anime_ids.append(pd.Series(anime_ids)).unique()


user: -1         item: 114        r_ui = None   est = 9.13   {'was_impossible': False}
user: -1         item: 31964      r_ui = None   est = 9.19   {'was_impossible': False}
user: -1         item: 32051      r_ui = None   est = 8.05   {'was_impossible': False}
user: -1         item: 34134      r_ui = None   est = 8.63   {'was_impossible': False}
user: -1         item: 38000      r_ui = None   est = 9.39   {'was_impossible': False}
      Anime ID    Rating
1513    1695.0  9.861566
5126   11061.0  9.858676
3508    5114.0  9.849106
7782   33050.0  9.785896
7010   28977.0  9.758181
...        ...       ...
382      413.0  5.217306
2715    3287.0  4.882709
5828   16608.0  4.760934
4105    6953.0  4.647107
7431   31634.0  4.317752

[9926 rows x 2 columns]
Top 25 predicted scores
Anime: 1695.0 Rating: 9.8615664599809 Name: Les MisÃ©rables: Shoujo Cosette
Anime: 11061.0 Rating: 9.858676163618954 Name: Hunter x Hunter (2011)
Anime: 5114.0 Rating: 9.849105903840957 Name: Fullmetal Alchemist: Bro

In [197]:
# User 3 
# Boys Love
user_3_anime_ids = [114, 31964, 32051, 34134, 38000, 39533, 30346, 44055, 918]
user_3_ratings = [4, 5, 4, 6, 4, 10, 10, 9, 4]

user_3_predictions = create_predictions_for_user(
    svd_instance,
    svd_anime_sample,
    user_3_anime_ids,
    user_3_ratings
)

display_top_n(user_3_predictions, 25)

  predict_anime_ids = predict_anime_ids.append(pd.Series(anime_ids)).unique()


user: -1         item: 114        r_ui = None   est = 4.90   {'was_impossible': False}
user: -1         item: 31964      r_ui = None   est = 5.23   {'was_impossible': False}
user: -1         item: 32051      r_ui = None   est = 4.31   {'was_impossible': False}
user: -1         item: 34134      r_ui = None   est = 5.51   {'was_impossible': False}
user: -1         item: 38000      r_ui = None   est = 5.55   {'was_impossible': False}
user: -1         item: 39533      r_ui = None   est = 8.82   {'was_impossible': False}
user: -1         item: 30346      r_ui = None   est = 9.32   {'was_impossible': False}
user: -1         item: 44055      r_ui = None   est = 8.90   {'was_impossible': False}
user: -1         item: 918        r_ui = None   est = 4.89   {'was_impossible': False}
      Anime ID    Rating
7152   30346.0  9.315407
9926   44055.0  8.902190
9445   39533.0  8.819841
6145   19511.0  7.952705
3921    6383.0  7.944589
...        ...       ...
1416    1585.0  3.122066
7431   31634.0  3

In [None]:
# from collections import defaultdict

# def get_top_n(predictions, n = 10):
#     top_n = defaultdict(list)
#     for uid, iid, true_r, est, _ in predictions:
#         top_n[uid].append((iid, est))

#     # Then sort the predictions for each user and retrieve the k highest ones.
#     for uid, user_ratings in top_n.items():
#         user_ratings.sort(key=lambda x: x[1], reverse=True)
#         top_n[uid] = user_ratings[:n]

#     return top_n

In [None]:
# top_anime = get_top_n(anime_predictions,n=10)

# for uid, user_ratings in top_anime.items():
#     if len([iid for (iid, _) in user_ratings]) == 10:
#         print(uid, [iid for (iid, _) in user_ratings])

In [None]:
# user_id = -1
# anime_ids = []
# for uid, user_ratings in top_anime.items():
#     if len([iid for (iid, _) in user_ratings]) == 10:
#         user_id = uid
#         anime_ids = [iid for (iid, _) in user_ratings]
#         break
# print(user_id, anime_ids)

In [None]:
# for anime_id in anime_ids:
#     print(anime_main.loc[anime_main['Anime ID'] == anime_id]['Name'].to_string(index=False))

## KNN 

In [15]:
knn_anime_sample = sample_n_user_ratings(anime_df,2000)
knn_anime_sample = knn_anime_sample[['User ID', 'Anime ID', 'Rating']]

reader = Reader(rating_scale=(1,10))
knn_anime_data = Dataset.load_from_df(knn_anime_sample, reader)

knn_anime_trainset = knn_anime_data.build_full_trainset()
knn_anime_testset = knn_anime_trainset.build_anti_testset()

In [16]:
simulation_variables = {"Name" : "pearson_baseline", "user_based" : True, "min_support" : 4}
knn_baseline = KNNBasic(sim_options = simulation_variables)
knn_baseline.fit(knn_anime_trainset)

anime_name = "Cowboy Bebop"
anime_raw_id = int(name_to_rid[anime_name])
anime_inner_id = knn_baseline.trainset.to_inner_iid(anime_raw_id)

print(rid_to_name[str(knn_baseline.trainset.to_raw_iid(anime_inner_id))])

anime_neighbors = knn_baseline.get_neighbors(anime_inner_id, k=10)

for inner_id in anime_neighbors:
    print(rid_to_name[str(knn_baseline.trainset.to_raw_iid(inner_id))])

anime_neighbors = (
    knn_baseline.trainset.to_raw_iid(inner_id) for inner_id in anime_neighbors
)

anime_neighbors = (rid_to_name[str(rid)] for rid in anime_neighbors)

Computing the msd similarity matrix...
Done computing similarity matrix.
Cowboy Bebop
Sonic X
Sakura Taisen: New York
Super GALS! Kotobuki Ran
Onna Senshi Efe & Jira: Gude no Monshou
Galaxy Angel 3 Specials
Fushigi no Kuni no Miyuki-chan
Tales of Symphonia The Animation: Sylvarant-hen
Project ARMS: The 2nd Chapter
Haou Taikei Ryuu Knight
Lupin III: Bye Bye Liberty - Kiki Ippatsu!


In [64]:
print(anime_raw_id)

2167


In [65]:
print("The 10 nearest neighbors of", anime_name, "are:")
for anime in anime_neighbors:
    print(anime)

The 10 nearest neighbors of Clannad are:
Shin Captain Tsubasa
Oni-Tensei
Tsui no Sora
Haru wo Daite Ita
Super Lovers OVA
Super Lovers 2
Green Legend Ran
Saber Marionette R
Mahou Yuugi 3D
Grenadier: Hohoemi no Senshi Specials


## Cross Validation

In [17]:
knn_basic = KNNBasic(sim_options = {'name':'pearson','user_based':True})

cv_knn_baseline = cross_validate(knn_basic, knn_anime_data, n_jobs=-1)
print(np.mean(cv_knn_baseline['test_rmse']))

1.4952481537825544


In [None]:
cv_knn_baseline

## Pickle

In [31]:
import pickle

In [None]:
pickle.dump(knn_baseline, open('knn_model.pkl','wb'))

In [32]:
pickle.dump(svd_instance, open('svd_model.pkl','wb'))