# Anime Recommendation System Project

## Imported Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import random
from random import randint

In [2]:

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
#from sklearn.preprocessing import MinMaxScalerort sklearn as skl

from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds

In [11]:

from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.prediction_algorithms import knns
from surprise.prediction_algorithms import SVD
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise import Reader
from surprise import Dataset


In [None]:

from pyspark import SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession ,Row
from pyspark.sql.functions import col
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType,StructField,IntegerType

from pyspark.ml.recommendation import ALS

## Datasets

#### Main Anime Dataset (anime.csv)

In [3]:
anime_main = pd.read_csv("data/anime.csv")

#### Anime Ratings Dataset (rating_complete.csv)

In [4]:
anime_ratings = pd.read_csv("data/rating_complete.csv")

In [None]:
print(anime_main.shape)
print(anime_main.columns.unique())
anime_main.head()

In [None]:
print(anime_ratings.shape)
print(anime_ratings.columns.unique())
anime_ratings.head()

## Dataset Cleaning and Merging

In [5]:
# Clean main anime dataset and keep necessary features
anime_main = anime_main[['MAL_ID', 'Name', 'Score', 'Genres', 'Type', 'Episodes']]
anime_main.dropna(inplace=True)

print(anime_main.shape)
print(anime_main.columns)
anime_main.head()

(17562, 6)
Index(['MAL_ID', 'Name', 'Score', 'Genres', 'Type', 'Episodes'], dtype='object')


Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",TV,52


In [6]:
# Rename columns and merge datasets
anime_main = anime_main.rename(columns={'MAL_ID': 'Anime ID'})
anime_ratings = anime_ratings.rename(columns={'user_id': 'User ID', 'anime_id': 'Anime ID', 'rating': 'Rating'})
anime_df = pd.merge(anime_main, anime_ratings, on='Anime ID')

In [None]:
print(anime_df.shape)
print(anime_df.columns)
anime_df.head()

In [7]:
anime_sample = anime_df.sample(n=1000)

CSV for mapping Name to ID (app.py use)

In [19]:
# Create csv for mapping name to ID
animeName_id_df = anime_main[['Anime ID', 'Name']]
animeName_id_df.to_csv('data/id_to_name.csv', index=False)

## Data Visualizations

In [None]:
sns.countplot(x='Type', data=anime_main, color='cyan')

OVA = Original Video Animation, ONA = Original Net Animation

In [None]:
sns.countplot(x='Rating', data=anime_ratings, color='red')

In [9]:
# unique Genre values
genres = anime_main['Genres']
genres = pd.DataFrame([sub.split(",") for sub in genres])
pd.unique(genres.values.ravel('K'))

array(['Action', 'Adventure', 'Comedy', 'Slice of Life', 'Drama',
       'Sci-Fi', 'Samurai', 'Game', 'Harem', 'Military', 'Space', 'Music',
       'Mecha', 'Supernatural', 'Historical', 'Mystery', 'School',
       'Hentai', 'Fantasy', 'Ecchi', 'Horror', 'Kids', 'Sports',
       'Dementia', 'Magic', 'Romance', 'Police', 'Psychological', 'Cars',
       'Shounen', 'Demons', 'Parody', 'Shoujo', 'Super Power', 'Vampire',
       'Martial Arts', 'Seinen', 'Yaoi', 'Thriller', 'Josei', 'Unknown',
       'Shounen Ai', ' Adventure', ' Drama', ' Sci-Fi', ' Mystery',
       ' Fantasy', ' Sports', ' Comedy', ' Cars', ' Horror', ' Shounen',
       ' Romance', ' Supernatural', ' Military', ' Mecha', ' Dementia',
       ' Historical', ' Magic', ' Slice of Life', ' Demons', ' Harem',
       ' School', ' Ecchi', ' Psychological', ' Game', ' Super Power',
       ' Hentai', None, ' Parody', ' Music', ' Space', ' Shoujo',
       ' Josei', ' Seinen', ' Samurai', ' Martial Arts', ' Police',
       ' Kids', '

In [8]:
# Create ID to name dictionary
def read_item_names():
    file_name = "data/anime.csv"
    rid_to_name = {}
    name_to_rid = {}
    with open(file_name, encoding="ISO-8859-1") as f:
        # skip header line
        next(f)
        for line in f:
            line = line.split(",")
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid

rid_to_name, name_to_rid = read_item_names()

## Grid Search / SVD

Data Setup

In [None]:
# Sample subset of data
svd_anime_sample = anime_df.sample(n=5000)
svd_anime_sample = svd_anime_sample[['User ID', 'Anime ID', 'Rating']]

In [None]:
def create_dataset_from_df(df):
    reader = Reader(rating_scale=(1,10))
    return Dataset.load_from_df(df, reader)

# Read into Suprise dataset
svd_ratings_dataset =create_dataset_from_df(svd_anime_sample)
svd_ratings_dataset = Dataset.load_from_df(svd_anime_sample, reader)

Grid Search

In [None]:
# Grid search for optimal hyperparameters
param_grid = {
    'lr_all' : [.05, .01],
    'n_factors' : [50, 75, 100],
    'reg_all': [.05,.1],
    'n_epochs' : [50, 100, 150]
}
gridsearch_svd = GridSearchCV(
    SVD, 
    param_grid = param_grid, 
    n_jobs = -1, 
    joblib_verbose = 3)

gridsearch_svd.fit(svd_ratings_dataset)

print(gridsearch_svd.best_score)
print(gridsearch_svd.best_params)


Training Model Instance

In [None]:
# Create model instance from grid search optimal hyperparameters
svd_instance = gridsearch_svd.best_estimator["rmse"]
# Print Params
# print(svd_instance.__dict__)

Basic SVD Rating Prediction Recommendation System

In [None]:
# Appends a new user to the df for refitting and predicting
# We need to do this because suprise does not support iterative training with SVD
def create_predict_dataset(base_df, anime_ids, ratings):
    predictor_df = base_df.copy()
    for i in range(len(anime_ids)):
        predictor_df.loc[len(predictor_df)] = [-1,anime_ids[i], ratings[i]]
    
    return create_dataset_from_df(predictor_df)


In [None]:

# Displays predictions for some user ID
# TODO: Should display name of anime
def show_predictions(model_instance, user_id, anime_ids, ratings_df):
    for anime_id in anime_ids:
        condition = (ratings_df['User ID'] == user_id) & (ratings_df['Anime ID'] == anime_id)
        model_instance.predict(
            user_id, 
            anime_id, 
            #ratings_df.loc[condition, 'Rating'],
            verbose = True)

# Creates a dataframe for the predictions
def get_predictions(model_instance, user_id, anime_ids):

    predict_ratings = pd.DataFrame(columns=['Anime ID', 'Rating'])

    # Use suprise model predict method to get predictions
    # This only works on userIDs that were in the training set
    for anime_id in anime_ids:
        prediction = model_instance.predict(
            user_id, 
            anime_id)

        predict_ratings.loc[len(predict_ratings)] = [anime_id, prediction.est]

    return predict_ratings

In [None]:
print(anime_ratings[anime_ratings['User ID'] == 35])
show_predictions(svd_instance, 35, [64, 6707, 6547, 4898], svd_anime_sample)

In [None]:
# Create predictions for a mew user provided their anime ratings
# This refits the entire model with the new user appended onto the base df with a user ID of -1

def create_predictions_for_user(model_instance, base_df, anime_ids, ratings):

    # Create suprise dataset with new user
    predict_dataset = create_predict_dataset(
        base_df, 
        anime_ids,
        ratings)

    # Train on entire dataset
    # TODO: Should we do this?
    model_instance.fit(predict_dataset.build_full_trainset())
    # model_instance.fit(full_trainset)

    # Create a series of all the anime IDs that want to be predicted (all of them, more or less)
    predict_anime_ids = base_df['Anime ID'];
    predict_anime_ids = predict_anime_ids.append(pd.Series(anime_ids)).unique()
    
    # Show predictions for the known ratings
    show_predictions(model_instance, -1, anime_ids, svd_anime_sample)

    # Generate and return predictions for all the anime
    return get_predictions(model_instance, -1, predict_anime_ids)

# Print out information for top N predictions
def display_top_n(predictions, n):
    # Sort descending
    predictions = predictions.sort_values('Rating', ascending=False)
    
    print(predictions)
    print("Top {} predicted scores".format(n))
    # Print information about top n
    for index, row in predictions.head(n).iterrows():
        anime_id = row['Anime ID']
        rating = row['Rating']
        name = rid_to_name[str(int(anime_id))]
        print("Anime: {} Rating: {} Name: {}".format(anime_id, rating, name))


Tests?

In [None]:
# User 1 
# Drama / romance

user_1_anime_ids = [4224, 23273, 1723, 32281, 37450, 2167, 121]
user_1_ratings = [10, 8, 9, 9, 10, 9, 4]

# No randomness between each fit
svd_instance.random_state = 1

user_1_predictions = create_predictions_for_user(
    svd_instance,
    svd_anime_sample,
    user_1_anime_ids,
    user_1_ratings
    )

display_top_n(user_1_predictions, 25)

In [None]:
# User 1 
# Action / Adventure
user_2_pred = create_predictions_for_user(
    svd_instance,
    svd_anime_sample,
    [114, 31964, 32051, 34134, 38000],
    [9, 10, 8, 9, 9]
    )

display_top_n(user_2_pred, 25)

In [None]:
# User 3 
# Boys Love
user_3_anime_ids = [114, 31964, 32051, 34134, 38000, 39533, 30346, 44055, 918]
user_3_ratings = [4, 5, 4, 6, 4, 10, 10, 9, 4]

user_3_predictions = create_predictions_for_user(
    svd_instance,
    svd_anime_sample,
    user_3_anime_ids,
    user_3_ratings
)

display_top_n(user_3_predictions, 25)

Older SVD Stuff idk what this does

In [None]:
# from collections import defaultdict

# def get_top_n(predictions, n = 10):
#     top_n = defaultdict(list)
#     for uid, iid, true_r, est, _ in predictions:
#         top_n[uid].append((iid, est))

#     # Then sort the predictions for each user and retrieve the k highest ones.
#     for uid, user_ratings in top_n.items():
#         user_ratings.sort(key=lambda x: x[1], reverse=True)
#         top_n[uid] = user_ratings[:n]

#     return top_n

In [None]:
# top_anime = get_top_n(anime_predictions,n=10)

# for uid, user_ratings in top_anime.items():
#     if len([iid for (iid, _) in user_ratings]) == 10:
#         print(uid, [iid for (iid, _) in user_ratings])

In [None]:
# user_id = -1
# anime_ids = []
# for uid, user_ratings in top_anime.items():
#     if len([iid for (iid, _) in user_ratings]) == 10:
#         user_id = uid
#         anime_ids = [iid for (iid, _) in user_ratings]
#         break
# print(user_id, anime_ids)


In [None]:
# for anime_id in anime_ids:
#     print(anime_main.loc[anime_main['Anime ID'] == anime_id]['Name'].to_string(index=False))

## KNN 

In [12]:
knn_anime_sample = anime_df.sample(n=5000)[['User ID', 'Anime ID', 'Rating']]

reader = Reader(line_format = 'user item rating', sep='')
knn_anime_data = Dataset.load_from_df(knn_anime_sample, reader)

knn_anime_trainset = knn_anime_data.build_full_trainset()
knn_anime_testset = knn_anime_trainset.build_anti_testset()

In [13]:
simulation_variables = {"Name" : "pearson_baseline", "user_based" : False}
knn_baseline = KNNBaseline(sim_options = simulation_variables)
knn_baseline.fit(knn_anime_trainset)

anime_name = "Cowboy Bebop"
anime_raw_id = int(name_to_rid[anime_name])
anime_inner_id = knn_baseline.trainset.to_inner_iid(anime_raw_id)

anime_neighbors = knn_baseline.get_neighbors(anime_inner_id, k=10)

anime_neighbors = (
    knn_baseline.trainset.to_raw_iid(inner_id) for inner_id in anime_neighbors
)

anime_neighbors = (rid_to_name[str(rid)] for rid in anime_neighbors)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


In [14]:
print("The 10 nearest neighbors of", anime_name, "are:")
for anime in anime_neighbors:
    print(anime)

The 10 nearest neighbors of Cowboy Bebop are:
Nisekoi
Bokusatsu Tenshi Dokuro-chan
Cheonnyeon-yeowoo Yeowoobi
High School DxD OVA
Toaru Kagaku no Railgun: Motto Marutto Railgun
Gangsta.
Log Horizon 2nd Season
Suzumiya Haruhi no Yuuutsu
Walkure Romanze
Tsurezure Children


## Cross Validation

In [None]:
knn_basic = KNNBasic(sim_options = {'name':'pearson','user_based':True})

cv_knn_baseline = cross_validate(knn_basic, knn_anime_data, n_jobs=-1)
print(np.mean(cv_knn_baseline['test_rmse']))

In [46]:
cv_knn_baseline

{'test_rmse': array([2.98546479, 2.99583044, 2.96293773, 3.05908483, 2.95059316]),
 'test_mae': array([2.645, 2.657, 2.607, 2.73 , 2.614]),
 'fit_time': (0.43232035636901855,
  0.4418449401855469,
  0.41533398628234863,
  0.4521176815032959,
  0.4691200256347656),
 'test_time': (0.0075299739837646484,
  0.0059986114501953125,
  0.00792551040649414,
  0.006028175354003906,
  0.006009578704833984)}

## Pickle

In [15]:
import pickle
pickle.dump(knn_baseline, open('knn_model.pkl','wb')) 