# Adnime Recommendations with Surprise

## Load in the necessary packages and get a quick look at the data

In [1]:
# Surprise is not downloaded in Google Colab instance so we have to download the packages first
!pip install surprise



In [2]:
# importing all of the relevant packages for the project
# Standard Packages
import pandas as pd
import numpy as np

# Surprise Packages
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline, SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split


In [3]:
# Load in the data as Pandas DataFrame
anime_df = pd.read_csv('anime.csv')
anime_list_df = pd.read_csv('animelist.csv')

In [4]:
# Looking at the different features in the dataset, the set will fit into a 
# surprise model well as it has the relavent columns
anime_list_df.head()

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,0,67,9,1,1
1,0,6702,7,1,4
2,0,242,10,1,4
3,0,4898,0,1,1
4,0,21,10,1,0


In [5]:
# Looking at the different features of the anime, there seems to be some anime 
# with episode less than a minute and also anime of ill repute that we will
# not include in the scope of the project.
anime_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genders,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),28.0,39,1251960,61971,105808,718161,71513,26678,329800,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),159.0,518,273145,1174,4143,208333,1935,770,57964,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,266.0,201,558913,12944,29113,343492,25465,13925,146918,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,2481.0,1467,94683,587,4300,46165,5121,5378,33719,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,"TV Tokyo, Dentsu",Unknown,Toei Animation,Manga,23 min. per ep.,PG - Children,3710.0,4369,13224,18,642,7314,766,1108,3394,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


## Cleaning the Data and reformatting

In [6]:
# Need to drop all of the unnecessary columns from the DataFrame
anime_list_df.drop(['watching_status','watched_episodes'],axis=1,inplace=True)

In [7]:
# dropping all of the anime of ill repute
anime_df = anime_df[~anime_df['Genders'].str.contains("Hentai")]

# dropping all of the anime with air rimes less than 1 sec.
anime_df = anime_df[~anime_df['Duration'].str.contains("sec")]

# using the ID of the cleaned anime list to clean the list of ratings as well
anime_list_df = anime_list_df[anime_list_df['anime_id'].isin(list(anime_df['MAL_ID'].tolist()))]

In [8]:
anime_df.shape

(15780, 35)

In [16]:
# There is way too much data in this set so we will be resampling the data to 
# make a smaller data set.
# anime_list_df = anime_list_df.sample(5000)

In [None]:
# Loading in the DataFrame as a surprise objet
reader = Reader()
data = Dataset.load_from_df(anime_list_df,reader)

In [None]:
# Looking at how many users and items are in the data to determine whether we 
# use a user-user or item-item similarity method
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

In [None]:
# Looks like there are vastly more items than there are users here so if we are 
# using a neighborhood-based model we will evaluate with a item-item approach

In [None]:
# loaidng a spark object with the cleaned data for use in pyspark models
# rec_data = spark.createDataFrame(cleaned_df)

## First simple models with SVD, KNNBasic, and KNNBaseline


In [38]:
# Splitting the data into train/test sets we will be using 80% of the data to
# train the model and 20% for validating the model.
trainset,testset = train_test_split(data,test_size=0.2)

In [46]:
# Instantiating the models

#SVD
svd = SVD(n_factors=50,reg_all=0.05)

# KNNBasic
knn_basic = KNNBasic(sim_options = {'name':'pearson', 'user_based':False})

# KNNBaseline
knn_baseline = KNNBaseline(sim_options = {'name':'pearson', 'user_based':False})

###SVD


In [47]:
# Run cross validation on the data  
cv_svd = cross_validate(svd,data,n_jobs=-1)

In [48]:
# Printing out the test metrics of the cross validation for comparison
for i in cv_svd.items():
    print(i)
print('-----------------------')
print(np.mean(cv_svd['test_rmse']))

('test_rmse', array([3.84965735, 3.90452592, 3.91789306, 3.91218521, 3.84606599]))
('test_mae', array([3.61257461, 3.70539795, 3.70322773, 3.71069874, 3.63161982]))
('fit_time', (0.2707395553588867, 0.27812790870666504, 0.28876328468322754, 0.3061184883117676, 0.15135931968688965))
('test_time', (0.016380786895751953, 0.011266231536865234, 0.02576446533203125, 0.01018381118774414, 0.006639242172241211))
-----------------------
3.886065508780715


### KNNBasic

In [49]:
# Run cross validation on the data 
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [50]:
# Printing out the test metrics of the cross validation for comparison
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

('test_rmse', array([3.92929937, 3.83927337, 3.94022312, 3.91110419, 3.90637281]))
('test_mae', array([3.736465, 3.626864, 3.756245, 3.720824, 3.691199]))
('fit_time', (0.27439308166503906, 0.26259660720825195, 0.2595834732055664, 0.2257688045501709, 0.1587071418762207))
('test_time', (0.028040170669555664, 0.01390218734741211, 0.013776063919067383, 0.013631343841552734, 0.010076522827148438))
-----------------------
3.90525457391362


### KNNBaseline

In [51]:
# Run cross validation on the data  
cv_knn_baseline = cross_validate(knn_baseline, data, n_jobs=-1)

In [52]:
# Printing out the test metrics of the cross validation for comparison
for i in cv_knn_baseline.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_baseline['test_rmse']))

('test_rmse', array([3.94815372, 3.86469255, 3.84472542, 3.87792879, 3.90078508]))
('test_mae', array([3.74431268, 3.64038851, 3.60901149, 3.67560409, 3.70454819]))
('fit_time', (0.2838473320007324, 0.27666640281677246, 0.2607753276824951, 0.2805914878845215, 0.2003927230834961))
('test_time', (0.014613151550292969, 0.012702465057373047, 0.012905597686767578, 0.012049198150634766, 0.006788015365600586))
-----------------------
3.887257112156283


### Optimizing the best FSM (SVD)

In [56]:
# Setting up and running the Grid Search to find the best hyperparams for the 
# svd
params = {'n_factors': [20, 50, 100,500],
         'reg_all': [0.02, 0.05, 0.1,0.5]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(data)

In [58]:
# Printing out the best parameters based on the Grid Search
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 3.884515313788918, 'mae': 3.6709193978445294}
{'rmse': {'n_factors': 100, 'reg_all': 0.1}, 'mae': {'n_factors': 50, 'reg_all': 0.1}}


## Make Recommendations

We are using the anime.csv data set to select the anime names for our recommendations.

In [66]:
anime_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genders,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),28.0,39,1251960,61971,105808,718161,71513,26678,329800,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),159.0,518,273145,1174,4143,208333,1935,770,57964,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,266.0,201,558913,12944,29113,343492,25465,13925,146918,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,2481.0,1467,94683,587,4300,46165,5121,5378,33719,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,"TV Tokyo, Dentsu",Unknown,Toei Animation,Manga,23 min. per ep.,PG - Children,3710.0,4369,13224,18,642,7314,766,1108,3394,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [59]:
# Instantiating the best model for predictions
best_model = SVD(n_factors = 100, reg_all = 0.1)
best_model.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f6ab9b06650>

In [61]:
#Quick Prediction to see if the model is working
best_model.predict(2, 4)

Prediction(uid=2, iid=4, r_ui=None, est=4.2352, details={'was_impossible': False})

## Obtain the User Ratings

In [67]:
# Function rating the movies based on the best model

def movie_rater(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['Genders'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'user_id':userID,'anime_id':movie['MAL_ID'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list      

In [72]:
user_rating = movie_rater(anime_df,4,'Action')

       MAL_ID  ... Score-1
14174   37303  ...    14.0

[1 rows x 35 columns]
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
       MAL_ID  ... Score-1
17348   44078  ...     2.0

[1 rows x 35 columns]
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
      MAL_ID         Name Score  ... Score-3 Score-2 Score-1
1084    1189  Eden's Bowy  6.52  ...    46.0    23.0    28.0

[1 rows x 35 columns]
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
       MAL_ID         Name    Score  ...  Score-3  Score-2  Score-1
15622   39473  Möbius Dust  Unknown  ...  Unknown  Unknown  Unknown

[1 rows x 35 columns]
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      MAL_ID               Name Score  ... Score-3 Score-2 Score-1
8585   22043  Fairy Tail (2014)  7.73  ...  5595.0  3198.0  2912.0

[1 rows x 35 columns]
How do you rate this movie on a scale of 1-5, press n if you have not

In [73]:
user_rating


[{'anime_id': 44078, 'rating': '2', 'user_id': 1000},
 {'anime_id': 1189, 'rating': '3', 'user_id': 1000},
 {'anime_id': 39473, 'rating': '5', 'user_id': 1000},
 {'anime_id': 22043, 'rating': '5', 'user_id': 1000}]

In [81]:
## add the new ratings to the original ratings DataFrame
new_ratings_df = anime_list_df.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

In [82]:
# make predictions for the user
# you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
list_of_anime = []
for m_id in anime_list_df['anime_id'].unique():
    list_of_anime.append( (m_id,best_model.predict(1000,m_id)[3]))

In [83]:
# order the predictions from highest to lowest rated
ranked_anime = sorted(list_of_anime, key=lambda x:x[1], reverse=True)

In [88]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,anime_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = anime_title_df.loc[anime_title_df['MAL_ID'] == int(rec[0])]['Name']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break

recommended_movies(ranked_anime,anime_df,5)

Recommendation #  1 :  221    Great Teacher Onizuka
Name: Name, dtype: object 

Recommendation #  2 :  9921    Charlotte
Name: Name, dtype: object 

Recommendation #  3 :  1103    NHK ni Youkoso!
Name: Name, dtype: object 

Recommendation #  4 :  9183    High School DxD BorN
Name: Name, dtype: object 

Recommendation #  5 :  1393    Death Note
Name: Name, dtype: object 

