<a href="https://colab.research.google.com/github/JCherryA050/phase_4_project/blob/main/surprise_recommendations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Adnime Recommendations with Surprise

## Load in the necessary packages and get a quick look at the data

In [1]:
# Surprise is not downloaded in Google Colab instance so we have to download the packages first
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 248kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617593 sha256=a7f28f387a66c8358a658f23f9ad42c4da85f5266f34fad5b5d673769771c2c3
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [1]:
# importing all of the relevant packages for the project
# Standard Packages
import pandas as pd
import numpy as np

# Surprise Packages
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline, SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
import random
random.seed(1)

In [31]:
# Load in the data as Pandas DataFrame
anime_df = pd.read_csv('anime.csv')
anime_list_df = pd.read_csv('animelist.csv')

In [32]:
# Looking at the different features in the dataset, the set will fit into a 
# surprise model well as it has the relavent columns
anime_list_df.head()

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,0,67,9.0,1.0,1.0
1,0,6702,7.0,1.0,4.0
2,0,242,10.0,1.0,4.0
3,0,4898,0.0,1.0,1.0
4,0,21,10.0,1.0,0.0


In [5]:
# Looking at the different features of the anime, there seems to be some anime 
# with episode less than a minute and also anime of ill repute that we will
# not include in the scope of the project.
anime_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genders,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),28.0,39,1251960,61971,105808,718161,71513,26678,329800,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),159.0,518,273145,1174,4143,208333,1935,770,57964,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,266.0,201,558913,12944,29113,343492,25465,13925,146918,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,2481.0,1467,94683,587,4300,46165,5121,5378,33719,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,"TV Tokyo, Dentsu",Unknown,Toei Animation,Manga,23 min. per ep.,PG - Children,3710.0,4369,13224,18,642,7314,766,1108,3394,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


## Cleaning the Data and reformatting

In [33]:
# Need to drop all of the unnecessary columns from the DataFrame
anime_list_df.drop(['watching_status','watched_episodes'],axis=1,inplace=True)

In [34]:
# dropping all of the anime of ill repute
anime_df = anime_df[~anime_df['Genders'].str.contains("Hentai")]

# dropping all of the anime with air rimes less than 1 sec.
anime_df = anime_df[~anime_df['Duration'].str.contains("sec")]
anime_df = anime_df[~anime_df['Type'].str.contains('|'.join(['OVA', 'Special', 'ONA', 'Unknown']))]

# using the ID of the cleaned anime list to clean the list of ratings as well
anime_list_df = anime_list_df[anime_list_df['anime_id'].isin(list(anime_df['MAL_ID'].tolist()))]

In [35]:
anime_list_df.shape

(47205472, 3)

In [None]:
# There is way too much data in this set so we will be resampling the data to 
# make a smaller data set.
# anime_list_df = anime_list_df.sample(5000)

In [None]:
# Loading in the DataFrame as a surprise objet
reader = Reader()
data = Dataset.load_from_df(anime_list_df,reader)

In [None]:
# Looking at how many users and items are in the data to determine whether we 
# use a user-user or item-item similarity method
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

In [None]:
# Looks like there are vastly more items than there are users here so if we are 
# using a neighborhood-based model we will evaluate with a item-item approach

In [None]:
# loaidng a spark object with the cleaned data for use in pyspark models
# rec_data = spark.createDataFrame(cleaned_df)

## First simple models with SVD, KNNBasic, and KNNBaseline


In [None]:
# Splitting the data into train/test sets we will be using 80% of the data to
# train the model and 20% for validating the model.
trainset,testset = train_test_split(data,test_size=0.2)

In [11]:
# Instantiating the models

#SVD
svd = SVD(n_factors=50,reg_all=0.05)

# KNNBasic
knn_basic = KNNBasic(sim_options = {'name':'pearson', 'user_based':False})

# KNNBaseline
knn_baseline = KNNBaseline(sim_options = {'name':'pearson', 'user_based':False})

### SVD


In [12]:
# Run cross validation on the data
cv_svd = cross_validate(svd,data,n_jobs=-1)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-636acfb22f03>", line 2, in <module>
    cv_svd = cross_validate(svd,data,n_jobs=-1)
  File "/usr/local/lib/python3.7/dist-packages/surprise/model_selection/validation.py", line 103, in cross_validate
    out = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch)(delayed_list)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1054, in __call__
    self.retrieve()
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 933, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/usr/local/lib/python3.7/dist-packages/joblib/_parallel_backends.py", line 542, in wrap_future_result
    return future.result(timeout=timeout)
  File "/usr/lib/python3.7/concurrent/futures/_base.py", line 430, in result
    self._condition.

KeyboardInterrupt: ignored

In [None]:
# Printing out the test metrics of the cross validation for comparison
for i in cv_svd.items():
    print(i)
print('-----------------------')
print(np.mean(cv_svd['test_rmse']))

### KNNBasic

In [None]:
# Run cross validation on the data 
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [None]:
# Printing out the test metrics of the cross validation for comparison
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

### KNNBaseline

In [None]:
# Run cross validation on the data  
cv_knn_baseline = cross_validate(knn_baseline, data, n_jobs=-1)

In [None]:
# Printing out the test metrics of the cross validation for comparison
for i in cv_knn_baseline.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_baseline['test_rmse']))

### Optimizing the best FSM (SVD)

In [None]:
# Setting up and running the Grid Search to find the best hyperparams for the 
# svd
params = {'n_factors': [20, 50, 100,500],
         'reg_all': [0.02, 0.05, 0.1,0.5]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(data)

In [None]:
# Printing out the best parameters based on the Grid Search
print(g_s_svd.best_score)
print(g_s_svd.best_params)

## Make Recommendations

We are using the anime.csv data set to select the anime names for our recommendations.

In [13]:
anime_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genders,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),28.0,39,1251960,61971,105808,718161,71513,26678,329800,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),159.0,518,273145,1174,4143,208333,1935,770,57964,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,266.0,201,558913,12944,29113,343492,25465,13925,146918,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,2481.0,1467,94683,587,4300,46165,5121,5378,33719,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,"TV Tokyo, Dentsu",Unknown,Toei Animation,Manga,23 min. per ep.,PG - Children,3710.0,4369,13224,18,642,7314,766,1108,3394,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [21]:
# Instantiating the best model for predictions
best_model = knn_basic #SVD(n_factors = 100, reg_all = 0.1)
best_model.fit(dataset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f5856ac6a10>

In [15]:
#Quick Prediction to see if the model is working
best_model.predict(2, 4)

Prediction(uid=2, iid=4, r_ui=None, est=1.7231755941676012, details={'was_impossible': False})

## Obtain the User Ratings

In [None]:
# Function rating the movies based on the best model

def movie_rater(movie_df,num, genre=None):
    userID = 1000000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['Genders'].str.contains(genre)].sample(1)['Name']
        else:
            movie = movie_df.sample(1)['Name']
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'user_id':userID,'anime_id':movie['MAL_ID'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list      

In [None]:
user_rating = movie_rater(anime_df,4,'Action')

       MAL_ID  ... Score-1
14174   37303  ...    14.0

[1 rows x 35 columns]
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
       MAL_ID  ... Score-1
17348   44078  ...     2.0

[1 rows x 35 columns]
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
      MAL_ID         Name Score  ... Score-3 Score-2 Score-1
1084    1189  Eden's Bowy  6.52  ...    46.0    23.0    28.0

[1 rows x 35 columns]
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
       MAL_ID         Name    Score  ...  Score-3  Score-2  Score-1
15622   39473  Möbius Dust  Unknown  ...  Unknown  Unknown  Unknown

[1 rows x 35 columns]
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      MAL_ID               Name Score  ... Score-3 Score-2 Score-1
8585   22043  Fairy Tail (2014)  7.73  ...  5595.0  3198.0  2912.0

[1 rows x 35 columns]
How do you rate this movie on a scale of 1-5, press n if you have not

In [None]:
(user_id,1,7), # Cowboy Bepop
                  (user_id,2,7), # Trigun
                  (user_id,30,10), # Neon Genesis Evangelion
                  (user_id,32937,10), # KonoSuba 2
                  (user_id,22199,5), # Akame ga Kill!
                  (user_id,18679,8),# Kill la Kill
                  (user_id, 227, 10)] # FLCL 

In [16]:
user_rating = [{'anime_id': 2, 'rating': '7', 'user_id': 1000000},
               {'anime_id': 30, 'rating': '10', 'user_id': 1000000},
               {'anime_id': 32937, 'rating': '10', 'user_id': 1000000},
               {'anime_id': 22199, 'rating': '5', 'user_id': 1000000},
               {'anime_id': 18679, 'rating': '8', 'user_id': 1000000},
               {'anime_id': 227, 'rating': '10', 'user_id': 1000000}]


In [22]:
## add the new ratings to the original ratings DataFrame
new_ratings_df = anime_list_df.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

## Make Predictions

In [23]:
# make predictions for the user
# you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
list_of_anime = []
for m_id in anime_list_df['anime_id'].unique():
    list_of_anime.append( (m_id,best_model.predict(1000,m_id)[3]))

In [24]:
# order the predictions from highest to lowest rated
ranked_anime = sorted(list_of_anime, key=lambda x:x[1], reverse=True)

In [25]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,anime_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = anime_title_df.loc[anime_title_df['MAL_ID'] == int(rec[0])]['Name']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break

recommended_movies(ranked_anime,anime_df,10)

Recommendation #  1 :  3018    Jungle no Ouja Taa-chan
Name: Name, dtype: object 

Recommendation #  2 :  2641    Anime Oyako Gekijou
Name: Name, dtype: object 

Recommendation #  3 :  2362    Don Chuck Monogatari
Name: Name, dtype: object 

Recommendation #  4 :  15748    Sayonara Freeway
Name: Name, dtype: object 

Recommendation #  5 :  14677    Crescendo Story
Name: Name, dtype: object 

Recommendation #  6 :  15143    Ok? Alright!
Name: Name, dtype: object 

Recommendation #  7 :  14515    Very! Merry!! Session!!!
Name: Name, dtype: object 

Recommendation #  8 :  10618    Walking Man
Name: Name, dtype: object 

Recommendation #  9 :  3293    Shirayuki Hime no Densetsu
Name: Name, dtype: object 

Recommendation #  10 :  9920    Hand Soap
Name: Name, dtype: object 

