In [None]:
import os
import numpy as np
import pandas as pd 
import warnings
import scipy as sp 

#ML model
from sklearn.metrics.pairwise import cosine_similarity

# Display settings: the pandas defaults are used unchanged.
# To show every column of wide frames, call
# pd.set_option("display.max_columns", None) here.

# Handle warnings: silence every warning for the rest of the notebook.
# A single "ignore" filter is enough; filters added later take precedence
# over earlier ones, so an "always" filter registered before it has no effect.
# NOTE(review): this also hides pandas warnings raised by later cells;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

Initial datasets were downloaded from [https://www.kaggle.com/hernan4444/anime-recommendation-database-2020](https://www.kaggle.com/hernan4444/anime-recommendation-database-2020)

Here, ``rating_df`` is loaded from the ``rating_complete.csv`` dataset, and ``anime_df`` from the ``anime.csv`` dataset.

In [44]:
# Load the two zipped CSV exports into DataFrames:
# anime metadata first, then the per-user ratings.
anime_df = pd.read_csv("anime.csv.zip")

rating_df = pd.read_csv("rating_complete.csv.zip")

As seen by the shapes of the datasets, there is a lot of data to be processed:

In [45]:
# Report the (rows, columns) shape of each raw frame, ratings first
for frame in (rating_df, anime_df):
    print(frame.shape)

(57633278, 3)


However, after removing every MyAnimeList (MAL) entry that isn't a TV series, and every rating that doesn't refer to a title remaining in the anime dataset, considerably less data is left.

In [47]:
# Keep only the rows whose Type is 'TV' (anime series, not movies etc.)
is_tv = anime_df['Type'] == 'TV'
anime_df = anime_df.loc[is_tv]
print(anime_df.shape)

(4996, 35)


In [48]:
# Keep only the ratings whose anime_id appears among anime_df's MAL_ID values
known_ids = anime_df['MAL_ID']
has_known_anime = rating_df['anime_id'].isin(known_ids)
rating_df = rating_df.loc[has_known_anime]
print(rating_df.shape)

(38541711, 3)


Some preprocessing had to be done to make the datasets even smaller - merging the full datasets would take a very long time or exhaust the available memory.

In [49]:
# Before merging, project each frame down to the columns we need so the
# join carries as little data as possible.
anime1 = anime_df[['MAL_ID', 'Name', 'Score']]

rating1 = rating_df[['user_id', 'anime_id', 'rating']]

# Merge the two trimmed frames on the anime identifier (inner join, the
# pandas default). The trimmed rating1 is used here, not the untrimmed
# rating_df, so the projection above actually takes effect.
merged1 = rating1.merge(anime1, left_on='anime_id', right_on='MAL_ID', suffixes=['_user', ''])

# Keep only user id, title and score.
# NOTE(review): 'Score' is the column that comes from anime_df, whereas the
# per-user value from rating_df is 'rating'. Confirm that 'Score' is the
# value the similarity should be built on.
rated_anime = merged1[['user_id', 'Name', 'Score']]


Index(['user_id', 'anime_id', 'rating', 'MAL_ID', 'Name', 'Score'], dtype='object')


In [51]:
# A quick sample to check that everything is looking good.
# A fixed random_state makes the displayed rows the same on every re-run.
rated_anime.sample(20, random_state=42)

Unnamed: 0,user_id,Name,Score
25777818,163200,Seikon no Qwaser II,6.39
5302259,103945,Shingeki no Kyojin Season 2,8.45
14420142,169411,Kore wa Zombie Desu ka?,7.43
1073832,171283,Made in Abyss,8.74
30653463,147849,Shinchou Yuusha: Kono Yuusha ga Ore Tueee Kuse...,7.54
6069564,2931,Sankarea,7.36
11295749,226869,D-Frag!,7.58
21838099,193859,Free!: Eternal Summer,7.68
8170009,248903,Working!!,7.7
21098176,32656,Majimoji Rurumo,6.85


In [52]:
# Keep only rows whose Score parses as a number - some are listed as "Unknown".
# errors='coerce' turns unparseable values into NaN, which the mask then excludes.
score_as_number = pd.to_numeric(rated_anime['Score'], errors='coerce')
rated_anime = rated_anime[score_as_number.notna()]

In [53]:
# Converting the Score column to float from object.
# .assign returns a new frame with the converted column instead of writing
# the column into the existing rated_anime object, which may itself be a
# filtered selection of another frame (the situation pandas reports as
# SettingWithCopyWarning).
rated_anime = rated_anime.assign(Score=rated_anime['Score'].astype(float))
rated_anime.dtypes

user_id      int64
Name        object
Score      float64
dtype: object

In [54]:
# Build the table used for the similarity calculations:
# one row per user_id, one column per Name, cells holding Score
pivot = rated_anime.pivot_table(
    values='Score',
    index='user_id',
    columns='Name',
)
pivot.head()

Name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,100% Pascal-sensei (TV),100-man no Inochi no Ue ni Ore wa Tatteiru,11eyes,12-sai.: Chicchana Mune no Tokimeki,12-sai.: Chicchana Mune no Tokimeki 2nd Season,...,ef: A Tale of Memories.,gdMen,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,number24,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,ēlDLIVE
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,8.01,8.25,


In [55]:
# Normalise each row: subtract the row's mean and divide by the row's
# value range (max - min). The pandas reductions skip NaN cells by default,
# so only the cells that hold a value contribute.
row_mean = pivot.mean(axis=1)
row_range = pivot.max(axis=1) - pivot.min(axis=1)

# A row with a single value (or identical values) has a zero range. Mask it
# to NaN so the division yields NaN (turned into 0 below) rather than +/-inf
# when floating-point round-off leaves a tiny non-zero numerator.
row_range = row_range.where(row_range != 0)

# Vectorised over the whole frame; axis=0 aligns the per-row statistics
# with the frame's index.
pivot_n = pivot.sub(row_mean, axis=0).div(row_range, axis=0)

# convert NaN values to 0
pivot_n = pivot_n.fillna(0)

# transpose pivot and drop columns with only 0 values (no rating)
pivot_n = pivot_n.T
pivot_n = pivot_n.loc[:, (pivot_n != 0).any(axis=0)]

# convert to sparse matrix so we can do similarity computation
piv_sparse = sp.sparse.csr_matrix(pivot_n.values)

In [56]:
# Pairwise cosine similarity between the rows of the sparse matrix
anime_similarity = cosine_similarity(piv_sparse)

# Wrap the result in a DataFrame labelled on both axes by pivot_n's index
ani_sim_df = pd.DataFrame(
    data=anime_similarity,
    index=pivot_n.index,
    columns=pivot_n.index,
)

At this point, the data has been cleaned and reduced to a table of pairwise cosine similarity values that allows us to make recommendations. From here, we can export the DataFrame to a CSV file, compressed using gzip, for use in our API.

In [None]:
# Write the similarity table to a gzip-compressed CSV file
ani_sim_df.to_csv(path_or_buf='anime_similarity.csv.gz', compression='gzip')