In [43]:
import os #paths to file
import numpy as np # linear algebra
import pandas as pd # data processing
import warnings# warning filter
import scipy as sp #pivot egineering


#ML model
from sklearn.metrics.pairwise import cosine_similarity


#default theme and settings
pd.options.display.max_columns

#handle warnings
warnings.filterwarnings("always")
warnings.filterwarnings("ignore")

In [44]:
# reading into dataframes
# Both files are given as relative paths to .zip archives; pandas is left to
# infer the compression from the file extension.

# rating_complete.csv.zip -> rating_df
rating_df = pd.read_csv("rating_complete.csv.zip")

# anime.csv.zip -> anime_df
anime_df = pd.read_csv("anime.csv.zip")

In [45]:
# Print the shape first and leave the one-row preview as the cell's last
# expression: only the last expression of a cell is rendered, so a bare
# sample() placed before print() is computed and then thrown away.
print(rating_df.shape)
rating_df.sample(1)

(57633278, 3)


In [46]:
# Print the shape first and leave the one-row preview as the cell's last
# expression: only the last expression of a cell is rendered, so a bare
# sample() placed before print() is computed and then thrown away.
print(anime_df.shape)
anime_df.sample(1)


(17562, 35)


In [47]:
# keep only the rows whose 'Type' column is exactly 'TV'
anime_df = anime_df[anime_df['Type'].eq('TV')]
print(anime_df.shape)

(4996, 35)


In [48]:
# keep only the ratings whose anime_id occurs in the MAL_ID column of anime_df
rating_df = rating_df[rating_df['anime_id'].isin(anime_df['MAL_ID'])]
print(rating_df.shape)

(38541711, 3)


In [49]:
# merging the dataframes
# (anime_df was already restricted to Type == 'TV' in an earlier cell)

# project both frames onto the columns the merge needs
anime1 = anime_df[['MAL_ID', 'Name', 'Score']]
rating1 = rating_df[['user_id', 'anime_id', 'rating']]

# Inner join of every rating with the title it refers to. The join keys have
# different names in the two frames, so both key columns appear in the result.
# The projected rating1 frame is what gets merged, so the projection above is
# actually used.
merged1 = rating1.merge(
    anime1,
    how='inner',
    left_on='anime_id',
    right_on='MAL_ID',
    suffixes=('_user', ''),
)
print(merged1.columns)


Index(['user_id', 'anime_id', 'rating', 'MAL_ID', 'Name', 'Score'], dtype='object')


In [50]:
# Keep the user id, the title name and the 'Score' column for the pivot table.
# NOTE(review): 'Score' is the column that entered the merge from anime_df,
# whereas 'rating' is the per-user value from rating_df. Confirm that the
# title-level Score, rather than each user's own rating, is the intended signal.
rated_anime = merged1.loc[:, ['user_id', 'Name', 'Score']]


In [51]:
# random 20-row preview of the selected columns (unseeded, so rows differ per run)
rated_anime.sample(n=20)

Unnamed: 0,user_id,Name,Score
25777818,163200,Seikon no Qwaser II,6.39
5302259,103945,Shingeki no Kyojin Season 2,8.45
14420142,169411,Kore wa Zombie Desu ka?,7.43
1073832,171283,Made in Abyss,8.74
30653463,147849,Shinchou Yuusha: Kono Yuusha ga Ore Tueee Kuse...,7.54
6069564,2931,Sankarea,7.36
11295749,226869,D-Frag!,7.58
21838099,193859,Free!: Eternal Summer,7.68
8170009,248903,Working!!,7.7
21098176,32656,Majimoji Rurumo,6.85


In [52]:
# drop every row whose 'Score' cannot be parsed as a number
rated_anime = rated_anime.loc[pd.to_numeric(rated_anime['Score'], errors='coerce').notna()]

In [53]:
# Cast 'Score' to float. assign() returns a new frame with the converted
# column instead of writing a column into a frame that was produced by
# filtering, which is the pattern that triggers SettingWithCopyWarning.
rated_anime = rated_anime.assign(Score=rated_anime['Score'].astype(float))
rated_anime.dtypes

user_id      int64
Name        object
Score      float64
dtype: object

In [54]:
# user x title table of 'Score' values for the similarity calculations;
# combinations with no row are NaN, repeated combinations are averaged
pivot = rated_anime.pivot_table(
    values='Score',
    index='user_id',
    columns='Name',
    aggfunc='mean',
)
pivot.head()

Name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,100% Pascal-sensei (TV),100-man no Inochi no Ue ni Ore wa Tatteiru,11eyes,12-sai.: Chicchana Mune no Tokimeki,12-sai.: Chicchana Mune no Tokimeki 2nd Season,...,ef: A Tale of Memories.,gdMen,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,number24,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,ēlDLIVE
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,8.01,8.25,


In [55]:
# normalise values
# Each user's row is centred on its mean and scaled by its range (max - min),
# with NaN ignored. Frame-level arithmetic replaces a per-row Python lambda,
# which is very slow on a table with one row per user. A zero range (a single
# value, or all values equal) is mapped to NaN before dividing so that such
# rows become 0 in the next step instead of risking +/-inf from rounding noise.
pivot_n = pivot.sub(pivot.mean(axis=1), axis=0).div(
    (pivot.max(axis=1) - pivot.min(axis=1)).replace(0, np.nan), axis=0
)

# convert NaN values to 0
pivot_n = pivot_n.fillna(0)

# transpose so titles are the rows, then drop the columns (users) that are all 0
pivot_n = pivot_n.T
pivot_n = pivot_n.loc[:, (pivot_n != 0).any(axis=0)]

# convert to sparse matrix so we can do similarity computation
piv_sparse = sp.sparse.csr_matrix(pivot_n.values)

In [56]:
# pairwise cosine similarity between the rows of the sparse matrix (dense result)
anime_similarity = cosine_similarity(piv_sparse, dense_output=True)

# wrap the square matrix in a frame labelled by pivot_n's index on both axes
ani_sim_df = pd.DataFrame(
    data=anime_similarity,
    index=pivot_n.index,
    columns=pivot_n.index,
)


In [None]:
# recommendation function
def anime_recommendation(ani_name, top_n=5, sim_df=None):
    """Print the titles most similar to ``ani_name``.

    Parameters
    ----------
    ani_name : str
        Title to look up; it must be a column label of the similarity frame.
    top_n : int, default 5
        Number of recommendations to print.
    sim_df : pandas.DataFrame, optional
        Square similarity frame labelled by title on both axes. When omitted,
        the notebook-level ``ani_sim_df`` is used.

    Returns
    -------
    None
        One line is printed per recommendation. If ``ani_name`` is not a
        column of the frame, only "This anime cannot be found." is printed.
    """
    if sim_df is None:
        sim_df = ani_sim_df

    # Look the title up before printing anything, so an unknown title does not
    # produce a "Recommended because you watched ..." header.
    try:
        scores = sim_df[ani_name]
    except KeyError:
        print("This anime cannot be found.")
        return

    # Remove the queried title by label. Skipping "position 0" after sorting is
    # only correct when nothing else ties with the title's self-similarity.
    ranked = (
        scores.drop(labels=ani_name, errors='ignore')
        .sort_values(ascending=False)
        .head(top_n)
    )

    print('Recommended because you watched {}:\n'.format(ani_name))
    for number, (anime, score) in enumerate(ranked.items(), start=1):
        print(f'#{number}: {anime}, {round(score*100,2)}% match')

In [None]:
# example query; the function prints its recommendations and returns nothing
anime_recommendation('Hunter x Hunter')

Recommended because you watched Hunter x Hunter:

#1: Hajime no Ippo, 29.62% match
#2: Fullmetal Alchemist, 29.05% match
#3: Slam Dunk, 28.86% match
#4: Yuu☆Yuu☆Hakusho, 28.53% match
#5: Rurouni Kenshin: Meiji Kenkaku Romantan, 28.34% match


In [59]:
# persist the similarity frame as a gzip-compressed CSV; the index is written
# out as well, so it comes back as an ordinary column when the file is re-read
ani_sim_df.to_csv('anime_similarity.csv.gz', compression="gzip")

In [60]:
# reload the saved matrix; no index_col is given, so the titles are read back
# as a regular 'Name' column and the frame gets a default integer index
csv1 = pd.read_csv("anime_similarity.csv.gz")
csv1.head()

Unnamed: 0,Name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,100% Pascal-sensei (TV),100-man no Inochi no Ue ni Ore wa Tatteiru,11eyes,12-sai.: Chicchana Mune no Tokimeki,...,ef: A Tale of Memories.,gdMen,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,number24,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,ēlDLIVE
0,.hack//Roots,1.0,0.446023,0.472166,0.070928,0.091268,0.011659,0.024909,0.108471,0.015822,...,-0.120432,0.005148,0.009952,0.009207,0.057498,0.007536,0.083601,-0.165894,-0.142711,0.030586
1,.hack//Sign,0.446023,1.0,0.449841,0.066707,0.103156,0.010364,0.029923,0.109738,0.015573,...,-0.139168,0.007509,0.015411,0.013301,0.058979,0.00776,0.151773,-0.179726,-0.150499,0.028322
2,.hack//Tasogare no Udewa Densetsu,0.472166,0.449841,1.0,0.082788,0.073733,0.01592,0.024476,0.104293,0.020198,...,-0.130374,0.011896,0.011295,0.009816,0.067943,0.009126,0.089539,-0.176686,-0.141359,0.028464
3,009-1,0.070928,0.066707,0.082788,1.0,0.030941,0.041866,0.016942,0.072477,0.029891,...,-0.063085,0.012404,0.015194,0.010559,0.044229,0.00684,0.011065,-0.104258,-0.080213,0.03075
4,07-Ghost,0.091268,0.103156,0.073733,0.030941,1.0,-0.00085,0.023105,0.157014,0.020044,...,-0.084634,-0.001278,0.002444,0.002612,0.017835,0.024538,0.056652,-0.140468,-0.149337,0.027444


In [62]:
# Restore the titles as the row index. The guard keeps the cell re-runnable:
# once 'Name' has been moved into the index it is no longer a column, and a
# second set_index('Name') would raise KeyError.
if 'Name' in csv1.columns:
    csv1 = csv1.set_index('Name')
csv1.head()


Unnamed: 0_level_0,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,100% Pascal-sensei (TV),100-man no Inochi no Ue ni Ore wa Tatteiru,11eyes,12-sai.: Chicchana Mune no Tokimeki,12-sai.: Chicchana Mune no Tokimeki 2nd Season,...,ef: A Tale of Memories.,gdMen,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,number24,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,ēlDLIVE
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,1.0,0.446023,0.472166,0.070928,0.091268,0.011659,0.024909,0.108471,0.015822,0.013099,...,-0.120432,0.005148,0.009952,0.009207,0.057498,0.007536,0.083601,-0.165894,-0.142711,0.030586
.hack//Sign,0.446023,1.0,0.449841,0.066707,0.103156,0.010364,0.029923,0.109738,0.015573,0.012479,...,-0.139168,0.007509,0.015411,0.013301,0.058979,0.00776,0.151773,-0.179726,-0.150499,0.028322
.hack//Tasogare no Udewa Densetsu,0.472166,0.449841,1.0,0.082788,0.073733,0.01592,0.024476,0.104293,0.020198,0.016561,...,-0.130374,0.011896,0.011295,0.009816,0.067943,0.009126,0.089539,-0.176686,-0.141359,0.028464
009-1,0.070928,0.066707,0.082788,1.0,0.030941,0.041866,0.016942,0.072477,0.029891,0.028936,...,-0.063085,0.012404,0.015194,0.010559,0.044229,0.00684,0.011065,-0.104258,-0.080213,0.03075
07-Ghost,0.091268,0.103156,0.073733,0.030941,1.0,-0.00085,0.023105,0.157014,0.020044,0.011805,...,-0.084634,-0.001278,0.002444,0.002612,0.017835,0.024538,0.056652,-0.140468,-0.149337,0.027444


In [66]:
animename = 'Vinland Saga'

# Similarities to the chosen title, read from its column of the reloaded
# matrix. The title itself is removed by label instead of assuming it sorts
# into position 0, which breaks when another title ties with it.
top_matches = (
    csv1[animename]
    .drop(labels=animename)
    .sort_values(ascending=False)
    .head(5)
)
for anime, score in top_matches.items():
    print(f'{anime}, {round(score*100,2)}% match')

Dr. Stone, 53.57% match
Kimetsu no Yaiba, 53.37% match
Shingeki no Kyojin Season 3 Part 2, 52.16% match
Shingeki no Kyojin Season 3, 50.86% match
Yakusoku no Neverland, 50.71% match
