In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

# Read the data

In [2]:
# Load the anime metadata table from the local CSV file.
anime = pd.read_csv("anime.csv")
# Preview the first rows.
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
# keep only TV series and movies; print the shape before and after the filter
print(anime.shape)
anime = anime[anime['type'].isin(['TV', 'Movie'])]
print(anime.shape)

(12294, 7)
(6135, 7)


In [4]:
# keep only popular anime: member count at or above the 75th percentile
m = anime['members'].quantile(0.75)
anime = anime.loc[anime['members'].ge(m)]
anime.shape

(1534, 7)

In [5]:
# Load the per-user ratings table from the local CSV file.
rating = pd.read_csv("rating.csv")
# Preview the first rows.
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [6]:
# (rows, columns) of the raw ratings table
rating.shape

(7813737, 3)

# Replace missing ratings (encoded as -1) with NaN

In [7]:
# A rating of -1 marks a missing rating; turn those entries into NaN.
# Series.mask builds a new float column rather than writing NaN into the
# existing column in place (implicit upcasting on setitem is deprecated in
# recent pandas), and np.nan replaces the np.NaN alias removed in NumPy 2.0.
# The cell is also safe to re-run: a second pass finds no -1 left to replace.
rating['rating'] = rating['rating'].mask(rating['rating'] == -1, np.nan)
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


# Create a lookup from anime name to DataFrame index

In [8]:
# Map each anime title to its row label in `anime`, so rows can be looked up by name.
anime_index = pd.Series(data=anime.index, index=anime['name'])
anime_index.head()

name
Kimi no Na wa.                      0
Fullmetal Alchemist: Brotherhood    1
Gintama°                            2
Steins;Gate                         3
Gintama&#039;                       4
dtype: int64

# Join the data

In [9]:
# Inner join on anime_id: only ratings whose anime is present in `anime` are kept.
# Both tables have a `rating` column, so pandas suffixes them rating_x / rating_y.
joined = pd.merge(anime, rating, on='anime_id', how='inner')
joined.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating_x,members,user_id,rating_y
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,99,5.0
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10.0
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10.0
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10.0
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,278,


# Create a pivot table

In [10]:
# rating_y is the rating column that came from the right-hand table of the merge
joined = joined.loc[:, ['user_id', 'name', 'rating_y']]

# rows = anime title, columns = user id, cells = mean rating
# (mean is pivot_table's default aggregation)
pivot = joined.pivot_table(values='rating_y', index='name', columns='user_id')
pivot.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,73507,73508,73509,73510,73511,73512,73513,73514,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Movie,,,,,,,,,,,...,,,,,,,,,,
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,
.hack//Sign,,,,,,,,,,,...,,,,,,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# (number of anime titles, number of users) in the pivot table
pivot.shape

(1490, 69224)

# Drop all users who never rated any anime

In [12]:
# remove user columns that contain no rating at all
pivot = pivot.dropna(axis='columns', how='all')
pivot.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,73507,73508,73509,73510,73511,73512,73513,73514,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Movie,,,,,,,,,,,...,,,,,,,,,,
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,
.hack//Sign,,,,,,,,,,,...,,,,,,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# (number of anime titles, number of users) after dropping all-NaN user columns
pivot.shape

(1490, 69224)

# Center each anime's ratings around 0 (centered cosine / Pearson)

In [14]:
# Subtract each anime's (row's) mean rating from its ratings.
# DataFrame.mean skips NaN by default, matching np.nanmean, and the vectorized
# subtraction avoids calling a Python lambda once per row of a very wide frame.
pivot_norm = pivot.sub(pivot.mean(axis=1), axis=0)
pivot_norm.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,73507,73508,73509,73510,73511,73512,73513,73514,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Movie,,,,,,,,,,,...,,,,,,,,,,
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,
.hack//Sign,,,,,,,,,,,...,,,,,,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,,,,,,,,,,...,,,,,,,,,,


# Item Based Collaborative Filtering

In [15]:
# replace the remaining NaN entries with 0
pivot_norm = pivot_norm.fillna(0)
pivot_norm.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,73507,73508,73509,73510,73511,73512,73513,73514,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculate Similar Items

In [16]:
# Pairwise cosine similarity between anime rows, wrapped in a DataFrame that is
# labelled by anime title on both axes so entries can be looked up by name.
item_sim_df = pd.DataFrame(
    data=cosine_similarity(pivot_norm, pivot_norm),
    index=pivot_norm.index,
    columns=pivot_norm.index,
)
item_sim_df.head()

name,&quot;Bungaku Shoujo&quot; Movie,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,07-Ghost,11eyes,30-sai no Hoken Taiiku,91 Days,A-Channel,AKB0048,...,Zoku Natsume Yuujinchou,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,Zutto Mae kara Suki deshita.: Kokuhaku Jikkou Iinkai,ef: A Tale of Melodies.,ef: A Tale of Memories.,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Movie,1.0,0.032753,0.026011,0.036676,0.038012,0.046374,0.047874,0.009425,0.056224,0.0402,...,0.048061,0.036628,0.045698,0.031965,0.073987,0.079183,0.008384,0.035791,0.049389,0.047023
.hack//Roots,0.032753,1.0,0.289985,0.315053,0.074391,0.061119,0.027616,0.006698,0.027652,0.031912,...,0.043802,0.034236,0.066003,0.004396,0.038025,0.030098,0.049731,0.038284,0.040426,0.031665
.hack//Sign,0.026011,0.289985,1.0,0.269825,0.05885,0.050163,0.033374,0.014148,0.035317,0.031722,...,0.038653,0.03326,0.056693,0.005344,0.049085,0.050667,0.067435,0.055709,0.031301,0.03371
.hack//Tasogare no Udewa Densetsu,0.036676,0.315053,0.269825,1.0,0.047875,0.053741,0.027466,0.009595,0.04057,0.023247,...,0.038657,0.034183,0.063091,0.000434,0.04459,0.036658,0.072843,0.048902,0.04161,0.038522
07-Ghost,0.038012,0.074391,0.05885,0.047875,1.0,0.122489,0.04018,0.02106,0.051344,0.035019,...,0.094213,0.017888,0.129566,0.00798,0.042737,0.036837,0.041351,0.054222,0.066368,0.042567


In [40]:
def get_similar_anime(anime_name):
    """Rank every other anime by its similarity to ``anime_name``.

    Parameters
    ----------
    anime_name : str
        Title to look up; must be a row label of ``pivot_norm``
        (and therefore of ``item_sim_df``).

    Returns
    -------
    tuple
        ``(titles, scores)`` where ``titles`` is a pandas Index of all other
        anime ordered from most to least similar and ``scores`` is the list of
        matching similarity values. ``(None, None)`` if the title is unknown.
    """
    if anime_name not in pivot_norm.index:
        return None, None

    # Sort only the requested column, once, instead of sorting the whole
    # similarity matrix twice.
    # The anime itself is removed by label rather than by slicing off the first
    # row: positional slicing drops the wrong entry whenever the self-similarity
    # is not the strict maximum (an all-zero row has self-similarity 0, and
    # ties may be ordered arbitrarily).
    ranked = (
        item_sim_df[anime_name]
        .drop(labels=anime_name)
        .sort_values(ascending=False)
    )
    return ranked.index, ranked.tolist()

In [41]:
# show the ten anime most similar to Steins;Gate together with their scores
animes, score = get_similar_anime("Steins;Gate")
for title, similarity in zip(animes[:10], score[:10]):
    print("{} with similarity of {}".format(title, similarity))

Steins;Gate Movie: Fuka Ryouiki no Déjà vu with similarity of 0.24336781483926162
Psycho-Pass with similarity of 0.1841425084628398
Code Geass: Hangyaku no Lelouch R2 with similarity of 0.18246014048789982
Fate/Zero 2nd Season with similarity of 0.18113560310021662
Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai. with similarity of 0.17588179421245187
Fate/Zero with similarity of 0.17547220214978015
Fullmetal Alchemist: Brotherhood with similarity of 0.16823351950204468
Code Geass: Hangyaku no Lelouch with similarity of 0.1678560093146475
Angel Beats! with similarity of 0.16348349134494156
Nisemonogatari with similarity of 0.15594071809614196


## Helper Function

In [19]:
def predict_rating(user_id, anime_name, max_neighbor=10):
    """Predict the rating that ``user_id`` would give ``anime_name``.

    The prediction is the similarity-weighted average of the user's own raw
    ratings (from ``pivot``) over the ``max_neighbor`` most similar anime that
    the user has actually rated.

    Parameters
    ----------
    user_id : hashable
        Column label of ``pivot`` identifying the user.
    anime_name : str
        Title of the anime to predict a rating for.
    max_neighbor : int, default 10
        Maximum number of rated neighbours used in the weighted average.

    Returns
    -------
    float
        Predicted rating, or ``nan`` when the title is unknown, the user has
        rated none of the other anime, or the neighbour weights sum to zero.
    """
    animes, scores = get_similar_anime(anime_name)
    if animes is None:
        # unknown title: nothing to base a prediction on
        return np.nan

    anime_arr = np.asarray(animes)
    sim_arr = np.asarray(scores, dtype=float)

    # Decide "already rated" from the raw ratings. The centered matrix cannot be
    # used for this: a real rating equal to the anime's mean is centered to
    # exactly 0 and would be indistinguishable from a filled-in missing value.
    user_ratings = pivot[user_id].loc[anime_arr]
    rated = user_ratings.notna().to_numpy()

    # neighbours are already ordered by decreasing similarity
    neighbor_sims = sim_arr[rated][:max_neighbor]
    neighbor_ratings = user_ratings.to_numpy()[rated][:max_neighbor]

    weight_sum = neighbor_sims.sum()
    if neighbor_sims.size == 0 or weight_sum == 0:
        # no usable neighbours: make the undefined 0/0 case explicit
        return np.nan

    return np.dot(neighbor_sims, neighbor_ratings) / weight_sum

In [20]:
# predicted rating of user 3 for "Steins;Gate", using the default max_neighbor=10
predict_rating(3, "Steins;Gate")

9.0511507264121

In [21]:
# predicted rating of user 3 for "Cowboy Bebop", using the default max_neighbor=10
predict_rating(3, "Cowboy Bebop")

8.645364065411728

## Get Recommendation

In [22]:
def get_recommendation(user_id, n_anime=10):
    """Recommend the top ``n_anime`` anime for a user with item-based collaborative filtering.

    Parameters
    ----------
    user_id : hashable
        Column label of ``pivot`` identifying the user.
    n_anime : int, default 10
        Maximum number of anime to recommend.

    Returns
    -------
    pandas.DataFrame
        Rows of ``anime`` for the recommended titles, ordered from highest to
        lowest predicted rating. Fewer than ``n_anime`` rows are returned when
        fewer candidates have a computable prediction.
    """
    # Candidates are the anime the user has not rated. This is read from the raw
    # ratings: in the centered matrix a real rating equal to the anime's mean is
    # exactly 0 and would be mistaken for "unrated".
    user_ratings = pivot[user_id]
    candidates = user_ratings.index[user_ratings.isna()]

    # Predict only for the candidates, and build the result in one go instead of
    # growing an array with np.append (which copies the array on every call).
    predicted = pd.Series(
        [predict_rating(user_id, _anime) for _anime in candidates],
        index=candidates,
        dtype=float,
    )

    # An anime with no computable prediction (NaN) must not be recommended.
    top_names = predicted.dropna().sort_values(ascending=False).index[:n_anime]

    # translate the titles back into rows of the anime table
    return anime.loc[anime_index.loc[top_names]]

In [23]:
# recommendations for user 3, using the default n_anime=10
get_recommendation(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
32,5258,Hajime no Ippo: New Challenger,"Comedy, Drama, Shounen, Sports",TV,26,8.75,88995
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855
44,19647,Hajime no Ippo: Rising,"Comedy, Drama, Shounen, Sports",TV,25,8.68,66756
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
108,7655,Major S6,"Comedy, Drama, Sports",TV,25,8.49,24788
12,918,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,201,9.04,336376
20,263,Hajime no Ippo,"Comedy, Drama, Shounen, Sports",TV,75,8.83,157670
39,12365,Bakuman. 3rd Season,"Comedy, Drama, Romance, Shounen",TV,25,8.71,133620
405,2159,Ookiku Furikabutte,"Comedy, Sports",TV,25,8.11,49599
139,627,Major S1,"Comedy, Drama, Shounen, Sports",TV,26,8.42,41061


In [24]:
# recommendations for user 5, using the default n_anime=10
get_recommendation(5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
65,7472,Gintama Movie: Shinyaku Benizakura-hen,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,8.59,51185
228,10271,Gyakkyou Burai Kaiji: Hakairoku-hen,"Game, Psychological, Seinen, Thriller",TV,26,8.29,55566
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351
307,1430,Lupin III: Cagliostro no Shiro,"Adventure, Comedy, Shounen",Movie,1,8.2,32732
511,658,Touhai Densetsu Akagi: Yami ni Maiorita Tensai,"Game, Psychological, Seinen, Thriller",TV,26,8.02,48127
17,24701,Mushishi Zoku Shou 2nd Season,"Adventure, Fantasy, Historical, Mystery, Seine...",TV,10,8.88,75894
496,416,Kurenai no Buta,"Action, Adventure, Comedy, Drama, Historical, ...",Movie,1,8.03,74028
109,2246,Mononoke,"Demons, Fantasy, Historical, Horror, Mystery, ...",TV,12,8.49,88850
170,513,Tenkuu no Shiro Laputa,"Adventure, Fantasy, Romance, Sci-Fi",Movie,1,8.38,151061


Compared to user-based collaborative filtering, the recommendations given here may be very different. As we know, in the user-based approach many users have not rated a given anime. The item-based approach, however, is in my view more robust, because there is no anime that has never been rated by any user.