In [1]:
import numpy as np 
import pandas as pd 
import warnings
import scipy as sp 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the anime catalogue from "anime.csv" (path is relative to the
# notebook's working directory) and display it as the cell output.
anime = pd.read_csv("anime.csv")
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
# (number of rows, number of columns) of the loaded catalogue
anime.shape 

(12294, 7)

In [4]:
# Column labels available in the catalogue
anime.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')

In [5]:
# Peek at the genre column: each entry is a comma-separated list of genre labels
anime["genre"]

0                     Drama, Romance, School, Supernatural
1        Action, Adventure, Drama, Fantasy, Magic, Mili...
2        Action, Comedy, Historical, Parody, Samurai, S...
3                                         Sci-Fi, Thriller
4        Action, Comedy, Historical, Parody, Samurai, S...
                               ...                        
12289                                               Hentai
12290                                               Hentai
12291                                               Hentai
12292                                               Hentai
12293                                               Hentai
Name: genre, Length: 12294, dtype: object

In [6]:
# TF-IDF (term frequency - inverse document frequency) is a numerical statistic
# intended to reflect how important a word is to a document in a collection or corpus.
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Create a TF-IDF vectorizer that drops English stop words while tokenizing.
# NOTE(review): `genre` holds comma-separated labels (e.g. "Sci-Fi, Thriller"),
# not free text. The default word tokenizer presumably splits hyphenated and
# multi-word genres into separate terms - confirm whether a comma-based
# tokenizer would be more appropriate here.
tfidf = TfidfVectorizer(stop_words="english")    # use the vectorizer's built-in English stop-word list

In [8]:
# Replace missing values in the genre column with a blank (single-space)
# string so that every entry handed to the vectorizer is a str.
# A blank entry contributes no terms, so those rows get an all-zero vector.
# (A bare `anime["genre"].isnull().sum()` used to sit here; as a mid-cell
# expression its value was discarded and never displayed, so it was removed.)
anime["genre"] = anime["genre"].fillna(" ")


In [9]:
# Learn the genre vocabulary and IDF weights, then encode every row of the
# catalogue as a TF-IDF vector (one row per anime, one column per term).
tfidf_matrix = tfidf.fit_transform(anime["genre"])
tfidf_matrix.shape


(12294, 46)

##### With the above matrix we need to compute a similarity score: a numeric quantity that represents how similar two anime are. There are several metrics for this, such as Euclidean distance, the Pearson correlation and cosine similarity. For now we will use cosine similarity, because it is independent of vector magnitude and easy to calculate. Since `TfidfVectorizer` L2-normalises each row by default, the plain dot product of two rows (a linear kernel) is already their cosine similarity.

In [11]:
from sklearn.metrics.pairwise import linear_kernel
# Pairwise dot products between all rows of the TF-IDF matrix. With Y omitted,
# linear_kernel compares the matrix against itself. This is used as the cosine
# similarity (assumes the vectorizer's default L2 row normalisation is in effect).
# NOTE(review): the result is a dense n_rows x n_rows array - mind the memory use.
cosine_sim_matrix = linear_kernel(tfidf_matrix)

In [12]:
# Mapping from anime name to its row index in `anime`.
# Series.drop_duplicates() compares the *values* (the row numbers, which are
# all distinct), so it never removed repeated titles. De-duplicate on the
# name index instead, keeping the first occurrence, so that every lookup
# anime_index[name] returns a single scalar row number.
anime_index = pd.Series(anime.index, index=anime['name'])
anime_index = anime_index[~anime_index.index.duplicated(keep="first")]

In [13]:
# Sanity check: look up the row index stored for one title
anime_index["Hunter x Hunter (2011)"]

6

In [14]:
def get_anime_recommendations(Name,topN):
    """Print the anime whose genre vectors are most similar to ``Name``.

    Uses the notebook-level objects built in earlier cells: ``anime`` (the
    catalogue DataFrame), ``anime_index`` (title -> row index Series) and
    ``cosine_sim_matrix`` (pairwise similarity matrix).

    Parameters
    ----------
    Name : str
        Exact title as stored in ``anime["name"]``.
    topN : int
        Number of recommendations wanted. ``topN + 1`` rows are printed,
        leaving one extra slot because the query title matches itself with
        the maximum score and usually appears in the list.

    Returns
    -------
    None
        The result table (columns ``name`` and ``Score``, best match first)
        is printed rather than returned.

    Raises
    ------
    ValueError
        If ``topN`` is negative.
    KeyError
        If ``Name`` is not a known title.
    """
    if topN < 0:
        raise ValueError(f"topN must be non-negative, got {topN!r}")
    if Name not in anime_index.index:
        raise KeyError(f"Unknown anime title: {Name!r}")

    # Row index of the requested title. If the title occurs more than once in
    # the mapping the lookup yields a Series; use its first occurrence so the
    # similarity row below is always one-dimensional.
    anime_id = anime_index[Name]
    if isinstance(anime_id, pd.Series):
        anime_id = anime_id.iloc[0]

    # Similarity of the requested anime against every anime in the catalogue.
    similarity_row = np.asarray(cosine_sim_matrix[anime_id])

    # Highest score first. A stable sort on the negated scores keeps tied
    # entries in their original row order.
    ranked_idx = np.argsort(-similarity_row, kind="stable")[: topN + 1]

    # Similar anime and their scores, numbered from 0.
    anime_similar_show = pd.DataFrame(
        {
            "name": anime.loc[ranked_idx, "name"].to_numpy(),
            "Score": similarity_row[ranked_idx],
        }
    )
    print (anime_similar_show)


In [15]:
# Print the closest matches for one title; topN=5 yields 6 rows because the
# function slices topN + 1 entries (the query title itself is among them here).
get_anime_recommendations("Ginga Eiyuu Densetsu",topN=5)

                                         name     Score
0                        Ginga Eiyuu Densetsu  1.000000
1  Ginga Eiyuu Densetsu Gaiden: Rasen Meikyuu  1.000000
2                          Starship Operators  1.000000
3                 Uchuu Senkan Yamato (Movie)  1.000000
4                             Ginga Patrol PJ  1.000000
5                    Uchuu Senkan Yamato 2199  0.954633
