# Anime Recommendations Project

# Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import warnings# warning filter
import scipy as sp #pivot egineering


#ML model
from sklearn.metrics.pairwise import cosine_similarity


#default theme and settings
pd.options.display.max_columns

#warning hadle
warnings.filterwarnings("always")
warnings.filterwarnings("ignore")

# Preprocessing and Data Analysis

In [2]:
rating_path = "./Data Set/rating.csv"
anime_path = "./Data Set/anime.csv"

In [3]:
rating_df = pd.read_csv(rating_path)
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [4]:
anime_df = pd.read_csv(anime_path)
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
print(f"anime set (row, col): {anime_df.shape}\n\nrating set (row, col): {rating_df.shape}")

anime set (row, col): (12294, 7)

rating set (row, col): (7813737, 3)


In [6]:
print("Anime:\n")
print(anime_df.info())

Anime:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


In [7]:
print("Rating:\n")
print(rating_df.info())

Rating:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB
None


# Handling Missing Values

In [8]:
print("Anime missing values (%):\n")
print(round(anime_df.isnull().sum().sort_values(ascending=False)/len(anime_df.index),4)*100) 
# Anime dataset is missing 1.87% of ratings

Anime missing values (%):

rating      1.87
genre       0.50
type        0.20
anime_id    0.00
name        0.00
episodes    0.00
members     0.00
dtype: float64


In [9]:
print("Rating missing values (%):\n")
print(round(rating_df.isnull().sum().sort_values(ascending=False)/len(rating_df.index),4)*100)
# Rating dataset does not have any missing values

Rating missing values (%):

user_id     0.0
anime_id    0.0
rating      0.0
dtype: float64


In [10]:
print(anime_df['type'].mode())
print(anime_df['genre'].mode())
# The mode of "genre" column in anime database is "TV"
# The mode of "type" column in anime database is "Hentai"

0    TV
Name: type, dtype: object
0    Hentai
Name: genre, dtype: object


In [11]:
# deleting anime with 0 rating
anime_df=anime_df[~np.isnan(anime_df["rating"])]

# filling mode value for genre and type
anime_df['genre'] = anime_df['genre'].fillna(
anime_df['genre'].dropna().mode().values[0])

anime_df['type'] = anime_df['type'].fillna(
anime_df['type'].dropna().mode().values[0])

#checking if all null values are filled
anime_df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

# Feature Engineering

In [12]:
# I have seen earlier that there is a -1 rating,
# this suggests that the user did not register a rating,
# so I will foll the -1 ratings with NaN.
rating_df['rating'] = rating_df['rating'].apply(lambda x: np.nan if x==-1 else x)
rating_df.head(20)

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,
5,1,355,
6,1,356,
7,1,442,
8,1,487,
9,1,846,


In [13]:
# Now, we splice our dataframe
# We want to recommend anime series, so the relevant type is "TV"
anime_df = anime_df[anime_df['type']=='TV']

# JOIN the anime and rating dataframe on "anime_id" to form a new dataframe
rated_anime = rating_df.merge(anime_df, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_user', ''])

# Project the "user_id", "name", and "rating" only
rated_anime = rated_anime[['user_id', 'name', 'rating']]

# Limit to 10000 users for computational purposes
rated_anime_10000 = rated_anime[rated_anime.user_id <= 10000]
rated_anime_10000.head()

Unnamed: 0,user_id,name,rating
0,1,Naruto,7.81
1,3,Naruto,7.81
2,5,Naruto,7.81
3,6,Naruto,7.81
4,10,Naruto,7.81


# Pivot Table

In [14]:
# Now, creating a pivot table with users as rows and anime names as columns
# will help us analyze for calculations of similarity

In [15]:
pivot = rated_anime_10000.pivot_table(index=['user_id'], columns=['name'], values='rating')
pivot.head()

name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,6.49,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,8.11,


In [16]:
# Value Normalization
pivot_n = pivot.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)

# Filling NaN values as 0
pivot_n.fillna(0, inplace=True)

# Transposing the pivot for the next step
pivot_n = pivot_n.T

# Dropping columns with the values of 0 (user did not rate the anime)
pivot_n = pivot_n.loc[:, (pivot_n != 0).any(axis=0)]

# Using scipy package to convert to sparse matrix format for similarity computation
piv_sparse = sp.sparse.csr_matrix(pivot_n.values)

# Using the Cosine Similarity Model

In [17]:
#model based on anime similarity
anime_similarity = cosine_similarity(piv_sparse)

#Df of anime similarities
ani_sim_df = pd.DataFrame(anime_similarity, index = pivot_n.index, columns = pivot_n.index)
ani_sim_df

name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,1.000000,0.483043,0.436756,0.035721,0.070056,0.122621,0.004887,0.000000,0.051610,-0.025314,...,0.065054,0.0,-0.113007,-0.108576,0.000000,0.000000,0.065426,0.132925,-0.135748,-0.115742
.hack//Sign,0.483043,1.000000,0.441474,0.035239,0.089239,0.093153,0.003677,0.000000,0.059214,-0.023532,...,0.076632,0.0,-0.111885,-0.122537,0.011767,0.010797,0.048033,0.155897,-0.137902,-0.118404
.hack//Tasogare no Udewa Densetsu,0.436756,0.441474,1.000000,0.064496,0.057389,0.092265,0.005965,0.000000,0.043266,-0.021194,...,0.030686,0.0,-0.119785,-0.129481,0.000000,0.000000,0.070611,0.104973,-0.133673,-0.101508
009-1,0.035721,0.035239,0.064496,1.000000,0.012034,0.061638,0.011157,0.000000,0.002477,-0.023557,...,0.008970,0.0,-0.030307,-0.031157,0.000000,0.000000,0.013543,0.012651,-0.082850,-0.058980
07-Ghost,0.070056,0.089239,0.057389,0.012034,1.000000,0.164189,0.009277,0.000548,0.044063,-0.025039,...,0.009376,0.0,-0.084299,-0.080929,-0.000472,-0.000355,0.021332,0.051022,-0.097623,-0.111422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gdgd Fairies 2,0.000000,0.010797,0.000000,0.000000,-0.000355,0.001218,0.000000,0.000000,0.000000,-0.016189,...,0.000000,0.0,-0.003648,-0.007009,0.776453,1.000000,0.006611,0.009750,-0.010336,0.000000
iDOLM@STER Xenoglossia,0.065426,0.048033,0.070611,0.013543,0.021332,0.068340,0.042430,0.000000,0.010935,-0.042748,...,0.018833,0.0,-0.124561,-0.126264,0.008778,0.006611,1.000000,0.003082,-0.089927,-0.063873
s.CRY.ed,0.132925,0.155897,0.104973,0.012651,0.051022,0.023177,0.000000,0.000000,0.011593,-0.009030,...,0.026179,0.0,-0.015562,-0.027255,0.010670,0.009750,0.003082,1.000000,-0.068192,-0.055123
xxxHOLiC,-0.135748,-0.137902,-0.133673,-0.082850,-0.097623,-0.154606,-0.009722,0.000000,-0.097098,0.033333,...,-0.025580,0.0,0.215747,0.223329,-0.011312,-0.010336,-0.089927,-0.068192,1.000000,0.710562


In [18]:
ani_sim_df[["Nanatsu no Taizai"]]

name,Nanatsu no Taizai
name,Unnamed: 1_level_1
.hack//Roots,-0.068064
.hack//Sign,-0.067654
.hack//Tasogare no Udewa Densetsu,-0.059080
009-1,-0.034479
07-Ghost,-0.079788
...,...
gdgd Fairies 2,-0.004078
iDOLM@STER Xenoglossia,-0.051665
s.CRY.ed,-0.042605
xxxHOLiC,0.093982


In [19]:
def anime_recommendation(ani_name):
    """
    This function will return the top 5 shows with the highest cosine similarity value and show match percent
    
    example:
    >>>Input: 
    
    anime_recommendation('Death Note')
    
    >>>Output: 
    
    Recommended because you watched Death Note:

                    #1: Code Geass: Hangyaku no Lelouch, 57.35% match
                    #2: Code Geass: Hangyaku no Lelouch R2, 54.81% match
                    #3: Fullmetal Alchemist, 51.07% match
                    #4: Shingeki no Kyojin, 48.68% match
                    #5: Fullmetal Alchemist: Brotherhood, 45.99% match 

               
    """
    
    number = 1
    print('Recommended because you watched {}:\n'.format(ani_name))
    for anime in ani_sim_df.sort_values(by = ani_name, ascending = False).index[1:6]:
        print(f'#{number}: {anime}, {round(ani_sim_df[anime][ani_name]*100,2)}% match')
        number +=1  

In [21]:
anime_recommendation("Mahouka Koukou no Rettousei")

Recommended because you watched Mahouka Koukou no Rettousei:

#1: Dungeon ni Deai wo Motomeru no wa Machigatteiru Darou ka, 39.44% match
#2: Rakudai Kishi no Cavalry, 37.33% match
#3: Tokyo Ravens, 34.89% match
#4: Mondaiji-tachi ga Isekai kara Kuru Sou Desu yo?, 34.77% match
#5: Nisekoi, 34.56% match


inspired by Yonatan Rabinovich, https://www.kaggle.com/yonatanrabinovich