Preprocessing Anime dataset from Kaggle

In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.neighbors import NearestNeighbors
from surprise import KNNWithMeans
import matplotlib.pyplot as plt
import re
import string

1. Data preprocessing

1.1 Data Cleaning

In [2]:
# remove unwanted features (columns) from the dataset
anime_df = pd.read_csv("animes.csv")
anime_df.rename(columns={'title': 'name'}, inplace=True)
anime_df.drop(['aired', 'ranked', 'img_url', 'link'], axis=1, inplace=True)

#removing unwanted characters from the anime name strings
def text_cleaning(text):
    text = re.sub(r'&quot;', '', text)
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub(r'.hack//', '', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&amp;', 'and', text)
    text = re.sub(r'Â°', '',text)

    return text

anime_df['name'] = anime_df['name'].apply(text_cleaning)
anime_df.head(5)

Unnamed: 0,uid,name,synopsis,genre,episodes,members,popularity,score
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...",25.0,489888,141,8.82
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...",22.0,995473,28,8.83
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...",13.0,581663,98,8.83
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...",64.0,1615084,4,9.23
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']",1.0,214621,502,8.83


In [3]:
anime_df.rename(columns={'uid': 'anime_uid', 'score': 'rating'}, inplace=True)
anime_df.episodes.replace({'Unknown':np.nan},inplace=True)

anime_df.drop_duplicates(subset=['name'], inplace=True)
anime_df.dropna(inplace=True)
anime_df.reset_index(drop=True, inplace=True)

anime_df.head(5)

Unnamed: 0,anime_uid,name,synopsis,genre,episodes,members,popularity,rating
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...",25.0,489888,141,8.82
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...",22.0,995473,28,8.83
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...",13.0,581663,98,8.83
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...",64.0,1615084,4,9.23
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']",1.0,214621,502,8.83


In [4]:
# replace the characters "[]'" with an empty space as the genre column is already of type string
anime_df['genre'] = anime_df['genre'].str.replace("'", "", regex=False)
anime_df['genre'] = anime_df['genre'].str.replace("[", "", regex=False)
anime_df['genre'] = anime_df['genre'].str.replace("]", "", regex=False)

anime_df.head(5)

Unnamed: 0,anime_uid,name,synopsis,genre,episodes,members,popularity,rating
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Drama, Music, Romance, School, Shounen",22.0,995473,28,8.83
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Sci-Fi, Adventure, Mystery, Drama, Fantasy",13.0,581663,98,8.83
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","Action, Military, Adventure, Comedy, Drama, Ma...",64.0,1615084,4,9.23
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"Action, Mystery, Supernatural, Vampire",1.0,214621,502,8.83


In [5]:
normalised_anime_df = anime_df.copy()

weights = {
    'genre': 0.35,
    'members_norm': 0.1,
    'rating_norm': 0.35,
    'popularity_norm': 0.1,
    'episodes_norm': 0.1
}

normalised_anime_df['members_norm'] = normalised_anime_df['members'] / normalised_anime_df['members'].max() * weights['members_norm']
normalised_anime_df['rating_norm'] = normalised_anime_df['rating'] / normalised_anime_df['rating'].max() * weights['rating_norm']
normalised_anime_df['popularity_norm'] = normalised_anime_df['popularity'] / normalised_anime_df['popularity'].max() * weights['popularity_norm']
normalised_anime_df['episodes_norm'] = normalised_anime_df['episodes'] / normalised_anime_df['episodes'].max() * weights['episodes_norm']

normalised_anime_df.head()

Unnamed: 0,anime_uid,name,synopsis,genre,episodes,members,popularity,rating,members_norm,rating_norm,popularity_norm,episodes_norm
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,0.026183,0.334453,0.000864,0.000818
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Drama, Music, Romance, School, Shounen",22.0,995473,28,8.83,0.053204,0.334832,0.000172,0.00072
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Sci-Fi, Adventure, Mystery, Drama, Fantasy",13.0,581663,98,8.83,0.031088,0.334832,0.0006,0.000425
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","Action, Military, Adventure, Comedy, Drama, Ma...",64.0,1615084,4,9.23,0.08632,0.35,2.5e-05,0.002094
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"Action, Mystery, Supernatural, Vampire",1.0,214621,502,8.83,0.011471,0.334832,0.003076,3.3e-05


In [6]:
normalised_anime_df.drop(['members', 'rating', 'popularity', 'episodes'], axis=1, inplace=True)

normalised_anime_df.head()

Unnamed: 0,anime_uid,name,synopsis,genre,members_norm,rating_norm,popularity_norm,episodes_norm
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",0.026183,0.334453,0.000864,0.000818
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Drama, Music, Romance, School, Shounen",0.053204,0.334832,0.000172,0.00072
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Sci-Fi, Adventure, Mystery, Drama, Fantasy",0.031088,0.334832,0.0006,0.000425
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","Action, Military, Adventure, Comedy, Drama, Ma...",0.08632,0.35,2.5e-05,0.002094
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"Action, Mystery, Supernatural, Vampire",0.011471,0.334832,0.003076,3.3e-05


In [7]:
genres_df = anime_df['genre'].str.get_dummies(sep=', ').astype(int)
genres_df = genres_df.apply(lambda x : x * weights['genre'])

genres_df.head()

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0.0,0.0,0.0,0.35,0.0,0.0,0.35,0.0,0.0,0.0,...,0.0,0.0,0.0,0.35,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.35,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.35,0.0,0.0,0.0,0.0,0.35,0.0,0.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.35,0.35,0.0,0.35,0.0,0.0,0.35,0.0,0.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.35,0.0,0.35,0.0,0.0


In [8]:
normalised_anime_df.drop('genre', axis=1, inplace=True)
normalised_anime_df = pd.concat([normalised_anime_df, genres_df], axis=1)

normalised_anime_df.head()

Unnamed: 0,anime_uid,name,synopsis,members_norm,rating_norm,popularity_norm,episodes_norm,Action,Adventure,Cars,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,0.026183,0.334453,0.000864,0.000818,0.0,0.0,0.0,...,0.0,0.0,0.0,0.35,0.0,0.0,0.0,0.0,0.0,0.0
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,0.053204,0.334832,0.000172,0.00072,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,0.031088,0.334832,0.0006,0.000425,0.0,0.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...",0.08632,0.35,2.5e-05,0.002094,0.35,0.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,0.011471,0.334832,0.003076,3.3e-05,0.35,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.35,0.0,0.35,0.0,0.0


In [9]:
features = ['members_norm', 'rating_norm', 'popularity_norm', 'episodes_norm'] + genres_df.columns.tolist()

cosine_sim = cosine_similarity(normalised_anime_df[features], normalised_anime_df[features])

print(cosine_sim)

[[1.         0.66214591 0.32441239 ... 0.38178832 0.24895614 0.65314481]
 [0.66214591 1.         0.32520761 ... 0.16702495 0.24895703 0.65247583]
 [0.32441239 0.32520761 1.         ... 0.16711943 0.24916537 0.30803338]
 ...
 [0.38178832 0.16702495 0.16711943 ... 1.         0.26943555 0.58509904]
 [0.24895614 0.24895703 0.24916537 ... 0.26943555 1.         0.21806531]
 [0.65314481 0.65247583 0.30803338 ... 0.58509904 0.21806531 1.        ]]


In [10]:
indices = pd.Series(anime_df.index, index=anime_df['name']).drop_duplicates()

Content based filtering recommendations

In [11]:
def get_recommendations(title, cosine_sim=cosine_sim, anime_df=anime_df, indices=indices):
    # Get the index of the anime that matches the title
    idx = indices[title]

    # Get the pairwise cosine similarity scores for all anime with that index
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the anime based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 most similar anime
    # sim_scores = sim_scores[1:11]

    # Get the titles of the top 10 most similar anime
    # anime_indices = [i[0] for i in sim_scores]
    # anime_titles = anime_df['name'].iloc[anime_indices].values.tolist()


    # the aim here is to stop different seaseons of an anime being recommended. 
    # E.g. Tokyo Ghoul Season 1, Tokyo Ghoul Season 2, etc.
    # STILL INCOMPLETE -> perhaps try to fix in the item-item recommendations (chatgpt)

    anime_indices = []

    for i in sim_scores:
        name = anime_df['name'].iloc[i[0]]
        if len(anime_indices) == 10: # providing 10 recommendations
            break
        if re.search(title, name):
            continue

        anime_indices.append(i[0])

    anime_titles = anime_df['name'].iloc[anime_indices].values.tolist()

    return anime_titles

In [61]:
get_recommendations('Haikyuu')

['Slam Dunk',
 'Ahiru no Sora',
 'Rokudenashi Blues',
 'Batsu  Terry',
 'Cross Game',
 'Ballroom e Youkoso',
 'Slam Dunk Hoero Basketmandamashii Hanamichi to Rukawa no Atsuki Natsu',
 'Slam Dunk Shouhoku Saidai no Kiki Moero Sakuragi Hanamichi',
 'Slam Dunk Zenkoku Seiha Da  Sakuragi Hanamichi',
 'Slam Dunk Movie']

Item-item Collaborative filtering

From the recommendations provided from the content based algorithm above, calculate the similarity of each of those recommendations with anime the user has already watched.

In [126]:
user_ratings_df = pd.read_csv("reviews.csv")
user_ratings_df.drop(['profile', 'link', 'text'], axis=1, inplace=True)

user_ratings_df.head()

Unnamed: 0,uid,anime_uid,score,scores
0,255938,34096,8,"{'Overall': '8', 'Story': '8', 'Animation': '8..."
1,259117,34599,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."
2,253664,28891,7,"{'Overall': '7', 'Story': '7', 'Animation': '9..."
3,8254,2904,9,"{'Overall': '9', 'Story': '9', 'Animation': '9..."
4,291149,4181,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."


In [115]:
anime_with_ratings_df = pd.merge(anime_df, user_ratings_df, on='anime_uid')

anime_with_ratings_df.drop_duplicates(subset=['uid', 'name'], inplace=True)
anime_with_ratings_df.reset_index(drop=True, inplace=True)

anime_with_ratings_df.head()

Unnamed: 0,anime_uid,name,synopsis,genre,episodes,members,popularity,rating,uid,score,scores
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,253664,7,"{'Overall': '7', 'Story': '7', 'Animation': '9..."
1,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,254554,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."
2,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,271227,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."
3,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,284956,7,"{'Overall': '7', 'Story': '7', 'Animation': '7..."
4,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,249279,8,"{'Overall': '8', 'Story': '7', 'Animation': '7..."


Create User-item matrix for item-item collaborative filtering

maybe reduce dimensions so that we use users with x amount of ratings given in total -> reduce sparsity

Select users with that have rated a given number of anime 

In [127]:
ratings = user_ratings_df.copy()
ratings.drop(['scores'], axis=1, inplace=True)

counts = ratings['uid'].value_counts()
print(counts)

321183    4
321837    4
321498    4
321144    4
321148    4
         ..
46599     1
211503    1
156351    1
198366    1
193145    1
Name: uid, Length: 130519, dtype: int64


In [138]:
selected_users = ratings[ratings['uid'].isin(counts[counts >= 3].index)]

selected_users.shape
print(selected_users)

           uid  anime_uid  score
48992   321837      40269      9
48993   321498      40269      9
49297   322786      40269     10
49298   321183      40269      9
49299   321148      40269     10
...        ...        ...    ...
192087  238200      32979      9
192088  238885      32979      8
192089  320358      32979      3
192090  240457      32979      6
192091  262057      32979      8

[2172 rows x 3 columns]


Create pivot

In [139]:
anime_pivot = selected_users.pivot_table(index='uid', columns='anime_uid', values='score')
anime_pivot.fillna(0, inplace=True)

anime_pivot.head()

anime_uid,26,60,71,87,123,255,290,477,556,669,...,34445,35251,35777,36910,37869,37893,38154,38544,39282,40269
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
212,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
219,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:
item_similarities = cosine_similarity(anime_pivot.T)
item_similarities_df = pd.DataFrame(item_similarities, index=anime_pivot.columns, columns=anime_pivot.columns)

item_similarities_df.head(10)

anime_uid,26,60,71,87,123,255,290,477,556,669,...,34445,35251,35777,36910,37869,37893,38154,38544,39282,40269
anime_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
255,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
def get_similar_anime(anime, n=10):
    similarity_scores = item_similarities_df[anime]
    similarity_scores = similarity_scores.sort_values(ascending=False)
    similar_anime = similarity_scores.iloc[1:n+1].index.tolist()

    similar_anime_names = 
    
    return similar_anime


In [148]:
get_similar_anime(18119)

[26, 60, 9930, 12447, 14511, 22831, 23385, 25875, 28085, 28927]

[26, 31711, 12447, 14511, 18119, 22831, 23385, 25875, 28085, 28927]

In [119]:
selected_users = user_ratings[user_ratings >= 3].index

print(selected_users)

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            192102, 192103, 192104, 192105, 192106, 192107, 192108, 192109,
            192110, 192111],
           dtype='int64', length=192112)


In [88]:
anime_rating_pivot = anime_with_ratings_df.pivot_table(index='uid', columns='name', values='score', fill_value=7)
# anime_rating_pivot.fillna(0, inplace=True)

anime_rating_pivot.head(10)

name,0,0008,001,009 ReCyborg,0091,07Ghost,100,100 Pascalsensei TV,1000nen Joou Queen Millennia,1001 Nights,...,makemagic,sCRYed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,ēlDLIVE,◯
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
9,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
10,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
11,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
12,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
13,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
14,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
15,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
16,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
17,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7


In [85]:
anime_rating_pivot.shape

(128186, 7887)

In [89]:
item_similarities = cosine_similarity(anime_rating_pivot.T)
item_similarities_df = pd.DataFrame(item_similarities, index=anime_rating_pivot.columns, columns=anime_rating_pivot.columns)

item_similarities_df.head(10)

name,0,0008,001,009 ReCyborg,0091,07Ghost,100,100 Pascalsensei TV,1000nen Joou Queen Millennia,1001 Nights,...,makemagic,sCRYed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,ēlDLIVE,◯
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.999991,0.99999,0.999991,0.999999,0.999975,1.0,1.0,0.999999,0.999997,...,0.999998,0.999996,0.999998,0.999988,0.999998,0.999998,0.999999,1.0,0.999998,0.999997
0008,0.999991,1.0,0.999981,0.999982,0.99999,0.999966,0.999991,0.999991,0.99999,0.999988,...,0.99999,0.999987,0.99999,0.999979,0.999989,0.999989,0.99999,0.999991,0.999989,0.999988
001,0.99999,0.999981,1.0,0.999982,0.999989,0.999965,0.99999,0.99999,0.999989,0.999987,...,0.999989,0.999986,0.999989,0.999978,0.999989,0.999989,0.999989,0.99999,0.999988,0.999987
009 ReCyborg,0.999991,0.999982,0.999982,1.0,0.999991,0.999967,0.999992,0.999991,0.999991,0.999989,...,0.99999,0.999988,0.99999,0.99998,0.99999,0.99999,0.99999,0.999992,0.99999,0.999989
0091,0.999999,0.99999,0.999989,0.999991,1.0,0.999975,0.999999,0.999999,0.999998,0.999996,...,0.999998,0.999995,0.999998,0.999988,0.999998,0.999998,0.999998,0.999999,0.999997,0.999997
07Ghost,0.999975,0.999966,0.999965,0.999967,0.999975,1.0,0.999975,0.999975,0.999974,0.999972,...,0.999974,0.999971,0.999974,0.999963,0.999974,0.999974,0.999974,0.999975,0.999973,0.999973
100,1.0,0.999991,0.99999,0.999992,0.999999,0.999975,1.0,1.0,0.999999,0.999997,...,0.999999,0.999996,0.999999,0.999988,0.999998,0.999999,0.999999,1.0,0.999998,0.999997
100 Pascalsensei TV,1.0,0.999991,0.99999,0.999991,0.999999,0.999975,1.0,1.0,0.999999,0.999997,...,0.999999,0.999996,0.999999,0.999988,0.999998,0.999998,0.999999,1.0,0.999998,0.999997
1000nen Joou Queen Millennia,0.999999,0.99999,0.999989,0.999991,0.999998,0.999974,0.999999,0.999999,1.0,0.999996,...,0.999998,0.999995,0.999998,0.999987,0.999998,0.999998,0.999998,0.999999,0.999997,0.999997
1001 Nights,0.999997,0.999988,0.999987,0.999989,0.999996,0.999972,0.999997,0.999997,0.999996,1.0,...,0.999996,0.999993,0.999996,0.999985,0.999995,0.999996,0.999996,0.999997,0.999995,0.999994


In [84]:
item_similarities_df['Death Note']

name
0                                         0.0
0008                                      0.0
001                                       0.0
009 ReCyborg                              0.0
0091                                      0.0
                                         ... 
xxxHOLiC Movie Manatsu no Yoru no Yume    0.0
xxxHOLiC Rou                              0.0
xxxHOLiC Shunmuki                         0.0
ēlDLIVE                                   0.0
◯                                         0.0
Name: Death Note, Length: 7887, dtype: float64

In [90]:
def get_similar_anime(anime, n=10):
    similarity_scores = item_similarities_df[anime]
    similarity_scores = similarity_scores.sort_values(ascending=False)
    similar_anime = similarity_scores.iloc[1:n+1].index.tolist()
    
    return similar_anime


In [95]:
get_similar_anime('Naruto')

['Larva',
 'Black Jack Specials Inochi wo Meguru Yottsu no Kiseki',
 'Nobunaga no Shinobi Anegawa Ishiyamahen',
 'No Game No Life Zero  Manner Movie',
 'Nils no Fushigi na Tabi',
 'Blame Prologue',
 'Neko nanka Yondemo Konai',
 'Needless Saint Lily Gakuen no Himitsu',
 'Nee Chanto Shiyou Yo',
 'Natsumushi The Animation']

['Furueru Kuchibiru Episode 0',
 'Detatoko Princess',
 'Larva',
 'Kujiratori',
 'Kuragehime Eiyuu Retsuden☆',
 'Shinmai Maou no Testament Burst Toujou Basara no Shigoku Heiwa na Nichijou',
 'Kurai Mirai',
 'Shingeki no Kyojin Movie 2 Jiyuu no Tsubasa',
 'Kuraibito',
 'Kurenai Sanshirou']

In [75]:
# def get_similar_anime(title, n=10):
#     # get index of title
#     title_idx = anime_rating_pivot.columns.get_loc(title)
#     print("INDEX: ", title_idx)
#     # get cosine similarity for all anime compared to given title
#     sim_list = item_similarities[title_idx]
#     print("SIM: ", sim_list)
#     # sort by similarity in descending order
#     sim_list_sorted = sorted(list(enumerate(sim_list)), key=lambda x: x[1], reverse=True)
#     # remove title itself from list
#     sim_list_sorted = [(anime_rating_pivot.index[i], sim) for i, sim in sim_list_sorted if i != title_idx]
#     # get top n most similar anime
#     return sim_list_sorted[:n]

In [76]:
get_similar_anime('Shingeki no Kyojin')

INDEX:  6244
SIM:  [0. 0. 0. ... 0. 0. 0.]


[(1, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0)]

[('0', 0.0),
 ('0008', 0.0),
 ('001', 0.0),
 ('009 ReCyborg', 0.0),
 ('0091', 0.0),
 ('07Ghost', 0.0),
 ('100', 0.0),
 ('100 Pascalsensei TV', 0.0),
 ('1000nen Joou Queen Millennia', 0.0),
 ('1001 Nights', 0.0)]

In [21]:
def cb_recommendations(title, n=10):
    # get index of title
    title_idx = anime_rating_pivot.columns.get_loc(title)
    # get cosine similarity for all anime compared to given title
    sim_list = [(i, sim) for i, sim in enumerate(pivot_cosine_similarities[title_idx])]
    # sort by similarity in descending order
    sim_list_sorted = sorted(sim_list, key=lambda x: x[1], reverse=True)
    # remove title itself from list
    sim_list_sorted = [(i, sim) for i, sim in sim_list_sorted if i != title_idx]
    # get top n most similar anime
    return [(anime_with_ratings_df.loc[anime_with_ratings_df['anime_uid']==anime_rating_pivot.index[i], 'name'].iloc[0], sim) for i, sim in sim_list_sorted[:n]]


In [None]:
cb_recommendations('Death Note')

In [None]:
print(np.unique(pivot_cosine_similarities))

In [None]:
def get_recommendations_cb(name, cosine_sim_df=pivot_cosine_similarities_df, recommendations=10):

    # Get the top 10 similar anime titles for each anime title
    similar_anime = {}
    for title in cosine_sim_df.columns:
        similar_anime[title] = list(cosine_sim_df[title].nlargest(recommendations).index)[1:]

    # Print the top 10 similar anime titles for each anime title
    # for title, similar_titles in similar_anime.items():
    #     print(title + ':', ', '.join(similar_titles))
    
    return similar_anime[name]


In [None]:
get_recommendations_cb('One Punch Man')

0',
 '0008',
 '001',
 '009 ReCyborg',
 '0091',
 '07Ghost',
 '100',
 '100 Pascalsensei TV',
 '1000nen Joou Queen Millennia'

Memory based Recommendation using KNN

In [None]:
knn = NearestNeighbors(metric='cosine')
knn.fit(anime_rating_pivot)

In [None]:
# approx 1 min to execute
distances, indices = knn.kneighbors(anime_rating_pivot, n_neighbors=6)

recommend = pd.DataFrame(indices, columns=['anime0', 'anime1', 'anime2', 'anime3', 'anime4', 'anime5'])
recommend.head()

In [None]:
animes_to_recommend = recommend.copy()

for i in range(0, 6):
    animes = pd.DataFrame(anime_rating_pivot.index).reset_index()
    animes = animes.rename(columns={'index':f'anime{i}'})
    animes_to_recommend = pd.merge(animes_to_recommend, animes, on=[f'anime{i}'], how='left')
    animes_to_recommend = animes_to_recommend.drop(f'anime{i}', axis=1)
    animes_to_recommend = animes_to_recommend.rename(columns={'name':f'anime{i}'})

In [None]:
animes_to_recommend.head(10)

In [None]:
# # change profile names into unique IDs (i.e. integers)
# user_ratings_df.profile = pd.factorize(user_ratings_df.profile)[0]
# user_ratings_df.rename(columns={'profile': 'user_id'}, inplace=True)
# user_ratings_df.head(10)

In [None]:
# merged_anime_reviews_df = pd.merge(anime_df, user_ratings_df, on='anime_uid')
# merged_anime_reviews_df.rename(columns={'score_x':'avg_rating', 'text': 'review_text'},inplace=True)

# merged_anime_reviews_df.drop('review_text', axis=1, inplace=True)

# merged_anime_reviews_df.head()

Categorical encoding -> Separating genres into their own respective column

In [None]:
# replace the characters "[]'" with an empty space as the genre column is already of type string
# merged_anime_reviews_df['genre'] = merged_anime_reviews_df['genre'].str.replace("'", "", regex=False)
# merged_anime_reviews_df['genre'] = merged_anime_reviews_df['genre'].str.replace("[", "", regex=False)
# merged_anime_reviews_df['genre'] = merged_anime_reviews_df['genre'].str.replace("]", "", regex=False)

# merged_anime_reviews_df.head(5)