Preprocessing Anime dataset from Kaggle

In [3]:
from collections import defaultdict
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import re
import string

1. Data preprocessing

1.1 Data Cleaning

Anime data

In [4]:
# Load the anime catalogue, expose the title under the column name 'name',
# and discard the columns that are not used as features.
anime_df = (
    pd.read_csv("animes.csv")
    .rename(columns={'title': 'name'})
    .drop(columns=['aired', 'ranked', 'img_url', 'link'])
)

# Remove HTML entities, the ".hack//" prefix and punctuation from an anime name
def text_cleaning(text):
    """Return ``text`` with HTML entities handled and ASCII punctuation removed.

    Steps, in order:
      1. ``&quot;`` and ``&#039;`` entities are deleted, ``&amp;`` becomes ``and``.
      2. The literal prefix ``.hack//`` is deleted.
      3. Every character in ``string.punctuation`` is deleted.
      4. The mis-encoded degree sign sequence is deleted.

    Parameters
    ----------
    text : str
        Raw anime title.

    Returns
    -------
    str
        The cleaned title.
    """
    # Entities and ".hack//" are made of punctuation characters, so they must
    # be handled BEFORE punctuation is stripped; otherwise the patterns can
    # never match and fragments such as "039" or "amp" are left in the name.
    text = re.sub(r'&quot;', '', text)
    # A single rule for the apostrophe entity: the former "A&#039;s" and
    # "I&#039;" rules could never fire after this one, and any apostrophe they
    # produced would be removed by the punctuation pass anyway.
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'&amp;', 'and', text)
    # The dot is escaped so that only a literal ".hack//" is removed.
    text = re.sub(r'\.hack//', '', text)

    text = "".join(char for char in text if char not in string.punctuation)

    # Not part of string.punctuation, so it needs its own rule.
    text = re.sub(r'Â°', '', text)

    return text

# Run the cleaner over every title and preview the first rows.
anime_df['name'] = anime_df['name'].map(text_cleaning)
anime_df.head(5)

Unnamed: 0,uid,name,synopsis,genre,episodes,members,popularity,score
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...",25.0,489888,141,8.82
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...",22.0,995473,28,8.83
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...",13.0,581663,98,8.83
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...",64.0,1615084,4,9.23
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']",1.0,214621,502,8.83


In [5]:
# Use column names that stay unambiguous once the review data is merged in.
anime_df = anime_df.rename(columns={'uid': 'anime_uid', 'score': 'rating'})

# Assign the replaced column back explicitly. Calling
# `anime_df.episodes.replace(..., inplace=True)` operates on a column selection
# and is not guaranteed to write through to `anime_df` (it never does under
# pandas copy-on-write), which would leave 'Unknown' values in place.
anime_df['episodes'] = anime_df['episodes'].replace({'Unknown': np.nan})

# Keep one row per name, drop rows with any missing value, and renumber the
# rows so that the index is a contiguous 0..n-1 range.
anime_df = (
    anime_df
    .drop_duplicates(subset=['name'])
    .dropna()
    .reset_index(drop=True)
)

anime_df.head(5)

Unnamed: 0,anime_uid,name,synopsis,genre,episodes,members,popularity,rating
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...",25.0,489888,141,8.82
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...",22.0,995473,28,8.83
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...",13.0,581663,98,8.83
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...",64.0,1615084,4,9.23
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']",1.0,214621,502,8.83


In [6]:
# The genre column holds the text form of a list (e.g. "['Comedy', 'Sports']").
# Delete the quote and bracket characters so that only the comma-separated
# genre names remain.
for unwanted_char in ("'", "[", "]"):
    anime_df['genre'] = anime_df['genre'].str.replace(unwanted_char, "", regex=False)

anime_df.head(5)

Unnamed: 0,anime_uid,name,synopsis,genre,episodes,members,popularity,rating
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Drama, Music, Romance, School, Shounen",22.0,995473,28,8.83
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Sci-Fi, Adventure, Mystery, Drama, Fantasy",13.0,581663,98,8.83
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","Action, Military, Adventure, Comedy, Drama, Ma...",64.0,1615084,4,9.23
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"Action, Mystery, Supernatural, Vampire",1.0,214621,502,8.83


Ratings data

In [7]:
# Load the reviews, discard the link and free-text columns, and refer to the
# reviewer's profile as 'user_id'.
user_ratings_df = (
    pd.read_csv("reviews.csv")
    .drop(columns=['link', 'text'])
    .rename(columns={'profile': 'user_id'})
)

# Swap each distinct profile value for an integer code.
user_ratings_df['user_id'] = pd.factorize(user_ratings_df['user_id'])[0]

user_ratings_df.head()

Unnamed: 0,uid,user_id,anime_uid,score,scores
0,255938,0,34096,8,"{'Overall': '8', 'Story': '8', 'Animation': '8..."
1,259117,1,34599,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."
2,253664,2,28891,7,"{'Overall': '7', 'Story': '7', 'Animation': '9..."
3,8254,3,2904,9,"{'Overall': '9', 'Story': '9', 'Animation': '9..."
4,291149,4,4181,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."


USE THE MERGED DATASET FOR BOTH THE CB AND CF RECOMMENDATION METHODS TO PREVENT THE INDEX ERROR OCCURRING IN THE HYBRID METHOD -> SOME ANIME ARE REMOVED BY THE MERGE, WHICH LEADS TO AN INDEX ERROR

In [8]:
# Attach every review to its anime (merge on 'anime_uid' with the default join),
# keep a single row per (user, anime name) pair, and renumber the rows.
anime_with_ratings_df = (
    anime_df
    .merge(user_ratings_df, on='anime_uid')
    .drop_duplicates(subset=['user_id', 'name'])
    .reset_index(drop=True)
)

anime_with_ratings_df.head()

Unnamed: 0,anime_uid,name,synopsis,genre,episodes,members,popularity,rating,uid,user_id,score,scores
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,253664,2,7,"{'Overall': '7', 'Story': '7', 'Animation': '9..."
1,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,254554,15102,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."
2,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,271227,15103,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."
3,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,284956,13930,7,"{'Overall': '7', 'Story': '7', 'Animation': '7..."
4,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,249279,599,8,"{'Overall': '8', 'Story': '7', 'Animation': '7..."


Content based filtering recommendation

USE IN FINAL REPORT:

Normalizing ratings is important in content-based recommenders for anime (and any other system) for several reasons:

Fairness: Different users may have different rating scales. For example, one user may only give a rating of 4 or 5 if they really loved the anime, while another user may give a rating of 3 if they thought it was just okay. Normalizing the ratings ensures that each user's ratings are treated equally and fairly, regardless of their personal rating scale.

Consistency: Normalizing the ratings ensures that they all fall within the same range, making it easier to compare the ratings of different anime. This helps to ensure that the recommendations are consistent and reliable.

Accuracy: Normalizing the ratings can help to reduce the impact of outliers, which can skew the results of the recommender system. By scaling the ratings to a common range, outliers can be identified and handled appropriately, leading to more accurate recommendations.

In summary, normalizing ratings in a content-based recommender for anime helps to ensure fairness, consistency, and accuracy in the recommendations provided to users.

In [9]:
# Work on a copy so that anime_df keeps its raw feature columns.
normalised_anime_df = anime_df.copy()

# Weight applied to each feature group before similarities are computed.
weights = {
    'genre': 0.35,
    'members_norm': 0.1,
    'rating_norm': 0.35,
    'popularity_norm': 0.1,
    'episodes_norm': 0.1
}

# Divide each numeric feature by its column maximum, then multiply by its weight.
for source_col, scaled_col in (
    ('members', 'members_norm'),
    ('rating', 'rating_norm'),
    ('popularity', 'popularity_norm'),
    ('episodes', 'episodes_norm'),
):
    raw_values = normalised_anime_df[source_col]
    normalised_anime_df[scaled_col] = raw_values / raw_values.max() * weights[scaled_col]

normalised_anime_df.head()

Unnamed: 0,anime_uid,name,synopsis,genre,episodes,members,popularity,rating,members_norm,rating_norm,popularity_norm,episodes_norm
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82,0.026183,0.334453,0.000864,0.000818
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Drama, Music, Romance, School, Shounen",22.0,995473,28,8.83,0.053204,0.334832,0.000172,0.00072
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Sci-Fi, Adventure, Mystery, Drama, Fantasy",13.0,581663,98,8.83,0.031088,0.334832,0.0006,0.000425
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","Action, Military, Adventure, Comedy, Drama, Ma...",64.0,1615084,4,9.23,0.08632,0.35,2.5e-05,0.002094
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"Action, Mystery, Supernatural, Vampire",1.0,214621,502,8.83,0.011471,0.334832,0.003076,3.3e-05


In [10]:
# The raw columns are superseded by their weighted *_norm counterparts.
normalised_anime_df = normalised_anime_df.drop(
    columns=['members', 'rating', 'popularity', 'episodes']
)

normalised_anime_df.head()

Unnamed: 0,anime_uid,name,synopsis,genre,members_norm,rating_norm,popularity_norm,episodes_norm
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",0.026183,0.334453,0.000864,0.000818
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Drama, Music, Romance, School, Shounen",0.053204,0.334832,0.000172,0.00072
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Sci-Fi, Adventure, Mystery, Drama, Fantasy",0.031088,0.334832,0.0006,0.000425
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","Action, Military, Adventure, Comedy, Drama, Ma...",0.08632,0.35,2.5e-05,0.002094
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"Action, Mystery, Supernatural, Vampire",0.011471,0.334832,0.003076,3.3e-05


In [11]:
# One 0/1 indicator column per genre, each scaled by the genre weight.
genres_df = (
    anime_df['genre']
    .str.get_dummies(sep=', ')
    .astype(int)
    .mul(weights['genre'])
)

genres_df.head()

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0.0,0.0,0.0,0.35,0.0,0.0,0.35,0.0,0.0,0.0,...,0.0,0.0,0.0,0.35,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.35,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.35,0.0,0.0,0.0,0.0,0.35,0.0,0.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.35,0.35,0.0,0.35,0.0,0.0,0.35,0.0,0.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.35,0.0,0.35,0.0,0.0


In [12]:
# Swap the raw genre string for the weighted per-genre columns; the two frames
# are placed side by side and matched on their row index.
normalised_anime_df = pd.concat(
    [normalised_anime_df.drop(columns='genre'), genres_df],
    axis=1,
)

normalised_anime_df.head()

Unnamed: 0,anime_uid,name,synopsis,members_norm,rating_norm,popularity_norm,episodes_norm,Action,Adventure,Cars,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,0.026183,0.334453,0.000864,0.000818,0.0,0.0,0.0,...,0.0,0.0,0.0,0.35,0.0,0.0,0.0,0.0,0.0,0.0
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,0.053204,0.334832,0.000172,0.00072,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,0.031088,0.334832,0.0006,0.000425,0.0,0.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...",0.08632,0.35,2.5e-05,0.002094,0.35,0.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,0.011471,0.334832,0.003076,3.3e-05,0.35,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.35,0.0,0.35,0.0,0.0


In [13]:
# Feature columns: the four weighted numeric columns followed by every genre column.
features = ['members_norm', 'rating_norm', 'popularity_norm', 'episodes_norm']
features.extend(genres_df.columns)

# Pairwise cosine similarity between the feature rows of all anime.
feature_matrix = normalised_anime_df[features]
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

print(cosine_sim)

[[1.         0.66214591 0.32441239 ... 0.38178832 0.24895614 0.65314481]
 [0.66214591 1.         0.32520761 ... 0.16702495 0.24895703 0.65247583]
 [0.32441239 0.32520761 1.         ... 0.16711943 0.24916537 0.30803338]
 ...
 [0.38178832 0.16702495 0.16711943 ... 1.         0.26943555 0.58509904]
 [0.24895614 0.24895703 0.24916537 ... 0.26943555 1.         0.21806531]
 [0.65314481 0.65247583 0.30803338 ... 0.58509904 0.21806531 1.        ]]


In [14]:
# Lookup table: anime name -> row label of that anime in anime_df.
# NOTE(review): Series.drop_duplicates() compares the Series *values* (the
# anime_df index labels here), not the names used as its index, so it does not
# remove repeated names. Name uniqueness relies on the earlier
# drop_duplicates(subset=['name']) applied to anime_df.
indices = pd.Series(anime_df.index, index=anime_df['name']).drop_duplicates()

In [24]:
def content_based_recommendations(title, cosine_sim=None, anime_df=None, indices=None, n_recommendations=40):
    """Return the names of the anime whose feature vectors are closest to ``title``.

    Candidates are visited in order of decreasing cosine similarity. Any anime
    whose name contains ``title`` as a substring is skipped; this removes the
    query itself and entries such as other seasons that share its name.

    Parameters
    ----------
    title : str
        Name of the anime to find neighbours for; must be a key of ``indices``.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix. Defaults to the notebook-level ``cosine_sim``.
    anime_df : pandas.DataFrame, optional
        Frame with a 'name' column whose row positions match ``cosine_sim``.
        Defaults to the notebook-level ``anime_df``.
    indices : pandas.Series, optional
        Maps anime name -> row position. Defaults to the notebook-level ``indices``.
    n_recommendations : int, default 40
        Maximum number of names to return.

    Returns
    -------
    list of str
        At most ``n_recommendations`` names, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Resolve the defaults at call time. Binding them in the signature freezes
    # whatever objects existed when this cell was run, so re-running an upstream
    # cell would silently leave the function working on stale data.
    notebook_globals = globals()
    if cosine_sim is None:
        cosine_sim = notebook_globals['cosine_sim']
    if anime_df is None:
        anime_df = notebook_globals['anime_df']
    if indices is None:
        indices = notebook_globals['indices']

    query_row = indices[title]

    # Row positions ordered by decreasing similarity; a stable sort keeps the
    # original row order among equal scores.
    similarities = np.asarray(cosine_sim[query_row])
    ranked_rows = np.argsort(-similarities, kind='stable')

    names = anime_df['name']
    recommendations = []

    for row_pos in ranked_rows:
        if len(recommendations) >= n_recommendations:
            break

        candidate = names.iloc[row_pos]

        # Plain substring test instead of re.search(title, ...): a title is
        # data, not a pattern, so characters such as '+' or '(' must not be
        # interpreted as regex syntax.
        # TODO: sequels whose names do not contain the title are not filtered.
        if title in candidate:
            continue

        recommendations.append(candidate)

    return recommendations

In [25]:
# Example: content-based neighbours of one title, using the default arguments.
content_based_recommendations('Shingeki no Kyojin')

['Katsute Kami Datta Kemonotachi e',
 'Saint Seiya Meiou Hades Elysionhen',
 'Kaze no Youjinbou',
 'Dragon Ball Z The Real 4D',
 'Zetsuen no Tempest',
 'One Piece Episode of Merry  Mou Hitori no Nakama no Monogatari',
 'One Piece Episode of Nami  Koukaishi no Namida to Nakama no Kizuna',
 'One Piece Episode of East Blue  Luffy to 4nin no Nakama no Daibouken',
 'One Piece Episode of Sabo  3 Kyoudai no Kizuna Kiseki no Saikai to Uketsugareru Ishi',
 'GetBackers',
 'One Piece Episode of Sorajima',
 'One Piece Long Ring Long Landhen',
 'Hunter x Hunter 2011',
 'Akame ga Kill',
 'Hunter x Hunter',
 'Houseki no Kuni TV',
 'Hunter x Hunter Greed Island Final',
 'Hunter x Hunter Greed Island',
 'Break Blade 4 Sanka no Chi',
 'Break Blade 5 Shisen no Hate',
 'Break Blade 3 Kyoujin no Kizu',
 'Break Blade 2 Ketsubetsu no Michi',
 'Break Blade 1 Kakusei no Toki',
 'Break Blade 6 Doukoku no Toride',
 'Shin Mazinger Shougeki Zhen',
 'One Piece Romance Dawn',
 'One Piece Romance Dawn Story',
 'Kujir

Item-item Collaborative filtering

From the recommendations provided from the content based algorithm above, calculate the similarity of each of those recommendations with anime the user has already watched.

Maybe select only users who have rated more than a certain number of anime, e.g. more than 200?

In [19]:
# Count how many rows of user_ratings_df belong to each user_id.
user_ratings_df['user_id'].value_counts()

2762     762
456      743
109      648
97       566
31       491
        ... 
36472      1
36473      1
36476      1
36481      1
47884      1
Name: user_id, Length: 47885, dtype: int64

Create User-item matrix for item-item collaborative filtering

In [20]:
# User x anime matrix of review scores; pairs with no review are filled with 0.
anime_rating_pivot = (
    anime_with_ratings_df
    .pivot_table(index='user_id', columns='name', values='score')
    .fillna(0)
)

anime_rating_pivot.head(10)

name,0,0008,001,009 ReCyborg,0091,07Ghost,100,100 Pascalsensei TV,1000nen Joou Queen Millennia,1001 Nights,...,makemagic,sCRYed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,ēlDLIVE,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Use Cosine Similarity to calculate the similarity between each anime

In [21]:
# Transpose so that each row is one anime's vector of user scores, then compare
# every anime with every other anime.
item_similarities = cosine_similarity(anime_rating_pivot.transpose())

# Label both axes of the square matrix with the anime names.
anime_names = anime_rating_pivot.columns
item_similarities_df = pd.DataFrame(
    data=item_similarities,
    index=anime_names,
    columns=anime_names,
)

item_similarities_df.head(10)

name,0,0008,001,009 ReCyborg,0091,07Ghost,100,100 Pascalsensei TV,1000nen Joou Queen Millennia,1001 Nights,...,makemagic,sCRYed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,ēlDLIVE,◯
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.261557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0008,0.261557,1.0,0.026801,0.083551,0.052803,0.0,0.0,0.0,0.0,0.027276,...,0.0,0.0,0.161796,0.0,0.0,0.0,0.0,0.0,0.0,0.053602
001,0.0,0.026801,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027925,...,0.0,0.0,0.165647,0.0,0.0,0.0,0.0,0.0,0.0,0.164634
009 ReCyborg,0.0,0.083551,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0091,0.0,0.052803,0.0,0.0,1.0,0.084706,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07Ghost,0.0,0.0,0.0,0.0,0.084706,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100 Pascalsensei TV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000nen Joou Queen Millennia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001 Nights,0.0,0.027276,0.027925,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.16858,0.0,0.0,0.0,0.0,0.0,0.0,0.05585


In [26]:
def collaborative_filtering_recommendations(anime, n=40, similarities=None):
    """Return the ``n`` anime most similar to ``anime`` by item-item similarity.

    Parameters
    ----------
    anime : str
        Name of the query anime; must be a column of the similarity matrix.
    n : int, default 40
        Maximum number of names to return.
    similarities : pandas.DataFrame, optional
        Square anime-by-anime similarity matrix labelled by name on both axes.
        Defaults to the notebook-level ``item_similarities_df``.

    Returns
    -------
    list of str
        Up to ``n`` anime names, most similar first, never including ``anime``.

    Raises
    ------
    KeyError
        If ``anime`` is not a column of the similarity matrix.
    """
    if similarities is None:
        similarities = item_similarities_df

    # Remove the query by label rather than by assuming it sorts first: another
    # anime can tie with it at similarity 1.0, in which case slicing off
    # position 0 may discard a real neighbour and return the query itself.
    neighbour_scores = similarities[anime].drop(labels=anime, errors='ignore')

    ranked = neighbour_scores.sort_values(ascending=False)

    return ranked.head(n).index.tolist()


In [27]:
# Example: item-item neighbours of one title, using the default n.
collaborative_filtering_recommendations('Death Note')

['Code Geass Hangyaku no Lelouch',
 'Fullmetal Alchemist Brotherhood',
 'SteinsGate',
 'Code Geass Hangyaku no Lelouch R2',
 'Elfen Lied',
 'Sword Art Online',
 'Cowboy Bebop',
 'Monster',
 'Deadman Wonderland',
 'Tokyo Ghoul',
 'Fullmetal Alchemist',
 'Ouran Koukou Host Club',
 'Fullmetal Alchemist The Conqueror of Shamballa',
 'Death Note Rewrite',
 'Clannad After Story',
 'Cowboy Bebop Tengoku no Tobira',
 'Highschool of the Dead',
 'Another',
 'Higurashi no Naku Koro ni',
 'Neon Genesis Evangelion',
 'Naruto',
 'Clannad',
 'Sen to Chihiro no Kamikakushi',
 'Ansatsu Kyoushitsu',
 'Tengen Toppa Gurren Lagann',
 'Howl no Ugoku Shiro',
 'Death Parade',
 'Kill la Kill',
 'Sora no Otoshimono Project Pink',
 'Haikyuu vs Akaten',
 'Wolfs Rain',
 'One Punch Man',
 'Mirai Nikki',
 'Dragon Ball Z',
 'Trigun',
 'Mousou Dairinin',
 'Kenpuu Denki Berserk',
 'Kuroshitsuji',
 'Kuroko no Basket Oshaberi Shiyokka',
 'Vampire Knight']

Hybrid Implementation: Combine content based and collaborative filtering methods to provide recommendations to the user

In [29]:
# Separate copy of the item-item similarity matrix, read by combined_recommendations.
collaborative_sim_scores = item_similarities_df.copy()

In [79]:
def combined_recommendations(anime_name, num_recommendations=20, content_weight=0.5, collaborative_weight=0.5):
    """Rank the union of content-based and collaborative candidates for ``anime_name``.

    Each candidate is scored with its item-item similarity to ``anime_name``
    (taken from ``collaborative_sim_scores``), multiplied by ``content_weight``
    if it came from the content-based list and by ``collaborative_weight`` if it
    came from the collaborative list; a candidate present in both lists
    receives the sum of the two terms.

    NOTE(review): the content-based cosine similarity values themselves never
    enter the score; the content-based list only decides which anime are
    candidates and which weight they receive. Confirm this is the intended
    hybrid scheme.

    Parameters
    ----------
    anime_name : str
        Name of the query anime.
    num_recommendations : int, default 20
        Maximum number of names to return.
    content_weight : float, default 0.5
        Weight for candidates proposed by ``content_based_recommendations``.
    collaborative_weight : float, default 0.5
        Weight for candidates proposed by ``collaborative_filtering_recommendations``.

    Returns
    -------
    list of str
        Up to ``num_recommendations`` names, highest weighted score first; an
        empty list when ``anime_name`` is not in ``collaborative_sim_scores``.
    """
    known_titles = collaborative_sim_scores.index
    if anime_name not in known_titles:
        return []

    # Keep only titles that exist in the similarity matrix: anime without any
    # merged review have no row there.
    content_candidates = [
        candidate for candidate in content_based_recommendations(anime_name)
        if candidate in known_titles
    ]
    collaborative_candidates = [
        candidate for candidate in collaborative_filtering_recommendations(anime_name)
        if candidate in known_titles
    ]

    # Only the similarities to the query are needed, so select that single
    # column up front instead of slicing full-width rows of the matrix.
    similarity_to_query = collaborative_sim_scores[anime_name]
    content_part = similarity_to_query.loc[content_candidates].mul(content_weight)
    collaborative_part = similarity_to_query.loc[collaborative_candidates].mul(collaborative_weight)

    # fill_value=0 lets a candidate found by only one method keep its single term.
    weighted_scores = content_part.add(collaborative_part, fill_value=0).sort_values(ascending=False)

    return weighted_scores.head(num_recommendations).index.tolist()

In [83]:
# Example: hybrid recommendations for one title, using the default weights and count.
combined_recommendations('One Punch Man')

['One Punch Man Road to Hero',
 'Boku no Hero Academia',
 'Mob Psycho 100',
 'Boku no Hero Academia 2nd Season',
 'Kono Subarashii Sekai ni Shukufuku wo',
 'Gakusen Toshi Asterisk',
 'Charlotte',
 'Tokyo Ghoul',
 'Gangsta',
 'One Punch Man Specials',
 'Young Black Jack',
 'Shokugeki no Souma Ni no Sara',
 'Ansatsu Kyoushitsu',
 'Death Parade',
 'Shokugeki no Souma',
 'Arslan Senki TV',
 'Tokyo Ghoul √A',
 'Ore ga Ojousama Gakkou ni Shomin Sample Toshite Gets♥Sareta Ken',
 'ReZero kara Hajimeru Isekai Seikatsu',
 'Ninja Senshi Tobikage']