Preprocessing Anime dataset from Kaggle

In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import matplotlib.pyplot as plt
import re
import string

1. Data preprocessing

1.1 Data Cleaning

In [2]:
# remove unwanted features (columns) from the dataset
anime_df = pd.read_csv("animes.csv")
anime_df.rename(columns={'title': 'name'}, inplace=True)
anime_df.drop(['aired', 'ranked', 'img_url', 'link'], axis=1, inplace=True)

#removing unwanted characters from the anime name strings
def text_cleaning(text):
    text = re.sub(r'&quot;', '', text)
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub(r'.hack//', '', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&amp;', 'and', text)
    text = re.sub(r'Â°', '',text)

    return text

anime_df['name'] = anime_df['name'].apply(text_cleaning)
anime_df.head(5)

Unnamed: 0,uid,name,synopsis,genre,episodes,members,popularity,score
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...",25.0,489888,141,8.82
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...",22.0,995473,28,8.83
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...",13.0,581663,98,8.83
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...",64.0,1615084,4,9.23
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']",1.0,214621,502,8.83


In [3]:
anime_df.rename(columns={'uid': 'anime_uid'}, inplace=True)
anime_df.episodes.replace({'Unknown':np.nan},inplace=True)

anime_df.drop_duplicates(subset=['name'], inplace=True)
anime_df.dropna(inplace=True)
anime_df.reset_index(drop=True, inplace=True)

anime_df.head(5)

Unnamed: 0,anime_uid,name,synopsis,genre,episodes,members,popularity,score
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...",25.0,489888,141,8.82
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...",22.0,995473,28,8.83
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...",13.0,581663,98,8.83
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...",64.0,1615084,4,9.23
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']",1.0,214621,502,8.83


In [4]:
# replace the characters "[]'" with an empty space as the genre column is already of type string
anime_df['genre'] = anime_df['genre'].str.replace("'", "", regex=False)
anime_df['genre'] = anime_df['genre'].str.replace("[", "", regex=False)
anime_df['genre'] = anime_df['genre'].str.replace("]", "", regex=False)

anime_df.head(5)

Unnamed: 0,anime_uid,name,synopsis,genre,episodes,members,popularity,score
0,28891,Haikyuu Second Season,Following their participation at the Inter-Hig...,"Comedy, Sports, Drama, School, Shounen",25.0,489888,141,8.82
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Drama, Music, Romance, School, Shounen",22.0,995473,28,8.83
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Sci-Fi, Adventure, Mystery, Drama, Fantasy",13.0,581663,98,8.83
3,5114,Fullmetal Alchemist Brotherhood,"""In order for something to be obtained, someth...","Action, Military, Adventure, Comedy, Drama, Ma...",64.0,1615084,4,9.23
4,31758,Kizumonogatari III Reiketsuhen,After helping revive the legendary vampire Kis...,"Action, Mystery, Supernatural, Vampire",1.0,214621,502,8.83


Use Tfid Vectorizer to fit and transform the synopsis and genre columns

In [None]:
# # Combining the synopsis and genres columns into a single column so that we do not get an inconsistent shapes error when doing using tfidf separately
# anime_df['synopsis_genres'] = anime_df['synopsis'] + ' ' + anime_df['genre'] 
# anime_df['describe'] = anime_df['genre']
# anime_df.head()

In [None]:
anime_df.dtypes

In [None]:
anime_df['synopsis'] = anime_df['synopsis'].apply(lambda x: x.lower())

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words='english')

tfidf_matrix = tfidf_vectorizer.fit_transform(anime_df['synopsis_genres'])

cosine_sim = cosine_similarity(tfidf_matrix)

print(cosine_sim)

tfidf_matrix.shape

In [None]:
simil = linear_kernel(tfidf_matrix, tfidf_matrix)
simil.shape

In [None]:
id = anime_df['anime_uid'].values
simil = pd.DataFrame(simil, index=id, columns=id)
simil.columns = anime_df['name']
simil['anime_name'] = anime_df['name'].values
simil.fillna(0)

simil.head()

In [None]:
def content_rec(name):
    idx = simil[simil['anime_name']==name]
    idx = idx.drop('anime_name', axis=1).T
    idx.columns = ['similar_val']
    idx = idx.sort_values(by='similar_val', ascending=False)
    return idx

In [None]:
content_rec('Shingeki no Kyojin').head(10)

Recommend Similar anime

In [None]:
# alpha = weight for cosine similarity (synopsis + genre)
# beta = weight for rating

def get_similar_anime(title, n, alpha, beta):
    idx = anime_df.index[anime_df['name'] == title][0]

    weighted_scores = []

    for i in range(len(anime_df)):
        if i != idx:
            weighted_score = alpha * cosine_sim[idx][i] + beta * anime_df['score'][i]
            weighted_scores.append((i, weighted_score))
    
    # Sort the anime based on the weighted scores
    weighted_scores = sorted(weighted_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the top n similar anime
    sim_anime_indices = [i[0] for i in weighted_scores if anime_df['name'][i[0]] != title][:n]
    
    # Return the titles of the top n similar anime
    return anime_df['name'].iloc[sim_anime_indices]

In [None]:
get_similar_anime("Shingeki no Kyojin", 10, alpha=0.5, beta=0.5)

3                  Fullmetal Alchemist Brotherhood
752                           Hunter x Hunter 2011
753                                     SteinsGate
751                                  Kimi no Na wa
750             Shingeki no Kyojin Season 3 Part 2
11191    Quiz de Manabu Pinocchio no Koutsuu Ansen
749                                       Gintama°
748                           Ginga Eiyuu Densetsu
747                      3gatsu no Lion 2nd Season
746                                 Koe no Katachi

In [None]:
user_ratings_df = pd.read_csv("reviews.csv")
user_ratings_df.head(5)

1.2 Merging the datasets

In [None]:
# remove unwanted columns
user_ratings_df.drop(['uid', 'link'], axis=1, inplace=True)
user_ratings_df.head(5)

In [None]:
# change profile names into unique IDs (i.e. integers)
user_ratings_df.profile = pd.factorize(user_ratings_df.profile)[0]
user_ratings_df.rename(columns={'profile': 'user_id'}, inplace=True)
user_ratings_df.head(10)

In [None]:
merged_anime_reviews_df = pd.merge(anime_df, user_ratings_df, on='anime_uid')
merged_anime_reviews_df.rename(columns={'score_x':'avg_rating', 'text': 'review_text'},inplace=True)

merged_anime_reviews_df.drop('review_text', axis=1, inplace=True)

merged_anime_reviews_df.head()

Categorical encoding -> Separating genres into their own respective column

In [None]:
# replace the characters "[]'" with an empty space as the genre column is already of type string
# merged_anime_reviews_df['genre'] = merged_anime_reviews_df['genre'].str.replace("'", "", regex=False)
# merged_anime_reviews_df['genre'] = merged_anime_reviews_df['genre'].str.replace("[", "", regex=False)
# merged_anime_reviews_df['genre'] = merged_anime_reviews_df['genre'].str.replace("]", "", regex=False)

# merged_anime_reviews_df.head(5)

Clean data -> drop duplicates + NaN values

In [None]:
# merged_anime_reviews_df.drop_duplicates(inplace=True)
# merged_anime_reviews_df.dropna(inplace=True)