In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
anime_data_cleaned = pd.read_csv('csv/2020/anime_2020_cleaned.csv')

In [3]:
anime_data_cleaned['Genres'] = anime_data_cleaned['Genres'].apply(lambda x: x.replace(' ', '').replace(',', ' '))

In [4]:
rating_data = pd.read_csv('csv/2020/rating_2020.csv')
counts = rating_data['user_id'].value_counts()
rating_data = rating_data[rating_data['user_id'].isin(counts[counts >= 200].index)]
rating_data.rename(columns={'anime_id':'MAL_ID'}, inplace=True)

In [104]:
# pick random user id from rating_data
user_id = rating_data['user_id'].sample(1).values[0]
user_id

334442

## Basic Content-based by Genre using CountVectorizer

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Create a CountVectorizer to one-hot encode the genres
vectorizer = CountVectorizer()
genre_matrix = vectorizer.fit_transform(anime_data_cleaned['Genres'])

In [7]:
# Compute the cosine similarity between anime based on their genre vectors
genre_similarity = cosine_similarity(genre_matrix, genre_matrix)

In [73]:
def get_recommendations(anime_title, similarity_matrix, anime_data, k=10):
    index = anime_data[anime_data['Name'] == anime_title].index[0]
    scores = list(enumerate(similarity_matrix[index]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    scores = scores[1:k+1]  # exclude the anime itself
    indices = [i[0] for i in scores]
    similarity_scores = [i[1] for i in scores]
    # print(anime_data.iloc[indices]['Name'])
    return anime_data.iloc[indices]['MAL_ID'], similarity_scores

In [89]:
title = 'Naruto'
get_recommendations(title, genre_similarity, anime_data_cleaned)

(794    1735
 0         1
 1         6
 2         7
 3         8
 4        15
 5        16
 6        17
 7        18
 8        19
 Name: MAL_ID, dtype: int64,
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

### Predict ratings

In [94]:
# Function to predict the rating of an anime for a user based on content-based recommendations
def predict_anime_rating_content_based(user_id, anime_title, similarity_matrix, anime_data, k=10, similarity_threshold=0.1):
    # Get the top-k most similar anime for the given anime_title
    recommended_anime_indices, similarity_scores = get_recommendations(anime_title, similarity_matrix, anime_data, k)

    # Get the user's ratings for the anime they have watched in the training set
    user_ratings = rating_data[rating_data['user_id'] == user_id]
    
    # Create a dictionary to store the predicted ratings for the recommended anime
    predicted_ratings = {}

    for index, similarity_score in zip(recommended_anime_indices, similarity_scores):
        anime_name = anime_data.loc[index, 'Name']
        anime_id = anime_data.loc[index, 'MAL_ID']
        
        # Check if the user has rated the recommended anime
        if anime_id in user_ratings['MAL_ID'].values:
            # Use the user's actual rating for the recommended anime
            user_rating = user_ratings.loc[user_ratings['MAL_ID'] == anime_id, 'rating'].values[0]
        else:
            # Use the user's average rating as a fallback
            user_average_rating = user_ratings['rating'].mean()
            user_rating = user_average_rating
        
        if similarity_score >= similarity_threshold:
            predicted_rating = similarity_score * user_rating / sum(similarity_scores)
        else:
            predicted_rating = user_rating
        
        predicted_ratings[anime_name] = predicted_rating

    return predicted_ratings


In [106]:
predict_anime_rating_content_based(user_id, 'Naruto', genre_similarity, anime_data_cleaned)

{'Dragon Ball Kai': 6.578651685393258,
 'Trigun': 6.578651685393258,
 'Hungry Heart: Wild Striker': 6.578651685393258,
 'Initial D Fourth Stage': 6.578651685393258,
 'Monster': 6.578651685393258,
 'Texhnolyze': 6.578651685393258,
 'Trinity Blood': 6.578651685393258,
 'Yakitate!! Japan': 6.578651685393258,
 'Zipang': 6.578651685393258,
 'Neon Genesis Evangelion': 9}

## Content-based by Genre and Studio

In [90]:
anime_data_cleaned['Genres_and_Studio'] = anime_data_cleaned['Genres'] + ', ' + anime_data_cleaned['Studios']

In [91]:
# Create a CountVectorizer to one-hot encode the genres
vectorizer = CountVectorizer()
genre_studio_matrix = vectorizer.fit_transform(anime_data_cleaned['Genres_and_Studio'])

In [92]:
# Compute the cosine similarity between anime based on their genre vectors
genre_studio_similarity = cosine_similarity(genre_studio_matrix, genre_studio_matrix)

In [14]:
title = 'Naruto'
get_recommendations(title, genre_studio_similarity, anime_data_cleaned)

794                  Naruto: Shippuuden
162                      Rekka no Honoo
3827    Boruto: Naruto Next Generations
184                              Bleach
1739                     Moeru! Oniisan
272                     Yuu☆Yuu☆Hakusho
1205           Chiisana Kyojin Microman
769                        Duel Masters
1496                Duel Masters Charge
3545                  Duel Masters VSRF
Name: Name, dtype: object

In [20]:
anime_titles = ['Naruto', 'Rekka no Honoo']
# for loop titles and print Studio and Genres column in anime_data_cleaned for each title
for title in anime_titles:
    print(title)
    print(anime_data_cleaned[anime_data_cleaned['Name'] == title][['Studios', 'Genres']])
    print()


Naruto
          Studios                                             Genres
9  Studio Pierrot  Action Adventure Comedy SuperPower MartialArts...

Rekka no Honoo
            Studios                                           Genres
162  Studio Pierrot  Action Adventure MartialArts Shounen SuperPower



In [25]:
get_recommendations('InuYasha', genre_studio_similarity, anime_data_cleaned)

1831                      InuYasha: Kanketsu-hen
4811    Hanyou no Yashahime: Sengoku Otogizoushi
737                               Kekkaishi (TV)
125                            Tsubasa Chronicle
31                                 Chrno Crusade
538                            Majutsushi Orphen
590                                  Slayers Try
1817                             Kuroshitsuji II
2879                Shingeki no Bahamut: Genesis
3328            Shingeki no Bahamut: Virgin Soul
Name: Name, dtype: object

In [107]:
predict_anime_rating_content_based(user_id, 'Naruto', genre_studio_similarity, anime_data_cleaned)

{'Dragon Ball Kai': 0.9398073836276082,
 'Solty Rei': 0.6265382557517388,
 'Tenjou Tenge': 0.6265382557517388,
 'Tenkuu no Escaflowne': 0.6265382557517388,
 'Whistle!': 0.6265382557517388,
 'Xenosaga The Animation': 0.6265382557517388,
 'Maburaho': 0.6265382557517388,
 'Ninin ga Shinobuden': 0.6265382557517388,
 'Gankutsuou': 0.6265382557517388,
 'Ginyuu Mokushiroku Meine Liebe': 0.6265382557517388}

## Content-based with multiple titles

In [21]:
import collections
def get_recommendations_for_multiple_titles(titles, cosine_sim, anime_df, top_k=10):
    # Find the indices of the input titles
    title_indices = [anime_df[anime_df['Name'] == title].index[0] for title in titles]

    # Get the cosine similarity scores for all titles in the input list
    sim_scores = [list(enumerate(cosine_sim[title_index])) for title_index in title_indices]

    # Combine the similarity scores
    combined_scores = collections.defaultdict(float)
    for score_list in sim_scores:
        for i, score in score_list:
            combined_scores[i] += score

    # Sort the combined scores and get the top k indices
    sorted_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    top_k_indices = [i for i, _ in sorted_scores[:top_k]]

    # Get the top k recommendations based on the combined scores
    return anime_df.iloc[top_k_indices]

In [23]:
titles = ['Kill la Kill', 'Code Geass: Hangyaku no Lelouch', 'Shingeki no Kyojin', 'Tengen Toppa Gurren Lagann', 
          'Mahou Shoujo Madoka★Magica', 'Ga-Rei: Zero', 'JoJo no Kimyou na Bouken Part 5: Ougon no Kaze']
get_recommendations_for_multiple_titles(titles, genre_similarity, anime_data_cleaned, 5)

Unnamed: 0,MAL_ID,Name,Score,Genres,Episodes,Premiered,Studios,Members,Completed,Dropped rate,Genres_and_Studio
145,209,R.O.D: The TV,7.55,Action Sci-Fi Adventure Comedy SuperPower Drama,26,Fall 2003,J.C.Staff,61604,32941,0.06,Action Sci-Fi Adventure Comedy SuperPower Dram...
306,479,Ueki no Housoku,7.76,Action Adventure Comedy SuperPower Supernatura...,51,Spring 2005,Studio Deen,97521,51123,0.07,Action Adventure Comedy SuperPower Supernatura...
718,1575,Code Geass: Hangyaku no Lelouch,8.72,Action Military Sci-Fi SuperPower Drama Mecha ...,25,Fall 2006,Sunrise,1583882,1209288,0.02,Action Military Sci-Fi SuperPower Drama Mecha ...
10,21,One Piece,8.52,Action Adventure Comedy SuperPower Drama Fanta...,Unknown,Fall 1999,Toei Animation,1352724,33,0.11,Action Adventure Comedy SuperPower Drama Fanta...
118,168,s.CRY.ed,7.38,Action Sci-Fi Adventure SuperPower Drama,26,Summer 2001,Sunrise,72311,44099,0.05,"Action Sci-Fi Adventure SuperPower Drama, Sunrise"


In [24]:
get_recommendations_for_multiple_titles(titles, genre_studio_similarity, anime_data_cleaned, 5)

Unnamed: 0,MAL_ID,Name,Score,Genres,Episodes,Premiered,Studios,Members,Completed,Dropped rate,Genres_and_Studio
718,1575,Code Geass: Hangyaku no Lelouch,8.72,Action Military Sci-Fi SuperPower Drama Mecha ...,25,Fall 2006,Sunrise,1583882,1209288,0.02,Action Military Sci-Fi SuperPower Drama Mecha ...
1146,2904,Code Geass: Hangyaku no Lelouch R2,8.91,Action Military Sci-Fi SuperPower Drama Mecha,25,Spring 2008,Sunrise,1268320,1060730,0.01,"Action Military Sci-Fi SuperPower Drama Mecha,..."
118,168,s.CRY.ed,7.38,Action Sci-Fi Adventure SuperPower Drama,26,Summer 2001,Sunrise,72311,44099,0.05,"Action Sci-Fi Adventure SuperPower Drama, Sunrise"
145,209,R.O.D: The TV,7.55,Action Sci-Fi Adventure Comedy SuperPower Drama,26,Fall 2003,J.C.Staff,61604,32941,0.06,Action Sci-Fi Adventure Comedy SuperPower Dram...
306,479,Ueki no Housoku,7.76,Action Adventure Comedy SuperPower Supernatura...,51,Spring 2005,Studio Deen,97521,51123,0.07,Action Adventure Comedy SuperPower Supernatura...
