In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
anime_data_cleaned = pd.read_csv('csv/2020/anime_2020_cleaned.csv')
rating_data = pd.read_csv('csv/2020/rating_2020.csv')

In [4]:
anime_name = pd.read_csv('csv/2020/anime_2020_name.csv')

In [5]:
counts = rating_data['user_id'].value_counts()
rating_data = rating_data[rating_data['user_id'].isin(counts[counts >= 200].index)]
rating_data.rename(columns={'anime_id':'MAL_ID'}, inplace=True)

In [37]:
# exclude anime from rating data if the id is not in anime_data_cleaned
rating_data = rating_data[rating_data['MAL_ID'].isin(anime_data_cleaned['MAL_ID'])]

In [6]:
anime_rating_data = anime_data_cleaned.merge(rating_data, on='MAL_ID', how='inner')
anime_ratings = anime_rating_data[['user_id', 'MAL_ID', 'rating']]

## Item-based collaborative filtering

In [38]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets based on the users
train_data, test_data = train_test_split(rating_data, test_size=0.2, random_state=42)

In [39]:
from scipy.sparse import csr_matrix
anime_user_matrix = train_data.pivot_table(index='MAL_ID', columns='user_id', values='rating').fillna(0)
train_data_sparse = csr_matrix(anime_user_matrix.fillna(0).values)

In [40]:
from sklearn.cluster import KMeans

# Cluster the users based on the sparse matrix
kmeans = KMeans(n_clusters=10, random_state=42).fit(train_data_sparse)

# Add the cluster labels to the user-item matrix
anime_user_matrix['cluster'] = kmeans.labels_

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
def predict_ratings_item_based(name):
    anime_id = anime_name[anime_name['Name'] == name]['MAL_ID'].values[0]

    # Get the user_ids that the anime has actually been rated
    user_ids = test_data.loc[test_data['MAL_ID'] == anime_id]['user_id'].values

    # Get the cluster of the anime (e.g., 5)
    anime_cluster = anime_user_matrix['cluster'].loc[anime_user_matrix['cluster'].index == anime_id].values[0]
    
    # Get the users in the same cluster (e.g., [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    animes_in_cluster = anime_user_matrix[anime_user_matrix['cluster'] == anime_cluster].index
    
    # Get the ratings of the animes in the same cluster
    ratings_of_animes_in_cluster = train_data_sparse[anime_user_matrix.index.isin(animes_in_cluster)]
    
    row_index = anime_user_matrix.index.get_loc(anime_id)

    # Calculate the similarity between the user and the animes in the same cluster
    similarity = cosine_similarity(train_data_sparse[row_index], ratings_of_animes_in_cluster)

    # Get the ratings for the specified user_ids that exist in train_data_sparse
    # It is actually searching it from anime_user_matrix but its fine because train_data_sparse is a csr_matrix made from anime_user_matrix
    common_user_ids = set(user_ids).intersection(set(anime_user_matrix.index.values))
    common_user_ids = np.array(sorted(list(common_user_ids)))
    
    common_user_ids_idx  = [anime_user_matrix.columns.get_loc(user_id) for user_id in common_user_ids]
    
    # Use this in case if you want to get all the user_ids from train_data_sparse
    user_ids_idx  = [anime_user_matrix.columns.get_loc(user_id) for user_id in user_ids]

    # get index of users_in_cluster, common_user_ids
    animes_in_cluster_idx = [anime_user_matrix.index.get_loc(anime_id) for anime_id in animes_in_cluster]

    # Get the ratings for the specified user_ids
    user_ratings = np.asarray(train_data_sparse[animes_in_cluster_idx, :][:, user_ids_idx].todense())

    # Get the top-k similar animes
    top_similar_animes = np.argsort(similarity)[0][-1]
    
    # Calculate the predicted ratings for the specified user_ids
    predicted_ratings = user_ratings.dot(similarity[0][top_similar_animes]) / np.sum(np.abs(similarity[0][top_similar_animes]))
    
    return predicted_ratings[0]


In [25]:
predicted_ratings = predict_ratings_item_based('Naruto')
predicted_ratings, predicted_ratings.shape

(array([0., 0., 0., ..., 0., 0., 0.]), (11197,))

In [32]:
id = anime_name[anime_name['Name'] == 'Naruto']['MAL_ID'].values[0]
actual_ratings = test_data.loc[test_data['MAL_ID']==id]['rating'].values
actual_ratings, actual_ratings.shape

(array([10,  7,  6, ...,  9,  9,  9]), (11197,))

In [42]:
def calculate_mae(predicted_ratings, actual_ratings):
    return np.mean(np.abs(predicted_ratings - actual_ratings))

In [43]:
calculate_mae(predicted_ratings, actual_ratings)

7.611592390818969

### predict_ratings_item_based_v2

In this version, we are finding top_k similar animes instead of just the most similar item

In [99]:
def predict_ratings_item_based_v2(name, k_similar_animes=5):
    anime_id = anime_name[anime_name['Name'] == name]['MAL_ID'].values[0]

    # Get the user_ids that the anime has actually been rated
    user_ids = test_data.loc[test_data['MAL_ID'] == anime_id]['user_id'].values

    # Get the cluster of the anime (e.g., 5)
    anime_cluster = anime_user_matrix['cluster'].loc[anime_user_matrix['cluster'].index == anime_id].values[0]
    
    # Get the users in the same cluster (e.g., [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    animes_in_cluster = anime_user_matrix[anime_user_matrix['cluster'] == anime_cluster].index
    
    # Get the ratings of the animes in the same cluster
    ratings_of_animes_in_cluster = train_data_sparse[anime_user_matrix.index.isin(animes_in_cluster)]
    
    row_index = anime_user_matrix.index.get_loc(anime_id)

    # Calculate the similarity between the user and the animes in the same cluster
    similarity = cosine_similarity(train_data_sparse[row_index], ratings_of_animes_in_cluster)

    # Get the ratings for the specified user_ids that exist in train_data_sparse
    # It is actually searching it from anime_user_matrix but its fine because train_data_sparse is a csr_matrix made from anime_user_matrix
    common_user_ids = set(user_ids).intersection(set(anime_user_matrix.index.values))
    common_user_ids = np.array(sorted(list(common_user_ids)))
    
    common_user_ids_idx  = [anime_user_matrix.columns.get_loc(user_id) for user_id in common_user_ids]
    
    # Use this in case if you want to get all the user_ids from train_data_sparse
    user_ids_idx  = [anime_user_matrix.columns.get_loc(user_id) for user_id in user_ids]

    # get index of users_in_cluster, common_user_ids
    animes_in_cluster_idx = [anime_user_matrix.index.get_loc(anime_id) for anime_id in animes_in_cluster]

    # Get the ratings for the specified user_ids
    user_ratings = np.asarray(train_data_sparse[animes_in_cluster_idx, :][:, user_ids_idx].todense())

    predicted_ratings = np.zeros(len(user_ids_idx))
    
    # Get the top-k similar animes
    top_similar_animes_indices = np.argsort(similarity)[0][-k_similar_animes:]

    print(user_ids_idx)
    print(top_similar_animes_indices)

    # Loop through the top-k similar users
    for item_idx in top_similar_animes_indices:
        # Get the similarity score of the user
        similarity_score = similarity[0][item_idx]
        
        # Add the weighted rating of each user to the predicted ratings
        for j, user_idx in enumerate(user_ids_idx):
            anime_actual_idx = animes_in_cluster_idx[item_idx]
            # print(anime_actual_idx)
            user_id = anime_user_matrix.columns[user_idx]
            # print(user_id)
            user_rating = anime_user_matrix.iloc[anime_actual_idx][user_id]
            predicted_ratings[j] += similarity_score * user_rating

    # Normalize the predicted ratings by the sum of the absolute values of the top-k similarities
    predicted_ratings /= np.sum(np.abs(similarity[0][top_similar_animes_indices]))

    predicted_ratings = np.round(predicted_ratings,1)
    
    return predicted_ratings

In [100]:
predicted_ratings = predict_ratings_item_based_v2('Naruto')
predicted_ratings, predicted_ratings.shape

[25639, 69500, 22416, 65359, 24410, 3489, 41836, 79179, 24243, 65427, 59688, 32091, 64360, 12895, 45233, 20842, 26869, 50827, 82859, 48029, 58848, 7527, 681, 78846, 89059, 20635, 55735, 24212, 67701, 27639, 69037, 52057, 57427, 67897, 87082, 18398, 93884, 79901, 27850, 68498, 41672, 87305, 54836, 66822, 3629, 74191, 63375, 33113, 4176, 90637, 74718, 69254, 78749, 29405, 32940, 92059, 12972, 67419, 44145, 22789, 89396, 77033, 44663, 62100, 18321, 32553, 83943, 25220, 50513, 12059, 37309, 61412, 54626, 13085, 56858, 74678, 85138, 41959, 67936, 59277, 21220, 9885, 37333, 26475, 26713, 43929, 79823, 29797, 62533, 57054, 90990, 87100, 6781, 49913, 84863, 22828, 43747, 66387, 8071, 46706, 30802, 39060, 63185, 69994, 14788, 7942, 29760, 3647, 5097, 44998, 63275, 18666, 35909, 52320, 84311, 6688, 47777, 89047, 16702, 26693, 607, 67336, 46082, 29741, 26492, 60676, 35345, 59199, 86906, 35949, 10972, 94181, 2519, 81607, 85453, 71118, 20812, 66879, 76358, 10322, 37394, 62611, 94075, 8121, 62507, 1

KeyboardInterrupt: 

In [24]:
id = anime_name[anime_name['Name'] == 'Naruto']['MAL_ID'].values[0]
actual_ratings = test_data.loc[test_data['MAL_ID']==id]['rating'].values
actual_ratings, actual_ratings.shape

(array([10,  7,  6, ...,  9,  9,  9]), (11197,))

In [17]:
def calculate_mae(predicted_ratings, actual_ratings):
    return np.mean(np.abs(predicted_ratings - actual_ratings))

In [18]:
calculate_mae(predicted_ratings, actual_ratings)

7.611592390818969

## Calculate similarity between all animes

In [45]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity between items
anime_similarity = cosine_similarity(anime_user_matrix)

In [None]:
# Generate recommendations using the similarity scores
norm = np.abs(anime_similarity).sum(axis=1)
predicted_anime_ratings = anime_similarity.dot(anime_user_matrix.values) / norm[:, np.newaaxis]

In [15]:
name = 'Naruto'
# get MAL_ID
anime_name[anime_name['Name'] == name]['MAL_ID'].values[0]

20

In [18]:
train_data.loc[train_data['MAL_ID'] == 20]

Unnamed: 0,user_id,MAL_ID,rating
30537002,187850,20,7
12200520,74712,20,3
45917065,281746,20,9
12703863,77893,20,8
23094810,141908,20,8
...,...,...,...
10465284,64078,20,10
7597911,46581,20,7
6039979,37088,20,7
52115670,319617,20,8


In [21]:
row_index = anime_user_matrix.index.get_loc(20)

In [None]:
# Write a function to find top k similar anime from anime_pivot
def recommend_item_base(anime_name, k=10):
    anime_index = anime_user_matrix.index.get_loc(anime_name)
    anime_indices = np.argsort(anime_similarity[anime_index])[-k:][::-1]
    similar_anime_ids = anime_user_matrix.index[anime_indices]
    return similar_anime_ids

In [None]:
recommend_item_base('Naruto')