In [1]:
import pandas as pd
import numpy as np
import warnings
from scipy.sparse import csr_matrix
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
anime_data_cleaned = pd.read_csv('csv/2020/anime_2020_cleaned.csv')
rating_data = pd.read_csv('csv/2020/rating_2020.csv')

In [21]:
anime_name = pd.read_csv('csv/2020/anime_2020_name.csv')

In [5]:
rating_data.shape

(57633278, 3)

In [6]:
counts = rating_data['user_id'].value_counts()
rating_data = rating_data[rating_data['user_id'].isin(counts[counts >= 200].index)]
rating_data.rename(columns={'anime_id':'MAL_ID'}, inplace=True)

In [7]:
# exclude anime from rating data if the id is not in anime_data_cleaned
rating_data = rating_data[rating_data['MAL_ID'].isin(anime_data_cleaned['MAL_ID'])]

In [13]:
rating_data.shape

(41320341, 3)

In [8]:
anime_rating_data = anime_data_cleaned.merge(rating_data, on='MAL_ID', how='inner')

In [3]:
anime_rating_data.shape

(38541711, 12)

## User-based collaborative filtering - v2.0
- With sparse matrix to speed up the computation
- Clustering users by Kmeans

Split dataset into train and test

In [9]:
anime_ratings = anime_rating_data[['user_id', 'MAL_ID', 'rating']]

In [6]:
anime_ratings.shape

(26724793, 3)

### Split data into train and test

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets based on the users
train_data, test_data = train_test_split(rating_data, test_size=0.2, random_state=42)

In [11]:
from scipy.sparse import csr_matrix
user_item_matrix = train_data.pivot_table(index='user_id', columns='MAL_ID', values='rating').fillna(0)
train_data_sparse = csr_matrix(user_item_matrix.fillna(0).values)

In [100]:
user_item_matrix

MAL_ID,1,5,6,7,8,15,16,17,18,19,...,47398,47402,47403,47410,47614,47616,47618,48177,48456,cluster
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
6,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
19,8.0,6.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353385,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
353386,7.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
353392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
353395,0.0,8.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [98]:
train_data_sparse

<95143x16847 sparse matrix of type '<class 'numpy.float64'>'
	with 33056272 stored elements in Compressed Sparse Row format>

### Clustering users with KMeans

In [12]:
from sklearn.cluster import KMeans

# Cluster the users based on the sparse matrix
kmeans = KMeans(n_clusters=10, random_state=42).fit(train_data_sparse)

# Add the cluster labels to the user-item matrix
user_item_matrix['cluster'] = kmeans.labels_

### Predict ratings for the specific items that the user has rated in the test set

In [145]:
# pick random user_id from test_data
user_id = test_data['user_id'].sample(1).values[0]
user_id

211078

user id : 305137

In [229]:
train_data_sparse.shape

(95143, 16847)

In [238]:
train_data_sparse[95138, :].shape

(1, 16847)

In [76]:
rating_data['MAL_ID'].nunique()

4684

### predict_ratings_user_based_v2 

Finding the most similar user and predict the rating for the rated items

However, it resulted reuturning an array with many zeros (not rated). 

Assuming it happens when the similar user has not rated the item

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
def predict_ratings_user_based(user_id):
    # Get the anime_ids that the user has actually rated
    anime_ids = test_data.loc[test_data['user_id'] == user_id]['MAL_ID'].values

    # for each anime_id, get the name of anime
    # names = []
    # for anime_id in anime_ids:
    #     name = anime_name.loc[anime_name['MAL_ID']==anime_id]['Name'].values[0]
    #     names.append(name)

    # Get the cluster of the user (e.g., 5)
    user_cluster = user_item_matrix['cluster'].loc[user_item_matrix['cluster'].index == user_id].values[0]
    
    # Get the users in the same cluster (e.g., [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    users_in_cluster = user_item_matrix[user_item_matrix['cluster'] == user_cluster].index
    
    # Get the ratings of the users in the same cluster
    ratings_of_users_in_cluster = train_data_sparse[user_item_matrix.index.isin(users_in_cluster)]
    
    row_index = user_item_matrix.index.get_loc(user_id)

    # Calculate the similarity between the user and the users in the same cluster
    similarity = cosine_similarity(train_data_sparse[row_index], ratings_of_users_in_cluster)
    print('-------similarity array------')
    print(similarity, similarity.shape)

    # Get the ratings for the specified anime_ids that exist in train_data_sparse
    # It is actually searching it from user_item_matrix but its fine because train_data_sparse is a csr_matrix made from user_item_matrix
    common_anime_ids = set(anime_ids).intersection(set(user_item_matrix.index.values))
    common_anime_ids = np.array(sorted(list(common_anime_ids)))
    
    common_anime_ids_idx  = [user_item_matrix.columns.get_loc(anime_id) for anime_id in common_anime_ids]
    
    # Use this in case if you want to get all the anime_ids from train_data_sparse
    anime_ids_idx  = [user_item_matrix.columns.get_loc(anime_id) for anime_id in anime_ids]

    # get index of users_in_cluster, common_anime_ids
    users_in_cluster_idx = [user_item_matrix.index.get_loc(user_id) for user_id in users_in_cluster]

    # Get the ratings for the specified anime_ids
    anime_ratings = np.asarray(train_data_sparse[users_in_cluster_idx, :][:, anime_ids_idx].todense())

    # Get the most similar user
    top_similar_user = np.argsort(similarity)[0][-1]
    print('-------top_similar_users------')
    print(users_in_cluster[top_similar_user])

    # Calculate the predicted ratings for the specified anime_ids
    predicted_ratings = anime_ratings.dot(similarity[0][top_similar_user]) / np.sum(np.abs(similarity[0][top_similar_user]))

    return predicted_ratings[0]


### predict_ratings_user_based_v2 

In this version, we are finding top_k similar users instead of just the most similar user

In [166]:
def predict_ratings_user_based_v2(user_id, k_similar_users=10):
    # Get the anime_ids that the user has actually rated
    anime_ids = test_data.loc[test_data['user_id'] == user_id]['MAL_ID'].values

    # Get the cluster of the user (e.g., 5)
    user_cluster = user_item_matrix['cluster'].loc[user_item_matrix['cluster'].index == user_id].values[0]
    
    # Get the users in the same cluster (e.g., [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    users_in_cluster = user_item_matrix[user_item_matrix['cluster'] == user_cluster].index
    
    # Get the ratings of the users in the same cluster
    ratings_of_users_in_cluster = train_data_sparse[user_item_matrix.index.isin(users_in_cluster)]
    
    row_index = user_item_matrix.index.get_loc(user_id)

    # Calculate the similarity between the user and the users in the same cluster
    similarity = cosine_similarity(train_data_sparse[row_index], ratings_of_users_in_cluster)

    # Get the ratings for the specified anime_ids that exist in train_data_sparse
    # It is actually searching it from user_item_matrix but its fine because train_data_sparse is a csr_matrix made from user_item_matrix
    common_anime_ids = set(anime_ids).intersection(set(user_item_matrix.index.values))
    common_anime_ids = np.array(sorted(list(common_anime_ids)))
    
    # Use this in case if you want to get all the anime_ids from train_data_sparse
    anime_ids_idx  = [user_item_matrix.columns.get_loc(anime_id) for anime_id in anime_ids]

    # Get the top-k similar users
    top_similar_users_indices = np.argsort(similarity)[0][-k_similar_users:]

    # Initialize an array to store the predicted ratings
    predicted_ratings = np.zeros(len(anime_ids_idx))

    # Get the top-k similar users indices
    top_similar_users_indices = np.argsort(similarity[0])[-k_similar_users:]

    # Loop through the top-k similar users
    for user_idx in top_similar_users_indices:
        # Get the similarity score of the user
        similarity_score = similarity[0][user_idx]
        print(similarity_score)
        
        # Add the weighted rating of each user to the predicted ratings
        for j, anime_idx in enumerate(anime_ids_idx):
            user_actual_idx = users_in_cluster[user_idx]
            anime_id = user_item_matrix.columns[anime_idx]
            # print(user_actual_idx, anime_id)
            user_rating = user_item_matrix.at[user_actual_idx, anime_id]
            predicted_ratings[j] += similarity_score * user_rating

    # Normalize the predicted ratings by the sum of the absolute values of the top-k similarities
    predicted_ratings /= np.sum(np.abs(similarity[0][top_similar_users_indices]))

    predicted_ratings = np.round(predicted_ratings,1)

    return predicted_ratings


In [163]:
predicted_ratings = predict_ratings_user_based_v2(user_id)
predicted_ratings, predicted_ratings.shape

0.33387139711351094
219376 554
219376 855
219376 6201
219376 4752
219376 150
219376 355
219376 71
219376 104
219376 146
219376 9330
219376 16011
219376 5081
219376 10156
219376 2603
219376 3467
219376 1810
219376 2025
219376 9367
219376 50
219376 8536
219376 469
219376 1543
219376 4744
219376 11761
219376 957
219376 2129
219376 4224
219376 2993
219376 3974
219376 98
219376 290
219376 2926
219376 8525
0.33522510897337787
298445 554
298445 855
298445 6201
298445 4752
298445 150
298445 355
298445 71
298445 104
298445 146
298445 9330
298445 16011
298445 5081
298445 10156
298445 2603
298445 3467
298445 1810
298445 2025
298445 9367
298445 50
298445 8536
298445 469
298445 1543
298445 4744
298445 11761
298445 957
298445 2129
298445 4224
298445 2993
298445 3974
298445 98
298445 290
298445 2926
298445 8525
0.3368578636690543
244774 554
244774 855
244774 6201
244774 4752
244774 150
244774 355
244774 71
244774 104
244774 146
244774 9330
244774 16011
244774 5081
244774 10156
244774 2603
244774 3467

(array([1.2, 1.8, 3. , 4.6, 3.4, 6. , 5.3, 0.5, 2. , 0.6, 0. , 3. , 0.6,
        1.3, 2.5, 1.6, 3.3, 2.7, 3.8, 0.3, 3.8, 3.1, 2.4, 0.6, 0. , 3.5,
        3.8, 4.5, 1.5, 4.4, 3.5, 3.3, 2.7]),
 (33,))

In [148]:
actual_ratings = test_data.loc[test_data['user_id']==user_id]['rating'].values
actual_ratings, actual_ratings.shape

(array([ 8,  8,  7,  8, 10,  8,  8,  7,  7,  5,  6, 10,  4,  5,  6,  7,  8,
         4,  7,  7,  8,  8,  6,  6,  9,  7,  7,  5,  7,  9, 10,  8,  7]),
 (33,))

In [119]:
def calculate_mae(predicted_ratings, actual_ratings):
    return np.mean(np.abs(predicted_ratings - actual_ratings))

In [133]:
calculate_mae(predicted_ratings, actual_ratings)

3.3768115942028984

In [121]:
def calculate_rmse(predict_anime_ratings, actual_ratings):
    return np.sqrt(np.mean((predict_anime_ratings - actual_ratings)**2))

In [134]:
calculate_rmse(predicted_ratings, actual_ratings)

3.9122253573096746

### predict reating user based version 3
In this version, predict_ratings function just returns the predicted rating for a single item
For the mae calculateion I sample random user_id and anime_id from the test set

In [161]:
def predict_ratings_user_based_v3(user_id, anime_id, k_similar_users=5):
    # Get the cluster of the user (e.g., 5)
    user_cluster = user_item_matrix['cluster'].loc[user_item_matrix['cluster'].index == user_id].values[0]
    
    # Get the users in the same cluster (e.g., [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    users_in_cluster = user_item_matrix[user_item_matrix['cluster'] == user_cluster].index
    
    # Get the ratings of the users in the same cluster
    ratings_of_users_in_cluster = train_data_sparse[user_item_matrix.index.isin(users_in_cluster)]
    
    row_index = user_item_matrix.index.get_loc(user_id)

    # Calculate the similarity between the user and the users in the same cluster
    similarity = cosine_similarity(train_data_sparse[row_index], ratings_of_users_in_cluster)

    # Get the top k_similar_users indices
    top_similar_users_indices = np.argsort(similarity[0])[-k_similar_users:]

    # Get the index of the specified anime_id
    anime_idx = user_item_matrix.columns.get_loc(anime_id)
    
    # Initialize the predicted rating
    predicted_rating = 0

    # Compute the predicted rating for the specified anime_id
    for user_idx in top_similar_users_indices:
        user_rating = train_data_sparse[user_idx, anime_idx]
        similarity_score = similarity[0][user_idx]
        predicted_rating += similarity_score * user_rating

    predicted_rating /= np.sum(np.abs(similarity[0][top_similar_users_indices]))

    return round(predicted_rating, 1)

In [167]:
def compute_mae_random_samples(data, user_item_matrix, n_samples=100, k_similar_users=5):
    mae = 0
    rmse = 0

    for _ in range(n_samples):
        # Randomly sample a user_id and anime_id pair from the data
        sample = data.sample()
        user_id = sample['user_id'].values[0]
        anime_id = sample['MAL_ID'].values[0]

        # Make sure the user has rated the anime
        while user_item_matrix.loc[user_id, anime_id] == 0:
            sample = data.sample()
            user_id = sample['user_id'].values[0]
            anime_id = sample['MAL_ID'].values[0]

        # Predict the rating using the modified function
        predicted_rating = predict_ratings_user_based_v3(user_id, anime_id)

        # Get the actual rating from the sampled data
        actual_rating = sample['rating'].values[0]

        # Update the mean absolute error
        mae += abs(predicted_rating - actual_rating)

        # Update the root mean squared error
        rmse += (predicted_rating - actual_rating)**2

    # Calculate the mean absolute error
    mae /= n_samples
    rmse = np.sqrt(rmse / n_samples)

    return mae, rmse

In [168]:
mae, rmse = compute_mae_random_samples(rating_data, user_item_matrix, n_samples=100)
print("MAE:", mae)
print("RMSE:", rmse)

MAE: 5.632999999999999
RMSE: 6.0315255118419255


In [160]:
mae = compute_mae_random_samples(rating_data, user_item_matrix, n_samples=100, k_similar_users=1)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 6.01867868225677


### Predict ratings for top_k recommended items

In [185]:
train_data_sparse.shape

(95143, 16847)

In [175]:
def predict_ratings(user_id, k):
    # Get the cluster of the user
    # user_cluster = user_item_matrix.loc[user_id, 'cluster']
    user_cluster = user_item_matrix['cluster'].loc[user_item_matrix['cluster'].index == user_id].values[0]
    
    # Get the users in the same cluster
    users_in_cluster = user_item_matrix[user_item_matrix['cluster'] == user_cluster].index.intersection(train_data_sparse[:, 0].toarray().flatten().astype(int))
    
    # Get the ratings of the users in the same cluster
    ratings_of_users_in_cluster = train_data_sparse[users_in_cluster]
    
    # Calculate the similarity between the user and the users in the same cluster
    similarity = cosine_similarity(train_data_sparse[user_id], ratings_of_users_in_cluster)
    
    # Get the top-k similar users
    top_k_similar_users = np.argsort(similarity)[0][-k:]
    
    # Get the items rated by the top-k similar users
    items_rated_by_top_k_users = train_data_sparse[top_k_similar_users].toarray().T
    
    # Get the items not rated by the user
    unrated_items_by_user = np.isnan(user_item_matrix.loc[user_id])
    
    # Calculate the predicted rating for the unrated items
    predicted_ratings = items_rated_by_top_k_users.dot(similarity[0][top_k_similar_users]) / np.sum(similarity[0][top_k_similar_users])
    
    # Get the top-k recommended items
    top_k_items = np.argsort(predicted_ratings)[-k:]
    
    # Get the predicted ratings for the top-k items
    predicted_ratings_for_top_k_items = predicted_ratings[top_k_items]
    
    return top_k_items, predicted_ratings_for_top_k_items


In [181]:
# Top 10 recommendation and predicted ratings for user 6
top_k, predicted_ratings = predict_ratings(353385, k=10)
top_k, predicted_ratings

IndexError: row index (353385) out of range

### Evaluation

#### MAE@K

In [78]:
# Here is the problem, none of top_k animes are in the test_data
# So it doesn't work to calculate MAE by comparing train and test data
test_data.loc[test_data['user_id']==6].sort_values(by='MAL_ID', ascending=False)

Unnamed: 0,user_id,MAL_ID,rating
739,6,13675,5
912,6,11691,7
759,6,11099,8
681,6,11073,6
766,6,10622,8
825,6,10497,9
677,6,10161,8
773,6,10067,8
785,6,9846,7
703,6,9756,10


#### MAP

In [11]:
def calculate_MAP_k(user_id, k=10):
    # Get the items in the test set for the user
    test_items = test_data.loc[user_id].dropna().index
    
    # Get the top-k recommended items
    top_k_items = get_top_k_recommendations(user_id, k)
    
    # Get the indices of the recommended items that are in the test set
    relevant_items = np.intersect1d(top_k_items, test_items, assume_unique=True)
    
    # Calculate the precision at k
    precision_at_k = len(relevant_items) / k
    
    return precision_at_k