# Import Necessary Libs

In [1]:
# Single seed value shared by the NumPy and stdlib random generators seeded below.
seed = 999

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import random

In [3]:
# Seed NumPy's global RNG and Python's `random` module so random steps repeat across runs.
np.random.seed(seed)
random.seed(seed)

## Load Dataset

In [4]:
# Load the anime-list ratings CSV; the path is relative to the notebook's working directory.
# NOTE(review): the saved output of this cell is a ParserError ("Calling read(nbytes) on
# source failed") and all later cells show no execution count — confirm the file is present
# and fully readable, then re-run the notebook from the top.
full_df = pd.read_csv('../animelists_cleaned.csv')
full_df

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

Drop rows where username is missing

In [None]:
# Keep only the rows that actually carry a username.
has_username = full_df['username'].notna()
full_df = full_df[has_username]

Drop rows where the score is zero: a zero means the show is unrated, so these rows carry no rating information.

In [None]:
# Only strictly positive scores are kept.
is_rated = full_df['my_score'].gt(0)
full_df = full_df.loc[is_rated]

In [None]:
# Summary statistics of the filtered frame (rows with a username and a positive score).
full_df.describe()

## Exploratory data analysis

In [None]:
# Set the default figure size to 10x10 for the plots that follow.
sns.set(rc = {'figure.figsize':(10,10)})

In [None]:
# Rows per username, i.e. how many animes each user has scored
user_anime_counts = full_df['username'].value_counts()

# Histogram of those per-user counts, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(user_anime_counts, bins=100, edgecolor='k', alpha=0.7)
ax.set_title('Distribution of Number of Animes Watched by Users')
ax.set_xlabel('Number of Animes Watched')
ax.set_ylabel('Number of Users')
ax.grid(axis='y', alpha=0.75)
plt.show()

In [None]:
# Rows per username, most frequent first (the same counts stored in user_anime_counts above).
full_df['username'].value_counts()

In [None]:
# Box plot of how many animes each user has scored.
# The original title and axis labels described prices per borough (copied from another
# notebook) and the legend call had no labelled artists to show, so both are corrected here.
# `x=` is passed explicitly so the counts are drawn as a single horizontal box, and a single
# colour from the "Set2" palette is used because there is no `hue` grouping.
sns.boxplot(x=user_anime_counts, color=sns.color_palette("Set2")[0])
plt.title('Boxplot of Number of Animes Watched per User', fontsize = 15)
plt.xlabel('Number of Animes Watched')
plt.show()

In [None]:
# Displays the frame without the rows of user "karthiga".
# The result is not assigned, so full_df itself is left unchanged by this cell.
full_df[full_df['username'] != "karthiga"]

Keep removing users at random until a user threshold is reached

In [None]:
# NOTE(review): disabled earlier approach — it removed one shuffled username per pass until
# only `limit` unique users remained. The next cell samples the users to keep in one step.
# usernames = full_df['username'].unique()
# np.random.shuffle(usernames)
# limit = 10000
# username_ind = 0
# while len(full_df['username'].unique()) > limit:
#     full_df = full_df[full_df['username'] != usernames[username_ind]]
#     username_ind += 1

In [None]:
# Upper bound on the number of users kept for modelling.
# NOTE(review): the earlier comments said 10,000 while the code used 1000; the code's
# value is kept here — confirm which one is intended.
NUM_USERS_TO_KEEP = 1000

# Get unique users
unique_users = full_df['username'].unique()

# Randomly select the users to keep, without replacement (at most NUM_USERS_TO_KEEP)
num_users_to_keep = min(len(unique_users), NUM_USERS_TO_KEEP)
selected_users = np.random.choice(unique_users, size=num_users_to_keep, replace=False)

# Retain only rows of the selected users. `.copy()` makes anime_df an independent frame,
# so adding columns to it later does not raise SettingWithCopyWarning.
anime_df = full_df[full_df['username'].isin(selected_users)].copy()

print(f"Original number of unique users: {len(unique_users)}")
print(f"Number of unique users after filtering: {anime_df['username'].nunique()}")
print(anime_df.head())

To keep the user-item rating matrix built later as small as possible, the anime IDs are remapped to a contiguous, zero-based index range.

In [None]:
# Distinct anime IDs present in the sampled data
unique_ids = anime_df['anime_id'].unique()
# Position -> anime ID: the sorted IDs, so position k holds the k-th smallest ID
indToId = np.sort(unique_ids)
# Anime ID -> position: the inverse lookup used to address matrix columns
idToInd = {anime_id: position for position, anime_id in enumerate(indToId)}

Adds user ID column

In [None]:
# Give every username a dense integer code (0, 1, 2, ... in order of first appearance),
# which later serves as the row index of the rating matrices.
# `assign` returns a new frame instead of writing a column into a filtered slice of
# full_df, which is what triggers pandas' SettingWithCopyWarning.
anime_df = anime_df.assign(user_id=pd.factorize(anime_df['username'])[0])

# Split Dataset into Training Dataset and Testing Dataset

In [None]:
# Hold out 20% of the rating rows for evaluation.
# random_state pins the shuffle to the notebook seed, so re-running this cell on its own
# reproduces the same split instead of depending on the current global RNG state.
train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=seed)
train_df, test_df

In [None]:
# Show the first training row as a namedtuple (prints nothing if the frame is empty).
first_row = next(train_df.itertuples(), None)
if first_row is not None:
    print(first_row)

In [None]:
# Field names of the row tuples when the index is left out.
print(next(train_df.itertuples(index=False))._fields)


In [None]:
# Inspect the rows of a single user, selected by factorized user_id.
# NOTE(review): user_id is produced by pd.factorize over at most 1000 sampled users, so
# 40352 looks out of range and this would display an empty frame — confirm the intended id.
anime_df[(anime_df['user_id']==40352)] #  & (anime_df['anime_id']==indToId[3])

In [None]:
# Dimensions of the user-item matrices built below: distinct users (rows) and distinct animes (columns).
n_users = anime_df['user_id'].nunique()
n_items = anime_df['anime_id'].nunique()

In [None]:
# Display the training rows ordered by user_id; the sorted result is not stored.
train_df.sort_values(by='user_id')

In [None]:
def _to_rating_matrix(ratings_df):
    """Return an (n_users x n_items) DataFrame of scores built from `ratings_df`.

    Rows are addressed by `user_id`, columns by the compact anime index from
    `idToInd`; cells without a rating stay 0.
    """
    dense = np.zeros((n_users, n_items))
    for record in ratings_df.itertuples():
        dense[record.user_id, idToInd[record.anime_id]] = record.my_score
    return pd.DataFrame(dense)


# Training and testing user-item matrices
train_ds = _to_rating_matrix(train_df)
test_ds = _to_rating_matrix(test_df)

train_ds, test_ds

# Fitting the Algorithm

## User-based

### Compute Pearson Correlation Coefficient for Each Pair of Users in Training Dataset

In [None]:
# Significance-weighting threshold: pairs with fewer than GAMMA co-rated items are scaled down.
GAMMA = 30
# Guards the divisions below against a zero denominator.
EPSILON = 1e-9

# Per-user quantities are computed once here instead of once per user pair.
user_ratings = train_ds.values
user_rated = user_ratings > 0
# Mean over the items each user actually rated (0 marks "no rating").
# Counting the mask gives the same divisor as summing the [0, 1]-clipped scores when
# every rating is at least 1 — assumed for this dataset, confirm if scores can be fractional.
user_means = user_ratings.sum(axis=1) / (np.count_nonzero(user_rated, axis=1) + EPSILON)

np_user_pearson_corr = np.zeros((n_users, n_users))

# The coefficient is symmetric, so only pairs with j >= i are computed and then mirrored.
for i in range(n_users):
    for j in range(i, n_users):

        # items rated by both users; skip the pair if there are none
        corated = user_rated[i] & user_rated[j]
        n_corated = np.count_nonzero(corated)
        if n_corated == 0:
            continue

        # deviations from each user's own mean, restricted to the co-rated items
        dev_i = user_ratings[i, corated] - user_means[i]
        dev_j = user_ratings[j, corated] - user_means[j]

        # Pearson correlation over the co-rated items
        norm_i = np.sqrt(np.sum(np.square(dev_i)))
        norm_j = np.sqrt(np.sum(np.square(dev_j)))
        sim = np.sum(dev_i * dev_j) / (norm_i * norm_j + EPSILON)

        # significance weighting
        weighted_sim = (min(n_corated, GAMMA) / GAMMA) * sim

        np_user_pearson_corr[i, j] = weighted_sim
        np_user_pearson_corr[j, i] = weighted_sim

np_user_pearson_corr

### Predict Ratings

In [None]:
np_predictions = np.zeros((n_users, n_items))

# Neighbourhood size: number of most similar users consulted per prediction.
K = 50
# Guards the divisions below against a zero denominator.
EPSILON = 1e-9

train_ratings = train_ds.values
test_ratings = test_ds.values

# Mean training rating of every user over the items they rated, computed once.
user_means = np.sum(train_ratings, axis=1) / (np.sum(np.clip(train_ratings, 0, 1), axis=1) + EPSILON)

for i in range(n_users):
    # items of user i that need a prediction; skip users without test ratings
    test_items = np.flatnonzero(test_ratings[i] > 0)
    if test_items.size == 0:
        continue

    # Top-K most similar users, with user i removed explicitly. Dropping the last entry
    # of the sorted row is not reliable: significance weighting can push a user's
    # self-similarity below their similarity to other users.
    candidates = np.delete(np.arange(n_users), i)
    sim_user_ids = candidates[np.argsort(np_user_pearson_corr[i, candidates])[-K:]]

    # similarity values, rating rows and mean ratings of those neighbours
    sim_val = np_user_pearson_corr[i, sim_user_ids]
    sim_users = train_ratings[sim_user_ids]
    sim_user_mean = user_means[sim_user_ids]

    for j in test_items:
        # only neighbours who rated item j contribute
        mask_rated_j = sim_users[:, j] > 0
        weights = sim_val[mask_rated_j]

        # sim(u, v) * (r_vj - mean_v)
        weighted_dev = weights * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])

        # Normalise by the sum of absolute similarities so that negative weights cannot
        # cancel the denominator or flip the sign of the adjustment.
        estimate = user_means[i] + np.sum(weighted_dev) / (np.sum(np.abs(weights)) + EPSILON)
        np_predictions[i, j] = np.clip(estimate, 0, 10)
    

### Evaluation

#### Root Mean Squared Error (RMSE)

In [None]:
#==================RMSE on Testing set===================
labels = test_ds.values

# scores clipped into [0, 1]: zero wherever the test set has no rating
weight = np.clip(labels, 0, 1)

# squared error, zeroed for the cells that hold no test rating
squared_error = weight * np.square(np_predictions - labels)

# RMSE over the rated cells only
RMSE = np.sqrt(squared_error.sum() / weight.sum())

print("RMSE on Tesing set (User-based): " + str(RMSE))

#### Mean Absolute Error (MAE)

In [None]:
#==================MAE on Testing set===================#
labels = test_ds.values

# scores clipped into [0, 1]: zero wherever the test set has no rating
weight = np.clip(labels, 0, 1)

# absolute error, zeroed for the cells that hold no test rating
abs_error = weight * np.abs(np_predictions - labels)

# MAE over the rated cells only
MAE = abs_error.sum() / weight.sum()

print("MAE on Tesing set (User-based): " + str(MAE))

In [None]:
# Inspect the prediction matrix; cells without a test rating were never written and stay 0.
np_predictions

In [None]:
# Inspect the held-out rating matrix for comparison with the predictions.
test_ds

## Item-based

### Compute Pearson Correlation Coefficient for Each Pair of Items in Training Dataset

# Significance-weighting threshold: pairs with fewer than DELTA co-rating users are scaled down.
DELTA = 25
# Guards the divisions below against a zero denominator.
EPSILON = 1e-9

# One row per item, one column per user. The transpose is built once here rather than
# on every pass of the loops.
item_ratings = np.ascontiguousarray(train_ds.values.T)
item_rated = item_ratings > 0
# Mean over the users who actually rated each item (0 marks "no rating").
# Counting the mask gives the same divisor as summing the [0, 1]-clipped scores when
# every rating is at least 1 — assumed for this dataset, confirm if scores can be fractional.
item_means = item_ratings.sum(axis=1) / (np.count_nonzero(item_rated, axis=1) + EPSILON)

np_item_pearson_corr = np.zeros((n_items, n_items))

# The coefficient is symmetric, so only pairs with j >= i are computed and then mirrored.
for i in range(n_items):
    for j in range(i, n_items):

        # users who rated both items; skip the pair if there are none
        corated = item_rated[i] & item_rated[j]
        n_corated = np.count_nonzero(corated)
        if n_corated == 0:
            continue

        # deviations from each item's own mean, restricted to the co-rating users
        dev_i = item_ratings[i, corated] - item_means[i]
        dev_j = item_ratings[j, corated] - item_means[j]

        # Pearson correlation over the co-rating users
        norm_i = np.sqrt(np.sum(np.square(dev_i)))
        norm_j = np.sqrt(np.sum(np.square(dev_j)))
        sim = np.sum(dev_i * dev_j) / (norm_i * norm_j + EPSILON)

        # significance weighting
        weighted_sim = (min(n_corated, DELTA) / DELTA) * sim

        np_item_pearson_corr[i, j] = weighted_sim
        np_item_pearson_corr[j, i] = weighted_sim

np_item_pearson_corr

### Predict Ratings

np_predictions = np.zeros((n_users, n_items))

# Neighbourhood size: number of most similar items consulted per prediction.
K = 10
# Guards the divisions below against a zero denominator.
EPSILON = 1e-9

# One row per item, one column per user; built once instead of once per test rating.
item_ratings = np.ascontiguousarray(train_ds.values.T)
test_ratings = test_ds.values

# Mean training rating of every item over the users who rated it, computed once.
item_means = np.sum(item_ratings, axis=1) / (np.sum(np.clip(item_ratings, 0, 1), axis=1) + EPSILON)

for j in range(n_items):
    # users whose rating of item j is held out; skip items without test ratings
    test_users = np.flatnonzero(test_ratings[:, j] > 0)
    if test_users.size == 0:
        continue

    # Top-K most similar items, with item j removed explicitly. Dropping the last entry
    # of the sorted row is not reliable: significance weighting can push an item's
    # self-similarity below its similarity to other items.
    candidates = np.delete(np.arange(n_items), j)
    sim_item_ids = candidates[np.argsort(np_item_pearson_corr[j, candidates])[-K:]]

    # similarity values, rating rows and mean ratings of those neighbours
    sim_val = np_item_pearson_corr[j, sim_item_ids]
    sim_items = item_ratings[sim_item_ids]
    sim_item_mean = item_means[sim_item_ids]

    for i in test_users:
        # only neighbour items that user i has rated contribute
        rated_by_i = sim_items[:, i] > 0
        weights = sim_val[rated_by_i]

        # sim(j, k) * (r_ik - mean_k)
        weighted_dev = weights * (sim_items[rated_by_i, i] - sim_item_mean[rated_by_i])

        # Normalise by the sum of absolute similarities so that negative weights cannot
        # cancel the denominator or flip the sign of the adjustment.
        estimate = item_means[j] + np.sum(weighted_dev) / (np.sum(np.abs(weights)) + EPSILON)

        # Upper bound is 10, matching the user-based predictions; the previous bound of 5
        # capped every prediction at half of that range.
        np_predictions[i, j] = np.clip(estimate, 0, 10)
    

### Evaluation

#### Mean Absolute Error (MAE)

#==================MAE on Testing set===================#
labels = test_ds.values

# scores clipped into [0, 1]: zero wherever the test set has no rating
weight = np.clip(labels, 0, 1)

# absolute error, zeroed for the cells that hold no test rating
abs_error = weight * np.abs(np_predictions - labels)

# MAE over the rated cells only
MAE = abs_error.sum() / weight.sum()

print("MAE on Tesing set (Item-based): " + str(MAE))

#### Root Mean Squared Error (RMSE)

#==================RMSE on Testing set===================#
labels = test_ds.values

# scores clipped into [0, 1]: zero wherever the test set has no rating
weight = np.clip(labels, 0, 1)

# squared error, zeroed for the cells that hold no test rating
squared_error = weight * np.square(np_predictions - labels)

# RMSE over the rated cells only
RMSE = np.sqrt(squared_error.sum() / weight.sum())

print("RMSE on Tesing set (Item-based): " + str(RMSE))