In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from tqdm import tqdm

class uuCF(object):
    """
    User-User Collaborative Filtering implementation.
    """

    def __init__(self, Y_data: np.ndarray, k: int = 40, sim_func=cosine_similarity):
        """
        Initialize the model.
        Args:
            Y_data (np.ndarray): A 2D array of shape (n_samples, 3) where each row is [user_id, item_id, rating].
            k (int): The number of nearest neighbors to consider.
            sim_func (function): Similarity function to compute user-user similarity (default: cosine_similarity).
        """

        self.Y_data = Y_data # User-item interaction data
        self.k = k # Number of neighbors
        self.sim_func = sim_func # Similarity function
        self.Ybar = None # Normalized rating data
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1 # Number of users
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1 # Number of items
        self.mu = None  # User mean ratings
        self.S = None  # User-user similarity matrix
    
    def fit(self) -> None:
        """
        Prepares the data by normalizing the ratings, creating a sparse matrix, and computing the similarity matrix.
        """

        # Extract user IDs from data
        users = self.Y_data[:, 0]

        # Copy and normalize the ratings
        self.Ybar = self.Y_data.copy()
        self.mu = np.zeros(self.n_users)

        # Compute mean rating for each user
        self.mu = np.bincount(users.astype(int), weights=self.Y_data[:, 2]) / np.bincount(users.astype(int))
        self.mu = np.nan_to_num(self.mu) # Replace NaNs with 0 for users with no ratings

        # Normalize ratings by subtracting user means
        self.Ybar[:, 2] = self.Y_data[:, 2] - self.mu[users.astype(int)]

        # Create a sparse matrix representation of the normalized ratings
        self.Ybar = sparse.coo_matrix(
            (self.Ybar[:, 2], (self.Ybar[:, 1], self.Ybar[:, 0])),
            shape=(self.n_items, self.n_users)
        ).tocsr()
        
        # Compute user-user similarity matrix
        self.S = self.sim_func(self.Ybar.T, self.Ybar.T)

    def predict_rating(self, user_id: int, item_id: int) -> float:
        """
        Predict the rating of a specific user for a specific item.
        Args:
            user_id (int): ID of the user.
            item_id (int): ID of the item.
        Returns:
            float: Predicted rating for the given user and item.
        """
        
        # Find all users who rated the target item
        item_ids = np.where(self.Y_data[:, 1] == item_id)[0].astype(int)
        users_rated_item = self.Y_data[item_ids, 0].astype(int)

        # Similarity of the target user with users who rated the item
        sim_scores = self.S[user_id, users_rated_item]

        # Get the k most similar users
        nearest_neighbors = np.argsort(sim_scores)[-self.k:]  # Top-k similarities
        nearest_sim_scores = sim_scores[nearest_neighbors]  # Similarities of nearest neighbors
        ratings_by_neighbors = self.Ybar[item_id, users_rated_item[nearest_neighbors]]  # Ratings by neighbors

        # Compute weighted average prediction
        eps = 1e-8  # Small number to avoid division by zero
        prediction = (ratings_by_neighbors * nearest_sim_scores).sum() / (np.abs(nearest_sim_scores).sum() + eps)

        # Add the user's mean rating back
        return prediction + self.mu[user_id]

In [2]:
# Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

rating_base = pd.read_csv('ml-100k/u1.base', sep='\t', names=r_cols)
rating_test = pd.read_csv('ml-100k/u1.test', sep='\t', names=r_cols)

pd.set_option('display.max_columns', None)
print('Rating base:\n', rating_base.head(), '\n')
print('Rating test:\n', rating_test.head(), '\n')

# Convert data to numpy arrays
rate_train = rating_base.to_numpy()
rate_test = rating_test.to_numpy()

Rating base:
    user_id  movie_id  rating  unix_timestamp
0        1         1       5       874965758
1        1         2       3       876893171
2        1         3       4       878542960
3        1         4       3       876893119
4        1         5       3       889751712 

Rating test:
    user_id  movie_id  rating  unix_timestamp
0        1         6       5       887431973
1        1        10       3       875693118
2        1        12       5       878542960
3        1        14       5       874965706
4        1        17       3       875073198 



In [3]:
# Transform user and item indices to start from 0 (required for matrix operations)
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

# Train and evaluate User-User Collaborative Filtering
print("Running User-User Collaborative Filtering...")
rs = uuCF(rate_train, k=40)
rs.fit()

# Calculate RMSE for User-User CF
n_tests = rate_test.shape[0]
SE = 0  # Squared error
for n in range(n_tests):
    pred = rs.predict_rating(rate_test[n, 0], rate_test[n, 1])
    SE += (pred - rate_test[n, 2]) ** 2

RMSE = np.sqrt(SE / n_tests)
print(f'User-User CF, RMSE = {RMSE}')

Running User-User Collaborative Filtering...
User-User CF, RMSE = 0.981525986899648


In [4]:
# Transform data for Item-Item CF by swapping user_id and item_id
print("\nRunning Item-Item Collaborative Filtering...")
rate_train = rate_train[:, [1, 0, 2]]
rate_test = rate_test[:, [1, 0, 2]]

# Reuse the same uuCF class for Item-Item CF
rs = uuCF(rate_train, k=40)
rs.fit()

# Calculate RMSE for Item-Item CF
n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.predict_rating(rate_test[n, 0], rate_test[n, 1])
    SE += (pred - rate_test[n, 2]) ** 2

RMSE = np.sqrt(SE / n_tests)
print(f'Item-Item CF, RMSE = {RMSE}')


Running Item-Item Collaborative Filtering...


  self.mu = np.bincount(users.astype(int), weights=self.Y_data[:, 2]) / np.bincount(users.astype(int))


Item-Item CF, RMSE = 0.9657065220682001
