# Neighborhood-based Collaborative Filtering example 

In [58]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from typing import Callable

class UserUserCF:
    """
    User-User Collaborative Filtering class for predicting user ratings of items
    based on similar users' ratings.
    """
    def __init__(self, Y_data: np.ndarray, k: int, sim_func = cosine_similarity) -> None:
        """
        Initialize the collaborative filtering model.
        
        Parameters:
        - Y_data: numpy array of shape (n_samples, 3), each row is [user_id, item_id, rating]
        - k: number of nearest neighbors to consider for predictions
        - sim_func: similarity function, default is cosine similarity
        """
        self.Y_data = Y_data  # Original data (user, item, rating)
        self.k = k  # Number of neighbors
        self.sim_func = sim_func  # Similarity function
        self.Ybar = None  # Normalized rating matrix
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1  # Number of unique users
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1  # Number of unique items

    def fit(self) -> None:
        """
        Normalize the data and compute the user-user similarity matrix.
        """
        users = self.Y_data[:, 0]  # Extract all user IDs
        self.Ybar = self.Y_data.copy()  # Make a copy of the data for normalization
        self.mu = np.zeros((self.n_users,))  # Mean rating for each user

        # Normalize ratings for each user
        for n in range(self.n_users):
            ids = np.flatnonzero(users == n)  # Ratings made by user n
            ratings = self.Y_data[ids, 2]  # User's ratings
            self.mu[n] = np.mean(ratings) if ids.size > 0 else 0  # Mean rating or 0 if no ratings
            self.Ybar[ids, 2] = ratings - self.mu[n]  # Subtract mean rating

        # Create a sparse matrix of normalized ratings
        self.Ybar = sparse.coo_matrix(
            (self.Ybar[:, 2], (self.Ybar[:, 1], self.Ybar[:, 0])),
            shape=(self.n_items, self.n_users)
        ).tocsr()

        # Compute user-user similarity matrix
        self.S = self.sim_func(self.Ybar.T, self.Ybar.T)

    def pred(self, u: int, i: int) -> float:
        """
        Predict the rating of user u for item i.
        
        Parameters:
        - u: user ID
        - i: item ID
        
        Returns:
        - Predicted rating
        """
        # Find all users who rated item i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        users_rated_i = self.Y_data[ids, 0].astype(np.int32)  # Users who rated item i

        # Similarities between user u and users who rated item i
        sim = self.S[u, users_rated_i]
        # Select k most similar users
        nns = np.argsort(sim)[-self.k:]
        nearest_s = sim[nns]  # Similarities of the nearest neighbors
        r = self.Ybar[i, users_rated_i[nns]]  # Ratings from the nearest neighbors

        # Compute the predicted rating
        eps = 1e-8  # Small value to prevent division by zero
        return (r * nearest_s).sum() / (np.abs(nearest_s).sum() + eps) + self.mu[u]


## MovieLens

In [59]:
# Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

rating_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols)
rating_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols)

pd.set_option('display.max_columns', None)
print('Rating base:\n', rating_base.head(), '\n')
print('Rating test:\n', rating_test.head(), '\n')

rate_train = rating_base.to_numpy()
rate_test = rating_test.to_numpy()

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

Rating base:
    user_id  movie_id  rating  unix_timestamp
0        1         1       5       874965758
1        1         2       3       876893171
2        1         3       4       878542960
3        1         4       3       876893119
4        1         5       3       889751712 

Rating test:
    user_id  movie_id  rating  unix_timestamp
0        1        20       4       887431883
1        1        33       4       878542699
2        1        61       4       878542420
3        1       117       3       874965739
4        1       155       2       878542201 



In [None]:
rs = UserUserCF(rate_train, k = 40)
rs.fit()
 
n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1])
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print('User-user CF, RMSE =', RMSE)

User-user CF, RMSE = 0.9766140289287265


# Item-item CF

In [61]:
rate_train = rate_train[:, [1, 0, 2]]
rate_test  = rate_test[:, [1, 0, 2]]

rs = UserUserCF(rate_train, k = 40)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1])
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print('Item-item CF, RMSE =', RMSE)

Item-item CF, RMSE = 0.9688460838682366
