In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

class CollaborativeFiltering:
    """
    Collaborative Filtering class for User-User or Item-Item predictions.
    """
    def __init__(self, Y_data: np.ndarray, k: int, sim_func = cosine_similarity, mode='none') -> None:
        """
        Initialize the collaborative filtering model.
        
        Parameters:
        - Y_data: numpy array of shape (n_samples, 3), each row is [entity1_id, entity2_id, rating]
        - k: number of nearest neighbors to consider for predictions
        - sim_func: similarity function, default is cosine similarity
        - mode: 'user' for User-User CF, 'item' for Item-Item CF
        """
        self.Y_data = Y_data
        self.k = k
        self.sim_func = sim_func
        self.Ybar = None
        self.mode = mode
        if mode == 'user':
            self.n_entities = int(np.max(self.Y_data[:, 0])) + 1  # Number of unique users
            self.n_items = int(np.max(self.Y_data[:, 1])) + 1
        elif mode == 'item':
            self.n_entities = int(np.max(self.Y_data[:, 1])) + 1  # Number of unique items
            self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        else:
            raise ValueError("Mode must be 'user' or 'item'")

    def fit(self) -> None:
        """
        Normalize the data and compute the similarity matrix.
        """
        if self.mode == 'user':
            print("Evaluating User-User CF with generic class...")
            entities = self.Y_data[:, 0]  # Extract all entity IDs
        else:  # mode == 'item'
            print("Evaluating Item-Item CF with generic class...")
            entities = self.Y_data[:, 1]  # Extract all entity IDs
        
        self.Ybar = self.Y_data.copy()
        self.mr = np.zeros((self.n_entities,))

        # Normalize ratings for each entity
        for e in range(self.n_entities):
            ids = np.flatnonzero(entities == e)
            ratings = self.Y_data[ids, 2]
            self.mr[e] = np.mean(ratings) if ids.size > 0 else 0
            self.Ybar[ids, 2] = ratings - self.mr[e]

        # Create a sparse matrix
        if self.mode == 'user':
            self.Ybar = sparse.coo_matrix(
                (self.Ybar[:, 2], (self.Ybar[:, 1], self.Ybar[:, 0])),
                shape=(self.n_items, self.n_entities)
            ).tocsr()
        else:  # mode == 'item'
            self.Ybar = sparse.coo_matrix(
                (self.Ybar[:, 2], (self.Ybar[:, 0], self.Ybar[:, 1])),
                shape=(self.n_users, self.n_entities)
            ).tocsr()

        # Compute similarity matrix
        self.S = self.sim_func(self.Ybar.T, self.Ybar.T)

    def pred(self, e: int, other_id: int) -> float:
        """
        Predict the rating based on the mode.
        
        Parameters:
        - e: entity ID (user or item)
        - other_id: item ID (if mode='user') or user ID (if mode='item')
        
        Returns:
        - Predicted rating
        """
        if self.mode == 'user': # Find users who rated the item
            ids = np.flatnonzero(self.Y_data[:, 1] == other_id)
            entities_rated = self.Y_data[ids, 0]
        else:  # mode == 'item' # Find items rated by the user
            ids = np.flatnonzero(self.Y_data[:, 0] == other_id)
            entities_rated = self.Y_data[ids, 1]

        if len(entities_rated) == 0:
            return self.mr[e]

        # Similarities
        sim = self.S[e, entities_rated]
        k_actual = min(self.k, len(sim))
        nns = np.argsort(sim)[-k_actual:]
        nearest_s = sim[nns]
        r = self.Ybar[other_id, entities_rated[nns]]

        eps = 1e-8
        return (r * nearest_s).sum() / (np.abs(nearest_s).sum() + eps) + self.mr[e]


# Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

rating_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols)
rating_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols)

pd.set_option('display.max_columns', None)
print('Rating base:\n', rating_base.head(), '\n')
print('Rating test:\n', rating_test.head(), '\n')

rate_train = rating_base.to_numpy()
rate_test = rating_test.to_numpy()
# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1
n_tests = rate_test.shape[0]

# User-User CF
cf_user = CollaborativeFiltering(rate_train, k=40, mode='user')
cf_user.fit()

SE_user = 0
for n in range(n_tests):
    pred = cf_user.pred(int(rate_test[n, 0]), int(rate_test[n, 1]))
    SE_user += (pred - rate_test[n, 2]) ** 2 

RMSE_user = np.sqrt(SE_user / n_tests)
print('User-user CF, RMSE =', RMSE_user)

# Item-Item CF
cf_item = CollaborativeFiltering(rate_train, k=40, mode='item')
cf_item.fit()

SE_item = 0
for n in range(n_tests):
    pred = cf_item.pred(int(rate_test[n, 1]), int(rate_test[n, 0]))
    SE_item += (pred - rate_test[n, 2]) ** 2 

RMSE_item = np.sqrt(SE_item / n_tests)
print('Item-item CF, RMSE =', RMSE_item)

Rating base:
    user_id  movie_id  rating  unix_timestamp
0        1         1       5       874965758
1        1         2       3       876893171
2        1         3       4       878542960
3        1         4       3       876893119
4        1         5       3       889751712 

Rating test:
    user_id  movie_id  rating  unix_timestamp
0        1        20       4       887431883
1        1        33       4       878542699
2        1        61       4       878542420
3        1       117       3       874965739
4        1       155       2       878542201 

Evaluating User-User CF with generic class...
User-user CF, RMSE = 0.9766140289287265
Evaluating Item-Item CF with generic class...
Item-item CF, RMSE = 0.9688460838682366
