In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

class CollaborativeFiltering:
    """
    Neighbourhood-based collaborative filtering for User-User or Item-Item predictions.

    The model is symmetric in its two ID columns: column 0 of ``Y_data`` holds the
    "entity" whose neighbours are searched (users in 'user' mode, items in 'item'
    mode) and column 1 holds the "other" side (items in 'user' mode, users in
    'item' mode). IDs are expected to be 0-based integers; any columns after the
    third (e.g. a timestamp) are ignored.
    """
    def __init__(self, Y_data: np.ndarray, k: int, sim_func = cosine_similarity, mode='user') -> None:
        """
        Initialize the collaborative filtering model.

        Parameters:
        - Y_data: numpy array of shape (n_samples, >=3), each row is [entity1_id, entity2_id, rating, ...]
        - k: number of nearest neighbors to consider for predictions (must be >= 1)
        - sim_func: similarity function, default is cosine similarity
        - mode: 'user' for User-User CF, 'item' for Item-Item CF

        Raises:
        - ValueError: if mode is not 'user'/'item' or k is smaller than 1
        """
        if mode not in ('user', 'item'):
            raise ValueError("Mode must be 'user' or 'item'")
        # With k == 0 a slice such as [-k:] would select *every* entity, so reject it early.
        if k < 1:
            raise ValueError("k must be a positive integer")

        self.Y_data = Y_data
        self.k = k
        self.sim_func = sim_func
        self.mode = mode
        # Populated by fit(): normalized sparse rating matrix, per-entity means, similarities.
        self.Ybar = None
        self.mr = None
        self.S = None

        # IDs are 0-based, so the number of distinct IDs is (largest ID + 1).
        self.n_entities = int(np.max(self.Y_data[:, 0])) + 1
        n_others = int(np.max(self.Y_data[:, 1])) + 1
        if mode == 'user':
            self.n_items = n_others
        else:
            self.n_users = n_others

    def fit(self) -> None:
        """
        Normalize the data and compute the similarity matrix.

        Each rating is centred on the mean rating of its entity (column 0). The
        centred ratings are stored in ``self.Ybar`` as a sparse matrix of shape
        (n_others, n_entities), and ``self.S`` holds the entity-entity similarities.
        """
        entities = self.Y_data[:, 0].astype(np.int64)
        others = self.Y_data[:, 1].astype(np.int64)
        # Work in float: Y_data is frequently an integer array, and writing centred
        # ratings back into an integer buffer would silently truncate them.
        ratings = self.Y_data[:, 2].astype(np.float64)

        # Per-entity mean rating; entities without any rating keep a mean of 0.
        counts = np.bincount(entities, minlength=self.n_entities)
        sums = np.bincount(entities, weights=ratings, minlength=self.n_entities)
        self.mr = np.divide(
            sums, counts, out=np.zeros(self.n_entities), where=counts > 0
        )
        centred = ratings - self.mr[entities]

        # Rows are the "other" side, columns are the entities being compared.
        n_rows = self.n_items if self.mode == 'user' else self.n_users
        self.Ybar = sparse.coo_matrix(
            (centred, (others, entities)),
            shape=(n_rows, self.n_entities)
        ).tocsr()

        # Compute similarity matrix (one row/column per entity)
        similarities = self.sim_func(self.Ybar.T, self.Ybar.T)
        # Later indexing and argsort assume a dense array.
        if sparse.issparse(similarities):
            similarities = similarities.toarray()
        self.S = np.asarray(similarities)

    def _require_fitted(self) -> None:
        """Raise a clear error when the model is used before fit()."""
        if self.S is None:
            raise RuntimeError("Call fit() before requesting predictions or recommendations")

    def _nearest_neighbors(self, e: int) -> np.ndarray:
        """Return up to k entity IDs most similar to e, most similar first, excluding e."""
        sims = np.asarray(self.S[e, :], dtype=float).ravel()
        # Stable sort on the negated scores gives descending similarity with
        # ties broken by ascending ID, which keeps results deterministic.
        order = np.argsort(-sims, kind='stable')
        return order[order != e][:self.k]

    def pred(self, e: int, i: int, normalized: bool = False) -> float:
        """
        Predict the rating that entity e gives to (or receives from) i.

        The prediction is the similarity-weighted average of the centred ratings
        that the k entities most similar to e gave to i, shifted back by e's mean.

        Parameters:
        - e: ID on the entity side (user in 'user' mode, item in 'item' mode)
        - i: ID on the other side (item in 'user' mode, user in 'item' mode)
        - normalized: if True, return the centred prediction without adding e's mean

        Returns:
        - The predicted rating. When no neighbour information is available the
          entity's mean rating is returned (or the global training mean when e
          itself was never seen during training); 0.0 in normalized mode.
        """
        self._require_fitted()

        known_entity = 0 <= e < self.n_entities
        if normalized:
            baseline = 0.0
        elif known_entity:
            baseline = float(self.mr[e])
        else:
            baseline = float(np.mean(self.Y_data[:, 2]))

        if not known_entity or not 0 <= i < self.Ybar.shape[0]:
            return baseline

        # Entities (other than e itself) that have a training rating for i.
        raters = np.unique(self.Y_data[self.Y_data[:, 1] == i, 0].astype(np.int64))
        raters = raters[raters != e]
        if raters.size == 0:
            return baseline

        sims = np.asarray(self.S[e, raters], dtype=float).ravel()
        top = np.argsort(sims)[-self.k:]
        weights = sims[top]
        values = self.Ybar[i, raters[top]].toarray().ravel()

        denom = np.abs(weights).sum()
        if denom == 0:
            return baseline
        return baseline + float(weights @ values) / denom

    def recommend(self, e: int, num_recommendations: int) -> list:
        """
        Recommend items for a user (UUCF) or similar items for an item (IICF).

        In 'user' mode the k users most similar to e are collected and the items
        they rated are ranked by how many of those neighbours rated them; items
        e has already rated are left out. In 'item' mode the items most similar
        to e are returned, most similar first.

        Parameters:
        - e: the ID of the target
        - num_recommendations: number of entities to recommend

        Returns:
        - A list of recommended entity IDs (best first); empty when e is unknown
          or num_recommendations is not positive
        """
        self._require_fitted()
        if num_recommendations <= 0 or not 0 <= e < self.n_entities:
            return []

        neighbours = self._nearest_neighbors(e)

        if self.mode == 'item':
            return [int(item) for item in neighbours[:num_recommendations]]

        users = self.Y_data[:, 0].astype(np.int64)
        items = self.Y_data[:, 1].astype(np.int64)

        # Popularity of each item among the nearest users.
        popularity = np.bincount(items[np.isin(users, neighbours)], minlength=self.n_items)
        # Never recommend something the target user has already rated.
        popularity[items[users == e]] = 0

        candidates = np.flatnonzero(popularity)
        ranked = candidates[np.argsort(-popularity[candidates], kind='stable')]
        return [int(item) for item in ranked[:num_recommendations]]

    def evaluate_recommendations(self, test_data, num_recommendations: int):
        """
        Print and return the hit ratio of recommend() on held-out data.

        A test row counts as a hit when its second column appears in the
        recommendations produced for its first column, so the measure is
        meaningful in 'user' mode (rows of [user_id, item_id, ...]).

        Parameters:
        - test_data: iterable of rows whose first two values are [target_id, expected_id];
          further columns (rating, timestamp, ...) are ignored
        - num_recommendations: number of recommendations requested per target

        Returns:
        - The hit ratio in [0, 1]; 0.0 when test_data is empty
        """
        hits = 0
        total = 0
        # Recommendations depend only on the target, so compute them once per target.
        cache = {}
        for row in test_data:
            target, expected = int(row[0]), int(row[1])
            if target not in cache:
                cache[target] = set(self.recommend(target, num_recommendations))
            if expected in cache[target]:
                hits += 1
            total += 1
        hit_ratio = hits / total if total else 0.0
        print(f"Hit Ratio: {hit_ratio:.4f}")
        return hit_ratio


# Load the train/test rating files (presumably the MovieLens 100k "ua" split).
# They are parsed as tab-separated text, with column labels supplied explicitly.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

rating_base = pd.read_csv('ml-100k/ua.base', names=r_cols, sep='\t')
rating_test = pd.read_csv('ml-100k/ua.test', names=r_cols, sep='\t')

# Lift the column display limit so the previews below show every column.
pd.set_option('display.max_columns', None)
print('Rating base:\n', rating_base.head(), '\n')
print('Rating test:\n', rating_test.head(), '\n')

# NumPy versions of both tables, column order as in r_cols.
rate_train, rate_test = rating_base.to_numpy(), rating_test.to_numpy()

# Evaluation settings: neighbourhood size and length of each recommendation list
k_neighbors = 40
n_recommendations = 10

# User-User CF keeps the original column order (all columns are retained).
rate_train_uu, rate_test_uu = rate_train.copy(), rate_test.copy()

# Item-Item CF puts the movie ID first and the user ID second (swap user and item);
# only the two IDs and the rating are kept.
rate_train_ii, rate_test_ii = rate_train[:, [1, 0, 2]].copy(), rate_test[:, [1, 0, 2]].copy()

# The files use IDs starting at 1; shift both ID columns so indices start from 0.
for _ratings in (rate_train_uu, rate_test_uu, rate_train_ii, rate_test_ii):
    _ratings[:, :2] -= 1

# User-User CF: fit on the training ratings, then accumulate the squared
# prediction error over every held-out (user, item, rating) row.
print("Evaluating User-User CF with generic class...")
cf_user = CollaborativeFiltering(Y_data=rate_train_uu, k=k_neighbors, mode='user')
cf_user.fit()

n_tests_uu = len(rate_test_uu)
SE_user = 0
for n, _row in enumerate(rate_test_uu):
    # Columns 0 and 1 are the user and item IDs; column 2 is the true rating.
    pred = cf_user.pred(int(_row[0]), int(_row[1]))
    SE_user += (pred - _row[2]) ** 2

# Item-Item CF: same procedure on the item-first arrays, so each held-out row
# is read as (item, user, rating).
print("Evaluating Item-Item CF with generic class...")
cf_item = CollaborativeFiltering(Y_data=rate_train_ii, k=k_neighbors, mode='item')
cf_item.fit()

n_tests_ii = len(rate_test_ii)
SE_item = 0
for n, _row in enumerate(rate_test_ii):
    # Columns 0 and 1 are the item and user IDs; column 2 is the true rating.
    pred = cf_item.pred(int(_row[0]), int(_row[1]))
    SE_item += (pred - _row[2]) ** 2