# Libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from scipy.sparse import csr_matrix, find
import os

# Neighborhood-Based Collaborative Filtering (NBCF) Class

In [None]:
class NBCF():
    def __init__(self, k, dist_func=cosine_similarity, uuCF=1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.data = None
        self.n_users = 0
        self.n_items = 0
        self.k = k # number of neighbor points
        self.dist_func = dist_func

        self.normalized_data = None
        self.means = None
        self.S = None

    def _similarity(self):
        self.S = self.dist_func(self.normalized_data)

    def _normalize(self):
        self.normalized_data = self.data.copy()
        rows, cols, vals = find(self.data)
        self.normalized_data[rows, cols] = vals - self.means[rows]

    def fit(self, csr_matrix):
        self.data = csr_matrix if self.uuCF else csr_matrix.T

        # number of users and items
        self.n_users = self.data.shape[0] if self.uuCF else self.data.shape[1]
        self.n_items = self.data.shape[1] if self.uuCF else self.data.shape[0]

        self.means = np.squeeze(np.asarray(self.data.mean(axis=self.uuCF)))
        # Replace NaN means with 0 for users with no ratings
        self.means = np.nan_to_num(self.means, 0)

        self._normalize()
        self._similarity()


    def _pred(self, user_id, item_id):
        """
        predict the rating of user u for item i (normalized)
        """
        # Step 1: Find users who rated items i
        # user (row) item (col) --> item (row) user (col)
        # in row i-th, get all non-zero indices (users - cols)
        users_rated_item_id = self.data.T[item_id].indices

        # Step 2: find similarity btw the current user and others
        # who already rated i
        sim = self.S[user_id, users_rated_item_id]

        # Step 3: find the k most similarity users
        nearest_user = np.argsort(sim)[-self.k:]
        nearest_sim = sim[nearest_user]

        # Get ratings and calculate weighted average
        nearest_ratings = self.normalized_data[nearest_user, item_id].toarray()
        numerator = np.sum(nearest_sim * nearest_ratings)
        denominator = np.sum(np.abs(nearest_sim)) + 1e-8

        predicted_rating = numerator/denominator
        final_rating = self.means[user_id] + predicted_rating

        return np.clip(final_rating, 0, 5)


    def recommend(self, user_id, n_recom=10):
        """
        Determine all items should be recommended for user u. (uuCF =1)
        or all users who might have interest on item u (uuCF = 0)
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which
        have not been rated by u yet.
        """

        user_id = int(user_id)

        items_rated_by_u = self.data[user_id].indices

        unrated_predictions = []

        for item_id in range(self.n_items):
            if item_id not in items_rated_by_u:
                prediction = self._pred(user_id, item_id)
                unrated_predictions.append({
                    "item_id": item_id,
                    "prediction": prediction
                })

         # Sort predictions in descending order and return top N
        df = pd.DataFrame(unrated_predictions)
        df = (df.sort_values('prediction', ascending=False)
                .head(n_recom)
                .reset_index(drop=True))

        return df


    def evaluation(self, test_data):
        test_data = test_data if self.uuCF else test_data.T

        users, items, ground_truth = find(test_data)

        predicted_list = []

        square_error = 0

        for i in range(len(users)):
            predicted = self._pred(users[i], items[i])
            predicted_list.append(predicted)
            square_error += (ground_truth[i] - predicted) ** 2


        df = pd.DataFrame(
                {
                "user_id": users,
                "item_id": items,
                "ground_truth": ground_truth,
                "prediction": predicted_list
                })

        if(self.uuCF == 0):
            df = pd.DataFrame(
                {
                "item_id": users,
                "user_id": items,
                "ground_truth": ground_truth,
                "prediction": predicted_list
                })

        return np.sqrt(square_error / len(users)), df

# Load Datasets

In [None]:
path = "/content/drive/MyDrive/MDEK/Group2/Endterm/ub.base"
# path = "/content/ub.base"

os.path.exists(path)

True

In [None]:
df = pd.read_csv(path, sep="\t", names=["user_id", "item_id", "rating",
                                        "unix_timestamp"])


In [None]:
df.head(5)

Unnamed: 0,user_id,item_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


# Create User-Item CSR Matrix - User-Item Utility Matrix

In [None]:
# Don't care about unix_timestamp column, run this cell 1 time in a session
df.drop("unix_timestamp", axis=1, inplace=True)

In [None]:
max_user_id = df["user_id"].max()
max_item_id = df["item_id"].max()

# Utility Matrix: row is user and columns are items (movies)
csr_utility_matrix = csr_matrix(
                (df["rating"], \
                 (df["user_id"] - 1, df["item_id"] - 1)),
                shape=(max_user_id + 1, max_item_id + 1))



# Create NBCF Instance

In [None]:
"""
uuCF = 1 : users-users trong bài này chỉ cần quan tâm users-users
uuCF = 0 : items - items
"""

# When choosing uuCF user-item CSR matrix transpose to item-user CSR matrix
rs = NBCF(k=10, uuCF=1)
rs.fit(csr_utility_matrix)

# Recommendation - Unrated Items Prediction

In [None]:
"""
Nếu users - users (uuCF = 1) : 10 sản phẩm phù hợp với user_id truyền vào
Nếu items - items (uuCF = 0) : 10 user có thể thích sản phẩm truyền vào
"""
user_id = 511
n_recom = 10
recommend_list = rs.recommend(user_id, n_recom)

In [None]:
print(f"User_id: {user_id}")
recommend_list.head(10)

User_id: 511


Unnamed: 0,item_id,prediction
0,482,5.0
1,203,5.0
2,654,5.0
3,167,5.0
4,356,5.0
5,171,5.0
6,172,5.0
7,173,5.0
8,174,5.0
9,175,5.0


# Evaluation in Testset

In [None]:
path = "/content/drive/MyDrive/MDEK/Group2/Endterm/ub.test"
# path = "/content/ub.test"

os.path.exists(path)

True

In [None]:
test_df = pd.read_csv(path, sep="\t", names=["user_id", "item_id", "rating",
                                        "unix_timestamp"])

test_df.drop("unix_timestamp", axis=1, inplace=True)

In [None]:
max_user_id = test_df["user_id"].max()
max_item_id = test_df["item_id"].max()

In [None]:
test_csr_utility_matrix = csr_matrix(
                (test_df["rating"], \
                 (test_df["user_id"] - 1, test_df["item_id"] - 1)),
                shape=(max_user_id + 1, max_item_id + 1))

In [None]:
evaluation, result_df = rs.evaluation(test_csr_utility_matrix)

In [None]:
print(f"RMSE: {evaluation}")
result_df.head()

RMSE: 2.378561789603843


Unnamed: 0,user_id,item_id,ground_truth,prediction
0,0,16,3,0.558526
1,0,46,4,0.558526
2,0,63,5,5.0
3,0,89,4,0.558526
4,0,91,3,0.558526
