# Ví dụ Collaborative Filtering 

Đây là đoạn code mà tôi implement bằng phương pháp Lọc Cộng Tác (Collaborative Filtering) bằng 2 phương pháp:  

- Lọc Cộng Tác dựa trên Người dùng (User-User Collaborative Filtering)
- Lọc Cộng Tác theo Mục (Item-Item Collaborative Filtering)

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

class CF(object):
    def __init__(self, Y_data, k, dist_func = cosine_similarity, CF = 1):
        self.CF = CF       # 1 for user-user, 0 for item-item
        self.Y_data = Y_data if CF else Y_data[:, [1, 0, 2]]
        self.k = k       # Number of neigh
        self.dist_func = dist_func  
        self.Ybar_data = None

        # Number of users and items. Remember to add 1 since id starts from 0
        if self.CF == 1:
            self.n_users = int(np.max(self.Y_data[:, 0])) + 1
            self.n_items = int(np.max(self.Y_data[:, 1])) + 1
        else:
            self.n_users = int(np.max(self.Y_data[:, 1])) + 1
            self.n_items = int(np.max(self.Y_data[:, 0])) + 1

    def add(self, new_data):
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)

    
    def normalize_Y(self):
        """
        base variable could be users in the (user-user) or items in (item-item)
        """

        base = self.Y_data[:, 0]   # First col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        # print(self.Ybar_data)
        self.mean = np.zeros((self.n_users, ))

        if (self.CF == 0):
            self.mean = np.zeros((self.n_items, ))
        
        for n in range(self.mean.shape[0]):
            # print(np.where(users == n))
            ids = np.where(base == n)[0].astype(np.int32)
            # print(ids)
            item_or_users_ids = self.Y_data[ids, 1]
            ratings = self.Y_data[ids, 2]

            m = np.mean(ratings)
            if np.isnan(m):
                m = 0   # to avoid empty array and nan value
            self.mean[n] = m
            self.Ybar_data[ids, 2] = ratings - self.mean[n]


        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important 
        # for both memory and computing efficiency. For example, if #user = 1M, 
        # #item = 100k, then shape of the rating matrix would be (100k, 1M), 
        # you may not have enough memory to store this. Then, instead, we store 
        # nonzeros only, and, of course, their locations.
        if (self.CF == 1):
            self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
                (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        else:
            self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
                (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_users, self.n_items))
            
        self.Ybar = self.Ybar.tocsr()
    
    def similarity(self):
        # eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)

    
    def refresh(self):
        """
        Normalize data and calculate similarity matrix again 
        (after some few ratings change)
        """
        self.normalize_Y()
        self.similarity()


    def fit(self):
        self.refresh()


    def __pred(self, u, i, normalized = 1):
        """
        Predict the rating of user u for item i (normalized)
        if you need the un
        """

        # Step 1: Find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)

        # Step 2: 
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)

        # Step 3: Find similarity between the current user and others
        # who already rated_i
        sim = self.S[u, users_rated_i]
        # print(self.S)
        # Step 4: Find the k most similarity users
        a = np.argsort(sim)[-self.k: ]
        nearest_s = sim[a]

        # How did each of 'near' users rated item i
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            return (r * nearest_s)[0] / (np.abs(nearest_s).sum() + 1e-8)
        
        return (r * nearest_s)[0] / (np.abs(nearest_s).sum() + 1e-8) + self.mean[u]
    
    def pred(self, u, i, normalized = 1):
        return self.__pred(u, i, normalized)
    
    
    def recommend(self, u):
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()
        recommended_items = []
        
        if (self.CF):
            for i in range(self.n_items):
                if i not in items_rated_by_u:
                    rating = self.pred(u, i)
                    if rating > 0:
                        recommended_items.append(i)
        else:
            for i in range(self.n_users):
                if i not in items_rated_by_u:
                    rating = self.pred(u, i)
                    if rating > 0:
                        recommended_items.append(i)


        return recommended_items

    def print_recommendation(self):
        print("Recommendation: ")
        
        if (self.CF):
            for u in range(self.n_users):
                recommended_items = self.recommend(u)
               
                print(f'\tRecommend items(s): {recommended_items} for user {u}')
        else:
            for i in range(self.n_items):
                recommended_users = self.recommend(i)

                print(f'\tRecommend item  {i} for user(s) {recommended_users}')
                

In [3]:
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv('ex.dat', sep = ' ', names = r_cols, encoding='latin-1')
Y_data = ratings.to_numpy()

rs = CF(Y_data, k = 2, CF = 1)
rs.fit()

print(f"Singularity matrix is: \n{rs.S}")
rs.print_recommendation()

Singularity matrix is: 
[[ 1.          0.83307435 -0.5809475  -0.7856742  -0.81649658  0.20412415
  -0.38133693]
 [ 0.83307435  1.         -0.87333376 -0.3986205  -0.55234477 -0.23014365
  -0.70756759]
 [-0.5809475  -0.87333376  1.          0.27386128  0.31622777  0.47434165
   0.9621024 ]
 [-0.7856742  -0.3986205   0.27386128  1.          0.8660254  -0.28867513
   0.18490007]
 [-0.81649658 -0.55234477  0.31622777  0.8660254   1.          0.
   0.16012815]
 [ 0.20412415 -0.23014365  0.47434165 -0.28867513  0.          1.
   0.56044854]
 [-0.38133693 -0.70756759  0.9621024   0.18490007  0.16012815  0.56044854
   1.        ]]
Recommendation: 
	Recommend items(s): [2] for user 0
	Recommend items(s): [1] for user 1
	Recommend items(s): [] for user 2
	Recommend items(s): [4] for user 3
	Recommend items(s): [4] for user 4
	Recommend items(s): [0, 3, 4] for user 5
	Recommend items(s): [1] for user 6


In [4]:
rs_2 = CF(Y_data, k = 2, CF = 0)
rs_2.fit()

print(f"Singularity matrix is: \n{rs_2.S}")
rs_2.print_recommendation()

Singularity matrix is: 
[[ 1.          0.7678689   0.48903062 -0.88910122 -0.51743661]
 [ 0.7678689   1.          0.         -0.64326752 -0.13808619]
 [ 0.48903062  0.          1.         -0.54711014 -0.88318408]
 [-0.88910122 -0.64326752 -0.54711014  1.          0.68100212]
 [-0.51743661 -0.13808619 -0.88318408  0.68100212  1.        ]]
Recommendation: 
	Recommend item  0 for user(s) []
	Recommend item  1 for user(s) [1]
	Recommend item  2 for user(s) [0]
	Recommend item  3 for user(s) [5]
	Recommend item  4 for user(s) [3, 4, 5]


## With MovieLens 100k dataset

[Bộ dataset MovieLens 100k](https://grouplens.org/datasets/movielens/100k/) được công bố bởi GroupLens vào tháng 4/1998. MovieLens gồm có 100,000 *ratings* từ 943 *users* cho 1682 bộ phim (có dung lượng là 5MB) 

In [6]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.to_numpy()
rate_test = ratings_test.to_numpy()

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [12]:
ratings_base.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,0,0,5,874965758
1,0,1,3,876893171
2,0,2,4,878542960
3,0,3,3,876893119
4,0,4,3,889751712


In [19]:
len(np.unique(ratings_base['user_id']))

943

In [15]:
ratings_test.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,0,16,3,875073198
1,0,46,4,875072125
2,0,63,5,875072404
3,0,89,4,878542300
4,0,91,3,876892425


In [21]:
np.unique(ratings_base['movie_id'])

array([   0,    1,    2, ..., 1679, 1680, 1681])

In [9]:
rs = CF(rate_train, k = 30, CF = 1)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print('User-user CF, RMSE =', RMSE)

User-user CF, RMSE = 0.9951981100882598


In [11]:
rs = CF(rate_train, k = 30, CF = 0)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 1], rate_test[n, 0], normalized = 0)
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print('Item-item CF, RMSE =', RMSE)

Item-item CF, RMSE = 0.9867912132705384
