In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.model_selection import train_test_split

class NBCF(object):
    def __init__(self, Y, k, uuCF = 1, dist_f = cosine_similarity):
        self.uuCF = uuCF
        self.f = open('danhgia22_04.dat', 'a+')
        self.Y = Y if uuCF else Y[:, [1, 0, 2]]
        self.Ybar = None
        self.k = k
        self.dist_func = dist_f
        self.users_count = int(np.max(self.Y[:, 0])) + 1
        self.items_count = int(np.max(self.Y[:, 1])) + 1
        self.Pu = None
    
    def normalizeY(self):
        users = self.Y[:, 0]
        self.Ybar = self.Y.copy()
        self.mu = np.zeros((self.users_count,))
        for i in range(self.users_count):
            ids = np.where(users == i)[0].astype(int)
            ratings = self.Y[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[i] = m
            self.Ybar[ids, 2] = ratings - self.mu[i]
        self.Ybar = sparse.coo_matrix((self.Ybar[:, 2],
            (self.Ybar[:, 1], self.Ybar[:, 0])), (self.items_count, self.users_count))
        self.Ybar = self.Ybar.tocsr()
        
    def similarity(self):
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
    def fit(self):
        self.normalizeY()
        self.similarity()
    
    def pred(self, u, i, normalized = 1):
        if self.Ybar[i, u] != 0 and normalized:
            return self.Ybar[i, u]
        if self.Ybar[i, u] != 0 and normalized == 0:
            return self.Ybar[i, u] + self.mu[u]
        ids = np.where(self.Y[:, 1] == i)[0].astype(int)
        users = (self.Y[ids, 0]).astype(int)
        sim = self.S[u, users]
        a = np.argsort(sim)[-self.k:]
        nearest = sim[a]
        r = self.Ybar[i, users[a]]
        
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest)[0]/(np.abs(nearest).sum() + 1e-8)

        return (r*nearest)[0]/(np.abs(nearest).sum() + 1e-8) + self.mu[u]
        
        
    def _pred(self, u, i, normalized = 1):
        if self.uuCF: return self.pred(u, i, normalized)
        return self.pred(i, u, normalized)
    
    def RMSE(self, data_size, Data_test, test_size = 0):
        SE = 0
        n_tests = Data_test.shape[0]
        for n in range(n_tests):
            if Data_test[n, 1] == 1681:
                pred = 0
            else:
                pred = self._pred(Data_test[n, 0], Data_test[n, 1], normalized = 0)
            SE += (pred - Data_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        if self.uuCF == 1:
            print('User-user CF, RMSE =', RMSE)
            self.f.write('%s::1::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k, test_size, RMSE))
        else:
            print('Item-item CF, RMSE =', RMSE)
            self.f.write('%s::0::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k, test_size, RMSE))
        self.f.close()
        
    def recommend(self, u, limit = 100):
        l = 0
        ids = np.where(self.Y[:, 0] == u)[0].astype(int)
        items_rated_by_user = self.Y[ids, 1].tolist()
        recommended_items = []
        for i in range(self.items_count):
            if i not in items_rated_by_user:
                a = self._pred(u, i)
                if a > 0:
#                     l += 1
                    recommended_items.append(i)
        return recommended_items
        
    def evaluate_P(self, data_size, Data_test, test_size = 0):
        self.Pu = np.zeros((self.users_count,))
        for u in range(self.users_count):

            recommended_items = self.recommend(u)
            #         print('recommend_items: ', recommended_items)
            ids = np.where(Data_test[:, 0] == u)[0]
            rated_items = Data_test[ids, 1]
            #         print('rated_items:', rated_items)
            for i in rated_items:
                if i in recommended_items:
                    self.Pu[u] += 1
            #         print(self.Pu[u])

In [2]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base_2 = pd.read_csv('mvl/1M.dat', sep='::', names=r_cols, encoding='latin-1', engine='python')
ratings_matrix_2= ratings_base_2.as_matrix()
ratings_matrix_2[:, :2] -= 1

Data_train_2, Data_test_2= train_test_split(ratings_matrix_2, test_size = 0.2, random_state=20)
rs = NBCF(Data_train_2, k = 30, uuCF = 1)
rs.fit()
rs.evaluate_P('1M', Data_test_2) 
# rs.RMSE('100K', Data_test_2, test_size = 0)

NameError: name 'r_cols' is not defined

In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

rs = NBCF(rate_train, k = 30, uuCF = 1)
rs.fit()
rs.RMSE('100K', rate_test, test_size = 0)

User-user CF, RMSE = 0.9951981100882598


In [4]:
rs.evaluate_P('100K', rate_test)

In [6]:
print(rs.Pu)

[ 6.  8.  4.  5.  6.  8.  7.  8.  2.  5.  6.  4.  2.  3.  6.  5.  7. 10.
  7.  5.  4.  4.  8.  6.  6.  7.  3.  8.  6.  5.  9.  3.  3.  1.  6.  6.
  6.  5.  5.  6.  9.  9.  6.  6.  4.  3.  7.  8.  4.  3.  9.  4.  5.  5.
  6.  6.  1.  7.  5.  7.  8.  8.  4.  6.  8.  5.  3.  6.  6.  8.  9.  9.
 10.  8.  2.  8. 10.  4.  2.  9.  2.  2.  1.  7.  8.  5.  5.  3.  1.  7.
  7.  3.  9.  6.  5.  4.  6.  7.  7.  4.  5.  5.  7.  6.  5. 10.  1.  7.
  8.  4.  7.  5.  5.  9. 10.  8.  6.  6.  8.  4.  7.  7.  8.  8.  7.  2.
  5.  7.  6.  2. 10.  8.  7.  7.  4.  7.  4.  4.  5.  4.  6.  7.  3.  5.
  4.  4.  4.  8.  5.  6.  7.  5.  8.  8.  6. 10.  6.  7.  4.  8.  9.  5.
 10.  2.  8.  5.  6.  3. 10.  1.  7.  6.  3.  3. 10.  5.  9.  5.  7.  6.
  8.  9.  4.  7. 10.  1.  4.  8.  4.  5.  7.  5.  1.  9.  7.  4.  3.  7.
  6.  2.  4. 10.  3.  7.  3.  6.  2.  9.  4.  2.  7. 10.  5.  9.  9.  7.
  3.  6.  6.  4.  3.  5.  7.  3.  6.  9.  7.  5.  4.  9.  4.  9.  2.  4.
  8.  8.  9.  2.  3.  3.  4.  3.  7.  5.  4.  6.  6

In [None]:
r_cols = ['user_id', 'movie_id', 'rating']

ratings = pd.read_csv('ex.dat', sep=' ', names=r_cols, encoding='latin-1', engine='python')
print(ratings)

rate_train = ratings.as_matrix()
print(rate_train)

rs = NBCF(rate_train, k = 2, uuCF = 1)
rs.fit()

