In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
class NBCF(object):
    def __init__(self, Y, k, uuCF = 1, dist_f = cosine_similarity):
        self.uuCF = uuCF
        self.f = open('danhgia22_04.dat', 'a+')
        self.Y = Y if uuCF else Y[:, [1, 0, 2]]
        self.Ybar = None
        self.k = k
        self.dist_func = dist_f
        self.users_count = int(np.max(self.Y[:, 0])) + 1
        self.items_count = int(np.max(self.Y[:, 1])) + 1
    
    def normalizeY(self):
        users = self.Y[:, 0]
        self.Ybar = self.Y.copy()
        self.mu = np.zeros((self.users_count,))
        for i in range(self.users_count):
            ids = np.where(users == i)[0].astype(int)
            ratings = self.Y[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[i] = m
            self.Ybar[ids, 2] = ratings - self.mu[i]
        self.Ybar = sparse.coo_matrix((self.Ybar[:, 2],
            (self.Ybar[:, 1], self.Ybar[:, 0])), (self.items_count, self.users_count))
        self.Ybar = self.Ybar.tocsr()
#         print(self.Ybar)
        
    def similarity(self):
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
    def fit(self):
        self.normalizeY()
        self.similarity()
    
    def pred(self, u, i, normalized = 1):
        if self.Ybar[i, u] != 0 and normalized:
            return self.Ybar[i, u]
        if self.Ybar[i, u] != 0 and normalized == 0:
            return self.Ybar[i, u] + self.mu[u]
        ids = np.where(self.Y[:, 1] == i)[0].astype(int)
        users = (self.Y[ids, 0]).astype(int)
        sim = self.S[u, users]
        a = np.argsort(sim)[-self.k:]
        nearest = sim[a]
        r = self.Ybar[i, users[a]]
        
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest)[0]/(np.abs(nearest).sum() + 1e-8)

        return (r*nearest)[0]/(np.abs(nearest).sum() + 1e-8) + self.mu[u]
        
        
    def _pred(self, u, i, normalized = 1):
        if self.uuCF: return self.pred(u, i, normalized)
        return self.pred(i, u, normalized)
    
    def RMSE(self, data_size, Data_test, test_size):
        SE = 0
        n_tests = Data_test.shape[0]
        for n in range(n_tests):
            if Data_test[n, 1] == 1681:
                pred = 0
            else:
                pred = self._pred(Data_test[n, 0], Data_test[n, 1], normalized = 0)
            SE += (pred - Data_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        if self.uuCF == 1:
            print('User-user CF, RMSE =', RMSE)
            self.f.write('%s::1::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k, test_size, RMSE))
        else:
            print('Item-item CF, RMSE =', RMSE)
            self.f.write('%s::0::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k, test_size, RMSE))
        self.f.close()
        

In [2]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

rs = NBCF(rate_train, k = 30, uuCF = 1)
rs.fit()
rs.RMSE('100K', rate_test, test_size = 0)

User-user CF, RMSE = 0.9951981100882598


In [3]:
rs = NBCF(rate_train, k = 30, uuCF = 0)
rs.fit()
rs.RMSE('100K', rate_test, test_size = 0)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Item-item CF, RMSE = 0.9867912132705384


In [4]:
r_cols = ['user_id', 'movie_id', 'rating']

ratings = pd.read_csv('ex.dat', sep=' ', names=r_cols, encoding='latin-1', engine='python')
print(ratings)

rate_train = ratings.as_matrix()
print(rate_train)

rs = NBCF(rate_train, k = 2, uuCF = 1)
rs.fit()

for i in range(5):
    for j in range(7):
        print(rs.pred(j, i))

    user_id  movie_id  rating
0         0         0     5.0
1         0         1     4.0
2         0         3     2.0
3         0         4     2.0
4         1         0     5.0
5         1         2     4.0
6         1         3     2.0
7         1         4     0.0
8         2         0     2.0
9         2         2     1.0
10        2         3     3.0
11        2         4     4.0
12        3         0     0.0
13        3         1     0.0
14        3         3     4.0
15        4         0     1.0
16        4         3     4.0
17        5         1     2.0
18        5         2     1.0
19        6         2     1.0
20        6         3     4.0
21        6         4     5.0
[[0. 0. 5.]
 [0. 1. 4.]
 [0. 3. 2.]
 [0. 4. 2.]
 [1. 0. 5.]
 [1. 2. 4.]
 [1. 3. 2.]
 [1. 4. 0.]
 [2. 0. 2.]
 [2. 2. 1.]
 [2. 3. 3.]
 [2. 4. 4.]
 [3. 0. 0.]
 [3. 1. 0.]
 [3. 3. 4.]
 [4. 0. 1.]
 [4. 3. 4.]
 [5. 1. 2.]
 [5. 2. 1.]
 [6. 2. 1.]
 [6. 3. 4.]
 [6. 4. 5.]]
1.75
2.25
-0.5
-1.3333333333333333
-1.5
0.176