In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.model_selection import train_test_split

class NBCF(object):
    def __init__(self, Y, k, uuCF = 1, dist_f = cosine_similarity, limit = 10):
        self.uuCF = uuCF
        self.f = open('danhgia22_04.dat', 'a+')
        self.Y = Y if uuCF else Y[:, [1, 0, 2]]
        self.Ybar = None
        self.k = k
        self.limit = limit
        self.dist_func = dist_f
        self.users_count = int(np.max(self.Y[:, 0])) + 1
        self.items_count = int(np.max(self.Y[:, 1])) + 1
        self.Pu = None
    
    def normalizeY(self):
        users = self.Y[:, 0]
        self.Ybar = self.Y.copy()
        self.mu = np.zeros((self.users_count,))
        for i in range(self.users_count):
            ids = np.where(users == i)[0].astype(int)
            ratings = self.Y[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[i] = m
            self.Ybar[ids, 2] = ratings - self.mu[i]
        self.Ybar = sparse.coo_matrix((self.Ybar[:, 2],
            (self.Ybar[:, 1], self.Ybar[:, 0])), (self.items_count, self.users_count))
        self.Ybar = self.Ybar.tocsr()
        
    def similarity(self):
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
    def fit(self):
        self.normalizeY()
        self.similarity()
    
    def pred(self, u, i, normalized = 1):
        if self.Ybar[i, u] != 0 and normalized:
            return self.Ybar[i, u]
        if self.Ybar[i, u] != 0 and normalized == 0:
            return self.Ybar[i, u] + self.mu[u]
        ids = np.where(self.Y[:, 1] == i)[0].astype(int)
        users = (self.Y[ids, 0]).astype(int)
        sim = self.S[u, users]
        a = np.argsort(sim)[-self.k:]
        nearest = sim[a]
        r = self.Ybar[i, users[a]]
        
#         print(ids, users, sim, a, nearest, r)
        
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest)[0]/(np.abs(nearest).sum() + 1e-8)

        return (r*nearest)[0]/(np.abs(nearest).sum() + 1e-8) + self.mu[u]
        
        
    def _pred(self, u, i, normalized = 1):
        if self.uuCF: return self.pred(u, i, normalized)
        return self.pred(i, u, normalized)
    
    def RMSE(self, data_size, Data_test, test_size = 0):
        SE = 0
        n_tests = Data_test.shape[0]
        for n in range(n_tests):
            if Data_test[n, 1] == 1681:
                pred = 0
            else:
                pred = self._pred(Data_test[n, 0], Data_test[n, 1], normalized = 0)
            SE += (pred - Data_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        if self.uuCF == 1:
            print('%s::1::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k, test_size, RMSE))
            self.f.write('%s::1::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k, test_size, RMSE))
        else:
            print('%s::0::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k, test_size, RMSE))
            self.f.write('%s::0::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k, test_size, RMSE))
        self.f.close()
        
    def recommend(self, u):
        ids = np.where(self.Y[:, 0] == u)[0].astype(int)
        items_rated_by_user = self.Y[ids, 1].tolist()
        a = np.zeros((self.items_count,))
        recommended_items = []
        for i in range(self.items_count):
            if i not in items_rated_by_user:
                a[i] = self._pred(u, i)
        if len(a) < self.limit:
            recommended_items = a
        else:
            recommended_items = np.argsort(a)[-self.limit:]
#             print(a[recommended_items])



        return recommended_items
        
    def evaluate_P(self, data_size, Data_test, test_size = 0):
        sum = 0
        p = 0
        self.Pu = np.zeros((self.users_count,))
        for u in range(self.users_count):

            recommended_items = self.recommend(u)
#             print('recommend_items: ', recommended_items)
            ids = np.where(Data_test[:, 0] == u)[0]
            rated_items = Data_test[ids, 1]
#             print('rated_items:', rated_items)
            for i in rated_items:
                if i in recommended_items:
                    self.Pu[u] += 1
            sum += self.Pu[u]
        
        p = sum/(self.users_count * self.limit)
        return p

In [None]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base_2 = pd.read_csv('mvl/1M.dat', sep='::', names=r_cols, encoding='latin-1', engine='python')
ratings_matrix_2= ratings_base_2.as_matrix()
ratings_matrix_2[:, :2] -= 1

Data_train_2, Data_test_2= train_test_split(ratings_matrix_2, test_size = 0.1, random_state = 5)
rs = NBCF(Data_train_2, k = 30, uuCF = 1)
rs.fit()
rs.evaluate_P('1M', Data_test_2) 
rs.RMSE('1M', Data_test_2, test_size = 0.1)

In [2]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

rs = NBCF(rate_train, k = 30, uuCF = 1)
rs.fit()
rs.RMSE('100K', rate_test, test_size = 0)

User-user CF, RMSE = 0.9951981100882598


In [None]:
rs.evaluate_P('100K', rate_test)
print(rs.Pu)

[ 0.          0.          0.         ... -0.99999971  0.
  0.        ]
[0.99999985 0.99999985 0.99999985 0.9999999  0.99999992 0.99999992
 0.99999994 1.06683183 1.20978901 1.41566893]
recommend_items:  [1649 1644 1635 1641  813 1499 1466 1642 1367 1471]
rated_items: [ 16  46  63  89  91 112 221 226 227 252]
[ 0.         -0.25661976 -0.52307341 ... -0.9999995   0.
  0.        ]
[0.99999991 0.99999991 0.99999991 0.99999991 0.99999996 0.99999996
 0.99999997 1.20679457 1.27600054 2.06941146]
recommend_items:  [1630 1650 1622 1649 1641  118 1499 1642 1367 1501]
rated_items: [256 278 298 300 302 306 307 312 314 315]
[ 0.2517868  -0.21521247 -0.29342873 ... -0.99999937  0.
  0.        ]
[1.10467201 1.10467201 1.3902153  1.44619182 1.45563587 1.51746273
 1.5889212  1.99999907 1.99999959 1.99999959]
recommend_items:  [ 783  787  912 1204  838 1490  816  598  851  829]
rated_items: [298 299 317 323 329 340 344 347 349 350]
[ 0.21284519 -0.36220661 -0.4231913  ...  0.          0.
  0.        ]
[0

[ 0.30132589 -0.03394341 -0.32899513 ... -0.99999975  0.
  0.        ]
[1.9999991  1.9999991  1.9999991  1.99999933 1.99999933 1.99999933
 1.99999949 1.99999963 1.99999978 2.99999944]
recommend_items:  [1485 1492 1493  851  912  829 1670 1659 1553 1658]
rated_items: [ 11  49  99 152 173 183 281 440 479 894]
[-0.11546332 -0.17386915 -0.31954798 ... -0.99999971  0.
  0.        ]
[0.99999985 0.99999989 0.99999992 0.99999993 0.99999994 1.99999898
 1.99999898 1.99999898 1.99999898 1.99999959]
recommend_items:  [ 813 1466  118 1624 1651 1401 1493 1492 1485 1553]
rated_items: [258 268 269 285 293 299 301 342 479 873]
[ 0.2480047   0.         -0.27259678 ... -0.9999999   0.
  0.        ]
[0.99999976 0.99999981 0.99999986 0.99999987 0.9999999  0.99999991
 1.00927042 1.21751447 1.23494845 1.72982711]
recommend_items:  [1630 1672 1234  813 1611 1624 1396 1553 1642 1501]
rated_items: [   6   27  230  241  257  293  300  530  537 1012]
[ 0.20798244 -0.10970539 -0.55473415 ...  0.99999983  0.
  0.  

[ 0.27518475 -0.09570588 -0.44373269 ...  0.          0.
  0.        ]
[0.99999977 0.99999977 0.99999977 0.99999981 0.99999988 0.99999995
 1.16921681 1.35783185 1.52741532 1.99999963]
recommend_items:  [1641 1644 1650 1204 1655  118  389 1145 1367 1625]
rated_items: [ 21  49  78 116 117 120 173 180 272 596]
[ 0.         -0.11192081 -0.27759513 ...  0.          0.
  0.        ]
[1.00072336 1.04011233 1.05312952 1.07759407 1.58787084 1.99999776
 1.99999955 1.99999955 1.99999955 2.99999664]
recommend_items:  [1426 1590 1232 1642 1553 1659 1492 1485 1493 1658]
rated_items: [ 68 171 183 195 214 229 234 449 584 796]
[ 0.         -0.04816814 -0.45769524 ...  0.          0.
  0.        ]
[0.99999984 0.99999989 0.99999991 0.99999995 1.18839659 1.23639217
 1.33344827 1.99999949 1.9999997  2.99999954]
recommend_items:  [1672 1499 1630 1466 1232 1426 1553 1490 1659 1658]
rated_items: [ 249  293  408  587  743  824  830  863  925 1058]
[ 0.         -0.0612806  -0.38078687 ... -0.99999946  0.
  0.  

In [17]:
r_cols = ['user_id', 'movie_id', 'rating']

ratings = pd.read_csv('ex.dat', sep=' ', names=r_cols, encoding='latin-1', engine='python')
print(ratings)

rate_train = ratings.as_matrix()
print(rate_train)

rs = NBCF(rate_train, k = 2, uuCF = 1)
rs.fit()

rs.pred(1, 1)

    user_id  movie_id  rating
0         0         0     5.0
1         0         1     4.0
2         0         3     2.0
3         0         4     2.0
4         1         0     5.0
5         1         2     4.0
6         1         3     2.0
7         1         4     0.0
8         2         0     2.0
9         2         2     1.0
10        2         3     3.0
11        2         4     4.0
12        3         0     0.0
13        3         1     0.0
14        3         3     4.0
15        4         0     1.0
16        4         3     4.0
17        5         1     2.0
18        5         2     1.0
19        6         2     1.0
20        6         3     4.0
21        6         4     5.0
[[0. 0. 5.]
 [0. 1. 4.]
 [0. 3. 2.]
 [0. 4. 2.]
 [1. 0. 5.]
 [1. 2. 4.]
 [1. 3. 2.]
 [1. 4. 0.]
 [2. 0. 2.]
 [2. 2. 1.]
 [2. 3. 3.]
 [2. 4. 4.]
 [3. 0. 0.]
 [3. 1. 0.]
 [3. 3. 4.]
 [4. 0. 1.]
 [4. 3. 4.]
 [5. 1. 2.]
 [5. 2. 1.]
 [6. 2. 1.]
 [6. 3. 4.]
 [6. 4. 5.]]
[ 1 13 17] [0 3 5] [ 0.83307435 -0.3986205  -

0.47942560011376045