In [None]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.model_selection import train_test_split

class NBCF(object):
    def __init__(self, Y, k, uuCF = 1, dist_f = cosine_similarity, limit = 10):
        self.uuCF = uuCF
        self.f = open('danhgiaNBCF.dat', 'a+')
        self.Y = Y if uuCF else Y[:, [1, 0, 2]]
        self.Ybar = None
        self.k = k
        self.limit = limit
        self.dist_func = dist_f
        self.users_count = int(np.max(self.Y[:, 0])) + 1
        self.items_count = int(np.max(self.Y[:, 1])) + 1
        self.Pu = None
        self.Ru = None
    
    def normalizeY(self):
        users = self.Y[:, 0]
        self.Ybar = self.Y.copy()
        self.mu = np.zeros((self.users_count,))
        for i in range(self.users_count):
            ids = np.where(users == i)[0].astype(int)
            ratings = self.Y[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[i] = m
            self.Ybar[ids, 2] = ratings - self.mu[i]
        self.Ybar = sparse.coo_matrix((self.Ybar[:, 2],
            (self.Ybar[:, 1], self.Ybar[:, 0])), (self.items_count, self.users_count))
        self.Ybar = self.Ybar.tocsr()
        
    def similarity(self):
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
    def fit(self):
        self.normalizeY()
        self.similarity()
    
    def pred(self, u, i, normalized = 1):
        ids = np.where(self.Y[:, 1] == i)[0].astype(int)
        if ids == []:
            return 0
        users = (self.Y[ids, 0]).astype(int)
        sim = self.S[u, users]
        a = np.argsort(sim)[-self.k:]
        nearest = sim[a]
        r = self.Ybar[i, users[a]]
        
        if normalized:
            return (r*nearest)[0]/(np.abs(nearest).sum() + 1e-8)

        return (r*nearest)[0]/(np.abs(nearest).sum() + 1e-8) + self.mu[u]
        
        
    def _pred(self, u, i, normalized = 1):
        if self.uuCF: return self.pred(u, i, normalized)
        return self.pred(i, u, normalized)
    
    def RMSE(self, data_size, Data_test, test_size = 0):
        SE = 0
        n_tests = Data_test.shape[0]
        for n in range(n_tests):
            if Data_test[n, 1] == 1681:
                pred = 0
            else:
                pred = self._pred(Data_test[n, 0], Data_test[n, 1], normalized = 0)
            SE += (pred - Data_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)

        print('%s::%d::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.uuCF, self.k, test_size, RMSE))
        self.f.write('%s::%d::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.uuCF, self.k, test_size, RMSE))
        
    def recommend(self, u):
        if self.uuCF:
            ids = np.where(self.Y[:, 0] == u)[0].astype(int)
            items_rated_by_user = self.Y[ids, 1].tolist()
            n = self.items_count
        else:
            ids = np.where(self.Y[:, 1] == u)[0].astype(int)
            items_rated_by_user = self.Y[ids, 0].tolist()
            n = self.users_count
        a = np.zeros((n,))
        recommended_items = []
        for i in range(n):
            if i not in items_rated_by_user:
                a[i] = self._pred(u, i)
        if len(a) < self.limit:
            recommended_items = np.argsort(a)[-len(a):]
        else:
            recommended_items = np.argsort(a)[-self.limit:]
        return recommended_items
        
    def evaluate(self, data_size, Data_test, test_size = 0):
        sum_p = 0
        n = self.users_count if self.uuCF else self.items_count
        self.Pu = np.zeros((n,))
        for u in range(n):
            recommended_items = self.recommend(u)
            ids = np.where(Data_test[:, 0] == u)[0]
            rated_items = Data_test[ids, 1]
            for i in recommended_items:
                if i in rated_items:
                    self.Pu[u] += 1
            sum_p += self.Pu[u]
        p = sum_p/(n * self.limit)
        r = sum_p/(Data_test.shape[0] + 1)
        print('%s::%d::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.uuCF, self.limit, p, r))
        self.f.write('%s::%d::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.uuCF, self.limit, p, r))


In [None]:
from scipy.stats.stats import pearsonr

def pearson(X, Y = None):
    x = X.shape[0]
    y = X.shape[1]
    a = np.zeros((x, x))
    u = np.zeros((x, y))
    temp = 0
    
    for i in range(x):
        for j in range(y):
            u[i][j] = X[i, j]
    for i in range(x):
        for j in range(x):
            temp = pearsonr(u[i], u[j])[0]
            a[i][j] =  temp if not np.isnan(temp) else 0
    
    return a

In [None]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [8]:
rs = NBCF(rate_train, k = 30, uuCF = 1)
rs.fit()
rs.RMSE('100K', rate_test)
rs.f.close()



100K::1::30::cosine_similarity::0::0.9951981100882598



In [None]:
for j in range(410, 510, 10):
    rs.limit = j
    rs.evaluate('100K', rate_test)
rs.f.close()

In [None]:
rs2 = NBCF(rate_train, k = 20, uuCF = 0, dist_f=pearson)
rs2.fit()
rs2.evaluate('100K', rate_test)

In [None]:
for j in range(500, 410, -10):
    rs2.limit = j
    rs2.evaluate('100K', rate_test)
rs2.f.close()

In [None]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('mvl_can/1M_train_01.dat', sep=':', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('mvl_can/1M_test_01.dat', sep=':', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

In [None]:
rs = NBCF(rate_train, k = 30)
rs.fit()

In [None]:
for i in range(1, 25, 1):
    rs.k = i
    rs.RMSE('1M', rate_test)