In [2]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
from sklearn.model_selection import train_test_split
import random

class MF(object):
    def __init__(self, Y, k, X = None, W = None, lamda = 0.1,
                dist_func = cosine_similarity, learning_rate = 0.5, max_iter = 1000, user_based = 1, limit = 10):
#         self.f = open('danhgiaMF.dat', 'a+')
        self.Y = Y
        self.lamda = lamda
        self.k = k
        self.dist_func = dist_func
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.user_based = user_based
        self.limit = limit
        self.users_count = int(np.max(self.Y[:, 0])) + 1
        self.items_count = int(np.max(self.Y[:, 1])) + 1
        self.ratings_count = Y.shape[0]
        if X == None:
            self.X = np.random.randn(self.items_count, k)
        if W == None:
            self.W = np.random.randn(k, self.users_count)
        self.Ybar = self.Y.copy()
        
    def normalizeY(self):
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.users_count
        else:
            user_col = 1
            item_col = 0 
            n_objects = self.items_count
        users = self.Y[:, user_col]
        self.mu = np.zeros((n_objects,))
        for i in range(n_objects):
            ids = np.where(users == i)[0].astype(int)
            ratings = self.Y[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[i] = m
            self.Ybar[ids, 2] = ratings - self.mu[i]
#         print(self.Ybar)
    
    def get_user_rated_item(self, i):
        ids = np.where(i == self.Ybar[:, 1])[0].astype(int)
        users = self.Ybar[ids, 0].astype(int)
        ratings = self.Ybar[ids, 2]
        
        return (users, ratings)
        

    def get_item_rated_by_user(self, u):
        ids = np.where(u == self.Ybar[:, 0])[0].astype(int)
        items = self.Ybar[ids, 1].astype(int)
        ratings = self.Ybar[ids, 2]
        
        return (items, ratings)
    
    def updateX(self):
        for i in range(self.items_count):
            users, ratings = self.get_user_rated_item(i)
            Wi = self.W[:, users]
            a = -(ratings - self.X[i, :].dot(Wi)).dot(Wi.T)/self.ratings_count + \
            self.lamda*self.X[i, :]
            self.X[i, :] -= self.learning_rate*(a).reshape((self.k,))
        
    def updateW(self):
        for u in range(self.users_count):
            items, ratings = self.get_item_rated_by_user(u)
            Xn = self.X[items, :]
            a = -Xn.T.dot(ratings - Xn.dot(self.W[:, u]))/self.ratings_count + self.lamda*self.W[:, u]
            self.W[:, u] -= self.learning_rate*(a).reshape((self.k,))
        
    def fit(self, x, data_size, Data_test, test_size = 0):
        self.normalizeY()
        for i in range(self.max_iter):
            self.updateX()
            self.updateW()
            if (i + 1) % x == 0:
                print(i + 1)
                self.RMSE(data_size, Data_test, test_size = 0)
                self.evaluate(data_size, Data_test, test_size = 0)
            
    def pred(self, u, i):
        u = int(u)
        i = int(i)
        if self.user_based:
            bias = self.mu[u]
        else: 
            bias = self.mu[i]
        pred = self.X[i, :].dot(self.W[:, u]) + bias
        
        if pred < 1:
            return 1 
        if pred > 5: 
            return 5 
        return pred
    
    def recommend(self, u):
        ids = np.where(self.Y[:, 0] == u)[0].astype(int)
        items_rated_by_user = self.Y[ids, 1].tolist()
        if self.user_based:
            bias = self.mu[u]
        else: 
            bias = self.mu[i]
        pred = self.X.dot(self.W[:, u]) + bias
        a = np.zeros((self.items_count,))
        recommended_items = []
        for i in range(self.items_count):
            if i not in items_rated_by_user:
                a[i] = pred[i]
        if len(a) < self.limit:
            recommended_items = np.argsort(a)[-self.items_count:]
        else:
            recommended_items = np.argsort(a)[-self.limit:]
        recommended_items = np.where(a[:] > 0)[0].astype(int)

#         return random.sample(list(recommended_items), self.limit)
        return recommended_items[:self.limit]
#         return recommended_items
    
    def RMSE(self, data_size, Data_test, test_size = 0):
        n_tests = Data_test.shape[0]
        SE = 0
        for n in range(n_tests):
            pred = self.pred(Data_test[n, 0], Data_test[n, 1])
            SE += (pred - Data_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        print('RMSE =', RMSE)
        if self.user_based:
            print('%s::1::%d::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k, self.max_iter, test_size, RMSE))
#             self.f.write('%s::1::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k,self.max_iter, test_size, RMSE))
        else:
            print('%s::0::%d::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k, self.max_iter, test_size, RMSE))
#             self.f.write('%s::0::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.k, test_size, RMSE))
#         self.f.close()
    
    def evaluate(self, data_size, Data_test, test_size = 0):
        sum_p = 0
        sum_r = 0
        self.Pu = np.zeros((self.users_count,))
        for u in range(self.users_count):
            recommended_items = self.recommend(u)
            ids = np.where(Data_test[:, 0] == u)[0]
            rated_items = Data_test[ids, 1]
            for i in rated_items:
                if i in recommended_items:
                    self.Pu[u] += 1
                if Data_test[i, 2] > 3:
                    sum_r += 1
            sum_p += self.Pu[u]
        
        p = sum_p/(self.users_count * self.limit)
        r = sum_p/sum_r
        print('%s::0::%d::%d::cosine_similarity::%r::%r::%r\r\n' % (str(data_size), self.k, self.max_iter, test_size, p, r))
        return p, r

In [None]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
from sklearn.model_selection import train_test_split
import random

class MF2(object):
    def __init__(self, Y, n_factors = 2, X = None, W = None, lamda = 0.1, lr = 2, n_epochs = 50, user_based = 1, 
                 limit = 10):
#         self.f = open('danhgiaMF.dat', 'a+')
        self.Y = Y
        self.lamda = lamda
        self.n_factors = n_factors
        self.lr = lr
        self.n_epochs = n_epochs
        self.user_based = user_based
        self.limit = limit
        self.users_count = int(np.max(self.Y[:, 0])) + 1
        self.items_count = int(np.max(self.Y[:, 1])) + 1
        self.ratings_count = Y.shape[0]
        if X == None:
            self.X = np.random.randn(self.items_count, n_factors)
        if W == None:
            self.W = np.random.randn(n_factors, self.users_count)
        self.Ybar = self.Y.copy()
        
        self.bi = np.random.randn(self.items_count)
        self.bu = np.random.randn(self.users_count)
        self.n_ratings = self.Y.shape[0]
        
    def normalizeY(self):
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.users_count
        else:
            user_col = 1
            item_col = 0 
            n_objects = self.items_count
        users = self.Y[:, user_col]
        self.mu = np.zeros((n_objects,))
        for i in range(n_objects):
            ids = np.where(users == i)[0].astype(int)
            ratings = self.Y[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[i] = m
            self.Ybar[ids, 2] = ratings - self.mu[i]
#         print(self.Ybar)
    
    def get_user_rated_item(self, i):
        ids = np.where(i == self.Ybar[:, 1])[0].astype(int)
        users = self.Ybar[ids, 0].astype(int)
        ratings = self.Ybar[ids, 2]
        
        return (users, ratings)
        

    def get_item_rated_by_user(self, u):
        ids = np.where(u == self.Ybar[:, 0])[0].astype(int)
        items = self.Ybar[ids, 1].astype(int)
        ratings = self.Ybar[ids, 2]
        
        return (items, ratings)
    
    def updateX(self):
        for m in range(self.items_count):
            users, ratings = self.get_user_rated_item(m)
            Wm = self.W[:, users]
            b = self.bu[users]
            sum_grad_xm = np.full(shape = (self.X[m].shape) , fill_value = 1e-8)
            sum_grad_bm = 1e-8
            for i in range(50):
                xm = self.X[m]
                error = xm.dot(Wm) + self.bi[m] + b - ratings
                grad_xm = error.dot(Wm.T)/self.n_ratings + self.lamda*xm
                grad_bm = np.sum(error)/self.n_ratings
                sum_grad_xm += grad_xm**2
                sum_grad_bm += grad_bm**2
                # gradient descent
                self.X[m] -= self.lr*grad_xm.reshape(-1)/np.sqrt(sum_grad_xm)
                self.bi[m] -= self.lr*grad_bm/np.sqrt(sum_grad_bm)
        
    def updateW(self):
        for n in range(self.users_count):
            items, ratings = self.get_item_rated_by_user(n)
            Xn = self.X[items, :]
            b = self.bi[items]
            sum_grad_wn = np.full(shape = (self.W[:, n].shape) , fill_value = 1e-8).T
            sum_grad_bn = 1e-8
            for i in range(50):
                wn = self.W[:, n]
                error = Xn.dot(wn) + self.bu[n] + b - ratings
                grad_wn = Xn.T.dot(error)/self.n_ratings + self.lamda*wn
                grad_bn = np.sum(error)/self.n_ratings
                sum_grad_wn += grad_wn**2
                sum_grad_bn += grad_bn**2
                # gradient descent
                self.W[:, n] -= self.lr*grad_wn.reshape(-1)/np.sqrt(sum_grad_wn)
                self.bu[n] -= self.lr*grad_bn/np.sqrt(sum_grad_bn)

    def fit(self, x, data_size, Data_test, test_size = 0):
        self.normalizeY()
        for i in range(self.n_epochs):
            self.updateW()
            self.updateX()
            if (i + 1) % x == 0:
                print(i + 1)
                self.RMSE(Data_test, test_size = 0)
                self.evaluate(data_size, Data_test, test_size = 0)
            
    def pred(self, u, i):
        u = int(u)
        i = int(i)
        if self.user_based:
            bias = self.mu[u]
        else: 
            bias = self.mu[i]
        pred = self.X[i, :].dot(self.W[:, u]) + self.bi[i] + self.bu[u] + bias
        
        if pred < 1:
            return 1 
        if pred > 5: 
            return 5 
        return max(0, min(5, pred))
    
    def recommend(self, u):
        ids = np.where(self.Y[:, 0] == u)[0].astype(int)
        items_rated_by_user = self.Y[ids, 1].tolist()
        a = np.zeros((self.items_count,))
        recommended_items = []
        pred = self.X.dot(self.W[:, u])
        for i in range(self.items_count):
            if i not in items_rated_by_user:
                if self.user_based:
                    bias = self.mu[u]
                else: 
                    bias = self.mu[i]
                a[i] = pred[i] +self.bi[i] + self.bu[u] + bias
        if len(a) < self.limit:
            recommended_items = np.argsort(a)[-self.items_count:]
        else:
            recommended_items = np.argsort(a)[-self.limit:]
        recommended_items = np.where(a[:] > 0)[0].astype(int)

#         return random.sample(list(recommended_items), self.limit)
        return recommended_items[:self.limit]
#         return recommended_items
    
    def RMSE(self, Data_test, test_size = 0, data_size = '100K'):
        n_tests = Data_test.shape[0]
        SE = 0
        for n in range(n_tests):
            pred = self.pred(Data_test[n, 0], Data_test[n, 1])
            SE += (pred - Data_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        print('RMSE =', RMSE)
        if self.user_based:
            print('%s::1::%d::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.n_factors, self.n_epochs, test_size, RMSE))
#             self.f.write('%s::1::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.n_factors,self.n_epochs, test_size, RMSE))
        else:
            print('%s::0::%d::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.n_factors, self.n_epochs, test_size, RMSE))
#             self.f.write('%s::0::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), self.n_factors, test_size, RMSE))
#         self.f.close()
        return RMSE
    
    def evaluate(self, data_size, Data_test, test_size = 0):
        sum_p = 0
        sum_r = 0
        self.Pu = np.zeros((self.users_count,))
        for u in range(self.users_count):
            recommended_items = self.recommend(u)
            ids = np.where(Data_test[:, 0] == u)[0]
            rated_items = Data_test[ids, 1]
            for i in recommended_items:
                if i in rated_items:
                    self.Pu[u] += 1
                if Data_test[i, 2] >= 4.5:
                    sum_r += 1
            sum_p += self.Pu[u]
        print('sump', sum_p, sum_r)
        p = sum_p/(self.users_count * self.limit)
        r = sum_p/sum_r
        print('%s::0::%d::%d::cosine_similarity::%r::%r::%r\r\n' % (str(data_size), self.n_factors, self.n_epochs, test_size, p, r))
        return p, r

In [5]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestemp']
ratings = pd.read_csv('mvl/1M.dat', sep = '::', names = r_cols, encoding='latin-1')
Y_data = ratings.as_matrix()
for testSize in np.arange(0.2, 0.6, 0.1):
    Data_train, Data_test= train_test_split(Y_data, test_size=testSize, random_state=20)
    for j in [0, 1]:
        rs = MF(Data_train, k = 2, lamda = 0.1, learning_rate = 2, max_iter = 10, user_based = j)
        rs.fit()
        rs.RMSE('1M', Data_test, test_size = testSize)

  
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


RMSE = 0.981551914764402
1M::0::2::cosine_similarity::0.2::0.981551914764402

RMSE = 1.037606663188771
1M::1::2::cosine_similarity::0.2::1.037606663188771

RMSE = 0.9806335336422325
1M::0::2::cosine_similarity::0.30000000000000004::0.9806335336422325

RMSE = 1.0369936764645347
1M::1::2::cosine_similarity::0.30000000000000004::1.0369936764645347

RMSE = 0.980216476770738
1M::0::2::cosine_similarity::0.4000000000000001::0.980216476770738

RMSE = 1.0362479676878835
1M::1::2::cosine_similarity::0.4000000000000001::1.0362479676878835

RMSE = 0.9817307666356478
1M::0::2::cosine_similarity::0.5000000000000001::0.9817307666356478

RMSE = 1.0378532872672301
1M::1::2::cosine_similarity::0.5000000000000001::1.0378532872672301



In [None]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('mvl/10M.dat', sep = '::', names = r_cols, encoding='latin-1')
Y_data = ratings.as_matrix()

for testSize in np.arange(0.1, 0.5, 0.1):
    Data_train, Data_test= train_test_split(Y_data, test_size=testSize, random_state=20)
    for x in [2, 5, 10]:
        rs = MF(Data_train, k = x, lamda = 0.1, learning_rate = 2, max_iter = 10, user_based = 0)
        rs.fit()
        rs.RMSE('10M', Data_test, test_size = testSize)

  
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [None]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestemp']

ratings_base_1 = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1', engine='python')
ratings_test_1 = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1', engine='python')

ratings_matrix_1 = ratings_base_1.as_matrix()
ratings_matrix = ratings_test_1.as_matrix()

ratings_matrix_1[:, :2] -= 1
ratings_matrix[:, :2] -= 1

rs_1 = MF(ratings_matrix_1, k = 100, lamda = 0.01, learning_rate = 2)
rs.fit(100, 

rs_1.RMSE("100K", ratings_matrix, test_size = testSize)

In [None]:
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv('ex.dat', sep = ' ', names = r_cols, encoding='latin-1')
Y_data = ratings.as_matrix()

rs = MF(Y_data, k = 2)

rs.fit()
rs.pred(6, 1)

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

r_cols = ['dataset', 'uMF', 'k','dist_func', 'test_size', 'RMSE']

RMSEs = pd.read_csv('danhgiaMF.dat', sep='::', names=r_cols, encoding='latin-1', engine='python')

rs = RMSEs.as_matrix()
print(RMSEs)
dataset = rs[:, 0]
uMF = rs[:, 1]

for n in ['100K', '1M']:
    ids_ii = np.where((dataset == n) & (uMF == 0))[0].astype(np.int32)
    ids_uu = np.where((dataset == n) & (uMF == 1))[0].astype(np.int32)
    items_ii = rs[ids_ii, 5]
    items_uu = rs[ids_uu, 5]
    t_ii = range(1, ids_ii.shape[0] + 1, 1)
    t_uu = range(1, ids_uu.shape[0] + 1, 1)
    plt.plot(t_ii, items_ii, 'g^', t_uu, items_uu, 'bs')
    plt.show()

In [4]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('1M_train1.dat', sep=':', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('1M_test1.dat', sep=':', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

# # indices start from 0
# rate_train[:, :2] -= 1
# rate_test[:, :2] -= 1

# for i in np.arange(1, 11, 1):
rs = MF(rate_train, k = 10, lamda = 1, learning_rate = 2, max_iter = 1000)
rs.fit(100, '100K', rate_test)

100
RMSE = 2.08723384117777
100K::1::10::1000::cosine_similarity::0::2.08723384117777

100K::0::10::1000::cosine_similarity::0::0.011672185430463576::0.00236640708915145



KeyboardInterrupt: 

In [None]:
100
RMSE = 0.8913244821881572
100K::1::10::1000::cosine_similarity::0::0.8913244821881572

100K::0::10::1000::cosine_similarity::0::0.013509933774834438::0.0027389903329752955

200
RMSE = 0.8913244821881572
100K::1::10::1000::cosine_similarity::0::0.8913244821881572

100K::0::10::1000::cosine_similarity::0::0.013509933774834438::0.0027389903329752955

300
RMSE = 0.8913244821881572
100K::1::10::1000::cosine_similarity::0::0.8913244821881572

100K::0::10::1000::cosine_similarity::0::0.013509933774834438::0.0027389903329752955

400
RMSE = 0.8913244821881572
100K::1::10::1000::cosine_similarity::0::0.8913244821881572

100K::0::10::1000::cosine_similarity::0::0.013509933774834438::0.0027389903329752955

500
RMSE = 0.8913244821881572
100K::1::10::1000::cosine_similarity::0::0.8913244821881572

100K::0::10::1000::cosine_similarity::0::0.013509933774834438::0.0027389903329752955

1
30
RMSE = 1.061960663169275
1.061960663169275
2
30
RMSE = 1.062749446086973
1.062749446086973
3
30
RMSE = 1.0632309128657713
1.0632309128657713
4
30
RMSE = 1.065408051857591
1.065408051857591
5
30
RMSE = 1.0658117505594327
1.0658117505594327
6
30
RMSE = 1.0669805777207546
1.0669805777207546
7
30
RMSE = 1.0664761227724426
1.0664761227724426
8
30
RMSE = 1.069384631857391
1.069384631857391
9
30
RMSE = 1.0672201096462794
1.0672201096462794
10
30
RMSE = 1.0709619647969306
1.0709619647969306