In [3]:
import numpy as np 
import pandas as pd 
# Reading user file:
u_cols =  ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
n_users = users.shape[0]
print('Number of users:', n_users)

#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols)
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols)

rate_train = ratings_base.values[:,:3]
rate_test = ratings_test.values[:,:3]
rate_train[:,:2]-=1
rate_test[:,:2]-=1
print('Number of traing rates:', rate_train.shape[0])
print('Number of test rates:', rate_test.shape[0])


#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols , encoding='latin-1')

n_items = items.shape[0]
print('Number of items:', n_items)

X0 = items.as_matrix()
X_train_counts = X0[:, -19:]

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
X = transformer.fit_transform(X_train_counts.tolist()).toarray()


def get_items_rated_by_user(rate_matrix, user_id):
    y = rate_matrix[:,0]
    ids = np.where(y == user_id)[0] 
    item_ids = rate_matrix[ids, 1]
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)


#thuc hien tinhtoan bang model
from sklearn.linear_model import Ridge
from sklearn import linear_model

class Model:

    def __init__(self, lamda,X,n_users):
        self.lamda = lamda
        self.X = X
        self.n_users = n_users
    
    def fit(self,rate_train):
        self.users = set(rate_train[:,0])
        #X is data train, duoi dang tf-idf cua item profile
        self.d = self.X.shape[1] # data dimension
        self.W = np.zeros((self.d, self.n_users))#model for user
        self.b = np.zeros(self.n_users)#bias for model
        for n in range(self.n_users):
            if n in self.users:
                ids, scores = get_items_rated_by_user(rate_train, n)
                model = Ridge(alpha=0.01, fit_intercept  = True)
                Xhat = self.X[ids, :]
                model.fit(Xhat, scores)
                self.W[:, n] = model.coef_
                self.b[n] = model.intercept_
        self.Ypredict  = self.X.dot(self.W) + self.b
    
    def evaluate(self, rate_test):
        se = 0
        cnt = 0
        user_test = set(rate_test[:,0])
        for user in user_test:
            if user in self.users:
                ids, scores_truth = get_items_rated_by_user(rate_test, user)
                scores_pred = self.Ypredict[ids, user]
                e = scores_truth - scores_pred 
                se += (e*e).sum(axis = 0)
                cnt += e.size
        return np.sqrt(se/cnt)
    

def split_fold(rate_train, number_fold):
    index_permutation = np.random.permutation(rate_train.shape[0])
    folds = np.array_split(index_permutation , number_fold)
    return folds

def cross_validation(rate_train,list_lamda):
    dict_results = dict()
    for lamda in list_lamda:
        dict_results[lamda]=compute_mean_error(rate_train , lamda)
        print(lamda, dict_results[lamda])
    return dict_results
                
def compute_mean_error(rate_train , lamda):
    #chia thanh nhieu fold va tien hanh test tren tung fold do
    folds  = split_fold(rate_train , 5)
    sum_error_fold = 0
    for index_fold in range(5):
        fold_test = rate_train[folds[index_fold]]
        fold_train = np.delete(rate_train , fold_test, axis=0)
        model = Model(lamda , X_train_counts , 943)
        model.fit(fold_train)
        sum_error_fold += model.evaluate(fold_test)
    return sum_error_fold / 5

def read_data(file_name):
    data = pd.read_csv('ml-100k/ua/'+file_name, sep='\t', names=r_cols)
    data = data.values[:,:3]
    return data

model = Model(0.1 , X_train_counts,943)
model.fit(rate_train)
print(model.evaluate(rate_test))

# list_k = [0.01 , 0.1 , 1 , 10 , 100]
# result  = cross_validation(rate_train,list_k)
# import pickle
# pickle.dump(result , open("result_content", "wb"))

Number of users: 943
Number of traing rates: 90570
Number of test rates: 9430
Number of items: 1682
1.2760653673912772
