In [3]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


import numpy as np

Reading the data, we will mainly use the rating_df for learning and testing 

In [4]:
movies_df = pd.read_csv('ml-1m/movies.dat',
                        delimiter='::', engine= 'python', header=None,
                        names=['movie_name', 'genre'], encoding = "ISO-8859-1")

In [5]:
rating_df = pd.read_csv('ml-1m/ratings.dat',
                        delimiter='::', engine= 'python', header=None,
                        names=['user_id', 'Movie_Id','Ratings','Time_stamp'], encoding = "ISO-8859-1")

In [6]:
users_df = pd.read_csv('ml-1m/users.dat',
                        delimiter='::', header=None,
                        names=['user_id', 'Gender','Age','Occupation','Zip-Code'], encoding = "ISO-8859-1")

  users_df = pd.read_csv('ml-1m/users.dat',


In [7]:
# rating_df.loc[2]['movie_id']

Implementing the cross validation method, this funtion return a dataframe with randomly assigmnet number of folds. Prepered dataframe like that we then use to learn and test the models, by choosing one of the fold as a valid test, and the other as the train sets

In [9]:
def cross_validation(df,n_folds):
    shuffled_df = df.sample(random_state = 42, frac =1)
    shuffled_df['Fold']= None
    shuffled_df.reset_index(inplace = True)
    shuffled_df.drop(columns = 'index', inplace = True)
    data_size = len(shuffled_df)
    for i in range(1,n_folds):
        shuffled_df.loc[int((i-1)/n_folds*data_size):int(i/n_folds * data_size),'Fold'] = i
    shuffled_df.loc[int((n_folds-1)/n_folds*data_size):,'Fold']= n_folds
    return shuffled_df

In [10]:
data = cross_validation(rating_df, 5)
data.reset_index(inplace = True)

Implementing the model, we create the model as two matrixes of features for the users and movies. Sizes of the matrixes are determined by the unique number of user_id (accordingly movie_id for the item matrix) and the number of features that is chosen when creating the model. 

As the train set may include gaps in the numbering of either user_id or movie_id we use dictionaries to translate the movie_id (or user_id) to matrix_id

For the learning part, we use the algorithm described on the slides from the lecture, we calculate the prediction as a scalar multiplication of the according vectors in matrixes of users and items. Then we calculate the error and update the vectors by learning_rate*(error*user_matrix_vector- lamb*self.item_matrix_vector). As we can see there are two parameters learning rate and lambda that we can tune.

We initialize the matrixes with random numbers using np.random.rand function

The output is put into [1:5] interval by simply setting every output < 1 into 1 and every output > 5 as 5, other outputs are not changed.

In the testing function, when the test example uses user_id that did not appear in the train set (in case of a movie_id not appearing in the train set proceed similarly) we take the sum of the vectors for the corresponding movie_id and scale the sums of features of all movie_ids into a [1:5] interval, and read the value of the movie_id that is asked in the testing example (after the scaling). 

The assumption is that the bigger the sum of features the bigger the chance is for the movie to get a high score. This assumption is not 100% true, as for example having feature < 0 with a corresponding <0 feature in the user_id vector creates a positive result.

In [12]:
class MatrixFactorization:
    def __init__(self,x, num_features):
        #initilaze two matrixes that then multiply by each other to give a matrix of ratings
        
        user_size = np.unique(x['user_id']).shape[0]
        item_size = np.unique(x['Movie_Id']).shape[0]

        values_user = np.unique(x['user_id'])
        self.dict_user = {values_user[i] : i for i in range(len(values_user))}

        values_item = np.unique(x['Movie_Id'])
        self.dict_item = {values_item[i] : i for i in range(len(values_item))}
        
        self.user_matrix = np.random.rand(user_size,num_features)
        self.item_matrix = np.random.rand(item_size,num_features)
        
    def fit(self,x, learning_rate = 0.005, lamb = 0.05, n_iter = 10):
        for it in range(n_iter):
            tmp = 0
            for i in range(len(x)):
                user = x.loc[i]['user_id']
                item = x.loc[i]['Movie_Id']

                user_idx = self.dict_user[user]
                item_idx = self.dict_item[item]
                
                #calculate the error
                error = x.loc[i]['Ratings'] - min(max(np.matmul(self.user_matrix[user_idx],self.item_matrix[item_idx]),1),5)
                # update values
                self.user_matrix[user_idx] = self.user_matrix[user_idx] + learning_rate*(error*self.item_matrix[item_idx] - lamb*self.user_matrix[user_idx])
                self.item_matrix[item_idx] = self.item_matrix[item_idx] + learning_rate*(error*self.user_matrix[user_idx] - lamb*self.item_matrix[item_idx])

                tmp += 1
                if tmp%50000 ==0:
                    print(f'currently done: {tmp/len(x)} % of the iteration {it}')

        
        print('current iteration ended: '+str(it))
    def test(self,x):
        predictions = []
        for i in range(len(x)):
            
            user = x.loc[i]['user_id']
            item = x.loc[i]['Movie_Id']

            try:
                user_idx = self.dict_user[user]
                item_idx = self.dict_item[item]
                pred = min(max(np.matmul(self.user_matrix[user_idx],self.item_matrix[item_idx]),1),5)
                predictions.append(pred)
                
            except: #If there is no user
                try:
                    item_idx = self.dict_item[item]
                    sum_item = np.sum(self.item_matrix[item_idx])
                    sums = np.sum(self.item_matrix, axis = 1)
                    pred =  (sum_item- np.min(sums)) / (np.max(sums) - np.min(sums)) * (4) + 1
                    predictions.append(pred)
                
                except: # If there is no movie
                    user_idx = self.dict_user[user]
                    sum_user= np.sum(self.user_matrix[item_idx])
                    sums = np.sum(self.user_matrix, axis = 1)
                    pred =  (sum_user - np.min(sums)) / (np.max(sums) - np.min(sums)) * (4) + 1
                    predictions.append(pred)
                #calculate the error
            
        labels = np.array(x['Ratings'])
        predictions = np.array(predictions)
        rmse =  mean_squared_error(labels,predictions, squared = False)
        mse = mean_absolute_error(labels,predictions)

        return rmse, mse

Here are some of the results we managed to get using this method with different parameters. The best results we got from the parameters: 

n_features = 10
n_iter = 15
learning_rate = 0.01
lambda = 0.03 

comment - We have noticed an error in the implementation of the MatrixFactorization that the number of unique movie_id was taken from the dataset itself, so the matrix had all of them implemented (at random of course). The learning on these sets underneath is on the wrongly implemented constructor. We believe that this error would not change the results by a lot. Unfortunately, as the function fit time takes hours, especially with that number of iterations, and the error was found on the day of the deadline we couldn't refit the functions. We provide the old implementation under the training.

In [17]:
rmse_list = []
mae_list = []
for i in range(1,6):
    train = data.loc[data['Fold'] != i ]
    # print(train)
    train = train.reset_index()
    valid = data.loc[data['Fold'] == i ] 
    valid = valid.reset_index()
    mt = MatrixFactorization(train,50)
    # print(train)
    print('Fitting the fold: ' + str(i))
    mt.fit(train, n_iter = 10)
    rmse, mae = mt.test(valid)
    print(f"Fold {i} RMSE: {rmse} \n MSE: {mae}")
    rmse_list.append(rmse)
    mae_list.append(mae)
print(f'Mean results:\nRMSE: {np.mean(np.array(rmse_list))} \nMAE: {np.mean(np.array(mae_list))}')

Fitting the fold: 1
currently done: 0.062486877755671306 % of the iteration 0
currently done: 0.12497375551134261 % of the iteration 0
currently done: 0.18746063326701393 % of the iteration 0
currently done: 0.24994751102268523 % of the iteration 0
currently done: 0.31243438877835655 % of the iteration 0
currently done: 0.37492126653402785 % of the iteration 0
currently done: 0.43740814428969915 % of the iteration 0
currently done: 0.49989502204537045 % of the iteration 0
currently done: 0.5623818998010418 % of the iteration 0
currently done: 0.6248687775567131 % of the iteration 0
currently done: 0.6873556553123844 % of the iteration 0
currently done: 0.7498425330680557 % of the iteration 0
currently done: 0.812329410823727 % of the iteration 0
currently done: 0.8748162885793983 % of the iteration 0
currently done: 0.9373031663350696 % of the iteration 0
currently done: 0.9997900440907409 % of the iteration 0
currently done: 0.062486877755671306 % of the iteration 1
currently done: 0.

In [18]:
np.save('user_matrix_2.npy',mt.user_matrix)
np.save('item_matrix_2.npy',mt.item_matrix)
# mt_2.dict_user
list_user = list(mt.dict_user.items())
np.save('dict_user_2.npy',np.array(list_user))
# mt_2.dict_item
key_list_item = list(mt.dict_item.keys())
item_list_item = list(mt.dict_item.items())
np.save('dict_items_2.npy',np.array(item_list_item))
# np.save(np.array(key))

In [12]:
rmse_list = []
mae_list = []
for i in range(1,6):
    train = data.loc[data['Fold'] != i ]
    train = train.reset_index()
    valid = data.loc[data['Fold'] == i ] 
    valid = valid.reset_index()
    mt_2 = MatrixFactorization(train,20)
    print('Fitting the fold: ' + str(i))
    mt_2.fit(train,learning_rate = 0.01, lamb = 0.03, n_iter = 15)
    rmse, mae = mt_2.test(valid)
    print(f"Fold {i} RMSE: {rmse} \n MSE: {mae}")
    rmse_list.append(rmse)
    mae_list.append(mae)
print(f'Mean results:\nRMSE: {np.mean(np.array(rmse_list))} \nMAE: {np.mean(np.array(mae_list))}')

Fitting the fold: 1
currently done: 0.062486877755671306 % of the iteration 0
currently done: 0.12497375551134261 % of the iteration 0
currently done: 0.18746063326701393 % of the iteration 0
currently done: 0.24994751102268523 % of the iteration 0
currently done: 0.31243438877835655 % of the iteration 0
currently done: 0.37492126653402785 % of the iteration 0
currently done: 0.43740814428969915 % of the iteration 0
currently done: 0.49989502204537045 % of the iteration 0
currently done: 0.5623818998010418 % of the iteration 0
currently done: 0.6248687775567131 % of the iteration 0
currently done: 0.6873556553123844 % of the iteration 0
currently done: 0.7498425330680557 % of the iteration 0
currently done: 0.812329410823727 % of the iteration 0
currently done: 0.8748162885793983 % of the iteration 0
currently done: 0.9373031663350696 % of the iteration 0
currently done: 0.9997900440907409 % of the iteration 0
currently done: 0.062486877755671306 % of the iteration 1
currently done: 0.

In [13]:
c_rmse = 10000
user_params = None
item_params = None
rmse_list = []
mae_list = []
for i in range(1,6):
    train = data.loc[data['Fold'] != i ]
    # print(train)
    train = train.reset_index()
    valid = data.loc[data['Fold'] == i ] 
    valid = valid.reset_index()
    mt_3 = MatrixFactorization(train,20)
    # print(train)
    print('Fitting the fold: ' + str(i))
    mt_3.fit(train,learning_rate = 0.002, lamb = 0.04, n_iter = 10)
    rmse, mae = mt_3.test(valid)
    print(f"Fold {i} RMSE: {rmse} \n MSE: {mae}")
    rmse_list.append(rmse)
    mae_list.append(mae)
print(f'Mean results:\nRMSE: {np.mean(np.array(rmse_list))} \nMAE: {np.mean(np.array(mae_list))}')
    # if rmse < c_rmse:
    #     user_params = mt.user_matrix
    #     item_params = mt.item_matrix

Fitting the fold: 1
currently done: 0.062486877755671306 % of the iteration 0
currently done: 0.12497375551134261 % of the iteration 0
currently done: 0.18746063326701393 % of the iteration 0
currently done: 0.24994751102268523 % of the iteration 0
currently done: 0.31243438877835655 % of the iteration 0
currently done: 0.37492126653402785 % of the iteration 0
currently done: 0.43740814428969915 % of the iteration 0
currently done: 0.49989502204537045 % of the iteration 0
currently done: 0.5623818998010418 % of the iteration 0
currently done: 0.6248687775567131 % of the iteration 0
currently done: 0.6873556553123844 % of the iteration 0
currently done: 0.7498425330680557 % of the iteration 0
currently done: 0.812329410823727 % of the iteration 0
currently done: 0.8748162885793983 % of the iteration 0
currently done: 0.9373031663350696 % of the iteration 0
currently done: 0.9997900440907409 % of the iteration 0
currently done: 0.062486877755671306 % of the iteration 1
currently done: 0.

In [13]:
np.save('user_matrix.npy',mt_2.user_matrix)
np.save('item_matrix.npy',mt_2.item_matrix)

In [24]:
mt_2.dict_user
list_user = list(mt_2.dict_user.items())
np.save('dict_user.npy',np.array(list_user))

In [22]:
mt_2.dict_item
key_list_item = list(mt_2.dict_item.keys())
item_list_item = list(mt_2.dict_item.items())
np.save('dict_items.npy',np.array(item_list_item))
# np.save(np.array(key))

Here is the old version of the MatrixFactorization, the only change is in the item_size implementation

In [None]:
class MatrixFactorization:
    def __init__(self,x, num_features):
        #initilaze two matrixes that then multiply by each other to give a matrix of ratings
        
        user_size = np.unique(x['user_id']).shape[0]

        #Here is the error
        item_size = np.unique(data['Movie_Id']).shape[0]

        values_user = np.unique(x['user_id'])
        self.dict_user = {values_user[i] : i for i in range(len(values_user))}

        # key_list_user = list(dict_user.keys())

        values_item = np.unique(x['Movie_Id'])
        self.dict_item = {values_item[i] : i for i in range(len(values_item))}
        # key_list_item = list(dict_item.keys())

        
        self.user_matrix = np.random.rand(user_size,num_features)
        self.item_matrix = np.random.rand(item_size,num_features)
        
    def fit(self,x, learning_rate = 0.005, lamb = 0.05, n_iter = 10):
        for it in range(n_iter):
            tmp = 0
            for i in range(len(x)):
                user = x.loc[i]['user_id']
                item = x.loc[i]['Movie_Id']

                user_idx = self.dict_user[user]
                item_idx = self.dict_item[item]
                
                #calculate the error
                error = x.loc[i]['Ratings'] - min(max(np.matmul(self.user_matrix[user_idx],self.item_matrix[item_idx]),1),5)
                # update values
                self.user_matrix[user_idx] = self.user_matrix[user_idx] + learning_rate*(error*self.item_matrix[item_idx] - lamb*self.user_matrix[user_idx])
                self.item_matrix[item_idx] = self.item_matrix[item_idx] + learning_rate*(error*self.user_matrix[user_idx] - lamb*self.item_matrix[item_idx])

                tmp += 1
                if tmp%50000 ==0:
                    print(f'currently done: {tmp/len(x)} % of the iteration {it}')

        
        print('current iteration ended: '+str(it))
    def test(self,x):
        predictions = []
        for i in range(len(x)):
            
            user = x.loc[i]['user_id']
            item = x.loc[i]['Movie_Id']

            try:
                user_idx = self.dict_user[user]
                item_idx = self.dict_item[item]
                pred = min(max(np.matmul(self.user_matrix[user_idx],self.item_matrix[item_idx]),1),5)
                predictions.append(pred)
                
            except: #If there is no user
                try:
                    item_idx = self.dict_item[item]
                    sum_item = np.sum(self.item_matrix[item_idx])
                    sums = np.sum(self.item_matrix, axis = 1)
                    pred =  (sum_item- np.min(sums)) / (np.max(sums) - np.min(sums)) * (4) + 1
                    predictions.append(pred)
                
                except: # If there is no movie
                    user_idx = self.dict_user[user]
                    sum_user= np.sum(self.user_matrix[item_idx])
                    sums = np.sum(self.user_matrix, axis = 1)
                    pred =  (sum_user - np.min(sums)) / (np.max(sums) - np.min(sums)) * (4) + 1
                    predictions.append(pred)
                #calculate the error
            
        labels = np.array(x['Ratings'])
        predictions = np.array(predictions)
        rmse =  mean_squared_error(labels,predictions, squared = False)
        mse = mean_absolute_error(labels,predictions)

        return rmse, mse