# The Netflix Prize dataset

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-preprocessing" data-toc-modified-id="Data-preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data preprocessing</a></span><ul class="toc-item"><li><span><a href="#Importing-All-the-Data" data-toc-modified-id="Importing-All-the-Data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Importing All the Data</a></span></li><li><span><a href="#Separating-Training-Set-and-Test-Set" data-toc-modified-id="Separating-Training-Set-and-Test-Set-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Separating Training Set and Test Set</a></span></li></ul></li><li><span><a href="#Explanatory-data-analysis" data-toc-modified-id="Explanatory-data-analysis-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Explanatory data analysis</a></span><ul class="toc-item"><li><span><a href="#Movie-release-date" data-toc-modified-id="Movie-release-date-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Movie release date</a></span></li><li><span><a href="#Rating-distribution" data-toc-modified-id="Rating-distribution-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Rating distribution</a></span></li><li><span><a href="#Movie-Rated-Date" data-toc-modified-id="Movie-Rated-Date-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Movie Rated Date</a></span></li></ul></li><li><span><a href="#Model-One---Biased-Matrix-Factorization" data-toc-modified-id="Model-One---Biased-Matrix-Factorization-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model One - Biased Matrix Factorization</a></span><ul class="toc-item"><li><span><a href="#Implementation-of-the-model" data-toc-modified-id="Implementation-of-the-model-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Implementation of the model</a></span></li><li><span><a href="#Fitting-the-model" data-toc-modified-id="Fitting-the-model-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Fitting the model</a></span></li><li><span><a href="#Evaluating-the-model" 
data-toc-modified-id="Evaluating-the-model-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Evaluating the model</a></span></li></ul></li><li><span><a href="#Model-Two---AutoRec" data-toc-modified-id="Model-Two---AutoRec-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model Two - AutoRec</a></span><ul class="toc-item"><li><span><a href="#Implementation-of-Model" data-toc-modified-id="Implementation-of-Model-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Implementation of Model</a></span></li><li><span><a href="#Training-and-Evaluation" data-toc-modified-id="Training-and-Evaluation-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Training and Evaluation</a></span></li></ul></li><li><span><a href="#Model-Three---Naive-Bayes" data-toc-modified-id="Model-Three---Naive-Bayes-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Model Three - Naive Bayes</a></span><ul class="toc-item"><li><span><a href="#Implementation-of-the-model" data-toc-modified-id="Implementation-of-the-model-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Implementation of the model</a></span></li><li><span><a href="#Evaluating-the-model" data-toc-modified-id="Evaluating-the-model-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Evaluating the model</a></span></li></ul></li><li><span><a href="#Model-Four---Neural-Collaborative-Filtering" data-toc-modified-id="Model-Four---Neural-Collaborative-Filtering-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Model Four - Neural Collaborative Filtering</a></span><ul class="toc-item"><li><span><a href="#Implementation-of-the-model" data-toc-modified-id="Implementation-of-the-model-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Implementation of the model</a></span></li><li><span><a href="#Training-the-model-based-on-the-data-split" data-toc-modified-id="Training-the-model-based-on-the-data-split-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Training the model based on the data split</a></span></li><li><span><a 
href="#Plotting-training-and-validation-loss" data-toc-modified-id="Plotting-training-and-validation-loss-6.3"><span class="toc-item-num">6.3&nbsp;&nbsp;</span>Plotting training and validation loss</a></span></li><li><span><a href="#Showing-top-10-movie-recommendations-to-a-user" data-toc-modified-id="Showing-top-10-movie-recommendations-to-a-user-6.4"><span class="toc-item-num">6.4&nbsp;&nbsp;</span>Showing top 10 movie recommendations to a user</a></span></li></ul></li></ul></div>

## Data preprocessing

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import random
from sklearn.model_selection import train_test_split
import seaborn as sns

import matplotlib.pyplot as plt
# show plots automatically
%matplotlib inline
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [None]:
def enable_plotly_in_cell():
    """Prepare the current output cell so Plotly plots display in Google Colab.

    Emits a <script> tag that loads require.js into the cell output and then
    initialises Plotly's offline notebook mode with connected=False.
    """
    import IPython
    from plotly.offline import init_notebook_mode

    requirejs_tag = '''<script src="/static/components/requirejs/require.js"></script>'''
    display(IPython.core.display.HTML(requirejs_tag))
    init_notebook_mode(connected=False)

### Importing All the Data

In [None]:
# Merge the raw rating files into a single CSV file ('data.csv') in which
# every row carries the id of the movie it belongs to.
files = ['combined_data_1.txt',
         'combined_data_2.txt',
         'combined_data_3.txt',
         'combined_data_4.txt'
        ]

# A line ending with ':' announces the movie that the following lines refer
# to; it is not copied. Every other line is written out prefixed with that
# movie id. The `with` block guarantees that data.csv is flushed and closed
# even if one of the input files is missing or unreadable.
with open('data.csv', mode='w') as data:
    for file in files:
        print("Opening file: {}".format(file))
        with open(file) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # a blank line would otherwise become a row with empty fields
                    continue
                if line.endswith(':'):
                    movie_id = line.replace(':', '')
                else:
                    data.write(movie_id + ',' + line + '\n')

In [None]:
# Load the merged CSV (it has no header row, so column names are given here)
# and print the number of distinct values found in each column
dataset_columns = ['movieid', 'userid', 'rating', 'date']
dataset = pd.read_csv('data.csv', names=dataset_columns)
print(dataset.nunique())

In [None]:
# Check NaN values: count every missing cell over all columns.
# (`dataset.isnull().any()` is one boolean per column, so summing it would
# only give the number of columns that contain a NaN.)
print('number of NaN values in the dataset:',
      int(dataset.isnull().sum().sum()))

There are 480189 users and 17770 distinct movies in the Netflix Prize dataset. 

user_id ranges from 1 to 2649429 with gaps. 

movie_id ranges from 1 to 17770 sequentially. 

rating can be any integer from 1 to 5. date shows the date when the rating was made and it has the format of YYYY-MM-DD. As a user may rate multiple movies, the dataset has 111631710 rows in total.

### Separating Training Set and Test Set

For my model, I use the probe set as the test set of the model. However, the probe data is mixed with the training dataset and is contained in the above dataset as well. 

Therefore, I first separate the training set and test set. 

In [None]:
# Convert probe.txt into a CSV file ('probe.csv') with one
# "movie_id,user_id" row per probe entry.
files = ["probe.txt"]

# A line ending with ':' announces the movie that the following lines refer
# to; it is not copied. Every other line is written out prefixed with that
# movie id. The `with` block guarantees that probe.csv is flushed and closed
# even if reading the input fails.
with open('probe.csv', mode='w') as subdata:
    for file in files:
        print("Opening file: {}".format(file))
        with open(file) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # a blank line would otherwise become a row without a user id
                    continue
                if line.endswith(':'):
                    movie_id = line.replace(':', '')
                else:
                    subdata.write(movie_id + ',' + line + '\n')

In [None]:
# Load the (movie_id, user_id) pairs that were written to probe.csv
probe_columns = ["movie_id", "user_id"]
probeset = pd.read_csv("probe.csv", names=probe_columns)

In [None]:
# `dataset` (loaded from data.csv above) holds every rating. Its id columns
# are renamed to match `probeset` so that both frames can be joined on them.
# (The previous version referred to `df`, which is only created further down
# in the notebook and is therefore undefined on a fresh kernel.)
all_ratings = dataset.rename(columns={'movieid': 'movie_id', 'userid': 'user_id'})

# Test set: the probe pairs, enriched with their rating and date
test = probeset.merge(all_ratings, on=['user_id', 'movie_id'], how='left')

# Training set: every rating that is not a test row. `test` is concatenated
# twice so that each test row occurs more than once and is removed entirely
# by drop_duplicates(keep=False).
train = pd.concat([all_ratings, test, test]).drop_duplicates(keep=False)

In [None]:
# Write the training split and the test split to their own CSV files
for split_frame, csv_name in ((train, 'train.csv'), (test, 'test.csv')):
    split_frame.to_csv(csv_name)

## Explanatory data analysis

### Movie release date

In [None]:
# Movie metadata: the file has no header row, so the columns are named here
# and the movie id becomes the index
title_columns = ['movieid', 'Year', 'Name']
df_title = pd.read_csv('movie_titles.csv',
                       encoding="ISO-8859-1",
                       header=None,
                       names=title_columns).set_index('movieid')
df_title.head(10)

In [None]:
# Number of movies per release year, ordered by year
data = df_title['Year'].value_counts().sort_index()
movie_total = df_title.shape[0]

# Line trace of the yearly counts
trace = go.Scatter(x=data.index, y=data.values, marker={'color': '#db0000'})

# Titles for the figure and both axes
layout = {
    'title': '{} Movies Grouped By Year Of Release'.format(movie_total),
    'xaxis': {'title': 'Release Year'},
    'yaxis': {'title': 'Movies'},
}

# Render the figure
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

The number of movies released is stable around 0 during the time period from 1900 to around 1930. Then the number slowly starts to go up after 1930. After 1980, the number of movie released each year starts to rocket. Most movies are released around the time of 2000 in this dataset. This trend is in line with the development of movie industry as well.

In [None]:
# Drop the reference to the titles table so its memory can be reclaimed
del df_title

### Rating distribution

In [None]:
# Load every rating from the merged CSV; it has no header row, so the
# column names are supplied here
rating_columns = ['movie_id', 'user_id', 'rating', 'date']
df = pd.read_csv('data.csv', names=rating_columns)

In [None]:
# How often each rating value occurs, listed from the highest value down
data = df['rating'].value_counts().sort_index(ascending=False)
rating_total = df.shape[0]

# Share of each rating value, shown as a label on its bar
percent_labels = ['{:.1f} %'.format(share) for share in (data.values / rating_total * 100)]

# Bar trace
trace = go.Bar(x=data.index,
               y=data.values,
               text=percent_labels,
               textposition='auto',
               textfont={'color': '#000000'},
               marker={'color': '#db0000'})

# Titles for the figure and both axes
layout = {
    'title': 'Distribution Of {} Netflix-Ratings'.format(rating_total),
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Count'},
}

# Render the figure
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

Users rarely rate a movie to be 1 or 2. Most ratings are above 3 and 4 is the rating that is most frequently given. The distribution is probably biased, since only people liking the movies proceed to be customers and others presumably will leave the platform.

### Movie Rated Date

In [None]:
# Number of ratings submitted on each date, ordered by date
data = df['date'].value_counts().sort_index()
rating_total = df.shape[0]

# Line trace of the daily counts
trace = go.Scatter(x=data.index, y=data.values, marker={'color': '#db0000'})

# Titles for the figure and both axes
layout = {
    'title': '{} Movie-Ratings Grouped By Day'.format(rating_total),
    'xaxis': {'title': 'Date'},
    'yaxis': {'title': 'Ratings'},
}

# Render the figure
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

From 2000 to 2003, Netflix witnessed few to zero daily ratings. After 2003, the number of daily ratings has its first small peak. After that, it slowly starts to show a rising trend despite fluctuations. The number of ratings increases with time, and most ratings were submitted in 2005. It is worth noting that there are two abnormal peaks in 2005.

In [None]:
def plot_count_histogram(counts, bin_end, bin_size, title, x_title):
    """Show a histogram of rating counts.

    counts   -- pandas Series holding one (already clipped) count per group
    bin_end  -- upper end of the histogram bins (the bins start at 0)
    bin_size -- width of one bin
    title    -- figure title
    x_title  -- x-axis title
    """
    trace = go.Histogram(x = counts.values,
                         name = 'Ratings',
                         xbins = dict(start = 0,
                                      end = bin_end,
                                      size = bin_size),
                         marker = dict(color = '#db0000'))
    layout = go.Layout(title = title,
                       xaxis = dict(title = x_title),
                       yaxis = dict(title = 'Count'),
                       bargap = 0.2)
    iplot(go.Figure(data=[trace], layout=layout))


##### Ratings Per Movie #####
# Counts above 9999 are clipped so that they all fall into the last bin
data = df.groupby('movie_id')['rating'].count().clip(upper=9999)
plot_count_histogram(data, 10000, 100,
                     'Distribution Of Ratings Per Movie (Clipped at 9999)',
                     'Ratings Per Movie')

##### Ratings Per User #####
# Counts above 199 are clipped so that they all fall into the last bin
data = df.groupby('user_id')['rating'].count().clip(upper=199)
plot_count_histogram(data, 200, 2,
                     'Distribution Of Ratings Per User (Clipped at 199)',
                     'Ratings Per User')

# Drop the reference to the full ratings frame so its memory can be reclaimed
del df

Both the ratings per movie and the ratings per user have a near-perfect exponential decay. Only a few movies/users have a large number of ratings.

## Model One - Biased Matrix Factorization

Matrix Factorization aims to factorize the rating matrix R into a matrix U which is m × k and a matrix V which is n × k. The rows of  U and V are called latent factors, thus the name latent factor models. To explain in more detail, the ith row of U is called the User Factor while the ith row of V is called the item Factor. The ratings are approximated by multiplying matrices U and V.

&nbsp;
As for Biased Matrix Factorization, I take each user’s and each item’s specific characteristics into consideration, which is the user bias and item bias, denoted in β and γ.

The two models are abbreviated as MFALS(Matrix Factorization with Alternating Least Squares) and BMFALS(Biased Matrix Factorization with Alternating Least Squares) in the following scripts. 

In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import time 
from sklearn.preprocessing import LabelEncoder

### Implementation of the model

In [None]:
class BiasedMatrixFactor():
    """Factorise a partially observed matrix A (m x n) with alternating least squares.

    A is approximated by U @ VT, where U is m x k and VT is k x n. With
    ``bias=True`` a per-row bias ``beta`` (length m) and a per-column bias
    ``gamma`` (length n) are added to the product. In this notebook the rows
    of A are users and the columns are movies, so beta holds the user biases
    and gamma the movie biases.

    Missing entries are given as NaN. They are filled with row means before
    the first iteration and with the current estimate after every iteration.
    """

    def __init__(self, k, lamda=0.1, num_iter=1000, bias=False, print_enabled=True):
        """
        k             -- number of latent factors
        lamda         -- weight of the ridge term added in every least-squares solve
        num_iter      -- number of ALS iterations
        bias          -- whether row and column biases are fitted as well
        print_enabled -- whether the RMSE is printed after every iteration
        """
        self.lamda = lamda
        self.num_iter = num_iter
        self.k = k
        self.bias = bias
        self.print_enabled = print_enabled

    def _estimate(self):
        """Return the current dense m x n reconstruction of the rating matrix."""
        A_est = np.matmul(self.U, self.VT)
        if self.bias:
            A_est = A_est + np.reshape(self.beta, (self.m_, 1)) + np.reshape(self.gamma, (1, self.n_))
        return A_est

    def _Root_Mean_Square_Error(self):
        """
        return
        1. m*n np.ndarray: A minus its estimate at every entry, with the
           originally missing entries set to 0;
        2. the root mean square error over the originally observed entries.
        """
        diff_error = np.subtract(self.A, self._estimate())
        diff_error[self.nan_cell_mask] = 0       # ignore missing data
        sq_error = np.square(diff_error).sum()
        num_data = np.size(self.A) - len(self.nan_cell_mask[0])
        rmse = np.sqrt(sq_error / num_data)

        return diff_error, rmse

    def _ALS(self):
        """Run ``num_iter`` ALS iterations.

        1. updates self.U and self.VT (and self.beta / self.gamma when bias is on);
        2. returns a 1D np.ndarray holding the RMSE measured after each
           iteration, so its length equals the number of iterations.
        """
        losses = []
        temp_A = self.A.copy()
        self.m_ = np.size(self.A, 0)
        self.n_ = np.size(self.A, 1)

        if self.bias:
            self.beta = np.random.rand(self.m_, 1)            # beta: one bias per row of A
            self.gamma = np.random.rand(self.n_, 1)           # gamma: one bias per column of A

        for n in range(self.num_iter):
            if self.bias:
                # update V^T; the column of ones prepended to U makes the
                # first row of the solution the column bias gamma
                A_beta = temp_A - np.reshape(self.beta, (self.m_, 1))
                U_extend = np.hstack((np.ones(shape=(self.m_, 1)), self.U))
                self.VT = np.dot(np.dot(np.linalg.pinv(np.dot(U_extend.T, U_extend)+self.lamda*np.identity(self.k+1)), U_extend.T), A_beta)
                self.gamma = self.VT[0, :]
                self.VT = self.VT[1:, :]

                # update U; the row of ones prepended to V^T makes the first
                # column of the solution the row bias beta
                A_gamma = temp_A - np.reshape(self.gamma, (1, self.n_))
                VT_extend = np.vstack((np.ones(shape=(1, self.n_)), self.VT))
                self.U = np.dot(np.dot(np.linalg.pinv(np.dot(VT_extend, VT_extend.T)+self.lamda*np.identity(self.k+1)), VT_extend), A_gamma.T).T
                self.beta = self.U[:, 0]
                self.U = self.U[:, 1:]

            else:
                # update V^T
                self.VT = np.dot(np.dot(np.linalg.pinv(np.dot(self.U.T, self.U)+self.lamda*np.identity(self.k)), self.U.T), temp_A)

                # update U
                self.U = np.dot(np.dot(np.linalg.pinv(np.dot(self.VT, self.VT.T)+self.lamda*np.identity(self.k)), self.VT), temp_A.T).T

            # replace the missing values in the working matrix by the current estimate
            A_est = self._estimate()
            temp_A[self.nan_cell_mask] = A_est[self.nan_cell_mask]
            _, rmse = self._Root_Mean_Square_Error()

            # print the RMSE if needed
            if self.print_enabled:
                print("iteration " + str(n) + ": " + str(rmse))

            losses.append(rmse)

        return np.array(losses)

    def impute(self):
        """Remember where self.A is NaN and fill those cells with row means."""
        self.nan_cell_mask = np.where(np.isnan(self.A))
        # rows without any observed entry are filled with the mean of all observed entries
        self.A[np.all(np.isnan(self.A), axis=1)] = np.nanmean(self.A)
        self.rowmean = np.nanmean(self.A, axis=1)
        self.A[self.nan_cell_mask] = np.take(self.rowmean,  self.nan_cell_mask[0])

    def fit(self, A, init_U = None, init_VT = None):
        """Fit the factorisation to A.

        A       -- 2D float array with NaN marking missing entries (it is copied)
        init_U  -- optional starting value for U; random if None
        init_VT -- optional starting value for V^T; random if None

        Returns (U, VT, loss_list), where loss_list holds the RMSE after each
        iteration. NumPy's global random generator is seeded with 0.
        """
        self.A = A.copy()
        np.random.seed(0)
        self.impute()
        if init_U is None:
            self.U = np.random.rand(np.size(self.A, 0), self.k)
        else:
            self.U = init_U

        if init_VT is None:
            self.VT = np.random.rand(self.k, np.size(self.A, 1))
        else:
            self.VT = init_VT

        loss_list = self._ALS()

        return self.U, self.VT, loss_list

    def predict(self, user_idxes, movie_idxes):
        """Return the estimated entries at the given (row, column) positions.

        user_idxes  -- sequence of row positions in the fitted matrix
        movie_idxes -- sequence of column positions, same length as user_idxes

        Returns a list with one estimate per pair.
        Raises ValueError when the two sequences differ in length.
        """
        if len(user_idxes) != len(movie_idxes):
            # ValueError replaces the undefined name `InputError`
            raise ValueError("Inputs must have the same length!!!")

        rows = np.asarray(user_idxes, dtype=np.intp)
        cols = np.asarray(movie_idxes, dtype=np.intp)
        return list(self._estimate()[rows, cols])

### Fitting the model

In [None]:
# Only the CSV columns at positions 1-3 are loaded; position 0 is skipped
train_columns = [1, 2, 3]
train = pd.read_csv("train.csv", usecols=train_columns)
train.head()

In [None]:
# Only the CSV columns at positions 1-3 are loaded; position 0 is skipped
test_columns = [1, 2, 3]
test = pd.read_csv('test.csv', usecols=test_columns)
test.head()

In [None]:
# Width of each user_id range that is loaded and factorised on its own
interval = 100000

# Collected predictions of the plain and of the biased factorisation
pred_list, biased_pred_list = [], []

In [None]:
# Predictions are written into arrays that are position-aligned with `test`,
# so that the evaluation cell can compare them element-wise with
# test['rating'] (appending per interval would reorder them).
pred_list = np.full(len(test), np.nan)
biased_pred_list = np.full(len(test), np.nan)

max_user_id = train['user_id'].max()
num_intervals = int(np.floor(max_user_id / interval)) + 1

for i in range(num_intervals):
    # Half-open range [lower_limit, upper_limit); the last range contains max_user_id
    lower_limit = i * interval
    upper_limit = lower_limit + interval

    train_part = train[(train['user_id'] >= lower_limit) & (train['user_id'] < upper_limit)]
    in_interval = ((test['user_id'] >= lower_limit) & (test['user_id'] < upper_limit)).to_numpy()
    test_positions = np.flatnonzero(in_interval)
    test_part = test.iloc[test_positions]

    if len(test_positions) == 0:
        # nothing to predict for this range, so fitting would be wasted work
        print("---------The ", i + 1, " interval is processed----------")
        continue

    if train_part.empty:
        # no training data for these users: fall back to the overall mean rating
        pred_list[test_positions] = train['rating'].mean()
        biased_pred_list[test_positions] = train['rating'].mean()
        print("---------The ", i + 1, " interval is processed----------")
        continue

    A_part = train_part.pivot(index="user_id", columns='movie_id', values='rating')

    # Train the MFALS model
    MF = BiasedMatrixFactor(k=100, lamda=0.02, num_iter=100, print_enabled=True, bias=False)
    U, VT, loss_hist = MF.fit(np.array(A_part))

    # Train the BMFALS model
    biased_MF = BiasedMatrixFactor(k=100, lamda=0.02, num_iter=100, print_enabled=True, bias=True)
    biased_U, biased_VT, biased_loss_hist = biased_MF.fit(np.array(A_part))

    # Translate the test ids into row/column positions of A_part. The lookup
    # must use the pivot's own index and columns; get_indexer returns -1 for
    # ids that do not occur in this training part.
    row_idx = A_part.index.get_indexer(test_part['user_id'])
    col_idx = A_part.columns.get_indexer(test_part['movie_id'])
    known = (row_idx >= 0) & (col_idx >= 0)

    # Pairs whose user or movie is unknown here get the mean rating of this part
    part_mean = train_part['rating'].mean()
    pred_ratings = np.full(len(test_part), part_mean, dtype=float)
    biased_pred_ratings = np.full(len(test_part), part_mean, dtype=float)

    if known.any():
        ## MFALS prediction
        pred_ratings[known] = MF.predict(row_idx[known], col_idx[known])
        ## BMFALS prediction
        biased_pred_ratings[known] = biased_MF.predict(row_idx[known], col_idx[known])

    pred_list[test_positions] = pred_ratings
    biased_pred_list[test_positions] = biased_pred_ratings

    print("---------The ", i + 1, " interval is processed----------")

# Test rows whose user_id lies beyond the last training range were never
# visited above; give them the overall mean rating so no NaN reaches the evaluation
unvisited = np.isnan(pred_list)
if unvisited.any():
    pred_list[unvisited] = train['rating'].mean()
    biased_pred_list[unvisited] = train['rating'].mean()

### Evaluating the model 

In [None]:
# RMSE is computed with NumPy directly, because neither `mean_squared_error`
# nor `math` is imported by the cells above.
actual_ratings = np.asarray(test['rating'], dtype=float)

# ALS without biases
mse = np.mean((actual_ratings - np.asarray(pred_list, dtype=float)) ** 2)
wo_b_rmse = np.sqrt(mse)
print("The RMSE for ALS model without biases is ", wo_b_rmse)

# ALS with biases
b_mse = np.mean((actual_ratings - np.asarray(biased_pred_list, dtype=float)) ** 2)
b_rmse = np.sqrt(b_mse)
print("The RMSE for ALS model with biases is ", b_rmse)

In terms of loss, it is clear that the implementation of Matrix Factorization with bias has a much better performance than the implementation without bias.

In [None]:
# RMSE is computed with NumPy directly, because neither `mean_squared_error`
# nor `math` is imported by the cells above.
actual_ratings = np.asarray(test['rating'], dtype=float)

# ALS without biases
mse = np.mean((actual_ratings - np.asarray(pred_list, dtype=float)) ** 2)
wo_b_rmse = np.sqrt(mse)
print("The RMSE for ALS model without biases is ", wo_b_rmse)

# ALS with biases
b_mse = np.mean((actual_ratings - np.asarray(biased_pred_list, dtype=float)) ** 2)
b_rmse = np.sqrt(b_mse)
print("The RMSE for ALS model with biases is ", b_rmse)

The two models have similar RMSE when it comes to predicting, with the RMSE score of Matrix Factorization without biases being slightly lower. 

## Model Two - AutoRec

AutoRec, introduced in "AutoRec: Autoencoders Meet Collaborative Filtering" (Sedhain et al., 2015), is a neural network-based model for recommender systems. Its main idea is to reduce the dimension with a neural network whose output is a low-dimensional vector with denser information. After the encoded output is obtained, it is fed into the decoder, which returns a predicted user/item vector as the final prediction.

&nbsp;
The model consists of two parts: encoder and decoder. Encoder compresses the sparse and large user/item vector into dense vectors. On the other side, Decoder returns the vector with same dimensions as the input user/item vector. As for their structure, the encoder is built with a Linear layer and a Sigmoid Layer, whereas the decoder only contains a Linear layer.
Apart from being used as a model for recommendation, it could also be viewed as a tool for dimension reduction based on the idea of Autoencoder (D.E. Rumelhart, G.E. Hinton, et.al, 1986). And this could be utilized in further models such as Decision Tree.

In [None]:
import random
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
import numpy as np
import pandas as pd

from sklearn import preprocessing
from scipy import sparse

### Implementation of Model

In [None]:
class Autoencoder(nn.Module):
    """Autoencoder with a Linear+Sigmoid encoder and a Linear decoder.

    With prog_type 'item-based' the input/output width is ``user_num``;
    with 'user-based' it is ``item_num``.
    """

    def __init__(self, user_num, item_num, num_latent_variables, penalty_lambda = 1, prog_type='item-based'):
        """
        user_num             -- input width for the item-based variant
        item_num             -- input width for the user-based variant
        num_latent_variables -- width of the encoded representation
        penalty_lambda       -- factor applied to the weight penalty in `loss`
        prog_type            -- 'item-based' or 'user-based'

        Raises ValueError for any other prog_type (previously no layers were
        created and the failure only surfaced on the first forward pass).
        """
        super(Autoencoder, self).__init__()
        if prog_type == 'item-based':
            input_dim = user_num
        elif prog_type == 'user-based':
            input_dim = item_num
        else:
            raise ValueError("prog_type must be 'item-based' or 'user-based', got {!r}".format(prog_type))

        self.prog_type = prog_type
        self.penalty_lambda = penalty_lambda
        self.loss_list = []        # training RMSE of every epoch
        self.val_loss = []         # validation RMSE of every epoch
        self.best_loss = np.inf    # lowest validation RMSE seen so far

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, num_latent_variables),
            nn.Sigmoid(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(num_latent_variables, input_dim),
        )

    def forward(self, x):
        """Return (encoded, decoded) for the input batch x."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

    def loss(self, loss_fn, optimizer, decoded, target, cell_mask):
        """Compute the loss over the cells selected by cell_mask.

        Returns a tuple of
        1. loss_fn applied to the masked cells,
        2. sqrt of (1.) divided by the number of masked cells,
        3. (2.) plus penalty_lambda * 0.5 * sum of squared entries of all
           2-D parameters held by the optimizer.
        """
        orig_loss = loss_fn(decoded[cell_mask], target[cell_mask])
        # max(..., 1) avoids a ZeroDivisionError for a batch without any masked cell
        num_cells = max(len(cell_mask[0]), 1)
        loss = torch.sqrt(1 / num_cells * orig_loss)
        regularization_loss = 0

        # NOTE(review): `.data` is detached from autograd, so this penalty
        # changes the returned value but contributes no gradient. Confirm
        # whether the penalty is meant to influence training.
        for param in optimizer.param_groups:
            for w_param in param['params']:
                if w_param.data.dim() == 2:
                    regularization_loss += torch.t(w_param.data).pow(2).sum()

        return orig_loss, loss, loss + self.penalty_lambda * regularization_loss * 0.5

    def train_model(self, device, optimizer, train_data, epochs, data_num, test_data, val_num, batch_size_train, prog_type='item-based', log_interval=10, save_model=True):
        """Train for `epochs` epochs and validate after each one.

        device           -- torch device the batches are moved to
        optimizer        -- optimizer holding this model's parameters
        train_data       -- dense 2-D tensor; zeros mark missing cells
        epochs           -- number of passes over train_data
        data_num         -- denominator of the reported training RMSE
        test_data        -- 2-D array with three rows: two index rows and the
                            target values. For 'item-based' the prediction is
                            read at [row 1, row 0], otherwise at [row 0, row 1].
        val_num          -- denominator of the reported validation RMSE
        batch_size_train -- batch size of the training loader
        prog_type        -- 'item-based' or 'user-based' (see test_data)
        log_interval     -- progress is printed every log_interval batches
        save_model       -- save the weights to "Autorec.pt" whenever the
                            validation RMSE improves

        After every epoch the missing cells of train_data are replaced by the
        model's predictions and that filled matrix is used in the next epoch.

        Returns (self.loss_list, self.val_loss, rmse of the last epoch).
        """
        loss_fn = nn.MSELoss(reduction='sum')
        rmse = 0
        val_loss_per_epoch = 0
        cell_mask_train = torch.nonzero(train_data, as_tuple=True)

        # Build the held-out index and target tensors once, with explicit dtypes
        first_idx = torch.as_tensor(test_data[0, :], dtype=torch.long)
        second_idx = torch.as_tensor(test_data[1, :], dtype=torch.long)
        if prog_type == 'item-based':
            test_cells = (second_idx, first_idx)
        else:
            test_cells = (first_idx, second_idx)
        test_target = torch.as_tensor(test_data[2, :], dtype=torch.float32).to(device)

        for epoch in range(epochs):
            # predict() switches to eval mode at the end of every epoch
            self.train()
            # squared error summed over this epoch only (reset every epoch)
            sq_error_sum = 0.0
            train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(train_data, train_data), batch_size=batch_size_train, shuffle=True)
            for batch_idx, (data, target) in enumerate(train_loader):
                cell_mask = torch.nonzero(data, as_tuple=True)
                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()
                _, decoded = self(data)
                orig_loss, loss, loss_pred = self.loss(loss_fn, optimizer, decoded, target, cell_mask)
                loss_pred.backward()
                optimizer.step()

                if batch_idx % log_interval == 0:
                    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch, batch_idx * len(data), len(train_loader.dataset),
                                100. * batch_idx / len(train_loader), loss.item()))

                sq_error_sum += orig_loss.item()

            with torch.no_grad():
                _, pred = self.predict(device, train_data)

                # keep the originally observed cells, fill the rest with predictions
                pred[cell_mask_train] = train_data[cell_mask_train]
                train_data = pred

                pred_test = pred[test_cells]
                val_loss_per_epoch = loss_fn(pred_test, test_target)

                # root of the mean squared error, so the value really is an RMSE
                rmse = float(np.sqrt(sq_error_sum / data_num))

                val_loss_per_epoch = torch.sqrt(1 / val_num * val_loss_per_epoch).item()
                print("--------Epoch {}-------\n RMSE: {:.6f}\t Val_loss: {:.6f}\n".format(epoch, rmse, val_loss_per_epoch))
                if save_model:
                    if val_loss_per_epoch < self.best_loss:
                        self.best_loss = val_loss_per_epoch
                        torch.save(self.state_dict(), "Autorec.pt")
                        print("Best model saved at val loss of {:.6f}.\n".format(val_loss_per_epoch))

            self.loss_list.append(rmse)
            self.val_loss.append(val_loss_per_epoch)

        return self.loss_list, self.val_loss, rmse

    def predict(self, device, input_data):
        """Return (encoded, prediction) for input_data with negative predictions set to 0.

        Switches the module to eval mode. `device` is accepted for interface
        compatibility but not used; input_data must already be on the
        model's device.
        """
        self.eval()
        encoded, pred = self(input_data)

        # negative predictions are replaced by 0
        pred = torch.clamp(pred, min=0)

        return encoded, pred

### Training and Evaluation

In [None]:
# Renumber the rows 0..n-1. drop=True discards the old index instead of
# inserting it as an extra 'index' column, which also makes the cell safe
# to run more than once.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Replace the raw user ids by consecutive integer codes. The encoder is
# fitted on the training users and then applied to the test users.
le = preprocessing.LabelEncoder()
train_user_codes = le.fit_transform(train["user_id"])
train["user_id"] = pd.Series(train_user_codes)
test_user_codes = le.transform(test["user_id"])
test["user_id"] = pd.Series(test_user_codes)

First, I fit the model of user-based AutoRec Recommender System

In [None]:
# Training settings
batch_size_train = 80
batch_size_test = 500
epochs = 40
lr = 1e-3
gamma = 1e-4          # passed to the optimizer as weight_decay
seed = 1
log_interval = 10
interval = 50000      # number of users/movies sampled per training round
save_model = True
prog_type = 'user-based'

# Seed every random generator used during training: torch for the weights and
# batch shuffling, `random` for the sampling done with random.sample, and
# NumPy for completeness. Seeding torch alone leaves the sampling irreproducible.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [None]:
# Distinct ids and their counts, taken from the training ratings
user_index = train['user_id'].unique()
movie_index = train['movie_id'].unique()
user_num = train['user_id'].nunique()
movie_num = train['movie_id'].nunique()
data_num = len(train)

# Training matrix: rows are user ids, columns are movie_id - 1
A = sparse.coo_matrix((train.rating, (train.user_id, train.movie_id-1)))
del train

# NOTE(review): only the item-based variant shifts the test movie ids by one
# here; the user-based training code subtracts 1 from test_matrix.col itself.
# The two conventions must stay in sync with the training cells.
if prog_type == 'item-based':
    test_cols = test.movie_id-1
else:
    test_cols = test.movie_id
test_matrix = sparse.coo_matrix((test.rating, (test.user_id, test_cols)))
del test

In [None]:
model = Autoencoder(user_num=A.shape[0], item_num=A.shape[1], num_latent_variables=1024, penalty_lambda=1, prog_type=prog_type).to(device)

optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=gamma)

# Number of training rounds; each round trains on a fresh random sample
num_rounds = int(np.floor(user_num / interval))

if prog_type == 'item-based':
    # A stores its columns as movie_id - 1, so the sample is drawn from the
    # shifted ids; the sample can never be larger than the number of movies.
    movie_cols = (movie_index - 1).tolist()
    sample_size = min(interval, len(movie_cols))
    for _ in range(num_rounds):
        random_index = random.sample(movie_cols, sample_size)
        index = np.where(np.isin(A.col, random_index))
        test_index = np.where(np.isin(test_matrix.col, random_index))
        col_le = preprocessing.LabelEncoder()
        col_le.fit(A.col[index])

        # sparse indices must be integers, values are floats
        part_indices = torch.as_tensor(np.vstack((A.row[index], col_le.transform(A.col[index]))), dtype=torch.long)
        part_values = torch.as_tensor(A.data[index], dtype=torch.float32)
        A_part = torch.sparse_coo_tensor(part_indices, part_values, torch.Size((user_num, sample_size))).to(device).to_dense()

        test_data = np.vstack((test_matrix.row[test_index], col_le.transform(test_matrix.col[test_index]), test_matrix.data[test_index]))
        val_num = len(test_index[0])
        loss_1, val_loss, rmse = model.train_model(device, optimizer, A_part.T, epochs, data_num, test_data, val_num, batch_size_train, 'item-based', log_interval)

elif prog_type == 'user-based':
    user_rows = user_index.tolist()
    sample_size = min(interval, len(user_rows))
    for _ in range(num_rounds):
        random_index = random.sample(user_rows, sample_size)
        index = np.where(np.isin(A.row, random_index))
        test_index = np.where(np.isin(test_matrix.row, random_index))
        col_le = preprocessing.LabelEncoder()
        col_le.fit(A.row[index])

        # sparse indices must be integers, values are floats; the width is
        # A.shape[1] so that it always matches the model's input size
        part_indices = torch.as_tensor(np.vstack((col_le.transform(A.row[index]), A.col[index])), dtype=torch.long)
        part_values = torch.as_tensor(A.data[index], dtype=torch.float32)
        A_part = torch.sparse_coo_tensor(part_indices, part_values, torch.Size((sample_size, A.shape[1]))).to(device).to_dense()

        # test_matrix keeps the raw movie ids in this variant, hence the -1
        test_data = np.vstack((col_le.transform(test_matrix.row[test_index]), test_matrix.col[test_index]-1, test_matrix.data[test_index]))
        val_num = len(test_index[0])
        loss_1, val_loss, rmse = model.train_model(device, optimizer, A_part, epochs, data_num, test_data, val_num, batch_size_train, 'user-based', log_interval)

print(val_loss)

In [None]:
# Model Evaluation (`Print` with a capital P is not defined; the builtin is `print`)
print("The RMSE score for user-based AutoRec Model is ", rmse)

Next, I fit the model of item-based AutoRec Recommender System

In [None]:
# Training settings
batch_size_train = 80
batch_size_test = 500
epochs = 40
lr = 1e-3
gamma = 1e-4          # passed to the optimizer as weight_decay
seed = 1
log_interval = 10
interval = 50000      # number of users/movies sampled per training round
save_model = True
prog_type = 'item-based'

# Seed every random generator used during training: torch for the weights and
# batch shuffling, `random` for the sampling done with random.sample, and
# NumPy for completeness. Seeding torch alone leaves the sampling irreproducible.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [None]:
model = Autoencoder(user_num=A.shape[0], item_num=A.shape[1], num_latent_variables=1024, penalty_lambda=1, prog_type=prog_type).to(device)

optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=gamma)

# Number of training rounds; each round trains on a fresh random sample
num_rounds = int(np.floor(user_num / interval))

if prog_type == 'item-based':
    # A stores its columns as movie_id - 1, so the sample is drawn from the
    # shifted ids; the sample can never be larger than the number of movies.
    movie_cols = (movie_index - 1).tolist()
    sample_size = min(interval, len(movie_cols))
    for _ in range(num_rounds):
        random_index = random.sample(movie_cols, sample_size)
        index = np.where(np.isin(A.col, random_index))
        # NOTE(review): this assumes test_matrix.col uses the same movie_id - 1
        # convention as A.col. That only holds if test_matrix was built with
        # prog_type == 'item-based'; verify before trusting the validation loss.
        test_index = np.where(np.isin(test_matrix.col, random_index))
        col_le = preprocessing.LabelEncoder()
        col_le.fit(A.col[index])

        # sparse indices must be integers, values are floats
        part_indices = torch.as_tensor(np.vstack((A.row[index], col_le.transform(A.col[index]))), dtype=torch.long)
        part_values = torch.as_tensor(A.data[index], dtype=torch.float32)
        A_part = torch.sparse_coo_tensor(part_indices, part_values, torch.Size((user_num, sample_size))).to(device).to_dense()

        test_data = np.vstack((test_matrix.row[test_index], col_le.transform(test_matrix.col[test_index]), test_matrix.data[test_index]))
        val_num = len(test_index[0])
        # train_model returns three values; unpacking only two raised a ValueError
        loss_1, val_loss, rmse = model.train_model(device, optimizer, A_part.T, epochs, data_num, test_data, val_num, batch_size_train, 'item-based', log_interval)

elif prog_type == 'user-based':
    user_rows = user_index.tolist()
    sample_size = min(interval, len(user_rows))
    for _ in range(num_rounds):
        random_index = random.sample(user_rows, sample_size)
        index = np.where(np.isin(A.row, random_index))
        test_index = np.where(np.isin(test_matrix.row, random_index))
        col_le = preprocessing.LabelEncoder()
        col_le.fit(A.row[index])

        # sparse indices must be integers, values are floats; the width is
        # A.shape[1] so that it always matches the model's input size
        part_indices = torch.as_tensor(np.vstack((col_le.transform(A.row[index]), A.col[index])), dtype=torch.long)
        part_values = torch.as_tensor(A.data[index], dtype=torch.float32)
        A_part = torch.sparse_coo_tensor(part_indices, part_values, torch.Size((sample_size, A.shape[1]))).to(device).to_dense()

        # test_matrix keeps the raw movie ids in this variant, hence the -1
        test_data = np.vstack((col_le.transform(test_matrix.row[test_index]), test_matrix.col[test_index]-1, test_matrix.data[test_index]))
        val_num = len(test_index[0])
        # train_model returns three values; unpacking only two raised a ValueError
        loss_1, val_loss, rmse = model.train_model(device, optimizer, A_part, epochs, data_num, test_data, val_num, batch_size_train, 'user-based', log_interval)

print(val_loss)

In [None]:
# Model Evaluation (`Print` with a capital P is not defined; the builtin is `print`)
print("The RMSE score for item-based AutoRec Model is ", rmse)

## Model Three - Naive Bayes

I consider the Naïve Bayes collaborative filtering algorithm for the dataset because Naive Bayes is best suited for categorical input variables. The ratings in the Netflix Prize dataset are five integers, which can be treated as five categories. Moreover, Naive Bayes can handle the problem of overfitting and MemoryError. It also achieves better results in accuracy and performance, which makes it favorable in real-life applications.

### Implementation of the model

In [None]:
class NaiveBayesCF():
    """User-based Naive Bayes collaborative filter with Laplacian smoothing.

    For a target user u and an item j, the posterior of every rating value v_s is

        P(r_uj = v_s | ratings of u)  ~  P(r_uj = v_s) * prod_{k in I_u} P(r_uk | r_uj = v_s)

    where I_u is the set of items u rated in the training data.  The prior is the
    smoothed share of item j's ratings equal to v_s.  Each conditional is the
    smoothed share of users who rated item j as v_s that gave item k the same
    rating as u did.

    Parameters
    ----------
    train_user, train_item, train_data : array-like, same length
        User id, item id and rating of every training observation.
    test_user, test_item, test_label : array-like, same length
        User id, item id and true rating of every held-out observation.
    alpha : float, default 0.1
        Laplacian smoothing parameter; must be strictly positive.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly positive.
    """

    def __init__(self, train_user, train_item, train_data, test_user, test_item, test_label, alpha=0.1):
        # alpha == 0 leads to 0/0 as soon as a count is empty, so reject it up front.
        if alpha <= 0:
            raise ValueError("alpha must be strictly positive")

        self.alpha = alpha
        # np.asarray lets callers pass lists; boolean masks below need ndarrays.
        self.train_user = np.asarray(train_user)
        self.train_item = np.asarray(train_item)
        self.rating = np.asarray(train_data)
        self.values = np.unique(self.rating)      # distinct rating values, ascending
        self.test_user = np.asarray(test_user)
        self.test_item = np.asarray(test_item)
        self.test_label = np.asarray(test_label)

    def _Root_Mean_Square_Error(self):
        """Score the model on the held-out ratings given to the constructor.

        Returns
        -------
        diff_error : np.ndarray
            ``prediction - true rating`` for every test observation, in the
            same order as ``test_label``.
        rmse : float
            Root mean squared error over the test observations (NaN when the
            test set is empty).
        """
        _, _, pred_list, _, _ = self.predict(user_idxes=self.test_user,
                                             movie_idxes=self.test_item)

        # predict() emits results grouped by ascending user id (input order is
        # kept inside each user), i.e. in stable-argsort order of the users.
        # Scatter them back so that each prediction meets its own label.
        order = np.argsort(self.test_user, kind="stable")
        aligned = np.empty(len(pred_list), dtype=float)
        aligned[order] = pred_list

        diff_error = aligned - self.test_label
        if len(pred_list) == 0:
            return diff_error, float("nan")
        rmse = np.sqrt(np.square(diff_error).sum() / len(pred_list))

        return diff_error, rmse

    def _log_likelihood(self, peers, rated_items, rated_vals):
        """Return log prod_k P(r_uk = rated_vals[k] | peers), Laplace-smoothed.

        ``peers`` are the users who gave the target item the rating value under
        consideration.  ``rated_items`` (sorted, unique) and ``rated_vals`` are
        the target user's own items and ratings.
        """
        if len(rated_items) == 0:
            return 0.0                            # empty product

        by_peer = np.isin(self.train_user, peers)
        peer_items = self.train_item[by_peer]
        peer_ratings = self.rating[by_peer]

        # Map every peer rating onto the slot of its item inside rated_items;
        # ratings of items the target user never rated are dropped.
        slot = np.searchsorted(rated_items, peer_items)
        slot[slot == len(rated_items)] = 0        # keep the index valid; filtered next
        relevant = rated_items[slot] == peer_items
        slot = slot[relevant]
        agree = peer_ratings[relevant] == rated_vals[slot]

        n_rated = np.bincount(slot, minlength=len(rated_items))
        n_agree = np.bincount(slot[agree], minlength=len(rated_items))
        factors = (n_agree + self.alpha) / (n_rated + self.alpha * len(self.values))
        return float(np.log(factors).sum())

    def predict(self, user_idxes, movie_idxes, method="weight_average"):
        """Predict a rating for every ``(user_idxes[i], movie_idxes[i])`` pair.

        Parameters
        ----------
        user_idxes, movie_idxes : array-like, same length
            User and item ids of the pairs to score.
        method : {"weight_average", "argmax"}
            "weight_average" returns the posterior mean of the rating values,
            "argmax" the most probable rating value.

        Returns
        -------
        pred_user, pred_item, pred_list : list
            User id, item id and predicted rating per pair.  Pairs are grouped
            by ascending user id, keeping the input order within each user.
        pred_prior, pred_likeli : list
            Smoothed prior and likelihood for every (pair, rating value), in
            the same pair order with rating values ascending.

        Raises
        ------
        NotImplementedError
            If ``method`` is not one of the supported names.
        ValueError
            If the two inputs differ in length.
        """
        self.method = method
        if method not in ("weight_average", "argmax"):
            raise NotImplementedError("Cannot use provided optimization method!!!")

        user_idxes = np.asarray(user_idxes)
        movie_idxes = np.asarray(movie_idxes)
        if len(user_idxes) != len(movie_idxes):
            raise ValueError("Inputs must have the same length!!!")

        n_values = len(self.values)
        pred_user, pred_item, pred_list = [], [], []
        pred_prior, pred_likeli = [], []

        for user_id in np.unique(user_idxes):
            owned = self.train_user == user_id
            rating_u = self.rating[owned]
            item_u = self.train_item[owned]
            # The user's own items and ratings do not depend on the target
            # item, so compute them once per user (max resolves duplicates).
            rated_items = np.unique(item_u)
            rated_vals = np.array([np.max(rating_u[item_u == k]) for k in rated_items])

            for item in movie_idxes[user_idxes == user_id]:
                on_item = self.train_item == item
                item_ratings = self.rating[on_item]
                item_raters = self.train_user[on_item]

                # Work in log space: a product over hundreds of rated items
                # underflows to 0.0 in plain floating point.
                log_post = np.empty(n_values)
                for s, vs in enumerate(self.values):
                    gave_vs = item_ratings == vs
                    prior = (np.count_nonzero(gave_vs) + self.alpha) / (len(item_ratings) + self.alpha * n_values)
                    log_lik = self._log_likelihood(item_raters[gave_vs], rated_items, rated_vals)

                    log_post[s] = np.log(prior) + log_lik
                    pred_prior.append(prior)
                    pred_likeli.append(float(np.exp(log_lik)))

                if method == "weight_average":
                    # Shift by the maximum before exponentiating so the largest weight is 1.
                    weights = np.exp(log_post - log_post.max()) if n_values else log_post
                    total = weights.sum()
                    pred_list.append((weights * self.values).sum() / total if total != 0 else 2.5)
                else:
                    pred_list.append(self.values[int(np.argmax(log_post))])
                pred_item.append(item)
                pred_user.append(user_id)

        return pred_user, pred_item, pred_list, pred_prior, pred_likeli

### Evaluating the model

In order to handle the problem of overfitting, Laplacian smoothing is commonly used. Instead of estimating $P(r_{uj} = v_s)$ directly from the raw counts, I smooth it with a Laplacian smoothing parameter $\alpha$. I tested different values of $\alpha$ from 0.01 to 1 to see how it influences the final result, and found that the model performs best when $\alpha$ is set to 0.1.

In [None]:
# Instantiate the Naive Bayes model on the training ratings and keep the
# held-out ratings inside it for scoring.
# NOTE(review): this cell reads the ratings column as 'rating', while the
# Model Four cells read it as "Rating" -- confirm which name the frames carry.
NB = NaiveBayesCF(
    train_user=train['user_id'].values,
    train_item=train['movie_id'].values,
    train_data=train['rating'].values,
    test_user=test['user_id'].values,
    test_item=test['movie_id'].values,
    test_label=test['rating'].values,
    alpha=0.1,
)

In [None]:
# Calculate the RMSE of the Naive Bayes model on the held-out ratings.
# The first return value (the per-rating errors) is not needed here.
_, rmse = NB._Root_Mean_Square_Error()
print("The RMSE on the test set is ", rmse)

Since time is limited, I only run my user-based Naive Bayes model on the test set. As mentioned above, to overcome the low computational efficiency, I divide the test set into twelve parts based on user ID and combine the results.

## Model Four - Neural Collaborative Filtering

This example demonstrates Neural Collaborative filtering using the Netflix dataset
to recommend movies to users.

The steps in the model are as follows:

1. Map user ID to a "user vector" via an embedding matrix
2. Map movie ID to a "movie vector" via an embedding matrix
3. Compute the dot product between the user vector and movie vector, to obtain
a match score between the user and the movie (predicted rating).
4. Train the embeddings via gradient descent using all known user-movie pairs.

In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt
import random

In [None]:
# Load the held-out ratings, keeping only the columns at positions 1-3 of test.csv.
# NOTE(review): presumably position 0 is a saved index column -- confirm against the file.
test = pd.read_csv('test.csv', usecols = [1,2,3])
test.head()

In [None]:
# Load the training ratings, keeping only the columns at positions 1-3 of train.csv.
# NOTE(review): presumably position 0 is a saved index column -- confirm against the file.
train = pd.read_csv("train.csv", usecols=[1,2,3])
train.head()

In [None]:
# Every distinct user id / movie id that occurs in either split.  The ids of
# the training split are listed first, then those of the test split, and the
# set removes the overlap.
UserIds = list(set(train['user_id'].unique().tolist() + test['user_id'].unique().tolist()))
MovieIds = list(set(train['movie_id'].unique().tolist() + test['movie_id'].unique().tolist()))

print(len(UserIds), len(MovieIds))

First, we need to perform some preprocessing to encode users and movies as integer indices.

In [None]:
# Map every raw id to its position in the id list, giving the contiguous
# integer codes that the embedding layers index with.
user2user_encoded = dict(zip(UserIds, range(len(UserIds))))
movie2movie_encoded = dict(zip(MovieIds, range(len(MovieIds))))
num_users = len(UserIds)
num_movies = len(MovieIds)

# Attach the encoded ids to both splits and store the ratings as floats.
train["user"] = train["user_id"].map(user2user_encoded)
train["movie"] = train["movie_id"].map(movie2movie_encoded)
train["Rating"] = train["Rating"].values.astype(float)

test["user"] = test["user_id"].map(user2user_encoded)
test["movie"] = test["movie_id"].map(movie2movie_encoded)
test["Rating"] = test["Rating"].values.astype(float)

# Bounds of the rating scale.
min_rating = 1
max_rating = 5

summary_template = None  # placeholder removed below; see print call
del summary_template
print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

In [None]:
# Model inputs: one (encoded user, encoded movie) row per rating.
trainX = train[["user", "movie"]].values.astype(int)
testX = test[["user", "movie"]].values.astype(int)

# Targets: ratings rescaled from [1, 5] to [0, 1], which makes training easier.
# Whole-column arithmetic yields the same values as a per-element apply.
trainy = ((train["Rating"] - 1) / (5 - 1)).values
testy = ((test["Rating"] - 1) / (5 - 1)).values

### Implementation of the model

I embed both users and movies into 50-dimensional vectors.

The model computes a match score between user and movie embeddings via a dot product,
and adds a per-movie and per-user bias. The match score is scaled to the `[0, 1]`
interval via a sigmoid (since the ratings are normalized to this range).

In [None]:
EMBEDDING_SIZE = 50


class RecommenderNet(keras.Model):
    """Embedding-based rating model: sigmoid(user . movie + user_bias + movie_bias).

    Parameters
    ----------
    num_users, num_movies : int
        Sizes of the user and movie vocabularies (number of embedding rows).
    embedding_size : int
        Dimension of the user and movie embedding vectors.
    **kwargs
        Forwarded to ``keras.Model``.
    """

    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.movie_embedding = layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.movie_bias = layers.Embedding(num_movies, 1)

    def call(self, inputs):
        """Score a batch of pairs.

        ``inputs[:, 0]`` holds the encoded user ids and ``inputs[:, 1]`` the
        encoded movie ids.  Returns one value in (0, 1) per row, shape (batch, 1).
        """
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        movie_vector = self.movie_embedding(inputs[:, 1])
        movie_bias = self.movie_bias(inputs[:, 1])
        # Row-wise dot product, shape (batch, 1).  tf.tensordot(..., 2) would
        # contract over the batch axis too and hand every sample the same
        # batch-wide scalar instead of its own user-movie score.
        dot_user_movie = tf.reduce_sum(user_vector * movie_vector, axis=1, keepdims=True)
        # Add all the components (including bias)
        x = dot_user_movie + user_bias + movie_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)


# Build the model and compile it with binary cross-entropy on the [0, 1]-scaled
# ratings.  `learning_rate` is the supported keyword; the `lr` alias is
# deprecated and rejected by newer Keras releases.
model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
)

### Training the model based on the data split

In [None]:
# Train for 10 epochs on the training split; the test split is passed as
# validation data, so `val_loss` in the history is the loss on the test split.
history = model.fit(
    trainX,
    trainy,
    validation_data=(testX, testy),
    epochs=10,
    batch_size=32,
    verbose=2,
)

In [None]:
# Persist the trained weights under the 'my_model.ckpt' prefix (weights only;
# a RecommenderNet has to be rebuilt before they can be loaded back).
model.save_weights('my_model.ckpt')

### Plotting training and validation loss

In [None]:
# Loss per epoch on the training split and on the test split used for validation.
plt.plot(history.history["loss"], label="train")
plt.plot(history.history["val_loss"], label="test")
plt.title("model loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend(loc="upper left")
plt.show()

### Showing top 10 movie recommendations to a user

In [None]:
# Movie catalogue: keep the first three columns and give them readable names.
# NOTE(review): read_csv treats the first line as a header here -- confirm that
# movie_titles.csv really has one, otherwise the first movie is lost.
movie_df = pd.read_csv("movie_titles.csv", encoding="ISO-8859-1").iloc[:, [0, 1, 2]]
movie_df.columns = ["MovieId", "Time", "MovieTitle"]
movie_df.head()

In [None]:
# Pick one user at random from the test split (no seed, so the choice differs between runs).
user_id = test["user_id"].sample(1).iloc[0]

In [None]:
# Everything the chosen user has rated, in BOTH splits.  Looking at the test
# split alone would leave the movies rated in the training split marked as
# "not watched", so they could be recommended back to the user.
movies_watched_by_user = pd.concat(
    [train[train.user_id == user_id], test[test.user_id == user_id]],
    ignore_index=True,
)
movies_watched_by_user

In [None]:
# Candidate movies: catalogue entries the user has not rated.  The catalogue
# column is named "MovieId" (see the cell that loads movie_titles.csv), while
# the ratings frame uses "movie_id".
movies_not_watched = movie_df[
    ~movie_df["MovieId"].isin(movies_watched_by_user.movie_id.values)
]["MovieId"]
# Keep only the movies the model has an embedding for, as encoded ids.
movies_not_watched = list(
    set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))
)
movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]
user_encoder = user2user_encoded.get(user_id)
# One (encoded user, encoded movie) row per candidate, matching the model input layout.
user_movie_array = np.hstack(
    ([[user_encoder]] * len(movies_not_watched), movies_not_watched)
)

In [None]:
# Inverse look-ups: encoded index -> raw id.  Both dictionaries must be keyed
# by the index, otherwise they merely duplicate the forward mappings.
user_encoded2user = {i: x for i, x in enumerate(UserIds)}
movie_encoded2movie = {i: x for i, x in enumerate(MovieIds)}

In [None]:
# Score every candidate movie for the user and keep the ten best, best first.
ratings = model.predict(user_movie_array).flatten()
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_movie_ids = [
    movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices
]

print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
# The ratings frame names its id column "movie_id"; only the catalogue uses "MovieId".
top_movies_user = (
    movies_watched_by_user.sort_values(by="Rating", ascending=False)
    .head(5)
    .movie_id.values
)
movie_df_rows = movie_df[movie_df["MovieId"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.MovieTitle)

print("----" * 8)
print("Top 10 movie recommendations")
print("----" * 8)
# Look the titles up in ranking order; an isin() filter would list them in
# catalogue order and hide which movie scored highest.
recommended_movies = (
    movie_df.set_index("MovieId").loc[recommended_movie_ids].reset_index()
)
for row in recommended_movies.itertuples():
    print(row.MovieTitle)