In [18]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
from collections import namedtuple
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import NMF

In [19]:
#Reading in Dataframe from project github repository
url_users = 'https://raw.githubusercontent.com/JSchlangen9/Movie-Ratings/main/users.csv'
url_movies = 'https://raw.githubusercontent.com/JSchlangen9/Movie-Ratings/main/movies.csv'
url_train = 'https://raw.githubusercontent.com/JSchlangen9/Movie-Ratings/main/train.csv'
url_test = 'https://raw.githubusercontent.com/JSchlangen9/Movie-Ratings/main/test.csv'

file_users = requests.get(url_users)
file_movies = requests.get(url_movies)
file_train = requests.get(url_train)
file_test = requests.get(url_test)

string_users = StringIO(file_users.text)
string_movies = StringIO(file_movies.text)
string_train = StringIO(file_train.text)
string_test = StringIO(file_test.text)

MV_users = pd.read_csv(string_users)
MV_movies = pd.read_csv(string_movies)
train = pd.read_csv(string_train)
test = pd.read_csv(string_test)

In [20]:
Data = namedtuple('Data', ['users','movies','train','test'])
data = Data(MV_users, MV_movies, train, test)

In [21]:
class RecSys():
    def __init__(self,data):
        self.data=data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        self.mid2idx = dict(zip(self.data.movies.mID,list(range(len(self.data.movies)))))
        self.uid2idx = dict(zip(self.data.users.uID,list(range(len(self.data.users)))))
        self.Mr=self.rating_matrix()
        self.Mm=None 
        self.sim=np.zeros((len(self.allmovies),len(self.allmovies)))
        
    def rating_matrix(self):
        """
        Convert the rating matrix to numpy array of shape (#allusers,#allmovies)
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID] 
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)
        
        return np.array(coo_matrix((rating_train, (ind_user, ind_movie)), shape=(len(self.allusers), len(self.allmovies))).toarray())
    
    def predict_to_user_average(self):
        """
        Predict to average rating for the user.
        Returns numpy array of shape (#users,)
        """
        # Generate an array as follows:
        # 1. Calculate all avg user rating as sum of ratings of user across all movies/number of movies whose rating > 0
        # 2. Return the average rating of users in test data
        # your code here
        users = np.array(self.data.test['uID'])
        x = np.zeros(len(users))
        
        for i in range(len(x)):
            idx = self.uid2idx[users[i]]
            avg = self.Mr[idx][np.nonzero(self.Mr[idx])].mean()
            x[i] = avg
            
        return x

In [22]:
rs = RecSys(data)
rm = rs.rating_matrix()
avg = rs.predict_to_user_average()[0:len(rs.allusers)]

model = NMF()
W = model.fit_transform(np.array(avg).reshape(-1, 1))
H = model.components_

vals = np.matmul(W, H.transpose())

print('RMSE:',  root_mean_squared_error(vals, avg))

RMSE: 2.6942996973225916e-16




## Discussion

Overall, the model produced generated a very low root mean squared error, however we are not confident in the model's ability to predict acurately. We could look to improve the performance of the model by using different techniques as opposed to matrix factorization. It is helpful when we can use matrix factorization to find aspects of the data that make it different from other data points for a model to learn. This technique could cause the model to incorrectly interpret the movie ratings and which movies were not rated, which would ultimately deliver an inaccurate prediction. Building a model using movie rating by movie genre would be more helpful and easier for a model to learn.