In [7]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix

In [3]:
# Read the four CSV inputs, in order: users, movies, train ratings, test ratings.
MV_users, MV_movies, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './movies_data/users.csv',
        './movies_data/movies.csv',
        './movies_data/train.csv',
        './movies_data/test.csv',
    )
)

In [4]:
from collections import namedtuple
# Bundle the four frames into one immutable record so a recommender can be
# handed a single object and read them back by field name.
Data = namedtuple(
    'Data',
    ['users', 'movies', 'train', 'test'],
)
data = Data(users=MV_users, movies=MV_movies, train=train, test=test)

In [9]:
class RecSys():
    """Base class for similarity-based movie recommenders.

    Builds id-to-index lookups and a dense user-by-movie rating matrix from
    ``data.train``. ``sim`` starts as an all-zero movie-by-movie matrix and
    ``Mm`` as ``None``; nothing in this class fills them in, so ``predict``
    yields only NaN until ``sim`` is populated by other code.

    Parameters
    ----------
    data : object with ``users``, ``movies``, ``train`` and ``test`` attributes
        ``users`` needs a ``uID`` column; ``movies`` needs ``mID``, ``title``
        and ``year`` columns; ``train`` and ``test`` need ``uID``, ``mID`` and
        ``rating`` columns.
    """

    def __init__(self, data):
        self.data = data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        # Every movie column other than the id/title/year metadata is kept as a genre name.
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        # Map raw ids to row/column positions in the rating and similarity matrices.
        self.mid2idx = {mid: idx for idx, mid in enumerate(self.data.movies.mID)}
        self.uid2idx = {uid: idx for idx, uid in enumerate(self.data.users.uID)}
        self.Mr = self.rating_matrix()
        self.Mm = None
        self.sim = np.zeros((len(self.allmovies), len(self.allmovies)))

    def rating_matrix(self):
        """Return the dense ``(n_users, n_movies)`` array of training ratings.

        Cells with no training rating are 0. Repeated (user, movie) pairs in
        the training data are summed by the COO-to-dense conversion.

        Raises
        ------
        KeyError
            If ``data.train`` mentions a user or movie id that is missing from
            ``data.users`` / ``data.movies``.
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)
        shape = (len(self.allusers), len(self.allmovies))
        # toarray() already returns an ndarray, so no extra np.array() wrapper is needed.
        return coo_matrix((rating_train, (ind_user, ind_movie)), shape=shape).toarray()

    def predict_from_sim(self, uid, mid):
        """Predict the rating of movie ``mid`` by user ``uid``.

        The prediction is the mean of the user's ratings weighted by the
        similarity of each rated movie to ``mid``.

        Returns
        -------
        float
            The weighted mean, or NaN when the similarity weights over the
            user's rated movies sum to zero (so there is nothing to average).
        """
        user_ratings = self.Mr[self.uid2idx[uid]]
        movie_similarities = self.sim[self.mid2idx[mid]]
        # Only movies the user actually rated (rating > 0) contribute weight.
        weight_total = np.dot(movie_similarities, user_ratings > 0)
        if weight_total == 0:
            # Avoid the divide-by-zero warning and a possible +/-inf; rmse()
            # substitutes a fallback rating for NaN predictions.
            return np.nan
        return np.dot(user_ratings, movie_similarities) / weight_total

    def predict(self):
        """Return a 1-D array with one prediction per row of ``data.test``, in row order."""
        test = self.data.test
        return np.array([self.predict_from_sim(uid, mid)
                         for uid, mid in zip(test.uID, test.mID)])

    def rmse(self, yp, fallback=3):
        """Return the root-mean-squared error of ``yp`` against ``data.test.rating``.

        Parameters
        ----------
        yp : array-like of float
            Predictions aligned with the rows of ``data.test``. Not modified.
        fallback : float, default 3
            Rating substituted for NaN predictions before scoring.
        """
        # np.array() copies, so the NaN replacement never touches the caller's array.
        yp = np.array(yp, dtype=float)
        yp[np.isnan(yp)] = fallback
        yt = np.array(self.data.test.rating)
        return np.sqrt(((yt - yp) ** 2).mean())


In [26]:
from sklearn.decomposition import NMF

class NMFRecommender(RecSys):
    """Recommender that factorizes the training rating matrix with scikit-learn's NMF.

    ``fit`` factorizes ``self.Mr`` into per-user and per-movie latent vectors;
    a predicted rating is the dot product of the two vectors. ``fit`` must be
    called before ``predict_rating`` / ``predict``: until then the factor
    attributes are empty lists and indexing them raises ``IndexError``.

    Parameters
    ----------
    data : passed unchanged to ``RecSys.__init__``.
    n_components : int, default 10
        Number of latent factors.
    random_state : int or None, default 0
        Seed forwarded to ``NMF`` so repeated fits give the same factors.
        Pass ``None`` to leave seeding to scikit-learn.
    """

    def __init__(self, data, n_components=10, random_state=0):
        super().__init__(data)
        self.n_components = n_components
        self.random_state = random_state
        self.model = NMF(n_components=self.n_components, random_state=self.random_state)
        self.user_matrix = []
        self.item_matrix = []

    def fit(self):
        """Factorize ``self.Mr`` and store the user and movie factor matrices."""
        # NOTE(review): self.Mr is built by RecSys; unrated cells are presumably 0
        # there, in which case NMF fits them as if they were real ratings -- confirm.
        W = self.model.fit_transform(self.Mr)
        H = self.model.components_
        self.user_matrix = W
        # Transposed so that row i is the latent vector of the movie at index i.
        self.item_matrix = H.T

    def predict_rating(self, uid, mid):
        """Return the predicted rating of movie ``mid`` by user ``uid``.

        Raises ``KeyError`` when either id is missing from the index lookups.
        """
        user_idx = self.uid2idx[uid]
        movie_idx = self.mid2idx[mid]
        user_vector = self.user_matrix[user_idx]
        movie_vector = self.item_matrix[movie_idx]
        return np.dot(user_vector, movie_vector)

    def predict(self):
        """Return an array with one predicted rating per row of ``data.test``, in row order."""
        return np.array([self.predict_rating(uid, mid) for uid, mid in zip(self.data.test.uID, self.data.test.mID)])


In [24]:
# Fit a 20-component NMF recommender on the training rating matrix.
nmf = NMFRecommender(data, n_components=20)
nmf.fit()
# yp holds one predicted rating per row of data.test, in row order.
yp = nmf.predict()




In [25]:
# Test-set RMSE of the NMF predictions (any NaN prediction is scored as a rating of 3).
nmf.rmse(yp)

2.861597419838157

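This RMSE is much worse than our baseline models in HW3. I think this is because NMF does not work well on data this sparse: the rating matrix stores every unrated movie as 0, so the factorization treats those missing entries as real zero ratings instead of ignoring them.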