In [4]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix 
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error

In [5]:
# Read the four CSV tables (users, movies, train ratings, test ratings)
# in one pass; the order of the paths matches the order of the targets.
MV_users, MV_movies, train, test = map(
    pd.read_csv,
    (
        'movie_data/users.csv',
        'movie_data/movies.csv',
        'movie_data/train.csv',
        'movie_data/test.csv',
    ),
)

In [6]:
from collections import namedtuple

# Immutable bundle holding the four tables the recommender classes read
# (accessed as data.users, data.movies, data.train, data.test).
Data = namedtuple('Data', ['users','movies','train','test'])
data = Data(
    users=MV_users,
    movies=MV_movies,
    train=train,
    test=test,
)

In [7]:
class RecSys():
    """
    Baseline and item-similarity rating predictor.

    `data` is any object exposing four DataFrames as attributes:
    `users` (column `uID`), `movies` (columns `mID`, `title`, `year` plus
    genre columns), `train` and `test` (columns `uID`, `mID`, `rating`).

    Attributes set by the constructor:
      allusers / allmovies : lists of user / movie ids, in table order
      genres               : movie columns other than mID, title, year
      uid2idx / mid2idx    : id -> row / column position in `Mr`
      Mr                   : dense (#users, #movies) training rating matrix,
                             0 where no training rating exists
      Mm                   : placeholder, initialised to None
      sim                  : (#movies, #movies) similarity matrix, initialised
                             to zeros; nothing in this class fills it in
    """

    # Rating used whenever no better estimate is available.
    DEFAULT_RATING = 3

    def __init__(self,data):
        self.data=data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        self.mid2idx = dict(zip(self.data.movies.mID, range(len(self.data.movies))))
        self.uid2idx = dict(zip(self.data.users.uID, range(len(self.data.users))))
        self.Mr=self.rating_matrix()
        self.Mm=None
        self.sim=np.zeros((len(self.allmovies),len(self.allmovies)))

    def rating_matrix(self):
        """
        Convert the training ratings to a numpy array of shape
        (#allusers, #allmovies). Cells without a training rating are 0.

        Raises KeyError if the training data references a user or movie id
        that is missing from the users / movies tables.
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)

        # NOTE: coo_matrix sums duplicate (row, col) entries when densified,
        # so a (user, movie) pair listed twice in train ends up with the sum
        # of its ratings.
        sparse = coo_matrix(
            (rating_train, (ind_user, ind_movie)),
            shape=(len(self.allusers), len(self.allmovies)),
        )
        # toarray() already returns an ndarray; no extra np.array() copy needed.
        return sparse.toarray()

    def predict_everything_to_3(self):
        """
        Predict everything to 3 for the test data.
        Returns numpy array of shape (# of rows in testdata,)
        """
        return np.full(len(self.data.test), 3)

    def predict_to_user_average(self):
        """
        Predict each test row as the average training rating of its user.

        The average is taken over the user's rated movies only (rating > 0).
        Users without any training rating fall back to DEFAULT_RATING.
        Returns numpy array of shape (# of rows in testdata,)
        """
        rated = self.Mr > 0
        counts = rated.sum(axis=1)
        sums = np.where(rated, self.Mr, 0).sum(axis=1)

        # Start from the fallback and overwrite only users with >= 1 rating;
        # `where=` leaves the other entries untouched (no divide-by-zero).
        user_avg_ratings = np.full(len(self.allusers), float(self.DEFAULT_RATING))
        np.divide(sums, counts, out=user_avg_ratings, where=counts > 0)

        # One prediction per test row, looked up through the row's user id.
        test_user_indices = np.array(
            [self.uid2idx[u] for u in self.data.test.uID], dtype=np.intp
        )
        return user_avg_ratings[test_user_indices]

    def predict_from_sim(self,uid,mid):
        """
        Predict a user rating on a movie given userID and movieID.

        The prediction is the similarity-weighted mean of the user's training
        ratings, restricted to movies the user rated (rating > 0) whose
        similarity to `mid` is strictly positive. If no such movie exists
        (always the case while `self.sim` is still all zeros) the result is
        DEFAULT_RATING.

        Raises KeyError for an unknown uid or mid.
        """
        user_ratings = self.Mr[self.uid2idx[uid]]
        sim_scores = self.sim[self.mid2idx[mid]]

        valid = (sim_scores > 0) & (user_ratings > 0)
        if not valid.any():
            return self.DEFAULT_RATING

        # Every selected similarity is > 0, so the normaliser is > 0 as well
        # and no additional zero check is required.
        weights = sim_scores[valid]
        return np.dot(user_ratings[valid], weights) / np.sum(weights)

    def predict(self):
        """
        Predict ratings in the test data. Returns predicted rating in a numpy array of size (# of rows in testdata,)
        """
        test = self.data.test
        # Iterating the two id columns directly avoids the per-row Series
        # construction of DataFrame.iterrows().
        return np.array(
            [self.predict_from_sim(uid, mid) for uid, mid in zip(test['uID'], test['mID'])]
        )

    def rmse(self,yp):
        """
        Root mean squared error of predictions `yp` against the test ratings.

        `yp` is any array-like with one entry per test row. NaN predictions
        are imputed to DEFAULT_RATING on a private copy, so the caller's
        array is left unchanged.
        """
        yp = np.array(yp, dtype=float)  # np.array copies: never mutate the input
        yp[np.isnan(yp)] = self.DEFAULT_RATING
        yt = np.asarray(self.data.test.rating, dtype=float)
        return np.sqrt(((yt-yp)**2).mean())

In [8]:
# Build a smaller dataset from the leading 30000 rows of each split
np.random.seed(42)
sample_train = train.iloc[:30000]
sample_test = test.iloc[:30000]

# Keep only the users / movies that appear in either sampled split
sample_MV_users = MV_users.loc[
    MV_users['uID'].isin(sample_train['uID']) | MV_users['uID'].isin(sample_test['uID'])
]
sample_MV_movies = MV_movies.loc[
    MV_movies['mID'].isin(sample_train['mID']) | MV_movies['mID'].isin(sample_test['mID'])
]

sample_data = Data(
    users=sample_MV_users,
    movies=sample_MV_movies,
    train=sample_train,
    test=sample_test,
)

In [9]:
# Constant baseline on the sample dataset: predict 3 for every test row
# and report the resulting RMSE against the sample test ratings.
sample_rs = RecSys(sample_data)
sample_yp = sample_rs.predict_everything_to_3()
print(sample_rs.rmse(sample_yp))

1.2642784503423288


In [10]:
# Same constant-3 baseline, evaluated on the full dataset.
rs = RecSys(data)
yp = rs.predict_everything_to_3()
print(rs.rmse(yp))

1.2585510334053043


In [11]:
# User-average baseline on the sample dataset (reuses sample_rs created above;
# sample_yp is overwritten with the new predictions).
sample_yp = sample_rs.predict_to_user_average()
print(sample_rs.rmse(sample_yp))

1.1429596846619763


In [12]:
# User-average baseline on the full dataset (reuses rs created above;
# yp is overwritten with the new predictions).
yp = rs.predict_to_user_average()
print(rs.rmse(yp))

1.0352910334228647


In [14]:
class NMFRecSys(RecSys):
    """
    Rating predictor based on non-negative matrix factorisation of the
    training rating matrix `self.Mr` built by RecSys.

    NOTE(review): `self.Mr` holds 0 for every (user, movie) pair without a
    training rating, and `fit` factorises the matrix as-is, so those zeros
    are fitted like real ratings. This is the likely reason the RMSE is well
    above the constant baseline; confirm before relying on the scores.

    Parameters
    ----------
    data : object with `users`, `movies`, `train`, `test` DataFrames
    n_components, init, random_state : forwarded unchanged to sklearn's NMF
    """

    def __init__(self, data, n_components=5, init='random', random_state=0):
        super().__init__(data)
        self.n_components = n_components
        self.model = NMF(n_components=n_components, init=init, random_state=random_state)

    def fit(self):
        """Factorise `self.Mr` and store the user and item factor matrices."""
        self.user_features = self.model.fit_transform(self.Mr)
        self.item_features = self.model.components_

    def predict_ratings(self):
        """
        Return the reconstructed (#users, #movies) rating matrix,
        i.e. user_features @ item_features.

        Raises AttributeError if `fit` has not been called yet.
        """
        if not hasattr(self, 'user_features') or not hasattr(self, 'item_features'):
            # Same exception type as the bare attribute access would raise,
            # but with an actionable message.
            raise AttributeError("NMFRecSys is not fitted yet; call fit() before predicting.")
        return np.dot(self.user_features, self.item_features)

    def predict(self):
        """
        Predict ratings in the test data. Returns a numpy array of size
        (# of rows in testdata,), read from the reconstructed matrix.

        Raises KeyError for a test uID / mID missing from the id maps.
        """
        predicted_ratings = self.predict_ratings()
        test = self.data.test
        user_idx = np.array([self.uid2idx[uid] for uid in test['uID']], dtype=np.intp)
        movie_idx = np.array([self.mid2idx[mid] for mid in test['mID']], dtype=np.intp)
        # Paired fancy indexing picks one (user, movie) cell per test row.
        return predicted_ratings[user_idx, movie_idx]

    def rmse(self, yp):
        """
        Root mean squared error of `yp` against the test ratings.

        NaN predictions are imputed to 3 on a private copy, so the caller's
        array is left unchanged.
        """
        yp = np.array(yp, dtype=float)  # np.array copies: never mutate the input
        yp[np.isnan(yp)] = 3
        yt = np.array(self.data.test.rating)
        return np.sqrt(mean_squared_error(yt, yp))

# Evaluate the NMF recommender on the sample dataset with its default
# settings (n_components=5, init='random', random_state=0).
nmf_rs = NMFRecSys(sample_data)
nmf_rs.fit()  # must run before predict(): fit() creates the factor matrices
nmf_predictions = nmf_rs.predict()
print(nmf_rs.rmse(nmf_predictions))

3.7284246943887047
