# Limitation of Sklearn’s Non-Negative Matrix Factorization Library



In [10]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings


In [11]:
# Load the users and movies CSVs plus the train/test rating CSVs.
# Files are read in the order listed and bound to the names in that same order.
MV_users, MV_movies, train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/users.csv',
        'data/movies.csv',
        'data/train.csv',
        'data/test.csv',
    )
)

In [12]:
# Suppress UserWarnings only (sklearn's ConvergenceWarning is a UserWarning
# subclass). Other categories, e.g. DeprecationWarning or FutureWarning,
# remain visible instead of being silenced wholesale.
warnings.filterwarnings('ignore', category=UserWarning)

# Build the utility (user-item) matrix: one row per uID, one column per mID,
# each cell holding the 'rating' value for that pair. Pairs that do not occur
# in train_data become NaN.
utility_matrix = train_data.pivot(index='uID', columns='mID', values='rating')

# Replace the NaN cells with 0 so the matrix can be passed to NMF.
# NOTE: the factorization is fitted over every cell, so these zeros are treated
# like observed ratings of 0 rather than as "unknown".
utility_matrix.fillna(0, inplace=True)

# Factorize utility_matrix into W (one row per user) and H (one column per
# movie) with 20 latent components, using multiplicative updates on the
# Kullback-Leibler loss.
# NOTE(review): scikit-learn warns that the 'mu' solver cannot update zeros
# introduced by init='nndsvd'; 'nndsvda' may be worth evaluating. Left as-is so
# the results stay comparable with the recorded output.
nmf_model = NMF(
    n_components=20,
    init='nndsvd',
    beta_loss='kullback-leibler',
    solver='mu',
    random_state=42,
    max_iter=500,
)
W = nmf_model.fit_transform(utility_matrix)
H = nmf_model.components_

# Reconstruct the full matrix of predicted ratings; its rows and columns line
# up positionally with utility_matrix.index and utility_matrix.columns.
predicted_ratings = np.dot(W, H)

# Look up the reconstructed rating for a single (user, movie) pair
def predict_rating(uID, mID):
    """Return the predicted rating for user ``uID`` and movie ``mID``.

    The value is read from the module-level ``predicted_ratings`` array at the
    positions that ``uID`` and ``mID`` occupy in ``utility_matrix``'s index
    and columns. ``np.nan`` is returned when either id is absent from the
    utility matrix.
    """
    known_users = utility_matrix.index
    known_movies = utility_matrix.columns

    # Guard clause: no prediction exists for ids missing from the matrix.
    if uID not in known_users or mID not in known_movies:
        return np.nan

    row_pos = known_users.get_loc(uID)
    col_pos = known_movies.get_loc(mID)
    return predicted_ratings[row_pos, col_pos]

# Predict a rating for every test row. The two id columns are iterated
# directly instead of using DataFrame.apply(axis=1): that avoids building a
# Series per row, keeps each id in its own column dtype, and also yields a
# well-formed (empty) column when test_data has no rows.
test_data['predicted_rating'] = [
    predict_rating(user_id, movie_id)
    for user_id, movie_id in zip(test_data['uID'], test_data['mID'])
]

# Drop rows with no prediction (predict_rating returned NaN for them), so
# they are excluded from the error metric.
test_data = test_data.dropna(subset=['predicted_rating'])

# Root-mean-squared error between the actual and the predicted ratings.
rmse = sqrt(mean_squared_error(test_data['rating'], test_data['predicted_rating']))

print(f'RMSE: {rmse}')

RMSE: 2.9400073885874316
