In [22]:
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from math import sqrt

In [23]:
train_df = pd.read_csv('movie_data/train.csv')
print(train_df.head())
print(train_df.info())    
print(train_df.describe())
print(train_df.columns)
print(train_df.shape)


    uID   mID  rating
0   744  1210       5
1  3040  1584       4
2  1451  1293       5
3  5455  3176       2
4  2507  3074       5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700146 entries, 0 to 700145
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   uID     700146 non-null  int64
 1   mID     700146 non-null  int64
 2   rating  700146 non-null  int64
dtypes: int64(3)
memory usage: 16.0 MB
None
                 uID            mID         rating
count  700146.000000  700146.000000  700146.000000
mean     3022.960334    1865.307324       3.581589
std      1729.128758    1096.507590       1.117508
min         1.000000       1.000000       1.000000
25%      1503.000000    1029.000000       3.000000
50%      3067.000000    1834.000000       4.000000
75%      4474.000000    2770.000000       4.000000
max      6040.000000    3952.000000       5.000000
Index(['uID', 'mID', 'rating'], dtype='object')
(700146, 3)


## Matrix Facorization

In [None]:
# Load the training and test data
train_df = pd.read_csv('movie_data/train.csv')
test_df = pd.read_csv('movie_data/test.csv')

# Create user-item matrix for training data
user_item_matrix = train_df.pivot(index='uID', columns='mID', values='rating').fillna(0)

# Create user-item matrix for test data and align with training data
test_user_item_matrix = test_df.pivot(index='uID', columns='mID', values='rating').fillna(0)
test_user_item_matrix = test_user_item_matrix.reindex(
    index=user_item_matrix.index, 
    columns=user_item_matrix.columns, 
    fill_value=0
)

# Evaluation Function Without Regularization
def evaluate_model(n_components, user_item_matrix, test_user_item_matrix):
    """
    Fit an NMF model with specified n_components and evaluate RMSE.
    Note: This version does not use alpha or l1_ratio.
    """
    # Create and fit the NMF model (no alpha or l1_ratio)
    nmf_model = NMF(n_components=n_components, random_state=42, max_iter=500)
    user_features = nmf_model.fit_transform(user_item_matrix)
    item_features = nmf_model.components_
    
    # Predict ratings by taking the dot product of features
    predictions = np.dot(user_features, item_features)
    
    # Convert test matrix to a NumPy array for consistent masking
    test_array = test_user_item_matrix.to_numpy()
    mask = test_array > 0  # Boolean mask for entries where a rating exists
    
    # Compute RMSE over the masked (non-zero) entries
    rmse = sqrt(mean_squared_error(test_array[mask], predictions[mask]))
    return rmse

# Approach 1: Tuning n_components
print("Tuning n_components:")
for n in [5, 10, 15, 20, 25]:
    rmse = evaluate_model(n, user_item_matrix, test_user_item_matrix)
    print(f"n_components = {n} => RMSE: {rmse:.4f}")

# Approach 2: Shifting Ratings to Start at 0
print("\nUsing shifted ratings (rating - 1):")
# Shift training ratings so that they start at 0 (i.e., 1 becomes 0 and 5 becomes 4)
train_df['rating_shifted'] = train_df['rating'] - 1
test_df['rating_shifted'] = test_df['rating'] - 1

# Create shifted user-item matrices
user_item_matrix_shifted = train_df.pivot(index='uID', columns='mID', values='rating_shifted').fillna(0)
test_user_item_matrix_shifted = test_df.pivot(index='uID', columns='mID', values='rating_shifted').fillna(0)
test_user_item_matrix_shifted = test_user_item_matrix_shifted.reindex(
    index=user_item_matrix_shifted.index,
    columns=user_item_matrix_shifted.columns,
    fill_value=0
)

def evaluate_model_shifted(n_components, user_item_matrix_shifted, test_user_item_matrix_shifted):
    """
    Fit an NMF model on shifted ratings and evaluate RMSE after shifting predictions back.
    """
    nmf_model = NMF(n_components=n_components, random_state=42, max_iter=500)
    user_features = nmf_model.fit_transform(user_item_matrix_shifted)
    item_features = nmf_model.components_
    
    # Predict on shifted scale and then shift predictions back to the original scale
    predictions_shifted = np.dot(user_features, item_features)
    predictions = predictions_shifted + 1  # Shift back: 0->1, 4->5
    
    # Adjust the test matrix similarly
    test_array = test_user_item_matrix_shifted.to_numpy() + 1
    mask = test_array > 1  # After shifting back, ratings are >= 1
    rmse = sqrt(mean_squared_error(test_array[mask], predictions[mask]))
    return rmse

for n in [5, 10, 15]:
    rmse = evaluate_model_shifted(n, user_item_matrix_shifted, test_user_item_matrix_shifted)
    print(f"(Shifted ratings) n_components = {n} => RMSE: {rmse:.4f}")


Tuning n_components:
n_components = 5 => RMSE: 2.9914
n_components = 10 => RMSE: 2.9118
n_components = 15 => RMSE: 2.8724
n_components = 20 => RMSE: 2.8608
n_components = 25 => RMSE: 2.8582

Using shifted ratings (rating - 1):
(Shifted ratings) n_components = 5 => RMSE: 2.3248
(Shifted ratings) n_components = 10 => RMSE: 2.2653
(Shifted ratings) n_components = 15 => RMSE: 2.2392


As seen above, I tried different methods to improve the performance of the model, such as:
- Evaluation Function:
The evaluate_model function fits an NMF model with given hyperparameters, makes predictions, and then calculates RMSE only on the entries where a test rating exists.

- Tuning n_components:
The first loop tests various values of n_components (latent features) and prints the RMSE for each.

- Regularization:
The second loop explores different regularization settings by adjusting alpha and l1_ratio with a fixed n_components of 10.

- Shifting Ratings:
In Approach 3, the ratings are shifted by subtracting 1 so that the minimum value becomes 0. After factorization, the predictions are shifted back by adding 1. This is helpful since NMF only works with nonnegative inputs.

### Non of the above methods worked, and the RSME continue to be very high (between 2 and 3, which is very bad on a 1-5 scale)
Possible reasons for this:
- The data is very sparse, with many users and movies having no ratings.

- NMF’s optimization is sensitive to how the initial factors are set. A poor initialization can lead the algorithm to converge to suboptimal local minima.

- NMF assumes that ratings can be modeled as a linear combination of latent features. This may not capture the complexity of user preferences as well as similarity-based methods, which often directly leverage user–user or item–item correlations.

### Suggestions for improvement:
- Tune Hyperparameters

- Improve Initialization

- Data Preprocessing:
Consider preprocessing steps such as:

    - Normalization: Center or scale ratings (e.g., subtracting the global mean or user/item biases) to improve factorization.

    - Shifting Ratings: Since NMF requires nonnegative data, shifting the rating scale (e.g., subtracting the minimum rating) can sometimes enhance performance.

- Consider using other algorithms such as SVD, SVD++ or probabilistic matrix factorization, which might capture the data structure better in sparse settings.