## 1. Non-Negative Matrix Factorization (NMF, scikit-learn package)


### 1 Load the data set

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from scipy import sparse

import matplotlib.pyplot as plt
# for interactive inline plots
#%matplotlib notebook
# for simple inline plots
%matplotlib inline
import seaborn as sns
import time, math
#from IPython.display import display



def ConvertToDense(X, y, shape):
    row  = X[:, 0]
    col  = X[:, 1]
    data = y
    matrix_sparse = sparse.csr_matrix((data, (row, col)), shape=(shape[0] + 1, shape[1] + 1))
    R = matrix_sparse.todense()
    R = R[1:, 1:]  # Remove offset
    R = np.asarray(R)
    return R


def GetShape(filename):
    names = ['user_id', 'item_id', 'rating']
    df = pd.read_csv(filename)
    df.rename(columns={'user':'user_id', 'item':'item_id', 'rating':'rating'}, inplace=True)
    n_users = len(df['user_id'].unique())
    n_items = len(df['item_id'].unique())
    return (n_users, n_items)

def LoadData(filename, R_shape):
    df = pd.read_csv(filename)
    df.rename(columns={'user': 'user_id', 'item': 'item_id', 'rating': 'rating'}, inplace=True)
    # Encode user and item IDs to zero-based integer indices
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()

    df['user_id'] = user_encoder.fit_transform(df['user_id'])
    df['item_id'] = item_encoder.fit_transform(df['item_id'])

    X = df[['user_id', 'item_id']].values
    y = df['rating'].values.astype(np.float64)  # Ensure y is numeric

    return X, y, ConvertToDense(X, y, R_shape)


R_shape = GetShape('train.csv')

X, y, R = LoadData('train.csv', R_shape)

### 1.2 Split into training and test set

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

R_train = ConvertToDense(X_train, y_train, R_shape)
R_test = ConvertToDense(X_test, y_test, R_shape)


### 1.3 Choose a model: NMF

In [4]:
from sklearn.decomposition import NMF

parametersNMF = {
                    'n_components' : 20,     # number of latent factors
                    'init' : 'random',
                    'random_state' : 0,
                    'l1_ratio' : 0,          # set regularization = L2
                    'max_iter' : 15
                }

estimator = NMF(**parametersNMF)

##### Estimating the error (RMSE) before tuning the hyperparameters

In [5]:
from sklearn.metrics import mean_squared_error

def get_rmse(pred, actual):
    pred = pred[actual.nonzero()].flatten()     # Ignore nonzero terms
    actual = actual[actual.nonzero()].flatten() # Ignore nonzero terms
    return np.sqrt(mean_squared_error(pred, actual))

In [None]:
R_train = ConvertToDense(X_train, y_train, R_shape)
R_test = ConvertToDense(X_test, y_test, R_shape)

# Track time for training and evaluation
start_time = time.time()

# Convert the training data to sparse format
R_train_sparse = sparse.csr_matrix((y_train, (X_train[:, 0], X_train[:, 1])), shape=R_shape)

# Train NMF model on the training data
estimator.fit(R_train_sparse)
Theta = estimator.transform(R_train_sparse)  # User features
M = estimator.components_.T  # Item features

# Predict ratings for the training set
R_pred_train = M.dot(Theta.T).T
R_pred_train = np.clip(R_pred_train, 1, 5)  # Clip ratings between 1 and 5

# Compute RMSE on the training data
train_rmse = get_rmse(R_pred_train, R_train_sparse.toarray())

# Predict ratings for the test set
R_pred_test = M.dot(Theta.T).T
R_pred_test = np.clip(R_pred_test, 1, 5)  # Clip ratings between 1 and 5

# Compute RMSE on the test data
R_test_sparse = sparse.csr_matrix((y_test, (X_test[:, 0], X_test[:, 1])), shape=R_shape)
test_rmse = get_rmse(R_pred_test, R_test_sparse.toarray())

# Print results
elapsed = time.time() - start_time
print(f"Training Complete - Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}, Time: {elapsed:.2f}s")



### 1.4 Final evaluation on the test set

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from scipy import sparse

# Load training data
df = pd.read_csv("train.csv")
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# Encode user and item IDs
df['user'] = user_encoder.fit_transform(df['user'])
df['item'] = item_encoder.fit_transform(df['item'])

# Define mappings for user and item IDs to apply to test data
user_mapping = {u: i for i, u in enumerate(user_encoder.classes_)}
item_mapping = {i: j for j, i in enumerate(item_encoder.classes_)}

# Load test data
test_df = pd.read_csv("test.csv")

# Encode user and item IDs using training mappings
test_df['user'] = test_df['user'].map(user_mapping)  # Convert user IDs
test_df['item'] = test_df['item'].map(item_mapping)  # Convert item IDs

# Handle unknown users/items by dropping them
test_df.dropna(inplace=True)  # Remove rows with unknown users/items
test_df[['user', 'item']] = test_df[['user', 'item']].astype(int)  # Ensure they are integers

# Extract user-item pairs from the test set
X_test = test_df[['user', 'item']].values

# Predict ratings using the trained model (assuming `M` and `Theta` are already defined from training)
predictions = []
print("Generating Predictions...")

# Loop through user-item pairs to make predictions
for user, item in tqdm(X_test, desc="Predicting", unit="pair"):
    if user < Theta.shape[0] and item < M.shape[0]:  # Ensure valid indices
        pred = M[item].dot(Theta[user])  # Matrix multiplication for prediction
    else:
        pred = np.mean(y)  # Default to the mean rating if out of range

    pred = np.clip(pred, 1, 5)  # Clip predictions between 1 and 5
    predictions.append(pred)

# Save predictions
test_df['prediction'] = predictions
test_df[['ID', 'prediction']].to_csv("predictions.csv", index=False)

print("\n✅ Predictions saved to predictions.csv")


Generating Predictions...


Predicting: 100%|██████████| 23938/23938 [00:00<00:00, 115750.89pair/s]


✅ Predictions saved to predictions.csv



