In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, lil_matrix
from scipy.sparse.linalg import svds
import scipy.sparse as sp
from tqdm import tqdm


# Location of the competition CSV files, relative to this notebook; adjust as needed.
DATA_DIR = '../../Data/'

# Full paths of the ratings table, the wishlist table and the sample submission.
ratings_path, wishlist_path, sample_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('train_ratings.csv', 'train_tbr.csv', 'sample_submission.csv')
)


# Alternating Least Squares (ALS) 

## Helper Functions + Load data

In [2]:
def load_data(train_path):
    """
    Read a ratings CSV and replace its combined ``sid_pid`` key with two
    integer columns, ``sid`` and ``pid``.

    Parameters:
    -----------
    train_path: path of a CSV file containing a ``sid_pid`` column whose
        values have the form ``"<sid>_<pid>"``

    Returns:
    --------
    DataFrame with the original columns minus ``sid_pid``, followed by the
    integer columns ``sid`` and ``pid``
    """
    frame = pd.read_csv(train_path)
    # Splitting on "_" yields one column per id part; assigning to two names
    # fails loudly if a key does not contain exactly one underscore.
    frame[["sid", "pid"]] = frame["sid_pid"].str.split("_", expand=True)
    frame = frame.drop(columns=["sid_pid"])
    return frame.astype({"sid": int, "pid": int})

def read_data_matrix(df):
    """Pivot long-format ratings into a dense matrix.

    Rows are indexed by scientist id (``sid``), columns by paper id (``pid``),
    and cells hold ``rating``; pairs that do not occur in ``df`` are filled
    with 0.
    """
    rating_matrix = df.pivot(index="sid", columns="pid", values="rating")
    return rating_matrix.fillna(0)

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return math.sqrt(mse)

def make_submission(model, filename):
    """
    Fill the sample submission with model predictions and write it to disk.

    The sample file is read from the module-level ``sample_path``; its
    ``sid_pid`` keys are split into integer ids, passed to the model, and the
    returned values are stored in the ``rating`` column.

    Parameters:
    -----------
    model: object exposing ``predict_for_submission(sids, pids)``
    filename: path the completed submission CSV is written to
    """
    submission = pd.read_csv(sample_path)

    # "<sid>_<pid>" -> two integer arrays, in file order
    id_parts = submission["sid_pid"].str.split("_", expand=True)
    scientist_ids, paper_ids = (id_parts[col].astype(int).values for col in (0, 1))

    submission["rating"] = model.predict_for_submission(scientist_ids, paper_ids)

    submission.to_csv(filename, index=False)
    print(f"Submission saved to {filename}")


In [3]:
# Parse the ratings CSV (sid_pid keys are split into integer sid / pid columns)
ratings = load_data(ratings_path)
print(f"Loaded {len(ratings)} ratings")

# Hold out 20% of the ratings for evaluation; the fixed seed makes the split repeatable
train_data, test_data = train_test_split(
    ratings,
    random_state=42,
    test_size=0.2,
)
print(f"Training set: {len(train_data)}, Test set: {len(test_data)}")

Loaded 1128187 ratings
Training set: 902549, Test set: 225638


## ALS Model Implementation

In [5]:
class EnhancedALSModel:
    """Alternating least squares with bias terms and confidence weighting.

    A rating is predicted as
    ``global_mean + user_bias[u] + item_bias[i] + U[u, :] @ V[:, i]`` and
    clipped to [1, 5].  Training works on dense ``(max_sid + 1, max_pid + 1)``
    arrays, so ids are used directly as row / column indices.
    """

    def __init__(self, rank=100, num_iterations=10, reg_parameter=0.1, num_svd_runs=3, 
                 svd_lr=0.1, use_iSVD=False, transpose=False, bias_reg=0.01, 
                 use_bias=True, use_confidence=True, alpha=40):
        """
        Initialize enhanced ALS model with hyperparameters

        Parameters:
        -----------
        rank: int, default=100
            Number of latent factors
        num_iterations: int, default=10
            Number of ALS iterations
        reg_parameter: float, default=0.1
            Ridge regularization added to every factor solve
        num_svd_runs: int, default=3
            Number of SVD iterations for initialization
        svd_lr: float, default=0.1
            Step size used by proj_SVD
        use_iSVD: bool, default=False
            Whether to initialise with iSVD instead of proj_SVD
        transpose: bool, default=False
            Stored for backward compatibility; it has no effect on training
        bias_reg: float, default=0.01
            Regularization added to the denominator of the bias updates
        use_bias: bool, default=True
            Whether to use user / item bias terms in the model
        use_confidence: bool, default=True
            Whether to weight observed ratings by calculate_confidence
        alpha: float, default=40
            Confidence scaling factor
        """
        self.rank = rank
        self.num_iters = num_iterations
        self.lam = reg_parameter
        self.num_svd_runs = num_svd_runs
        self.lr = svd_lr
        self.use_iSVD = use_iSVD
        self.transpose = transpose
        self.bias_reg = bias_reg
        self.use_bias = use_bias
        self.use_confidence = use_confidence
        self.alpha = alpha
        # Set by train(); None means the model has not been fitted yet.
        self.rec_mtx = None

    def extract_data(self, data):
        """Return the ``sid``, ``pid`` and ``rating`` columns of ``data`` as arrays."""
        users = data["sid"].values
        items = data["pid"].values
        predictions = data["rating"].values
        return users, items, predictions

    @staticmethod
    def _balanced_svd_factors(matrix, rank):
        """Rank-``rank`` truncated SVD split as ``(u * sqrt(s), sqrt(s) * vt)``.

        Components are ordered by decreasing singular value.
        """
        u, s, vt = svds(matrix, k=rank)
        order = np.argsort(s)[::-1]
        root = np.sqrt(s[order])
        return u[:, order] * root, root[:, None] * vt[order, :]

    @staticmethod
    def _residuals(data, mask, global_mean, user_bias, item_bias):
        """Observed ratings minus the baseline ``mean + user bias + item bias``.

        Unobserved cells (``mask == 0``) are returned as 0.
        """
        return (data - global_mean - user_bias[:, None] - item_bias[None, :]) * mask

    def proj_SVD(self, A, mask, lr=0.1, rank=10, num_iters=10):
        """Projected SVD for initialization.

        Starting from a zero estimate, each round moves the estimate towards
        ``A`` on the observed cells by a step ``lr`` and projects the result
        back to rank ``rank``.

        Returns ``(U, V, A_curr)`` with ``A_curr = U @ V``.
        """
        # The random factors are only returned when num_iters == 0.
        U = np.random.uniform(low=-1.0, high=1.0, size=(A.shape[0], rank))
        V = np.random.uniform(low=-1.0, high=1.0, size=(rank, A.shape[1]))
        A_curr = np.zeros((A.shape[0], A.shape[1]))
        for _ in range(num_iters):
            step = np.multiply(np.subtract(A, A_curr), mask)
            U, V = self._balanced_svd_factors(A_curr + lr * step, rank)
            A_curr = U.dot(V)
        return U, V, A_curr

    def iSVD(self, A, mask, rank=10, num_iters=3):
        """Incremental SVD for initialization.

        Each round factorises the current matrix, then rebuilds it from the
        observed entries of ``A`` and the low-rank estimate elsewhere.

        Returns ``(U, V, A_curr)``.
        """
        # The random factors are only returned when num_iters == 0.
        U = np.random.uniform(low=-1.0, high=1.0, size=(A.shape[0], rank))
        V = np.random.uniform(low=-1.0, high=1.0, size=(rank, A.shape[1]))
        A_curr = A
        for _ in range(num_iters):
            U, V = self._balanced_svd_factors(A_curr, rank)
            A_curr = np.multiply(A, mask) + np.multiply(U @ V, 1 - mask)
        return U, V, A_curr

    def calculate_confidence(self, ratings):
        """Confidence weight ``1 + alpha * (rating - 1) / 4``.

        A rating of 1 gets weight 1 and a rating of 5 gets weight ``1 + alpha``.
        Works on scalars and arrays.
        """
        return 1 + self.alpha * (ratings - 1) / 4

    def ALS(self, users, items, preds):
        """
        Implement ALS algorithm with bias terms and confidence weighting

        Parameters:
        -----------
        users: array of non-negative user indices
        items: array of non-negative item indices
        preds: array of ratings

        Returns:
        --------
        rec_mtx: reconstructed rating matrix, clipped to [1, 5]

        Raises:
        -------
        ValueError: if no ratings are given or an index is negative
        """
        users = np.asarray(users)
        items = np.asarray(items)
        preds = np.asarray(preds, dtype=float)
        if users.size == 0:
            raise ValueError("ALS requires at least one rating")
        if users.min() < 0 or items.min() < 0:
            raise ValueError("user and item indices must be non-negative")

        rows = int(users.max()) + 1
        cols = int(items.max()) + 1

        # Dense rating matrix and 0/1 indicator of observed cells
        data = np.zeros((rows, cols))
        mask = np.zeros((rows, cols))
        data[users, items] = preds
        mask[users, items] = 1

        if self.use_confidence:
            confidence = np.zeros((rows, cols))
            confidence[users, items] = self.calculate_confidence(preds)
        else:
            confidence = mask.copy()

        row_counts = mask.sum(axis=1)
        col_counts = mask.sum(axis=0)
        has_row = row_counts > 0
        has_col = col_counts > 0

        # Means over observed cells only; rows / columns without any rating
        # fall back to the global mean (i.e. a zero bias).
        global_mean = data.sum() / mask.sum()
        user_mean = np.where(has_row, data.sum(axis=1) / np.maximum(row_counts, 1), global_mean)
        item_mean = np.where(has_col, data.sum(axis=0) / np.maximum(col_counts, 1), global_mean)

        if self.use_bias:
            user_bias = user_mean - global_mean
            item_bias = item_mean - global_mean
        else:
            user_bias = np.zeros(rows)
            item_bias = np.zeros(cols)

        # Factors are always fit on centred residuals: predictions add
        # global_mean back, so it must be removed here even without biases.
        A = self._residuals(data, mask, global_mean, user_bias, item_bias)

        # SVD initialization
        if self.use_iSVD:
            U, V, _ = self.iSVD(A, mask, self.rank, self.num_svd_runs)
        else:
            U, V, _ = self.proj_SVD(A, mask, self.lr, self.rank, self.num_svd_runs)

        reg = self.lam * np.eye(self.rank)
        for iteration in range(self.num_iters):
            # Biases changed at the end of the previous sweep; refresh the
            # residuals so the factors are fit against the current baseline.
            if iteration > 0 and self.use_bias:
                A = self._residuals(data, mask, global_mean, user_bias, item_bias)

            # Update item factors (V): one weighted ridge solve per rated item
            for j in np.flatnonzero(has_col):
                users_idx = np.flatnonzero(mask[:, j])
                U_j = U[users_idx, :]
                weighted = U_j.T * confidence[users_idx, j]
                V[:, j] = np.linalg.solve(weighted @ U_j + reg, weighted @ A[users_idx, j])

            # Update user factors (U): one weighted ridge solve per active user
            for i in np.flatnonzero(has_row):
                items_idx = np.flatnonzero(mask[i, :])
                V_i = V[:, items_idx]
                weighted = V_i * confidence[i, items_idx]
                U[i, :] = np.linalg.solve(weighted @ V_i.T + reg, weighted @ A[i, items_idx])

            if self.use_bias:
                low_rank = U @ V

                # User bias: regularised mean of what the other terms leave unexplained
                user_resid = (data - global_mean - item_bias[None, :] - low_rank) * mask
                user_denom = np.where(has_row, row_counts + self.bias_reg, 1.0)
                user_bias = np.where(has_row, user_resid.sum(axis=1) / user_denom, user_bias)

                # Item bias, using the freshly updated user bias
                item_resid = (data - global_mean - user_bias[:, None] - low_rank) * mask
                item_denom = np.where(has_col, col_counts + self.bias_reg, 1.0)
                item_bias = np.where(has_col, item_resid.sum(axis=0) / item_denom, item_bias)

        # Full reconstruction, clipped to the valid rating range [1, 5]
        output = global_mean + user_bias[:, None] + item_bias[None, :] + U @ V
        rec_mtx = np.clip(output, 1, 5)

        # Store model parameters for prediction
        self.user_factors = U
        self.item_factors = V
        self.global_mean = global_mean
        self.user_bias = user_bias
        self.item_bias = item_bias

        return rec_mtx

    def train(self, train_data, test_data=None):
        """
        Train the ALS model

        Parameters:
        -----------
        train_data: DataFrame with columns ['sid','pid','rating']
        test_data: DataFrame with columns ['sid','pid','rating'] or None

        Returns:
        --------
        test_score: RMSE on test data, or None when test_data is not provided
        """
        self.train_users, self.train_items, self.train_predictions = self.extract_data(train_data)

        if test_data is not None:
            self.test_users, self.test_items, self.test_predictions = self.extract_data(test_data)

        self.rec_mtx = self.ALS(self.train_users, self.train_items, self.train_predictions)

        if test_data is not None:
            preds = self.predict_ratings(test_data)
            test_score = math.sqrt(mean_squared_error(self.test_predictions, preds))
            return test_score

        return None

    def _predict_pairs(self, users, items):
        """Look up predictions for (user, item) index pairs.

        Pairs inside the trained matrix read ``rec_mtx``.  If only the user
        (or only the item) index is inside, the prediction is the global mean
        plus that bias; otherwise it is the global mean.  Negative indices
        count as outside.  All results are clipped to [1, 5].

        Raises RuntimeError if the model has not been trained.
        """
        if self.rec_mtx is None:
            raise RuntimeError("Model has not been trained; call train() first")

        users = np.asarray(users)
        items = np.asarray(items)
        n_users, n_items = self.rec_mtx.shape
        known_user = (users >= 0) & (users < n_users)
        known_item = (items >= 0) & (items < n_items)

        predictions = np.full(users.shape[0], self.global_mean, dtype=float)

        both = known_user & known_item
        predictions[both] = self.rec_mtx[users[both], items[both]]

        user_only = known_user & ~known_item
        predictions[user_only] = self.global_mean + self.user_bias[users[user_only]]

        item_only = known_item & ~known_user
        predictions[item_only] = self.global_mean + self.item_bias[items[item_only]]

        return np.clip(predictions, 1, 5)

    def predict_ratings(self, test_data):
        """
        Predict ratings for test data

        Parameters:
        -----------
        test_data: DataFrame with columns ['sid','pid','rating']

        Returns:
        --------
        predictions: array of predicted ratings in [1, 5]
        """
        users, items, _ = self.extract_data(test_data)
        return self._predict_pairs(users, items)

    def predict_for_submission(self, sids, pids):
        """
        Predict ratings for submission file

        Parameters:
        -----------
        sids: array of user indices
        pids: array of item indices

        Returns:
        --------
        predictions: array of predicted ratings in [1, 5]
        """
        return self._predict_pairs(sids, pids)

## Hyperparameter Tuning 

In [9]:
def tune_hyperparameters(train_data, val_data, param_grid):
    """
    Tune hyperparameters using an exhaustive grid search

    One EnhancedALSModel is trained per element of the Cartesian product of
    the grid values, and the one with the lowest validation RMSE is kept.

    Parameters:
    -----------
    train_data: DataFrame with columns ['sid','pid','rating']
    val_data: DataFrame with columns ['sid','pid','rating']
    param_grid: dict mapping a hyperparameter name to the list of values to try.
        Required keys: 'factors', 'iterations', 'regularization'.
        Optional keys (model defaults are used when absent): 'num_svd_runs',
        'svd_lr', 'use_iSVD', 'transpose', 'bias_reg', 'use_bias',
        'use_confidence', 'alpha'.

    Returns:
    --------
    best_params: dict of best hyperparameters
    best_model: best model
    best_rmse: best RMSE score
    """
    from itertools import product

    best_rmse = float('inf')
    best_params = None
    best_model = None

    # Expand the grid into one dict per combination
    keys = list(param_grid.keys())
    param_combinations = [dict(zip(keys, combo)) for combo in product(*param_grid.values())]

    print(f"Evaluating {len(param_combinations)} hyperparameter combinations")

    for params in param_combinations:
        print(f"Evaluating parameters: {params}")

        # Every constructor argument is forwarded so that each grid key
        # actually changes the model; the fallbacks mirror the constructor defaults.
        model = EnhancedALSModel(
            rank=params['factors'],
            num_iterations=params['iterations'],
            reg_parameter=params['regularization'],
            num_svd_runs=params.get('num_svd_runs', 3),
            svd_lr=params.get('svd_lr', 0.1),
            use_iSVD=params.get('use_iSVD', False),
            transpose=params.get('transpose', False),
            bias_reg=params.get('bias_reg', 0.01),
            use_bias=params.get('use_bias', True),
            use_confidence=params.get('use_confidence', True),
            alpha=params.get('alpha', 40)
        )

        val_rmse = model.train(train_data, val_data)
        print(f"Validation RMSE: {val_rmse}")

        if val_rmse < best_rmse:
            best_rmse = val_rmse
            best_params = params
            best_model = model

    return best_params, best_model, best_rmse

In [10]:
# Search space for tune_hyperparameters: every key maps to the values to try.
# The full Cartesian product is 4 * 8 * 7 * 5 * 16 = 17,920 combinations, each a
# complete training run. The recorded output below reports 1 combination, so it
# came from a narrower grid than this one. Narrow the lists before re-running.
param_grid = dict(
    factors=[10, 20, 30, 40],
    regularization=[0.1, 0.2, 0.35, 0.5, 0.6, 0.7, 0.754, 0.8],
    iterations=[1, 2, 5, 10, 20, 50, 100],
    num_svd_runs=[3, 4, 5, 6, 7],
    svd_lr=[4, 5, 5.5, 6, 6.5, 6.6, 6.65, 6.7, 6.75, 6.9, 7, 7.1, 7.2, 7.5, 8, 9],
    use_iSVD=[False],
    transpose=[False],
)

In [11]:
# Grid search; the held-out split from train_test_split serves as the validation set.
best_params, best_model, best_rmse = tune_hyperparameters(
    train_data=train_data,
    val_data=test_data,
    param_grid=param_grid,
)
print(f"Best parameters: {best_params}, Best RMSE: {best_rmse}")

Evaluating 1 hyperparameter combinations
Evaluating parameters: {'factors': 40, 'regularization': 0.6, 'iterations': 1, 'num_svd_runs': 6, 'svd_lr': 5.5, 'use_iSVD': False, 'transpose': False}
Validation RMSE: 0.8672560086578185
Best parameters: {'factors': 40, 'regularization': 0.6, 'iterations': 1, 'num_svd_runs': 6, 'svd_lr': 5.5, 'use_iSVD': False, 'transpose': False}, Best RMSE: 0.8672560086578185


## Model Training and Evaluation

In [6]:
# Build a model with the configuration reported as best by the grid-search output above.
model = EnhancedALSModel(
    # factorisation
    rank=40,
    reg_parameter=0.6,
    num_iterations=1,
    # SVD initialisation
    use_iSVD=False,
    num_svd_runs=6,
    svd_lr=5.5,
    transpose=False,
)

In [9]:
# Fit on the training split; passing the held-out split makes train() return its RMSE.
val_rmse = model.train(train_data=train_data, test_data=test_data)
print(f"Validation RMSE: {val_rmse}")

Validation RMSE: 0.8672560086578184
✅ Compressed model written to ../../saved_models/als_model.joblib
🗄️  Model file is 79.74 MB on disk


## Generate Predictions for Submission

In [23]:
# Write the model's predictions for the sample-submission pairs next to the input data.
submission_path = os.path.join(DATA_DIR, 'sample_submissionALS.csv')
make_submission(model, submission_path)

Submission saved to /Users/ccylmichel/Documents/CIL/BayesWatch_CollaborativeFiltering/Data/sample_submissionALS.csv
