In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, lil_matrix
from scipy.sparse.linalg import svds
import scipy.sparse as sp
from tqdm import tqdm

# Folder holding the competition CSVs; adjust to your checkout layout.
DATA_DIR = '../../Data/'

# Explicit ratings, wishlist ("to be read") pairs and the submission template.
ratings_path, wishlist_path, sample_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('train_ratings.csv', 'train_tbr.csv', 'sample_submission.csv')
)

# Weighted Alternating Least Squares (ALS) for implicit feedback

In [3]:
def load_tbr_data(train_path):
    """
    Read the wishlist (TBR) CSV, which already carries separate sid and pid columns.

    Both id columns are cast to int and a constant ``tbr_flag`` column equal
    to 1 is added so the frame can be merged onto rating rows as an indicator.
    Returns the resulting DataFrame.
    """
    wishlist = pd.read_csv(train_path)
    for id_col in ("sid", "pid"):
        wishlist[id_col] = wishlist[id_col].astype(int)
    # every row of this file is, by construction, a wishlist entry
    wishlist["tbr_flag"] = 1
    return wishlist

def load_data(train_path):
    """
    Read the ratings CSV and replace its combined ``sid_pid`` key by two
    integer columns, ``sid`` and ``pid``.

    The key is split on "_"; the original ``sid_pid`` column is removed and
    the two id columns are appended after the remaining columns.
    """
    frame = pd.read_csv(train_path)
    # pop() removes the combined key while handing it back for splitting
    id_parts = frame.pop("sid_pid").str.split("_", expand=True)
    frame[["sid", "pid"]] = id_parts.astype(int)
    return frame

def read_data_matrix(df):
    """Pivot the long-format ratings into a dense table.

    Rows are indexed by scientist id (sid), columns by paper id (pid) and the
    cells hold the rating; pairs that do not occur in ``df`` are filled with 0.
    """
    wide = df.pivot(index="sid", columns="pid", values="rating")
    return wide.fillna(0)

def make_submission(model, sample_path, filename):
    """Fill the submission template with model predictions and write it out.

    The template at ``sample_path`` must contain a ``sid_pid`` column of the
    form "<sid>_<pid>". The integer ids are handed to
    ``model.predict_for_submission`` and the result is stored in the
    ``rating`` column before the frame is saved to ``filename`` (no index).
    """
    submission = pd.read_csv(sample_path)
    id_parts = submission["sid_pid"].str.split("_", expand=True)
    scientist_ids, paper_ids = (id_parts[col].astype(int).values for col in (0, 1))
    submission["rating"] = model.predict_for_submission(scientist_ids, paper_ids)
    submission.to_csv(filename, index=False)
    print(f"Submission saved to {filename}")

def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return math.sqrt(mse)

## Weighted ALS Model Implementation (with Implicit ratings)

In [5]:
class WeightedALSModel:
    """Confidence-weighted ALS matrix factorisation with additive bias terms.

    Ratings are arranged in a dense (user x item) matrix. After removing a
    global mean and per-user / per-item offsets, the residual is factorised as
    ``U @ V`` with ``U`` of shape (n_users, rank) and ``V`` of shape
    (rank, n_items). Each observed entry is weighted by a confidence
    ``1 + alpha_r * rating + alpha_tbr * tbr_flag``.
    """

    def __init__(self,
                 rank=100,
                 num_iterations=10,
                 reg_parameter=0.1,
                 num_svd_runs=3,
                 svd_lr=0.1,
                 use_iSVD=False,
                 transpose=False,
                 bias_reg=0.01,
                 use_bias=True,
                 use_confidence=True,
                 alpha_r=40.0,
                 alpha_tbr=10.0):
        """
        Extended ALS with optional bias terms and two-signal confidence weighting.

        Parameters
        ----------
        rank : number of latent factors (passed to ``svds`` as ``k``).
        num_iterations : number of alternating item/user/bias sweeps.
        reg_parameter : ridge term added to every least-squares system.
        num_svd_runs : iterations of the SVD-based initialisation.
        svd_lr : step size of the projected-SVD initialisation.
        use_iSVD : initialise with ``iSVD`` instead of ``proj_SVD``.
        transpose : stored only; no method of this class reads it.
        bias_reg : value added to the rating count in the bias denominators.
        use_bias : remove and re-estimate user/item offsets.
        use_confidence : weight observed entries by ``calculate_confidence``;
            otherwise every observed entry has weight 1.
        alpha_r, alpha_tbr : confidence slopes for the rating and the
            wishlist flag.
        """
        self.rank           = rank
        self.num_iters      = num_iterations
        self.lam            = reg_parameter
        self.num_svd_runs   = num_svd_runs
        self.lr             = svd_lr
        self.use_iSVD       = use_iSVD
        self.transpose      = transpose
        self.bias_reg       = bias_reg
        self.use_bias       = use_bias
        self.use_confidence = use_confidence
        self.alpha_r        = alpha_r
        self.alpha_tbr      = alpha_tbr

        # to be filled after training
        self.rec_mtx     = None
        self.user_factors = None
        self.item_factors = None
        self.global_mean  = None
        self.user_bias    = None
        self.item_bias    = None

    def extract_data(self, df):
        """Return the ``sid``, ``pid`` and ``rating`` columns of ``df`` as arrays."""
        return df["sid"].values, df["pid"].values, df["rating"].values

    def calculate_confidence(self, rating, tbr_flag):
        """
        c_ui = 1 + alpha_r * r_ui + alpha_tbr * 1{tbr}

        Works element-wise, so scalars and equally shaped arrays are accepted.
        """
        return 1.0 + self.alpha_r * rating + self.alpha_tbr * tbr_flag

    @staticmethod
    def _balanced_svd_factors(matrix, rank):
        """Rank-``rank`` truncated SVD split evenly as (u*sqrt(s), sqrt(s)*vt).

        Singular triplets are ordered by decreasing singular value first.
        """
        u, s, vt = svds(matrix, k=rank)
        order = np.argsort(s)[::-1]
        root = np.sqrt(s[order])
        # column/row scaling is the same product as multiplying by diag(root)
        return u[:, order] * root, root[:, None] * vt[order]

    def proj_SVD(self, A, mask, lr, rank, num_iters):
        """Initialise factors by repeated "gradient step, then rank projection".

        Each iteration moves the current estimate towards ``A`` on the observed
        entries (``mask == 1``) with step ``lr`` and replaces it by its
        rank-``rank`` truncated SVD. Returns ``(U, V, U @ V)``.
        """
        # The random factors are only returned when num_iters == 0.
        U = np.random.uniform(-1,1,(A.shape[0], rank))
        V = np.random.uniform(-1,1,(rank, A.shape[1]))
        A_curr = np.zeros_like(A)
        for _ in range(num_iters):
            stepped = A_curr + lr * ((A - A_curr) * mask)
            U, V = self._balanced_svd_factors(stepped, rank)
            A_curr = U.dot(V)
        return U, V, A_curr

    def iSVD(self, A, mask, rank, num_iters):
        """Initialise factors by iterative SVD imputation.

        Each iteration takes a rank-``rank`` truncated SVD of the current
        matrix, then keeps ``A`` on observed entries and the low-rank
        reconstruction elsewhere. Returns ``(U, V, imputed_matrix)``.
        """
        # The random factors are only returned when num_iters == 0.
        U = np.random.uniform(-1,1,(A.shape[0], rank))
        V = np.random.uniform(-1,1,(rank, A.shape[1]))
        A_curr = A.copy()
        for _ in range(num_iters):
            U, V = self._balanced_svd_factors(A_curr, rank)
            A_curr = A * mask + (U.dot(V)) * (1-mask)
        return U, V, A_curr

    def ALS(self, users, items, ratings, tbr_flags):
        """Fit biases and factors; return the clipped dense prediction matrix.

        Parameters
        ----------
        users, items : integer id arrays; ids are used directly as row/column
            indices, so the matrix has ``max(id) + 1`` rows/columns.
        ratings : observed ratings, aligned with ``users``/``items``.
        tbr_flags : wishlist indicator per observation, same length.

        Returns
        -------
        ndarray of shape (n_users, n_items) with predictions clipped to [1, 5].
        The fitted pieces are also stored on the instance.

        Raises
        ------
        ValueError
            If the inputs are empty or differ in length (``svds`` additionally
            rejects a ``rank`` that is too large for the matrix).
        """
        users = np.asarray(users)
        items = np.asarray(items)
        ratings = np.asarray(ratings)
        tbr_flags = np.asarray(tbr_flags)
        if users.size == 0:
            raise ValueError("ALS needs at least one observed rating")
        if not (len(users) == len(items) == len(ratings) == len(tbr_flags)):
            raise ValueError("users, items, ratings and tbr_flags must have the same length")

        # dimension
        n_users = int(users.max()) + 1
        n_items = int(items.max()) + 1

        # build data + mask (+ confidence); an explicit loop keeps the
        # "last occurrence wins" behaviour for repeated (user, item) pairs
        data = np.zeros((n_users, n_items))
        mask = np.zeros((n_users, n_items))
        if self.use_confidence:
            conf = self.calculate_confidence(ratings, tbr_flags)
            C = np.ones((n_users, n_items))
        else:
            conf = None
        for k, (u, i) in enumerate(zip(users, items)):
            data[u, i] = ratings[k]
            mask[u, i] = 1
            if conf is not None:
                C[u, i] = conf[k]
        if conf is None:
            C = mask.copy()

        # global & bias: means over observed entries only; rows/columns with
        # no observation fall back to the global mean (i.e. zero offset)
        observed = np.where(mask, data, np.nan)
        global_mean = np.nanmean(observed)
        user_mean   = np.nan_to_num(np.nanmean(observed, axis=1), nan=global_mean)
        item_mean   = np.nan_to_num(np.nanmean(observed, axis=0), nan=global_mean)
        if self.use_bias:
            ubias = user_mean - global_mean
            ibias = item_mean - global_mean
        else:
            ubias = np.zeros(n_users)
            ibias = np.zeros(n_items)

        # subtract bias on observed entries; unobserved entries stay 0
        if self.use_bias:
            residual = data - global_mean - ubias[:, None] - ibias[None, :]
            A = np.where(mask == 1, residual, data)
        else:
            A = data.copy()

        # init factors
        if self.use_iSVD:
            U, V, _ = self.iSVD(A, mask, self.rank, self.num_svd_runs)
        else:
            U, V, _ = self.proj_SVD(A, mask, self.lr, self.rank, self.num_svd_runs)

        # the observation pattern never changes, so index it once
        rated_by_item = [np.flatnonzero(mask[:, j]) for j in range(n_items)]
        rated_by_user = [np.flatnonzero(mask[u, :]) for u in range(n_users)]

        # ALS loops
        # NOTE(review): A is built once from the initial mean-based offsets and
        # is not refreshed after the bias updates below, so later sweeps fit
        # the factors against the original residuals. Confirm this is intended.
        I_r = np.eye(self.rank)
        for _ in range(self.num_iters):
            # update V (items): (U^T U + U_j^T (C_j - I) U_j + lam I) v = U_j^T C_j a_j
            UtU = U.T.dot(U)
            for j, idx_u in enumerate(rated_by_item):
                if idx_u.size:
                    Cj = C[idx_u, j]
                    Uj = U[idx_u, :]
                    Aj = A[idx_u, j]
                    # scaling columns of Uj.T equals multiplying by diag(Cj - 1)
                    lhs = UtU + (Uj.T * (Cj - 1)).dot(Uj) + self.lam * I_r
                    rhs = Uj.T.dot(Cj * Aj)
                    V[:, j] = np.linalg.solve(lhs, rhs)

            # update U (users): same system with the roles of U and V swapped
            VtV = V.dot(V.T)
            for u, idx_i in enumerate(rated_by_user):
                if idx_i.size:
                    Ci = C[u, idx_i]
                    Vi = V[:, idx_i]
                    Ai = A[u, idx_i]
                    lhs = VtV + (Vi * (Ci - 1)).dot(Vi.T) + self.lam * I_r
                    rhs = Vi.dot(Ci * Ai)
                    U[u, :] = np.linalg.solve(lhs, rhs)

            # update biases: shrunk mean of what the other terms leave unexplained
            if self.use_bias:
                for u, idx_i in enumerate(rated_by_user):
                    if idx_i.size == 0:
                        # no ratings: keep a zero offset (avoids 0/0 when bias_reg == 0)
                        ubias[u] = 0.0
                        continue
                    res = data[u, idx_i] - global_mean - ibias[idx_i] - U[u, :].dot(V[:, idx_i])
                    ubias[u] = res.sum() / (idx_i.size + self.bias_reg)
                for j, idx_u in enumerate(rated_by_item):
                    if idx_u.size == 0:
                        ibias[j] = 0.0
                        continue
                    res = data[idx_u, j] - global_mean - ubias[idx_u] - (U[idx_u, :] * V[:, j]).sum(axis=1)
                    ibias[j] = res.sum() / (idx_u.size + self.bias_reg)

        # build full prediction matrix
        rec = global_mean + ubias[:, None] + ibias[None, :] + U.dot(V)
        rec = np.clip(rec, 1, 5)

        # save for later
        self.rec_mtx      = rec
        self.user_factors = U
        self.item_factors = V
        self.global_mean  = global_mean
        self.user_bias    = ubias
        self.item_bias    = ibias

        return rec

    def train(self, train_data, tbr_data=None, test_data=None):
        """
        Fit the model and optionally score it.

        train_data: DataFrame with ['sid','pid','rating']
        tbr_data:   DataFrame with ['sid','pid','tbr_flag']
        test_data:  optional DataFrame with ['sid','pid','rating']

        Returns the RMSE on ``test_data`` when it is given, otherwise None.
        """
        # explicit
        u,i,r = self.extract_data(train_data)
        # tbr flags array, one entry per training row
        if tbr_data is not None:
            # A wishlist pair listed more than once would duplicate the matching
            # training row in the left merge and shift every later flag, so keep
            # one row per (sid, pid).
            wishlist = tbr_data[["sid", "pid", "tbr_flag"]].drop_duplicates(subset=["sid", "pid"])
            # Only pairs that also occur in train_data survive this merge;
            # wishlist pairs without a rating do not reach the model.
            merged = train_data[["sid", "pid"]].merge(wishlist, on=["sid", "pid"], how="left")
            t = merged["tbr_flag"].fillna(0).values
        else:
            t = np.zeros_like(r)

        # run ALS
        self.ALS(u, i, r, t)

        # optional test eval
        if test_data is not None:
            preds = self.predict_ratings(test_data)
            return math.sqrt(mean_squared_error(test_data["rating"], preds))

    def _lookup(self, sids, pids, bias_fallback):
        """Read predictions for id pairs from the stored matrix.

        Pairs outside the matrix get the global mean; with ``bias_fallback``
        the known user's (or else the known item's) offset is added.
        """
        sids = np.asarray(sids)
        pids = np.asarray(pids)
        n_users, n_items = self.rec_mtx.shape
        user_ok = sids < n_users
        item_ok = pids < n_items

        preds = np.full(sids.shape, self.global_mean, dtype=float)
        inside = user_ok & item_ok
        preds[inside] = self.rec_mtx[sids[inside], pids[inside]]
        if bias_fallback:
            only_user = user_ok & ~item_ok
            preds[only_user] = self.global_mean + self.user_bias[sids[only_user]]
            only_item = ~user_ok & item_ok
            preds[only_item] = self.global_mean + self.item_bias[pids[only_item]]
        return preds

    def predict_ratings(self, df):
        """Predict for every (sid, pid) row of ``df`` (needs a 'rating' column too).

        Ids beyond the trained matrix fall back to the global mean plus the
        user offset if the user is known, else plus the item offset if the
        item is known, else the global mean alone.
        """
        u,i,_ = self.extract_data(df)
        return self._lookup(u, i, bias_fallback=True)

    def predict_for_submission(self, sids, pids):
        """Predict for aligned id arrays; ids beyond the matrix get the global mean."""
        return self._lookup(sids, pids, bias_fallback=False)


In [6]:
# Read both signals: explicit ratings and wishlist (TBR) pairs.
df_r = load_data(ratings_path)
df_t = load_tbr_data(wishlist_path)

# Hold out 20% of the ratings; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(df_r, random_state=42, test_size=0.2)
print(f"Training set: {len(train_data)}, Test set: {len(test_data)}")

Training set: 902549, Test set: 225638


## Hyperparameter Tuning 

In [10]:
def tune_hyperparameters_wishlist(train_data, wishlist_data, val_data, param_grid):
    """
    Exhaustive grid search over WeightedALSModel settings.

    Every combination of the values in ``param_grid`` is used to train a
    fresh model on ``train_data`` (with ``wishlist_data`` as the TBR signal)
    and is scored by RMSE on ``val_data``. Bias terms and confidence
    weighting are always switched on.

    Parameters:
    -----------
    train_data: DataFrame with columns ['sid','pid','rating']
    wishlist_data: DataFrame passed to ``model.train`` as ``tbr_data``
    val_data: DataFrame with columns ['sid','pid','rating']
    param_grid: dict mapping a hyperparameter name to the list of values to
        try. Required keys: 'factors', 'iterations', 'regularization',
        'bias_reg', 'alpha_r', 'alpha_tbr'. Optional keys (default in
        brackets): 'num_svd_runs' (3), 'svd_lr' (0.1), 'use_iSVD' (False),
        'transpose' (False).

    Returns:
    --------
    best_params: dict of best hyperparameters (None if nothing was evaluated)
    best_model: best model
    best_rmse: best RMSE score
    """
    from itertools import product

    # Expand the grid into one dict per combination
    names = list(param_grid.keys())
    candidates = [dict(zip(names, combo)) for combo in product(*param_grid.values())]

    print(f"Evaluating {len(candidates)} hyperparameter combinations")

    best_params, best_model, best_rmse = None, None, float('inf')
    for params in candidates:
        print(f"Evaluating parameters: {params}")

        model = WeightedALSModel(
            rank=params['factors'],
            num_iterations=params['iterations'],
            reg_parameter=params['regularization'],
            num_svd_runs=params.get('num_svd_runs', 3),
            svd_lr=params.get('svd_lr', 0.1),
            use_iSVD=params.get('use_iSVD', False),
            transpose=params.get('transpose', False),
            bias_reg=params['bias_reg'],
            use_bias=True,
            use_confidence=True,
            alpha_r=params['alpha_r'],
            alpha_tbr=params['alpha_tbr']
        )

        val_rmse = model.train(train_data=train_data, tbr_data=wishlist_data, test_data=val_data)
        print(f"Validation RMSE: {val_rmse}")

        # only a strictly better score replaces the incumbent
        if not (val_rmse < best_rmse):
            continue
        best_rmse, best_params, best_model = val_rmse, params, model

    return best_params, best_model, best_rmse

In [11]:
# Search space for tune_hyperparameters_wishlist: every list is crossed with
# every other one, so the grid below expands to 436,800 full training runs.
# Trim the lists before launching a search.
param_grid = {
    'factors': [40],
    'regularization': [0.1,0.2,0.35,0.5,0.6],
    'iterations': [1,5,10,50,100],
    'num_svd_runs': [3,6],
    'svd_lr': [4,5,5.5,6,7,8,9],
    'use_iSVD': [False],
    'transpose': [False],
    'bias_reg': [0.05,0.01,0.2,0.35,0.5, 0.1,0.75,0.8,1.0,1.1,1.2,1.5,2],
    'alpha_r': [1,5,10,13,14,15,16,17,20,30,35,40],
    # each value appears once: a repeated entry would re-run identical configurations
    'alpha_tbr': [0,0.05,0.01,0.1,0.21,5,7,10]
}

In [12]:
# Grid search; the held-out split created above serves as the validation set.
best_params, best_model, best_rmse = tune_hyperparameters_wishlist(
    train_data=train_data,
    wishlist_data=df_t,
    val_data=test_data,
    param_grid=param_grid,
)
print(f"Best parameters: {best_params}, Best RMSE: {best_rmse}")

Evaluating 1 hyperparameter combinations
Evaluating parameters: {'factors': 40, 'regularization': 0.6, 'iterations': 1, 'num_svd_runs': 6, 'svd_lr': 5.5, 'use_iSVD': False, 'transpose': False, 'bias_reg': 1.0, 'alpha_r': 15, 'alpha_tbr': 0}
Validation RMSE: 0.8657531031562601
Best parameters: {'factors': 40, 'regularization': 0.6, 'iterations': 1, 'num_svd_runs': 6, 'svd_lr': 5.5, 'use_iSVD': False, 'transpose': False, 'bias_reg': 1.0, 'alpha_r': 15, 'alpha_tbr': 0}, Best RMSE: 0.8657531031562601


## Model Training and Evaluation

In [13]:
# Final model configured with the tuned settings.
# NOTE(review): alpha_tbr is 0.05 here while the logged best combination
# reports alpha_tbr = 0 -- confirm which value is intended.
final_params = dict(
    rank=40,
    num_iterations=1,
    reg_parameter=0.6,
    num_svd_runs=6,
    svd_lr=5.5,
    use_iSVD=False,
    transpose=False,
    bias_reg=1.0,
    alpha_r=15.0,
    alpha_tbr=0.05,
)
model = WeightedALSModel(**final_params)

In [14]:
# Fit on the training split and report RMSE on the held-out split.
# The score is stored as `test_rmse` so that the module-level `rmse()` helper
# is not overwritten by a float.
test_rmse = model.train(train_data=train_data, tbr_data=df_t, test_data=test_data)
print("Test RMSE:", test_rmse)

Test RMSE: 0.8657531031562601


## Generate Predictions for Submission

In [16]:
# Write the predictions next to the input data.
submission_path = os.path.join(DATA_DIR, 'sample_submissionALS.csv')
make_submission(model, sample_path=sample_path, filename=submission_path)

Submission saved to /Users/ccylmichel/Documents/CIL/BayesWatch_CollaborativeFiltering/Data/sample_submissionALS.csv
