# Collaborative Filtering

## Matrix Factorization

In [3]:
import numpy as np
import pandas as pd

import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.multiprocessing
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

torch.manual_seed(21)
np.random.seed(21)

ratings = pd.read_csv('/content/ml-1m/ratings.dat', delimiter='::', engine='python', names=['userId', 'movieId', 'rating', 'timestamp'])
users = pd.read_csv('/content/ml-1m/users.dat', delimiter='::', engine='python', names=['userId', 'gender', 'age', 'occupation', 'zip-code'])
movies = pd.read_csv('/content/ml-1m/movies.dat', delimiter='::', engine='python', names=['movieId', 'title', 'genres'], encoding='ISO-8859-1')

In [4]:
ratings["rating"] = ratings["rating"] - 3 # make range(-2, 2) for opt process

train_ratings, test_ratings = train_test_split(
    ratings, test_size=0.15, random_state=21
)

In [10]:
movie_index_by_id = {id: idx for idx, id in enumerate(movies["movieId"])}
user_index_by_id = {id: idx for idx, id in enumerate(users["userId"])}


class MLDataset(Dataset):
    def __init__(self, ratings: pd.DataFrame):
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, index):
        user_id = self.ratings["userId"].iloc[index]
        movie_id = self.ratings["movieId"].iloc[index]
        rating = self.ratings["rating"].iloc[index]
        user_index = user_index_by_id[user_id]
        movie_index = movie_index_by_id[movie_id]
        return user_index, movie_index, rating


training_data = MLDataset(train_ratings)
test_data = MLDataset(test_ratings)
batch_size = 1024
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True, num_workers=2, persistent_workers=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=2)

### SVD (Singular Value Decomposition) 

SVD is a matrix factorization technique where a user-item matrix is decomposed into lower-dimensional user and item matrices, which capture latent factors.

In [11]:
# import jdc

LR = 1
WEIGHT_DECAY = 5e-5

class MatrixFactorizationSVD(pl.LightningModule):
    """
    Attributes:
        n_users: number of users.
        n_items: number of items.
        n_factors: number of latent factors (or embedding size)
    """

    def __init__(self, n_users, n_items, n_factors = 50):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.n_factors = n_factors
        self.user_biases = nn.Embedding(n_users, 1)
        self.item_biases = nn.Embedding(n_items, 1)
        self.bias = nn.Parameter(data=torch.rand(1))
        self.user_embeddings = nn.Embedding(n_users, n_factors)
        self.item_embeddings = nn.Embedding(n_items, n_factors)

    def forward(self, users, items):
        """
        For a single user and item, this looks like:
        bias + user_bias + item_bias + user_embeddings.dot(item_embeddings)
        """
        # select users and items from the batch
        batch_user_embs = self.user_embeddings(users)
        batch_item_embs = self.item_embeddings(items)

        preds = torch.reshape(
            torch.diag(
                torch.matmul(batch_user_embs, torch.transpose(batch_item_embs, 0, 1))
            ),
            (-1, 1),
        )
        # add bias
        preds += self.user_biases(users) + self.item_biases(items) + self.bias

        return torch.clip(preds.squeeze(), min=-2, max=2)

    def training_step(self, batch, batch_idx):
        users, items, rating = batch
        rating = rating.to(torch.float32)
        output = self.forward(users, items)
        loss = F.mse_loss(rating, output)
        self.log("batch_loss", loss)
        return {"loss": loss}

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
        return optimizer

In [12]:
from pytorch_lightning.loggers import TensorBoardLogger
logger = TensorBoardLogger("logs", name=f"lr{LR}_wd{WEIGHT_DECAY}")

n_users = len(user_index_by_id)
n_movies = len(movie_index_by_id)
n_factors = 50
model = MatrixFactorizationSVD(n_users=n_users, n_items=n_movies, n_factors=n_factors)
trainer = pl.Trainer(devices=1, accelerator="gpu", max_epochs=100, logger=logger)
trainer.fit(model, train_dataloader, test_dataloader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name            | Type      | Params | Mode 
------------------------------------------------------
0 | user_biases     | Embedding | 6.0 K  | train
1 | item_biases     | Embedding | 3.9 K  | train
2 | user_embeddings | Embedding | 302 K  | train
3 | item_embeddings | Embedding | 194 K  | train
  | other params    | n/a       | 1      | n/a  
------------------------------------------------------
506 K     Trainable params
0         Non

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=100` reached.


In [13]:
def eval_model(model, train_dataloader):
    loss = 0
    for users, items, rating in train_dataloader:
        pred = model(users, items)
        loss += F.mse_loss(pred, rating)
    RMSE = (loss / len(train_dataloader))**.5
    return RMSE

print("Train RMSE: {:.3f}".format(eval_model(model, train_dataloader)))
print("Validation RMSE: {:.3f}".format(eval_model(model, test_dataloader)))

Train RMSE: 0.817
Validation RMSE: 0.911


### Alternating Least Squares (ALS)

ALS uses a different technique to solve the optimization problem: it alternates between updating the user and item embeddings while keeping the other fixed. In each step, it minimizes the squared error

Instead of updating all parameters simultaneously (as in SVD), ALS alternates between fixing one set of embeddings (user or item) and updating the other

In [None]:
class MatrixFactorizationALS(pl.LightningModule):
    def __init__(self, n_users, n_items, n_factors=50, reg=0.01):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.n_factors = n_factors
        self.reg = reg  # Regularization term 
        self.user_embeddings = nn.Embedding(n_users, n_factors)
        self.item_embeddings = nn.Embedding(n_items, n_factors)
        torch.nn.init.xavier_normal_(self.user_embeddings.weight)
        torch.nn.init.xavier_normal_(self.item_embeddings.weight)

    def forward(self, users, items):
        batch_user_embs = self.user_embeddings(users)
        batch_item_embs = self.item_embeddings(items)

        preds = torch.sum(batch_user_embs * batch_item_embs, dim=1) # Dot product of user and item embeddings
        return preds

    def training_step(self, batch, batch_idx):
        users, items, rating = batch
        rating = rating.to(torch.float32)
        self.update_item_embeddings(users, items, rating)
        self.update_user_embeddings(users, items, rating)
        output = self.forward(users, items)
        loss = F.mse_loss(output, rating)
        self.log("train_loss", loss)
        return {"loss": loss}

    def update_user_embeddings(self, batch_users, batch_items, ratings):
        # Update users while keeping items fixed
        items_emb = self.item_embeddings(batch_items)
        # Least squares problem for users
        for user in torch.unique(batch_users):
            user_idx = batch_users == user
            X_item = items_emb[user_idx]  # Fix item embeddings
            y_rating = ratings[user_idx]
            A = X_item.t().matmul(X_item) + self.reg * torch.eye(self.n_factors).to(self.device)
            b = X_item.t().matmul(y_rating)
            user_emb = torch.linalg.solve(A, b)  # Update user embedding
            self.user_embeddings.weight.data[user] = user_emb

    def update_item_embeddings(self, batch_users, batch_items, ratings):
        # Update items while keeping users fixed
        users_emb = self.user_embeddings(batch_users)
        # Least squares problem for items
        for item in torch.unique(batch_items):
            item_idx = batch_items == item
            X_user = users_emb[item_idx]  # Fix user embeddings
            y_rating = ratings[item_idx]
            A = X_user.t().matmul(X_user) + self.reg * torch.eye(self.n_factors).to(self.device)
            b = X_user.t().matmul(y_rating)
            item_emb = torch.linalg.solve(A, b)  # Update item embedding
            self.item_embeddings.weight.data[item] = item_emb

    def configure_optimizers(self):
        return None

In [None]:
from pytorch_lightning.loggers import TensorBoardLogger
logger = TensorBoardLogger("logs", name=f"lr{LR}_wd{WEIGHT_DECAY}")

n_users = len(user_index_by_id)
n_movies = len(movie_index_by_id)
n_factors = 50
model = MatrixFactorizationSVD(n_users=n_users, n_items=n_movies, n_factors=n_factors)
trainer = pl.Trainer(devices=1, accelerator="gpu", max_epochs=100, logger=logger)
trainer.fit(model, train_dataloader, test_dataloader)

In [None]:
def eval_model(model, train_dataloader):
    loss = 0
    for users, items, rating in train_dataloader:
        pred = model(users, items)
        loss += F.mse_loss(pred, rating)
    RMSE = (loss / len(train_dataloader))**.5
    return RMSE

print("Train RMSE: {:.3f}".format(eval_model(model, train_dataloader)))
print("Validation RMSE: {:.3f}".format(eval_model(model, test_dataloader)))