# Collaborative Filtering



## Matrix Factorization

In [1]:
import numpy as np
import pandas as pd
import time

import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.multiprocessing
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

torch.manual_seed(21)
np.random.seed(21)

In [2]:
users = pd.read_csv('/kaggle/input/gamers/game_data_prepared/users.csv')
ratings = pd.read_csv('/kaggle/input/gamers/game_data_prepared/ratings.csv')
gamesF = pd.read_csv('/kaggle/input/gamers/game_data_prepared/games.csv')

In [3]:
from sklearn.model_selection import train_test_split
train_ratings, test_ratings = train_test_split(
    ratings, test_size=0.15, random_state=21
)

In [4]:
users_in_validation = train_ratings["user_id"].unique()
all_users = users["user_id"].unique()

print(f"There are {len(users_in_validation)} users in test set.")
print(f"Total number of users: {len(all_users)}")

There are 7639 users in test set.
Total number of users: 7639


In [5]:
game_index_by_id = {id: idx for idx, id in enumerate(gamesF["app_id"])}
user_index_by_id = {id: idx for idx, id in enumerate(users["user_id"])}

class GameDataset(Dataset):
    def __init__(self, ratings: pd.DataFrame):
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, index):
        user_id = self.ratings["user_id"].iloc[index]
        game_id = self.ratings["app_id"].iloc[index]
        rating = self.ratings["is_recommended"].iloc[index]
        user_index = user_index_by_id[user_id]
        game_index = game_index_by_id[game_id]
        
        return user_index, game_index, rating

training_data =GameDataset(train_ratings)
test_data = GameDataset(test_ratings)
batch_size = 1024*50
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, persistent_workers=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

### SVD (Singular Value Decomposition) 



SVD is a matrix factorization technique where a user-item matrix is decomposed into lower-dimensional user and item matrices, which capture latent factors.

In [6]:
# import jdc
LR = 1
WEIGHT_DECAY = 5e-5

class MatrixFactorizationSVD(pl.LightningModule):
    """
    Attributes:
        n_users: number of users.
        n_items: number of items.
        n_factors: number of latent factors (or embedding size)
    """
    def __init__(self, n_users, n_items, n_factors = 50):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.n_factors = n_factors
        self.user_biases = nn.Embedding(n_users, 1)
        self.item_biases = nn.Embedding(n_items, 1)
        self.bias = nn.Parameter(data=torch.rand(1))
        self.user_embeddings = nn.Embedding(n_users, n_factors)
        self.item_embeddings = nn.Embedding(n_items, n_factors)

    def forward(self, users, items):
        """
        For a single user and item, this looks like:
        bias + user_bias + item_bias + user_embeddings.dot(item_embeddings)
        """
        # select users and items from the batch
        batch_user_embs = self.user_embeddings(users)
        batch_item_embs = self.item_embeddings(items)

        preds = torch.reshape(
            torch.diag(
                torch.matmul(batch_user_embs, torch.transpose(batch_item_embs, 0, 1))
            ),
            (-1, 1),
        )
        # add bias
        preds += self.user_biases(users) + self.item_biases(items) + self.bias

        return torch.clip(preds.squeeze(), min=0, max=1)

    def training_step(self, batch, batch_idx):
        users, items, rating = batch
        rating = rating.to(torch.float32)
        output = self.forward(users, items)
        loss = F.mse_loss(rating, output)
        self.log("batch_loss", loss)

        return {"loss": loss}
        
    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

        return optimizer

In [7]:
from pytorch_lightning.loggers import TensorBoardLogger

logger = TensorBoardLogger("logs", name=f"lr{LR}_wd{WEIGHT_DECAY}")
start_time = time.time()

n_users = len(user_index_by_id)
n_games = len(game_index_by_id)
n_factors = 80
model = MatrixFactorizationSVD(n_users=n_users, n_items=n_games, n_factors=n_factors)
trainer = pl.Trainer(devices=1, accelerator="gpu", max_epochs=100, logger=logger)
trainer.fit(model, train_dataloader, test_dataloader)

end_time = time.time()

exe_time = end_time - start_time
print("Execution Time:", exe_time, "seconds")

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
  self.pid = os.fork()
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (21) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Execution Time: 1283.8801424503326 seconds


In [8]:
def eval_model(model, train_dataloader):
    loss = 0
    for users, items, rating in train_dataloader:
        pred = model(users, items)
        loss += F.mse_loss(pred, rating)
    RMSE = (loss / len(train_dataloader))**.5
    
    return RMSE

print("Train RMSE: {:.3f}".format(eval_model(model, train_dataloader)))
print("Validation RMSE: {:.3f}".format(eval_model(model, test_dataloader)))

Train RMSE: 0.498
Validation RMSE: 0.499
