<a href="https://colab.research.google.com/github/Marcusleeleelee/CQF-learning-materials/blob/main/FTEC4005_RecSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# NeuMF model definition
class NeuMF(torch.nn.Module):
    """Neural Matrix Factorization model that outputs a raw (unbounded) score.

    Two branches share the same (user, item) index inputs:

    * MF branch: element-wise product of a user and an item embedding.
    * MLP branch: concatenated user/item embeddings passed through a stack
      of ``Linear`` + ReLU layers.

    The two branch outputs are concatenated and projected to one value by a
    final linear layer. No sigmoid is applied, so the output suits
    regression on ratings.

    Expected ``config`` keys:
        num_users, num_items: sizes of the embedding tables.
        latent_dim_mf: embedding width of the MF branch.
        latent_dim_mlp: embedding width (per user / per item) of the MLP
            branch.
        layers: MLP layer widths. The concatenated MLP embeddings (width
            ``2 * latent_dim_mlp``) are fed to the first layer, so
            ``layers[0]`` must equal that width.
        dropout_rate_mf, dropout_rate_mlp: dropout probabilities applied to
            the MF vector and to the MLP output respectively.
    """

    def __init__(self, config):
        super().__init__()

        self.num_users = config['num_users']
        self.num_items = config['num_items']
        self.latent_dim_mf = config['latent_dim_mf']
        self.latent_dim_mlp = config['latent_dim_mlp']
        self.config = config

        # MF part
        self.embedding_user_mf = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim_mf)
        self.embedding_item_mf = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim_mf)

        # MLP part
        self.embedding_user_mlp = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim_mlp)
        self.embedding_item_mlp = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim_mlp)

        # Fully connected MLP layers: one Linear per consecutive pair of widths
        self.fc_layers = torch.nn.ModuleList(
            torch.nn.Linear(in_size, out_size)
            for in_size, out_size in zip(config['layers'][:-1], config['layers'][1:])
        )

        # Final output layer (no sigmoid for regression task)
        self.logits = torch.nn.Linear(in_features=config['layers'][-1] + config['latent_dim_mf'], out_features=1)

        # Dropout layers are registered as submodules so that model.train() /
        # model.eval() toggles them. A Dropout built inside forward() is
        # always in training mode and would keep dropping units at inference.
        # Dropout holds no parameters, so state_dict keys are unaffected.
        self.dropout_mf = torch.nn.Dropout(config['dropout_rate_mf'])
        self.dropout_mlp = torch.nn.Dropout(config['dropout_rate_mlp'])

    def forward(self, user_indices, item_indices):
        """Score each (user, item) pair.

        Args:
            user_indices: integer index tensor into the user embeddings.
            item_indices: integer index tensor into the item embeddings,
                aligned element-wise with ``user_indices``.

        Returns:
            Tensor of raw scores with a trailing dimension of size 1.
        """
        # MF branch: element-wise product of the two embeddings
        user_embedding_mf = self.embedding_user_mf(user_indices)
        item_embedding_mf = self.embedding_item_mf(item_indices)
        mf_vector = self.dropout_mf(torch.mul(user_embedding_mf, item_embedding_mf))

        # MLP branch: concatenate the embeddings along the feature axis
        user_embedding_mlp = self.embedding_user_mlp(user_indices)
        item_embedding_mlp = self.embedding_item_mlp(item_indices)
        mlp_vector = torch.cat([user_embedding_mlp, item_embedding_mlp], dim=-1)

        # Pass through fully connected layers, ReLU after each one
        for layer in self.fc_layers:
            mlp_vector = torch.relu(layer(mlp_vector))
        mlp_vector = self.dropout_mlp(mlp_vector)

        # Concatenate MF and MLP parts
        vector = torch.cat([mlp_vector, mf_vector], dim=-1)
        logits = self.logits(vector)  # No sigmoid for raw output (regression)

        return logits


# Dataset loader
class RatingDataset(Dataset):
    """Headerless-CSV rating dataset.

    Column 0 is read as user IDs and column 1 as item IDs (both ``long``).
    In training mode column 2 is additionally read as ``float32`` scores;
    otherwise ``scores`` is ``None`` and samples carry no target.
    """

    def __init__(self, file_path, train=True):
        frame = pd.read_csv(file_path, header=None)
        self.train = train

        # The ID columns are loaded the same way in both modes.
        self.user_ids, self.item_ids = (
            torch.tensor(frame[column].values, dtype=torch.long)
            for column in (0, 1)
        )

        # Scores are used as-is (no normalization); test data has no target.
        self.scores = (
            torch.tensor(frame[2].values, dtype=torch.float32) if train else None
        )

    def __len__(self):
        return self.user_ids.size(0)

    def __getitem__(self, index):
        sample = (self.user_ids[index], self.item_ids[index])
        if self.train:
            sample += (self.scores[index],)
        return sample


# Function to calculate number of users and items
def calculate_num_users_items(file_path):
    """Return ``(num_users, num_items)`` derived from a headerless ratings CSV.

    The file's columns are read as ``user_id``, ``item_id``, ``score``.
    IDs are treated as zero-based, so each count is the largest ID plus one.
    """
    ratings = pd.read_csv(file_path, header=None, names=['user_id', 'item_id', 'score'])
    # Largest ID + 1 gives the table size needed to index every ID seen.
    user_count = ratings['user_id'].max() + 1
    item_count = ratings['item_id'].max() + 1
    return user_count, item_count


# Load the number of users and items from the training data.
# NOTE: the embedding table sizes come from the training file only, so any
# user/item ID larger than the training maximum would be out of range.
train_file_path = 'train.csv'
num_users, num_items = calculate_num_users_items(train_file_path)

# Configuration for the model
config = {
    'num_users': num_users,  # Dynamically calculated
    'num_items': num_items,  # Dynamically calculated
    'latent_dim_mf': 8,      # Number of factors for MF embeddings
    'latent_dim_mlp': 16,    # Number of factors for MLP embeddings
    'layers': [32, 16, 8],   # MLP layers; first width = 2 * latent_dim_mlp
    'dropout_rate_mf': 0.2,  # Dropout rate for MF part
    'dropout_rate_mlp': 0.2, # Dropout rate for MLP part
}

# Initialize the model, loss function, and optimizer
model = NeuMF(config)

# Use MSELoss for regression tasks (predicting continuous ratings)
criterion = torch.nn.MSELoss()

# Use Adam optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Create dataset and dataloaders
train_dataset = RatingDataset(train_file_path, train=True)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for user_ids, item_ids, scores in train_loader:
        optimizer.zero_grad()

        # Forward pass (no sigmoid, raw continuous values).
        # squeeze(-1) drops only the trailing size-1 output dimension, so a
        # final batch holding a single sample stays shape (1,) and matches
        # `scores`; a bare squeeze() would collapse it to a 0-d tensor and
        # make MSELoss broadcast (with a size-mismatch warning).
        predictions = model(user_ids, item_ids).squeeze(-1)

        # Compute loss using MSELoss (regression)
        loss = criterion(predictions, scores)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Reported value is the mean of the per-batch losses for this epoch
    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')

print("Training complete!")

Epoch 1/10, Loss: 1.3004853823884583
Epoch 2/10, Loss: 0.8792107493981944
Epoch 3/10, Loss: 0.8618717816064199
Epoch 4/10, Loss: 0.8534901107878199
Epoch 5/10, Loss: 0.8461577212859491
Epoch 6/10, Loss: 0.8400504948462938
