In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
import torch.nn as nn
import torch.nn.functional as F

In [2]:
ratings_df = pd.read_csv("../user_ratings.csv")

In [3]:
ratings_df = ratings_df[ratings_df['Rating'] >= 8] # consider as positive interactions, may be better to weight in the future such as 
# 7–10 → Positive interaction
# 4–6 → Neutral (maybe ignored)
# 1–3 → Negative feedback (used for contrastive learning)
len(ratings_df)

7297542

In [81]:
unique_game_ids = sorted(list(set((list(ratings_df['BGGId'])))))
game_id_to_index = {bg_id: idx for idx, bg_id in enumerate(unique_game_ids)}

# Reverse mapping: index → BGGId (optional, for decoding predictions)
index_to_game_id = {idx: bg_id for bg_id, idx in game_id_to_index.items()}

In [82]:
# Create user interaction sequences
user_sequences = defaultdict(list)
for i, row in ratings_df.iterrows():
    user_sequences[row['Username']].append(game_id_to_index[row['BGGId']])

In [86]:
len([games for user, games in user_sequences.items() if max(games) > 21673])
# does it make sense to split each sequence up so we have more training data/sequences?

7

In [87]:
train_sequences = {}
test_sequences = {}
# not sure if this is the best way to split sequences into test and training set
for user, games in user_sequences.items():
    if len(games)> 5: # len(train_sequences) = 399342 without this, otherwise = 221,509
        split_point = int(len(games) * 0.8)
        train_sequences[user] = games[:split_point]
        test_sequences[user] = games[split_point:]

In [88]:
print(len(train_sequences))

221509


In [96]:
class SASRec(nn.Module):
    # what is good size foe embed_dim
    def __init__(self, num_games, embed_dim=64, num_heads=2, num_layers=2, dropout=0.2, max_seq_len=50):
        super(SASRec, self).__init__()
        self.num_games = num_games
        self.embed_fim = embed_dim
        self.max_seq_len = max_seq_len
        # Game embeddings and positional encoding
        self.game_embedding = nn.Embedding(num_games, embed_dim, padding_idx=0)
        self.position_embedding = nn.Embedding(max_seq_len, embed_dim)
        # Transformer Encoder
        encoder_layers = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        # Output layer
        self.fc = nn.Linear(embed_dim, num_games)
        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, seq):
        seq_len = seq.size(1)
        positions = torch.arange(seq_len, device=seq.device).unsqueeze(0).expand(seq.size(0), seq_len)
        # Embed games and positions
        # print(f"Max index in seq: {seqs.max()}, num_games: {self.num_games}")
        # print(f"Min index in seq: {seqs.min()}")
        game_embedded = self.game_embedding(seq)
        pos_embedded = self.position_embedding(positions)
        # Combine embeddings
        x = game_embedded + pos_embedded
        x = self.dropout(x)
        # Pass through Transformer Encoder
        x = self.transformer_encoder(x)
        # Predict next game, using last hidden state for prediction
        x = self.fc(x[:, -1, :])
        return x

In [97]:
class BoardGameDataset(Dataset):
    def __init__(self, user_sequences, num_games, max_seq_len=50):
        self.user_sequences = user_sequences
        self.num_games = num_games
        self.max_seq_len = max_seq_len
        self.users = list(user_sequences.keys())

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        user = self.users[idx]
        seq = self.user_sequences[user]
        
        # Pad sequences
        padded_seq = [0] * (self.max_seq_len - len(seq)) + seq[-self.max_seq_len:]
        target = seq[-1]  # Last game is the target

        # Negative Sampling
        negative = np.random.randint(1, self.num_games)
        while negative in seq:
            negative = np.random.randint(1, self.num_games)
        
        return torch.tensor(padded_seq), torch.tensor(target), torch.tensor(negative)

In [98]:
# Load dataset
num_games = len(ratings_df['BGGId'].unique()) + 1
print(num_games)
train_dataset = BoardGameDataset(train_sequences, num_games)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

21676


In [99]:
# Initialize Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SASRec(num_games).to(device)

In [100]:
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [101]:
# Training Loop
for epoch in range(10):
    model.train()
    total_loss = 0
    for i, (seqs, targets, negatives) in enumerate(train_loader):
        
        seqs, targets, negatives = seqs.to(device), targets.to(device), negatives.to(device)
        # Forward pass
        outputs = model(seqs)

        # Compute loss (use target and negative samples)
        loss = criterion(outputs, targets) + criterion(outputs, negatives)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

Epoch 1, Loss: 14.3793
Epoch 2, Loss: 12.4257
Epoch 3, Loss: 12.1949
Epoch 4, Loss: 12.0781
Epoch 5, Loss: 12.0122
Epoch 6, Loss: 11.9652
Epoch 7, Loss: 11.9262
Epoch 8, Loss: 11.9011
Epoch 9, Loss: 11.8788
Epoch 10, Loss: 11.8580


In [102]:
torch.save(model.state_dict(), "sasrec_model.pth")
print("Model saved successfully!")

Model saved successfully!


In [103]:
def hit_rate_at_k(predictions, targets, k=10):
    hits = 0
    for pred, target in zip(predictions, targets):
        if target in pred[:k]:
            hits += 1
    return hits / len(targets)

def evaluate(model, test_sequences, k=10):
    model.eval()
    users = list(test_sequences.keys())
    test_dataset = BoardGameDataset(test_sequences, num_games)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

    all_predictions, all_targets = [], []
    with torch.no_grad():
        for seqs, targets, _ in test_loader:
            seqs = seqs.to(device)
            outputs = model(seqs)  # Get game scores
            _, top_k = torch.topk(outputs, k, dim=1)
            all_predictions.extend(top_k.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    hr = hit_rate_at_k(all_predictions, all_targets, k)
    print(f"Hit Rate @ {k}: {hr:.4f}")

evaluate(model, test_sequences, k=10)

Hit Rate @ 10: 0.6213
