In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import torch

from models.light_gcn import LightGCNStack
from utils.light_gcn_utils import bpr_loss, evaluate, build_user_item_interactions, get_positive_negative_ratings, recall_at_k, precision_at_k

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from utils.preprocess import load_dataset

# Load the dataset
dataset = 'movielens-1m'
users, items, train_ratings, test_ratings, items_features_tensor, user_features_tensor = load_dataset(dataset)

In [3]:
num_users = users['userid'].nunique()
num_items = items['itemid'].nunique()
print(f"num_users: {num_users}, num_items: {num_items}")

num_users: 6040, num_items: 3706


In [4]:
# Create edge index for bipartite graph for train set
train_user_ids = train_ratings['userid'].values
train_item_ids = train_ratings['itemid'].values + num_users 
train_edge_index = torch.tensor([train_user_ids, train_item_ids], dtype=torch.long)

# Create edge index for bipartite graph for test set
test_user_ids = test_ratings['userid'].values  
test_item_ids = test_ratings['itemid'].values + num_users  
test_edge_index = torch.tensor([test_user_ids, test_item_ids], dtype=torch.long)

  train_edge_index = torch.tensor([train_user_ids, train_item_ids], dtype=torch.long)


In [5]:
train_user_item_dict = build_user_item_interactions(train_ratings)
test_user_item_dict = build_user_item_interactions(test_ratings)

In [6]:
positive_threshold = 5
negative_threshold = 3

In [7]:
train_user_ratings = get_positive_negative_ratings(train_user_item_dict, positive_threshold, negative_threshold)
test_user_ratings = get_positive_negative_ratings(test_user_item_dict, positive_threshold, negative_threshold)

In [8]:
for i, user in enumerate(train_user_ratings):
    train_user_ratings[i] = (user[0], [item + num_users for item in user[1]], [item + num_users for item in user[2]])

for i, user in enumerate(test_user_ratings):
    test_user_ratings[i] = (user[0], [item + num_users for item in user[1]], [item + num_users for item in user[2]])

In [9]:
embedding_dim = 64
num_nodes = num_users + num_items
no_user_features = user_features_tensor.size(1)
no_item_features = items_features_tensor.size(1)

num_layers = 6
num_epochs = 50
learning_rate = 0.0005
k = 10

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

user_features_tensor = user_features_tensor.to(device)
items_features_tensor = items_features_tensor.to(device)
train_edge_index = train_edge_index.to(device)
test_edge_index = test_edge_index.to(device)

model = LightGCNStack(num_nodes, no_user_features, no_item_features, embedding_dim, num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
embeddings = model(user_features_tensor, items_features_tensor, train_edge_index)
recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)

print("Base recall:", recall)
print("Base precision:", precision)

Base recall: 0.11373632235960784
Base precision: 0.33025817555938036


In [12]:
calc_metrics_every = 1

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    num_batches = 0
    pbar = tqdm(train_user_ratings, desc=f'Epoch {epoch+1}/{num_epochs}')
    embeddings = model(user_features_tensor, items_features_tensor, train_edge_index)

    for user_id, pos_items, neg_items in pbar:
        no_sample = min(len(pos_items), len(neg_items))
        users = torch.tensor([user_id] * no_sample, dtype=torch.long).to(device)
        pos_samples = random.sample(pos_items, no_sample)
        pos_samples = torch.tensor(pos_samples, dtype=torch.long).to(device)
        neg_samples = random.sample(neg_items, no_sample)
        neg_samples = torch.tensor(neg_samples, dtype=torch.long).to(device)
        
        loss = bpr_loss(embeddings, users, pos_samples, neg_samples)
        total_loss += loss
        num_batches += 1
        avg_loss = total_loss.item() / num_batches

        pbar.set_postfix({'Avg Loss': f'{avg_loss:.4f}'})

    total_loss.backward()
    optimizer.step()
    
    if (epoch + 1) % calc_metrics_every == 0:
        recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
        precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)
        avg_loss = total_loss / len(train_user_ratings)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Recall@{k}: {recall:.4f}, Precision@{k}: {precision:.4f}')
    else:
        avg_loss = total_loss / len(train_user_ratings)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')
    

Epoch 1/50: 100%|██████████| 5972/5972 [00:12<00:00, 477.68it/s, Avg Loss=0.6930]


Epoch 1/50, Loss: 0.6930, Recall@10: 0.1137, Precision@10: 0.3303


Epoch 2/50: 100%|██████████| 5972/5972 [00:11<00:00, 505.67it/s, Avg Loss=0.6921]


Epoch 2/50, Loss: 0.6921, Recall@10: 0.1202, Precision@10: 0.3489


Epoch 3/50: 100%|██████████| 5972/5972 [00:11<00:00, 528.76it/s, Avg Loss=0.6913]


Epoch 3/50, Loss: 0.6913, Recall@10: 0.1216, Precision@10: 0.3530


Epoch 4/50: 100%|██████████| 5972/5972 [00:10<00:00, 543.97it/s, Avg Loss=0.6905]


Epoch 4/50, Loss: 0.6905, Recall@10: 0.1221, Precision@10: 0.3544


Epoch 5/50: 100%|██████████| 5972/5972 [00:11<00:00, 500.43it/s, Avg Loss=0.6897]


Epoch 5/50, Loss: 0.6897, Recall@10: 0.1227, Precision@10: 0.3563


Epoch 6/50: 100%|██████████| 5972/5972 [00:11<00:00, 507.59it/s, Avg Loss=0.6887]


Epoch 6/50, Loss: 0.6887, Recall@10: 0.1230, Precision@10: 0.3573


Epoch 7/50: 100%|██████████| 5972/5972 [00:11<00:00, 512.81it/s, Avg Loss=0.6877]


Epoch 7/50, Loss: 0.6877, Recall@10: 0.1231, Precision@10: 0.3575


Epoch 8/50: 100%|██████████| 5972/5972 [00:12<00:00, 496.26it/s, Avg Loss=0.6866]


Epoch 8/50, Loss: 0.6866, Recall@10: 0.1238, Precision@10: 0.3596


Epoch 9/50: 100%|██████████| 5972/5972 [00:12<00:00, 477.87it/s, Avg Loss=0.6854]


Epoch 9/50, Loss: 0.6854, Recall@10: 0.1241, Precision@10: 0.3602


Epoch 10/50: 100%|██████████| 5972/5972 [00:11<00:00, 511.62it/s, Avg Loss=0.6845]


Epoch 10/50, Loss: 0.6845, Recall@10: 0.1243, Precision@10: 0.3609


Epoch 11/50: 100%|██████████| 5972/5972 [00:11<00:00, 499.85it/s, Avg Loss=0.6832]


Epoch 11/50, Loss: 0.6832, Recall@10: 0.1242, Precision@10: 0.3607


Epoch 12/50: 100%|██████████| 5972/5972 [00:12<00:00, 476.12it/s, Avg Loss=0.6818]


In [None]:
total_loss = 0
num_batches = 0
pbar = tqdm(test_user_ratings)

embeddings = model(user_features_tensor, items_features_tensor, test_edge_index)

for user_id, pos_items, neg_items in pbar:
    no_sample = min(len(pos_items), len(neg_items))
    users = torch.tensor([user_id] * no_sample, dtype=torch.long).to(device)
    pos_samples = random.sample(pos_items, no_sample)
    pos_samples = torch.tensor(pos_samples, dtype=torch.long).to(device)
    neg_samples = random.sample(neg_items, no_sample)
    neg_samples = torch.tensor(neg_samples, dtype=torch.long).to(device)
    loss = bpr_loss(embeddings, users, pos_samples, neg_samples)
    total_loss += loss
    num_batches += 1
    avg_loss = total_loss / num_batches

    # Update progress bar with average loss
    pbar.set_postfix({'Avg Loss': f'{avg_loss:.4f}'})
    
recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)
avg_loss = total_loss / len(test_user_ratings)
print(f'Test Loss: {avg_loss:.4f}, Test Recall@{k}: {recall:.4f}, Test Precision@{k}: {precision:.4f}')

100%|██████████| 5539/5539 [00:19<00:00, 288.41it/s, Avg Loss=0.6944]


Test Loss: 0.6944, Test Recall@10: 0.1119, Test Precision@10: 0.3249
