In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import torch

from models.light_gcn import LightGCNStack
from utils.light_gcn_utils import bpr_loss, evaluate, build_user_item_interactions, get_positive_negative_ratings, recall_at_k, precision_at_k

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from utils.preprocess import load_dataset

# Load the dataset
dataset = 'goodbooks-10k'
users, items, train_ratings, test_ratings, items_features_tensor, user_features_tensor = load_dataset(dataset)

In [3]:
num_users = users['userid'].nunique()
num_items = items['itemid'].nunique()
print(f"num_users: {num_users}, num_items: {num_items}")

num_users: 993, num_items: 754


In [4]:
# Create edge index for bipartite graph for train set
train_user_ids = train_ratings['userid'].values
train_item_ids = train_ratings['itemid'].values + num_users 
train_edge_index = torch.tensor([train_user_ids, train_item_ids], dtype=torch.long)

# Create edge index for bipartite graph for test set
test_user_ids = test_ratings['userid'].values  
test_item_ids = test_ratings['itemid'].values + num_users  
test_edge_index = torch.tensor([test_user_ids, test_item_ids], dtype=torch.long)

  train_edge_index = torch.tensor([train_user_ids, train_item_ids], dtype=torch.long)


In [5]:
train_user_item_dict = build_user_item_interactions(train_ratings)
test_user_item_dict = build_user_item_interactions(test_ratings)

In [6]:
positive_threshold = 5
negative_threshold = 4

In [7]:
train_user_ratings = get_positive_negative_ratings(train_user_item_dict, positive_threshold, negative_threshold)
test_user_ratings = get_positive_negative_ratings(test_user_item_dict, positive_threshold, negative_threshold)

In [8]:
for i, user in enumerate(train_user_ratings):
    train_user_ratings[i] = (user[0], [item + num_users for item in user[1]], [item + num_users for item in user[2]])

for i, user in enumerate(test_user_ratings):
    test_user_ratings[i] = (user[0], [item + num_users for item in user[1]], [item + num_users for item in user[2]])

In [9]:
embedding_dim = 384
num_nodes = num_users + num_items
no_user_features = user_features_tensor.size(1)
no_item_features = items_features_tensor.size(1)

num_layers = 10
num_epochs = 50
learning_rate = 0.0005
k = 10

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

user_features_tensor = user_features_tensor.to(device)
items_features_tensor = items_features_tensor.to(device)
train_edge_index = train_edge_index.to(device)
test_edge_index = test_edge_index.to(device)

model = LightGCNStack(num_nodes, no_user_features, no_item_features, embedding_dim, num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
embeddings = model(user_features_tensor, items_features_tensor, train_edge_index)
recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)

print("Base recall:", recall)
print("Base precision:", precision)

Base recall: 0.1986429342703487
Base precision: 0.3600730059627042


In [12]:
calc_metrics_every = 1
losses = []
recalls = []
precisions = []

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    num_batches = 0
    pbar = tqdm(train_user_ratings, desc=f'Epoch {epoch+1}/{num_epochs}')
    embeddings = model(user_features_tensor, items_features_tensor, train_edge_index)

    for user_id, pos_items, neg_items in pbar:
        no_sample = min(len(pos_items), len(neg_items))
        users = torch.tensor([user_id] * no_sample, dtype=torch.long).to(device)
        pos_samples = random.sample(pos_items, no_sample)
        pos_samples = torch.tensor(pos_samples, dtype=torch.long).to(device)
        neg_samples = random.sample(neg_items, no_sample)
        neg_samples = torch.tensor(neg_samples, dtype=torch.long).to(device)
        
        loss = bpr_loss(embeddings, users, pos_samples, neg_samples)
        total_loss += loss
        num_batches += 1
        avg_loss = total_loss.item() / num_batches

        pbar.set_postfix({'Avg Loss': f'{avg_loss:.4f}'})

    total_loss.backward()
    optimizer.step()

    losses.append(total_loss)
    
    if (epoch + 1) % calc_metrics_every == 0:
        recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
        precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)
        recalls.append(recall)
        precisions.append(precision)
        avg_loss = total_loss / len(train_user_ratings)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Recall@{k}: {recall:.4f}, Precision@{k}: {precision:.4f}')
    else:
        avg_loss = total_loss / len(train_user_ratings)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')
    

Epoch 1/50: 100%|██████████| 961/961 [00:02<00:00, 345.73it/s, Avg Loss=4.0893]


Epoch 1/50, Loss: 4.0893, Recall@10: 0.1986, Precision@10: 0.3601


Epoch 2/50: 100%|██████████| 961/961 [00:03<00:00, 305.65it/s, Avg Loss=3.6271]


Epoch 2/50, Loss: 3.6271, Recall@10: 0.2057, Precision@10: 0.3708


Epoch 3/50: 100%|██████████| 961/961 [00:03<00:00, 278.40it/s, Avg Loss=3.3519]


Epoch 3/50, Loss: 3.3519, Recall@10: 0.2160, Precision@10: 0.3857


Epoch 4/50: 100%|██████████| 961/961 [00:02<00:00, 457.28it/s, Avg Loss=3.0750]


Epoch 4/50, Loss: 3.0750, Recall@10: 0.2288, Precision@10: 0.4041


Epoch 5/50: 100%|██████████| 961/961 [00:02<00:00, 468.46it/s, Avg Loss=2.7990]


Epoch 5/50, Loss: 2.7990, Recall@10: 0.2466, Precision@10: 0.4261


Epoch 6/50: 100%|██████████| 961/961 [00:01<00:00, 573.31it/s, Avg Loss=2.4814]


Epoch 6/50, Loss: 2.4814, Recall@10: 0.2682, Precision@10: 0.4530


Epoch 7/50: 100%|██████████| 961/961 [00:01<00:00, 507.21it/s, Avg Loss=2.2555]


Epoch 7/50, Loss: 2.2555, Recall@10: 0.2837, Precision@10: 0.4738


Epoch 8/50: 100%|██████████| 961/961 [00:01<00:00, 585.39it/s, Avg Loss=1.9577]


Epoch 8/50, Loss: 1.9577, Recall@10: 0.3053, Precision@10: 0.5009


Epoch 9/50: 100%|██████████| 961/961 [00:02<00:00, 425.01it/s, Avg Loss=1.8131]


Epoch 9/50, Loss: 1.8131, Recall@10: 0.3237, Precision@10: 0.5249


Epoch 10/50: 100%|██████████| 961/961 [00:02<00:00, 461.00it/s, Avg Loss=1.6390]


Epoch 10/50, Loss: 1.6390, Recall@10: 0.3440, Precision@10: 0.5511


Epoch 11/50: 100%|██████████| 961/961 [00:02<00:00, 408.70it/s, Avg Loss=1.4600]


Epoch 11/50, Loss: 1.4600, Recall@10: 0.3684, Precision@10: 0.5822


Epoch 12/50: 100%|██████████| 961/961 [00:02<00:00, 417.28it/s, Avg Loss=1.3060]


In [None]:
# make plots
import matplotlib.pyplot as plt

plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Train Loss')
plt.title('Train Loss')
plt.show()

NameError: name 'losses' is not defined

In [None]:
# Recall
plt.plot(recalls)
plt.xlabel('Epoch')
plt.ylabel('Recall@10')
plt.title('Recall@10')
plt.show()

In [None]:
# Precision
plt.plot(precisions)
plt.xlabel('Epoch')
plt.ylabel('Precision@10')
plt.title('Precision@10')
plt.show()

In [None]:
total_loss = 0
num_batches = 0
pbar = tqdm(test_user_ratings)

embeddings = model(user_features_tensor, items_features_tensor, test_edge_index)

for user_id, pos_items, neg_items in pbar:
    no_sample = min(len(pos_items), len(neg_items))
    users = torch.tensor([user_id] * no_sample, dtype=torch.long).to(device)
    pos_samples = random.sample(pos_items, no_sample)
    pos_samples = torch.tensor(pos_samples, dtype=torch.long).to(device)
    neg_samples = random.sample(neg_items, no_sample)
    neg_samples = torch.tensor(neg_samples, dtype=torch.long).to(device)
    loss = bpr_loss(embeddings, users, pos_samples, neg_samples)
    total_loss += loss
    num_batches += 1
    avg_loss = total_loss / num_batches

    # Update progress bar with average loss
    pbar.set_postfix({'Avg Loss': f'{avg_loss:.4f}'})
    
recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)
avg_loss = total_loss / len(test_user_ratings)
print(f'Test Loss: {avg_loss:.4f}, Test Recall@{k}: {recall:.4f}, Test Precision@{k}: {precision:.4f}')

100%|██████████| 2540/2540 [00:05<00:00, 423.88it/s, Avg Loss=4.7785]


Test Loss: 4.7785, Test Recall@10: 0.3739, Test Precision@10: 0.8034
