In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import torch

from models.light_gcn import LightGCNStack
from utils.graph_splitters import python_stratified_split
from utils.light_gcn_utils import bpr_loss, evaluate, build_user_movie_interactions, get_positive_negative_ratings, recall_at_k, precision_at_k

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
user_columns = ['userid', 'gender', 'age', 'occupation', 'zipcode']
movie_columns = ['movieid', 'title', 'genres']
rating_columns = ['userid', 'movieid', 'rating', 'timestamp']

users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, names=user_columns, engine='python', encoding='ISO-8859-1')
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, names=movie_columns, engine='python', encoding='ISO-8859-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, names=rating_columns, engine='python', encoding='ISO-8859-1')

print(users.head())
print(movies.head())
print(ratings.head())

   userid gender  age  occupation zipcode
0       1      F    1          10   48067
1       2      M   56          16   70072
2       3      M   25          15   55117
3       4      M   45           7   02460
4       5      M   25          20   55455
   movieid                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
   userid  movieid  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [3]:
num_users = ratings['userid'].nunique()
num_movies = ratings['movieid'].nunique()

In [4]:
# user reindexing
user_to_index = {user: i+1 for i, user in enumerate(users['userid'])}

# reindex userid in users and ratings
users['userid'] = users['userid'].map(user_to_index)
ratings['userid'] = ratings['userid'].map(user_to_index)

# movie reindexing
movie_to_index = {movie: i+1 for i, movie in enumerate(movies['movieid'])}

# reindex movieid in movies and ratings
movies['movieid'] = movies['movieid'].map(movie_to_index)
ratings['movieid'] = ratings['movieid'].map(movie_to_index)

In [5]:
occupation_dict = {
    0: "other",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer"
}

In [6]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for movie genres
movies['genres list'] = movies['genres'].apply(lambda x: x.split('|'))
movies['genres list'] = movies['genres list'].apply(lambda x: ' '.join(x))
genres_embeddings = model.encode(movies['genres list'].tolist())

# Generate embeddings for user occupations
occupations_embeddings = model.encode(users['occupation'].apply(lambda x: occupation_dict[x]).tolist())

# Convert embeddings to tensors
movies_features_tensor = torch.tensor(genres_embeddings, dtype=torch.float)
user_features_tensor = torch.tensor(occupations_embeddings, dtype=torch.float)

In [7]:
train_ratings, test_ratings = python_stratified_split(ratings, ratio=0.75)

In [8]:
# Create edge index for bipartite graph for train set
train_user_ids = train_ratings['userid'].values - 1  
train_movie_ids = train_ratings['movieid'].values - 1 + num_users 
train_edge_index = torch.tensor([train_user_ids, train_movie_ids], dtype=torch.long)

# Create edge index for bipartite graph for test set
test_user_ids = test_ratings['userid'].values - 1  
test_movie_ids = test_ratings['movieid'].values - 1 + num_users  
test_edge_index = torch.tensor([test_user_ids, test_movie_ids], dtype=torch.long)

  train_edge_index = torch.tensor([train_user_ids, train_movie_ids], dtype=torch.long)


In [9]:
train_user_movie_dict = build_user_movie_interactions(train_ratings, 'userid', 'movieid', 'rating')
test_user_movie_dict = build_user_movie_interactions(test_ratings, 'userid', 'movieid', 'rating')

In [10]:
positive_threshold = 5
negative_threshold = 3

In [11]:
train_user_ratings = get_positive_negative_ratings(train_user_movie_dict, positive_threshold, negative_threshold)
test_user_ratings = get_positive_negative_ratings(test_user_movie_dict, positive_threshold, negative_threshold)

In [12]:
embedding_dim = 384
num_nodes = num_users + num_movies
no_user_features = user_features_tensor.size(1)
no_movie_features = movies_features_tensor.size(1)

num_layers = 10
num_epochs = 100
learning_rate = 0.0005
k = 10

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

user_features_tensor = user_features_tensor.to(device)
movies_features_tensor = movies_features_tensor.to(device)
train_edge_index = train_edge_index.to(device)
test_edge_index = test_edge_index.to(device)

model = LightGCNStack(num_nodes, no_user_features, no_movie_features, embedding_dim, num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [14]:
embeddings = model(user_features_tensor, movies_features_tensor, train_edge_index)
recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)

print("Base recall:", recall)
print("Base precision:", precision)

Base recall: 0.10923737152205612
Base precision: 0.31719449225473323


In [15]:
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    num_batches = 0
    pbar = tqdm(train_user_ratings, desc=f'Epoch {epoch+1}/{num_epochs}')
    embeddings = model(user_features_tensor, movies_features_tensor, train_edge_index)

    for user_id, pos_movies, neg_movies in pbar:
        no_sample = min(len(pos_movies), len(neg_movies))
        users = torch.tensor([user_id] * no_sample, dtype=torch.long).to(device)
        pos_samples = random.sample(pos_movies, no_sample)
        pos_samples = torch.tensor(pos_samples, dtype=torch.long).to(device)
        neg_samples = random.sample(neg_movies, no_sample)
        neg_samples = torch.tensor(neg_samples, dtype=torch.long).to(device)
        
        loss = bpr_loss(embeddings, users, pos_samples, neg_samples)
        total_loss += loss
        num_batches += 1
        avg_loss = total_loss.item() / num_batches

        pbar.set_postfix({'Avg Loss': f'{avg_loss:.4f}'})

    total_loss.backward()
    optimizer.step()
        
    recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
    precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)
    avg_loss = total_loss / len(train_user_ratings)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Recall@{k}: {recall:.4f}, Precision@{k}: {precision:.4f}')
    

Epoch 1/100: 100%|██████████| 5972/5972 [00:28<00:00, 209.55it/s, Avg Loss=0.6944]


Epoch 1/100, Loss: 0.6944, Recall@10: 0.1092, Precision@10: 0.3172


Epoch 2/100:  12%|█▏        | 732/5972 [00:03<00:27, 190.95it/s, Avg Loss=0.6930]


KeyboardInterrupt: 

In [16]:
total_loss = 0
num_batches = 0
pbar = tqdm(test_user_ratings)

embeddings = model(user_features_tensor, movies_features_tensor, test_edge_index)

for user_id, pos_movies, neg_movies in pbar:
    no_sample = min(len(pos_movies), len(neg_movies))
    users = torch.tensor([user_id] * no_sample, dtype=torch.long).to(device)
    pos_samples = random.sample(pos_movies, no_sample)
    pos_samples = torch.tensor(pos_samples, dtype=torch.long).to(device)
    neg_samples = random.sample(neg_movies, no_sample)
    neg_samples = torch.tensor(neg_samples, dtype=torch.long).to(device)
    loss = bpr_loss(embeddings, users, pos_samples, neg_samples)
    total_loss += loss
    num_batches += 1
    avg_loss = total_loss / num_batches

    # Update progress bar with average loss
    pbar.set_postfix({'Avg Loss': f'{avg_loss:.4f}'})
    
recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)
avg_loss = total_loss / len(test_user_ratings)
print(f'Test Loss: {avg_loss:.4f}, Test Recall@{k}: {recall:.4f}, Test Precision@{k}: {precision:.4f}')

  0%|          | 0/5539 [00:00<?, ?it/s]

100%|██████████| 5539/5539 [00:25<00:00, 215.44it/s, Avg Loss=0.6925]


Test Loss: 0.6925, Test Recall@10: 0.1138, Test Precision@10: 0.3305
