In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn.conv import MessagePassing
from torch.nn import Embedding
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
user_columns = ['userid', 'gender', 'age', 'occupation', 'zipcode']
movie_columns = ['movieid', 'title', 'genres']
rating_columns = ['userid', 'movieid', 'rating', 'timestamp']

users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, names=user_columns, engine='python', encoding='ISO-8859-1')
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, names=movie_columns, engine='python', encoding='ISO-8859-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, names=rating_columns, engine='python', encoding='ISO-8859-1')

print(users.head())
print(movies.head())
print(ratings.head())

   userid gender  age  occupation zipcode
0       1      F    1          10   48067
1       2      M   56          16   70072
2       3      M   25          15   55117
3       4      M   45           7   02460
4       5      M   25          20   55455
   movieid                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
   userid  movieid  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [4]:
# cut users with less than 400 ratings
user_ratings = ratings.groupby('userid').size()
# user_ratings = user_ratings[user_ratings > 400]
user_ratings = user_ratings
mvp_users = user_ratings.index
print(mvp_users.nunique())

6040


In [5]:
# filter users
users = users[users['userid'].isin(mvp_users)]
ratings = ratings[ratings['userid'].isin(mvp_users)]
print(users.shape)
print(ratings.shape)

(6040, 5)
(1000209, 4)


In [6]:
# user reindexing
user_to_index = {user: i+1 for i, user in enumerate(users['userid'])}
print(user_to_index)

# reindex userid in users and ratings
users['userid'] = users['userid'].map(user_to_index)
ratings['userid'] = ratings['userid'].map(user_to_index)

{1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 88: 88, 89: 89, 90: 90, 91: 91, 92: 92, 93: 93, 94: 94, 95: 95, 96: 96, 97: 97, 98: 98, 99: 99, 100: 100, 101: 101, 102: 102, 103: 103, 104: 104, 105: 105, 106: 106, 107: 107, 108: 108, 109: 109, 110: 110, 111: 111, 112: 112, 113: 113, 114: 114, 115: 115, 116: 116, 117: 117, 118: 118, 119: 119, 120: 120, 121: 121, 122: 

In [7]:
users

Unnamed: 0,userid,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [8]:
# movie reindexing
movie_to_index = {movie: i+1 for i, movie in enumerate(movies['movieid'])}
print(movie_to_index)

# reindex movieid in movies and ratings
movies['movieid'] = movies['movieid'].map(movie_to_index)
ratings['movieid'] = ratings['movieid'].map(movie_to_index)

{1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 88: 88, 89: 89, 90: 90, 92: 91, 93: 92, 94: 93, 95: 94, 96: 95, 97: 96, 98: 97, 99: 98, 100: 99, 101: 100, 102: 101, 103: 102, 104: 103, 105: 104, 106: 105, 107: 106, 108: 107, 109: 108, 110: 109, 111: 110, 112: 111, 113: 112, 114: 113, 115: 114, 116: 115, 117: 116, 118: 117, 119: 118, 120: 119, 121: 120, 122: 121, 123:

In [9]:
movies

Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3879,Meet the Parents (2000),Comedy
3879,3880,Requiem for a Dream (2000),Drama
3880,3881,Tigerland (2000),Drama
3881,3882,Two Family House (2000),Drama


# 

In [10]:
ocupation_dict = {
    0: "other",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer"
}

In [11]:
from python_splitters import python_stratified_split
train_ratings, test_ratings = python_stratified_split(ratings, ratio=0.75)

In [12]:
num_users = ratings['userid'].nunique()
num_movies = ratings['movieid'].nunique()

In [13]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for movie genres
movies['genres list'] = movies['genres'].apply(lambda x: x.split('|'))
movies['genres list'] = movies['genres list'].apply(lambda x: ' '.join(x))
genres_embeddings = model.encode(movies['genres list'].tolist())

# Generate embeddings for user occupations
occupations_embeddings = model.encode(users['occupation'].apply(lambda x: ocupation_dict[x]).tolist())

# Convert embeddings to tensors
genres_embeddings_tensor = torch.tensor(genres_embeddings, dtype=torch.float)
occupations_embeddings_tensor = torch.tensor(occupations_embeddings, dtype=torch.float)

movies['genres embeddings'] = list(genres_embeddings_tensor)
users['occupation embeddings'] = list(occupations_embeddings_tensor)

In [14]:
movies['movieid'].nunique()

3883

In [15]:
train_ratings.describe()

Unnamed: 0,userid,movieid,rating,timestamp
count,750121.0,750121.0,750121.0,750121.0
mean,3024.528364,1823.41082,3.582239,972243500.0
std,1728.394286,1069.777116,1.116519,12152850.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1017.0,3.0,965302600.0
50%,3070.0,1767.0,4.0,973018000.0
75%,4476.0,2702.0,4.0,975221100.0
max,6040.0,3883.0,5.0,1046455000.0


In [16]:
from torch_geometric.data import Data
import torch

# Create edge index for bipartite graph for train set
train_user_ids = train_ratings['userid'].values - 1  
train_movie_ids = train_ratings['movieid'].values - 1 + num_users 
train_edge_index = torch.tensor([train_user_ids, train_movie_ids], dtype=torch.long)

# Create edge index for bipartite graph for test set
test_user_ids = test_ratings['userid'].values - 1  
test_movie_ids = test_ratings['movieid'].values - 1 + num_users  
test_edge_index = torch.tensor([test_user_ids, test_movie_ids], dtype=torch.long)

# Create node features for users and movies for train set
train_user_features = occupations_embeddings_tensor.clone().detach()
train_movie_features = genres_embeddings_tensor.clone().detach()

# Combine user and movie features into a single tensor for train set
X_train = torch.cat([train_user_features, train_movie_features], dim=0)

# Create node features for users and movies for test set
test_user_features = occupations_embeddings_tensor.clone().detach()
test_movie_features = genres_embeddings_tensor.clone().detach()

# Combine user and movie features into a single tensor for test set
X_test = torch.cat([test_user_features, test_movie_features], dim=0)

  train_edge_index = torch.tensor([train_user_ids, train_movie_ids], dtype=torch.long)


In [17]:
train_ratings['movieid'].max()
num_users

6040

In [18]:
import torch
from torch.nn import Linear, Parameter
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class LightGCN(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super().__init__(aggr='add') 

    def forward(self, x, edge_index):
        # x has shape [N, in_channels]
        # edge_index has shape [2, E]

        # Compute normalization.
        row, col = edge_index
        deg = degree(col, x.size(0), dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        # Start propagating messages.
        out = self.propagate(edge_index, x=x, norm=norm)
        return out

    def message(self, x_j, norm):
        # x_j has shape [E, out_channels]
        # Step 4: Normalize node features.
        return norm.view(-1, 1) * x_j

In [19]:
class LightGCNStack(torch.nn.Module):
    def __init__(self, num_nodes, no_user_features, no_movie_features, embedding_dim, num_layers):
        super().__init__()
        self.users_latent = nn.Linear(no_user_features, embedding_dim)
        self.movies_latent = nn.Linear(no_movie_features, embedding_dim)
        self.convs = torch.nn.ModuleList([LightGCN(embedding_dim, embedding_dim) for _ in range(num_layers)])
        self.num_layers = num_layers

    def forward(self, user_feature, movie_feature, edge_index):
        movie_embedding = self.movies_latent(movie_feature)
        user_embedding = self.users_latent(user_feature)
        x = torch.cat([user_embedding, movie_embedding], dim=0)
        all_embeddings = [x]
        for conv in self.convs:
            x = conv(x, edge_index)
            all_embeddings.append(x)
        
        # Aggregate embeddings with factors a_k = 1/(k+1)
        out = sum((1.0 / (k + 1)) * emb for k, emb in enumerate(all_embeddings))
        return out

In [20]:
def bpr_loss(embeddings, users, pos_items, neg_items):
    user_emb = embeddings[users]
    pos_emb = embeddings[pos_items]
    neg_emb = embeddings[neg_items]
    
    pos_scores = (user_emb * pos_emb).sum(dim=1)
    neg_scores = (user_emb * neg_emb).sum(dim=1)
    
    loss = -F.logsigmoid(pos_scores - neg_scores).mean()
    return loss

def test(model, edge_index, users, pos_items, neg_items):
    model.eval()
    
    with torch.no_grad():
        user_features = occupations_embeddings_tensor[users]
        movie_features = genres_embeddings_tensor[pos_items]
        embeddings = model(user_features, movie_features, edge_index)
        loss = bpr_loss(embeddings, users, pos_items, neg_items)
    
    return loss.item()

In [21]:
from collections import defaultdict

def build_user_movie_interactions(ratings_df):
    """
    Create a user-movie interaction graph from the ratings dataframe.
    Include all interactions regardless of rating.
    """
    user_movie_dict = defaultdict(list)
    for user_id, movie_id, rating in zip(ratings_df['userid'], ratings_df['movieid'], ratings_df['rating']):
        user_movie_dict[user_id].append((movie_id, rating))
    return user_movie_dict

In [22]:
train_user_movie_dict = build_user_movie_interactions(train_ratings)
test_user_movie_dict = build_user_movie_interactions(test_ratings)

In [23]:
import random

In [24]:
positive_threshold = 5
negative_threshold = 3

def sample_positive_and_negative_samples(user_movie_dict, positive_threshold, negative_threshold,):

    user_ratings = []

    for user_id, movies in user_movie_dict.items():
        pos_movies = [movie_id for movie_id, rating in movies if rating >= positive_threshold]
        neg_movies = [movie_id for movie_id, rating in movies if rating <= negative_threshold]
        
        if len(pos_movies) == 0 or len(neg_movies) == 0:
            continue
        
        user_ratings.append((user_id, pos_movies, neg_movies))
        
    return user_ratings

In [25]:
train_user_ratings = sample_positive_and_negative_samples(train_user_movie_dict, positive_threshold, negative_threshold)
test_user_ratings = sample_positive_and_negative_samples(test_user_movie_dict, positive_threshold, negative_threshold)

In [26]:
pos = 0
neg = 0
for i in range(600):
    pos += len(train_user_ratings[i][1])
    neg += len(train_user_ratings[i][2])

pos/600, neg/600

(27.288333333333334, 47.18666666666667)

In [27]:
def recall_at_k(user_ratings, embeddings, k=10, device='cpu'):
    model.eval()
    hits = 0
    total = 0
    
    for user_id, pos_movies, neg_movies in user_ratings:    
        user_emb = embeddings[user_id]
        pos_emb = embeddings[pos_movies]
        neg_emb = embeddings[neg_movies]
        
        pos_scores = (user_emb * pos_emb).sum(dim=1)
        neg_scores = (user_emb * neg_emb).sum(dim=1)
        
        scores = torch.cat([pos_scores, neg_scores])
        if len(scores) < k:
            continue

        curr_k = min(k, len(scores))
        _, indices = torch.topk(scores, curr_k)
        hits += torch.sum(indices < k).item()
        total += len(pos_movies)
        
    return hits / total

def precision_at_k(user_ratings, embeddings, k=10, device='cpu'):
    model.eval()
    hits = 0
    total = 0
    
    for user_id, pos_movies, neg_movies in user_ratings:
        user_emb = embeddings[user_id]
        pos_emb = embeddings[pos_movies]
        neg_emb = embeddings[neg_movies]
        
        pos_scores = (user_emb * pos_emb).sum(dim=1)
        neg_scores = (user_emb * neg_emb).sum(dim=1)
        
        scores = torch.cat([pos_scores, neg_scores])
        if len(scores) < k:
            continue
    
        curr_k = min(k, len(scores))
        _, indices = torch.topk(scores, curr_k)
        hits += torch.sum(indices < k).item()
        total += k
        
    return hits / total

In [28]:
num_nodes = X_train.size(0)
embedding_dim = X_train.size(1)
num_layers = 10
num_epochs = 100
learning_rate = 0.0005
# sample_size = 32
k = 10

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_edge_index = train_edge_index.to(device)
no_user_features = occupations_embeddings_tensor.size(1)
no_movie_features = genres_embeddings_tensor.size(1)
model = LightGCNStack(num_nodes, no_user_features, no_movie_features, embedding_dim, num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [39]:
occupations_embeddings_tensor = occupations_embeddings_tensor.to(device)
genres_embeddings_tensor = genres_embeddings_tensor.to(device)
train_edge_index = train_edge_index.to(device)
test_edge_index = test_edge_index.to(device)

In [36]:
embeddings = model(occupations_embeddings_tensor, genres_embeddings_tensor, train_edge_index)
recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)
print("Base recall:", recall)
print("Base precision:", precision)

Base recall: 0.10806373217312959
Base precision: 0.3137865748709122


In [37]:
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    num_batches = 0
    pbar = tqdm(train_user_ratings, desc=f'Epoch {epoch+1}/{num_epochs}')
    embeddings = model(occupations_embeddings_tensor, genres_embeddings_tensor, train_edge_index)

    for user_id, pos_movies, neg_movies in pbar:
        no_sample = min(len(pos_movies), len(neg_movies))
        users = torch.tensor([user_id] * no_sample, dtype=torch.long).to(device)
        pos_samples = random.sample(pos_movies, no_sample)
        pos_samples = torch.tensor(pos_samples, dtype=torch.long).to(device)
        neg_samples = random.sample(neg_movies, no_sample)
        neg_samples = torch.tensor(neg_samples, dtype=torch.long).to(device)
        
        loss = bpr_loss(embeddings, users, pos_samples, neg_samples)
        total_loss += loss
        num_batches += 1
        avg_loss = total_loss.item() / num_batches

        # Update progress bar with average loss
        pbar.set_postfix({'Avg Loss': f'{avg_loss:.4f}'})

    total_loss.backward()
    optimizer.step()
        
    recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
    precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)
    avg_loss = total_loss / len(train_user_ratings)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Recall@{k}: {recall:.4f}, Precision@{k}: {precision:.4f}')
    

Epoch 1/100:   0%|          | 0/5972 [00:00<?, ?it/s]

Epoch 1/100: 100%|██████████| 5972/5972 [00:07<00:00, 773.09it/s, Avg Loss=0.6953]


Epoch 1/100, Loss: 0.6953, Recall@10: 0.1081, Precision@10: 0.3138


Epoch 2/100: 100%|██████████| 5972/5972 [00:08<00:00, 686.47it/s, Avg Loss=0.6931]


Epoch 2/100, Loss: 0.6931, Recall@10: 0.1126, Precision@10: 0.3269


Epoch 3/100: 100%|██████████| 5972/5972 [00:09<00:00, 627.09it/s, Avg Loss=0.6913]


Epoch 3/100, Loss: 0.6913, Recall@10: 0.1155, Precision@10: 0.3353


Epoch 4/100: 100%|██████████| 5972/5972 [00:08<00:00, 672.36it/s, Avg Loss=0.6899]


Epoch 4/100, Loss: 0.6899, Recall@10: 0.1165, Precision@10: 0.3382


Epoch 5/100: 100%|██████████| 5972/5972 [00:08<00:00, 665.05it/s, Avg Loss=0.6891]


Epoch 5/100, Loss: 0.6891, Recall@10: 0.1183, Precision@10: 0.3435


Epoch 6/100: 100%|██████████| 5972/5972 [00:10<00:00, 543.12it/s, Avg Loss=0.6883]


Epoch 6/100, Loss: 0.6883, Recall@10: 0.1186, Precision@10: 0.3445


Epoch 7/100: 100%|██████████| 5972/5972 [00:11<00:00, 535.35it/s, Avg Loss=0.6871]


Epoch 7/100, Loss: 0.6871, Recall@10: 0.1191, Precision@10: 0.3460


Epoch 8/100: 100%|██████████| 5972/5972 [00:09<00:00, 657.14it/s, Avg Loss=0.6868]


Epoch 8/100, Loss: 0.6868, Recall@10: 0.1192, Precision@10: 0.3460


Epoch 9/100: 100%|██████████| 5972/5972 [00:09<00:00, 617.74it/s, Avg Loss=0.6865]


Epoch 9/100, Loss: 0.6865, Recall@10: 0.1190, Precision@10: 0.3454


Epoch 10/100: 100%|██████████| 5972/5972 [00:09<00:00, 636.65it/s, Avg Loss=0.6862]


Epoch 10/100, Loss: 0.6862, Recall@10: 0.1184, Precision@10: 0.3439


Epoch 11/100: 100%|██████████| 5972/5972 [00:10<00:00, 572.08it/s, Avg Loss=0.6865]


Epoch 11/100, Loss: 0.6865, Recall@10: 0.1183, Precision@10: 0.3436


Epoch 12/100:  14%|█▎        | 815/5972 [00:01<00:10, 486.06it/s, Avg Loss=0.6859]


KeyboardInterrupt: 

In [42]:
total_loss = 0
num_batches = 0
pbar = tqdm(test_user_ratings)

embeddings = model(occupations_embeddings_tensor, genres_embeddings_tensor, test_edge_index)

for user_id, pos_movies, neg_movies in pbar:
    no_sample = min(len(pos_movies), len(neg_movies))
    users = torch.tensor([user_id] * no_sample, dtype=torch.long).to(device)
    pos_samples = random.sample(pos_movies, no_sample)
    pos_samples = torch.tensor(pos_samples, dtype=torch.long).to(device)
    neg_samples = random.sample(neg_movies, no_sample)
    neg_samples = torch.tensor(neg_samples, dtype=torch.long).to(device)
    loss = bpr_loss(embeddings, users, pos_samples, neg_samples)
    total_loss += loss
    num_batches += 1
    avg_loss = total_loss / num_batches

    # Update progress bar with average loss
    pbar.set_postfix({'Avg Loss': f'{avg_loss:.4f}'})
    
recall = recall_at_k(train_user_ratings, embeddings, k=k, device=device)
precision = precision_at_k(train_user_ratings, embeddings, k=k, device=device)
avg_loss = total_loss / len(test_user_ratings)
print(f'Test Loss: {avg_loss:.4f}, Test Recall@{k}: {recall:.4f}, Test Precision@{k}: {precision:.4f}')

100%|██████████| 5539/5539 [00:07<00:00, 723.71it/s, Avg Loss=0.6870]


Test Loss: 0.6870, Test Recall@10: 0.1178, Test Precision@10: 0.3420
