In [141]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn.conv import MessagePassing
from torch.nn import Embedding

In [142]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer

In [143]:
user_columns = ['userid', 'gender', 'age', 'occupation', 'zipcode']
movie_columns = ['movieid', 'title', 'genres']
rating_columns = ['userid', 'movieid', 'rating', 'timestamp']

users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, names=user_columns, engine='python', encoding='ISO-8859-1')
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, names=movie_columns, engine='python', encoding='ISO-8859-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, names=rating_columns, engine='python', encoding='ISO-8859-1')

print(users.head())
print(movies.head())
print(ratings.head())

   userid gender  age  occupation zipcode
0       1      F    1          10   48067
1       2      M   56          16   70072
2       3      M   25          15   55117
3       4      M   45           7   02460
4       5      M   25          20   55455
   movieid                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
   userid  movieid  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [144]:
ocupation_dict = {
    0: "other",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer"
}

In [145]:
from python_splitters import python_stratified_split
train_ratings, test_ratings = python_stratified_split(ratings, ratio=0.75)

In [146]:
num_users = ratings['userid'].max()
num_movies = ratings['movieid'].max()

In [147]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for movie genres
movies['genres list'] = movies['genres'].apply(lambda x: x.split('|'))
movies['genres list'] = movies['genres list'].apply(lambda x: ' '.join(x))
genres_embeddings = model.encode(movies['genres list'].tolist())

# Generate embeddings for user occupations
occupations_embeddings = model.encode(users['occupation'].apply(lambda x: ocupation_dict[x]).tolist())

# Convert embeddings to tensors
genres_embeddings_tensor = torch.tensor(genres_embeddings, dtype=torch.float)
occupations_embeddings_tensor = torch.tensor(occupations_embeddings, dtype=torch.float)

movies['genres embeddings'] = list(genres_embeddings_tensor)
users['occupation embeddings'] = list(occupations_embeddings_tensor)

In [153]:
movies['movieid'].nunique()

3883

In [135]:
train_ratings.describe()

Unnamed: 0,userid,movieid,rating,timestamp
count,750121.0,750121.0,750121.0,750121.0
mean,3024.528364,1864.902114,3.582239,972243500.0
std,1728.394286,1095.64059,1.116519,12152850.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1029.0,3.0,965302600.0
50%,3070.0,1834.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975221100.0
max,6040.0,3952.0,5.0,1046455000.0


In [136]:
from torch_geometric.data import Data
import torch

# Create edge index for bipartite graph for train set
train_user_ids = train_ratings['userid'].values - 1  
train_movie_ids = train_ratings['movieid'].values - 1 + num_users 
train_edge_index = torch.tensor([train_user_ids, train_movie_ids], dtype=torch.long)

# Create edge index for bipartite graph for test set
test_user_ids = test_ratings['userid'].values - 1  
test_movie_ids = test_ratings['movieid'].values - 1 + num_users  
test_edge_index = torch.tensor([test_user_ids, test_movie_ids], dtype=torch.long)

# Create node features for users and movies for train set
# train_user_features = occupations_embeddings_tensor.clone().detach()
# train_movie_features = genres_embeddings_tensor.clone().detach()
# print(train_user_features.shape)
# print(train_movie_features.shape)

# # Combine user and movie features into a single tensor for train set
# X_train = torch.cat([train_user_features, train_movie_features], dim=0)
# print(X_train.shape)

# # Create node features for users and movies for test set
# test_user_features = occupations_embeddings_tensor.clone().detach()
# test_movie_features = genres_embeddings_tensor.clone().detach()

# # Combine user and movie features into a single tensor for test set
# X_test = torch.cat([test_user_features, test_movie_features], dim=0)

In [140]:
train_ratings['movieid'].max()
num_users

6040

In [111]:
import torch
from torch.nn import Linear, Parameter
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class LightGCN(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super().__init__(aggr='add') 

    def forward(self, x, edge_index):
        # x has shape [N, in_channels]
        # edge_index has shape [2, E]

        # Compute normalization.
        row, col = edge_index
        deg = degree(col, x.size(0), dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        # Start propagating messages.
        out = self.propagate(edge_index, x=x, norm=norm)
        return out

    def message(self, x_j, norm):
        # x_j has shape [E, out_channels]
        # Step 4: Normalize node features.
        return norm.view(-1, 1) * x_j

In [112]:
class LightGCNStack(torch.nn.Module):
    def __init__(self, num_nodes, embedding_dim, num_layers):
        super().__init__()
        self.embedding = Embedding(num_nodes, embedding_dim)
        self.convs = torch.nn.ModuleList([LightGCN(embedding_dim, embedding_dim) for _ in range(num_layers)])
        self.num_layers = num_layers

    def forward(self, edge_index):
        x = self.embedding.weight
        all_embeddings = [x]
        for conv in self.convs:
            x = conv(x, edge_index)
            all_embeddings.append(x)
        
        # Aggregate embeddings with factors a_k = 1/(k+1)
        out = sum((1.0 / (k + 1)) * emb for k, emb in enumerate(all_embeddings))
        return out

In [113]:
def bpr_loss(model, users, pos_items, neg_items):
    user_emb = model.embedding(users)
    pos_emb = model.embedding(pos_items)
    neg_emb = model.embedding(neg_items)
    
    pos_scores = (user_emb * pos_emb).sum(dim=1)
    neg_scores = (user_emb * neg_emb).sum(dim=1)
    
    loss = -F.logsigmoid(pos_scores - neg_scores).mean()
    return loss

def train(model, edge_index, users, pos_items, neg_items, optimizer):
    model.train()
    
    optimizer.zero_grad()
    embeddings = model(edge_index)
    loss = bpr_loss(model, users, pos_items, neg_items)
    loss.backward()
    optimizer.step()
    
    return loss.item()

In [114]:
from collections import defaultdict

def build_user_movie_interactions(ratings_df):
    """
    Create a user-movie interaction graph from the ratings dataframe.
    Include all interactions regardless of rating.
    """
    user_movie_dict = defaultdict(list)
    for user_id, movie_id, rating in zip(ratings_df['userid'], ratings_df['movieid'], ratings_df['rating']):
        user_movie_dict[user_id].append((movie_id, rating))
    return user_movie_dict

In [115]:
train_user_movie_dict = build_user_movie_interactions(train_ratings)

In [116]:
import random

In [117]:
positive_threshold = 4
negative_threshold = 2
num_neg_samples = 5

def sample_positive_and_negative_samples(user_movie_dict, positive_threshold, negative_threshold, num_neg_samples):

    user_ratings = []

    for user_id, movies in user_movie_dict.items():
        pos_movies = [movie_id for movie_id, rating in movies if rating >= positive_threshold]
        neg_movies = [movie_id for movie_id, rating in movies if rating <= negative_threshold]
        
        if len(pos_movies) == 0 or len(neg_movies) == 0:
            continue
        
        user_ratings.append((user_id, pos_movies, neg_movies))
        
    return user_ratings

In [118]:
user_ratings = sample_positive_and_negative_samples(train_user_movie_dict, positive_threshold, negative_threshold, num_neg_samples)

In [119]:
# # Dummy data for illustration
# edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]], dtype=torch.long)
# users = torch.tensor([0, 1, 2], dtype=torch.long)
# pos_items = torch.tensor([1, 2, 0], dtype=torch.long)
# neg_items = torch.tensor([2, 0, 1], dtype=torch.long)

In [121]:
num_nodes = X_train.size(0)
embedding_dim = X_train.size(1)
num_layers = 12
num_epochs = 100
learning_rate = 0.001
sample_size = 32

model = LightGCNStack(num_nodes, embedding_dim, num_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    total_loss = 0
    for user_id, pos_movies, neg_movies in user_ratings:
        no_sample = min(sample_size, len(pos_movies), len(neg_movies))
        users = torch.tensor([user_id] * no_sample, dtype=torch.long)
        pos_samples = random.sample(pos_movies, no_sample)
        neg_samples = random.sample(neg_movies, no_sample)
        
        loss = train(model, train_edge_index, users, pos_samples, neg_samples, optimizer)
        total_loss += loss

    avg_loss = total_loss / len(user_ratings)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

RuntimeError: index 9932 is out of bounds for dimension 0 with size 9923

In [93]:
print(max(train_movie_ids))

9991
