In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_scatter
from torch_geometric.nn.conv import MessagePassing
from torch.nn import Embedding

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer

In [53]:
user_columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
movie_columns = ['MovieID', 'Title', 'Genres']
rating_columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, names=user_columns, engine='python', encoding='ISO-8859-1')
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, names=movie_columns, engine='python', encoding='ISO-8859-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, names=rating_columns, engine='python', encoding='ISO-8859-1')

print(users.head())
print(movies.head())
print(ratings.head())

   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [54]:
ocupation_dict = {
    0: "other",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer"
}

In [55]:
# users['Occupation'] = users['Occupation'].apply(lambda x: ocupation_dict[x])

# # Apply one hot encoding to users' occupations
# mlb = MultiLabelBinarizer()
# users_one_hot = mlb.fit_transform(users['Occupation'].apply(lambda x: [x]))
# users_one_hot_df = pd.DataFrame(users_one_hot, columns=mlb.classes_)
# users = pd.concat([users, users_one_hot_df], axis=1)

# # Apply one hot encoding to movies' genres
# mlb = MultiLabelBinarizer()
# movies_one_hot = mlb.fit_transform(movies['Genres'].apply(lambda x: x.split('|')).tolist())
# movies_genres_df = pd.DataFrame(movies_one_hot, columns=mlb.classes_)
# movies = pd.concat([movies, movies_genres_df], axis=1)

# movie_features = movies.drop(columns=['MovieID', 'Title', 'Genres'])
# user_features = users.drop(columns=['UserID', 'Gender', 'Occupation', 'Zip-code'])

In [49]:
num_users = ratings['UserID'].nunique()
num_movies = ratings['MovieID'].nunique()

In [56]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for movie genres
movies['Genres List'] = movies['Genres'].apply(lambda x: x.split('|'))
movies['Genres List'] = movies['Genres List'].apply(lambda x: ' '.join(x))
genres_embeddings = model.encode(movies['Genres List'].tolist())

# Generate embeddings for user occupations
occupations_embeddings = model.encode(users['Occupation'].apply(lambda x: ocupation_dict[x]).tolist())

# Convert embeddings to tensors
genres_embeddings_tensor = torch.tensor(genres_embeddings, dtype=torch.float)
occupations_embeddings_tensor = torch.tensor(occupations_embeddings, dtype=torch.float)

print(genres_embeddings_tensor)
print(occupations_embeddings_tensor)

tensor([[-0.0672,  0.0127, -0.0253,  ...,  0.0356,  0.0603,  0.0165],
        [ 0.0160,  0.0611,  0.0130,  ...,  0.0382, -0.0584, -0.0035],
        [-0.0546, -0.0767, -0.0195,  ...,  0.0020,  0.0928, -0.1016],
        ...,
        [ 0.0118, -0.0268,  0.0126,  ...,  0.0291,  0.0226, -0.0077],
        [ 0.0118, -0.0268,  0.0126,  ...,  0.0291,  0.0226, -0.0077],
        [-0.0217, -0.0121, -0.0826,  ..., -0.0260,  0.0092,  0.0295]])
tensor([[ 0.0060,  0.1011, -0.0045,  ..., -0.0154, -0.0687,  0.0545],
        [ 0.0104,  0.0059,  0.0096,  ..., -0.0464,  0.1156,  0.0203],
        [-0.0540,  0.0827,  0.0280,  ..., -0.0165,  0.0818, -0.0158],
        ...,
        [-0.0033,  0.0550,  0.0313,  ..., -0.0333,  0.0525, -0.0002],
        [-0.0706, -0.0851, -0.0603,  ..., -0.0009,  0.0646,  0.0989],
        [-0.0343,  0.0520,  0.0667,  ..., -0.0619,  0.0823,  0.0487]])


In [43]:
from torch_geometric.data import Data
import torch

# Create edge index for bipartite graph
user_ids = ratings['UserID'].values - 1  # Subtract 1 to make indices zero-based
movie_ids = ratings['MovieID'].values - 1  # Subtract 1 to make indices zero-based
edge_index = torch.tensor([user_ids, movie_ids], dtype=torch.long)

# Create node features for users and movies
user_features = occupations_embeddings_tensor.clone().detach()
movie_features = genres_embeddings_tensor.clone().detach()

# Combine user and movie features into a single tensor
x = torch.cat([user_features, movie_features], dim=0)

In [44]:
import torch
from torch.nn import Linear, Parameter
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class LightGCN(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super().__init__(aggr='add') 

    def forward(self, x, edge_index):
        # x has shape [N, in_channels]
        # edge_index has shape [2, E]

        # Compute normalization.
        row, col = edge_index
        deg = degree(col, x.size(0), dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        # Start propagating messages.
        out = self.propagate(edge_index, x=x, norm=norm)
        return out

    def message(self, x_j, norm):
        # x_j has shape [E, out_channels]
        # Step 4: Normalize node features.
        return norm.view(-1, 1) * x_j

In [45]:
class LightGCNStack(torch.nn.Module):
    def __init__(self, num_nodes, embedding_dim, num_layers):
        super().__init__()
        self.embedding = Embedding(num_nodes, embedding_dim)
        self.convs = torch.nn.ModuleList([LightGCN(embedding_dim, embedding_dim) for _ in range(num_layers)])
        self.num_layers = num_layers

    def forward(self, edge_index):
        x = self.embedding.weight
        all_embeddings = [x]
        for conv in self.convs:
            x = conv(x, edge_index)
            all_embeddings.append(x)
        
        # Aggregate embeddings with factors a_k = 1/(k+1)
        out = sum((1.0 / (k + 1)) * emb for k, emb in enumerate(all_embeddings))
        return out

In [46]:
def bpr_loss(model, users, pos_items, neg_items):
    user_emb = model.embedding(users)
    pos_emb = model.embedding(pos_items)
    neg_emb = model.embedding(neg_items)
    
    pos_scores = (user_emb * pos_emb).sum(dim=1)
    neg_scores = (user_emb * neg_emb).sum(dim=1)
    
    loss = -F.logsigmoid(pos_scores - neg_scores).mean()
    return loss

def train(model, edge_index, users, pos_items, neg_items, optimizer):
    model.train()
    
    optimizer.zero_grad()
    embeddings = model(edge_index)
    loss = bpr_loss(model, users, pos_items, neg_items)
    loss.backward()
    optimizer.step()
    
    return loss.item()

In [48]:
from torch_geometric.utils import negative_sampling
import random

def sample_positives_and_negatives(user_movie_dict, num_users, num_movies, positive_threshold=4, num_neg_samples=5):
    positive_samples = []
    negative_samples = []

    for user_id, movies in user_movie_dict.items():
        pos_movies = [movie_id for movie_id, rating in movies if rating >= positive_threshold]
        neg_movies = [movie_id for movie_id, rating in movies if rating <= 2]
        
        unwatched_movies = set(range(num_movies)) - set(movie_id for movie_id, _ in movies)
        sampled_neg_movies = random.sample(unwatched_movies, num_neg_samples)
        
        positive_samples.append((user_id, pos_movies))
        negative_samples.append((user_id, sampled_neg_movies))
    
    return positive_samples, negative_samples

In [34]:
# Dummy data for illustration
edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]], dtype=torch.long)
users = torch.tensor([0, 1, 2], dtype=torch.long)
pos_items = torch.tensor([1, 2, 0], dtype=torch.long)
neg_items = torch.tensor([2, 0, 1], dtype=torch.long)

In [33]:
num_nodes = x.size(0)
embedding_dim = x.size(1)
num_layers = 12

model = LightGCNStack(num_nodes, embedding_dim, num_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
for epoch in range(num_epochs):
    loss = train(model, edge_index, users, pos_items, neg_items, optimizer)
    print(f'Epoch {epoch}, Loss: {loss}')

Epoch 0, Loss: 13.26748275756836
Epoch 1, Loss: 12.675753593444824
Epoch 2, Loss: 12.086258888244629
Epoch 3, Loss: 11.499045372009277
Epoch 4, Loss: 10.91415023803711
Epoch 5, Loss: 10.331609725952148
Epoch 6, Loss: 9.751479148864746
Epoch 7, Loss: 9.173940658569336
Epoch 8, Loss: 8.599502563476562
Epoch 9, Loss: 8.029385566711426
Epoch 10, Loss: 7.466070652008057
Epoch 11, Loss: 6.914175033569336
Epoch 12, Loss: 6.381109714508057
Epoch 13, Loss: 5.875225067138672
Epoch 14, Loss: 5.399388790130615
Epoch 15, Loss: 4.946715831756592
Epoch 16, Loss: 4.505642890930176
Epoch 17, Loss: 4.066825866699219
Epoch 18, Loss: 3.6251628398895264
Epoch 19, Loss: 3.178927421569824
Epoch 20, Loss: 2.728624105453491
Epoch 21, Loss: 2.2765581607818604
Epoch 22, Loss: 1.827765941619873
Epoch 23, Loss: 1.3943151235580444
Epoch 24, Loss: 1.0084911584854126
Epoch 25, Loss: 0.7470748424530029
Epoch 26, Loss: 0.7072804570198059
Epoch 27, Loss: 0.8624014854431152
Epoch 28, Loss: 1.0795778036117554
Epoch 29, Lo

KeyboardInterrupt: 

In [195]:
class MovieDataset:
    def __init__(self, num_users, num_items, x, edge_index):
        self.x = x
        self.num_users = num_users
        self.num_items = num_items
        self.edge_index = edge_index
        
        # Create edge indices for items to users and users to items
        self.edge_index_a2u = edge_index
        self.edge_index_u2a = torch.stack([edge_index[1], edge_index[0]], dim=0)

    def __getitem__(self, idx):
        return Data(x=self.x, edge_index=self.edge_index)

    def __len__(self):
        return 1

In [196]:
data = MovieDataset(num_users=users.shape[0], num_items=movies.shape[0], x=x, edge_index=edge_index)

In [234]:
class LightGCN(MessagePassing):
    def __init__(self, latent_dim, **kwargs):
        super(LightGCN, self).__init__(node_dim=0, **kwargs)
        self.latent_dim = latent_dim

    def forward(self, x, edge_index, size=None):
        return self.propagate(edge_index=edge_index, x=(x[0], x[1]), size=size)

    def message(self, x_j):
        return x_j

    def aggregate(self, inputs, index, dim_size=None):
        return torch_scatter.scatter(src=inputs, index=index, dim=0, dim_size=dim_size, reduce='mean')

In [246]:
class LightGCNStack(torch.nn.Module):
    def __init__(self, latent_dim, num_layers):
        super(LightGCNStack, self).__init__()
        conv_model = LightGCN
        self.convs = nn.ModuleList()
        self.convs.append(conv_model(latent_dim))
        
        assert (num_layers >= 1), 'Number of layers is not >=1'
        
        for l in range(num_layers-1):
            self.convs.append(conv_model(latent_dim))

        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.dataset = None
        self.embeddings_users = None
        self.embeddings_items = None

    def reset_parameters(self):
        if self.embeddings_users is not None:
            self.embeddings_users.reset_parameters()
        if self.embeddings_items is not None:
            self.embeddings_items.reset_parameters()

    def init_data(self, dataset):
        self.dataset = dataset
        self.embeddings_users = torch.nn.Embedding(num_embeddings=dataset.num_users, embedding_dim=self.latent_dim).to('cuda')
        self.embeddings_items = torch.nn.Embedding(num_embeddings=dataset.num_items, embedding_dim=self.latent_dim).to('cuda')

    def forward(self):
        x_users, x_items = self.embeddings_users.weight, self.embeddings_items.weight
        final_embeddings_users = torch.zeros(size=x_users.size(), device='cuda')
        final_embeddings_items = torch.zeros(size=x_items.size(), device='cuda')
        final_embeddings_users = final_embeddings_users + x_users/(self.num_layers + 1)
        final_embeddings_items = final_embeddings_items + x_items/(self.num_layers + 1)
        
        for i in range(self.num_layers):
            x_users = self.convs[i](
                x=(x_items, x_users), 
                edge_index=self.dataset.edge_index_a2u, 
                size=(self.dataset.num_items, self.dataset.num_users)
            )
            
            x_items = self.convs[i](
                x=(x_users, x_items), 
                edge_index=self.dataset.edge_index_u2a, 
                size=(self.dataset.num_users, self.dataset.num_items)
            )
            
            final_embeddings_users = final_embeddings_users + x_users/(self.num_layers+1)
            final_embeddings_items = final_embeddings_items + x_items/(self.num_layers + 1)

        return final_embeddings_users, final_embeddings_items

    def decode(self, z1, z2, pos_edge_index, neg_edge_index):  # only pos and neg edges
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)  # concatenate pos and neg edges
        logits = (z1[edge_index[0]] * z2[edge_index[1]]).sum(dim=-1)  # dot product
        return logits

    def decode_all(self, z_users, z_items):
        prob_adj = z_users @ z_items.t()  # get adj NxN
        #return (prob_adj > 0).nonzero(as_tuple=False).t()  # get predicted edge_list
        return prob_adj

    def BPRLoss(self, prob_adj, real_adj, edge_index):
        loss = 0
        pos_scores = prob_adj[edge_index.cpu().numpy()]
        for pos_score, node_index in zip(pos_scores, edge_index[0]):
            neg_scores = prob_adj[node_index, real_adj[node_index] == 0]
            loss = loss - torch.sum(torch.log(torch.sigmoid(pos_score.repeat(neg_scores.size()[0]) - neg_scores))) / \
                   neg_scores.size()[0]

        return loss / edge_index.size()[1]

    def topN(self, user_id, n):
        z_users, z_items = self.forward()
        scores = torch.squeeze(z_users[user_id] @ z_items.t())
        return torch.topk(scores, k=n)

In [247]:
model = LightGCNStack(latent_dim=384, num_layers=3)

In [252]:
def train_lightgcn(model, data, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        embeddings_users, embeddings_items = model()
        loss = model.BPRLoss(model.decode_all(embeddings_users, embeddings_items), data.edge_index, data.edge_index)
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
    return embeddings_users, embeddings_items


In [253]:
num_epochs = 100

model = LightGCNStack(latent_dim=384, num_layers=3)
model.init_data(data)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
embeddings_users, embeddings_items = train_lightgcn(model, data, optimizer, num_epochs)

TypeError: propagate() got an unexpected keyword argument 'x'