In [192]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn.conv import MessagePassing
from torch.nn import Embedding
from tqdm import tqdm

In [193]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer

In [194]:
user_columns = ['userid', 'gender', 'age', 'occupation', 'zipcode']
movie_columns = ['movieid', 'title', 'genres']
rating_columns = ['userid', 'movieid', 'rating', 'timestamp']

users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, names=user_columns, engine='python', encoding='ISO-8859-1')
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, names=movie_columns, engine='python', encoding='ISO-8859-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, names=rating_columns, engine='python', encoding='ISO-8859-1')

print(users.head())
print(movies.head())
print(ratings.head())

   userid gender  age  occupation zipcode
0       1      F    1          10   48067
1       2      M   56          16   70072
2       3      M   25          15   55117
3       4      M   45           7   02460
4       5      M   25          20   55455
   movieid                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
   userid  movieid  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [195]:
# cut users with less than 200 ratings
user_ratings = ratings.groupby('userid').size()
user_ratings = user_ratings[user_ratings > 200]
mvp_users = user_ratings.index
print(mvp_users.nunique())

1578


In [196]:
# filter users
users = users[users['userid'].isin(mvp_users)]
ratings = ratings[ratings['userid'].isin(mvp_users)]
print(users.shape)
print(ratings.shape)

(1578, 5)
(654781, 4)


In [197]:
# user reindexing
user_to_index = {user: i+1 for i, user in enumerate(users['userid'])}
print(user_to_index)

# reindex userid in users and ratings
users['userid'] = users['userid'].map(user_to_index)
ratings['userid'] = ratings['userid'].map(user_to_index)

{10: 1, 15: 2, 17: 3, 18: 4, 19: 5, 22: 6, 23: 7, 26: 8, 33: 9, 36: 10, 42: 11, 45: 12, 48: 13, 53: 14, 58: 15, 59: 16, 62: 17, 73: 18, 90: 19, 92: 20, 93: 21, 117: 22, 118: 23, 123: 24, 131: 25, 136: 26, 137: 27, 139: 28, 146: 29, 148: 30, 149: 31, 150: 32, 151: 33, 157: 34, 161: 35, 163: 36, 166: 37, 169: 38, 173: 39, 175: 40, 181: 41, 187: 42, 192: 43, 193: 44, 195: 45, 198: 46, 199: 47, 202: 48, 204: 49, 216: 50, 223: 51, 224: 52, 225: 53, 229: 54, 235: 55, 238: 56, 242: 57, 245: 58, 255: 59, 261: 60, 264: 61, 268: 62, 271: 63, 272: 64, 284: 65, 293: 66, 300: 67, 301: 68, 302: 69, 303: 70, 308: 71, 310: 72, 314: 73, 319: 74, 321: 75, 326: 76, 327: 77, 329: 78, 331: 79, 333: 80, 338: 81, 343: 82, 346: 83, 349: 84, 352: 85, 355: 86, 366: 87, 368: 88, 386: 89, 390: 90, 392: 91, 398: 92, 402: 93, 403: 94, 409: 95, 411: 96, 412: 97, 415: 98, 424: 99, 426: 100, 429: 101, 438: 102, 442: 103, 445: 104, 451: 105, 453: 106, 454: 107, 457: 108, 461: 109, 474: 110, 475: 111, 476: 112, 477: 113

In [198]:
users

Unnamed: 0,userid,gender,age,occupation,zipcode
9,1,F,35,1,95370
14,2,M,25,7,22903
16,3,M,50,1,95350
17,4,F,18,3,95825
18,5,M,1,10,48073
...,...,...,...,...,...
6024,1574,F,25,1,32607
6034,1575,F,25,1,78734
6035,1576,F,25,15,32603
6036,1577,F,45,1,76006


In [199]:
# movie reindexing
movie_to_index = {movie: i+1 for i, movie in enumerate(movies['movieid'])}
print(movie_to_index)

# reindex movieid in movies and ratings
movies['movieid'] = movies['movieid'].map(movie_to_index)
ratings['movieid'] = ratings['movieid'].map(movie_to_index)

{1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 88: 88, 89: 89, 90: 90, 92: 91, 93: 92, 94: 93, 95: 94, 96: 95, 97: 96, 98: 97, 99: 98, 100: 99, 101: 100, 102: 101, 103: 102, 104: 103, 105: 104, 106: 105, 107: 106, 108: 107, 109: 108, 110: 109, 111: 110, 112: 111, 113: 112, 114: 113, 115: 114, 116: 115, 117: 116, 118: 117, 119: 118, 120: 119, 121: 120, 122: 121, 123:

In [200]:
movies

Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3879,Meet the Parents (2000),Comedy
3879,3880,Requiem for a Dream (2000),Drama
3880,3881,Tigerland (2000),Drama
3881,3882,Two Family House (2000),Drama


# 

In [201]:
ocupation_dict = {
    0: "other",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer"
}

In [202]:
from python_splitters import python_stratified_split
train_ratings, test_ratings = python_stratified_split(ratings, ratio=0.75)

In [203]:
num_users = ratings['userid'].nunique()
num_movies = ratings['movieid'].nunique()

In [204]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for movie genres
movies['genres list'] = movies['genres'].apply(lambda x: x.split('|'))
movies['genres list'] = movies['genres list'].apply(lambda x: ' '.join(x))
genres_embeddings = model.encode(movies['genres list'].tolist())

# Generate embeddings for user occupations
occupations_embeddings = model.encode(users['occupation'].apply(lambda x: ocupation_dict[x]).tolist())

# Convert embeddings to tensors
genres_embeddings_tensor = torch.tensor(genres_embeddings, dtype=torch.float)
occupations_embeddings_tensor = torch.tensor(occupations_embeddings, dtype=torch.float)

movies['genres embeddings'] = list(genres_embeddings_tensor)
users['occupation embeddings'] = list(occupations_embeddings_tensor)

In [205]:
movies['movieid'].nunique()

3883

In [206]:
train_ratings.describe()

Unnamed: 0,userid,movieid,rating,timestamp
count,491089.0,491089.0,491089.0,491089.0
mean,776.134876,1813.3552,3.49601,972964200.0
std,452.843476,1065.925419,1.118226,13055180.0
min,1.0,1.0,1.0,956703900.0
25%,380.0,1007.0,3.0,965349200.0
50%,778.0,1773.0,4.0,973297200.0
75%,1157.0,2681.0,4.0,975419300.0
max,1578.0,3883.0,5.0,1046454000.0


In [207]:
from torch_geometric.data import Data
import torch

# Create edge index for bipartite graph for train set
train_user_ids = train_ratings['userid'].values - 1  
train_movie_ids = train_ratings['movieid'].values - 1 + num_users 
train_edge_index = torch.tensor([train_user_ids, train_movie_ids], dtype=torch.long)

# Create edge index for bipartite graph for test set
test_user_ids = test_ratings['userid'].values - 1  
test_movie_ids = test_ratings['movieid'].values - 1 + num_users  
test_edge_index = torch.tensor([test_user_ids, test_movie_ids], dtype=torch.long)

# Create node features for users and movies for train set
train_user_features = occupations_embeddings_tensor.clone().detach()
train_movie_features = genres_embeddings_tensor.clone().detach()

# Combine user and movie features into a single tensor for train set
X_train = torch.cat([train_user_features, train_movie_features], dim=0)

# Create node features for users and movies for test set
test_user_features = occupations_embeddings_tensor.clone().detach()
test_movie_features = genres_embeddings_tensor.clone().detach()

# Combine user and movie features into a single tensor for test set
X_test = torch.cat([test_user_features, test_movie_features], dim=0)

In [208]:
train_ratings['movieid'].max()
num_users

1578

In [209]:
import torch
from torch.nn import Linear, Parameter
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class LightGCN(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super().__init__(aggr='add') 

    def forward(self, x, edge_index):
        # x has shape [N, in_channels]
        # edge_index has shape [2, E]

        # Compute normalization.
        row, col = edge_index
        deg = degree(col, x.size(0), dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        # Start propagating messages.
        out = self.propagate(edge_index, x=x, norm=norm)
        return out

    def message(self, x_j, norm):
        # x_j has shape [E, out_channels]
        # Step 4: Normalize node features.
        return norm.view(-1, 1) * x_j

In [210]:
class LightGCNStack(torch.nn.Module):
    def __init__(self, num_nodes, embedding_dim, num_layers):
        super().__init__()
        self.embedding = Embedding(num_nodes, embedding_dim)
        self.convs = torch.nn.ModuleList([LightGCN(embedding_dim, embedding_dim) for _ in range(num_layers)])
        self.num_layers = num_layers

    def forward(self, edge_index):
        x = self.embedding.weight
        all_embeddings = [x]
        for conv in self.convs:
            x = conv(x, edge_index)
            all_embeddings.append(x)
        
        # Aggregate embeddings with factors a_k = 1/(k+1)
        out = sum((1.0 / (k + 1)) * emb for k, emb in enumerate(all_embeddings))
        return out

In [211]:
def bpr_loss(model, users, pos_items, neg_items):
    user_emb = model.embedding(users)
    pos_emb = model.embedding(pos_items)
    neg_emb = model.embedding(neg_items)
    
    pos_scores = (user_emb * pos_emb).sum(dim=1)
    neg_scores = (user_emb * neg_emb).sum(dim=1)
    
    loss = -F.logsigmoid(pos_scores - neg_scores).mean()
    return loss

def train(model, edge_index, users, pos_items, neg_items, optimizer):
    model.train()
    
    optimizer.zero_grad()
    embeddings = model(edge_index)
    loss = bpr_loss(model, users, pos_items, neg_items)
    loss.backward()
    optimizer.step()
    
    return loss.item()

def test(model, edge_index, users, pos_items, neg_items):
    model.eval()
    
    with torch.no_grad():
        embeddings = model(edge_index)
        loss = bpr_loss(model, users, pos_items, neg_items)
    
    return loss.item()


In [212]:
from collections import defaultdict

def build_user_movie_interactions(ratings_df):
    """
    Create a user-movie interaction graph from the ratings dataframe.
    Include all interactions regardless of rating.
    """
    user_movie_dict = defaultdict(list)
    for user_id, movie_id, rating in zip(ratings_df['userid'], ratings_df['movieid'], ratings_df['rating']):
        user_movie_dict[user_id].append((movie_id, rating))
    return user_movie_dict

In [213]:
train_user_movie_dict = build_user_movie_interactions(train_ratings)
test_user_movie_dict = build_user_movie_interactions(test_ratings)

In [214]:
import random

In [215]:
def sample_positive_and_negative_samples(user_movie_dict, positive_threshold, negative_threshold):

    user_ratings = []

    for user_id, movies in user_movie_dict.items():
        pos_movies = [movie_id for movie_id, rating in movies if rating >= positive_threshold]
        neg_movies = [movie_id for movie_id, rating in movies if rating <= negative_threshold]
        
        if len(pos_movies) == 0 or len(neg_movies) == 0:
            continue
        
        user_ratings.append((user_id, pos_movies, neg_movies))
        
    return user_ratings

In [216]:
positive_threshold = 5
negative_threshold = 3

train_user_ratings = sample_positive_and_negative_samples(train_user_movie_dict, positive_threshold, negative_threshold)
test_user_ratings = sample_positive_and_negative_samples(test_user_movie_dict, positive_threshold, negative_threshold)

In [217]:
def recall_at_k(user_ratings, model, k=10, device='cpu'):
    model.eval()
    hits = 0
    total = 0
    
    for user_id, pos_movies, neg_movies in user_ratings:
        user = torch.tensor([user_id], dtype=torch.long).to(device)
        pos = torch.tensor(pos_movies, dtype=torch.long).to(device)
        neg = torch.tensor(neg_movies, dtype=torch.long).to(device)
        
        user_emb = model.embedding(user)
        pos_emb = model.embedding(pos)
        neg_emb = model.embedding(neg)
        
        pos_scores = (user_emb * pos_emb).sum(dim=1)
        neg_scores = (user_emb * neg_emb).sum(dim=1)
        
        scores = torch.cat([pos_scores, neg_scores])

        curr_k = min(k, len(scores))
        _, indices = torch.topk(scores, curr_k)
        hits += torch.sum(indices < k).item()
        total += k
        
    return hits / total

def precision_at_k(user_ratings, model, k=10, device='cpu'):
    model.eval()
    hits = 0
    total = 0
    
    for user_id, pos_movies, neg_movies in user_ratings:
        user = torch.tensor([user_id], dtype=torch.long).to(device)
        pos = torch.tensor(pos_movies, dtype=torch.long).to(device)
        neg = torch.tensor(neg_movies, dtype=torch.long).to(device)
        
        user_emb = model.embedding(user)
        pos_emb = model.embedding(pos)
        neg_emb = model.embedding(neg)
        
        pos_scores = (user_emb * pos_emb).sum(dim=1)
        neg_scores = (user_emb * neg_emb).sum(dim=1)
        
        scores = torch.cat([pos_scores, neg_scores])

        curr_k = min(k, len(scores))
        _, indices = torch.topk(scores, curr_k)
        hits += torch.sum(indices < len(pos_movies)).item()
        total += k
        
    return hits / total

In [218]:
num_nodes = X_train.size(0)
embedding_dim = X_train.size(1)
num_layers = 6
num_epochs = 10
learning_rate = 0.01
sample_size = 4
k = 10

In [219]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_edge_index = train_edge_index.to(device)
model = LightGCNStack(num_nodes, embedding_dim, num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [220]:
recall = recall_at_k(train_user_ratings, model, k=k, device=device)
precision = precision_at_k(train_user_ratings, model, k=k, device=device)
print("Base recall:", recall)
print("Base precision:", precision)

for epoch in range(num_epochs):
    total_loss = 0
    num_batches = 0
    pbar = tqdm(train_user_ratings, desc=f'Epoch {epoch+1}/{num_epochs}')

    for user_id, pos_movies, neg_movies in pbar:
        no_sample = min(sample_size, len(pos_movies), len(neg_movies))
        users = torch.tensor([user_id] * no_sample, dtype=torch.long).to(device)
        pos_samples = random.sample(pos_movies, no_sample)
        pos_samples = torch.tensor(pos_samples, dtype=torch.long).to(device)
        neg_samples = random.sample(neg_movies, no_sample)
        neg_samples = torch.tensor(neg_samples, dtype=torch.long).to(device)
        
        loss = train(model, train_edge_index, users, pos_samples, neg_samples, optimizer)
        total_loss += loss
        num_batches += 1
        avg_loss = total_loss / num_batches

        # Update progress bar with average loss
        pbar.set_postfix({'Avg Loss': f'{avg_loss:.4f}'})
        
    recall = recall_at_k(train_user_ratings, model, k=k, device=device)
    precision = precision_at_k(train_user_ratings, model, k=k, device=device) 
    avg_loss = total_loss / len(train_user_ratings)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Recall@{k}: {recall:.4f}, Precision@{k}: {precision:.4f}')
    

Base recall: 0.06467977171845275
Base precision: 0.3365250475586557


Epoch 1/10:  41%|████▏     | 652/1577 [01:02<01:28, 10.48it/s, Avg Loss=11.5160]


KeyboardInterrupt: 

In [None]:
total_loss = 0
num_batches = 0
pbar = tqdm(test_user_ratings)

for user_id, pos_movies, neg_movies in pbar:
    no_sample = min(sample_size, len(pos_movies), len(neg_movies))
    users = torch.tensor([user_id] * no_sample, dtype=torch.long).to(device)
    pos_samples = random.sample(pos_movies, no_sample)
    pos_samples = torch.tensor(pos_samples, dtype=torch.long).to(device)
    neg_samples = random.sample(neg_movies, no_sample)
    neg_samples = torch.tensor(neg_samples, dtype=torch.long).to(device)
    loss = bpr_loss(model, users, pos_samples, neg_samples)
    total_loss += loss
    num_batches += 1
    avg_loss = total_loss / num_batches

    # Update progress bar with average loss
    pbar.set_postfix({'Avg Loss': f'{avg_loss:.4f}'})
    
recall = recall_at_k(test_user_ratings, model, k=k, device=device)
precision = precision_at_k(test_user_ratings, model, k=k, device=device)
avg_loss = total_loss / len(test_user_ratings)
print(f'Test Loss: {avg_loss:.4f}, Test Recall@{k}: {recall:.4f}, Test Precision@{k}: {precision:.4f}')

100%|██████████| 600/600 [00:02<00:00, 296.03it/s, Avg Loss=11.6235]


Test Loss: 11.6235, Test Recall@10: 0.0940
