# GNN-based Link Prediction for Personalized Video Recommendations in Bipartite User-Item Networks

In [1]:
import pandas as pd

# Load ratings
ratings = pd.read_csv('u.data', sep='\t', names=["user_id", "movie_id", "rating", "timestamp"])

# Load movie metadata (optional)
movies = pd.read_csv('u.item', sep='|', encoding='latin-1', header=None, usecols=[0, 1], names=["movie_id", "title"])


In [2]:
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [3]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [4]:
positive_ratings = ratings[ratings["rating"] >= 4]
from sklearn.preprocessing import LabelEncoder

user_enc = LabelEncoder()
movie_enc = LabelEncoder()

positive_ratings["user"] = user_enc.fit_transform(positive_ratings["user_id"])
positive_ratings["movie"] = movie_enc.fit_transform(positive_ratings["movie_id"])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_ratings["user"] = user_enc.fit_transform(positive_ratings["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_ratings["movie"] = movie_enc.fit_transform(positive_ratings["movie_id"])


In [7]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import train_test_split_edges

# Create edge index: shape [2, num_edges]
edge_index = torch.tensor(positive_ratings[["user", "movie"]].values.T, dtype=torch.long)

# Offset movie IDs so they don’t overlap with users
num_users = positive_ratings["user"].nunique()
edge_index[1] += num_users  # shift movie IDs

# Create PyG data object
data = Data(edge_index=edge_index)
data.num_nodes = num_users + positive_ratings["movie"].nunique()

# Split into train/test edges
data = train_test_split_edges(data)




In [8]:
data

Data(num_nodes=2389, val_pos_edge_index=[2, 2768], test_pos_edge_index=[2, 5537], train_pos_edge_index=[2, 94140], train_neg_adj_mask=[2389, 2389], val_neg_edge_index=[2, 2768], test_neg_edge_index=[2, 5537])

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.utils import negative_sampling

class GNNRecommender(nn.Module):
    def __init__(self, num_nodes, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_nodes, embed_dim)
        self.conv1 = SAGEConv(embed_dim, embed_dim)
        self.conv2 = SAGEConv(embed_dim, embed_dim)

    def forward(self, edge_index):
        x = self.embedding.weight
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

    def decode(self, z, edge_index):
        z_i = z[edge_index[0]]
        z_j = z[edge_index[1]]
        return (z_i * z_j).sum(dim=1)


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GNNRecommender(num_nodes=data.num_nodes, embed_dim=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [15]:
def train():
    model.train()
    optimizer.zero_grad()

    z = model(data.train_pos_edge_index.to(device))

    # Positive scores
    pos_edge = data.train_pos_edge_index.to(device)
    pos_score = model.decode(z, pos_edge)

    # Negative sampling
    neg_edge = negative_sampling(
        edge_index=pos_edge,
        num_nodes=data.num_nodes,
        num_neg_samples=pos_edge.size(1)
    )
    neg_score = model.decode(z, neg_edge)

    # Loss
    score = torch.cat([pos_score, neg_score])
    labels = torch.cat([
        torch.ones(pos_score.size(0)),
        torch.zeros(neg_score.size(0))
    ]).to(device)

    loss = F.binary_cross_entropy_with_logits(score, labels)
    loss.backward()
    optimizer.step()

    return loss.item()


In [16]:
from sklearn.metrics import roc_auc_score

@torch.no_grad()
def test():
    model.eval()
    z = model(data.train_pos_edge_index.to(device))

    pos_edge = data.val_pos_edge_index.to(device)
    neg_edge = data.val_neg_edge_index.to(device)

    pos_score = model.decode(z, pos_edge).sigmoid()
    neg_score = model.decode(z, neg_edge).sigmoid()

    y_true = torch.cat([torch.ones(pos_score.size(0)), torch.zeros(neg_score.size(0))])
    y_scores = torch.cat([pos_score, neg_score])

    return roc_auc_score(y_true.cpu(), y_scores.cpu())


In [17]:
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        auc = test()
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test AUC: {auc:.4f}')


Epoch: 010, Loss: 0.7006, Test AUC: 0.5470
Epoch: 020, Loss: 0.6848, Test AUC: 0.6661
Epoch: 030, Loss: 0.6665, Test AUC: 0.7359
Epoch: 040, Loss: 0.6356, Test AUC: 0.7694
Epoch: 050, Loss: 0.5965, Test AUC: 0.7987
Epoch: 060, Loss: 0.5723, Test AUC: 0.8170
Epoch: 070, Loss: 0.5588, Test AUC: 0.8325
Epoch: 080, Loss: 0.5493, Test AUC: 0.8423
Epoch: 090, Loss: 0.5421, Test AUC: 0.8520
Epoch: 100, Loss: 0.5346, Test AUC: 0.8586
Epoch: 110, Loss: 0.5298, Test AUC: 0.8615
Epoch: 120, Loss: 0.5257, Test AUC: 0.8638
Epoch: 130, Loss: 0.5177, Test AUC: 0.8706
Epoch: 140, Loss: 0.5108, Test AUC: 0.8733
Epoch: 150, Loss: 0.5095, Test AUC: 0.8770
Epoch: 160, Loss: 0.5055, Test AUC: 0.8794
Epoch: 170, Loss: 0.5001, Test AUC: 0.8802
Epoch: 180, Loss: 0.4992, Test AUC: 0.8819
Epoch: 190, Loss: 0.4949, Test AUC: 0.8840
Epoch: 200, Loss: 0.4928, Test AUC: 0.8839


In [18]:
@torch.no_grad()
def recommend(user_id, top_k=5):
    model.eval()
    z = model(data.train_pos_edge_index.to(device))

    # Movie node indices
    movie_ids = torch.arange(num_users, data.num_nodes).to(device)

    user_node = torch.full_like(movie_ids, user_id)
    edge_pairs = torch.stack([user_node, movie_ids])

    scores = model.decode(z, edge_pairs).sigmoid()
    top_indices = scores.topk(top_k).indices
    recommended_movie_ids = movie_ids[top_indices] - num_users  # relative movie IDs

    return recommended_movie_ids.cpu().numpy()


In [20]:
recommend(20)

array([255,  49, 353,  95,   0])

In [21]:
movies[movies["movie_id"].isin(recommend(20))]

Unnamed: 0,movie_id,title
48,49,I.Q. (1994)
94,95,Aladdin (1992)
254,255,My Best Friend's Wedding (1997)
352,353,Deep Rising (1998)


In [22]:
movies[movies["movie_id"].isin(recommend(55))]

Unnamed: 0,movie_id,title
48,49,I.Q. (1994)
77,78,Free Willy (1993)
164,165,Jean de Florette (1986)
170,171,Delicatessen (1991)
179,180,Apocalypse Now (1979)


In [23]:
movies[movies["movie_id"].isin(recommend(200))]

Unnamed: 0,movie_id,title
48,49,I.Q. (1994)
123,124,Lone Star (1996)
164,165,Jean de Florette (1986)
170,171,Delicatessen (1991)
179,180,Apocalypse Now (1979)


In [24]:
movies[movies["movie_id"].isin(recommend(800))]

Unnamed: 0,movie_id,title
177,178,12 Angry Men (1957)
284,285,Secrets & Lies (1996)
296,297,Ulee's Gold (1997)
309,310,"Rainmaker, The (1997)"
733,734,Made in America (1993)
