In [None]:
# 02 - Bipartite Graph Construction
## Book Recommendation System - Lightweight LightGCN

import gc
import os
import pickle

import pandas as pd
import psutil

# NOTE(review): `check_torch` is not defined in this notebook; it is kept ahead
# of the torch imports to preserve the original import order — confirm it is
# still needed.
import check_torch
import torch  # used directly below (torch.tensor, torch.zeros, torch.save, torch.load)
from torch_geometric.data import Data



Current RAM usage: 0.35 GB


In [3]:
## 1. Load Filtered Data

# Outputs of Notebook 01.
# ISBNs are identifiers, not numbers: force them to `str` so pandas never
# infers a numeric column (which would silently strip leading zeros and make
# the ratings keys disagree with the book-metadata keys).
filtered_ratings = pd.read_csv('data/processed/filtered_ratings.csv', dtype={'ISBN': str})
filtered_books = pd.read_csv('data/processed/books_filtered.csv', dtype={'ISBN': str})

# Fail fast: every later cell relies on these two columns being present and
# fully populated (a missing ID would turn into a NaN node index downstream).
missing_columns = {'User-ID', 'ISBN'} - set(filtered_ratings.columns)
assert not missing_columns, f"filtered_ratings.csv is missing columns: {sorted(missing_columns)}"
assert filtered_ratings[['User-ID', 'ISBN']].notna().all().all(), "ratings contain null User-ID/ISBN values"

print(f"Loaded ratings: {len(filtered_ratings)}")
print(f"Unique users: {filtered_ratings['User-ID'].nunique()}")
print(f"Unique books (ISBNs): {filtered_ratings['ISBN'].nunique()}")

# Quick check
display(filtered_ratings.head())

Loaded ratings: 47610
Unique users: 3404
Unique books (ISBNs): 2178


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276847,3426029553,8
1,277157,316154059,5
2,277157,345452550,7
3,277157,399148639,6
4,277157,671024094,7


In [4]:
## 2. Create Mappings: Original ID → Consecutive Node Index

# Users and books are first indexed in two separate, local spaces:
# - user indices: 0 to num_users-1
# - book indices: 0 to num_books-1
#
# These local indices are what the mapping dictionaries store. Any shift
# needed to place both node types in a single graph is NOT applied here.

# Distinct IDs, sorted so the ID → index assignment is reproducible across runs.
# `drop_duplicates().sort_values()` is used instead of `.unique()` + in-place
# `.sort()` because it works regardless of the array type pandas picks for the
# column (in-place `.sort()` only exists on NumPy arrays).
unique_users = filtered_ratings['User-ID'].drop_duplicates().sort_values().to_numpy()
unique_books = filtered_ratings['ISBN'].drop_duplicates().sort_values().to_numpy()

num_users = len(unique_users)
num_books = len(unique_books)

# Position in the sorted array becomes the node index.
user_to_node = {uid: idx for idx, uid in enumerate(unique_users)}
book_to_node = {isbn: idx for idx, isbn in enumerate(unique_books)}

# Each mapping must be a bijection onto 0..n-1; a shorter dict would mean two
# IDs collapsed onto the same key.
assert len(user_to_node) == num_users, "duplicate user keys after mapping"
assert len(book_to_node) == num_books, "duplicate book keys after mapping"

print(f"Number of user nodes: {num_users}")
print(f"Number of book nodes: {num_books}")
print(f"Total nodes: {num_users + num_books}")

Number of user nodes: 3404
Number of book nodes: 2178
Total nodes: 5582


In [None]:
## 3. Build Edge Index (Undirected Bipartite Graph)

# The graph is stored as undirected, so every rating contributes two edges:
# - user → book (rating edge)
# - book → user
#
# No edge weights are built here; only connectivity is recorded.

# Map every rating row to its (user index, book index) pair.
user_index_series = filtered_ratings['User-ID'].map(user_to_node)
book_index_series = filtered_ratings['ISBN'].map(book_to_node)

# `.map` yields NaN for IDs absent from the dictionaries; catch that here
# instead of letting it surface as a float/NaN tensor later.
assert user_index_series.notna().all() and book_index_series.notna().all(), \
    "some ratings reference a user or book without a node index"

source_nodes = user_index_series.to_numpy()   # users
target_nodes = book_index_series.to_numpy()   # books (local indices)

# Offset book nodes by num_users so they don't overlap with user nodes
target_nodes_offset = target_nodes + num_users

# Build the tensor straight from the NumPy arrays (no intermediate Python
# lists of NumPy scalars), then lay out both directions.
user_idx = torch.tensor(source_nodes, dtype=torch.long)
book_idx = torch.tensor(target_nodes_offset, dtype=torch.long)
edge_index = torch.stack([
    torch.cat([user_idx, book_idx]),   # from
    torch.cat([book_idx, user_idx]),   # to
])

assert edge_index.shape == (2, 2 * len(filtered_ratings)), "unexpected edge_index shape"
print(f"Edge index shape: {edge_index.shape}")  # Should be [2, 2 * num_ratings]


Edge index shape: torch.Size([2, 95220])
Current RAM usage: 0.34 GB


In [8]:
## 4. (Optional) Simple Node Features

# The graph object gets a placeholder feature matrix with one row per node
# (users first, then books).
#
# NOTE(review): an all-zero `x` carries no signal by itself; presumably the
# model in the next notebook learns its own embedding table and does not read
# `data.x` — confirm before relying on these features.

# Width of the placeholder feature matrix (named so it is easy to tune).
FEATURE_DIM = 64

# Zero features: one FEATURE_DIM-wide row per user/book node.
node_features = torch.zeros(num_users + num_books, FEATURE_DIM)

# Alternative: random features (uncomment if you want to try)
# node_features = torch.randn(num_users + num_books, FEATURE_DIM)

print(f"Node features shape: {node_features.shape}")

Node features shape: torch.Size([5582, 64])


In [None]:
## 5. Create PyTorch Geometric Data Object

data = Data(
    x=node_features,                  # node features (can be zero)
    edge_index=edge_index,            # user↔book edges, both directions
    num_nodes=num_users + num_books,
)

# Add metadata for later use: nodes [0, num_users) are users and the
# remaining num_books nodes are books.
data.num_users = num_users
data.num_books = num_books

print(data)

# `is_undirected()` only tells us that every edge has its reverse; it says
# nothing about bipartiteness, so report it under its real name.
print(f"Is undirected: {data.is_undirected()}")  # Should be True

# Actual bipartite check: each edge must join exactly one user node
# (index < num_users) with exactly one book node (index >= num_users).
edge_src, edge_dst = data.edge_index
is_bipartite = bool(((edge_src < num_users) != (edge_dst < num_users)).all())
print(f"Is bipartite: {is_bipartite}")  # Should be True


Data(x=[5582, 64], edge_index=[2, 95220], num_nodes=5582, num_users=3404, num_books=2178)
Is bipartite: True
Current RAM usage: 0.34 GB


In [None]:
## 6. Save Graph and Mappings

# Create processed folder if needed
os.makedirs('data/processed', exist_ok=True)

# Save the PyG data object
torch.save(data, 'data/processed/graph_data.pt')

# Save mappings (needed for inference and display), in both directions.
# The indices stored here are the ones held in `user_to_node` / `book_to_node`;
# for books that is the local index, i.e. WITHOUT the `num_users` offset that
# was applied when building `edge_index`.
user_mapping = {
    'user_to_node': user_to_node,
    'node_to_user': {node: uid for uid, node in user_to_node.items()},
}
book_mapping = {
    'book_to_node': book_to_node,
    'node_to_book': {node: isbn for isbn, node in book_to_node.items()},
}

with open('data/processed/user_mapping.pkl', 'wb') as f:
    pickle.dump(user_mapping, f)

with open('data/processed/book_mapping.pkl', 'wb') as f:
    pickle.dump(book_mapping, f)

# Save book metadata for display (ISBN → Title + Author)
book_metadata = filtered_books[['ISBN', 'Book-Title', 'Book-Author']].set_index('ISBN')
book_metadata.to_csv('data/processed/book_metadata.csv')

print("Saved:")
print("- graph_data.pt")
print("- user_mapping.pkl")
print("- book_mapping.pkl")
print("- book_metadata.csv")

# Trailing semicolon: keeps the collector's return value (a bare integer) from
# being echoed as the cell's output.
gc.collect();

Saved:
- graph_data.pt
- user_mapping.pkl
- book_mapping.pkl
- book_metadata.csv
Current RAM usage: 0.34 GB


66

In [14]:
## 7. Quick Validation (Fixed for PyTorch 2.6+)

# Reload everything written by the previous cell and check that the artifacts
# agree with each other. Only the loaded objects are used below, so the check
# does not depend on variables left over in the kernel.

from torch_geometric.data.data import DataTensorAttr, DataEdgeAttr
from torch_geometric.data.storage import GlobalStorage
import torch.serialization

# Allowlist the PyG internals (safe because we created the file)
torch.serialization.add_safe_globals([DataTensorAttr, DataEdgeAttr, GlobalStorage])

# Request the restricted unpickler explicitly rather than relying on the
# version-dependent default of `weights_only`.
loaded_data = torch.load('data/processed/graph_data.pt', weights_only=True)
print(loaded_data)

# Load mappings.
# NOTE: pickle.load executes arbitrary code from the file; acceptable only
# because these files were written by this notebook. Never point this at
# untrusted input.
with open('data/processed/user_mapping.pkl', 'rb') as f:
    user_map = pickle.load(f)
with open('data/processed/book_mapping.pkl', 'rb') as f:
    book_map = pickle.load(f)

# Cross-check the artifacts against each other.
assert loaded_data.num_nodes == loaded_data.num_users + loaded_data.num_books, \
    "node count does not match users + books"
assert len(user_map['user_to_node']) == loaded_data.num_users, "user mapping size mismatch"
assert len(book_map['book_to_node']) == loaded_data.num_books, "book mapping size mismatch"

# Look up index 0 by key instead of taking "the first dict value", which
# would depend on insertion order. Book index 0 sits at graph node
# `num_users` because book nodes are placed after all user nodes.
print(f"Sample: User-ID {user_map['node_to_user'][0]} → node 0")
print(f"Sample: ISBN {book_map['node_to_book'][0]} → node {loaded_data.num_users} (offset)")

print("\nBipartite graph successfully loaded with PyTorch 2.6+ compatibility!")
print("Ready for the next notebook.")

Data(x=[5582, 64], edge_index=[2, 95220], num_nodes=5582, num_users=3404, num_books=2178)
Sample: User-ID 242 → node 0
Sample: ISBN 000649840X → node 3404 (offset)

Bipartite graph successfully loaded with PyTorch 2.6+ compatibility!
Ready for the next notebook.
