In [1]:
import torch_geometric
from torch_geometric.data import Data, HeteroData
from torch_geometric.loader import DataLoader, LinkNeighborLoader
import torch_geometric.transforms as T
import torch

import networkx as nx

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [2]:
anime = pd.read_csv("../data/anime.csv").sort_values(by=['anime_id']).reset_index(drop=True)
rating = pd.read_csv("../data/rating.csv")

anime['genre'] = anime['genre'].str.split(', ')
#anime = anime.dropna(axis=1)
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,1,Cowboy Bebop,"[Action, Adventure, Comedy, Drama, Sci-Fi, Space]",TV,26,8.82,486824
1,5,Cowboy Bebop: Tengoku no Tobira,"[Action, Drama, Mystery, Sci-Fi, Space]",Movie,1,8.4,137636
2,6,Trigun,"[Action, Comedy, Sci-Fi]",TV,26,8.32,283069
3,7,Witch Hunter Robin,"[Action, Drama, Magic, Mystery, Police, Supern...",TV,26,7.36,64905
4,8,Beet the Vandel Buster,"[Adventure, Fantasy, Shounen, Supernatural]",TV,52,7.06,9848


In [3]:
anime.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [4]:
nan_anime_id = anime.loc[anime.isna().any(axis=1), 'anime_id'].values
nan_anime_id.shape

(277,)

In [5]:
# Remove NaN values from both DataFrames
anime = anime.dropna()
rating = rating.loc[~rating.loc[:, 'anime_id'].isin(nan_anime_id), :]

In [6]:
# Remove animes that are in `rating`, but not exist in `anime`
rating = rating.loc[rating.loc[:, 'anime_id'].isin(anime.loc[:, 'anime_id'].unique()), :]
rating

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [7]:
anime = anime.sort_values(by=['anime_id']).reset_index(drop=True)
anime["type"].unique()

array(['TV', 'Movie', 'OVA', 'Special', 'ONA', 'Music'], dtype=object)

In [8]:
# Create a mapping from unique user indices to range [0, num_user_nodes):
unique_user_id = rating.loc[:, 'user_id'].unique()
unique_user_id = pd.DataFrame(data={
    'user_id': unique_user_id,
    'mapped_id': pd.RangeIndex(len(unique_user_id)),
})
unique_user_id.head()

Unnamed: 0,user_id,mapped_id
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [9]:
unique_anime_id = pd.DataFrame(data={
    'anime_id': anime.loc[:, 'anime_id'],
    'mapped_id': pd.RangeIndex(len(anime)),
})
unique_anime_id.head()

Unnamed: 0,anime_id,mapped_id
0,1,0
1,5,1
2,6,2
3,7,3
4,8,4


In [10]:
# Perform merge to obtain the edges from users and animes:
ratings_user_id = pd.merge(rating.loc[:, 'user_id'], unique_user_id,
                            left_on='user_id', right_on='user_id', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mapped_id'].values)
ratings_user_id

tensor([    0,     0,     0,  ..., 73513, 73514, 73514])

In [11]:
ratings_anime_id = pd.merge(rating.loc[:, 'anime_id'], unique_anime_id,
                            left_on='anime_id', right_on='anime_id', how='left')
ratings_anime_id = torch.from_numpy(ratings_anime_id.loc[:, 'mapped_id'].values)
ratings_anime_id

tensor([  10,   14,   58,  ..., 8616,  718, 5225])

In [12]:
assert ratings_user_id.shape == ratings_anime_id.shape, (
    'The shape must be the same.'
)

In [13]:
# Constuct `edge_index` for our PyG Dataset
edge_index_user_to_anime = torch.stack([ratings_user_id, ratings_anime_id], dim=0)
edge_index_user_to_anime

tensor([[    0,     0,     0,  ..., 73513, 73514, 73514],
        [   10,    14,    58,  ...,  8616,   718,  5225]])

### Encode features

In [14]:
types = anime["type"].unique()
genres = anime["genre"].explode().unique()

print(f"Unique types: {len(types)}")
print(f"Unique genres: {len(genres)}")

Unique types: 6
Unique genres: 43


In [15]:
anime['genre'] = anime.loc[:, 'genre'].apply(func=lambda x: '|'.join(x))
anime.loc[:, 'genre']

0            Action|Adventure|Comedy|Drama|Sci-Fi|Space
1                     Action|Drama|Mystery|Sci-Fi|Space
2                                  Action|Comedy|Sci-Fi
3        Action|Drama|Magic|Mystery|Police|Supernatural
4                Adventure|Fantasy|Shounen|Supernatural
                              ...                      
12012                              Sci-Fi|Slice of Life
12013                                            Comedy
12014                                            Comedy
12015                Action|Adventure|Fantasy|Game|Kids
12016                                            Comedy
Name: genre, Length: 12017, dtype: object

In [16]:
anime.loc[:, 'type']

0             TV
1          Movie
2             TV
3             TV
4             TV
          ...   
12012    Special
12013         TV
12014         TV
12015        ONA
12016    Special
Name: type, Length: 12017, dtype: object

In [17]:
# Use One-Hot Encoding to encode types and Ordinal encoding to encode genres.
types = anime.loc[:, 'type'].str.get_dummies()
genres = anime.loc[:, 'genre'].str.get_dummies('|')

### Construct PyG `HeteroData`

In [18]:
# Define Heterogeneous graph
data = HeteroData()

# Add users and anime to our graph
data['user'].node_id = torch.arange(len(unique_user_id))
data['anime'].node_id = torch.arange(len(edge_index_user_to_anime[1].unique()))

# Add node features and edge indices
data['anime'].genre = genres.values
data['anime'].type = types.values

# Add edges and edge attributes
data['user', 'rates', 'anime'].edge_index = edge_index_user_to_anime
data['user', 'rates', 'anime'].rating = torch.from_numpy(rating.loc[:, 'rating'].values).to(torch.int64)

# Transform data to undirected graph
data = T.ToUndirected()(data)

In [19]:
data

HeteroData(
  user={ node_id=[73515] },
  anime={
    node_id=[11162],
    genre=[12017, 43],
    type=[12017, 6],
  },
  (user, rates, anime)={
    edge_index=[2, 7813611],
    rating=[7813611],
  },
  (anime, rev_rates, user)={
    edge_index=[2, 7813611],
    rating=[7813611],
  }
)

### Split `data` into 3 subgraphs
Note: We will not create any negative samples here because 

In [54]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2,
    add_negative_train_samples=True,
    edge_types=("user", "rates", "anime"),
    rev_edge_types=("anime", "rev_rates", "user"),
)

train_data, val_data, test_data = transform(data)

In [55]:
print(f"Train data:\n{train_data}\n")
print(f"Val data:\n{val_data}")

Train data:
HeteroData(
  user={ node_id=[73515] },
  anime={
    node_id=[11162],
    genre=[12017, 43],
    type=[12017, 6],
  },
  (user, rates, anime)={
    edge_index=[2, 4375623],
    rating=[4375623],
    edge_label=[5625798],
    edge_label_index=[2, 5625798],
  },
  (anime, rev_rates, user)={
    edge_index=[2, 4375623],
    rating=[4375623],
  }
)

Val data:
HeteroData(
  user={ node_id=[73515] },
  anime={
    node_id=[11162],
    genre=[12017, 43],
    type=[12017, 6],
  },
  (user, rates, anime)={
    edge_index=[2, 6250889],
    rating=[6250889],
    edge_label=[2344083],
    edge_label_index=[2, 2344083],
  },
  (anime, rev_rates, user)={
    edge_index=[2, 6250889],
    rating=[6250889],
  }
)


In [102]:
num_user_nodes = train_data['user'].num_nodes
num_anime_nodes = train_data['anime'].num_nodes

num_rates_edges = train_data['user', 'rates', 'anime'].edge_index.size(1)
num_rev_rates_edges = train_data['anime', 'rev_rates', 'user'].edge_index.size(1)

max_possible_rates_edges = num_user_nodes * num_anime_nodes
max_possible_rev_rates_edges = num_anime_nodes * num_user_nodes

rates_edge_density = num_rates_edges / max_possible_rates_edges
rev_rates_edge_density = num_rev_rates_edges / max_possible_rev_rates_edges

print(f"Number of user nodes: {num_user_nodes}")
print(f"Number of anime nodes: {num_anime_nodes}")
print(f"Number of 'rates' edges: {num_rates_edges}")
print(f"Number of 'rev_rates' edges: {num_rev_rates_edges}")
print(f"Maximum possible 'rates' edges: {max_possible_rates_edges}")
print(f"Maximum possible 'rev_rates' edges: {max_possible_rev_rates_edges}")
print(f"'rates' edge density: {rates_edge_density:.6f}")
print(f"'rev_rates' edge density: {rev_rates_edge_density:.6f}")

Number of user nodes: 73515
Number of anime nodes: 11162
Number of 'rates' edges: 4375623
Number of 'rev_rates' edges: 4375623
Maximum possible 'rates' edges: 820574430
Maximum possible 'rev_rates' edges: 820574430
'rates' edge density: 0.005332
'rev_rates' edge density: 0.005332


In [56]:
# Assuming train_data is your HeteroData object
positive_edges_count = (train_data['user', 'rates', 'anime'].edge_label == 1).sum().item()
negative_edges_count = (train_data['user', 'rates', 'anime'].edge_label == 0).sum().item()

print(f"Number of positive edges: {positive_edges_count}")
print(f"Number of negative edges: {negative_edges_count}")

Number of positive edges: 1875266
Number of negative edges: 3750532


In [42]:
train_data['user', 'rates', 'anime']

{'edge_index': tensor([[ 2118, 72934, 16458,  ..., 68155, 73194, 67633],
        [ 7134,  1822,   404,  ...,   185,   804,  7531]]), 'rating': tensor([10, -1,  9,  ..., -1,  9,  7]), 'edge_label': tensor([1., 1., 1.,  ..., 1., 1., 1.]), 'edge_label_index': tensor([[27517, 46448, 34290,  ..., 17950, 34590, 41831],
        [ 8877,  1393,  6069,  ...,  5242,  3408,   890]])}

In [110]:
# Define train, val, and test loaders
# During training, we want to sample negative edges on-the-fly with
# a ratio of 2:1.
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=(5, 3),
    neg_sampling_ratio=2,
    edge_label_index=(("user", "rates", "anime"), train_data['user', 'rates', 'anime'].edge_label_index),
    edge_label=train_data['user', 'rates', 'anime'].edge_label,
    batch_size=32,
    shuffle=True,
)

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=(5, 3),
    neg_sampling_ratio=2,
    edge_label_index=(("user", "rates", "anime"), val_data['user', 'rates', 'anime'].edge_label_index),
    edge_label=val_data['user', 'rates', 'anime'].edge_label,
    batch_size=32,
    shuffle=False,
)

test_loader = LinkNeighborLoader(
    data=test_data,
    num_neighbors=(5, 3),
    neg_sampling_ratio=2,
    edge_label_index=(("user", "rates", "anime"), test_data['user', 'rates', 'anime'].edge_label_index),
    edge_label=test_data['user', 'rates', 'anime'].edge_label,
    batch_size=32,
    shuffle=False,
)

Our graph is too sparse. Even reducing the number of neighbors and batchs does not help us. Look at the next line of code.

As you can see, each time you run the next line, you may get indexes of some batches until you face an error.

In [108]:
for idx, batch in enumerate(train_loader):
    print(idx)

AssertionError: 