In [120]:
import torch_geometric
from torch_geometric.data import Data, HeteroData
from torch_geometric.loader import DataLoader, LinkNeighborLoader, NeighborLoader
import torch_geometric.transforms as T
import torch

import networkx as nx

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [22]:
anime = pd.read_csv("../data/anime.csv").sort_values(by=['anime_id']).reset_index(drop=True)
rating = pd.read_csv("../data/rating.csv")

anime['genre'] = anime['genre'].str.split(', ')
#anime = anime.dropna(axis=1)
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,1,Cowboy Bebop,"[Action, Adventure, Comedy, Drama, Sci-Fi, Space]",TV,26,8.82,486824
1,5,Cowboy Bebop: Tengoku no Tobira,"[Action, Drama, Mystery, Sci-Fi, Space]",Movie,1,8.4,137636
2,6,Trigun,"[Action, Comedy, Sci-Fi]",TV,26,8.32,283069
3,7,Witch Hunter Robin,"[Action, Drama, Magic, Mystery, Police, Supern...",TV,26,7.36,64905
4,8,Beet the Vandel Buster,"[Adventure, Fantasy, Shounen, Supernatural]",TV,52,7.06,9848


In [23]:
anime.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [24]:
nan_anime_id = anime.loc[anime.isna().any(axis=1), 'anime_id'].values
nan_anime_id.shape

(277,)

In [25]:
# Remove NaN values from both DataFrames
anime = anime.dropna()
rating = rating.loc[~rating.loc[:, 'anime_id'].isin(nan_anime_id), :]

In [26]:
# Remove animes that are in `rating`, but not exist in `anime`
rating = rating.loc[rating.loc[:, 'anime_id'].isin(anime.loc[:, 'anime_id'].unique()), :]
rating

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [27]:
anime = anime.sort_values(by=['anime_id']).reset_index(drop=True)
anime["type"].unique()

array(['TV', 'Movie', 'OVA', 'Special', 'ONA', 'Music'], dtype=object)

In [28]:
# Create a mapping from unique user indices to range [0, num_user_nodes):
unique_user_id = rating.loc[:, 'user_id'].unique()
unique_user_id = pd.DataFrame(data={
    'user_id': unique_user_id,
    'mapped_id': pd.RangeIndex(len(unique_user_id)),
})
unique_user_id.head()

Unnamed: 0,user_id,mapped_id
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [29]:
unique_anime_id = pd.DataFrame(data={
    'anime_id': anime.loc[:, 'anime_id'],
    'mapped_id': pd.RangeIndex(len(anime)),
})
unique_anime_id.head()

Unnamed: 0,anime_id,mapped_id
0,1,0
1,5,1
2,6,2
3,7,3
4,8,4


In [50]:
mapped_anime_id_rating = pd.merge(unique_anime_id, anime, on='anime_id').loc[:, ['anime_id', 'mapped_id', 'rating']]
mapped_anime_id_rating.head()

Unnamed: 0,anime_id,mapped_id,rating
0,1,0,8.82
1,5,1,8.4
2,6,2,8.32
3,7,3,7.36
4,8,4,7.06


In [10]:
# Perform merge to obtain the edges from users and animes:
ratings_user_id = pd.merge(rating.loc[:, 'user_id'], unique_user_id,
                            left_on='user_id', right_on='user_id', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mapped_id'].values)
ratings_user_id

tensor([    0,     0,     0,  ..., 73513, 73514, 73514])

In [11]:
ratings_anime_id = pd.merge(rating.loc[:, 'anime_id'], unique_anime_id,
                            left_on='anime_id', right_on='anime_id', how='left')
ratings_anime_id = torch.from_numpy(ratings_anime_id.loc[:, 'mapped_id'].values)
ratings_anime_id

tensor([  10,   14,   58,  ..., 8616,  718, 5225])

In [12]:
assert ratings_user_id.shape == ratings_anime_id.shape, (
    'The shape must be the same.'
)

In [13]:
# Constuct `edge_index` for our PyG Dataset
edge_index_user_to_anime = torch.stack([ratings_user_id, ratings_anime_id], dim=0)
edge_index_user_to_anime

tensor([[    0,     0,     0,  ..., 73513, 73514, 73514],
        [   10,    14,    58,  ...,  8616,   718,  5225]])

### Encode features

In [30]:
types = anime["type"].unique()
genres = anime["genre"].explode().unique()

print(f"Unique types: {len(types)}")
print(f"Unique genres: {len(genres)}")

type2id = {t:i for i, t in enumerate(types)}
id2type = {i:t for i, t in enumerate(types)}

genre2id = {g:i for i, g in enumerate(genres)}
id2genre = {i:g for i, g in enumerate(genres)}

Unique types: 6
Unique genres: 43


In [31]:
anime['genre'] = anime.loc[:, 'genre'].apply(func=lambda x: '|'.join(x))
anime.loc[:, 'genre']

0            Action|Adventure|Comedy|Drama|Sci-Fi|Space
1                     Action|Drama|Mystery|Sci-Fi|Space
2                                  Action|Comedy|Sci-Fi
3        Action|Drama|Magic|Mystery|Police|Supernatural
4                Adventure|Fantasy|Shounen|Supernatural
                              ...                      
12012                              Sci-Fi|Slice of Life
12013                                            Comedy
12014                                            Comedy
12015                Action|Adventure|Fantasy|Game|Kids
12016                                            Comedy
Name: genre, Length: 12017, dtype: object

In [32]:
anime.loc[:, 'type']

0             TV
1          Movie
2             TV
3             TV
4             TV
          ...   
12012    Special
12013         TV
12014         TV
12015        ONA
12016    Special
Name: type, Length: 12017, dtype: object

In [62]:
# Use One-Hot Encoding to encode types and Ordinal encoding to encode genres.
types_encoded = anime.loc[:, 'type'].str.get_dummies()
genres_encoded = anime.loc[:, 'genre'].str.get_dummies('|')

In [63]:
# Edge index for 'anime' - 'genre' edges
genres_encoded.head()

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,1,1,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [64]:
genres_encoded['mapped_id'] = mapped_anime_id_rating.loc[:, 'mapped_id']
genres_encoded.head()

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri,mapped_id
0,1,1,0,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,3
4,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,4


In [61]:
def edge_index_anime_to_enc(mapped_df: pd.DataFrame) -> pd.DataFrame:

    # Initialize an empty list to store the results
    result_data = []

    # Iterate through each row of the dataframe
    for index, row in mapped_df.iterrows():
        mapped_id = row['mapped_id']
        # Iterate through each genre column
        for genre_index, genre in enumerate(mapped_df.columns[:-1]):  # Exclude the last column 'mapped_id'
            if row[genre] == 1:
                result_data.append([mapped_id, genre_index])

    # Create a new dataframe from the result data
    result_df = pd.DataFrame(result_data, columns=['mapped_id', 'genre'])

    return result_df

In [68]:
edge_index_anime_to_genre = edge_index_anime_to_enc(genres_encoded)
edge_index_anime_to_genre

Unnamed: 0,mapped_id,genre
0,0,0
1,0,1
2,0,3
3,0,6
4,0,28
...,...,...
35589,12015,1
35590,12015,8
35591,12015,9
35592,12015,15


In [72]:
edge_index_anime_to_genre = torch.from_numpy(edge_index_anime_to_genre.T.values).to(torch.int64)
edge_index_anime_to_genre

tensor([[    0,     0,     0,  ..., 12015, 12015, 12016],
        [    0,     1,     3,  ...,     9,    15,     3]])

In [73]:
edge_index_anime_to_genre.shape

torch.Size([2, 35594])

In [66]:
# Do the same with 'types'
types_encoded['mapped_id'] = mapped_anime_id_rating.loc[:, 'mapped_id']
types_encoded.head()

Unnamed: 0,Movie,Music,ONA,OVA,Special,TV,mapped_id
0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,1
2,0,0,0,0,0,1,2
3,0,0,0,0,0,1,3
4,0,0,0,0,0,1,4


In [69]:
edge_index_anime_to_type = edge_index_anime_to_enc(types_encoded)
edge_index_anime_to_type

Unnamed: 0,mapped_id,genre
0,0,5
1,1,0
2,2,5
3,3,5
4,4,5
...,...,...
12012,12012,4
12013,12013,5
12014,12014,5
12015,12015,2


In [74]:
edge_index_anime_to_type = torch.from_numpy(edge_index_anime_to_type.T.values).to(torch.int64)
edge_index_anime_to_type

tensor([[    0,     1,     2,  ..., 12014, 12015, 12016],
        [    5,     0,     5,  ...,     5,     2,     4]])

In [75]:
edge_index_anime_to_type.shape

torch.Size([2, 12017])

### Construct PyG `HeteroData`

In [88]:
# Define Heterogeneous graph
data = HeteroData()

# Add users and anime to our graph
data['user'].node_id = torch.arange(len(unique_user_id))
data['anime'].node_id = torch.arange(len(edge_index_user_to_anime[1].unique()))

# Add genres and types to our graph
data['genre'].node_id = torch.arange(len(genres))
data['type'].node_id = torch.arange(len(types))

# Add node features for 'anime'
data['anime'].mean_rating = torch.from_numpy(mapped_anime_id_rating.loc[:, 'rating'].values).to(torch.float64)

# Add edges and edge attributes
data['user', 'rates', 'anime'].edge_index = edge_index_user_to_anime
data['user', 'rates', 'anime'].rating = torch.from_numpy(rating.loc[:, 'rating'].values).to(torch.int64)

data['anime', 'has_genres', 'genre'].edge_index = edge_index_anime_to_genre
data['anime', 'has_type', 'type'].edge_index = edge_index_anime_to_type

# Transform data to undirected graph
data = T.ToUndirected()(data)

In [89]:
data

HeteroData(
  user={ node_id=[73515] },
  anime={
    node_id=[11162],
    mean_rating=[12017],
  },
  genre={ node_id=[43] },
  type={ node_id=[6] },
  (user, rates, anime)={
    edge_index=[2, 7813611],
    rating=[7813611],
  },
  (anime, has_genres, genre)={ edge_index=[2, 35594] },
  (anime, has_type, type)={ edge_index=[2, 12017] },
  (anime, rev_rates, user)={
    edge_index=[2, 7813611],
    rating=[7813611],
  },
  (genre, rev_has_genres, anime)={ edge_index=[2, 35594] },
  (type, rev_has_type, anime)={ edge_index=[2, 12017] }
)

### Split `data` into 3 subgraphs
Note: We will not create any negative samples here because 

In [112]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    is_undirected=True,
    disjoint_train_ratio=0.0,
    neg_sampling_ratio=2,
    add_negative_train_samples=False,
    edge_types=[("user", "rates", "anime")],
    rev_edge_types=[("anime", "rev_rates", "user")],
)

train_data, val_data, test_data = transform(data)

In [113]:
print(f"Train data:\n{train_data}\n")
print(f"Val data:\n{val_data}")

Train data:
HeteroData(
  user={ node_id=[73515] },
  anime={
    node_id=[11162],
    mean_rating=[12017],
  },
  genre={ node_id=[43] },
  type={ node_id=[6] },
  (user, rates, anime)={
    edge_index=[2, 6250889],
    rating=[6250889],
    edge_label=[6250889],
    edge_label_index=[2, 6250889],
  },
  (anime, has_genres, genre)={ edge_index=[2, 35594] },
  (anime, has_type, type)={ edge_index=[2, 12017] },
  (anime, rev_rates, user)={
    edge_index=[2, 6250889],
    rating=[6250889],
  },
  (genre, rev_has_genres, anime)={ edge_index=[2, 35594] },
  (type, rev_has_type, anime)={ edge_index=[2, 12017] }
)

Val data:
HeteroData(
  user={ node_id=[73515] },
  anime={
    node_id=[11162],
    mean_rating=[12017],
  },
  genre={ node_id=[43] },
  type={ node_id=[6] },
  (user, rates, anime)={
    edge_index=[2, 6250889],
    rating=[6250889],
    edge_label=[2344083],
    edge_label_index=[2, 2344083],
  },
  (anime, has_genres, genre)={ edge_index=[2, 35594] },
  (anime, has_type, typ

In [114]:
num_user_nodes = train_data['user'].num_nodes
num_anime_nodes = train_data['anime'].num_nodes

num_rates_edges = train_data['user', 'rates', 'anime'].edge_index.size(1) + train_data['anime', 'has_genres', 'genre'].edge_index.size(1) + train_data['anime', 'has_type', 'type'].edge_index.size(1)
num_rev_rates_edges = train_data['anime', 'rev_rates', 'user'].edge_index.size(1) + train_data['genre', 'rev_has_genres', 'anime'].edge_index.size(1) + train_data['type', 'rev_has_type', 'anime'].edge_index.size(1)

max_possible_rates_edges = num_user_nodes * num_anime_nodes
max_possible_rev_rates_edges = num_anime_nodes * num_user_nodes

rates_edge_density = num_rates_edges / max_possible_rates_edges
rev_rates_edge_density = num_rev_rates_edges / max_possible_rev_rates_edges

print(f"Number of user nodes: {num_user_nodes}")
print(f"Number of anime nodes: {num_anime_nodes}")
print(f"Number of 'rates' edges: {num_rates_edges}")
print(f"Number of 'rev_rates' edges: {num_rev_rates_edges}")
print(f"Maximum possible 'rates' edges: {max_possible_rates_edges}")
print(f"Maximum possible 'rev_rates' edges: {max_possible_rev_rates_edges}")
print(f"'rates' edge density: {rates_edge_density:.6f}")
print(f"'rev_rates' edge density: {rev_rates_edge_density:.6f}")

Number of user nodes: 73515
Number of anime nodes: 11162
Number of 'rates' edges: 6298500
Number of 'rev_rates' edges: 6298500
Maximum possible 'rates' edges: 820574430
Maximum possible 'rev_rates' edges: 820574430
'rates' edge density: 0.007676
'rev_rates' edge density: 0.007676


In [115]:
# Assuming train_data is your HeteroData object
positive_edges_count = (train_data['user', 'rates', 'anime'].edge_label == 1).sum().item()
negative_edges_count = (train_data['user', 'rates', 'anime'].edge_label == 0).sum().item()

print(f"Number of positive edges: {positive_edges_count}")
print(f"Number of negative edges: {negative_edges_count}")

Number of positive edges: 6250889
Number of negative edges: 0


In [140]:
link_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors={
        ('user', 'rates', 'anime'): [5, 3],
        ('anime', 'rev_rates', 'user'): [5, 3],
        ('anime', 'has_genres', 'genre'): [5, 3],
        ('genre', 'rev_has_genres', 'anime'): [5, 3],
        ('anime', 'has_type', 'type'): [5, 3],
        ('type', 'rev_has_type', 'anime'): [5, 3]
    },
    edge_label_index=('user', 'rates', 'anime'),
    edge_label=torch.arange(train_data['user', 'rates', 'anime'].edge_index.size(1)), 
    batch_size=32,
)

In [143]:
for link_batch in link_loader:
    print(link_batch)
    break

HeteroData(
  user={
    node_id=[504],
    n_id=[504],
    num_sampled_nodes=[3],
  },
  anime={
    node_id=[438],
    mean_rating=[12017],
    n_id=[438],
    num_sampled_nodes=[3],
  },
  genre={
    node_id=[40],
    n_id=[40],
    num_sampled_nodes=[3],
  },
  type={
    node_id=[5],
    n_id=[5],
    num_sampled_nodes=[3],
  },
  (user, rates, anime)={
    edge_index=[2, 520],
    rating=[520],
    edge_label=[32],
    edge_label_index=[2, 32],
    e_id=[520],
    num_sampled_edges=[2],
    input_id=[32],
  },
  (anime, has_genres, genre)={
    edge_index=[2, 90],
    e_id=[90],
    num_sampled_edges=[2],
  },
  (anime, has_type, type)={
    edge_index=[2, 15],
    e_id=[15],
    num_sampled_edges=[2],
  },
  (anime, rev_rates, user)={
    edge_index=[2, 565],
    rating=[565],
    e_id=[565],
    num_sampled_edges=[2],
  },
  (genre, rev_has_genres, anime)={
    edge_index=[2, 482],
    e_id=[482],
    num_sampled_edges=[2],
  },
  (type, rev_has_type, anime)={
    edge_index=[