# Anime Recommendation Using MAL Database 2020 Dataset and Graph Machine Learning
Author: Zach Kangas

Date: 2/14/2023

Class: Graph Machine Learning

Professor: Dr. Urbain

# Importing Relevant Packages

In [4]:
import torch

def format_pytorch_version(version):
  """Return the release part of a PyTorch version string.

  Everything from the first '+' onward (the local build tag, e.g. '+cu116')
  is dropped; a string that contains no '+' is returned unchanged.
  """
  release, _, _ = version.partition('+')
  return release

TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  """Build the wheel-index suffix for a CUDA version string.

  Args:
    version: CUDA version such as '11.6', or None when the installed torch
      build has no CUDA support (that is what `torch.version.cuda` holds on
      CPU-only builds).

  Returns:
    'cu' followed by the version with its dots removed (e.g. 'cu116'), or
    'cpu' when `version` is None.
  """
  if version is None:
    # CPU-only torch build: there is no version string to reformat.
    return 'cpu'
  return 'cu' + version.replace('.', '')

CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

!pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric 
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/torch-1.13.1+cu116.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/torch-1.13.1+cu116.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/torch-1.13.1+cu116.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/torch-1.13.1+cu116.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch import Tensor
from torch_geometric.nn.conv import HeteroConv, GCNConv, SAGEConv, GATConv, HEATConv, GATConv
from torch_geometric.nn import Linear, to_hetero
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
from google.colab import drive

In [6]:
drive.mount('/content/drive')
base_path = "drive/My Drive/Colab Notebooks/"

Mounted at /content/drive


In [7]:
PORTION_OF_RATINGS_TO_USE = 1/10

# Processing and Loading the Data

In [8]:
anime_dataset_path = base_path + "dataset/anime.csv"
anime_df = pd.read_csv(anime_dataset_path)
anime_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [9]:
ratings_dataset_path = base_path + "dataset/rating_complete.csv"
ratings_df = pd.read_csv(ratings_dataset_path)
ratings_df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


In [10]:
# Keep only the rows whose rating is non-zero.
# NOTE(review): despite the variable name, this filter removes ratings equal to
# 0, not negative ones — confirm which was intended. The filtered frame is not
# referenced by any later cell visible in this notebook.
ratings_df_without_negative_ratings = ratings_df.loc[ratings_df.rating != 0]
ratings_df_without_negative_ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


# Creating the graph

## Setting up Node Utilities

In [11]:
def load_node_csv(path, index_col, encoders=None, portion=0, **kwargs):
    """Read a node CSV and build a consecutive-id mapping plus optional features.

    Args:
        path: CSV file to read.
        index_col: Column whose values identify the nodes; it becomes the
            frame's index.
        encoders: Optional dict mapping a column name to a callable. Each
            callable receives that column (a pandas Series) and must return a
            tensor; the results are concatenated along the last dimension.
        portion: Fraction of the file's rows to keep, counted from the top.
            0 (the default) keeps every row.
        **kwargs: Forwarded to `pd.read_csv`.

    Returns:
        (x, mapping): `x` is the concatenated feature tensor, or None when no
        encoders are given. `mapping` assigns each distinct index value a
        consecutive integer in order of first appearance.
    """
    df = pd.read_csv(path, index_col=index_col, **kwargs)
    if portion != 0:
        # Size the slice from the frame that was just read rather than from the
        # notebook-global `ratings_df`, so the helper works for any CSV and
        # does not depend on which cells ran before it.
        df = df.head(int(len(df) * portion))
    mapping = {index: i for i, index in enumerate(df.index.unique())}

    x = None
    if encoders is not None:
        xs = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(xs, dim=-1)

    return x, mapping

In [12]:
class SequenceEncoder:
    """Encodes a column of strings with a SentenceTransformer model.

    The model named by `model_name` is created once in the constructor on the
    requested `device`; calling the encoder returns the embeddings as a CPU
    tensor.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    def __call__(self, df):
        # Gradient tracking is disabled for the whole call.
        with torch.no_grad():
            embeddings = self.model.encode(
                df.values,
                show_progress_bar=True,
                convert_to_tensor=True,
                device=self.device,
            )
            return embeddings.cpu()

In [13]:
class GenresEncoder(object):
    """Multi-hot encodes a column of separator-delimited genre strings.

    Args:
        sep: Delimiter between genres inside one cell.
        strip: When True, surrounding whitespace is removed from every genre
            token. Defaults to False, which keeps tokens exactly as
            `str.split(sep)` yields them.

    NOTE: with sep=',' a cell such as "Action, Drama" yields the tokens
    "Action" and " Drama" (leading space kept), so "Drama" and " Drama" get
    separate columns. Passing strip=True merges them, but it also reduces the
    number of output columns, so any layer sized to the encoded feature width
    has to be adjusted accordingly.
    """

    def __init__(self, sep=',', strip=False):
        self.sep = sep
        self.strip = strip

    def _tokens(self, cell):
        """Split one cell into its genre tokens, optionally whitespace-stripped."""
        parts = cell.split(self.sep)
        if self.strip:
            return [part.strip() for part in parts]
        return parts

    def __call__(self, df):
        """Return a (len(df), n_genres) tensor with 1 where a row has the genre."""
        rows = [self._tokens(cell) for cell in df.values]
        # Sort the vocabulary so a genre always maps to the same column.
        # Iterating a raw set of strings gives an order that can differ from
        # one interpreter session to the next.
        vocabulary = sorted({genre for row in rows for genre in row})
        mapping = {genre: i for i, genre in enumerate(vocabulary)}

        x = torch.zeros(len(df), len(mapping))
        for i, row in enumerate(rows):
            for genre in row:
                x[i, mapping[genre]] = 1
        return x

## Creating Mapping

In [14]:
anime_x, anime_mapping = load_node_csv(anime_dataset_path, index_col='MAL_ID', encoders={'Name': SequenceEncoder(),
                                                                                         'Genres': GenresEncoder()
                                                                                         })

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/549 [00:00<?, ?it/s]

In [15]:
# Display the shape of the encoded anime feature matrix.
anime_x.shape

torch.Size([17562, 468])

In [16]:
user_ratings_x, user_mapping = load_node_csv(ratings_dataset_path, index_col='user_id', portion=PORTION_OF_RATINGS_TO_USE)

In [17]:
## Creating Base Data Object

In [18]:
# Container for the heterogeneous user/anime graph.
data = HeteroData()

# Users get one consecutive id per entry of `user_mapping`.
data["user"].node_id = torch.arange(len(user_mapping))
data['user'].num_nodes = len(user_mapping)  # Users do not have any features.
# NOTE(review): anime ids are sized from `anime_df`, whereas edges are indexed
# through `anime_mapping`; presumably both have one entry per anime row — confirm.
data['anime'].node_id = torch.arange(len(anime_df))
data['anime'].x = anime_x

print(data)


HeteroData(
  [1muser[0m={
    node_id=[30954],
    num_nodes=30954
  },
  [1manime[0m={
    node_id=[17562],
    x=[17562, 468]
  }
)


In [19]:
## Setting up Edge Utilities

In [20]:
def load_edge_csv(path, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, portion=0, **kwargs):
    """Read an edge CSV and translate its endpoints into consecutive node ids.

    Args:
        path: CSV file to read.
        src_index_col: Column holding the source node identifiers.
        src_mapping: Dict from source identifier to consecutive node id.
        dst_index_col: Column holding the destination node identifiers.
        dst_mapping: Dict from destination identifier to consecutive node id.
        encoders: Optional dict mapping a column name to a callable. Each
            callable receives that column (a pandas Series) and must return a
            tensor; the results are concatenated along the last dimension.
        portion: Fraction of the file's rows to keep, counted from the top.
            0 (the default) keeps every row.
        **kwargs: Forwarded to `pd.read_csv`.

    Returns:
        (edge_idx, edge_attr): `edge_idx` is an integer tensor with one row of
        source ids and one row of destination ids; `edge_attr` is the
        concatenated encoder output, or None when no encoders are given.

    Raises:
        KeyError: if an identifier in the CSV is missing from its mapping.
    """
    df = pd.read_csv(path, **kwargs)
    if portion != 0:
        # Size the slice from the frame that was just read rather than from the
        # notebook-global `ratings_df`, so the helper works for any CSV and
        # does not depend on which cells ran before it.
        df = df.head(int(len(df) * portion))
    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    # Explicit dtype keeps the index tensor integral even when there are no rows.
    edge_idx = torch.tensor([src, dst], dtype=torch.long)

    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_idx, edge_attr

In [21]:
class IdentityEncoder:
    """Converts a numeric column into a single-column tensor.

    The column's values are reshaped to (n_rows, 1) and cast to the `dtype`
    given at construction.
    """

    def __init__(self, dtype=None):
        self.dtype = dtype

    def __call__(self, df):
        raw = torch.from_numpy(df.values)
        as_column = raw.view(-1, 1)
        return as_column.to(self.dtype)

In [22]:
## Creating Edges

In [23]:
edge_index, edge_label = load_edge_csv(
    ratings_dataset_path,
    src_index_col='user_id',
    src_mapping=user_mapping,
    dst_index_col='anime_id',
    dst_mapping=anime_mapping,
    portion=PORTION_OF_RATINGS_TO_USE,
    encoders={'rating': IdentityEncoder(dtype=torch.long)},
)

In [24]:
edge_index.shape

torch.Size([2, 5763327])

In [25]:
edge_label.shape

torch.Size([5763327, 1])

In [26]:
assert edge_index.shape[1] == edge_label.shape[0]

In [27]:
## Creating Labeled Graph then Reversing Edges (ToUndirected)

In [28]:
# The rating values are not attached as edge labels (the assignment below is
# disabled); only the edge structure is stored on the 'rates' relation.
#data['user', 'rates', 'anime'].edge_label = edge_label
data['user', 'rates', 'anime'].edge_index = edge_index
# Adds the reverse relation; the printed summary lists it as
# (anime, rev_rates, user).
data = T.ToUndirected()(data)

print(data)

HeteroData(
  [1muser[0m={
    node_id=[30954],
    num_nodes=30954
  },
  [1manime[0m={
    node_id=[17562],
    x=[17562, 468]
  },
  [1m(user, rates, anime)[0m={ edge_index=[2, 5763327] },
  [1m(anime, rev_rates, user)[0m={ edge_index=[2, 5763327] }
)


In [29]:
data['anime'].x.shape

torch.Size([17562, 468])

In [30]:
## Processing the data for pipeline

In [31]:
# Edge-level split of the ('user', 'rates', 'anime') relation for link
# prediction: num_test=0.2 requests a 20% test share and num_val=0.0 requests
# no validation share. Negative samples are requested for the training split
# (add_negative_train_samples=True, neg_sampling_ratio=1.0).
transform = T.Compose([
    T.RandomLinkSplit(num_test=0.2,
                      num_val=0.0,
                      is_undirected=True,
                      edge_types=[("user", "rates", "anime")],
                      add_negative_train_samples=True,
                      neg_sampling_ratio=1.0,
                      rev_edge_types=[("anime", "rev_rates", "user")],
                      disjoint_train_ratio=0.3),
])
#data.anime_node_mask = torch.ones(data['anime'].x.shape[0], dtype=torch.long)
#data.user_node_mask = torch.ones(data['user'].num_nodes, dtype=torch.long)
# NOTE(review): `val` is unpacked here but not used by any later cell visible
# in this notebook; the "validation" loader further down is built from `test`.
train, val, test = transform(data)

In [32]:
print(train)
print(test)

HeteroData(
  [1muser[0m={
    node_id=[30954],
    num_nodes=30954
  },
  [1manime[0m={
    node_id=[17562],
    x=[17562, 468]
  },
  [1m(user, rates, anime)[0m={
    edge_index=[2, 3227464],
    edge_label=[2766396],
    edge_label_index=[2, 2766396]
  },
  [1m(anime, rev_rates, user)[0m={ edge_index=[2, 3227464] }
)
HeteroData(
  [1muser[0m={
    node_id=[30954],
    num_nodes=30954
  },
  [1manime[0m={
    node_id=[17562],
    x=[17562, 468]
  },
  [1m(user, rates, anime)[0m={
    edge_index=[2, 4610662],
    edge_label=[2305330],
    edge_label_index=[2, 2305330]
  },
  [1m(anime, rev_rates, user)[0m={ edge_index=[2, 4610662] }
)


## Creating batch loader for training

In [33]:
from torch_geometric.loader import NeighborLoader, LinkNeighborLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Number of supervision edges per mini-batch. Larger batches make an epoch
# faster but need more memory. This variable is what the loader actually uses;
# it is kept at 128, the value the loader was previously hard-coded to.
batch_size = 128
edge_label_index = train["user", "rates", "anime"].edge_label_index
edge_label = train["user", "rates", "anime"].edge_label
# Mini-batch loader over the training supervision edges: samples up to 20
# first-hop and 10 second-hop neighbours per node, and requests extra negative
# edges per batch via neg_sampling_ratio=2.0.
train_loader = LinkNeighborLoader(
    data=train,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("user", "rates", "anime"), edge_label_index),
    edge_label=edge_label,
    batch_size=batch_size,
    shuffle=True,
)

# Creating the Model
## Model Setup

In [34]:
class GNN(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU between them.

    Both layers map `hidden_channels` inputs to `hidden_channels` outputs.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        # First layer followed by the non-linearity.
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        # Second layer; its output is returned without an activation.
        return self.conv2(hidden, edge_index)


# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:
class Classifier(torch.nn.Module):
    """Scores each candidate (user, anime) pair by the dot product of its embeddings."""

    def forward(self, x_user: Tensor, x_anime: Tensor, edge_label_index: Tensor) -> Tensor:
        # Row 0 of the index selects users, row 1 selects anime.
        user_rows = edge_label_index[0]
        anime_rows = edge_label_index[1]
        # Element-wise product of the paired embeddings, summed over the
        # feature dimension, gives one score per supervision edge.
        paired = x_user[user_rows] * x_anime[anime_rows]
        return paired.sum(dim=-1)


class Model(torch.nn.Module):
    """Link-prediction model over the user/anime graph.

    Users are represented by a learned embedding. Anime are represented by a
    linear projection of their feature matrix plus a learned embedding. A
    heterogeneous GNN refines both, and a dot-product classifier scores the
    supervision edges.

    Args:
        hidden_channels: Width of every embedding and GNN layer.

    The constructor reads the notebook-level `data` graph for the node counts,
    the anime feature width and the graph metadata.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Read the anime feature width from the graph instead of hard-coding
        # it, so a change in the feature encoders cannot desynchronise the
        # projection layer from the data.
        anime_in_channels = data["anime"].x.size(-1)
        self.anime_lin = torch.nn.Linear(anime_in_channels, hidden_channels)
        # Learned per-node embeddings for users and anime:
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.anime_emb = torch.nn.Embedding(data["anime"].num_nodes, hidden_channels)
        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)
        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())
        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        """Return one prediction per entry of the batch's `edge_label_index`."""
        x_dict = {
            "user": self.user_emb(data["user"].node_id),
            "anime": self.anime_lin(data["anime"].x) + self.anime_emb(data["anime"].node_id),
        }
        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["user"],
            x_dict["anime"],
            data["user", "rates", "anime"].edge_label_index,
        )
        return pred
        

## Training the Model

In [36]:
import tqdm
import torch.nn.functional as F

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")
model = Model(hidden_channels=64)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Two passes over the training loader.
for epoch in range(0, 2):
    # Running sums used to report the per-example mean loss of the epoch.
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data["user", "rates", "anime"].edge_label
        # Mean squared error between the dot-product score and the edge label.
        loss = F.mse_loss(pred, ground_truth)
        loss.backward()
        optimizer.step()
        # Weight the batch mean by its size so the epoch figure is a true mean.
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cuda'


100%|██████████| 21613/21613 [39:48<00:00,  9.05it/s]


Epoch: 000, Loss: 0.3461


100%|██████████| 21613/21613 [39:33<00:00,  9.10it/s]

Epoch: 001, Loss: 0.3168





## Validating the Model

### Creating Validation Loader from Test dataset

In [37]:
# Define the validation seed edges:
# (they are taken from the `test` split; no separate validation split is used)
edge_label_index = test["user", "rates", "anime"].edge_label_index
edge_label = test["user", "rates", "anime"].edge_label
# Same neighbour fan-out as the training loader, but no shuffling and no
# neg_sampling_ratio argument.
val_loader = LinkNeighborLoader(
    data=test,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "anime"), edge_label_index),
    edge_label=edge_label,
    batch_size=8192,
    shuffle=False,
)
# NOTE(review): this draws a single batch; the evaluation loop in the next cell
# rebinds `sampled_data`, so the value looks unused — confirm it can be dropped.
sampled_data = next(iter(val_loader))

### AUC and MSE

In [38]:
from sklearn.metrics import roc_auc_score, mean_squared_error
# Collect the model's predictions and the matching labels batch by batch.
preds = []
ground_truths = []
for sampled_data in tqdm.tqdm(val_loader):
    # No gradients are needed while scoring.
    with torch.no_grad():
        sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user", "rates", "anime"].edge_label)
# Concatenate all batches and move them to NumPy for scikit-learn.
pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
mse = mean_squared_error(ground_truth, pred)
print()
print(f"Validation AUC: {auc:.4f}")
print(f"Validation MSE/l2: {mse:.4f}")

100%|██████████| 282/282 [02:19<00:00,  2.02it/s]



Validation AUC: 0.9685
Validation MSE/l2: 0.2752
