# Measurement of benchmarks and comparison with SOTA 

In [2]:
from cool_graph.datasets import *
from cool_graph.train.metrics import calc_metrics
from cool_graph.runners import Runner, HypeRunner

import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
import os.path as osp

In [4]:
# Benchmark identifiers, in the order they are loaded and evaluated.
# Entries containing a space are read as "<dataset class> <dataset name>" by the
# loading cell; single-word entries are used directly as the dataset class name.
names = [
    'AntiFraud Amazon',
    'AntiFraud YelpChi',
    'Multitarget 10k',
    'Multitarget 50k',
    'NonHomophilous Penn94',
    'NonHomophilous Genius',
    'S_FFSD',
    'OgbnProteins',
]

The Genius dataset takes quite a long time to process, but it reaches good results quickly. <br>
The OgbnProteins dataset takes a very long time to process.

In [5]:
# Per-dataset configuration override strings, handed to HypeRunner through its
# `overrides` argument. An empty list means nothing is overridden; for Genius and
# OgbnProteins `training.n_epochs` is set to 20.
overrides = {
    'AntiFraud Amazon': [],
    'AntiFraud YelpChi': [],
    'Multitarget 10k': [],
    'Multitarget 50k': [],
    'NonHomophilous Penn94': [],
    'NonHomophilous Genius': ['training.n_epochs=20'],
    'S_FFSD': [],
    'OgbnProteins': ['training.n_epochs=20'],
}

Processing time differs from dataset to dataset, so the number of trials differs as well

In [6]:
# Number of trials passed to `runner.optimize_run(n_trials=...)` for each dataset.
n_trials = {
    'AntiFraud Amazon': 50,
    'AntiFraud YelpChi': 20,
    'Multitarget 10k': 50,
    'Multitarget 50k': 50,
    'NonHomophilous Penn94': 40,
    'NonHomophilous Genius': 20,
    'S_FFSD': 40,
    'OgbnProteins': 3,
}

The metrics are taken from paperswithcode.com

In [7]:
# Metric handed to HypeRunner as `main_metric` for each dataset.
main_metrics = {
    'AntiFraud Amazon': 'roc_auc',
    'AntiFraud YelpChi': 'roc_auc',
    'Multitarget 10k': 'roc_auc',
    'Multitarget 50k': 'roc_auc',
    'NonHomophilous Penn94': 'accuracy',
    'NonHomophilous Genius': 'accuracy',
    'S_FFSD': 'accuracy',
    'OgbnProteins': 'roc_auc',
}

In [8]:
# Base directory for the benchmark datasets, relative to the working directory.
root = './data/benchmarks'

In [9]:
# Instantiate every benchmark dataset and keep its `.data` object, keyed by the
# benchmark identifier. Dataset classes are looked up by name in the notebook
# namespace (they come from the star import of cool_graph.datasets).
all_data = {}
for dataname in names:
    parts = dataname.split(' ')
    if len(parts) > 1:
        # "<class> <name>": the class takes the shared root plus a dataset name.
        dataset, name = parts
        class_name = dataset
        loader_kwargs = {'root': root, 'name': name}
    else:
        # Single-word identifier: the class gets its own lower-cased sub-folder.
        folder = dataname.lower()
        class_name = dataname
        loader_kwargs = {'root': osp.join(root, folder)}
    all_data[dataname] = globals()[class_name](**loader_kwargs).data
    

Using existing file ./data/benchmarks/amazon/Amazon_data.pt
Using existing file ./data/benchmarks/yelpchi/YelpChi_data.pt
Using existing file ./data/benchmarks/10k/10k_data.pt
Using existing file ./data/benchmarks/50k/50k_data.pt
Using existing file ./data/benchmarks/penn94/Penn94_data.pt
Using existing file ./data/benchmarks/genius/Genius_data.pt
Using existing file ./data/benchmarks/s_ffsd/S-FFSD_data.pt
Using existing file ./data/benchmarks/ogbnproteins/ogbn-proteins_data.pt


In the Penn94 dataset, all features are categorical

In [10]:
# Columns 0-5 of the Penn94 feature matrix become the categorical features
# (`x_cat`), and `x` is narrowed to column 0 only. Both slices are taken from the
# original `x` before either attribute is assigned. Because `x` is overwritten,
# re-running this cell would slice the already-narrowed matrix.
penn94 = all_data['NonHomophilous Penn94']
penn94.x_cat, penn94.x = penn94.x[:, 0:6], penn94.x[:, 0:1]

In the Genius dataset, some features are categorical

In [11]:
# Columns 7-11 of the Genius feature matrix become the categorical features
# (`x_cat`); columns 0-6 stay in `x`. Both slices are taken from the original `x`
# before either attribute is assigned. Because `x` is overwritten, re-running
# this cell would slice the already-narrowed matrix.
genius = all_data['NonHomophilous Genius']
genius.x_cat, genius.x = genius.x[:, 7:12], genius.x[:, 0:7]

Process only the homogeneous part of Multitarget datasets

In [12]:
# Keep only the homogeneous part of the heterogeneous graph: the 'node_1' store
# and the ('node_1', 'to', 'node_1') edge store are merged into one plain Data.
hetero_data = all_data['Multitarget 10k']
all_data['Multitarget 10k'] = Data(**hetero_data['node_1'], **hetero_data[('node_1', 'to', 'node_1')])
# Reduce the label matrix to one label per node: the row-wise maximum over dim 1.
# `.max(dim=1).values` already yields a new tensor, so it is assigned directly;
# wrapping it in torch.tensor() only added a redundant copy and a
# copy-construct UserWarning.
all_data['Multitarget 10k'].y = all_data['Multitarget 10k'].y.max(dim=1).values

In [13]:
# Keep only the homogeneous part of the heterogeneous graph: the 'node_1' store
# and the ('node_1', 'to', 'node_1') edge store are merged into one plain Data.
hetero_data = all_data['Multitarget 50k']
all_data['Multitarget 50k'] = Data(**hetero_data['node_1'], **hetero_data[('node_1', 'to', 'node_1')])
# Reduce the label matrix to one label per node: the row-wise maximum over dim 1.
# `.max(dim=1).values` already yields a new tensor, so it is assigned directly;
# wrapping it in torch.tensor() only added a redundant copy and a
# copy-construct UserWarning.
all_data['Multitarget 50k'].y = all_data['Multitarget 50k'].y.max(dim=1).values

In [14]:
# Metric names handed to calc_metrics when scoring the test predictions.
metrics = ['accuracy', 'roc_auc']

Divide the node indices into train, validation and test sets with proportions 0.6, 0.2 and 0.2

In [15]:
def train_val_test_split(data, seed=None):
    """Split the node indices of ``data`` into train, validation and test lists.

    The indices ``0 .. data.x.shape[0] - 1`` are first divided 60/40; the 40 %
    remainder is then halved, giving 20 % validation and 20 % test.

    Args:
        data: object whose ``x`` attribute has one row per node.
        seed: forwarded as ``random_state`` to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_idx, valid_idx, test_idx)``.
    """
    node_ids = [*range(data.x.shape[0])]
    # First cut: 60 % train, 40 % held out.
    train_idx, held_out = train_test_split(node_ids, test_size=0.4, random_state=seed)
    # Second cut: the held-out part is shared equally by validation and test.
    valid_idx, test_idx = train_test_split(held_out, test_size=0.5, random_state=seed)
    return train_idx, valid_idx, test_idx

Each dataset is processed using HypeRunner. <br>
The runner receives the train nodes for training and the validation nodes for evaluation during the hyperparameter search, after which the quality is calculated on the held-out test nodes

In [16]:
# Run the hyperparameter search for every dataset and score the result on the
# test split. `results[name]` holds whatever calc_metrics returns for that dataset.
results = {}
seed = 42
for name, data in all_data.items():
    train_idx, valid_idx, test_idx = train_val_test_split(data, seed)

    # Where a dataset exposes `label_mask`, keep only the nodes whose mask entry
    # is truthy in each split.
    # `is not None` replaces `!= None`: `!=` is dispatched to the mask object's
    # own comparison operator, which for array-like objects can be element-wise
    # and then has no single truth value.
    label_mask = getattr(data, 'label_mask', None)
    if label_mask is not None:
        test_idx = [node for node in test_idx if label_mask[node]]
        valid_idx = [node for node in valid_idx if label_mask[node]]
        train_idx = [node for node in train_idx if label_mask[node]]

    print(name)
    runner = HypeRunner(
        data,
        seed=seed,
        # Edge attributes are used only when the dataset actually carries them.
        use_edge_attr=getattr(data, 'edge_attr', None) is not None,
        train_idx=train_idx,
        # The runner's `test_idx` is fed the validation split; the real test
        # split is held back for the final scoring below.
        test_idx=valid_idx,
        overrides=overrides[name],
        main_metric=main_metrics[name],
        verbose=False
    )
    runner.optimize_run(n_trials=n_trials[name])

    # Final evaluation on nodes the runner was never given.
    preds, indices = runner.predict_proba(data, test_idx)
    results[name] = calc_metrics(
        data,
        preds,
        metrics,
        indices,
    )

AntiFraud Amazon


                                              

AntiFraud YelpChi


                                               

Multitarget 10k


                                     

Multitarget 50k


                                               

NonHomophilous Penn94


                                               

NonHomophilous Genius


                                                  

S_FFSD


                                                

OgbnProteins


                                                 

For OgbnProteins, each reported metric is the arithmetic mean of that metric over all 112 targets

In [17]:
# Replace the per-target scores ('y0' ... 'y111') with a single 'y' entry that
# holds, for each metric, the mean over the 112 targets. The per-target entries
# are discarded, so this cell cannot be re-run without recomputing `results`.
ogbn_scores = results['OgbnProteins']
results['OgbnProteins'] = {
    'y': {
        metric: np.mean([ogbn_scores[f'y{i}'][metric] for i in range(112)])
        for metric in ('accuracy', 'roc_auc')
    }
}

Process the results and tabulate them

In [18]:
# Every dataset's scores sit under a single 'y' key; unwrap that level in place.
# Iterating over a snapshot of the items keeps the loop independent of the writes.
for dataset, scores in list(results.items()):
    results[dataset] = scores['y']

In [19]:
# One row per dataset, one column per metric.
benchmarks = pd.DataFrame(results).T

In [20]:
# Reference (state-of-the-art) scores per dataset for each metric.
# '-' marks a dataset for which no reference value is listed for that metric.
# NOTE(review): presumably taken from paperswithcode.com — confirm the source.
sota_roc_auc = {
    'AntiFraud Amazon': 0.9750,
    'AntiFraud YelpChi': 0.9498,
    'Multitarget 10k': '-',
    'Multitarget 50k': '-',
    'NonHomophilous Penn94': '-',
    'NonHomophilous Genius': '-',
    'S_FFSD': 0.8461,
    'OgbnProteins': 0.8942,
}
sota_accuracy = {
    'AntiFraud Amazon': '-',
    'AntiFraud YelpChi': '-',
    'Multitarget 10k': '-',
    'Multitarget 50k': '-',
    'NonHomophilous Penn94': 0.8609,
    'NonHomophilous Genius': 0.9145,
    'S_FFSD': '-',
    'OgbnProteins': '-',
}

In [21]:
# Append the reference scores as two extra columns; the Series are aligned to the
# table's rows by dataset name.
benchmarks = benchmarks.assign(
    sota_accuracy=pd.Series(sota_accuracy),
    sota_roc_auc=pd.Series(sota_roc_auc),
)

In [22]:
# Render floats with a thousands separator and four decimal places.
pd.set_option('display.float_format', '{:,.4f}'.format)

In [23]:
# Show the comparison table: measured metrics next to the reference scores.
benchmarks

Unnamed: 0,accuracy,roc_auc,sota_accuracy,sota_roc_auc
AntiFraud Amazon,0.9828,0.9617,-,0.9750
AntiFraud YelpChi,0.905,0.9176,-,0.9498
Multitarget 10k,0.8651,0.7415,-,-
Multitarget 50k,0.8637,0.7973,-,-
NonHomophilous Penn94,0.7829,0.8714,0.8609,-
NonHomophilous Genius,0.8405,0.9016,0.9145,-
S_FFSD,0.8961,0.8932,-,0.8461
OgbnProteins,0.8967,0.8058,-,0.8942


In [24]:
# Show the measured metrics at full precision.
results

{'AntiFraud Amazon': {'accuracy': 0.9828380075345333,
  'roc_auc': 0.9617322331178102},
 'AntiFraud YelpChi': {'accuracy': 0.905015776302905,
  'roc_auc': 0.9176270039168578},
 'Multitarget 10k': {'accuracy': 0.8651102464332037,
  'roc_auc': 0.7415073352573353},
 'Multitarget 50k': {'accuracy': 0.8637289862455425,
  'roc_auc': 0.7973168463199725},
 'NonHomophilous Penn94': {'accuracy': 0.78285566744126,
  'roc_auc': 0.8713995158595165},
 'NonHomophilous Genius': {'accuracy': 0.8404844003649592,
  'roc_auc': 0.9015878814798921},
 'S_FFSD': {'accuracy': 0.8960579514824798, 'roc_auc': 0.8932157615617693},
 'OgbnProteins': {'accuracy': 0.8967105724094443,
  'roc_auc': 0.8057667823357261}}