# How to work with predict_proba and calc_metrics in Runner and HypeRunner

 1. [Runner (with calc_metrics example) (Amazon Computers)](#p1)
 2. [Runner (Multitarget)](#p2)
 3. [HypeRunner (Amazon Computers)](#p3)
 4. [HypeRunner (Multitarget)](#p4)

# 1. <a id="p1">Runner (Amazon Computers) </a>

### Summary:
#### Training a Runner on the Amazon Computers dataset with `runner.run()`. <br> Getting predictions on the test sample of Amazon Computers using `runner.predict_proba()`. <br> Calculating the score of the predictions using the `calc_metrics()` function.

In [2]:
from torch_geometric import datasets
import torch
import pandas as pd

In [3]:
from cool_graph.runners import Runner
from torch_geometric.data import Data

In [4]:
# Use the simple Amazon dataset ("Computers" subset) from torch_geometric, stored under ./data/Amazon
dataset = datasets.Amazon(root='./data/Amazon', name='Computers')
# take the dataset's underlying graph object (node features x, edge_index, labels y)
data = dataset.data
# display the graph summary
data

Data(x=[13752, 767], edge_index=[2, 491722], y=[13752])

In [6]:
# Initializing the Runner on the Amazon Computers graph.
# verbose=False is passed here (no per-epoch logs appear in the training output of this section)
runner = Runner(data, verbose=False)

In [7]:
# Let's check which metrics are listed in the runner's config
runner.cfg["metrics"]

['accuracy', 'cross_entropy', 'f1_weighted']

In [8]:
%%time
# Training; the %%time cell magic reports how long the whole cell takes
result = runner.run()

CPU times: user 43min 18s, sys: 4min 48s, total: 48min 6s
Wall time: 1min 20s


In [45]:
# Checking the train/test split: print the node index tensors the runner holds for each part
print(runner.train_idx)
print(runner.test_idx)

tensor([11144,  8484, 10273,  ..., 12441,  3984,  8292])
tensor([7095, 9842, 1553,  ..., 7552, 3983, 8723])


In [46]:
# Getting predicted probabilities of the test nodes belonging to each of the 10 classes,
# together with the indices of the nodes from data.x on which the predictions were made.
# test_mask restricts prediction to the runner's own test split.
preds, indices = runner.predict_proba(data, test_mask=runner.test_idx)

Sample data: 100%|██████████| 14/14 [00:01<00:00,  8.55it/s]
                                               

In [47]:
# Looking at the predictions: a dict keyed by target name ('y') holding an array of class probabilities
preds

{'y': array([[9.7961672e-10, 8.9330232e-04, 2.8586419e-08, ..., 1.6820866e-08,
         9.9570686e-01, 1.6694018e-07],
        [8.2801978e-07, 8.3723069e-07, 9.9650887e-12, ..., 7.4552543e-12,
         3.3368945e-06, 3.9771731e-13],
        [3.6074553e-06, 1.9758058e-05, 2.0433931e-10, ..., 4.5269075e-10,
         1.1946617e-04, 3.9859172e-10],
        ...,
        [2.4543593e-05, 1.7357592e-02, 1.4163599e-04, ..., 2.2546052e-04,
         9.1188085e-01, 3.9362791e-04],
        [4.3684584e-05, 6.1099266e-04, 2.0017833e-07, ..., 3.0814186e-07,
         1.5478279e-03, 3.5494182e-07],
        [2.9600825e-10, 6.3976549e-09, 1.8611238e-17, ..., 4.7020179e-17,
         4.6064081e-08, 1.2216913e-17]], dtype=float32)}

In [48]:
# The number of prediction rows and the number of returned indices
# both match the number of nodes in the test sample
print(runner.test_idx.shape)
print(preds["y"].shape)
print(len(indices))

torch.Size([3438])
(3438, 10)
3438


In [12]:
# Let's calculate accuracy on our predictions with the calc_metrics function
from cool_graph.train.metrics import calc_metrics

# data provides the true labels; preds and indices are both taken from predict_proba
metrics = calc_metrics(
    data,
    preds,
    metrics=["accuracy"],
    indices=indices,
)
metrics

{'y': {'accuracy': 0.9162303664921466}}

# 2. <a id="p2"> Runner (Multitarget dataset) </a>

### Summary:
#### Let's try to work with a dataset that has multiple targets. We will use the Multitarget dataset from `cool_graph.datasets`, which has two versions: `50k` (the bigger one) and `10k` (the smaller one).  <br> Training a Runner on the Multitarget `50k` dataset with `runner.run()`. <br> Getting predictions for each task on the test sample of the Multitarget `10k` dataset using `runner.predict_proba()`. <br> Calculating the score of the predictions for each task of Multitarget `10k` using the `calc_metrics()` function.

In [9]:
#importing Multitarget dataset loader
from cool_graph.datasets.multitarget import Multitarget

In [10]:
# Load the heterogeneous data of the 50k dataset
multitarget_50k = Multitarget(root="./data", name="50k")
# keep the underlying heterogeneous graph object for the next steps
hetero_data_50k = multitarget_50k.data

Using existing file ./data/50k/50k_data.pt


In [11]:
# Keep only the <node_1> node type and the node_1 -> node_1 edges,
# so that we work with standard homogeneous data
data_50k = Data(
    **hetero_data_50k["node_1"],
    **hetero_data_50k[("node_1", "to", "node_1")],
)

In [12]:
# Inspect the homogeneous graph built from the 50k dataset
data_50k

Data(x=[5860353, 162], edge_index=[2, 4444748], edge_attr=[4444748, 44], y=[5860353, 4], label_3=[5860353], label_4=[5860353], label_5=[5860353], label_6=[5860353], label_mask=[5860353], index=[5860353])

In [13]:
# Initializing the Runner on the 50k graph with ROC AUC and cross-entropy as metrics;
# use_edge_attr=True is set because data_50k carries edge_attr
runner = Runner(
    data_50k,
    metrics=["roc_auc", "cross_entropy"],
    use_edge_attr=True,
)

In [18]:
# Training on the 50k dataset; the train/test metrics are logged during the run
result = runner.run()

Sample data: 100%|██████████| 60/60 [00:02<00:00, 25.43it/s]
Sample data: 100%|██████████| 20/20 [00:01<00:00, 17.39it/s]
2024-07-24 21:09:41.070 | INFO     | cool_graph.train.helpers:eval_epoch:218 - test:
 {'roc_auc': 0.668, 'cross_entropy': 0.461, 'calc_time': 0.02, 'main_metric': 0.668}
2024-07-24 21:09:45.261 | INFO     | cool_graph.train.helpers:eval_epoch:218 - train:
 {'roc_auc': 0.672, 'cross_entropy': 0.458, 'calc_time': 0.07, 'main_metric': 0.672}
2024-07-24 21:10:16.846 | INFO     | cool_graph.train.helpers:eval_epoch:218 - test:
 {'roc_auc': 0.759, 'cross_entropy': 0.39, 'calc_time': 0.025, 'main_metric': 0.759}
2024-07-24 21:10:21.095 | INFO     | cool_graph.train.helpers:eval_epoch:218 - train:
 {'roc_auc': 0.787, 'cross_entropy': 0.371, 'calc_time': 0.071, 'main_metric': 0.787}
2024-07-24 21:10:50.742 | INFO     | cool_graph.train.helpers:eval_epoch:218 - test:
 {'roc_auc': 0.76, 'cross_entropy': 0.384, 'calc_time': 0.014, 'main_metric': 0.76}
2024-07-24 21:10:52.777 | 

In [14]:
# Load the heterogeneous data of the 10k dataset
# (same two-step pattern as for 50k: dataset loader first, then its underlying data object)
multitarget_10k = Multitarget(root="./data", name="10k")
hetero_data_10k = multitarget_10k.data

Using existing file ./data/10k/10k_data.pt


In [15]:
# Load the heterogeneous data of the 10k dataset ...
multitarget_10k = Multitarget(root="./data", name="10k")
hetero_data_10k = multitarget_10k.data

# ... and keep only the <node_1> node type with its node_1 -> node_1 edges (homogeneous data)
data_10k = Data(
    **hetero_data_10k["node_1"],
    **hetero_data_10k[("node_1", "to", "node_1")],
)

Using existing file ./data/10k/10k_data.pt


In [16]:
# Inspect the homogeneous graph built from the 10k dataset
data_10k

Data(x=[1318527, 162], edge_index=[2, 908184], edge_attr=[908184, 44], y=[1318527, 4], label_3=[1318527], label_4=[1318527], label_5=[1318527], label_6=[1318527], label_mask=[1318527], index=[1318527])

In [22]:
# Getting predicted probabilities of nodes from the 10k dataset belonging to each of the 2 classes,
# for each of the 4 tasks; no test_mask is passed here
preds, indices = runner.predict_proba(data_10k)

Sample data: 100%|██████████| 5275/5275 [00:21<00:00, 246.41it/s]
                                                    

In [23]:
# Looking at the predictions: one array of two-class probabilities per task, keyed 'y0' ... 'y3'
preds

{'y0': array([[0.72181535, 0.27818468],
        [0.84333646, 0.15666354],
        [0.7893555 , 0.2106445 ],
        ...,
        [0.82104754, 0.17895252],
        [0.85739976, 0.14260027],
        [0.9642006 , 0.03579935]], dtype=float32),
 'y1': array([[0.70675004, 0.29324993],
        [0.9430329 , 0.05696703],
        [0.8798542 , 0.12014585],
        ...,
        [0.90974367, 0.0902563 ],
        [0.9509039 , 0.04909613],
        [0.9894342 , 0.01056579]], dtype=float32),
 'y2': array([[0.7779555 , 0.22204451],
        [0.9375671 , 0.06243285],
        [0.83785546, 0.16214459],
        ...,
        [0.8952226 , 0.10477737],
        [0.9379956 , 0.0620044 ],
        [0.9786699 , 0.0213301 ]], dtype=float32),
 'y3': array([[0.84889936, 0.1511006 ],
        [0.964528  , 0.03547203],
        [0.9046428 , 0.09535722],
        ...,
        [0.9203122 , 0.07968783],
        [0.9610196 , 0.0389805 ],
        [0.9877758 , 0.01222418]], dtype=float32)}

In [24]:
# The number of prediction rows matches the number of nodes selected by data_10k.label_mask
print(preds["y0"].shape)
print(data_10k.label_mask.sum())

(3999, 2)
tensor(3999)


In [25]:
# Let's calculate ROC AUC and cross-entropy on our predictions with the calc_metrics function
from cool_graph.train.metrics import calc_metrics

# data_10k provides the true labels; preds and indices are both taken from predict_proba
metrics = calc_metrics(
    data_10k,
    preds,
    metrics=["roc_auc", "cross_entropy"],
    indices=indices,
)
metrics

{'y0': {'roc_auc': 0.7217017954722872, 'cross_entropy': tensor(0.5613)},
 'y1': {'roc_auc': 0.7675213178153693, 'cross_entropy': tensor(0.4800)},
 'y2': {'roc_auc': 0.8303489127642535, 'cross_entropy': tensor(0.4944)},
 'y3': {'roc_auc': 0.8077940777724796, 'cross_entropy': tensor(0.4204)}}

# 3. <a id="p3"> HypeRunner (Amazon Computers) </a>

### Summary:
#### Let's use HypeRunner to find the best hyperparameters on the validation sample of the Amazon Computers dataset with `hyperunner.optimize_run()`. <br> Getting predictions on the test sample of Amazon Computers, using the model with the best score on the validation sample, with `hyperunner.predict_proba()`. <br> Calculating the score of the predictions using the `calc_metrics()` function.

In [17]:
from cool_graph.runners import HypeRunner

In [18]:
# Use the simple Amazon dataset ("Computers" subset) again, this time for HypeRunner
dataset = datasets.Amazon(root='./data/Amazon', name='Computers')
# take the dataset's underlying graph object (node features x, edge_index, labels y)
data = dataset.data
# display the graph summary
data

Data(x=[13752, 767], edge_index=[2, 491722], y=[13752])

In [20]:
# Initializing the HypeRunner on the Amazon Computers graph with a fixed seed
hyperunner = HypeRunner(
    data,
    seed=42,
    gc_after_trial=True,
    verbose=False,
)

In [None]:
%%time
# Training and optimizing the model over 10 trials; %%time reports how long the cell takes
result = hyperunner.optimize_run(n_trials=10)

In [30]:
# Getting predictions of the model with the best score on the test sample.
# NOTE(review): hyperunner.test_idx is passed positionally here; presumably it fills the same
# `test_mask` parameter that the Runner example passes by keyword — confirm and prefer the keyword form
preds, indices = hyperunner.predict_proba(data, hyperunner.test_idx)

Sample data: 100%|██████████| 14/14 [00:02<00:00,  4.88it/s]
                                               

In [31]:
# Looking at the predictions: a dict keyed by target name ('y') holding an array of class probabilities
preds

{'y': array([[5.9231610e-05, 7.5862028e-02, 5.6601886e-05, ..., 5.6856737e-04,
         6.9100869e-01, 5.8191729e-04],
        [1.3799192e-07, 7.3974996e-05, 7.7756324e-10, ..., 7.6186633e-11,
         1.3674887e-04, 3.1267817e-08],
        [2.3962748e-05, 2.4321581e-04, 2.6174927e-09, ..., 3.2020650e-11,
         1.5519845e-04, 3.2113654e-09],
        ...,
        [1.3722478e-05, 5.2405079e-04, 5.2292695e-08, ..., 2.6697531e-09,
         1.3829605e-03, 1.3681969e-07],
        [9.2862070e-01, 2.8701435e-04, 5.0376218e-02, ..., 5.1513049e-03,
         2.3940602e-05, 5.9606627e-06],
        [2.0959496e-04, 9.3683237e-01, 1.5941503e-05, ..., 2.1401379e-06,
         3.6720554e-03, 5.9063996e-08]], dtype=float32)}

In [32]:
# Show the node indices returned by predict_proba, wrapped in a tensor for a compact display
torch.tensor(indices)

tensor([ 2620, 10377, 10084,  ...,  4026,  6051, 12505])

In [33]:
# Let's calculate accuracy on our predictions with the calc_metrics function
from cool_graph.train.metrics import calc_metrics

# data provides the true labels; preds and indices are both taken from predict_proba
metrics = calc_metrics(
    data,
    preds,
    metrics=["accuracy"],
    indices=indices,
)
metrics

{'y': {'accuracy': 0.9185573007562536}}

# 4. <a id="p4"> HypeRunner (Multitarget Dataset) </a>

### Summary:
#### Let's try to work with a dataset that has multiple targets. We will use the Multitarget dataset from `cool_graph.datasets`, which has two versions: `50k` (the bigger one) and `10k` (the smaller one).  <br> Using HypeRunner to find the best hyperparameters on the validation sample of Multitarget `50k` with `hyperunner.optimize_run()`. <br> Getting predictions on the test sample of Multitarget `10k`, using the model with the best score on the validation sample of Multitarget `50k`, with `hyperunner.predict_proba()`. <br> Calculating the score of the predictions for each task of Multitarget `10k` using the `calc_metrics()` function.

In [21]:
# Initializing the HypeRunner on the 50k graph with ROC AUC and cross-entropy as metrics;
# use_edge_attr=True is set because data_50k carries edge_attr
hyperunner2 = HypeRunner(
    data_50k,
    metrics=["roc_auc", "cross_entropy"],
    use_edge_attr=True,
    verbose=False,
)

In [None]:
%%time
# Training and optimizing the model over 10 trials; %%time reports how long the cell takes
hyperunner2.optimize_run(n_trials=10)

In [36]:
# Getting predictions of the model with the best score for the nodes of the 10k dataset
# (hyperunner2 itself was initialized and optimized on the 50k dataset)
preds, indices = hyperunner2.predict_proba(data_10k)

Sample data: 100%|██████████| 5275/5275 [00:23<00:00, 219.85it/s]
                                                    

In [37]:
# Looking at the predictions: one array of two-class probabilities per task, keyed 'y0' ... 'y3'
preds

{'y0': array([[0.89899224, 0.10100773],
        [0.8007417 , 0.1992583 ],
        [0.9439511 , 0.05604896],
        ...,
        [0.7928705 , 0.20712945],
        [0.96414244, 0.0358576 ],
        [0.9658472 , 0.0341528 ]], dtype=float32),
 'y1': array([[0.9562268 , 0.04377312],
        [0.87865096, 0.12134909],
        [0.9824949 , 0.01750516],
        ...,
        [0.86804354, 0.13195646],
        [0.99174833, 0.00825167],
        [0.9919612 , 0.00803886]], dtype=float32),
 'y2': array([[0.95747185, 0.04252816],
        [0.8765057 , 0.12349433],
        [0.98343325, 0.01656678],
        ...,
        [0.86716884, 0.13283113],
        [0.99209213, 0.0079079 ],
        [0.9924378 , 0.00756218]], dtype=float32),
 'y3': array([[0.97587603, 0.02412396],
        [0.91085714, 0.08914288],
        [0.99231064, 0.00768937],
        ...,
        [0.9003777 , 0.09962231],
        [0.99687046, 0.00312955],
        [0.9970144 , 0.00298564]], dtype=float32)}

In [38]:
# Let's calculate ROC AUC and cross-entropy on our predictions with the calc_metrics function
from cool_graph.train.metrics import calc_metrics

# data_10k provides the true labels; preds and indices are both taken from predict_proba
metrics = calc_metrics(
    data_10k,
    preds,
    metrics=["roc_auc", "cross_entropy"],
    indices=indices,
)
metrics

{'y0': {'roc_auc': 0.7244340359094457, 'cross_entropy': tensor(0.5673)},
 'y1': {'roc_auc': 0.8165323020332796, 'cross_entropy': tensor(0.4702)},
 'y2': {'roc_auc': 0.864522417153996, 'cross_entropy': tensor(0.4782)},
 'y3': {'roc_auc': 0.8731919628247922, 'cross_entropy': tensor(0.4100)}}