# Graph Neural Network

## Importing

In [85]:
from selfdist_toolkit.pyg_tools import gnn_load, GIN_nn, execution
import pandas as pd
import torch
import torch_geometric
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

## Loading aid_list

In [86]:
aid_list = pd.read_csv("results/random_forest/experiments_check/chem-desc_good-aid_1.csv").aid.to_numpy().astype(int)

In [87]:
aid_list

array([    884,     891,     899,     914,    1418,    1431,    1770,
          1771,    1795,  493073,  493102,  493177,  493191,  493240,
        588834,  651741,  651812,  651814,  686978,  687022,  720691,
        743036,  743040,  743065, 1053173, 1259381, 1346982])

In [88]:
# for one aid now
aid = aid_list[0]
random_state = 131313

## Select mode: smooth for 2 dim label and hard for 1-dim label

In [89]:
# mode = "smooth"
mode = "hard"

## Load pytorch data

In [90]:
whole_data = gnn_load.load_pyg_data_aid(aid=aid, label_type=mode, do_in_parallel=True)

In [91]:
whole_data[:4]

[Data(x=[19, 9], edge_index=[2, 42], edge_attr=[42, 3], smiles='CC1(C=CC2=C(O1)C3=C(C=CC(=C3)OC)NC2=O)C', y=[1]),
 Data(x=[10, 9], edge_index=[2, 20], edge_attr=[20, 3], smiles='C1CN=C(N1)SCC(=O)O', y=[1]),
 Data(x=[29, 9], edge_index=[2, 64], edge_attr=[64, 3], smiles='CC(=O)OCC(=O)[C@]1(CC[C@@H]2[C@@]1(CC(=O)[C@H]3[C@H]2CCC4=CC(=O)CC[C@]34C)C)O', y=[1]),
 Data(x=[25, 9], edge_index=[2, 56], edge_attr=[56, 3], smiles='C[C@]12CCC(=O)C=C1CC[C@@H]3[C@@H]2CC[C@]4([C@H]3CC[C@@]4(C(=O)CO)O)C', y=[1])]

In [98]:
# build labels for split
labels_hard = np.array([
    data.y.detach().cpu().numpy()
    for data in whole_data
]).flatten().astype(int)

In [99]:
labels_hard.shape

(9593,)

In [100]:
labels_hard.sum()

3274

In [101]:
# data splitting
for train_idx, test_idx in StratifiedShuffleSplit(n_splits=1, random_state=random_state, test_size=0.2).split(whole_data, labels_hard):
    break

In [102]:
labels_hard[train_idx].sum()/len(train_idx)

0.34128225175918686

In [103]:
labels_hard[test_idx].sum()/len(test_idx)

0.341323606044815

In [118]:
# create the data loader
dl_train = torch_geometric.loader.DataLoader([whole_data[idx] for idx in train_idx])
dl_test = torch_geometric.loader.DataLoader(np.array(whole_data, dtype=object)[test_idx])

## Instantiate model

In [112]:
# GNN model
model = GIN_nn.GIN_basic(1)

In [113]:
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [114]:
# loss
loss = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([len(train_idx)/labels_hard[train_idx].sum()]))

In [115]:
# device
device = torch.device('cpu')

In [120]:
# training:
execution.training(model, dl_train, device, optimizer, loss)

  0%|                                                                                                                                                                                                                                                                                         | 0/7674 [00:00<?, ?it/s]


ValueError: Target size (torch.Size([1])) must be the same as input size (torch.Size([1, 1]))

In [121]:
for batch in torch_geometric.loader.DataLoader(whole_data, batch_size=5):
    break

In [122]:
batch

DataBatch(x=[96, 9], edge_index=[2, 212], edge_attr=[212, 3], smiles=[5], y=[5], batch=[96], ptr=[6])

In [123]:
model(batch)

tensor([[-0.2602],
        [ 0.5218],
        [ 0.1675],
        [ 0.3979],
        [-0.2678]], grad_fn=<AddmmBackward0>)