In [54]:
! cat /etc/os-release # OS version
! lsb_release -a # OS version
! uname -r # Linux kernel version
! nvcc --version # CUDA version

NAME="Ubuntu"
VERSION="20.04.5 LTS (Focal Fossa)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 20.04.5 LTS"
VERSION_ID="20.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=focal
UBUNTU_CODENAME=focal
No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 20.04.5 LTS
Release:	20.04
Codename:	focal
5.10.147+
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [55]:
import torch 
print(torch.__version__)

1.13.1+cu116


In [56]:
# Install PyG
! pip install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.13.0+cu116.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.13.0+cu116.html


In [57]:
! pip install tqdm
! pip install japanize-matplotlib


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Introduction by Example 

[This Document](https://pytorch-geometric.readthedocs.io/en/latest/get_started/introduction.html)

[Install Page](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html)

In [58]:
import random
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import japanize_matplotlib


import torch 
print(torch.__version__)
import torch_geometric
print(torch_geometric.__version__)


1.13.1+cu116
2.2.0


In [59]:

def torch_fix_seed(seed=0):
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms = True

SEED=1
torch_fix_seed(SEED)



## Data handring of Graph

-  Graph : a data format for objectcts(Node) and rerations(Edge)
- Data Object
  - data.x: Node feature matrix with shape [num_nodes, num_node_features]

 - data.edge_index: Graph connectivity in COO format with shape [2(edge_deg), num_edges] and type torch.long

 - data.edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]

 - data.y: Target to train against (may have arbitrary shape), e.g., node-level targets of shape [num_nodes, *] or graph-level targets of shape [1, *]

 - data.pos: Node position matrix with shape [num_nodes, num_dimensions]

- 

In [60]:
from torch_geometric.data import Data

edge_index = torch.tensor([[0, 1, 1, 2],
                           [1, 0, 2, 1]], dtype=torch.long)
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

data = Data(x=x, edge_index=edge_index)
#>>> Data(edge_index=[2, 4], x=[3, 1])


In [61]:
print(Data(edge_index=[2, 4], x=[3, 1]))

Data(x=[2], edge_index=[2])


In [62]:
# Transfer data object to GPU.
device = torch.device('cuda')
data = data.to(device)

In [63]:
print(Data(edge_index=[2, 4], x=[3, 1]))

Data(x=[2], edge_index=[2])


## common datasets

- [Benchmark data source]( http://graphkernels.cs.tu-dortmund.de )


In [64]:

from torch_geometric.datasets import TUDataset
from torch_geometric.datasets import Planetoid


from torch_geometric.utils import scatter

from torch_geometric.loader import DataLoader

torch_fix_seed(SEED)
dataset = Planetoid(root='/tmp/Cora', name='Cora')
# dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for data in loader:
    print(f"{data=}")
    print(f"{data.num_graphs=}")
    x = scatter(data.x, data.batch, dim=0, reduce='mean')
    print(x.size())
    print()

# print(dataset)
# # ENZYMES(600)
# len(dataset)
# #>>> 600
# print(f"{dataset.num_classes=}\t{dataset.num_node_features=}")
# #>>> 6 #>>> 3

data=DataBatch(edge_index=[2, 4636], x=[1278, 21], y=[32], batch=[1278], ptr=[33])
data.num_graphs=32
torch.Size([32, 21])

data=DataBatch(edge_index=[2, 3634], x=[969, 21], y=[32], batch=[969], ptr=[33])
data.num_graphs=32
torch.Size([32, 21])

data=DataBatch(edge_index=[2, 4018], x=[1070, 21], y=[32], batch=[1070], ptr=[33])
data.num_graphs=32
torch.Size([32, 21])

data=DataBatch(edge_index=[2, 3582], x=[932, 21], y=[32], batch=[932], ptr=[33])
data.num_graphs=32
torch.Size([32, 21])

data=DataBatch(edge_index=[2, 4356], x=[1083, 21], y=[32], batch=[1083], ptr=[33])
data.num_graphs=32
torch.Size([32, 21])

data=DataBatch(edge_index=[2, 4158], x=[1063, 21], y=[32], batch=[1063], ptr=[33])
data.num_graphs=32
torch.Size([32, 21])

data=DataBatch(edge_index=[2, 4048], x=[1051, 21], y=[32], batch=[1051], ptr=[33])
data.num_graphs=32
torch.Size([32, 21])

data=DataBatch(edge_index=[2, 3878], x=[961, 21], y=[32], batch=[961], ptr=[33])
data.num_graphs=32
torch.Size([32, 21])

data=DataBatch

For Cora()

this time, the Data objects holds a label for each node, and additional node-level attributes: train_mask, val_mask and test_mask, where

train_mask denotes against which nodes to train (140 nodes),

val_mask denotes which nodes to use for validation, e.g., to perform early stopping (500 nodes),

test_mask denotes against which nodes to test (1000 nodes).

In [65]:
# data =  dataset[0]
# print(f"{data=}")
# print(f"{data.validate()=}\t{data.is_undirected=}")

For TUDatast We can see that the first graph in the dataset contains 37 nodes, each one having 3 features. There are 168/2 = 84 undirected edges and the graph is assigned to exactly one class. In addition, the data object is holding exactly one graph-level target.



We can even use slices, long or bool tensors to split the dataset. E.g., to create a 90/10 train/test split, type:



In [66]:
# torch_fix_seed(SEED)
# train_test_splitNum = int(len(dataset)/0.9)
# dataset_shuffle = dataset.shuffle()
# #>>> ENZYMES(600)

# train_dataset = dataset_shuffle[:train_test_splitNum]
# #>>> ENZYMES(540)
# test_dataset = dataset_shuffle[train_test_splitNum:]
# #>>> ENZYMES(60)
# print(f"{train_dataset=}\n{test_dataset=}")

In [67]:
# torch_fix_seed(SEED)
# perm = torch.randperm(len(dataset))
# dataset_perm = dataset[perm]
# #>> ENZYMES(600)
# print(f"{(dataset_perm==dataset_shuffle)=}")

まじ？？やばくね？

In [68]:
# loader = DataLoader(dataset, batch_size=32, shuffle=True)


## Data Transforms

In [69]:
from torch_geometric.datasets import ShapeNet

# dataset = ShapeNet(root='/tmp/ShapeNet', categories=['Airplane'])

# dataset[0]
# >>> Data(pos=[2518, 3], y=[2518])

We can convert the point cloud dataset into a graph dataset by generating nearest neighbor graphs from the point clouds via transforms:

In addition, we can use the transform argument to randomly augment a Data object, e.g., translating each node position by a small number:

In [70]:
import torch_geometric.transforms as T
from torch_geometric.datasets import ShapeNet

dataset = ShapeNet(root='/tmp/ShapeNet', categories=['Airplane'],
                  pre_transform=T.KNNGraph(k=6),
                  transform=T.RandomJitter(0.01),
                   )
#NOTE:: The pre_transform is only applied when processing the dataset for the first time.

dataset[0]

Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1], edge_index=[2, 15108])

## Learning Methods On Graphs



In [73]:
# import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 16)
        self.conv2 = GCNConv(16, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)


def train(model,  optimizer, data, ):
  model.train()
  optimizer.zero_grad()

  out = model(data)
  loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
  loss.backward()
  optimizer.step()

  return model

@torch.no_grad()
def eval(model, data):
  model.eval()
  pred = model(data).argmax(dim=1)
  correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
  acc = int(correct) / int(data.test_mask.sum())
  return acc

The constructor defines two GCNConv layers which get called in the forward pass of our network. Note that the non-linearity is not integrated in the conv calls and hence needs to be applied afterwards (something which is consistent accross all operators in PyG). Here, we chose to use ReLU as our intermediate non-linearity and finally output a softmax distribution over the number of classes. Let’s train this model on the training nodes for 200 epochs:

In [72]:
# from torch_geometric.datasets import Planetoid

dataset = Planetoid(root='/tmp/Cora', name='Cora')



In [74]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = dataset[0].to(device)
# criterion = F.nll_loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
MaxEpoch =200

for epoch in range(MaxEpoch):
    model =train(model,  optimizer, data)
acc = eval(model,data)
print(f'Accuracy: {acc:.4f}')


Accuracy: 0.8040


ひとつのグラフに対して学習しているので精度が高いのは当たり前のように思われる．

データセットのデータを流石に変えないと意味ないわ

In [85]:

dataset = Planetoid(root='/tmp/Cora', name='Cora')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = dataset[0].to(device)

# criterion = F.nll_loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
MaxEpoch =200

for epoch in range(MaxEpoch):
    model =train(model,  optimizer, data)
test_data = dataset[-1].to(device)
acc = eval(model,test_data)
print(f'Accuracy: {acc:.4f}')


Accuracy: 0.8080


た．．．たけぇ

恐れ入ります

In [75]:
# !pwd


## ALL Things Up

In [82]:
#
! rm -R ../tmp/ShapeNet/processed
#NOTE:: The pre_transform is only applied when processing the dataset for the first time.

In [84]:
# dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', use_node_attr=True)
print(dataset)
print(dataset[0])
print(dataset[0].x[0])

ENZYMES(600)
Data(edge_index=[2, 168], x=[37, 21], y=[1])
tensor([11.0000, 15.8870, 37.7800, -0.5100,  1.7010, 93.9000,  4.0000,  5.0000,
         2.0000,  4.0000,  4.0000,  3.0000,  3.0000,  4.0000,  4.0000,  3.0000,
         6.0000,  2.0000,  1.0000,  0.0000,  0.0000])


In [83]:
from statistics import mean, stdev
from tqdm import tqdm
from torch_geometric.datasets import Planetoid

from torch_geometric.loader import DataLoader

SEED=1
torch_fix_seed(SEED)

MaxEpoch =200
BATCH =34
# train : validation : test = 0.8: 0.1: 0.1
tr_val_percent =0.8
val_te_percent =0.1 

# PREPARE DATASET and  DEVICE
# dataset = Planetoid(root='/tmp/Cora', name='Cora')
dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', use_node_attr=True)
# print(len(dataset))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#SPLIT DATA SET
train_val_splitNum = int(len(dataset)*tr_val_percent)
val_test_splitNum = int(len(dataset)*(val_te_percent+tr_val_percent))
dataset_shuffle = dataset.shuffle()
print(f"{train_val_splitNum=}\t{val_test_splitNum=}")

# PREPARE DATA SET
train_dataset = dataset_shuffle[:train_val_splitNum]
val_dataset = dataset_shuffle[train_val_splitNum:val_test_splitNum]
test_dataset = dataset_shuffle[val_test_splitNum:]
print(f"{train_dataset=}\n{val_dataset=}\n{test_dataset=}")

# PREPARE DATA LOADER
train_loader = DataLoader(train_dataset, batch_size=BATCH, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH, shuffle=True )


model = GCN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# Loss Func : nll Loss

acc_val= list()
acc_test = list()

for epoch in range(MaxEpoch):
    dataloader_iterator = iter(val_loader)
    for train_data in tqdm(train_loader): 
        try:
            val_data = next(dataloader_iterator)
        except StopIteration:
            dataloader_iterator = iter(val_loader)
            val_data = next(dataloader_iterator)
        train_data = train_data.to(device)
        val_data = val_data.to(device)

        model =train(model,  optimizer, train_data)
        # AttributeError: 'GlobalStorage' object has no attribute 'train_mask'
        ### そらそう，クラス分類タスクしたいのに，ターゲットのクラスが一つの上に
        ### 前回使用したモデルを流用しようとしているため

        ### グラフデータのattr属性の意味がわかってない ###

        acc = eval(model,val_data)
        
        acc_val.append(acc)

for test_data in tqdm(test_loader):
    test_data=test_data.to(device)
    acc =eval(model, test_data)
    acc_test.append(acc)
print(f'VAL Accuracy: {mean(acc_val)}±{stdev(acc_val)}')
print(f'TEST Accuracy: {mean(acc_test)}±{stdev(acc_test)}')

train_val_splitNum=480	val_test_splitNum=540
train_dataset=ENZYMES(480)
val_dataset=ENZYMES(60)
test_dataset=ENZYMES(60)


  0%|          | 0/15 [00:00<?, ?it/s]


AttributeError: ignored