In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
from torch_geometric.datasets import QM9
from torch_geometric.nn import GCNConv, NNConv
from torch_geometric.nn.models import MLP
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import NormalizeFeatures
from torch.utils.data import random_split
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import time
import math

dataset = QM9(root="./QM9") #Shuffleいらない？
#無向グラフの例
#edge_index = torch.tensor([[0,1,1,2],[1,0,2,1]], dtype=torch.long) # エッジの定義
#x = torch.tensor([[-1],[0],[1]], dtype=torch.float) # ノードの属性
#data = Data(x=x, edge_index=edge_index) # コンストラクタ
# Data(x=[3, 1], edge_index=[2, 4])

QM9の属性

x:ノードの特徴量

y:ラベル

z:原子番号

edge_attr:エッジ特徴量(結合次数)

edge_index:エッジリスト

pos:3Dグリッドでの各原子の位置

In [42]:
#https://www.graphcore.ai/posts/getting-started-with-pytorch-geometric-pyg-on-graphcore-ipus

# GCN
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        #self.conv1 = GCNConv(dataset.num_node_features, 32)
        self.conv1 = GCNConv(dataset.num_node_features, 32)
        self.conv2 = GCNConv(32, 32)
        self.linear1 = nn.Linear(16,1)
        self.out = nn.Linear(32, 1)
        #self.conv3 = GCNConv(32, dataset.num_classes) #num_classes:ラベルの数
    #バッチノルム(正則化)
    def forward(self, data):
        x, batch, edge_index, edge_attr = data.x, data.batch, data.edge_index, data.edge_attr
        # Dropout:一定割合のノードを不活性化(0になる)させ、過学習を緩和する。pはゼロになるノードの確率で、0.5がデフォルト。
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = torch_geometric.nn.global_add_pool(x, batch) #これが必要やった
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.out(x)
        return x

class GCN_3(torch.nn.Module):
    def __init__(self):
        super().__init__()
        #self.conv1 = GCNConv(dataset.num_node_features, 32)
        self.conv1 = GCNConv(dataset.num_node_features, 32)
        self.conv2 = GCNConv(32, 32)
        self.conv3 = GCNConv(32, 32)
        self.linear1 = nn.Linear(16,1)
        self.out = nn.Linear(32, 1)
        #self.conv3 = GCNConv(32, dataset.num_classes) #num_classes:ラベルの数
    #バッチノルム(正則化)
    def forward(self, data):
        x, batch, edge_index, edge_attr = data.x, data.batch, data.edge_index, data.edge_attr
        # Dropout:一定割合のノードを不活性化(0になる)させ、過学習を緩和する。pはゼロになるノードの確率で、0.5がデフォルト。
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        x = torch_geometric.nn.global_add_pool(x, batch) #これが必要やった
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.out(x)
        return x

In [3]:
class GNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1_net = torch.nn.Sequential(nn.Linear(dataset.num_edge_features, 32),
                                            nn.ReLU(),
                                            nn.Linear(32, dataset.num_node_features*32))
        self.conv2_net = torch.nn.Sequential(nn.Linear(dataset.num_edge_features, 32),
                                            nn.ReLU(),
                                            nn.Linear(32, 32*16))        
        self.conv1 = NNConv(dataset.num_node_features, 32, self.conv1_net)
        self.conv2 = NNConv(32, 16, self.conv2_net)
        self.linear1 = torch.nn.Linear(16, 32)
        self.out = nn.Linear(32,1)
    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = F.relu(x)
        x = torch_geometric.nn.global_add_pool(x, batch)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.out(x)
        return x

In [44]:
#データの分割(total: 130831)
num_train, num_val = int(len(dataset)*0.6), int(len(dataset)*0.2)
num_test = len(dataset) - (num_train + num_val)

train_set, valid_set, test_set = random_split(dataset, [num_train, num_val, num_test])

#Dataloaderの生成
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=32, shuffle=True)
test_loader = DataLoader(test_set, batch_size=32, shuffle=True)

# GNNの初期化
model = GCN_3()
#model = GNNModel()
# 損失関数
criterion = F.mse_loss
# Optimizerの初期化
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [45]:
# 学習したいラベルのインデックス位置
target_idx = 1

for epoch in range(50):
    # train
    model.train()
    train_loss = 0
    total_graphs = 0
    for batch in train_loader:
        batch = batch.to("cpu")
        optimizer.zero_grad()
        prediction = model(batch)
        loss = torch.sqrt(criterion(prediction, batch.y[:, target_idx].unsqueeze(1)))
        loss.backward()
        train_loss += loss.item()
        total_graphs += batch.num_graphs
        optimizer.step()
    train_loss /= math.sqrt(total_graphs) #損失の平均

    # validation
    model.eval()
    valid_loss = 0
    total_graphs = 0
    with torch.inference_mode(): # 自動微分無効。torch.no_grad()よりさらに高速化
        for batch in valid_loader:
            prediction = model(batch)
            loss = torch.sqrt(criterion(prediction, batch.y[:, target_idx].unsqueeze(1)))
            valid_loss += loss.item()
            total_graphs += batch.num_graphs
    valid_loss /= math.sqrt(total_graphs)

    print(f"Epoch {epoch+1} | train_loss:{train_loss}, valid_loss:{valid_loss}")
    #なんかvalid_lossのほうが小さい

Epoch 1 | train_loss:65.4039874082681, valid_loss:18.62198985430231
Epoch 2 | train_loss:57.933457065797214, valid_loss:12.317043181037585
Epoch 3 | train_loss:53.82730070268863, valid_loss:13.697528622156096
Epoch 4 | train_loss:50.657058423137435, valid_loss:13.777025634297116
Epoch 5 | train_loss:47.89562332669134, valid_loss:14.104269056742627
Epoch 6 | train_loss:45.96767395948817, valid_loss:13.994373593543704
Epoch 7 | train_loss:43.97539488080315, valid_loss:14.786800212294732
Epoch 8 | train_loss:43.123820455739576, valid_loss:23.397368051810307
Epoch 9 | train_loss:41.71788068769881, valid_loss:17.344646799014814
Epoch 10 | train_loss:41.21392678059878, valid_loss:15.739677071111618
Epoch 11 | train_loss:40.61787937327033, valid_loss:16.48979098962544
Epoch 12 | train_loss:40.34612042065677, valid_loss:18.066754609918384
Epoch 13 | train_loss:40.06238893775574, valid_loss:16.677697632402555
Epoch 14 | train_loss:39.868636586507, valid_loss:18.20998824153071
Epoch 15 | train_l

In [34]:
F.mse_loss(torch.Tensor([1,1,1]),torch.Tensor([1,1,2])).item()

0.3333333432674408

In [12]:
torch.Tensor(1)

tensor([1.8560e-35])

lossの計算がおかしい　batchかグラフが多いほど誤差が大きくなるようになってる？

In [191]:
dataset.

QM9(130831)

In [182]:
batch.num_graphs


32

52.97473512632437

In [134]:
type(train_loss)

int

In [132]:
sqrt

NameError: name 'sqrt' is not defined

In [105]:
prediction[:, target_idx]

tensor([82.1755, 81.4774, 58.7662, 66.0292, 73.3747, 88.9727, 74.6518, 71.2338,
        74.3432, 69.8561, 79.1986, 86.8198, 62.3695, 73.2541, 77.7144, 81.5433,
        67.5248, 73.8379, 74.6417, 78.3584, 73.6930, 74.0313, 72.8392, 81.3193,
        59.6216, 81.7011, 75.3242, 62.2873, 81.1038, 69.8394, 61.3492, 81.1513],
       grad_fn=<SelectBackward0>)

In [106]:
batch.y[:, target_idx]

tensor([78.9400, 68.8500, 79.3400, 85.9600, 77.0400, 80.6700, 64.5700, 78.6200,
        69.5200, 81.2700, 70.3800, 76.9300, 70.3300, 62.9300, 76.5500, 65.3100,
        89.1800, 82.7900, 80.0900, 75.8200, 50.7900, 85.8600, 85.0100, 61.2400,
        87.6700, 94.2200, 76.3200, 68.6600, 75.3300, 70.3300, 72.0200, 76.4000])

In [None]:
model.eval() # 評価開始
predictions = []
real = []
for batch in test_loader:
    output = model(batch.to("cpu"))
    predictions.append(output.detach().cpu().numpy())
    real.append(batch.y[:,target_idx].detach().cpu().numpy())
real = np.concatenate(real)
predictions = np.concatenate(predictions)

plt.scatter(real, predictions)
plt.ylabel('Predicted')
plt.xlabel('real')
plt.show()


Creating Your Own Datasets

https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html

https://qiita.com/maskot1977/items/4aa6322459eb3a78955f


Datasets

https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html


TORCH.NN.FUNCTIONAL

https://pytorch.org/docs/stable/nn.functional.html


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

# エタンのグラフ構造の作成
mol = Chem.MolFromSmiles("CC")
mol = Chem.AddHs(mol)
atoms = mol.GetAtoms()
bonds = mol.GetBonds()
bonds[0].GetEndAtomIdx()

edge_list = []
for bond in bonds:
    edge_list.append([bond.GetBeginAtomIdx(),bond.GetEndAtomIdx()])
    edge_list.append([bond.GetEndAtomIdx(),bond.GetBeginAtomIdx()])
edge_index = torch.tensor(edge_list) #エッジのリスト作成
x = torch.tensor([[atom.GetAtomicNum()] for atom in atoms]) # 原子番号

edge_attr = []
for bond in bonds:
    edge_attr.append([])
data = Data(x=x, edge_index=edge_index.t().contiguous())
data



In [None]:
# グラフ構造の可視化
import networkx
from matplotlib import pyplot as plt
import numpy as np
from torch_geometric.utils import to_networkx
from IPython.display import SVG, display
data = dataset[4921]
nxg = to_networkx(data)

pagerank = networkx.pagerank(nxg) #pagerankはノードの中心性(重要性の指標)
pagerank_max = np.array(list(pagerank.values())).max()

#可視化する時のノード位置
draw_position = networkx.spring_layout(nxg,seed=0)

# 色指定
color_map = plt.get_cmap("tab10")
labels = data.x.numpy()
colors = [color_map(i) for i in labels]

svg = SVG(networkx.nx_agraph.to_agraph(nxg).draw(prog='fdp', format='svg'))
display(svg)

In [None]:
# 学習したいラベルのインデックス位置
target_idx = 1

for epoch in range(50):
    model.train() #訓練モード
    train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch.y[:, target_idx].unsqueeze(1))
        loss.backward()
        epoch_loss += loss.item()
        total_graphs += batch.num_graphs
        optimizer.step()
    
    train_avg_loss = epoch_loss / total_graphs
    val_loss = 0
    total_graphs = 0
    model.eval()
    for batch in valid_loader:
        output = model(batch)
        loss = criterion(output,batch.y[:, target_idx].unsqueeze(1)) #平方根で比較
        val_loss += loss.item()
        total_graphs += batch.num_graphs
    
    val_avg_loss = val_loss / total_graphs
    print(f"Epochs: {i} | epoch avg. loss: {train_avg_loss:.2f} | validation avg. loss: {val_avg_loss:.2f}")
    