In [6]:
import torch
import torch_geometric as pyg
from torch.nn import Parameter
import torch.nn.functional as F
from torch_geometric.datasets import QM9
from torch_geometric.nn import GCNConv, NNConv
from torch_geometric.nn.conv import GATv2Conv
from torch_geometric.nn.models import MLP
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import NormalizeFeatures
from torch.utils.data import random_split
import matplotlib.pyplot as plt
import numpy as np
import time
import matplotlib.pyplot as plt

dataset = QM9(root="./QM9") #Shuffleいらない？
#無向グラフの例
#edge_index = torch.tensor([[0,1,1,2],[1,0,2,1]], dtype=torch.long) # エッジの定義
#x = torch.tensor([[-1],[0],[1]], dtype=torch.float) # ノードの属性
#data = Data(x=x, edge_index=edge_index) # コンストラクタ
# Data(x=[3, 1], edge_index=[2, 4])

https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html

In [3]:
data = dataset[0]
data

Data(x=[5, 11], edge_index=[2, 8], edge_attr=[8, 4], y=[1, 19], pos=[5, 3], z=[5], name='gdb_1', idx=[1])

QM9の属性

x:ノードの特徴量(原子数×特徴量数=11)

y:ラベル(ラベル数)

z:原子番号(原子数)

edge_attr:エッジ特徴量=結合次数(エッジ数×結合次数)

edge_index:エッジリスト(2×エッジ数)

pos:3Dグリッドでの各原子の位置(原子数×3)

正則化の手法
・L1正則化(重み減衰)
・L2正則化(重み減衰)
・Dropout
・ラベル平滑化
・バッチ正則化
・

In [45]:
#https://www.graphcore.ai/posts/getting-started-with-pytorch-geometric-pyg-on-graphcore-ipus

# GCN
#NNでは64層くらい使ってる場合もある
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        #self.conv1 = GCNConv(dataset.num_node_features, 32)
        self.conv1 = GCNConv(dataset.num_node_features, 32)
        self.conv2 = GCNConv(32, 32)
        self.linear1 = nn.Linear(16,1)
        self.out = nn.Linear(32, 1)
        #self.conv3 = GCNConv(32, dataset.num_classes) #num_classes:ラベルの数
    #バッチノルム(正則化)
    def forward(self, data):
        x, batch, edge_index, edge_attr = data.x, data.batch, data.edge_index, data.edge_attr
        # Dropout:一定割合のノードを不活性化(0になる)させ、過学習を緩和する。pはゼロになるノードの確率で、0.5がデフォルト。
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = torch_geometric.nn.global_add_pool(x, batch) #これが必要やった
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.out(x)
        return x

class GCN_N(torch.nn.Module):
    def __init__(self, layer:int, dim=32):
        super().__init__()
        self.layer = layer
        self.dim = dim
        self.conv1 = GCNConv(dataset.num_node_features, self.dim)
        self.convn = GCNConv(self.dim, self.dim)
        self.out = pyg.nn.Linear(self.dim, 1)

    def forward(self, data):
        x, batch, edge_index, edge_attr = data.x, data.batch, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        for i in range(2, self.layer + 1):
            x = self.convn(x, edge_index)
            x = F.relu(x)
        x = pyg.nn.global_add_pool(x, batch) 
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.out(x)
        return x

class GATv2_N(torch.nn.Module):
    def __init__(self, layer:int, dim=32):
        super().__init__()
        self.layer = layer
        self.dim = dim
        self.conv1 = GATv2Conv(dataset.num_node_features, self.dim)
        self.convn = GATv2Conv(self.dim, self.dim)
        self.out = pyg.nn.Linear(self.dim, 1)

    def forward(self, data):
        x, batch, edge_index, edge_attr = data.x, data.batch, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        for i in range(2, self.layer + 1):
            x = self.convn(x, edge_index)
            x = F.relu(x)
        x = pyg.nn.global_add_pool(x, batch) 
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.out(x)
        return x

In [None]:
GATv2Conv

In [None]:
class GNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1_net = torch.nn.Sequential(nn.Linear(dataset.num_edge_features, 32),
                                            nn.ReLU(),
                                            nn.Linear(32, dataset.num_node_features*32))
        self.conv2_net = torch.nn.Sequential(nn.Linear(dataset.num_edge_features, 32),
                                            nn.ReLU(),
                                            nn.Linear(32, 32*16))        
        self.conv1 = NNConv(dataset.num_node_features, 32, self.conv1_net)
        self.conv2 = NNConv(32, 16, self.conv2_net)
        self.linear1 = torch.nn.Linear(16, 32)
        self.out = nn.Linear(32,1)
    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = F.relu(x)
        x = torch_geometric.nn.global_add_pool(x, batch)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.out(x)
        return x

In [None]:
# ECFP
# https://qiita.com/kimisyo/items/55a01e27aa03852d84e9
# https://pubs.acs.org/doi/10.1021/acsomega.1c01266
# https://pubs.acs.org/doi/10.1021/acs.jcim.0c01208

import pandas as pd
df = pd.read_csv("./qm9_dataset.csv")

from rdkit import Chem
from rdkit.Chem import AllChem, Draw
import numpy as np

def ECFPGen(smiles, radius=3, nBits=12):
    mol = Chem.MolFromSmiles(smiles)
    bit_morgan1 = {}
    fp1 = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits, bitInfo=bit_morgan1)
    bit1 = list(fp1)
    return bit1

df["ECFP"] =  [ECFPGen(smiles, radius=2, nBits=2048) for smiles in df["smiles"]]

In [None]:
# dataset作成
polar = df["alpha"]
ECFP = df["ECFP"]


In [None]:
df.describe()

In [None]:
if [0,0,0]:
    print("a")

In [None]:
#dipole予測　＋　ECFPのみ　＋　ECFP,dipole
#層とdimentionを増やすとどう変わるか
#NN:３層以上 層を増やすより隠れ層を増やすほうが良さそう

In [46]:
#データの分割(total: 130831)
num_train, num_val = int(len(dataset)*0.6), int(len(dataset)*0.2)
num_test = len(dataset) - (num_train + num_val)
batch_size = 32

# 乱数の固定
device = torch.device("cpu")
seed = 0
pyg.seed_everything(seed=seed)
"""
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
"""
train_set, valid_set, test_set = random_split(dataset, [num_train, num_val, num_test])

#Dataloaderの生成
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, worker_init_fn=pyg.seed_everything(seed))
valid_loader = DataLoader(valid_set, batch_size=batch_size, worker_init_fn=pyg.seed_everything(seed))
test_loader = DataLoader(test_set, batch_size=batch_size, worker_init_fn=pyg.seed_everything(seed))

layer = 5
dim = 32
model = GCN_N(layer=layer, dim=dim) 
# 損失関数
criterion = F.mse_loss
# Optimizerの初期化
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01, weight_decay=5e-4)
#optimizer = torch.optim.Adam(params=lr=0.01, weight_decay=5e-4)



In [None]:
GATv2Conv()

In [47]:
# 学習したいラベルのインデックス位置
target_idx = 1
loss_two_50 = []
start = time.time() #時間計測開始
for epoch in range(50):
    # train
    model.train()
    train_loss = 0
    total_graphs = 0
    for batch in train_loader:
        batch = batch.to("cpu")
        optimizer.zero_grad()
        prediction = model(batch)
        loss = torch.sqrt(criterion(prediction, batch.y[:, target_idx].unsqueeze(1)))
        loss.backward()
        train_loss += loss.item()
        total_graphs += batch.num_graphs
        optimizer.step()
    train_loss /=  len(train_loader) #損失の平均(batchあたり)

    # validation
    model.eval()
    valid_loss = 0
    total_graphs = 0
    with torch.inference_mode(): # 自動微分無効。torch.no_grad()よりさらに高速化
        for batch in valid_loader:
            prediction = model(batch)
            loss = torch.sqrt(criterion(prediction, batch.y[:, target_idx].unsqueeze(1)))
            valid_loss += loss.item()
            total_graphs += batch.num_graphs
    valid_loss /= len(valid_loader)

    print(f"Epoch {epoch+1} | train_loss:{train_loss}, valid_loss:{valid_loss}")
    #loss_three_50.append({"Epoch":epoch + 1 , "train_loss":train_loss, "valid_loss":valid_loss})
used_time = time.time() - start
"""
loss_two_50.append(
    {
        "model": "GCN",
        "layer": layer,
        "dim": dim,
        "batch_size": 32,
        "loss": "RMSE",
        "lr": 0.01,
        "decay": 5e-4,
        "seed": seed,
        "data_split":[
            0.6,
            0.6,
            0.2
        ],
        "time": round(used_time, 4)
    }
)
"""

Epoch 1 | train_loss:10.25600228682707, valid_loss:3.590306756257428
Epoch 2 | train_loss:8.320184898259878, valid_loss:4.613319931519935
Epoch 3 | train_loss:7.289234750136769, valid_loss:2.8820002542439767
Epoch 4 | train_loss:6.332641606323099, valid_loss:3.3591150989742324
Epoch 5 | train_loss:5.660093116002444, valid_loss:3.097269449665377
Epoch 6 | train_loss:5.305509730967061, valid_loss:3.2060100251130197
Epoch 7 | train_loss:5.058760640673649, valid_loss:3.7497190406095138
Epoch 8 | train_loss:4.883322293911897, valid_loss:3.692441489702332
Epoch 9 | train_loss:4.790022389570095, valid_loss:4.097221960648348
Epoch 10 | train_loss:4.6485292300666945, valid_loss:3.4444717740079884
Epoch 11 | train_loss:4.594971790290404, valid_loss:3.711006890299268
Epoch 12 | train_loss:4.559684370578355, valid_loss:3.9429823072731933
Epoch 13 | train_loss:4.586165953284857, valid_loss:3.8241416178006125
Epoch 14 | train_loss:4.56323061488757, valid_loss:3.6345318223561516
Epoch 15 | train_loss

'\nloss_two_50.append(\n    {\n        "model": "GCN",\n        "layer": layer,\n        "dim": dim,\n        "batch_size": 32,\n        "loss": "RMSE",\n        "lr": 0.01,\n        "decay": 5e-4,\n        "seed": seed,\n        "data_split":[\n            0.6,\n            0.6,\n            0.2\n        ],\n        "time": round(used_time, 4)\n    }\n)\n'

In [None]:
loss_three_50

In [None]:
loss_two_50 = loss_three_50[51:]

loss_two_50.append(
    {
        "model": "GCN",
        "layer": layer,
        "dim": dim,
        "batch_size": 32,
        "loss": "RMSE",
        "lr": 0.01,
        "decay": 5e-4,
        "seed": seed,
        "data_split":[
            0.6,
            0.6,
            0.2
        ],
        "time": round(used_time, 4)
    }
)

In [None]:
import json
with open("./loss_two_50.json", "a") as f:
    json.dump(loss_two_50, f, indent=4)

In [None]:
with open("./loss_two_50.json", "r") as f:
    two = json.load(f)
dim16 = two[1][:-1]
dim32 = two[0][:-1]

In [None]:
#2層
epoch = [i for i in range(1, 51)] 

plt.subplot(121)
plt.plot(epoch, [i["train_loss"] for i in dim16])
plt.plot(epoch, [i["valid_loss"] for i in dim16])
plt.title("dim=16")
plt.ylim(0,14)
plt.subplot(122)
plt.plot(epoch, [i["train_loss"] for i in dim32])
plt.plot(epoch, [i["valid_loss"] for i in dim32])
plt.title("dim=32")
plt.ylim(0,14)

In [None]:
dim16

In [None]:
# お前はもう必要ない
import re
def parser(text):
    text = text.split("\n")
    loss = [{"train_loss":re.sub("train_loss:", "", i.split("| ")[1].split(",")[0]), "valid_loss":re.sub(" valid_loss:", "", i.split("| ")[1].split(",")[1])}for i in text if i]
    train_loss = [float(i["train_loss"]) for i in loss]
    valid_loss = [float(i["valid_loss"]) for i in loss]
    return np.array([train_loss, valid_loss])
loss_two = parser(two_layers)
loss_three = parser(theree_layers)

In [None]:
# 2層
import matplotlib.pyplot as plt
import math
epoch = [i for i in range(1, len(loss_two[0]) + 1)]
plt.subplot(1,2,1) 
loss_two = np.log(loss_two)
plt.plot(epoch, loss_two[0])
plt.subplot(1,2,2)
plt.plot(epoch, loss_two[1])

In [None]:
# 3層
import matplotlib.pyplot as plt
epoch = [i for i in range(1, len(loss_three[0]) + 1)]
loss_three = np.log(loss_three)
plt.subplot(1,2,1)
plt.plot(epoch, loss_three[0])
plt.subplot(1,2,2)
plt.plot(epoch, loss_three[1])

In [None]:
model.eval() # 評価開始
predictions = []
real = []
for batch in test_loader:
    output = model(batch.to("cpu"))
    predictions.append(output.detach().cpu().numpy())
    real.append(batch.y[:,target_idx].detach().cpu().numpy())
real = np.concatenate(real)
predictions = np.concatenate(predictions)

plt.scatter(real, predictions)
plt.ylabel('Predicted')
plt.xlabel('real')
plt.show()


Creating Your Own Datasets

https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html

https://qiita.com/maskot1977/items/4aa6322459eb3a78955f


Datasets

https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html


TORCH.NN.FUNCTIONAL

https://pytorch.org/docs/stable/nn.functional.html


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

# エタンのグラフ構造の作成
mol = Chem.MolFromSmiles("CC")
mol = Chem.AddHs(mol)
atoms = mol.GetAtoms()
bonds = mol.GetBonds()
bonds[0].GetEndAtomIdx()

edge_list = []
for bond in bonds:
    edge_list.append([bond.GetBeginAtomIdx(),bond.GetEndAtomIdx()])
    edge_list.append([bond.GetEndAtomIdx(),bond.GetBeginAtomIdx()])
edge_index = torch.tensor(edge_list) #エッジのリスト作成
x = torch.tensor([[atom.GetAtomicNum()] for atom in atoms]) # 原子番号

edge_attr = []
for bond in bonds:
    edge_attr.append([])
data = Data(x=x, edge_index=edge_index.t().contiguous())
data



In [None]:
# グラフ構造の可視化
import networkx
from matplotlib import pyplot as plt
import numpy as np
from torch_geometric.utils import to_networkx
from IPython.display import SVG, display
data = dataset[4921]
nxg = to_networkx(data)

pagerank = networkx.pagerank(nxg) #pagerankはノードの中心性(重要性の指標)
pagerank_max = np.array(list(pagerank.values())).max()

#可視化する時のノード位置
draw_position = networkx.spring_layout(nxg,seed=0)

# 色指定
color_map = plt.get_cmap("tab10")
labels = data.x.numpy()
colors = [color_map(i) for i in labels]

svg = SVG(networkx.nx_agraph.to_agraph(nxg).draw(prog='fdp', format='svg'))
display(svg)

In [None]:
# 学習したいラベルのインデックス位置
target_idx = 1

for epoch in range(50):
    model.train() #訓練モード
    train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch.y[:, target_idx].unsqueeze(1))
        loss.backward()
        epoch_loss += loss.item()
        total_graphs += batch.num_graphs
        optimizer.step()
    
    train_avg_loss = epoch_loss / total_graphs
    val_loss = 0
    total_graphs = 0
    model.eval()
    for batch in valid_loader:
        output = model(batch)
        loss = criterion(output,batch.y[:, target_idx].unsqueeze(1)) #平方根で比較
        val_loss += loss.item()
        total_graphs += batch.num_graphs
    
    val_avg_loss = val_loss / total_graphs
    print(f"Epochs: {i} | epoch avg. loss: {train_avg_loss:.2f} | validation avg. loss: {val_avg_loss:.2f}")
    