In [9]:
import torch
import torch_geometric as pyg
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch_geometric.utils import one_hot, scatter
from torch.utils.data import Dataset, DataLoader
from torch_geometric.data import Data, InMemoryDataset
from torch_geometric.datasets import QM9
from torch_geometric.nn import GCNConv, NNConv
from torch_geometric.nn.conv import GATv2Conv, GATConv, TransformerConv
from torch_geometric.nn.models import MLP
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import NormalizeFeatures
from torch.utils.data import random_split
import matplotlib.pyplot as plt
import numpy as np
import time
import pandas as pd
import rdkit
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem
from rdkit.Chem.rdchem import BondType, HybridizationType
import os
import matplotlib.pyplot as plt
import pickle
import time

# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# QM9 dataset object rooted at ./QM9.
dataset = QM9(root="./QM9")
#無向グラフの例
#edge_index = torch.tensor([[0,1,1,2],[1,0,2,1]], dtype=torch.long) # エッジの定義
#x = torch.tensor([[-1],[0],[1]], dtype=torch.float) # ノードの属性
#data = Data(x=x, edge_index=edge_index) # コンストラクタ
# Data(x=[3, 1], edge_index=[2, 4])

# Cheatsheet
# https://pytorch-geometric.readthedocs.io/en/latest/notes/cheatsheet.html

AttributeError: 'QM9' object has no attribute 'to'

In [None]:
#類似度予測

def ECFPGen(smiles, radius=4, nBits=2048):
    """Return the Morgan bit-vector fingerprint of a SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius forwarded to RDKit.
        nBits: length of the folded bit vector.

    Returns:
        The bit vector produced by AllChem.GetMorganFingerprintAsBitVect.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending input instead of deep inside RDKit.
        raise ValueError(f"could not parse SMILES: {smiles!r}")
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)

def npECFP(morgan):
    """Return the positions of the set bits of a fingerprint.

    The bit vector is first copied into a dense float array of length
    ``morgan.GetNumBits()``; the result is whatever ``np.nonzero`` returns for
    that array (a tuple holding one index array).
    """
    dense_bits = np.zeros(morgan.GetNumBits())
    rdkit.DataStructs.ConvertToNumpyArray(morgan, dense_bits)
    on_positions = np.nonzero(dense_bits)
    return on_positions

# ECFP
# 回帰の手法(https://chemrxiv.org/engage/chemrxiv/article-details/60c75208bdbb899737a3a1c2)
# MLP, kNN, KRR, SVM, RF, LightGBM, GBRT
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import random
import math
from sklearn.metrics import r2_score
# Seed Python's built-in RNG so `random`-based sampling is repeatable.
random.seed(0)

# QM9 table read from a local CSV (the cells below use its "smiles" and "alpha" columns).
df = pd.read_csv("./qm9_dataset.csv")

In [None]:
# ECFP
# 回帰の手法(https://chemrxiv.org/engage/chemrxiv/article-details/60c75208bdbb899737a3a1c2)
# MLP, kNN, KRR, SVM, RF, LightGBM, GBRT

# Fingerprints are read from the "QM9_ECFP" pickle instead of being recomputed:
#ECFP = [list(ECFPGen(smiles)) for smiles in df["smiles"].values]
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
with open("QM9_ECFP", "rb") as f:
    ECFP = pickle.load(f)
# Explanatory variables (X) and target variable (Y).
X = list(map(np.array, ECFP))
Y = df["alpha"].values  # "alpha" column (presumably the isotropic polarizability -- confirm against the CSV)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
# Takes about 1m54s.

In [None]:
# Fingerprint settings shared with the result-file name written later.
radius = 2
nBits = 2048
# One list of bits per molecule, in the row order of df["smiles"].
ECFP = [
    list(ECFPGen(smi, radius=radius, nBits=nBits))
    for smi in df["smiles"].values
]

In [None]:
hidden_layer_sizes = (100,)
# MLPRegressor always optimises the squared error, so no `loss` attribute needs to be set.
mlp = MLPRegressor(max_iter=10000, activation="relu", solver="adam", verbose=True, hidden_layer_sizes=hidden_layer_sizes, early_stopping=True)
X = [np.array(i) for i in ECFP]
Y = df["alpha"].values  # alpha: polarizability, idx=1
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
r2 = []  # placeholder; not filled in this cell

# Time the fit only.
start = time.time()
mlp.fit(X_train, Y_train)
end = time.time()
time_diff = end - start

# scikit-learn's squared loss is divided by 2, so multiply by 2 before taking the square root.
rmse = [math.sqrt(loss * 2) for loss in mlp.loss_curve_]
# NOTE(review): `result` is assembled but only mlp.loss_curve_ is pickled below
# (the plotting cell expects a flat curve) -- confirm which one should be stored.
result = [rmse, mlp.validation_scores_, time_diff]

# Make sure the output directory exists before writing into it.
os.makedirs("results", exist_ok=True)
with open(f"results/ECFP_alpha_{radius}_{nBits}_{hidden_layer_sizes}", "wb") as f:
    pickle.dump(mlp.loss_curve_, f)


In [None]:
# Temporary: figure for the Word write-up.
# Loads the pickled loss curve written by the MLP cell and plots it against its index.
with open("results/ECFP_alpha_2_2048_(100,)", "rb") as f:
    test = pickle.load(f)
plt.plot(test)

In [None]:
# Scratch cell: builds a fresh (unfitted) regressor, replacing any fitted `mlp`,
# and displays its `loss` attribute.
mlp = MLPRegressor(max_iter=1000, activation="relu", solver="adam", verbose=True, hidden_layer_sizes=hidden_layer_sizes)
mlp.loss 

In [None]:
# PyTorch ECFP
# X is a Python list of ndarrays; stack it into one ndarray first, because
# building a tensor directly from such a list is very slow.
X_tensor = torch.tensor(np.array(X))
Y_tensor = torch.tensor(Y)

In [None]:
# SVR
#SVM = LinearSVR(loss="squared_epsilon_insensitive", random_state=0, max_iter=10000)
SVM = SVR(kernel="linear", max_iter=10000)
SVM.fit(X_train, Y_train)
Y_pred = SVM.predict(X_test)
# RMSE on the held-out split. The root is taken explicitly because the
# `squared=False` keyword is deprecated in recent scikit-learn releases.
error = mean_squared_error(Y_test, Y_pred) ** 0.5
error


In [None]:
# Single hidden layer with 64 units, fixed seed.
mlp = MLPRegressor(hidden_layer_sizes=64, max_iter=1000, activation="relu", solver="adam", random_state=0)
mlp.fit(X_train, Y_train)
Y_pred = mlp.predict(X_test)
# RMSE on the held-out split; explicit root instead of the deprecated `squared=False`.
error = mean_squared_error(Y_test, Y_pred) ** 0.5
error

In [None]:
import matplotlib.pyplot as plt
# Sweep the hidden-layer width over 2**1 .. 2**9 and record the test RMSE.
# NOTE(review): the split is fixed to the first 80 samples for training and
# everything else for testing -- presumably a quick experiment; confirm.
# The slices do not depend on the loop variable, so they are taken once.
train_X, train_Y = X[:80], Y[:80]
test_X, test_Y = X[80:], Y[80:]

errors = []
for i in range(1, 10):
    hidden_layer_sizes = 2**i
    mlp = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, max_iter=100000, activation="relu", solver="adam", random_state=0)
    mlp.fit(train_X, train_Y)
    pred_Y = mlp.predict(test_X)
    # Explicit root instead of the deprecated `squared=False` keyword.
    error = mean_squared_error(test_Y, pred_Y) ** 0.5
    errors.append((i, error))

In [None]:
# errors holds (exponent, RMSE) pairs from the width sweep.
exponents = [pair[0] for pair in errors]
rmse_values = [pair[1] for pair in errors]
plt.plot(exponents, rmse_values)
# The sweep varies the number of units in one hidden layer, not the number of layers.
plt.xlabel("hidden layer size (2^x units)")
plt.ylabel("RMSE")
plt.title("max_iter=100000, adam, relu")

In [None]:
# Reload the QM9 table from the CSV file.
df = pd.read_csv("./qm9_dataset.csv")
# Builds a list of the "smiles" column; the value is discarded because it is not the cell's last expression.
list(df["smiles"])
# NOTE(review): `atomrefs` is not defined anywhere in the visible notebook, so this
# presumably raises NameError on a fresh kernel -- confirm or remove.
atomrefs

In [None]:
# NOTE(review): `MyFirstDataset` is not defined in the visible notebook (presumably a
# custom dataset class from a removed cell). This also rebinds the global `dataset`.
dataset = MyFirstDataset(root="MyFirstDataset")

In [None]:
# Turn each CSV row into a float tensor built from the columns at index 2 onward.
# NOTE(review): `target` is overwritten on every iteration, so only the last row's
# tensor survives the loop -- presumably unfinished; confirm.
for name, i in df.iterrows():
    target = [float(x) for x in i.values[2:]]
    target = torch.tensor(target, dtype=torch.float)
    #target = torch.cat([target[:, 3:], target[:, 3:]], dim=0)
    #target = target * conversion.view(1, -1)


https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html

QM9の属性

x:ノードの特徴量(原子数×特徴量数=11)

y:ラベル(ラベル数)

z:原子番号(原子数)

edge_attr:エッジ特徴量=結合次数(エッジ数×結合次数)

edge_index:エッジリスト(2×エッジ数)

pos:3Dグリッドでの各原子の位置(原子数×3)

正則化の手法
・L1正則化(Lasso、重みのスパース化)
・L2正則化(重み減衰)
・Dropout
・ラベル平滑化
・バッチ正規化(Batch Normalization)

In [10]:
#https://www.graphcore.ai/posts/getting-started-with-pytorch-geometric-pyg-on-graphcore-ipus

# GCN
#NNでは64層くらい使ってる場合もある
class GCN(torch.nn.Module):
    """Two-layer GCN followed by sum pooling and a linear read-out.

    Reads the node-feature width from the notebook-global ``dataset`` and
    produces one scalar per graph in the batch.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 32)
        self.conv2 = GCNConv(32, 32)
        # Not used in forward(); kept so the module's attributes and parameter set stay unchanged.
        self.linear1 = nn.Linear(16,1)
        self.out = nn.Linear(32, 1)
        #self.conv3 = GCNConv(32, dataset.num_classes)  # num_classes: number of labels

    # TODO: batch norm (regularization)
    def forward(self, data):
        """Return the output of the final linear layer applied to the pooled node embeddings.

        ``data`` must provide ``x``, ``edge_index`` and ``batch``; edge
        attributes are not used by this model.
        """
        x, batch, edge_index = data.x, data.batch, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        # Sum node embeddings per graph. The package is imported as `pyg`;
        # the bare name `torch_geometric` is never bound in this notebook.
        x = pyg.nn.global_add_pool(x, batch)
        # Dropout would zero a fraction p of activations to reduce overfitting; currently disabled.
        #x = F.dropout(x, p=0.2, training=self.training)
        x = self.out(x)
        return x

class GCN_N(torch.nn.Module):
    """GCN whose depth is chosen at construction time.

    Args:
        layer: total number of graph-convolution applications.
        dim: hidden width of every convolution.
        dataset: object providing ``num_node_features`` (defaults to the
            notebook-global ``dataset`` as bound when the class was defined).

    Note: a single ``convn`` module is applied for every convolution after
    the first, so those steps share one set of weights.
    """

    def __init__(self, layer:int, dim=32, dataset=dataset):
        super().__init__()
        self.layer = layer
        self.dataset = dataset
        self.dim = dim
        in_width = self.dataset.num_node_features
        self.conv1 = GCNConv(in_width, self.dim, improved=True)
        self.convn = GCNConv(self.dim, self.dim, improved=True)
        self.out = pyg.nn.Linear(self.dim, 1)

    def forward(self, data):
        """Return the output of the final linear layer applied to the pooled node embeddings."""
        edge_index, batch = data.edge_index, data.batch
        h = F.relu(self.conv1(data.x, edge_index))
        # layer - 1 further applications of the shared convolution.
        for _ in range(self.layer - 1):
            h = F.relu(self.convn(h, edge_index))
        pooled = pyg.nn.global_add_pool(h, batch)
        #pooled = F.dropout(pooled, p=0.2, training=self.training)
        return self.out(pooled)

class GATv2_N(torch.nn.Module):
    """GATv2 network with a configurable number of attention layers.

    Args:
        layer: total number of attention-layer applications.
        dim: hidden width of every layer.

    Note: a single ``convn`` module is applied for every layer after the
    first, so those steps share one set of weights.
    """

    def __init__(self, layer:int, dim=32):
        super().__init__()
        self.layer = layer
        self.dim = dim
        # forward() passes edge_attr, so the layers must be told the edge
        # feature width via edge_dim; without it GATv2Conv has no projection
        # for the edge features.
        edge_dim = dataset.num_edge_features
        self.conv1 = GATv2Conv(dataset.num_node_features, self.dim, edge_dim=edge_dim)
        self.convn = GATv2Conv(self.dim, self.dim, edge_dim=edge_dim)
        self.out = pyg.nn.Linear(self.dim, 1)

    def forward(self, data):
        """Return the output of the final linear layer applied to the pooled node embeddings."""
        x, batch, edge_index, edge_attr = data.x, data.batch, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        for i in range(1, self.layer):
            x = self.convn(x, edge_index, edge_attr)
            x = F.relu(x)
        x = pyg.nn.global_add_pool(x, batch)
        #x = F.dropout(x, p=0.2, training=self.training)
        x = self.out(x)
        return x

class GAT_N(torch.nn.Module):
    """GAT network with a configurable number of attention layers.

    Args:
        layer: total number of attention-layer applications.
        dim: hidden width of every layer.

    Note: a single ``convn`` module is applied for every layer after the
    first, so those steps share one set of weights.
    """

    def __init__(self, layer:int, dim=32):
        super().__init__()
        self.layer = layer
        self.dim = dim
        # forward() passes edge_attr; edge_dim is required for GATConv to
        # build the projection that lets the attention use those features.
        edge_dim = dataset.num_edge_features
        self.conv1 = GATConv(dataset.num_node_features, self.dim, edge_dim=edge_dim)
        self.convn = GATConv(self.dim, self.dim, edge_dim=edge_dim)
        self.out = pyg.nn.Linear(self.dim, 1)

    def forward(self, data):
        """Return the output of the final linear layer applied to the pooled node embeddings."""
        x, batch, edge_index, edge_attr = data.x, data.batch, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        for i in range(2, self.layer + 1):
            x = self.convn(x, edge_index, edge_attr)
            x = F.relu(x)
        x = pyg.nn.global_add_pool(x, batch)
        #x = F.dropout(x, p=0.2, training=self.training)
        x = self.out(x)
        return x

class trans_N(torch.nn.Module):
    """Graph-transformer network with a configurable number of layers.

    The previous body was a copy of the GCN model that passed ``edge_attr[0]``
    (a single edge's feature row) where GCNConv expects per-edge weights.
    TransformerConv, already imported by the notebook, accepts the full
    edge-feature matrix through ``edge_dim``.

    Args:
        layer: total number of convolution applications.
        dim: hidden width of every layer.

    Note: a single ``convn`` module is applied for every layer after the
    first, so those steps share one set of weights.
    """

    def __init__(self, layer:int, dim=32):
        super().__init__()
        self.layer = layer
        self.dim = dim
        edge_dim = dataset.num_edge_features
        self.conv1 = TransformerConv(dataset.num_node_features, self.dim, edge_dim=edge_dim)
        self.convn = TransformerConv(self.dim, self.dim, edge_dim=edge_dim)
        self.out = pyg.nn.Linear(self.dim, 1)

    def forward(self, data):
        """Return the output of the final linear layer applied to the pooled node embeddings."""
        x, batch, edge_index, edge_attr = data.x, data.batch, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        for i in range(2, self.layer + 1):
            x = self.convn(x, edge_index, edge_attr)
            x = F.relu(x)
        x = pyg.nn.global_add_pool(x, batch)
        #x = F.dropout(x, p=0.2, training=self.training)
        x = self.out(x)
        return x

In [11]:
class GNNModel(nn.Module):
    """Two NNConv layers, sum pooling and a two-layer read-out head.

    Each NNConv receives a small MLP (``conv1_net`` / ``conv2_net``) whose
    input width is the edge-feature width and whose output width is
    ``in_channels * out_channels`` of that convolution.
    Feature widths come from the notebook-global ``dataset``.
    """

    def __init__(self):
        super().__init__()
        node_width = dataset.num_node_features
        edge_width = dataset.num_edge_features
        wide, narrow = 32, 16

        self.conv1_net = nn.Sequential(
            nn.Linear(edge_width, wide),
            nn.ReLU(),
            nn.Linear(wide, node_width * wide),
        )
        self.conv2_net = nn.Sequential(
            nn.Linear(edge_width, wide),
            nn.ReLU(),
            nn.Linear(wide, wide * narrow),
        )
        self.conv1 = NNConv(node_width, wide, self.conv1_net)
        self.conv2 = NNConv(wide, narrow, self.conv2_net)
        self.linear1 = nn.Linear(narrow, wide)
        self.out = nn.Linear(wide, 1)

    def forward(self, data):
        """Return the output of the read-out head applied to the pooled node embeddings."""
        edge_index, edge_attr = data.edge_index, data.edge_attr
        h = F.relu(self.conv1(data.x, edge_index, edge_attr))
        h = F.relu(self.conv2(h, edge_index, edge_attr))
        pooled = pyg.nn.global_add_pool(h, data.batch)
        hidden = F.relu(self.linear1(pooled))
        return self.out(hidden)

class GNNModel_N(nn.Module):
    """NNConv model: two NNConv layers, sum pooling and a two-layer read-out head.

    NOTE(review): the architecture is currently identical to ``GNNModel``;
    presumably meant to become a variable-depth variant -- confirm.
    """

    def __init__(self):
        super().__init__()
        # Small MLPs passed to the NNConv layers; their output widths are
        # in_channels * out_channels of the respective convolution.
        self.conv1_net = torch.nn.Sequential(nn.Linear(dataset.num_edge_features, 32),
                                            nn.ReLU(),
                                            nn.Linear(32, dataset.num_node_features*32))
        self.conv2_net = torch.nn.Sequential(nn.Linear(dataset.num_edge_features, 32),
                                            nn.ReLU(),
                                            nn.Linear(32, 32*16))
        self.conv1 = NNConv(dataset.num_node_features, 32, self.conv1_net)
        self.conv2 = NNConv(32, 16, self.conv2_net)
        self.linear1 = torch.nn.Linear(16, 32)
        self.out = nn.Linear(32,1)

    def forward(self, data):
        """Return the output of the read-out head applied to the pooled node embeddings."""
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = F.relu(x)
        # The package is imported as `pyg`; the bare name `torch_geometric`
        # is never bound in this notebook and raised NameError here.
        x = pyg.nn.global_add_pool(x, batch)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.out(x)
        return x

In [5]:
# Split the data 80/10/10 (total: 130831)
num_train, num_val = int(len(dataset)*0.8), int(len(dataset)*0.1)
num_test = len(dataset) - (num_train + num_val)
batch_size = 32

# Fix the random seeds. The manual per-library calls this replaces were:
#   random.seed(seed)
#   np.random.seed(seed)
#   torch.manual_seed(seed)
#   torch.cuda.manual_seed_all(seed)
seed = 0
pyg.seed_everything(seed=seed)
train_set, valid_set, test_set = random_split(dataset, [num_train, num_val, num_test])

# The previous code passed `worker_init_fn=pyg.seed_everything(seed)`, which
# calls the function immediately and hands its return value (None) to the
# loader. The only effect was to re-seed the RNGs after the split; that is
# kept here explicitly so the shuffling order starts from the same state.
pyg.seed_everything(seed)

# Build the data loaders
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size)
test_loader = DataLoader(test_set, batch_size=batch_size)

#layer = 2
#dim = 32

In [13]:
# Index (column of data.y) of the label to train on, plus shared training settings.
from math import sqrt
from sklearn.metrics import r2_score

epoch_num = 50
target_idx = 1 # 0-based. NOTE(review): the original note said "0 -> polarizability"; in PyG's QM9 target table index 0 is the dipole moment and index 1 the isotropic polarizability -- confirm which is intended
#targets = ["Isotropic polarizability", "HOMO", "LUMO", "E_Gap", "Electronic spatial extent", "ZPVE", "U_0", "U", "H", "G", "Cv", "U_0 ATOM", "U ATOM", "H ATOM", "G ATOM", "A", "B", "C"]
#target_name = targets[target_idx]
start = time.time() # start of wall-clock timing
results = []

# Loss functions selectable in train(): mean squared error and mean absolute error.
mse = F.mse_loss
mae = F.l1_loss #mae

def train(criterion):
    """Train a fresh GCN_N model and return its per-epoch loss history.

    Reads the notebook globals ``layer``, ``dim``, ``epoch_num``,
    ``target_idx``, ``train_loader``, ``valid_loader``, ``device`` and ``mse``.

    Args:
        criterion: callable ``criterion(prediction, target)`` returning a
            scalar loss tensor. When it is the global ``mse``, the reported
            values are the square root of the mean per-batch loss.

    Returns:
        A new list with one dict per epoch holding the keys "Epoch",
        "train_loss" and "valid_loss".
    """
    # A fresh list per call: appending to the module-level `results` made
    # consecutive runs (different layer/dim settings) pile up in one history.
    history = []
    report_root = criterion == mse

    # Re-created on every call so each run starts from new weights.
    model = GCN_N(layer=layer, dim=dim).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01, weight_decay=5e-4)

    for epoch in range(epoch_num):
        # train
        model.train()
        train_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            prediction = model(batch)
            loss = criterion(prediction, batch.y[:, target_idx].unsqueeze(1))
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
        # Mean loss per batch; the root is taken after averaging.
        train_loss /= len(train_loader)
        if report_root:
            train_loss = sqrt(train_loss)

        # validation
        model.eval()
        valid_loss = 0
        with torch.inference_mode():  # autograd disabled
            for batch in valid_loader:
                batch = batch.to(device)
                prediction = model(batch)
                loss = criterion(prediction, batch.y[:, target_idx].unsqueeze(1))
                valid_loss += loss.item()
        valid_loss /= len(valid_loader)
        if report_root:
            valid_loss = sqrt(valid_loss)

        print(f"Epoch {epoch+1} | train_loss:{train_loss}, valid_loss:{valid_loss}")
        history.append({"Epoch":epoch+1, "train_loss":train_loss, "valid_loss":valid_loss})
    return history

# Full grid that this cell was reduced from:
#for layer in range(2,6):
#    for i in range(5,8):
#        dim = 2**i
for layer in range(4,5):
    for dim in [128]:
        print("RMSE")
        # Time one complete training run.
        start = time.time()
        results_mse = train(criterion=mse)
        end = time.time()
        time_diff = end - start
        # Stored as (per-epoch history, elapsed seconds).
        results_mse = (results_mse, time_diff)
        #print("")
        #print("MAE")
        #results_mae = train(criterion=mae)

        # NOTE(review): the file is labelled "dipole" while target_idx is 1;
        # confirm that the label matches the trained target. The name is
        # left unchanged because later cells load files by this name.
        target_name = "dipole"
        # Make sure the output directory exists before writing into it.
        os.makedirs("./results", exist_ok=True)
        # File name: target, number of layers, hidden width, number of epochs (load with pickle).
        with open(f"./results/GCN_{target_name}_{layer}_{dim}_{epoch_num}", "wb") as f:
            pickle.dump(results_mse, f)


RMSE
Epoch 1 | train_loss:4.00940383532039, valid_loss:2.4108854592902156
Epoch 2 | train_loss:2.4619924735415712, valid_loss:2.2811358342432304
Epoch 3 | train_loss:2.2578335386140123, valid_loss:2.0170335248332325
Epoch 4 | train_loss:2.0302743969865253, valid_loss:1.710263311597712
Epoch 5 | train_loss:1.924151710004786, valid_loss:1.937818917073988
Epoch 6 | train_loss:1.8247654948804979, valid_loss:1.547212273062886
Epoch 7 | train_loss:1.757406932542751, valid_loss:1.539213673384362
Epoch 8 | train_loss:1.7286815503760236, valid_loss:1.4938697441499929
Epoch 9 | train_loss:1.6792915832738438, valid_loss:1.7347841620653854
Epoch 10 | train_loss:1.6991740955156118, valid_loss:1.456029488344603
Epoch 11 | train_loss:1.6723103534945156, valid_loss:2.0334137950324753


KeyboardInterrupt: 

In [17]:
# Scratch check that a tensor can be moved to the GPU. FloatTensor(1, 2) is created
# without initial values, which is why the displayed contents are arbitrary.
torch.FloatTensor(1,2).to("cuda")

tensor([[-4.9069e+04,  4.5719e-41]], device='cuda:0')

In [None]:
# Load the pickled result written by the training sweep (layer=4, dim=128, 50 epochs) and display it.
with open("./results/GCN_dipole_4_128_50","rb") as f:
    a=pickle.load(f)
a

In [None]:
# Display the (history, elapsed time) tuple left by the most recent training run.
results_mse

In [None]:
# Load and display a pickled result file (named for layer=2, dim=32, 2 epochs).
with open("./results/GCN_dipole_2_32_2", "rb") as f:
    test1 = pickle.load(f)
test1

In [None]:
# np.ndarray() cannot be called without a shape (TypeError); create an empty
# one-dimensional array explicitly instead.
newarray = np.empty(0)

In [None]:
# ECFP
# https://qiita.com/kimisyo/items/55a01e27aa03852d84e9
# https://pubs.acs.org/doi/10.1021/acsomega.1c01266
# https://pubs.acs.org/doi/10.1021/acs.jcim.0c01208

import pandas as pd
df = pd.read_csv("./qm9_dataset.csv")

from rdkit import Chem
from rdkit.Chem import AllChem, Draw
import numpy as np

def ECFPGen(smiles, radius=3, nBits=12):
    """Return the Morgan fingerprint of a SMILES string as a list of bits.

    NOTE: this redefines the earlier ``ECFPGen`` in the notebook, which
    returned the RDKit bit vector itself rather than a list.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius forwarded to RDKit.
        nBits: length of the folded bit vector.

    Returns:
        ``list`` of the fingerprint's bits, of length ``nBits``.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError(f"could not parse SMILES: {smiles!r}")
    # The bitInfo dictionary that used to be requested here was never read.
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    return list(fingerprint)

# Add one fingerprint (radius 2, 2048 bits) per molecule as a new "ECFP" column.
df["ECFP"] =  [ECFPGen(smiles, radius=2, nBits=2048) for smiles in df["smiles"]]

In [None]:
# cross-validation
from sklearn.model_selection import KFold
from math import sqrt

target_idx = 1
num_epochs = 10
fold = KFold(n_splits=3, random_state=0, shuffle=True)
criterion = F.mse_loss

# The previous loop ignored the fold indices and kept training one shared
# model on the global train/valid loaders. Each fold now gets its own loaders
# built from its indices and a freshly initialised model and optimizer.
for fold_no, (train_idx, valid_idx) in enumerate(fold.split(np.arange(len(dataset)))):

    print(f"Fold {fold_no + 1}")

    fold_train_loader = DataLoader(
        dataset[torch.as_tensor(train_idx, dtype=torch.long)],
        batch_size=batch_size,
        shuffle=True,
    )
    fold_valid_loader = DataLoader(
        dataset[torch.as_tensor(valid_idx, dtype=torch.long)],
        batch_size=batch_size,
    )

    # Same architecture and optimizer settings as the single-split training run.
    # NOTE(review): layer=4 / dim=128 mirror that run -- adjust if a different model is intended.
    model = GCN_N(layer=4, dim=128).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01, weight_decay=5e-4)

    for epoch in range(num_epochs):
        # train
        model.train()
        train_loss = 0
        for batch in fold_train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            prediction = model(batch)
            loss = criterion(prediction, batch.y[:, target_idx].unsqueeze(1))
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
        # Mean loss per batch; the root is taken after averaging.
        train_loss /= len(fold_train_loader)
        train_loss = sqrt(train_loss)

        # validation
        model.eval()
        valid_loss = 0
        with torch.inference_mode():  # autograd disabled
            for batch in fold_valid_loader:
                batch = batch.to(device)
                prediction = model(batch)
                loss = criterion(prediction, batch.y[:, target_idx].unsqueeze(1))
                valid_loss += loss.item()
        valid_loss /= len(fold_valid_loader)
        valid_loss = sqrt(valid_loss)

        print(f"Epoch {epoch+1} | train_loss:{train_loss}, valid_loss:{valid_loss}")
            

In [None]:
# Scratch cell: load the file "pickle_test" and display its contents.
with open("pickle_test","rb") as f:
    pickle_test = pickle.load(f)
pickle_test


隠れ層の数は層によって変えるべき？

edge_attr:多次元

edge_weight:一次元

GCNはedge_weightのため、QM9のedge_attrが使えない。そのため、edge_attrなしでの計算になる

GATはedge_attrが使える。edge_attrの追加によって悪化した。
GAT2はedge_attrが使えない。






In [None]:
# Concrete predicted values next to their targets, as [prediction, target] pairs.
# Uses `prediction` and `batch` left over from the last batch processed by an earlier cell.
[[prediction[i].item(),batch.y[:, target_idx][i].item()] for i in range(len(prediction))]

In [None]:
# 2-layer models: train/valid loss per epoch, one panel per hidden width.
epoch = list(range(1, 51))

for panel, (history, label) in enumerate([(dim16, "dim=16"), (dim32, "dim=32")], start=1):
    plt.subplot(1, 2, panel)
    plt.plot(epoch, [row["train_loss"] for row in history])
    plt.plot(epoch, [row["valid_loss"] for row in history])
    plt.title(label)
    # Same y-range in both panels so the curves are directly comparable.
    plt.ylim(0,14)

In [None]:
# Display the history plotted as "dim=16" above.
# NOTE(review): `dim16` is not assigned anywhere in the visible notebook -- confirm where it comes from.
dim16

In [None]:
# お前はもう必要ない
import re
def parser(text):
    """Extract the loss values from a pasted training log.

    Every line containing ``train_loss:<value>, valid_loss:<value>`` (the
    format printed by the training loops) contributes one epoch. Lines that do
    not match -- blank lines, the "RMSE" header, a trailing traceback -- are
    skipped instead of raising.

    Args:
        text: the log as a single string with newline-separated lines.

    Returns:
        ``np.ndarray`` of shape (2, n_epochs): row 0 holds the training
        losses, row 1 the validation losses.
    """
    loss_line = re.compile(r"train_loss:\s*([^,\s]+)\s*,\s*valid_loss:\s*(\S+)")
    train_loss = []
    valid_loss = []
    for line in text.split("\n"):
        found = loss_line.search(line)
        if found is None:
            continue
        train_loss.append(float(found.group(1)))
        valid_loss.append(float(found.group(2)))
    return np.array([train_loss, valid_loss])
# Parse two training logs into (2, n_epochs) arrays.
# NOTE(review): `two_layers` and `theree_layers` (sic) are not defined in the visible
# notebook; presumably strings holding pasted training output -- confirm.
loss_two = parser(two_layers)
loss_three = parser(theree_layers)

In [None]:
# 2層
import matplotlib.pyplot as plt
import math
# Keep the raw losses untouched: overwriting loss_two with its own log made
# every re-run of this cell take the log of a log.
log_loss_two = np.log(loss_two)
epoch = list(range(1, len(loss_two[0]) + 1))
# Row 0 is the training loss, row 1 the validation loss.
plt.subplot(1, 2, 1)
plt.plot(epoch, log_loss_two[0])
plt.title("log(train_loss)")
plt.subplot(1, 2, 2)
plt.plot(epoch, log_loss_two[1])
plt.title("log(valid_loss)")

In [None]:
# 3層
import matplotlib.pyplot as plt
# Keep the raw losses untouched: overwriting loss_three with its own log made
# every re-run of this cell take the log of a log.
log_loss_three = np.log(loss_three)
epoch = list(range(1, len(loss_three[0]) + 1))
# Row 0 is the training loss, row 1 the validation loss.
plt.subplot(1, 2, 1)
plt.plot(epoch, log_loss_three[0])
plt.title("log(train_loss)")
plt.subplot(1, 2, 2)
plt.plot(epoch, log_loss_three[1])
plt.title("log(valid_loss)")

In [None]:
model.eval()  # switch to evaluation mode
# Run on whatever device the model's parameters live on instead of assuming the CPU.
eval_device = next(model.parameters()).device
predictions = []
real = []
# No gradients are needed for evaluation.
with torch.inference_mode():
    for batch in test_loader:
        batch = batch.to(eval_device)
        output = model(batch)
        predictions.append(output.cpu().numpy())
        real.append(batch.y[:, target_idx].cpu().numpy())
real = np.concatenate(real)
predictions = np.concatenate(predictions)

# Predicted versus true value for every test graph.
plt.scatter(real, predictions)
plt.ylabel('Predicted')
plt.xlabel('real')
plt.show()


Creating Your Own Datasets

https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html

https://qiita.com/maskot1977/items/4aa6322459eb3a78955f


Datasets

https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html


TORCH.NN.FUNCTIONAL

https://pytorch.org/docs/stable/nn.functional.html


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Build the graph structure of ethane, with explicit hydrogens.
mol = Chem.AddHs(Chem.MolFromSmiles("CC"))
atoms = mol.GetAtoms()
bonds = mol.GetBonds()
bonds[0].GetEndAtomIdx()

# Every bond contributes two directed edges, one per direction.
edge_list = []
for bond in bonds:
    begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
    edge_list.extend([[begin, end], [end, begin]])
edge_index = torch.tensor(edge_list)  # edge list
x = torch.tensor([[atom.GetAtomicNum()] for atom in atoms])  # atomic numbers

# One empty placeholder per bond; not passed to Data below.
edge_attr = [[] for _ in bonds]
# Data expects edge_index with shape (2, num_edges), hence the transpose.
data = Data(x=x, edge_index=edge_index.t().contiguous())
data



In [1]:
# グラフ構造の可視化
import networkx
from matplotlib import pyplot as plt
import numpy as np
from torch_geometric.utils import to_networkx
from IPython.display import SVG, display
# Draw one molecule graph from the dataset as an SVG.
# The recorded output of this cell is a NameError: it was executed as In [1],
# before `dataset` was created.
data = dataset[4921]
nxg = to_networkx(data)

# PageRank is a node-centrality (importance) score.
# NOTE(review): pagerank_max, draw_position and colors are computed but not used by the drawing call below.
pagerank = networkx.pagerank(nxg) 
pagerank_max = np.array(list(pagerank.values())).max()

# Node positions for drawing
draw_position = networkx.spring_layout(nxg,seed=0)

# Colour selection
# NOTE(review): `labels` is the full node-feature array, so the colormap is applied to
# each feature row rather than to a single label per node -- confirm the intent.
color_map = plt.get_cmap("tab10")
labels = data.x.numpy()
colors = [color_map(i) for i in labels]

svg = SVG(networkx.nx_agraph.to_agraph(nxg).draw(prog='fdp', format='svg'))
display(svg)

NameError: name 'dataset' is not defined

In [None]:
# Index of the label to train on
target_idx = 1

for epoch in range(50):
    model.train()  # training mode
    # Both accumulators are reset every epoch; previously `epoch_loss` and
    # `total_graphs` were incremented without ever being initialised.
    epoch_loss = 0.0
    total_graphs = 0
    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch.y[:, target_idx].unsqueeze(1))
        loss.backward()
        optimizer.step()
        # Weight the batch loss by its graph count so that dividing by
        # total_graphs gives a per-graph average.
        # NOTE(review): assumes `criterion` uses mean reduction -- confirm.
        epoch_loss += loss.item() * batch.num_graphs
        total_graphs += batch.num_graphs
    train_avg_loss = epoch_loss / total_graphs

    val_loss = 0.0
    total_graphs = 0
    model.eval()
    # No gradients are needed for validation.
    with torch.no_grad():
        for batch in valid_loader:
            output = model(batch)
            loss = criterion(output, batch.y[:, target_idx].unsqueeze(1))
            val_loss += loss.item() * batch.num_graphs
            total_graphs += batch.num_graphs
    val_avg_loss = val_loss / total_graphs

    # The message used an undefined `i`; report the 1-based epoch number.
    print(f"Epochs: {epoch + 1} | epoch avg. loss: {train_avg_loss:.2f} | validation avg. loss: {val_avg_loss:.2f}")
    