In [None]:
import torch
import torch_geometric as pyg
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch_geometric.utils import one_hot, scatter
from torch.utils.data import Dataset, DataLoader
from torch_geometric.data import Data, InMemoryDataset
from torch_geometric.datasets import QM9
from torch_geometric.nn import GCNConv, NNConv
from torch_geometric.nn.conv import GATv2Conv, GATConv, TransformerConv
from torch_geometric.nn.models import MLP
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import NormalizeFeatures
from torch.utils.data import random_split
import matplotlib.pyplot as plt
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
import rdkit
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem
from rdkit.Chem.rdchem import BondType, HybridizationType
import os
import matplotlib.pyplot as plt

batch_size の変更

RMSE or MAE

反応エネルギー(熱依存)のデータセット

反応エネルギーを予測するモデルの有効性を調べる（大変）

大きめのデータセットを使ってみる

転位の直接的な予測

SphereNet使ってみる？

井田先生のモデル　transformer(QM9を井田先生のグラフ構造に変換)

In [None]:
# Reference: https://www.graphcore.ai/posts/getting-started-with-pytorch-geometric-pyg-on-graphcore-ipus

# GCN
# Note: some neural networks use on the order of 64 layers.
class GCN(torch.nn.Module):
    """Two-layer GCN that regresses one scalar per graph.

    The input width is read from the module-level ``dataset``
    (``dataset.num_node_features``), so ``dataset`` must exist before the
    model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 32)
        self.conv2 = GCNConv(32, 32)
        # Not used by forward(); kept so the parameter set (and therefore
        # state_dict keys and RNG consumption at init) stays unchanged.
        self.linear1 = nn.Linear(16, 1)
        self.out = nn.Linear(32, 1)
    # Idea noted by the author: batch norm (regularization).

    def forward(self, data):
        """Return a ``(num_graphs, 1)`` prediction for the batched graphs in ``data``.

        Reads ``data.x``, ``data.edge_index`` and ``data.batch``.
        """
        x, batch, edge_index = data.x, data.batch, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        # Sum node embeddings per graph so the output has one row per graph.
        # (The original called `torch_geometric.nn...`, but the package is only
        # bound as `pyg` in this file, which raised NameError.)
        x = pyg.nn.global_add_pool(x, batch)
        # Dropout (randomly zeroing activations to reduce overfitting) before
        # the output layer was tried and is currently disabled.
        return self.out(x)

class GCN_N(torch.nn.Module):
    """GCN with a configurable number of convolution steps, one scalar per graph.

    The input width is read from the module-level ``dataset``
    (``dataset.num_node_features``).

    Args:
        layer: Total number of GCNConv applications, counting the input
            convolution. Values below 2 apply only the input convolution.
        dim: Width of the hidden node embeddings.
        share_weights: If True (default, the original behaviour), every hidden
            step re-applies the single ``convn`` layer, i.e. all hidden steps
            share one set of weights. If False, each hidden step gets its own
            independently initialised GCNConv stored in ``convs``.
    """

    def __init__(self, layer: int, dim=32, share_weights: bool = True):
        super().__init__()
        self.layer = layer
        self.dim = dim
        self.share_weights = share_weights
        self.conv1 = GCNConv(dataset.num_node_features, self.dim, improved=True)
        if share_weights:
            self.convn = GCNConv(self.dim, self.dim, improved=True)
        else:
            self.convs = nn.ModuleList(
                [GCNConv(self.dim, self.dim, improved=True) for _ in range(2, layer + 1)]
            )
        self.out = pyg.nn.Linear(self.dim, 1)

    def forward(self, data):
        """Return a ``(num_graphs, 1)`` prediction for the batched graphs in ``data``.

        Reads ``data.x``, ``data.edge_index`` and ``data.batch``.
        """
        x, batch, edge_index = data.x, data.batch, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        if self.share_weights:
            # Same layer object applied (layer - 1) times.
            hidden_steps = [self.convn] * max(self.layer - 1, 0)
        else:
            hidden_steps = self.convs
        for conv in hidden_steps:
            x = F.relu(conv(x, edge_index))
        # Sum node embeddings per graph so the output has one row per graph.
        x = pyg.nn.global_add_pool(x, batch)
        return self.out(x)

In [None]:
from math import sqrt
from sklearn.metrics import r2_score
from statistics import pstdev

# Human-readable names of the regression targets, indexed like the columns of
# `data.y` in torch_geometric's QM9 (19 columns; column 0 is the dipole
# moment). The previous list omitted the dipole moment, so every name was
# attached to the wrong column.
targets = [
    "Dipole moment",
    "Isotropic polarizability",
    "HOMO",
    "LUMO",
    "E_Gap",
    "Electronic spatial extent",
    "ZPVE",
    "U_0",
    "U",
    "H",
    "G",
    "Cv",
    "U_0 ATOM",
    "U ATOM",
    "H ATOM",
    "G ATOM",
    "A",
    "B",
    "C",
]
dataset = QM9(root="./QM9")

# Split sizes: 60% train / 20% validation / remainder test
# (the original note gives the dataset total as 130831).
num_train, num_val = int(len(dataset)*0.6), int(len(dataset)*0.2)
num_test = len(dataset) - (num_train + num_val)
batch_size = 64

# Fix the random seeds.
device = torch.device("cpu")
seed = 0
pyg.seed_everything(seed=seed)

train_set, valid_set, test_set = random_split(dataset, [num_train, num_val, num_test])


def _seed_worker(worker_id):
    """Seed the Python/NumPy/torch RNGs inside a DataLoader worker process."""
    # Inside a worker, torch.initial_seed() is already distinct per worker.
    pyg.seed_everything(torch.initial_seed() % 2**32)


# Build the data loaders. `worker_init_fn` must be a callable; the previous
# `worker_init_fn=pyg.seed_everything(seed)` ran the seeding immediately and
# passed its return value (None) to the loader.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, worker_init_fn=_seed_worker)
valid_loader = DataLoader(valid_set, batch_size=batch_size, worker_init_fn=_seed_worker)
test_loader = DataLoader(test_set, batch_size=batch_size, worker_init_fn=_seed_worker)

# Explicit re-seed: the old code re-seeded as a side effect of building the
# loaders, so later model initialisation / shuffling started from this state.
pyg.seed_everything(seed)

def _run_epoch(loader, loss_fn, target_idx, training):
    """Run the global ``model`` once over ``loader``.

    When ``training`` is true the global ``optimizer`` is stepped after every
    batch; otherwise the pass runs under ``torch.inference_mode``.

    Returns:
        ``(mean_batch_loss, r2_per_batch)`` where the loss is averaged over
        batches and ``r2_per_batch`` holds one R2 score per batch.
    """
    model.train(training)
    total_loss = 0
    r2_per_batch = []
    with torch.inference_mode(mode=not training):
        for batch in loader:
            batch = batch.to("cpu")
            truth = batch.y[:, target_idx].unsqueeze(1)
            if training:
                optimizer.zero_grad()
            prediction = model(batch)
            loss = loss_fn(prediction, truth)
            if training:
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
            r2_per_batch.append(
                r2_score(truth.detach().numpy(), prediction.detach().numpy())
            )
    return total_loss / len(loader), r2_per_batch


def train(target_idx, num_epochs, mae=False):
    """Train and validate the global ``model`` on one target column.

    Uses the module-level ``model``, ``optimizer``, ``train_loader``,
    ``valid_loader`` and ``targets``.

    Args:
        target_idx: Index used both for ``targets`` (name) and for the column
            of ``batch.y`` that is regressed.
        num_epochs: Number of passes over the training set; must be >= 1.
        mae: If true, optimise and report L1 loss ("MAE"); otherwise optimise
            MSE and report the square root of the batch-averaged MSE ("RMSE").

    Returns:
        ``[target, criterion, train_loss, valid_loss, train_R2, valid_R2,
        train_R2_std, valid_R2_std]`` for the last epoch.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1 (previously this
            surfaced as an UnboundLocalError when building the result).
    """
    if num_epochs < 1:
        raise ValueError(f"num_epochs must be >= 1, got {num_epochs}")

    target = targets[target_idx]
    loss_fn = F.l1_loss if mae else F.mse_loss
    criterion = "MAE" if mae else "RMSE"

    for epoch in range(num_epochs):
        train_loss, train_R2_list = _run_epoch(train_loader, loss_fn, target_idx, training=True)
        valid_loss, valid_R2_list = _run_epoch(valid_loader, loss_fn, target_idx, training=False)

        if not mae:
            # Average the per-batch MSE first, then take the root.
            train_loss = sqrt(train_loss)
            valid_loss = sqrt(valid_loss)

        print(f"Epoch {epoch+1} | train_loss:{train_loss}, valid_loss:{valid_loss}")

    # R2 statistics are reported for the last epoch only.
    train_R2 = sum(train_R2_list) / len(train_loader)
    valid_R2 = sum(valid_R2_list) / len(valid_loader)
    # NOTE(review): the spread is the population std of the per-batch R2 divided
    # by the number of batches. That is neither a plain std nor a standard
    # error (std / sqrt(n)); kept as-is so reported numbers do not change.
    train_R2_std = pstdev(train_R2_list) / len(train_loader)
    valid_R2_std = pstdev(valid_R2_list) / len(valid_loader)

    return [target, criterion, train_loss, valid_loss, train_R2, valid_R2, train_R2_std, valid_R2_std]

# Table of run summaries: the first entry is the header row; each list returned
# by train() (same field order) is appended after it.
results = [["target", "criterion", "train_loss", "valid_loss", "train_R2", "valid_R2", "train_R2_std", "valid_R2_std"]]

In [None]:
layer = 3
dim = 32
num_epochs = 10

# Loop-invariant run configuration.
device = torch.device("cpu")
seed = 0
csv_path = "GCN_3_32.csv"


def _seed_worker(worker_id):
    """Seed the Python/NumPy/torch RNGs inside a DataLoader worker process."""
    # Inside a worker, torch.initial_seed() is already distinct per worker.
    pyg.seed_everything(torch.initial_seed() % 2**32)


# One pass with RMSE, then one with MAE. Every (criterion, target) run gets the
# same seeded split and the same seeded initialisation, so results are
# comparable across targets and criteria.
for use_mae in (False, True):
    for target_idx in range(len(targets)):
        # Fix the random seeds, then split.
        pyg.seed_everything(seed=seed)
        train_set, valid_set, test_set = random_split(dataset, [num_train, num_val, num_test])

        # Build the data loaders. `worker_init_fn` must be a callable; the
        # previous `worker_init_fn=pyg.seed_everything(seed)` passed None.
        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, worker_init_fn=_seed_worker)
        valid_loader = DataLoader(valid_set, batch_size=batch_size, worker_init_fn=_seed_worker)
        test_loader = DataLoader(test_set, batch_size=batch_size, worker_init_fn=_seed_worker)

        # Re-seed so weight initialisation and shuffle order start from the
        # seed state (the old code did this as a side effect of the loaders).
        pyg.seed_everything(seed=seed)
        model = GCN_N(layer=layer, dim=dim)
        # Initialise the optimizer for the fresh model.
        optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01, weight_decay=5e-4)

        # The MAE pass used to call train(target_idx + 1), skipping target 0
        # and indexing past the end of `targets` on the last iteration.
        result = train(target_idx=target_idx, mae=use_mae, num_epochs=num_epochs)
        results.append(result)

        # Column names are the header row only (results[0]), not the whole
        # results table. Write the header only when the file is created.
        df_row = pd.DataFrame([result], columns=results[0])
        df_row.to_csv(csv_path, mode="a", index=False, header=not os.path.isfile(csv_path))