#### Import Libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import os
import numpy as np
import time
from tqdm import tqdm
from torch.utils import data
from dgl.dataloading import GraphDataLoader
from dgllife.model.gnn.mpnn import MPNNGNN
import dgl
from torch.utils.data import random_split

Using backend: pytorch


RDKit is not installed, which is required for utils related to cheminformatics


#### Define Dataset

In [2]:
dataset = dgl.data.QM9EdgeDataset(label_keys=['mu'], raw_dir="./data")
data_len = len(dataset)

Done loading data from cached files.


#### Define Model

In [3]:
class MPNN(nn.Module):
    def __init__(self, n_in_feats, e_in_feats, n_o_feats, e_h_feats, hidden_dim, out_dim, d_prob=0.15):
        super(MPNN, self).__init__()
        self.mpnn1 = MPNNGNN(n_in_feats, e_in_feats, n_o_feats, e_h_feats)
        # use the default node output feature 64
        self.mpnn2 = MPNNGNN(n_o_feats, e_in_feats, n_o_feats, e_h_feats)
        self.fc1 = nn.Linear(n_o_feats, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)

        self.d_prob = d_prob
    
    def forward(self, g, n_in_feat, e_in_feat, pool_op):
        h = self.mpnn1(g, n_in_feat, e_in_feat)
        #h = F.relu(h)
        h = self.mpnn2(g, h, e_in_feat)
        g.ndata['h'] = h
        o = dgl.readout_nodes(graph=g, feat='h', op=pool_op)

        x = F.relu(self.fc1(o))
        x = F.dropout(x, p=self.d_prob)
        x = self.fc2(x)

        return x

#### Training Code

In [4]:
def train(train_loader, epoch, model, optimizer, device, pool_op):
    train_loss = 0.
    for epoch in range(1, epoch+1):
        sum_loss = 0.
        for batched_graph, labels in tqdm(train_loader):
            batched_graph, labels = batched_graph.to(device), labels.to(device)
            pred = model(batched_graph, batched_graph.ndata['attr'].float(),
                         batched_graph.edata['edge_attr'].float(), pool_op)
            loss = F.l1_loss(pred, labels)
            sum_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        sum_loss /= len(train_loader)
        #print(f'Epoch: {epoch}, Loss: {sum_loss:.6f}')
        train_loss = sum_loss

    return train_loss

#### Validation and Testing Code

In [5]:
def valid_test(loader, model, device, pool_op):
    final_loss = 0.
    with torch.no_grad():
        for batched_graph, labels in loader:
            batched_graph, labels = batched_graph.to(device), labels.to(device)
            pred = model(batched_graph, batched_graph.ndata['attr'].float(),
                         batched_graph.edata['edge_attr'].float(), pool_op)
            loss = F.l1_loss(pred, labels)
            final_loss += loss.item()
        
        final_loss /= len(loader)
        
    return final_loss

#### Setup Training

In [6]:
device = f'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f"using device: {device}")
# n_in_feats, e_in_feats, n_o_feats, e_h_feats, hidden_dim, out_dim
n_in_feats = 11
e_in_feats = 4
n_o_feats = 64
e_h_feats = 128
hidden_dim = 128
out_dim = 1
d_prob = 0.15
learning_rate = 0.01
batch_size = 20
epoch = 5
pool_op = 'sum'
models = []

using device: cuda:0


#### Start Training, Validation

In [7]:
# train_set, valid_set, test_set = random_split(dataset,
#                             [int(data_len*0.8), int(data_len*0.1),
#                             data_len-int(data_len*0.8)-int(data_len*0.1)])
# model = MPNN(n_in_feats, e_in_feats, n_o_feats, e_h_feats, hidden_dim, out_dim)
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# model = model.to(device)
# train_loader = GraphDataLoader(dataset=train_set, batch_size=batch_size, shuffle=True, num_workers=8)
# model.train()
# train_loss = train(train_loader, epoch, model, optimizer, device, pool_op)
# train_loss
train_losses = []
valid_losses = []
test_sets = []
test_losses = []
for i in range(5):
    train_set, valid_set, test_set = random_split(dataset,
                            [int(data_len*0.8), int(data_len*0.1),
                            data_len-int(data_len*0.8)-int(data_len*0.1)])
    
    model = MPNN(n_in_feats, e_in_feats, n_o_feats, e_h_feats, hidden_dim, out_dim, d_prob)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model = model.to(device)
    
    train_loader = GraphDataLoader(dataset=train_set, batch_size=batch_size, shuffle=True, num_workers=8)
    model.train()
    train_loss = train(train_loader, epoch, model, optimizer, device, pool_op)
    print('fold {}, train loss {:.3f}'.format(i+1, train_loss))
    train_losses.append(train_loss)
    
    valid_loader = GraphDataLoader(dataset=valid_set, batch_size=batch_size, shuffle=True, num_workers=8)
    model.eval()
    valid_loss = valid_test(valid_loader, model, device, pool_op)
    print('fold {}, valid loss {:.3f}'.format(i+1, valid_loss))
    valid_losses.append(valid_loss)
    
    test_sets.append(test_set)
    models.append(model)
    
print('average train loss is {:.3f}, std is {:.3f}'.format(np.mean(train_losses), np.std(train_losses)))
print('average validation loss is {:.3f}, std is {:.3f}'.format(np.mean(valid_losses), np.std(valid_losses)))

100%|██████████| 5234/5234 [05:13<00:00, 16.69it/s]
100%|██████████| 5234/5234 [05:19<00:00, 16.39it/s]
100%|██████████| 5234/5234 [05:10<00:00, 16.84it/s]
100%|██████████| 5234/5234 [05:15<00:00, 16.58it/s]
100%|██████████| 5234/5234 [05:13<00:00, 16.71it/s]

fold 1, train loss 1.124



  0%|          | 0/5234 [00:00<?, ?it/s]

fold 1, valid loss 1.156


100%|██████████| 5234/5234 [05:09<00:00, 16.92it/s]
100%|██████████| 5234/5234 [04:42<00:00, 18.55it/s]
100%|██████████| 5234/5234 [04:42<00:00, 18.54it/s]
100%|██████████| 5234/5234 [04:41<00:00, 18.59it/s]
100%|██████████| 5234/5234 [04:39<00:00, 18.74it/s]

fold 2, train loss 1.001



  0%|          | 0/5234 [00:00<?, ?it/s]

fold 2, valid loss 1.006


100%|██████████| 5234/5234 [04:44<00:00, 18.38it/s]
100%|██████████| 5234/5234 [04:47<00:00, 18.22it/s]
100%|██████████| 5234/5234 [04:40<00:00, 18.66it/s]
100%|██████████| 5234/5234 [04:42<00:00, 18.51it/s]
100%|██████████| 5234/5234 [04:42<00:00, 18.52it/s]

fold 3, train loss 1.162



  0%|          | 0/5234 [00:00<?, ?it/s]

fold 3, valid loss 1.145


100%|██████████| 5234/5234 [04:38<00:00, 18.80it/s]
100%|██████████| 5234/5234 [04:41<00:00, 18.63it/s]
100%|██████████| 5234/5234 [04:41<00:00, 18.60it/s]
100%|██████████| 5234/5234 [04:41<00:00, 18.58it/s]
100%|██████████| 5234/5234 [04:39<00:00, 18.73it/s]

fold 4, train loss 1.158



  0%|          | 0/5234 [00:00<?, ?it/s]

fold 4, valid loss 1.166


100%|██████████| 5234/5234 [04:46<00:00, 18.28it/s]
100%|██████████| 5234/5234 [04:41<00:00, 18.59it/s]
100%|██████████| 5234/5234 [04:45<00:00, 18.34it/s]
100%|██████████| 5234/5234 [04:39<00:00, 18.72it/s]
100%|██████████| 5234/5234 [04:44<00:00, 18.40it/s]

fold 5, train loss 1.014





fold 5, valid loss 0.992
average train loss is 1.092, std is 0.070
average validation loss is 1.093, std is 0.077


#### Start Testing

In [8]:
for i in range(5):
    test_loader = GraphDataLoader(dataset=test_sets[i], batch_size=batch_size, shuffle=True, num_workers=8)
    test_loss = valid_test(test_loader, models[i], device, pool_op)
    test_losses.append(test_loss)

print('average test loss is {:.3f}, std is {:.3f}'.format(np.mean(test_losses), np.std(test_losses)))

average test loss is 1.104, std is 0.078
