In [6]:
from rdkit import Chem
from rdkit.Chem.rdchem import HybridizationType, ChiralType
import torch
from torch_geometric.data import Dataset, Data, DataLoader
import numpy as np
import os
import networkx as nx
import torch.nn as nn
from torch_geometric.nn import SAGEConv, global_mean_pool, global_add_pool, global_max_pool
from torch.utils.data import random_split
import pickle
from torch_geometric.utils import from_networkx
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, precision_score, f1_score, recall_score, jaccard_score

In [2]:
from torch.utils.tensorboard import SummaryWriter
tr_writer = SummaryWriter("./tensorboard/subgraphclass/train")
val_writer = SummaryWriter("./tensorboard/subgraphclass/val")

In [7]:
class SubGraph(Dataset):

    def __init__(self, root, filename, test=False,transform=None, pre_transform=None, pre_filter=None):
        self.filename = filename
        self.test = test
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        return self.filename

    @property
    def processed_file_names(self):
        self.raws = pickle.load(open(self.raw_paths[0], 'rb'))
        if self.test:
            return [f'data_test_{i}' for i in range(len(self.raws))]
        else:
            return [f'data_{i}.pt' for i in range(len(self.raws))]

    def download(self):
        pass

    def process(self):
        self.raws = pickle.load(open(self.raw_paths[0], 'rb'))
        for idx, mol in enumerate(self.raws):
            subgraph, label = mol
            # create data object
            data = from_networkx(subgraph)
            label = torch.tensor(label, dtype=torch.int64)
            data['target'] = label
            if self.test:
                torch.save(data, os.path.join(self.processed_dir, \
                f'data_test_{idx}.pt'))
            else:
                torch.save(data, os.path.join(self.processed_dir, \
                f'data_{idx}.pt'))
        
    def len(self):
        return len(self.raws)

    def get(self, idx):
        if self.test:
            data = torch.load(os.path.join(self.processed_dir, f'data_test_{idx}.pt'))
        else:
            data = torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))
        return data

In [8]:
train_dataset = SubGraph('./subgraphdataset/', 'train.pkl')
test_dataset = SubGraph('./subgraphdataset/', 'test.pkl', test=True)

Processing...
Done!


In [9]:
training_set, validation_set  = random_split(train_dataset, [int(len(train_dataset) * 0.8), len(train_dataset) - int(len(train_dataset) * 0.8)], generator=torch.Generator().manual_seed(42))
batch_size = 32
train_loader = DataLoader(training_set, batch_size, shuffle=True)
val_loader = DataLoader(validation_set, batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size, shuffle=False)



In [10]:
class Model(nn.Module):
    def __init__(self, args):
        super(Model, self).__init__()
        num_classses = 2

        conv_hidden = args['conv_hidden']
        cls_hidden = args['cls_hidden']
        self.n_layers = args['n_layers']
        # cls_drop = ['cls_drop']

        self.conv_layers = nn.ModuleList([])

        self.conv1 = SAGEConv(26, conv_hidden)

        for i in range(self.n_layers):
            self.conv_layers.append(
                SAGEConv(conv_hidden, conv_hidden)
            )

        self.linear1 = nn.Linear(conv_hidden, cls_hidden)
        self.linear2 = nn.Linear(cls_hidden, num_classses)
        self.relu = nn.ReLU()
        self.drop1 = nn.Dropout(p=0.5)
        self.readout = {
            0: global_mean_pool,
            1: global_add_pool,
            2: global_max_pool,
        }
    
    def forward(self, mol):

        res = self.conv1(mol.x, mol.edge_index)
        for i in range(self.n_layers):
            res = self.conv_layers[i](res, mol.edge_index)

        res = self.readout[0](res, mol.batch)
        res = self.linear1(res)
        res = self.relu(res)
        res = self.drop1(res)
        res = self.linear2(res)

        return res

In [11]:
import random
import os
import numpy as np
np.set_printoptions(threshold=np.inf)
def seed_torch(seed=42):
	random.seed(seed)
	os.environ['PYTHONHASHSEED'] = str(seed) # 为了禁止hash随机化，使得实验可复现
	np.random.seed(seed)
	torch.manual_seed(seed)
	torch.cuda.manual_seed(seed)
	torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.

In [12]:
def train(args, model, device, training_set, optimizer, criterion, epoch):
    model.train()
    sf = nn.Softmax(dim=1)
    total_loss = 0
    all_pred = []
    all_pred_raw = []
    all_labels = []
    for sub_mol in training_set:
        sub_mol = sub_mol.to(device)
        sub_mol.x = sub_mol.x.to(torch.float32)
        target = sub_mol.target
        optimizer.zero_grad()
        output= model(sub_mol)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # tracking
        all_pred.append(np.argmax(output.cpu().detach().numpy(), axis=1))
        all_pred_raw.append(sf(output)[:, 1].cpu().detach().numpy())
        all_labels.append(target.cpu().detach().numpy())
    
    all_pred = np.concatenate(all_pred).ravel()
    all_pred_raw = np.concatenate(all_pred_raw).ravel()
    all_labels = np.concatenate(all_labels).ravel()
    # print(f'train epoch {epoch}, all_pred {all_pred}\nall_pred_raw {all_pred_raw}\nall_labels {all_labels}')
    tr_writer.add_scalar('Ave Loss', total_loss / len(training_set), epoch)
    tr_writer.add_scalar('ACC', accuracy_score(all_labels, all_pred), epoch)
    tr_writer.add_scalar('AUC', roc_auc_score(all_labels, all_pred_raw), epoch)
    # tr_writer.add_scalar('MCC', np.mean(all_mcc), epoch)
    print(f'Train Epoch: {epoch}, Ave Loss: {total_loss / len(training_set)} ACC: {accuracy_score(all_labels, all_pred)}  AUC: {roc_auc_score(all_labels, all_pred_raw)}')

In [13]:
def val(args, model, device, val_set, optimizer, criterion, epoch):
    model.eval()
    sf = nn.Softmax(dim=1)
    total_loss = 0
    all_pred = []
    all_pred_raw = []
    all_labels = []
    for sub_mol in val_set:
        sub_mol = sub_mol.to(device)
        sub_mol.x = sub_mol.x.to(torch.float32)
        target = sub_mol.target
        optimizer.zero_grad()
        output= model(sub_mol)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # tracking
        all_pred.append(np.argmax(output.cpu().detach().numpy(), axis=1))
        all_pred_raw.append(sf(output)[:, 1].cpu().detach().numpy())
        all_labels.append(target.cpu().detach().numpy())
    
    all_pred = np.concatenate(all_pred).ravel()
    all_pred_raw = np.concatenate(all_pred_raw).ravel()
    all_labels = np.concatenate(all_labels).ravel()
    # print(f'val epoch {epoch}, all_pred {all_pred}\nall_pred_raw {all_pred_raw}\nall_labels {all_labels}')
    val_writer.add_scalar('Ave Loss', total_loss / len(val_set), epoch)
    val_writer.add_scalar('ACC', accuracy_score(all_labels, all_pred), epoch)
    val_writer.add_scalar('AUC', roc_auc_score(all_labels, all_pred_raw), epoch)
    print(f'validation Epoch: {epoch}, Ave Loss: {total_loss / len(val_set)} ACC: {accuracy_score(all_labels, all_pred)}  AUC: {roc_auc_score(all_labels, all_pred_raw)}')
    return accuracy_score(all_labels, all_pred)

In [14]:
def main(args):
    seed_torch(args['seed'])
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch.manual_seed(args['seed'])

    model = Model(args).to(device)
    print(model)
    # weights = torch.tensor([1, args['pos_weight']], dtype=torch.float32).to(device)
    # loss_fn = torch.nn.CrossEntropyLoss(weight=weights)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args['lr'])
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
    max_acc = 0
    for epoch in range(1, args['epoch'] + 1):
        train(args, model, device, train_loader, optimizer, loss_fn, epoch)
        acc = val(args, model, device, val_loader, optimizer, loss_fn, epoch)

        scheduler.step()
        if acc > max_acc:
            max_acc = acc
            print('Saving model (epoch = {:4d}, max_acc = {:.4f})'
                .format(epoch, max_acc))
            torch.save(model.state_dict(), args['save_path'])

In [15]:
args = {
    'lr': 0.002,
    'epoch': 2,
    'seed': 42,
    'save_path': './model/subgraph3hop',
    'pos_weight': 3,
    'conv_hidden': 1024, 
    'cls_hidden': 1024,
    'n_layers': 3
}

In [13]:
main(args)

Model(
  (conv_layers): ModuleList(
    (0): SAGEConv(1024, 1024)
    (1): SAGEConv(1024, 1024)
    (2): SAGEConv(1024, 1024)
  )
  (conv1): SAGEConv(26, 1024)
  (linear1): Linear(in_features=1024, out_features=1024, bias=True)
  (linear2): Linear(in_features=1024, out_features=2, bias=True)
  (relu): ReLU()
  (drop1): Dropout(p=0.5, inplace=False)
)
Train Epoch: 1, Ave Loss: 0.6899853513090317 ACC: 0.525811823480433  AUC: 0.5469971310037717
validation Epoch: 1, Ave Loss: 0.6878304230539423 ACC: 0.5474604496253123  AUC: 0.5718302968902336
Saving model (epoch =    1, max_acc = 0.5475)
Train Epoch: 2, Ave Loss: 0.6848252469123004 ACC: 0.5563072439633638  AUC: 0.5838979997196223
validation Epoch: 2, Ave Loss: 0.6828225035416452 ACC: 0.5636969192339717  AUC: 0.5944299686698749
Saving model (epoch =    2, max_acc = 0.5637)


In [16]:
def test(model, device, test_set):
    model.eval()
    sf = nn.Softmax(dim=1)
    all_pred = []
    all_pred_raw = []
    all_labels = []
    subgraph_num = 0
    with torch.no_grad():
        for sub_mol in test_set:
            sub_mol = sub_mol.to(device)
            sub_mol.x = sub_mol.x.to(torch.float32)
            target = sub_mol.target
            output= model(sub_mol)
            
            # tracking
            all_pred.append(np.argmax(output.cpu().detach().numpy(), axis=1))
            all_pred_raw.append(sf(output)[:, 1].cpu().detach().numpy())
            all_labels.append(target.cpu().detach().numpy())
    
    all_pred = np.concatenate(all_pred).ravel()
    all_pred_raw = np.concatenate(all_pred_raw).ravel()
    all_labels = np.concatenate(all_labels).ravel()

    print(f'ACC: {accuracy_score(all_labels, all_pred)} AUC: {roc_auc_score(all_labels, all_pred_raw)}')

In [17]:
model = Model(args).to("cuda")
model.load_state_dict(torch.load(args['save_path']))

RuntimeError: No CUDA GPUs are available

In [111]:
test(model, "cuda", test_loader)

ACC: 0.6182878183491869 AUC: 0.663294256685919


In [27]:
# all subgraph num 
count = 0
for graph in _tr_set:
    count += len(graph)
print(f'training set subgraph num is {count}')

count = 0
for graph in _val_set:
    count += len(graph)
print(f'validation set subgraph num is {count}')

count = 0
for graph in _test_set:
    count += len(graph)
print(f'test set subgraph num is {count}')

training set subgraph num is 9457
validation set subgraph num is 2553
test set subgraph num is 3259


In [1]:
9457 + 2553 + 3259

15269

In [31]:
# balance
pos = 0; neg = 0
for graph in _tr_set:
    for subgraph in graph:
        if subgraph[-1]:
            pos += 1
        else:
            neg += 1
print(f'training set pos class num is {pos}, neg class num is {neg}')

pos = 0; neg = 0
for graph in _val_set:
    for subgraph in graph:
        if subgraph[-1]:
            pos += 1
        else:
            neg += 1
print(f'validation set pos class num is {pos}, neg class num is {neg}')

pos = 0; neg = 0
for graph in _test_set:
    for subgraph in graph:
        if subgraph[-1]:
            pos += 1
        else:
            neg += 1
print(f'test set pos class num is {pos}, neg class num is {neg}')

training set pos class num is 4865, neg class num is 4592
validation set pos class num is 1225, neg class num is 1328
test set pos class num is 1450, neg class num is 1809


In [2]:
4865 + 1225 + 1450 + 4592 + 1328 + 1809

15269

In [4]:
4865 + 1225 + 1450

7540

In [5]:
4592 + 1328 + 1809

7729

In [33]:
import torch.nn as nn
from torch_geometric.nn import SAGEConv, global_mean_pool
from torch.utils.data import random_split
from torch_geometric.utils import from_networkx
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, precision_score, f1_score, recall_score, jaccard_score, matthews_corrcoef

In [42]:
class Model(nn.Module):
    def __init__(self, args):
        super(Model, self).__init__()
        num_classses = 2

        conv_hidden = args['conv_hidden']
        cls_hidden = args['cls_hidden']
        self.n_layers = args['n_layers']
        # cls_drop = ['cls_drop']

        self.conv_layers = nn.ModuleList([])

        self.conv1 = SAGEConv(26, conv_hidden)

        for i in range(self.n_layers):
            self.conv_layers.append(
                SAGEConv(conv_hidden, conv_hidden)
            )

        self.linear1 = nn.Linear(conv_hidden, cls_hidden)
        self.linear2 = nn.Linear(cls_hidden, num_classses)
        self.relu = nn.ReLU()
        self.drop1 = nn.Dropout(p=0.5)

    
    def forward(self, mol):

        res = self.conv1(mol.x, mol.edge_index)
        for i in range(self.n_layers):
            res = self.conv_layers[i](res, mol.edge_index)

        res = global_mean_pool(res, mol.batch)
        res = self.linear1(res)
        res = self.relu(res)
        res = self.drop1(res)
        res = self.linear2(res)

        return res

In [43]:
import random
import os
import numpy as np
np.set_printoptions(threshold=np.inf)
def seed_torch(seed=42):
	random.seed(seed)
	os.environ['PYTHONHASHSEED'] = str(seed) # 为了禁止hash随机化，使得实验可复现
	np.random.seed(seed)
	torch.manual_seed(seed)
	torch.cuda.manual_seed(seed)
	torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.

In [44]:
def train(args, model, device, training_set, optimizer, criterion, epoch):
    model.train()
    sf = nn.Softmax(dim=1)
    total_loss = 0
    all_pred = []
    all_pred_raw = []
    all_labels = []
    subgraph_num = 0
    for mol in training_set:
        for sub_mol, target in mol:
            subgraph_num += 1
            sub_mol = from_networkx(sub_mol)
            sub_mol = sub_mol.to(device)
            sub_mol.x = sub_mol.x.to(torch.float32)
            target = torch.tensor(target, dtype=torch.int64).to(device)
            optimizer.zero_grad()
            output= model(sub_mol)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            # tracking
            all_pred.append(np.argmax(output.cpu().detach().numpy(), axis=1))
            all_pred_raw.append(sf(output)[:, 1].cpu().detach().numpy())
            all_labels.append(target.cpu().detach().numpy())
    
    all_pred = np.concatenate(all_pred).ravel()
    all_pred_raw = np.concatenate(all_pred_raw).ravel()
    sall_labels = np.concatenate(sall_labels).ravel()

    tr_writer.add_scalar('Ave Loss', total_loss / subgraph_num, epoch)
    tr_writer.add_scalar('ACC', accuracy_score(all_labels, all_pred), epoch)
    tr_writer.add_scalar('AUC', roc_auc_score(all_labels, all_pred_raw), epoch)
    # tr_writer.add_scalar('MCC', np.mean(all_mcc), epoch)
    print(f'Train Epoch: {epoch}, Ave Loss: {total_loss / subgraph_num} ACC: {accuracy_score(all_labels, all_pred)}  AUC: {roc_auc_score(all_labels, all_pred_raw)}')

In [45]:
def val(args, model, device, val_set, optimizer, criterion, epoch):
    model.eval()
    sf = nn.Softmax(dim=1)
    total_loss = 0
    all_pred = []
    all_pred_raw = []
    all_labels = []
    subgraph_num = 0
    for mol in val_set:
        for sub_mol, target in mol:
            subgraph_num += 1
            sub_mol = from_networkx(sub_mol)
            sub_mol = sub_mol.to(device)
            sub_mol.x = sub_mol.x.to(torch.float32)
            target = torch.tensor(target, dtype=torch.int64).to(device)
            optimizer.zero_grad()
            output= model(sub_mol)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # tracking
            all_pred.append(np.argmax(output.cpu().detach().numpy(), axis=1))
            all_pred_raw.append(sf(output)[:, 1].cpu().detach().numpy())
            all_labels.append(target.cpu().detach().numpy())
    
    all_pred = np.concatenate(all_pred).ravel()
    all_pred_raw = np.concatenate(all_pred_raw).ravel()
    sall_labels = np.concatenate(sall_labels).ravel()

    val_writer.add_scalar('Ave Loss', total_loss / subgraph_num, epoch)
    val_writer.add_scalar('ACC', accuracy_score(all_labels, all_pred), epoch)
    val_writer.add_scalar('AUC', roc_auc_score(all_labels, all_pred_raw), epoch)
    print(f'validation Epoch: {epoch}, Ave Loss: {total_loss / subgraph_num} ACC: {accuracy_score(all_labels, all_pred)}  AUC: {roc_auc_score(all_labels, all_pred_raw)}')
    return accuracy_score(all_labels, all_pred)

In [46]:
def main(args):
    seed_torch(args['seed'])
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch.manual_seed(args['seed'])

    model = Model(args).to(device)
    print(model)
    weights = torch.tensor([1, args['pos_weight']], dtype=torch.float32).to(device)
    loss_fn = torch.nn.CrossEntropyLoss(weight=weights)
    optimizer = torch.optim.SGD(model.parameters(), lr=args['lr'])
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
    max_acc = 0
    for epoch in range(1, args['epoch'] + 1):
        train(args, model, device, _tr_set, optimizer, loss_fn, epoch)
        acc = val(args, model, device, _val_set, optimizer, loss_fn, epoch)
        random.shuffle(_tr_set)
        random.shuffle(_val_set)
        scheduler.step()
        if acc > max_acc:
            max_acc = acc
            print('Saving model (epoch = {:4d}, max_acc = {:.4f})'
                .format(epoch, max_acc))
            torch.save(model.state_dict(), args['save_path'])

In [47]:
args = {
    'lr': 0.0002,
    'epoch': 400,
    'seed': 42,
    'save_path': './model/subgraph3hop',
    'pos_weight': 3,
    'conv_hidden': 1024, 
    'cls_hidden': 1024,
    'n_layers': 3
}

In [49]:
main(args)

Model(
  (conv_layers): ModuleList(
    (0): SAGEConv(1024, 1024)
    (1): SAGEConv(1024, 1024)
    (2): SAGEConv(1024, 1024)
  )
  (conv1): SAGEConv(26, 1024)
  (linear1): Linear(in_features=1024, out_features=1024, bias=True)
  (linear2): Linear(in_features=1024, out_features=2, bias=True)
  (relu): ReLU()
  (drop1): Dropout(p=0.5, inplace=False)
)


AttributeError: 'NoneType' object has no attribute 'max'

In [None]:
def test(model, device, test_set):
    model.eval()
    sf = nn.Softmax(dim=1)
    all_pred = []
    all_pred_raw = []
    all_labels = []
    subgraph_num = 0
    with torch.no_grad():
        for mol in test_set:
            for sub_mol, target in mol:
                subgraph_num += 1
                sub_mol = from_networkx(sub_mol)
                sub_mol = sub_mol.to(device)
                sub_mol.x = sub_mol.x.to(torch.float32)
                target = torch.tensor(target, dtype=torch.int64).to(device)
                output= model(sub_mol)
            
            # tracking
            all_pred.append(np.argmax(output.cpu().detach().numpy(), axis=1))
            all_pred_raw.append(sf(output)[:, 1].cpu().detach().numpy())
            all_labels.append(target.cpu().detach().numpy())
    
    all_pred = np.concatenate(all_pred).ravel()
    all_pred_raw = np.concatenate(all_pred_raw).ravel()
    sall_labels = np.concatenate(sall_labels).ravel()

    print(f'ACC: {accuracy_score(all_labels, all_pred)} AUC: {roc_auc_score(all_labels, all_pred_raw)}')

In [None]:
model = Model(args).to("cuda")
model.load_state_dict(torch.load(args['save_path']))

In [None]:
test(model, "cuda", _test_set)