    (conv -> pool) * 3 -> fc
    
    
    Input size : (batch_size, 1024, 1, 1)
    conv 1 : filter=32, kernel_size=3, stride=(1,1), padding=same, BN, relu
    pool 1 : maxpool=(2,1), padding=same
    conv 2 : filter=64, kernel_size=3, stride=(1,1), padding=same, BN, relu
    pool 2 : maxpool=(2,1), padding=same
    conv 3 : filter=64, kernel_size=3, stride=(1,1), padding=same, BN, relu
    pool 3 : maxpool=(2,1), padding=same
    flatten layer
    fc : 128, relu
    dropout : 20%
    fc : 2, softmax
    
    optim : adam
    loss function : CE
    metric : accuracy

## 0. 데이터 준비

In [20]:
from rdkit import Chem

import pandas as pd
import numpy as np

import torch

import matplotlib.pyplot as plt

### 0.1 데이터 불러오기

In [21]:
def load_data(path):
    df_train = pd.read_csv(path + '/dataset/train_.csv')
    df_test = pd.read_csv(path + '/dataset/valid_.csv')
    
    df_train = df_train.rename(columns={'Unnamed: 0' : "idx"})
    df_test = df_test.rename(columns={'Unnamed: 0' : "idx"})
    
    df_all = df_train.append(df_test).reset_index(drop=True)
    
    return df_all, df_train, df_test

In [22]:
CURRENT_PATH = '/Users/skcc10170/Desktop'
df_all, df_train, df_test = load_data(path=CURRENT_PATH)

### 0.2 컬럼 분류하기
먼저 다음과 같이 분류할 수 있습니다.
- 스마일코드 (1개 컬럼)
    - 화합물의 구조를 문자열로 표기
- 분자의 지문 데이터 (1024개씩 3개, 3072개 컬럼)
    - ecfp : 1024개 column
    - fcfp : 1024개 column
    - ptfp : 1024개 column
- 분자자체 특성 (4개 컬럼)
    - MolWt : 화합물의 분자 질량
    - clogp : 분배 계수
    - sa_score : 합성 가능성
    - qed : 약물 유사성

In [24]:
def classify_cols(df):
    cols = df.columns

    # smiles code
    col_smiles = ['SMILES']

    # node-edge level (3 footprints)
    col_ecfp = list(cols[cols.str.contains('ecfp_')]) # ecfp 1024개
    col_fcfp = list(cols[cols.str.contains('fcfp_')]) # fcfp 1024개
    col_ptfp = list(cols[cols.str.contains('ptfp_')]) # ptfp 1024개

    # graph level
    col_mol = list(cols[-5:-1])

    # input cols
    col_input = col_ecfp + col_fcfp + col_ptfp + col_mol # col_smiles 제외

    # label
    col_label = ['label']
    
    return col_smiles[0], col_ecfp, col_fcfp, col_ptfp, col_mol, col_label[0]

In [25]:
cols = classify_cols(df_train)

### 0.3 mol2graph
분자를 그래프로 해석한다면
- 그래프(분자)
- 노드(원자) -> 노드 feature matrix
- 엣지(연결관계) -> 엣지 feature matrix (일단 생략)

3457이 제일 쉬움

In [37]:
MAX_LEN = df_all['SMILES'].apply(lambda x: Chem.MolFromSmiles(x).GetNumAtoms()).max()
LIST_SYMBOLS = list(set.union(*df_all['SMILES'].apply(
    lambda x: set([atom.GetSymbol() for atom in Chem.MolFromSmiles(x).GetAtoms()])).values))
NUM_ATOM_FEATURES = 5

In [38]:
MAX_LEN, LIST_SYMBOLS, NUM_ATOM_FEATURES

(88, ['S', 'Br', 'I', 'O', 'P', 'C', 'N', 'F', 'Cl', 'Na', 'H', 'Si', 'Se'], 5)

In [39]:
def atom_feature(atom):
    return np.array(char_to_ix(atom.GetSymbol(), LIST_SYMBOLS) +
                    char_to_ix(atom.GetDegree(), [0, 1, 2, 3, 4, 5]) +
                    char_to_ix(atom.GetTotalNumHs(), [0, 1, 2, 3, 4]) +
                    char_to_ix(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5]) +
                    char_to_ix(int(atom.GetIsAromatic()), [0, 1]))    # (40, 6, 5, 6, 2)


def char_to_ix(x, allowable_set):
    if x not in allowable_set:
        return [0] # Unknown Atom Token
    return [allowable_set.index(x)+1]

In [49]:
def mol2graph(smi):
    mol = Chem.MolFromSmiles(smi)
    num_atom = mol.GetNumAtoms()
    
    X = np.zeros((num_atom, NUM_ATOM_FEATURES), dtype=np.uint8)
    A = np.zeros((num_atom, num_atom), dtype=np.uint8)

    A = Chem.rdmolops.GetAdjacencyMatrix(
        mol).astype(np.uint8, copy=False)
    A += np.eye(num_atom, dtype=np.uint8)
    
    for idx, atom in enumerate(mol.GetAtoms()):
        feature = atom_feature(atom)
        X[idx, :] = feature
        
    bond_a, bond_b = [], []
    for bond in mol.GetBonds():
        bond_a.append(bond.GetBeginAtomIdx())
        bond_b.append(bond.GetBeginAtomIdx())
        bond_a.append(bond.GetEndAtomIdx())
        bond_b.append(bond.GetEndAtomIdx())
    edge_index = [bond_a, bond_b]
    
    return X, A, edge_index

In [30]:
for i in range(sample_mol.GetNumAtoms()):
    print(sample_mol.GetAtomWithIdx(i).GetSymbol())

NameError: name 'sample_mol' is not defined

In [None]:
[atom.GetSymbol() for atom in sample_mol.GetAtoms()]

In [None]:
# temp = sample_mol.GetBonds()[0]
# dir(temp)
# # (temp.GetBeginAtomIdx(), temp.GetEndAtomIdx())
# # temp.GetBondTypeAsDouble()
# # (temp.GetBeginAtom().GetSymbol(), temp.GetEndAtom().GetSymbol())
# # temp.GetIsAromatic()
# 

# torch_geometric -graph classification

In [50]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.data import Data, DataLoader
from torch_geometric import utils

from torch.utils.data import Dataset

In [51]:
def get_data_list(df):
    smiles = df["SMILES"]
    labels = df['label']
    
    data_list = []
    for idx, smiles in enumerate(smiles):
        x, A, edge_index = mol2graph(smiles)
        x = torch.tensor(x, dtype=torch.float)
        edge_index = torch.tensor(edge_index, dtype=torch.long)
        y = torch.tensor([labels[idx]])

        data = Data(x=x, edge_index=edge_index, y=y,)
        data_list.append(data)
        
    return data_list

In [52]:
train_data_list = get_data_list(df_train)
test_data_list = get_data_list(df_test)

train_loader = DataLoader(train_data_list, batch_size=128)
test_loader = DataLoader(test_data_list, batch_size=128)

In [53]:
import torch
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GraphConv, TopKPooling
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
import torch.nn.functional as F

from torch_geometric.nn.pool.topk_pool import topk,filter_adj
from torch.nn import Parameter


import argparse

class SAGPool(torch.nn.Module):
    def __init__(self,in_channels,ratio=0.8,Conv=GCNConv,non_linearity=torch.tanh):
        super(SAGPool,self).__init__()
        self.in_channels = in_channels
        self.ratio = ratio
        self.score_layer = Conv(in_channels,1)
        self.non_linearity = non_linearity
    def forward(self, x, edge_index, edge_attr=None, batch=None):
        if batch is None:
            batch = edge_index.new_zeros(x.size(0))
        #x = x.unsqueeze(-1) if x.dim() == 1 else x
        score = self.score_layer(x,edge_index).squeeze()

        perm = topk(score, self.ratio, batch)
        x = x[perm] * self.non_linearity(score[perm]).view(-1, 1)
        batch = batch[perm]
        edge_index, edge_attr = filter_adj(
            edge_index, edge_attr, perm, num_nodes=score.size(0))

        return x, edge_index, edge_attr, batch, perm


# parser = argparse.ArgumentParser()
# parser.add_argument('--seed', type=int, default=777,
#                     help='seed')
# parser.add_argument('--batch_size', type=int, default=128,
#                     help='batch size')
# parser.add_argument('--lr', type=float, default=0.0005,
#                     help='learning rate')
# parser.add_argument('--weight_decay', type=float, default=0.0001,
#                     help='weight decay')
# parser.add_argument('--nhid', type=int, default=128,
#                     help='hidden size')
# parser.add_argument('--pooling_ratio', type=float, default=0.5,
#                     help='pooling ratio')
# parser.add_argument('--dropout_ratio', type=float, default=0.5,
#                     help='dropout ratio')
# parser.add_argument('--dataset', type=str, default='DD',
#                     help='DD/PROTEINS/NCI1/NCI109/Mutagenicity')
# parser.add_argument('--epochs', type=int, default=100000,
#                     help='maximum number of epochs')
# parser.add_argument('--patience', type=int, default=50,
#                     help='patience for earlystopping')
# parser.add_argument('--pooling_layer_type', type=str, default='GCNConv',
#                     help='DD/PROTEINS/NCI1/NCI109/Mutagenicity')

# args = parser.parse_args()


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
#         self.args = args
        self.num_features = 5 #args.num_features
        self.nhid = 128 # args.nhid
        self.num_classes = 2 # args.num_classes
        self.pooling_ratio = 0.5 # args.pooling_ratio
        self.dropout_ratio = 0.5 # args.dropout_ratio
        
        self.conv1 = GCNConv(self.num_features, self.nhid)
        self.pool1 = SAGPool(self.nhid, ratio=self.pooling_ratio)
        self.conv2 = GCNConv(self.nhid, self.nhid)
        self.pool2 = SAGPool(self.nhid, ratio=self.pooling_ratio)
        self.conv3 = GCNConv(self.nhid, self.nhid)
        self.pool3 = SAGPool(self.nhid, ratio=self.pooling_ratio)

        self.lin1 = torch.nn.Linear(self.nhid*2, self.nhid)
        self.lin2 = torch.nn.Linear(self.nhid, self.nhid//2)
        self.lin3 = torch.nn.Linear(self.nhid//2, self. num_classes)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        x = F.relu(self.conv1(x, edge_index))
        print(data)
        x, edge_index, _, batch, _ = self.pool1(x, edge_index, None, batch)
        x1 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv2(x, edge_index))
        x, edge_index, _, batch, _ = self.pool2(x, edge_index, None, batch)
        x2 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv3(x, edge_index))
        x, edge_index, _, batch, _ = self.pool3(x, edge_index, None, batch)
        x3 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = x1 + x2 + x3

        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=self.dropout_ratio, training=self.training)
        x = F.relu(self.lin2(x))
        x = F.log_softmax(self.lin3(x), dim=-1)

        return x

In [54]:
model = Net()
model

Net(
  (conv1): GCNConv(5, 128)
  (pool1): SAGPool(
    (score_layer): GCNConv(128, 1)
  )
  (conv2): GCNConv(128, 128)
  (pool2): SAGPool(
    (score_layer): GCNConv(128, 1)
  )
  (conv3): GCNConv(128, 128)
  (pool3): SAGPool(
    (score_layer): GCNConv(128, 1)
  )
  (lin1): Linear(in_features=256, out_features=128, bias=True)
  (lin2): Linear(in_features=128, out_features=64, bias=True)
  (lin3): Linear(in_features=64, out_features=2, bias=True)
)

In [55]:
for i in train_loader:
#     print(i)
    model(i)

Batch(batch=[4035], edge_index=[2, 8874], x=[4035, 5], y=[128])
Batch(batch=[4035], edge_index=[2, 8912], x=[4035, 5], y=[128])
Batch(batch=[4062], edge_index=[2, 8904], x=[4062, 5], y=[128])
Batch(batch=[4138], edge_index=[2, 9140], x=[4138, 5], y=[128])
Batch(batch=[3997], edge_index=[2, 8818], x=[3997, 5], y=[128])
Batch(batch=[3950], edge_index=[2, 8732], x=[3950, 5], y=[128])
Batch(batch=[4029], edge_index=[2, 8892], x=[4029, 5], y=[128])
Batch(batch=[3973], edge_index=[2, 8794], x=[3973, 5], y=[128])
Batch(batch=[4077], edge_index=[2, 8992], x=[4077, 5], y=[128])
Batch(batch=[3924], edge_index=[2, 8662], x=[3924, 5], y=[128])
Batch(batch=[4070], edge_index=[2, 8974], x=[4070, 5], y=[128])
Batch(batch=[3973], edge_index=[2, 8756], x=[3973, 5], y=[128])
Batch(batch=[4029], edge_index=[2, 8882], x=[4029, 5], y=[128])
Batch(batch=[4036], edge_index=[2, 8912], x=[4036, 5], y=[128])
Batch(batch=[4135], edge_index=[2, 9110], x=[4135, 5], y=[128])
Batch(batch=[4046], edge_index=[2, 8920]

In [56]:
device = torch.device('gpu' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)

In [57]:
def test(model,loader):
    model.eval()
    correct = 0.
    loss = 0.
    for data in loader:
        data = data.to(device)
        out = model(data)
        pred = out.max(dim=1)[1]

        label = torch.tensor(data.y)

        correct += pred.eq(label).sum().item()
        loss += F.cross_entropy(out,label,reduction='sum').item()
    return correct / len(loader.dataset),loss / len(loader.dataset)

In [58]:
test(model, test_loader)

Batch(batch=[3885], edge_index=[2, 8488], x=[3885, 5], y=[128])
Batch(batch=[4119], edge_index=[2, 9092], x=[4119, 5], y=[128])
Batch(batch=[4099], edge_index=[2, 9034], x=[4099, 5], y=[128])
Batch(batch=[4186], edge_index=[2, 9250], x=[4186, 5], y=[128])
Batch(batch=[4065], edge_index=[2, 8958], x=[4065, 5], y=[128])
Batch(batch=[4091], edge_index=[2, 9034], x=[4091, 5], y=[128])


  # Remove the CWD from sys.path while we load stuff.


Batch(batch=[4041], edge_index=[2, 8916], x=[4041, 5], y=[128])
Batch(batch=[3949], edge_index=[2, 8726], x=[3949, 5], y=[128])
Batch(batch=[4077], edge_index=[2, 9008], x=[4077, 5], y=[128])
Batch(batch=[4147], edge_index=[2, 9150], x=[4147, 5], y=[128])
Batch(batch=[3977], edge_index=[2, 8720], x=[3977, 5], y=[128])
Batch(batch=[4057], edge_index=[2, 8948], x=[4057, 5], y=[128])
Batch(batch=[4045], edge_index=[2, 8894], x=[4045, 5], y=[128])
Batch(batch=[207], edge_index=[2, 462], x=[207, 5], y=[6])


(0.5275449101796407, 0.6923285956868155)

In [59]:
min_loss = 1e10
patience = 0

for epoch in range(10):
    model.train()
    for i, data in enumerate(train_loader):
        data = data.to(device)
        out = model(data)
        
        y = torch.tensor(data.y)
        
#         loss = F.nll_loss(out, y)
        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(out, y)
        print("Training loss:{}".format(loss.item()))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    val_acc,val_loss = test(test_loader)
    print("Validation loss:{}\taccuracy:{}".format(val_loss,val_acc))
    if val_loss < min_loss:
        torch.save(model.state_dict(),'latest.pth')
        print("Model saved at epoch{}".format(epoch))
        min_loss = val_loss
        patience = 0
    else:
        patience += 1
    if patience > 20: # args.patience:
        break 

model = Net().to(device)
model.load_state_dict(torch.load('latest.pth'))
test_acc,test_loss = test(model,test_loader)
print("Test accuarcy:{}".fotmat(test_acc))

Batch(batch=[4035], edge_index=[2, 8874], x=[4035, 5], y=[128])
Training loss:0.6870844960212708
Batch(batch=[4035], edge_index=[2, 8912], x=[4035, 5], y=[128])
Training loss:0.6930311918258667
Batch(batch=[4062], edge_index=[2, 8904], x=[4062, 5], y=[128])


  # Remove the CWD from sys.path while we load stuff.


IndexError: index 1063 is out of bounds for dimension 0 with size 1061