### Package installation

In [1]:
# Install RDKit.
%%capture
!pip install rdkit-pypi

Importing libraries

In [2]:
import os
import sys
import torch
import pickle
import timeit
import numpy as np
import pandas as pd
import torch.nn as nn
from rdkit import Chem
from scipy import spatial
import torch.optim as optim
import torch.nn.functional as F
from collections import defaultdict
from rdkit.Chem import rdDepictor, Descriptors, MACCSkeys
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, auc, roc_curve, matthews_corrcoef, f1_score

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# dictionary of atoms where a new element gets a new index
def create_atoms(mol):
    atoms = [atom_dict[a.GetSymbol()] for a in mol.GetAtoms()]
    return np.array(atoms)

# format from_atomIDx : [to_atomIDx, bondDict]
def create_ijbonddict(mol):
    i_jbond_dict = defaultdict(lambda: [])
    for b in mol.GetBonds():
        i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
        bond = bond_dict[str(b.GetBondType())]
        i_jbond_dict[i].append((j, bond))
        i_jbond_dict[j].append((i, bond))
    return i_jbond_dict


def create_fingerprints(atoms, i_jbond_dict, radius):
    """Extract the r-radius subgraphs (i.e., fingerprints)
    from a molecular graph using WeisfeilerLehman-like algorithm."""

    if (len(atoms) == 1) or (radius == 0):
        fingerprints = [fingerprint_dict[a] for a in atoms]

    else:
        vertices = atoms
        for _ in range(radius):
            fingerprints = []
            for i, j_bond in i_jbond_dict.items():
                neighbors = [(vertices[j], bond) for j, bond in j_bond]
                fingerprint = (vertices[i], tuple(sorted(neighbors)))
                fingerprints.append(fingerprint_dict[fingerprint])
            vertices = fingerprints

    return np.array(fingerprints)


def create_adjacency(mol):
    adjacency  = Chem.GetAdjacencyMatrix(mol)
    n          = adjacency.shape[0]

    adjacency  = adjacency + np.eye(n)
    degree     = sum(adjacency)
    d_half     = np.sqrt(np.diag(degree))
    d_half_inv = np.linalg.inv(d_half)
    adjacency  = np.matmul(d_half_inv,np.matmul(adjacency,d_half_inv))
    return np.array(adjacency)


def dump_dictionary(dictionary, file_name):
    with open(file_name, 'wb') as f:
        pickle.dump(dict(dictionary), f)


def load_tensor(file_name, dtype):
    return [dtype(d).to(device) for d in np.load(file_name + '.npy', allow_pickle=True)]


def load_numpy(file_name):
    return np.load(file_name + '.npy', allow_pickle=True)


def load_pickle(file_name):
    with open(file_name, 'rb') as f:
        return pickle.load(f)


def shuffle_dataset(dataset, seed):
    np.random.seed(seed)
    np.random.shuffle(dataset)
    return dataset


def split_dataset(dataset, ratio):
    n = int(ratio * len(dataset))
    dataset_1, dataset_2 = dataset[:n], dataset[n:]
    return dataset_1, dataset_2

In [6]:
radius = 3

with open('Drugs.txt', 'r') as f:
    data_list = f.read().strip().split('\n')

"""Exclude the data contains "." in the smiles, which correspond to non-bonds"""
data_list = list(filter(lambda x: '.' not in x.strip().split()[0], data_list))
N = len(data_list)

print('Total number of Drugs : %d' %(N))

atom_dict = defaultdict(lambda: len(atom_dict))
bond_dict = defaultdict(lambda: len(bond_dict))
fingerprint_dict = defaultdict(lambda: len(fingerprint_dict))

Molecules, Adjacencies, Properties, MACCS_list = [], [], [], []

max_MolMR, min_MolMR     = -1000, 1000
max_MolLogP, min_MolLogP = -1000, 1000
max_MolWt, min_MolWt     = -1000, 1000
max_NumRotatableBonds, min_NumRotatableBonds = -1000, 1000
max_NumAliphaticRings, min_NumAliphaticRings = -1000, 1000
max_NumAromaticRings, min_NumAromaticRings   = -1000, 1000
max_NumSaturatedRings, min_NumSaturatedRings = -1000, 1000
max_NumAromaticHeterocyclicRings, min_NumHeterocyclicRings = 0, 0

for no, data in enumerate(data_list):

    print('/'.join(map(str, [no+1, N])))

    smiles, property_indices = data.strip().split('\t')
    property_s = property_indices.strip().split(',')

    property = np.zeros((1,7))
    for prop in property_s:
        property[0,int(prop)] = 1

    Properties.append(property)

    mol = Chem.MolFromSmiles(smiles)
    atoms = create_atoms(mol)
    i_jbond_dict = create_ijbonddict(mol)

    fingerprints = create_fingerprints(atoms, i_jbond_dict, radius)
    Molecules.append(fingerprints)

    adjacency = create_adjacency(mol)
    Adjacencies.append(adjacency)

    MACCS         = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles))
    MACCS_ids     = np.zeros((20,))
    MACCS_ids[0]  = Descriptors.MolMR(mol)
    MACCS_ids[1]  = Descriptors.MolLogP(mol)
    MACCS_ids[2]  = Descriptors.MolWt(mol)
    MACCS_ids[3]  = Descriptors.NumRotatableBonds(mol)
    MACCS_ids[4]  = Descriptors.NumAliphaticRings(mol)
    MACCS_ids[5]  = MACCS[108]
    MACCS_ids[6]  = Descriptors.NumAromaticRings(mol)
    MACCS_ids[7]  = MACCS[98]
    MACCS_ids[8]  = Descriptors.NumSaturatedRings(mol)
    MACCS_ids[9]  = MACCS[137]
    MACCS_ids[10] = MACCS[136]
    MACCS_ids[11] = MACCS[145]
    MACCS_ids[12] = MACCS[116]
    MACCS_ids[13] = MACCS[141]
    MACCS_ids[14] = MACCS[89]
    MACCS_ids[15] = MACCS[50]
    MACCS_ids[16] = MACCS[160]
    MACCS_ids[17] = MACCS[121]
    MACCS_ids[18] = MACCS[149]
    MACCS_ids[19] = MACCS[161]


    if max_MolMR < MACCS_ids[0]:
        max_MolMR = MACCS_ids[0]
    if min_MolMR > MACCS_ids[0]:
        min_MolMR = MACCS_ids[0]

    if max_MolLogP < MACCS_ids[1]:
        max_MolLogP = MACCS_ids[1]
    if min_MolLogP > MACCS_ids[1]:
        min_MolLogP = MACCS_ids[1]

    if max_MolWt < MACCS_ids[2]:
        max_MolWt = MACCS_ids[2]
    if min_MolWt > MACCS_ids[2]:
        min_MolWt = MACCS_ids[2]

    if max_NumRotatableBonds < MACCS_ids[3]:
        max_NumRotatableBonds = MACCS_ids[3]
    if min_NumRotatableBonds > MACCS_ids[3]:
        min_NumRotatableBonds = MACCS_ids[3]

    if max_NumAliphaticRings < MACCS_ids[4]:
        max_NumAliphaticRings = MACCS_ids[4]
    if min_NumAliphaticRings > MACCS_ids[4]:
        min_NumAliphaticRings = MACCS_ids[4]

    if max_NumAromaticRings < MACCS_ids[6]:
        max_NumAromaticRings = MACCS_ids[6]
    if min_NumAromaticRings > MACCS_ids[6]:
        min_NumAromaticRings = MACCS_ids[6]

    if max_NumSaturatedRings < MACCS_ids[8]:
        max_NumSaturatedRings = MACCS_ids[8]
    if min_NumSaturatedRings > MACCS_ids[8]:
        min_NumSaturatedRings = MACCS_ids[8]

    MACCS_list.append(MACCS_ids)

dir_input = ('Drug pathway/input'+str(radius)+'/')
os.makedirs(dir_input, exist_ok=True)

for n in range(N):
    for b in range(20):
        if b==0:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_MolMR)/(max_MolMR-min_MolMR)
        elif b==1:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_MolLogP)/(max_MolMR-min_MolLogP)
        elif b==2:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_MolWt)/(max_MolMR-min_MolWt)
        elif b==3:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumRotatableBonds)/(max_MolMR-min_NumRotatableBonds)
        elif b==4:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumAliphaticRings)/(max_MolMR-min_NumAliphaticRings)
        elif b==6:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumAromaticRings)/(max_MolMR-min_NumAromaticRings)
        elif b==8:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumSaturatedRings)/(max_NumSaturatedRings-min_NumSaturatedRings)
        elif b==20:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumAromaticHeterocyclicRings)/(max_NumAromaticHeterocyclicRings-min_NumAromaticHeterocyclicRings)
np.save(dir_input + 'molecules', Molecules)
np.save(dir_input + 'adjacencies', Adjacencies)
np.save(dir_input + 'properties', Properties)
np.save(dir_input + 'maccs', np.asarray(MACCS_list))

dump_dictionary(fingerprint_dict, dir_input + 'fingerprint_dict.pickle')

print('End!')

Total number of Drugs : 2196
1/2196
2/2196
3/2196
4/2196
5/2196
6/2196
7/2196
8/2196
9/2196
10/2196
11/2196
12/2196
13/2196
14/2196
15/2196
16/2196
17/2196
18/2196
19/2196
20/2196
21/2196
22/2196
23/2196
24/2196
25/2196
26/2196
27/2196
28/2196
29/2196
30/2196
31/2196
32/2196
33/2196
34/2196
35/2196
36/2196
37/2196
38/2196
39/2196
40/2196
41/2196
42/2196
43/2196
44/2196
45/2196
46/2196
47/2196
48/2196
49/2196
50/2196
51/2196
52/2196
53/2196
54/2196
55/2196
56/2196
57/2196
58/2196
59/2196
60/2196
61/2196
62/2196
63/2196
64/2196
65/2196
66/2196
67/2196
68/2196
69/2196
70/2196
71/2196
72/2196
73/2196
74/2196
75/2196
76/2196
77/2196
78/2196
79/2196
80/2196
81/2196
82/2196
83/2196
84/2196
85/2196
86/2196
87/2196
88/2196
89/2196
90/2196
91/2196
92/2196
93/2196
94/2196
95/2196
96/2196
97/2196
98/2196
99/2196
100/2196
101/2196
102/2196
103/2196
104/2196
105/2196
106/2196
107/2196
108/2196
109/2196
110/2196
111/2196
112/2196
113/2196
114/2196
115/2196
116/2196
117/2196
118/2196
119/2196
120/2196

  arr = np.asanyarray(arr)


In [None]:
from sklearn.ensemble import RandomForestClassifier

### Define parameters

In [7]:
dim            = 70
layer          = 3
batch          = 10
lr             = 1e-3
lr_decay       = 0.75
decay_interval = 20
iteration      = 200
extra_dim      = 20
nhead = 5
head_dim = 32
dmodel = nhead * head_dim

(dim, layer, batch, decay_interval, iteration, extra_dim) = map(int, [dim, layer, batch, decay_interval, iteration, extra_dim])
lr, lr_decay = map(float, [lr, lr_decay])

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class GTIE(nn.Module):
    def __init__(self):
        super(GTIE, self).__init__()
        self.embed_atom = nn.Embedding(n_fingerprint, dim)
        self.W_atom = nn.ModuleList([nn.Linear(dim, dim) for _ in range(layer)])
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=dim, nhead=nhead),
            num_layers=layer
        )
        self.W_property = nn.Linear(dim + extra_dim, 7)

    def pad(self, matrices, value):
        sizes = [d.shape[0] for d in matrices]
        D = sum(sizes)
        pad_matrices = value + np.zeros((D, D))
        m = 0
        for i, d in enumerate(matrices):
            s_i = sizes[i]
            pad_matrices[m:m + s_i, m:m + s_i] = d
            m += s_i
        return torch.FloatTensor(pad_matrices).to(device)

    def sum_axis(self, xs, axis):
        y = list(map(lambda x: torch.sum(x, 0), torch.split(xs, axis)))
        return torch.stack(y)

    def update(self, xs, adjacency, i):
        hs = torch.relu(self.W_atom[i](xs))
        return torch.matmul(adjacency, hs)

    def forward(self, inputs, sel_maccs):
        atoms, adjacency = inputs
        axis = list(map(lambda x: len(x), atoms))
        atoms = torch.cat(atoms)
        x_atoms = self.embed_atom(atoms)
        adjacency = self.pad(adjacency, 0)

        for i in range(layer):
            x_atoms = self.update(x_atoms, adjacency, i)

        extra_inputs = sel_maccs.to(device)
        y_molecules = self.sum_axis(x_atoms, axis)
        y_molecules = torch.cat((y_molecules, extra_inputs), 1)
        z_properties = self.W_property(y_molecules)

        return z_properties

    def __call__(self, data_batch, train=True):
        sel_maccs = torch.FloatTensor(data_batch[-1])
        inputs, t_properties = data_batch[:-2], torch.cat(data_batch[-2])
        z_properties = self.forward(inputs, sel_maccs)

        if train:
            loss = F.binary_cross_entropy_with_logits(z_properties, t_properties)
            return loss
        else:
            zs = torch.sigmoid(z_properties).cpu().detach().numpy()
            ts = t_properties.cpu().detach().numpy()
            scores = list(map(lambda x: x, zs))
            labels = list(map(lambda x: (x >= 0.5).astype(int), zs))
            return scores, labels, ts


### Create a trainer class

In [9]:
class Trainer(object):

    def __init__(self, model):
        self.model = model
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def train(self, dataset_train):
        np.random.shuffle(dataset_train)
        N = len(dataset_train)
        loss_total = 0
        for i in range(0, N, batch):
            data_batch = list(zip(*dataset_train[i:i+batch]))
            loss = self.model(data_batch)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            loss_total += loss.to('cpu').data.numpy()
        return loss_total

### Create a tester class

In [10]:
class Tester(object):

    def __init__(self, model):
        self.model = model

    def test(self, dataset_test):

        N = len(dataset_test)
        score_list, label_list, t_list = [], [], []

        for i in range(0, N, batch):
            data_batch = list(zip(*dataset_test[i:i+batch]))
            scores, labels, ts = self.model(data_batch, train=False)
            score_list = np.append(score_list, scores)
            label_list = np.append(label_list, labels)
            t_list = np.append(t_list, ts)

        auc       = accuracy_score(t_list, label_list)
        precision = precision_score(t_list, label_list)
        recall    = recall_score(t_list, label_list)

        return auc, precision, recall

### Data preparation

In [11]:
dir_input = ('Drug pathway/input'+str(radius)+'/')

molecules    = load_tensor(dir_input + 'molecules', torch.LongTensor)
adjacencies  = load_numpy(dir_input + 'adjacencies')
t_properties = load_tensor(dir_input + 'properties', torch.FloatTensor)
maccs        = load_numpy(dir_input + 'maccs')

with open(dir_input + 'fingerprint_dict.pickle', 'rb') as f:
    fingerprint_dict = pickle.load(f)

dataset = list(zip(molecules, adjacencies, t_properties, maccs))
dataset = shuffle_dataset(dataset, 1234)
dataset_train, dataset_   = split_dataset(dataset, 0.8)
dataset_dev, dataset_test = split_dataset(dataset_, 0.5)

fingerprint_dict = load_pickle(dir_input + 'fingerprint_dict.pickle')
unknown          = 100
n_fingerprint    = len(fingerprint_dict) + unknown

### Create and train model

In [12]:
torch.manual_seed(1234)

model   = GTIE().to(device)
trainer = Trainer(model)
tester  = Tester(model)

dir_output = ('pathway/output/')
os.makedirs(dir_output, exist_ok=True)

print('Training...')
print('Epoch \t Time(sec) \t Loss_train \t AUC_dev \t AUC_test \t Precision \t Recall')

start = timeit.default_timer()

for epoch in range(iteration):
    if (epoch+1) % decay_interval == 0:
        trainer.optimizer.param_groups[0]['lr'] *= lr_decay

    loss    = trainer.train(dataset_train)
    auc_dev = tester.test(dataset_dev)[0]
    auc_test, precision, recall = tester.test(dataset_test)
    lr_rate = trainer.optimizer.param_groups[0]['lr']

    end  = timeit.default_timer()
    time = end - start

    print('%d \t %.4f \t %.4f \t %.4f \t %.4f \t %.4f \t %.4f' %(epoch, time, loss, auc_dev, auc_test, precision, recall))

Training...
Epoch 	 Time(sec) 	 Loss_train 	 AUC_dev 	 AUC_test 	 Precision 	 Recall


  sel_maccs = torch.FloatTensor(data_batch[-1])


0 	 4.0088 	 66.7876 	 0.8825 	 0.8792 	 0.6667 	 0.4170
1 	 4.5852 	 42.7273 	 0.9240 	 0.9227 	 0.7959 	 0.6638
2 	 5.1427 	 27.0941 	 0.9435 	 0.9338 	 0.8595 	 0.6766
3 	 5.7121 	 17.0663 	 0.9526 	 0.9494 	 0.8945 	 0.7574
4 	 6.2647 	 12.4643 	 0.9571 	 0.9513 	 0.8670 	 0.8043
5 	 6.8280 	 7.7195 	 0.9532 	 0.9468 	 0.8558 	 0.7830
6 	 7.3858 	 5.1473 	 0.9519 	 0.9506 	 0.8472 	 0.8255
7 	 7.9471 	 3.7173 	 0.9545 	 0.9617 	 0.8826 	 0.8638
8 	 8.6163 	 3.2714 	 0.9610 	 0.9578 	 0.8632 	 0.8596
9 	 9.3188 	 2.4371 	 0.9565 	 0.9591 	 0.8707 	 0.8596
10 	 10.0326 	 2.7197 	 0.9597 	 0.9597 	 0.8811 	 0.8511
11 	 10.7452 	 2.7969 	 0.9682 	 0.9643 	 0.8879 	 0.8766
12 	 11.3337 	 2.4255 	 0.9604 	 0.9597 	 0.8745 	 0.8596
13 	 11.8973 	 2.1353 	 0.9617 	 0.9578 	 0.8761 	 0.8426
14 	 12.4585 	 1.3166 	 0.9584 	 0.9571 	 0.8756 	 0.8383
15 	 13.0254 	 1.3094 	 0.9636 	 0.9649 	 0.9058 	 0.8596
16 	 13.5734 	 1.3937 	 0.9656 	 0.9643 	 0.9018 	 0.8596
17 	 14.1431 	 1.8613 	 0.960

### Random evaluation

In [13]:
data_batch = list(zip(*dataset_test[0:0+batch]))

sel_maccs            = torch.FloatTensor(data_batch[-1])
inputs, t_properties = data_batch[:-2], torch.cat(data_batch[-2])
z_properties         = model.forward(inputs, sel_maccs)

In [14]:
# True classes
print(t_properties)

# Predicted classes
torch.set_printoptions(precision=2)
p_properties = torch.sigmoid(z_properties)

for j in range(batch):
    print('%.2f\b %.2f\b %.2f\b %.2f\b %.2f\b %.2f\b %.2f\n' %(p_properties[j,0], \
    p_properties[j,1], p_properties[j,2], p_properties[j,3], p_properties[j,4], p_properties[j,5], p_properties[j,6] \
    ))

tensor([[0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0.]], device='cuda:0')
0.00 0.00 0.00 0.00 0.00 1.00 0.00

0.00 0.00 0.00 0.00 0.00 1.00 0.00

0.00 0.00 0.00 0.00 0.00 1.00 0.00

0.00 0.00 0.00 1.00 0.00 1.00 0.00

0.00 0.00 0.00 0.00 0.00 1.00 0.00

0.00 0.00 0.00 0.00 0.00 0.73 0.00

0.24 0.98 0.00 0.00 0.00 0.00 0.00

0.00 0.00 0.00 0.00 0.00 1.00 0.00

0.00 0.00 0.00 0.00 0.00 1.00 0.00

0.00 0.00 0.00 0.00 0.00 1.00 0.00



### Class-wise metrics performance

In [15]:
data_batch = list(zip(*dataset_test[:]))

sel_maccs            = torch.FloatTensor(data_batch[-1])
inputs, t_properties = data_batch[:-2], torch.cat(data_batch[-2])
z_properties         = model.forward(inputs, sel_maccs)

torch.set_printoptions(precision=2)
p_properties = torch.sigmoid(z_properties)

p_properties = p_properties.data.to('cpu').numpy()
t_properties = t_properties.data.to('cpu').numpy()

p_properties[p_properties<0.5]  = 0
p_properties[p_properties>=0.5] = 1

for c in range(7):
    y_true = t_properties[:,c]
    y_pred = p_properties[:,c]

    auc       = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall    = recall_score(y_true, y_pred)


    print('Class '+str(c+1)+' Metrics performance:')
    print('Accuracy %.4f, Precision %.4f, Recall %.4f\n' %(auc, precision, recall ))

Class 1 Metrics performance:
Accuracy 0.9636, Precision 0.9444, Recall 0.7083

Class 2 Metrics performance:
Accuracy 0.9455, Precision 0.8182, Recall 0.8182

Class 3 Metrics performance:
Accuracy 0.9682, Precision 1.0000, Recall 0.6818

Class 4 Metrics performance:
Accuracy 0.9682, Precision 0.8182, Recall 0.8571

Class 5 Metrics performance:
Accuracy 0.9864, Precision 0.9474, Recall 0.9000

Class 6 Metrics performance:
Accuracy 0.9091, Precision 0.9167, Recall 0.8800

Class 7 Metrics performance:
Accuracy 0.9864, Precision 0.9286, Recall 0.8667



In [16]:
dir_input = ('Drug pathway/input'+str(radius)+'/')

molecules  = load_tensor(dir_input + 'molecules', torch.FloatTensor)
properties = load_numpy(dir_input + 'properties')
maccs      = load_numpy(dir_input + 'maccs')


with open(dir_input + 'fingerprint_dict.pickle', 'rb') as f:
    fingerprint_dict = pickle.load(f)

fingerprint_dict = load_pickle(dir_input + 'fingerprint_dict.pickle')
unknown          = 100
n_fingerprint    = len(fingerprint_dict) + unknown

my_maccs = []
for i in range(len(molecules)):
    target_mol = (n_fingerprint-1)*torch.ones([259], dtype=torch.float, device=device)
    target_mol[:molecules[i].size()[0]] = molecules[i]
    my_maccs.append(np.concatenate((target_mol.cpu().data.numpy(),maccs[i]), axis=0))

dataset = list(zip(properties, my_maccs))
dataset = shuffle_dataset(dataset, 4123)
dataset_train, dataset_   = split_dataset(dataset, 0.8)
dataset_dev, dataset_test = split_dataset(dataset_, 0.2)


data_batch = list(zip(*dataset_train))
properties_train, maccs_train = data_batch[-2], data_batch[-1]

data_batch = list(zip(*dataset_dev))
properties_dev, maccs_dev = data_batch[-2], data_batch[-1]

data_batch = list(zip(*dataset_test))
properties_test, maccs_test = data_batch[-2], data_batch[-1]

In [17]:
train_len, dev_len, test_len = len(dataset_train), len(dataset_dev), len(dataset_test)

feature_len = maccs_train[0].shape[0]

X_train, X_dev, X_test = np.zeros((train_len,feature_len)), np.zeros((dev_len,feature_len)), np.zeros((test_len,feature_len))
Y_train, Y_dev, Y_test = np.zeros((train_len,7)), np.zeros((dev_len,7)), np.zeros((test_len,7))

for i in range(train_len):
    X_train[i,:] = maccs_train[i]
    Y_train[i] = properties_train[i][0]

for i in range(dev_len):
    X_dev[i,:]   = maccs_dev[i]
    Y_dev[i]   = properties_dev[i][0]

for i in range(test_len):
    X_test[i,:]  = maccs_test[i]
    Y_test[i]  = properties_test[i][0]

In [18]:
clf = ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=500, criterion = 'entropy', bootstrap = False, max_features = 0.3, max_depth = 50, random_state=0)
clf.fit(X_train, Y_train)
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier
# clf = RandomForestClassifier(n_estimators=500, criterion = 'entropy', bootstrap = False, max_features = 0.3, max_depth = 50, random_state=0)
# clf.fit(X_train, Y_train)

In [19]:
Y_pred = clf.predict(X_test)
start = timeit.default_timer()
acc_score, prec_score, rec_score, mat_score, pr_score = 0., 0., 0., 0., 0.
for i in range(Y_test.shape[0]):
    acc_score  += accuracy_score(Y_test[i],Y_pred[i])
    prec_score += precision_score(Y_test[i],Y_pred[i])
    rec_score  += recall_score(Y_test[i],Y_pred[i])
    mat_score  += matthews_corrcoef(Y_test[i],Y_pred[i])
    pr_score   += f1_score(Y_test[i],Y_pred[i])



acc_score  = acc_score/Y_test.shape[0]
prec_score = prec_score/Y_test.shape[0]
rec_score  = rec_score/Y_test.shape[0]
mat_score  = mat_score/Y_test.shape[0]
pr_score  = pr_score/Y_test.shape[0]

end  = timeit.default_timer()
time = end - start

print('Accuracy \t Precision \t Recall \t Mattews \t F1 \t Time (sec)')
print('%.4f%% \t %.4f%% \t %.4f%% \t %.4f%% \t %.4f%% \t %.4f ' %(acc_score, prec_score, rec_score, mat_score, pr_score, time))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy 	 Precision 	 Recall 	 Mattews 	 F1 	 Time (sec)
0.9785% 	 0.9361% 	 0.9252% 	 0.9206% 	 0.9270% 	 1.6177 
