Skip to content

Commit

Permalink
Created folder with example configs
Browse files Browse the repository at this point in the history
  • Loading branch information
Mariewelt committed Aug 9, 2018
1 parent c2876b9 commit 67d3019
Show file tree
Hide file tree
Showing 2 changed files with 206 additions and 0 deletions.
95 changes: 95 additions & 0 deletions example_configs/logp_gcnn_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from openchem.models.Graph2Label import Graph2Label
from openchem.modules.encoders.gcn_encoder import GraphCNNEncoder
from openchem.modules.mlp.openchem_mlp import OpenChemMLP
from openchem.data.graph_data_layer import GraphDataset

from openchem.utils.graph import Attribute
from openchem.utils.utils import identity

import torch.nn as nn
from torch.optim import RMSprop, SGD, Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR
import torch.nn.functional as F
from sklearn.metrics import r2_score, mean_squared_error

import pandas as pd

import copy
import pickle


def get_atomic_attributes(atom):
    """Extract per-atom integer features for the graph data layer.

    Args:
        atom: an RDKit ``Atom``-like object exposing ``GetAtomicNum``,
            ``GetTotalValence``, ``GetFormalCharge``, ``GetHybridization``
            and ``GetIsAromatic``.

    Returns:
        dict: feature name -> integer value, matching the value sets
        declared in the ``node_attributes`` specification.
    """
    # B, N, C, O, F, P, S, Cl, Br, I map to classes 0-9; any other
    # element falls into the catch-all class 10.
    atomic_mapping = {5: 0, 7: 1, 6: 2, 8: 3, 9: 4, 15: 5, 16: 6, 17: 7,
                      35: 8, 53: 9}
    attr_dict = {
        'atom_element': atomic_mapping.get(atom.GetAtomicNum(), 10),
        'valence': atom.GetTotalValence(),
        'charge': atom.GetFormalCharge(),
        # .real converts the hybridization enum to its plain int value.
        'hybridization': atom.GetHybridization().real,
        'aromatic': int(atom.GetIsAromatic()),
    }
    return attr_dict


# Node (atom) feature specification for the graph data layer.
# Every attribute is one-hot encoded over its enumerated value set.
node_attributes = {
    'valence': Attribute('node', 'valence', one_hot=True,
                         values=[1, 2, 3, 4, 5, 6]),
    'charge': Attribute('node', 'charge', one_hot=True,
                        values=[-1, 0, 1, 2, 3, 4]),
    'hybridization': Attribute('node', 'hybridization', one_hot=True,
                               values=[0, 1, 2, 3, 4, 5, 6, 7]),
    'aromatic': Attribute('node', 'aromatic', one_hot=True,
                          values=[0, 1]),
    'atom_element': Attribute('node', 'atom_element', one_hot=True,
                              values=list(range(11))),
}

# Lipophilicity regression data: column 0 holds SMILES, column 1 the target.
_LIPO_DIR = './benchmark_datasets/Lipophilicity_dataset/'
train_dataset = GraphDataset(get_atomic_attributes, node_attributes,
                             _LIPO_DIR + 'Lipophilicity_train.csv',
                             delimiter=',', cols_to_read=[0, 1])
test_dataset = GraphDataset(get_atomic_attributes, node_attributes,
                            _LIPO_DIR + 'Lipophilicity_test.csv',
                            delimiter=',', cols_to_read=[0, 1])

# Model class: graph-convolutional encoder feeding an MLP regression head.
model = Graph2Label

# Sub-configurations, named before they are assembled below.
_encoder_params = {
    'input_size': train_dataset.num_features,
    'encoder_dim': 128,
    'n_layers': 5,
    'hidden_size': [128, 128, 128, 128, 128],
}
_mlp_params = {
    'input_size': 128,
    'n_layers': 2,
    # Final layer outputs a single scalar with identity activation
    # (raw regression value).
    'hidden_size': [128, 1],
    'activation': [F.relu, identity],
}

# Full training configuration consumed by the OpenChem launcher.
model_params = {
    'task': 'regression',
    'data_layer': GraphDataset,
    'use_clip_grad': False,
    'batch_size': 256,
    'num_epochs': 101,
    'logdir': '/home/user/Work/OpenChem/logs/logp_gcnn_logs',
    'print_every': 10,
    'save_every': 5,
    'train_data_layer': train_dataset,
    'val_data_layer': test_dataset,
    'eval_metrics': r2_score,
    'criterion': nn.MSELoss(),
    'optimizer': Adam,
    'optimizer_params': {'lr': 0.0005},
    'lr_scheduler': StepLR,
    'lr_scheduler_params': {'step_size': 15, 'gamma': 0.8},
    'encoder': GraphCNNEncoder,
    'encoder_params': _encoder_params,
    'mlp': OpenChemMLP,
    'mlp_params': _mlp_params,
}

111 changes: 111 additions & 0 deletions example_configs/tox21_rnn_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from openchem.models.Smiles2Label import Smiles2Label
from openchem.modules.embeddings.basic_embedding import Embedding
from openchem.modules.encoders.rnn_encoder import RNNEncoder
from openchem.modules.mlp.openchem_mlp import OpenChemMLP
from openchem.data.smiles_data_layer import SmilesDataset
from openchem.criterion.multitask_loss import MultitaskLoss

import torch
import torch.nn as nn

import numpy as np

from torch.optim import RMSprop, Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, mean_squared_error

from openchem.data.utils import read_smiles_property_file
from openchem.data.utils import get_tokens
from openchem.data.utils import save_smiles_property_file
from sklearn.model_selection import train_test_split

# Tox21: column 13 holds SMILES; columns 0-11 hold the 12 task labels.
data = read_smiles_property_file('./benchmark_datasets/tox21/tox21.csv',
                                 cols_to_read=[13] + list(range(0, 12)))
smiles = data[0]
labels = np.array(data[1:])
# Missing measurements come through as empty strings; recode them with the
# 999 sentinel that the loss/metric functions ignore, then transpose to a
# (samples, tasks) layout.
labels[labels == ''] = '999'
labels = labels.T

tokens, _, _ = get_tokens(smiles)
tokens = tokens + ' '  # append blank as the padding token

X_train, X_test, y_train, y_test = train_test_split(
    smiles, labels, test_size=0.2, random_state=42)

save_smiles_property_file('./benchmark_datasets/tox21/train.smi',
                          X_train, y_train)
save_smiles_property_file('./benchmark_datasets/tox21/test.smi',
                          X_test, y_test)

train_dataset = SmilesDataset('./benchmark_datasets/tox21/train.smi',
                              delimiter=',', cols_to_read=list(range(13)),
                              tokens=tokens, augment=True)
test_dataset = SmilesDataset('./benchmark_datasets/tox21/test.smi',
                             delimiter=',', cols_to_read=list(range(13)),
                             tokens=tokens)

def multitask_auc(ground_truth, predicted):
    """Return the mean ROC-AUC across all tasks.

    Samples whose label equals the 999 sentinel (missing measurement)
    are excluded from that task's AUC.

    Args:
        ground_truth: array-like of shape (n_samples, n_tasks) holding
            binary labels (0/1) or the 999 missing-value sentinel.
        predicted: array-like of shape (n_samples, n_tasks) holding
            per-task prediction scores.

    Returns:
        float: unweighted mean of the per-task ROC-AUC values.
    """
    # Local imports keep this metric self-contained for distributed workers.
    from sklearn.metrics import roc_auc_score
    import numpy as np
    ground_truth = np.array(ground_truth)
    predicted = np.array(predicted)
    n_tasks = ground_truth.shape[1]
    auc = []
    for i in range(n_tasks):
        # Keep only samples that were actually measured for task i.
        ind = np.where(ground_truth[:, i] != 999)[0]
        auc.append(roc_auc_score(ground_truth[ind, i], predicted[ind, i]))
    return np.mean(auc)

# Model class: SMILES token embedding -> RNN encoder -> MLP multitask head.
model = Smiles2Label

# Sub-configurations, named before they are assembled below.
_embedding_params = {
    'num_embeddings': train_dataset.num_tokens,
    'embedding_dim': 128,
    # The blank token appended to the vocabulary is used for padding.
    'padding_idx': train_dataset.tokens.index(' '),
}
_encoder_params = {
    'input_size': 128,
    'layer': "LSTM",
    'encoder_dim': 128,
    'n_layers': 4,
    'dropout': 0.8,
    'is_bidirectional': False,
}
_mlp_params = {
    'input_size': 128,
    'n_layers': 2,
    # One sigmoid output per Tox21 task.
    'hidden_size': [128, 12],
    'activation': [F.relu, torch.sigmoid],
    'dropout': 0.0,
}

# Full training configuration consumed by the OpenChem launcher.
model_params = {
    'use_cuda': True,
    'task': 'multitask',
    'random_seed': 5,
    'use_clip_grad': True,
    'max_grad_norm': 10.0,
    'batch_size': 256,
    'num_epochs': 31,
    'logdir': '/home/mpopova/Work/OpenChem/logs/rnn_log',
    'print_every': 5,
    'save_every': 5,
    'train_data_layer': train_dataset,
    'val_data_layer': test_dataset,
    'eval_metrics': multitask_auc,
    # 999 marks missing labels; the multitask loss skips them per task.
    'criterion': MultitaskLoss(ignore_index=999, n_tasks=12).cuda(),
    'optimizer': RMSprop,
    'optimizer_params': {'lr': 0.001},
    'lr_scheduler': StepLR,
    'lr_scheduler_params': {'step_size': 10, 'gamma': 0.8},
    'embedding': Embedding,
    'embedding_params': _embedding_params,
    'encoder': RNNEncoder,
    'encoder_params': _encoder_params,
    'mlp': OpenChemMLP,
    'mlp_params': _mlp_params,
}

0 comments on commit 67d3019

Please sign in to comment.