In [35]:

#%matplotlib inline
#import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import math
import pandas as pd

from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import InputLayer, Input
from tensorflow.python.keras.layers import Reshape, MaxPooling2D
from tensorflow.python.keras.layers import Conv2D, Dense, Flatten
from tensorflow.python.keras.callbacks import TensorBoard
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.models import load_model
from keras.models import model_from_json
from tensorflow import keras

# Scikit-optimizer
import skopt
from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Categorical, Integer
#from skopt.plots import plot_convergence
#from skopt.plots import plot_objective, plot_evaluations
#from skopt.plots import plot_histogram, plot_objective_2D
from skopt.utils import use_named_args

best_accuracy = 0.0
path_best_model_h5 = 'best_model_keras.h5'
path_best_model_json = 'best_model_keras.json'
modelpath = 'best_model_keras'

In [36]:
def log_dir_name(learning_rate, num_dense_layers,
                 num_dense_nodes, activation):

    # The dir-name for the TensorBoard log-dir.
    s = "./19_logs/lr_{0:.0e}_layers_{1}_nodes_{2}_{3}/"

    # Insert all the hyper-parameters in the dir-name.
    log_dir = s.format(learning_rate,
                       num_dense_layers,
                       num_dense_nodes,
                       activation)

    return log_dir


In [37]:
def xyPart(data):
    Xdata = data.iloc[:,0:177]
    Ydata = []
    for elem in data.iloc[:,177]:
        if elem == "Low":
            Ydata.append(0)
        elif elem == "High":
            Ydata.append(1)
    Ydata = np.array(Ydata)
    return Xdata, Ydata

In [38]:

def create_model(learning_rate, num_dense_layers,
                 num_dense_nodes, activation):
    """
    Hyper-parameters:
    learning_rate:     Learning-rate for the optimizer.
    num_dense_layers:  Number of dense layers.
    num_dense_nodes:   Number of nodes in each dense layer.
    activation:        Activation function for all layers.
    """

    # Start construction of a Keras Sequential model.
    model = Sequential()

    # Add fully-connected / dense layers.
    # The number of layers is a hyper-parameter we want to optimize.
    for i in range(num_dense_layers):
        # Name of the layer. This is not really necessary
        # because Keras should give them unique names.
        name = 'layer_dense_{0}'.format(i+1)

        # Add the dense / fully-connected layer to the model.
        # This has two hyper-parameters we want to optimize:
        # The number of nodes and the activation function.
        model.add(Dense(num_dense_nodes,
                        activation=activation,
                        name=name))

    # Last fully-connected / dense layer with softmax-activation
    # for use in classification.
    model.add(Dense(2, activation='softmax'))

    # Use the Adam method for training the network.
    # We want to find the best learning-rate for the Adam method.
    optimizer = Adam(lr=learning_rate)

    # In Keras we need to compile the model so it can be trained.
    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [39]:
def saveModel(model, path_best_model_h5, path_best_model_json):
    # serialize model to JSON
    model_json = model.to_json()
    with open(path_best_model_json, "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights(path_best_model_h5)
    print("Saved model to disk")

In [10]:
def loadModel(path_best_model_h5, path_best_model_json):
    # load json and create model
    json_file = open(path_best_model_json, 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(path_best_model_h5)
    print("Loaded model from disk")
    return loaded_model

In [40]:
# Define the parameters to optimize
dim_learning_rate = Real(low=1e-6, high=1e-2, prior='log-uniform', name='learning_rate')
dim_num_dense_layers = Integer(low=1, high=5, name='num_dense_layers')
dim_num_dense_nodes = Integer(low=5, high=512, name='num_dense_nodes')
dim_activation = Categorical(categories=['relu', 'sigmoid'], name='activation')
dimensions = [dim_learning_rate, dim_num_dense_layers, dim_num_dense_nodes, dim_activation]
default_parameters = [1e-5, 1, 16, 'relu'] # Same order as dimensions

In [41]:
@use_named_args(dimensions=dimensions)
def fitness(learning_rate, num_dense_layers,
            num_dense_nodes, activation):
    """
    Hyper-parameters:
    learning_rate:     Learning-rate for the optimizer.
    num_dense_layers:  Number of dense layers.
    num_dense_nodes:   Number of nodes in each dense layer.
    activation:        Activation function for all layers.
    """

    # Print the hyper-parameters.
    print('learning rate: {0:.1e}'.format(learning_rate))
    print('num_dense_layers:', num_dense_layers)
    print('num_dense_nodes:', num_dense_nodes)
    print('activation:', activation)
    print()

    # Create the neural network with these hyper-parameters.
    model = create_model(learning_rate=learning_rate,
                         num_dense_layers=num_dense_layers,
                         num_dense_nodes=num_dense_nodes,
                         activation=activation)

    # Dir-name for the TensorBoard log-files.
    log_dir = log_dir_name(learning_rate, num_dense_layers,
                           num_dense_nodes, activation)

    # Create a callback-function for Keras which will be
    # run after each epoch has ended during training.
    # This saves the log-files for TensorBoard.
    # Note that there are complications when histogram_freq=1.
    # It might give strange errors and it also does not properly
    # support Keras data-generators for the validation-set.
    callback_log = TensorBoard(
        log_dir=log_dir,
        histogram_freq=0,
        batch_size=32,
        write_graph=True,
        write_grads=False,
        write_images=False)
    
    # Stop training dependent on validation accuracy - JGW
    early_stop = keras.callbacks.EarlyStopping(monitor='val_acc', patience=20)

    # Use Keras to train the model.
    history = model.fit(x=ncXtrain.values,
                        y=Ytrain,
                        epochs=3,
                        batch_size=128,
                        validation_split = 0.2,
                        callbacks=[callback_log, early_stop])

    # Get the classification accuracy on the validation-set
    # after the last training-epoch.
    accuracy = history.history['val_acc'][-1]

        # Print the classification accuracy.
    print()
    print("Accuracy: {0:.2%}".format(accuracy))
    print()
    # Save the model if it improves on the best-found performance.
    # We use the global keyword so we update the variable outside
    # of this function.
    global best_accuracy

    # If the classification accuracy of the saved model is improved ...
    if accuracy > best_accuracy:
        # Save the new model to harddisk.
        #saveModel(model, path_best_model_h5, path_best_model_json)
        tf.keras.models.save_model(model, modelpath, overwrite=True, include_optimizer=True)

        # Update the classification accuracy.
        best_accuracy = accuracy

    # Delete the Keras model with these hyper-parameters from memory.
    del model

    # Clear the Keras session, otherwise it will keep adding new
    # models to the same TensorFlow graph each time we create
    # a model with a different set of hyper-parameters.
    K.clear_session()

    # NOTE: Scikit-optimize does minimization so it tries to
    # find a set of hyper-parameters with the LOWEST fitness-value.
    # Because we are interested in the HIGHEST classification
    # accuracy, we need to negate this number so it can be minimized.
    return -accuracy


In [42]:
# Load and preprocess data
data = pd.read_csv('HLMdesc.txt', sep="\t")
data = data.drop(columns = ['Protocol Number', 'Experiment Start Time', 'Experiment Stop Time', 'Result Type Name', 'Result Operator', 'Result Number Rounded',\
 'Project NumName', 'Smiles', 'Leonumber', 'DResult Text', 'DResult Text_1'])
data = data.drop(columns = ['Protocol Number_1', 'Experiment Start Time_1', 'Experiment Stop Time_1', 'Result Type Name_1', 'Result Operator_1', 'Result Number Rounded_1', 'Project NumName_1',\
       'Leonumber_1', 'origSmiles_1', 'ClappClass_1'])
print(data.columns)


Index(['rdk.fr_C_O_noCOO', 'rdk.Chi4v', 'rdk.fr_Ar_COO', 'rdk.fr_SH',
       'rdk.Chi4n', 'rdk.SMR_VSA10', 'rdk.fr_para_hydroxylation',
       'rdk.fr_barbitur', 'rdk.fr_halogen', 'rdk.fr_dihydropyridine',
       ...
       'rdk.PEOE_VSA9', 'rdk.fr_aldehyde', 'rdk.fr_pyridine',
       'rdk.fr_tetrazole', 'rdk.RingCount', 'rdk.fr_nitro_arom_nonortho',
       'rdk.Chi0v', 'rdk.fr_ArN', 'rdk.NumRotatableBonds', 'c#ClappClass'],
      dtype='object', length=178)


In [43]:
# Seperate test set
train = data.sample(frac=0.8,random_state=0)
test = data.drop(train.index)
#print(len(train))
#print(len(test))
Xtrain, Ytrain = xyPart(train)
Xtest, Ytest = xyPart(test)
print(Xtrain.columns)

Index(['rdk.fr_C_O_noCOO', 'rdk.Chi4v', 'rdk.fr_Ar_COO', 'rdk.fr_SH',
       'rdk.Chi4n', 'rdk.SMR_VSA10', 'rdk.fr_para_hydroxylation',
       'rdk.fr_barbitur', 'rdk.fr_halogen', 'rdk.fr_dihydropyridine',
       ...
       'rdk.MolMR', 'rdk.PEOE_VSA9', 'rdk.fr_aldehyde', 'rdk.fr_pyridine',
       'rdk.fr_tetrazole', 'rdk.RingCount', 'rdk.fr_nitro_arom_nonortho',
       'rdk.Chi0v', 'rdk.fr_ArN', 'rdk.NumRotatableBonds'],
      dtype='object', length=177)


In [44]:
# Normalize
data_stats = Xtrain.describe()
data_stats = data_stats.transpose()
def norm(x):
  return (x - data_stats['mean']) / data_stats['std']
nXtrain = norm(Xtrain)
nXtest = norm(Xtest)
#print(Xtrain.values[0])
#print(nXtrain.values[0])

In [45]:
# Rm columns containing NA
allCol = nXtrain.columns
ncXtrain = nXtrain.dropna(axis='columns')
col = ncXtrain.columns  # After removing NA cols
delCol = []
for c in allCol:
    if c not in col:
        delCol.append(c)
ncXtest = nXtest.drop(delCol, axis=1)
#print(nXtrain.values[0])
#print(ncXtrain.values[0])
#print(ncXtest.columns)


In [18]:
fitness(x=[1e-4, 3, 256, 'relu'])

learning rate: 1.0e-04
num_dense_layers: 3
num_dense_nodes: 256
activation: relu

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 58.90%



-0.589029535512884

In [46]:
search_result = gp_minimize(func=fitness, dimensions=dimensions, acq_func='EI', n_calls=40, x0=default_parameters)

learning rate: 1.0e-05
num_dense_layers: 1
num_dense_nodes: 16
activation: relu

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 17.22%

learning rate: 5.3e-04
num_dense_layers: 2
num_dense_nodes: 386
activation: sigmoid

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 63.54%

learning rate: 1.8e-05
num_dense_layers: 5
num_dense_nodes: 77
activation: relu

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 57.47%

learning rate: 5.0e-03
num_dense_layers: 3
num_dense_nodes: 173
activation: sigmoid

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 36.71%

learning rate: 2.9e-04
num_dense_layers: 5
num_dense_nodes: 437
activation: sigmoid

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 99.24%

learning rate: 2.8e-05
num_dense_layers: 3
num_dense_nodes: 349
activation: sigmoid

Train on 4740

Epoch 3/3

Accuracy: 20.42%

learning rate: 1.1e-06
num_dense_layers: 1
num_dense_nodes: 497
activation: relu

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 72.91%

learning rate: 3.1e-04
num_dense_layers: 3
num_dense_nodes: 435
activation: sigmoid

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 51.05%

learning rate: 1.6e-03
num_dense_layers: 5
num_dense_nodes: 430
activation: sigmoid

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 100.00%

learning rate: 1.2e-06
num_dense_layers: 5
num_dense_nodes: 414
activation: relu

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 21.27%

learning rate: 4.9e-06
num_dense_layers: 5
num_dense_nodes: 415
activation: sigmoid

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 31.48%

learning rate: 1.0e-02
num_dense_layers: 5
num_dense_nodes: 435
ac

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 63.29%

learning rate: 9.1e-05
num_dense_layers: 4
num_dense_nodes: 443
activation: relu

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 51.14%

learning rate: 1.4e-06
num_dense_layers: 3
num_dense_nodes: 508
activation: sigmoid

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 1.69%

learning rate: 2.6e-03
num_dense_layers: 2
num_dense_nodes: 424
activation: sigmoid

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 78.73%

learning rate: 5.2e-03
num_dense_layers: 5
num_dense_nodes: 419
activation: relu

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 9.87%

learning rate: 1.5e-03
num_dense_layers: 3
num_dense_nodes: 428
activation: sigmoid

Train on 4740 samples, validate on 1185 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 30.63%

l

In [47]:
print(search_result.x)
space = search_result.space
#space.point_to_dict(search_result.x)
search_result.fun

[0.0015745669510686966, 5, 430, 'sigmoid']


-1.0

In [48]:
#sorted(zip(search_result.func_vals, search_result.x_iters))
print(dir(search_result))
print(search_result.space)

['fun', 'func_vals', 'models', 'random_state', 'space', 'specs', 'x', 'x_iters']
Space([Real(low=1e-06, high=0.01, prior='log-uniform', transform='normalize'),
       Integer(low=1, high=5),
       Integer(low=5, high=512),
       Categorical(categories=('relu', 'sigmoid'), prior=None)])


In [49]:
from skopt.plots import plot_objective_2D
fig = plot_objective_2D(result=search_result,
                        dimension_name1='learning_rate',
                        dimension_name2='num_dense_layers',
                        levels=50)

ImportError: cannot import name 'rcParams'

In [87]:
#model = loadModel(path_best_model_h5, path_best_model_json)
tf.keras.models.load_model(modelpath, custom_objects=None, compile=True)

AttributeError: 'Sequential' object has no attribute 'output_names'