In [20]:

#%matplotlib inline
#import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import math
import pandas as pd

from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import InputLayer, Input
from tensorflow.python.keras.layers import Reshape, MaxPooling2D
from tensorflow.python.keras.layers import Conv2D, Dense, Flatten
from tensorflow.python.keras.callbacks import TensorBoard
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.models import load_model
from keras.models import model_from_json

# Scikit-optimizer
import skopt
from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Categorical, Integer
#from skopt.plots import plot_convergence
#from skopt.plots import plot_objective, plot_evaluations
#from skopt.plots import plot_histogram, plot_objective_2D
from skopt.utils import use_named_args

In [21]:
def xyPart(data):
    Xdata = data.iloc[:,0:177]
    Ydata = []
    for elem in data.iloc[:,177]:
        if elem == "Low":
            Ydata.append(0)
        elif elem == "High":
            Ydata.append(1)
    Ydata = np.array(Ydata)
    return Xdata, Ydata

In [22]:
def create_model(learning_rate, num_dense_layers,
                 num_dense_nodes, activation):
    """
    Hyper-parameters:
    learning_rate:     Learning-rate for the optimizer.
    num_dense_layers:  Number of dense layers.
    num_dense_nodes:   Number of nodes in each dense layer.
    activation:        Activation function for all layers.
    """

    # Start construction of a Keras Sequential model.
    model = Sequential()

    # Add fully-connected / dense layers.
    # The number of layers is a hyper-parameter we want to optimize.
    for i in range(num_dense_layers):
        # Name of the layer. This is not really necessary
        # because Keras should give them unique names.
        name = 'layer_dense_{0}'.format(i+1)

        # Add the dense / fully-connected layer to the model.
        # This has two hyper-parameters we want to optimize:
        # The number of nodes and the activation function.
        model.add(Dense(num_dense_nodes,
                        activation=activation,
                        name=name))

    # Last fully-connected / dense layer with softmax-activation
    # for use in classification.
    model.add(Dense(2, activation='softmax'))

    # Use the Adam method for training the network.
    # We want to find the best learning-rate for the Adam method.
    optimizer = Adam(lr=learning_rate)

    # In Keras we need to compile the model so it can be trained.
    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [23]:
# Load and preprocess data
data = pd.read_csv('HLMdesc.txt', sep="\t")
data = data.drop(columns = ['Protocol Number', 'Experiment Start Time', 'Experiment Stop Time', 'Result Type Name', 'Result Operator', 'Result Number Rounded',\
 'Project NumName', 'Smiles', 'Leonumber', 'DResult Text', 'DResult Text_1'])
data = data.drop(columns = ['Protocol Number_1', 'Experiment Start Time_1', 'Experiment Stop Time_1', 'Result Type Name_1', 'Result Operator_1', 'Result Number Rounded_1', 'Project NumName_1',\
       'Leonumber_1', 'origSmiles_1', 'ClappClass_1'])
print(data.columns)

Index(['rdk.fr_C_O_noCOO', 'rdk.Chi4v', 'rdk.fr_Ar_COO', 'rdk.fr_SH',
       'rdk.Chi4n', 'rdk.SMR_VSA10', 'rdk.fr_para_hydroxylation',
       'rdk.fr_barbitur', 'rdk.fr_halogen', 'rdk.fr_dihydropyridine',
       ...
       'rdk.PEOE_VSA9', 'rdk.fr_aldehyde', 'rdk.fr_pyridine',
       'rdk.fr_tetrazole', 'rdk.RingCount', 'rdk.fr_nitro_arom_nonortho',
       'rdk.Chi0v', 'rdk.fr_ArN', 'rdk.NumRotatableBonds', 'c#ClappClass'],
      dtype='object', length=178)


In [24]:
# Seperate test set
train = data.sample(frac=0.8,random_state=0)
test = data.drop(train.index)
#print(len(train))
#print(len(test))
Xtrain, Ytrain = xyPart(train)
Xtest, Ytest = xyPart(test)
print(Xtrain.columns)

Index(['rdk.fr_C_O_noCOO', 'rdk.Chi4v', 'rdk.fr_Ar_COO', 'rdk.fr_SH',
       'rdk.Chi4n', 'rdk.SMR_VSA10', 'rdk.fr_para_hydroxylation',
       'rdk.fr_barbitur', 'rdk.fr_halogen', 'rdk.fr_dihydropyridine',
       ...
       'rdk.MolMR', 'rdk.PEOE_VSA9', 'rdk.fr_aldehyde', 'rdk.fr_pyridine',
       'rdk.fr_tetrazole', 'rdk.RingCount', 'rdk.fr_nitro_arom_nonortho',
       'rdk.Chi0v', 'rdk.fr_ArN', 'rdk.NumRotatableBonds'],
      dtype='object', length=177)


In [25]:
# Normalize
data_stats = Xtrain.describe()
data_stats = data_stats.transpose()
def norm(x):
  return (x - data_stats['mean']) / data_stats['std']
nXtrain = norm(Xtrain)
nXtest = norm(Xtest)
#print(Xtrain.values[0])
#print(nXtrain.values[0])

In [26]:
# Rm columns containing NA
allCol = nXtrain.columns
ncXtrain = nXtrain.dropna(axis='columns')
col = ncXtrain.columns  # After removing NA cols
delCol = []
for c in allCol:
    if c not in col:
        delCol.append(c)
ncXtest = nXtest.drop(delCol, axis=1)
#print(nXtrain.values[0])
#print(ncXtrain.values[0])
#print(ncXtest.columns)

In [27]:
model = create_model(0.00927, 3, 393, "sigmoid")

In [28]:
from tensorflow import keras
# Display training progress by printing a single dot for each completed epoch
class PrintDot(keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs):
    if epoch % 100 == 0: print('')
    print('.', end='')
    

early_stop = keras.callbacks.EarlyStopping(monitor='val_acc', patience=10)
history = model.fit(ncXtrain.values, Ytrain, epochs=300,
                    validation_split = 0.2, verbose=0, callbacks=[early_stop, PrintDot()])


.....................

In [29]:
#import matplotlib.pyplot as plt

def plot_history(history):
  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Acc')
  plt.plot(hist['epoch'], hist['acc'],
           label='Train Acc')
  plt.plot(hist['epoch'], hist['val_acc'],
           label = 'Val Acc')
  plt.legend()
  plt.ylim([0.5,1])
  

hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()
print(hist)
#plot_history(history)

    val_loss   val_acc      loss       acc  epoch
0   0.663318  0.309705  0.785709  0.482278      0
1   0.663316  0.200844  0.661562  0.450633      1
2   0.663316  0.277637  0.661561  0.419198      2
3   0.663316  0.549367  0.661561  0.426793      3
4   0.663316  0.717300  0.661561  0.454852      4
5   0.663316  0.493671  0.661561  0.495148      5
6   0.663316  0.211814  0.661561  0.457384      6
7   0.663316  0.215190  0.661561  0.481013      7
8   0.663316  0.999156  0.661561  0.455907      8
9   0.663316  0.000000  0.661561  0.465401      9
10  0.663316  1.000000  0.661561  0.474051     10
11  0.663319  1.000000  0.661561  0.495992     11
12  0.663316  1.000000  0.661561  0.478692     12
13  0.663435  0.000000  0.661567  0.464557     13
14  0.663316  0.603376  0.663854  0.493882     14
15  0.663316  0.009283  0.661561  0.527848     15
16  0.663316  0.786498  0.661561  0.515401     16
17  0.663316  0.007595  0.661561  0.524051     17
18  0.663316  0.000000  0.661561  0.518776     18
