## Building an optimized CNN Model for Morgan fingerprints

This Notebook will uses Keras Tuner to find the best hyperparameters for a CNN model which will label molecules according to their Morgan fingerprints

#### In case the requirements.txt file won't work

In [4]:
#pip install keras_tuner

In [7]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from numpy import set_printoptions
from numpy import asarray
from numpy import unique
from numpy import argmax
from tensorflow import keras
from tensorflow.keras.datasets.mnist import load_data
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets.mnist import load_data
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPool1D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras_tuner import RandomSearch

### Loading data

In [8]:
all_drugs = pd.read_pickle(os.path.join('res','pickles', 'drug_fp.pkl'))

In [9]:
drugs = all_drugs
drugs.head()

Unnamed: 0,CID,Molecule,drug_class,drug_class_code,FeatInvariants,ConnInvariants,Morgan2FP,MACCSKeys,AtomPairFP,TopTorFP,AvalonFP,PubchemFP,CactvsFP
0,24769,<rdkit.Chem.rdchem.Mol object at 0x000001B9642...,hematologic,7,"[0, 18, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 8...","[2968968094, 2976033787, 2968968094, 297603378...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, ..."
1,134694070,<rdkit.Chem.rdchem.Mol object at 0x000001B9642...,cardio,3,"[0, 0, 2, 0, 17, 17, 4, 4, 4, 6, 4, 4, 6, 4, 4...","[2968968094, 2976033787, 2968968094, 297603378...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, ..."
2,5121,<rdkit.Chem.rdchem.Mol object at 0x000001B94ED...,antiinfective,0,"[0, 0, 2, 0, 2, 0, 0, 16, 4, 4, 4, 4, 4, 4, 8]","[2968968094, 2976033787, 2968968094, 297603378...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, ..."
3,4660557,<rdkit.Chem.rdchem.Mol object at 0x000001B94ED...,cns,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 4, 4, 4, 4,...","[2968968094, 2976033787, 2968968094, 297603378...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, ..."
4,122175,<rdkit.Chem.rdchem.Mol object at 0x000001B963E...,antineoplastic,2,"[0, 0, 0, 0, 0, 0, 0, 19, 19]","[2968968094, 2976033787, 2968968094, 297603378...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, ..."


### Tuning the convolutional network

In [10]:
drugs.columns

Index(['CID', 'Molecule', 'drug_class', 'drug_class_code', 'FeatInvariants',
       'ConnInvariants', 'Morgan2FP', 'MACCSKeys', 'AtomPairFP', 'TopTorFP',
       'AvalonFP', 'PubchemFP', 'CactvsFP'],
      dtype='object')

In [11]:
#Choose which fingerprints which you would like to train with
X = drugs['Morgan2FP']
y = drugs['drug_class_code']

In [12]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
# Reshaping the arrays to fit in the model
x_train = np.array(list(X_train))
x_test = np.array(list(X_test))
print('Shape x_train: ', x_train.shape)
n_classes = len(unique(y_train))
print('Number of classes: ', n_classes)
x_train= x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
x_test= x_test.reshape((x_test.shape[0], x_test.shape[1], 1))
print('Reshapep x_train: ', x_train.shape)
in_shape = x_train.shape[1:]
print('In_shape: ', in_shape)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

Shape x_train:  (4854, 2048)
Number of classes:  12
Reshapep x_train:  (4854, 2048, 1)
In_shape:  (2048, 1)
(4854, 2048, 1) (4854,)
(2081, 2048, 1) (2081,)


In [14]:
def build_tuned_model(hp):
    """
    Function that searches the best hyperparameters of a CNN model for the entry data provided using Keras Tuner.
    Input: hyperparameters
    Output: a compiled optimized model
    """
    # Create model object
    model = keras.Sequential([
    keras.layers.Conv1D(
        filters=hp.Int('conv_1_filter', min_value=32, max_value=128, step=16),
        kernel_size=hp.Choice('conv_1_kernel', values = [3,5]),
        activation='relu',
        input_shape=(2048, 1)),
    keras.layers.MaxPool1D(hp.Int('pool_size', min_value=2, max_value=6)), 
    keras.layers.Flatten(),
    keras.layers.Dense(
        units=hp.Int('dense_1_units', min_value=32, max_value=128, step=16),
        activation='relu', kernel_initializer = 'he_uniform'
        ),
    keras.layers.Dropout(0.5),  
    keras.layers.Dense(12, activation='softmax')
    ])
    # compilation of model
    model.compile(optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3])),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
    return model

In [25]:
#importing random search

#creating randomsearch object
tuner = RandomSearch(build_model,
                    objective='val_accuracy',
                    max_trials = 5)
# search best parameter
tuner.search(x_train,y_train,epochs=3,validation_data=(x_train,y_train))

Trial 5 Complete [00h 01m 40s]
val_accuracy: 0.8438401222229004

Best val_accuracy So Far: 0.8506386280059814
Total elapsed time: 00h 08m 16s
INFO:tensorflow:Oracle triggered exit


In [None]:
opt_model