<h1>Feature extractor</h1>

In [1]:
from rdkit.Chem import rdMolDescriptors, MolFromSmiles, rdmolfiles, rdmolops


def fingerprint_features(smile_string, radius=2, size=2048):
    mol = MolFromSmiles(smile_string)
    new_order = rdmolfiles.CanonicalRankAtoms(mol)
    mol = rdmolops.RenumberAtoms(mol, new_order)
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius,
                                                          nBits=size,
                                                          useChirality=True,
                                                          useBondTypes=True,
                                                          useFeatures=False
                                                          )

<h1>Prepare data</h1>

In [7]:
import numpy as np
import pandas as pd

from feature_extractor import fingerprint_features

def prepare_dataframe(df_data):
    df_features = df_data['smiles'].apply(fingerprint_features)
    df_features = df_features.apply(np.array)
    return(df_features)

def prepare_smile(smile : str):
    return(np.array(fingerprint_features(smile)))

<h1>Main</h1>

In [14]:
import numpy as np
import pandas as pd

from feature_extractor import fingerprint_features

from sklearn.model_selection import train_test_split, KFold

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ReduceLROnPlateau

df_single = pd.read_csv('../dataset_single.csv')
df_multi = pd.read_csv('../dataset_multi.csv')

df_features = df_single['smiles'].apply(fingerprint_features)
df_features = df_features.apply(np.array)

#We split our data between training,test and validation datasets
X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(df_features.tolist()), df_single['P1'], test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

def neural_model_1(x, y, x_test, y_test, neurons):
    """
    Neural network model
    
    Inputs
    x: descriptors values for training and validation
    y: properties values for training and validation
    x_test: descriptors values for test
    y_test: properties values for test
    
    
    Outputs
    model: trained neural network model
    score: a list with the score values for each fold
    """
    np.random.seed(1)
    score = []
    kfold = KFold(n_splits=5, shuffle=True)
    
    model = Sequential()
    model.add(Dense(neurons, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(neurons, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    opt = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    rlrop = ReduceLROnPlateau(monitor='accuracy', factor=0.01, patience=10)
        
    for train, validation in kfold.split(x, y):

        model.fit(x.iloc[train], y.iloc[train], 
                      epochs=100,
                      batch_size=128,
                      callbacks=[rlrop],
                      verbose=0,
                      validation_data=(x.iloc[validation], y.iloc[validation]))

        score.append(model.evaluate(x_test, y_test))
    
    return model, score

model1 = neural_model_1(X_train, y_train, X_test, y_test, 64)[0]

model1.save("model1.keras")



In [15]:
X_val

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
913,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3671,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1780,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4718,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3348,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4331,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
model1.predict(X_val)

array([[1.00000000e+00],
       [9.99989927e-01],
       [9.59581017e-01],
       [9.99998212e-01],
       [1.00000000e+00],
       [9.97164369e-01],
       [1.00000000e+00],
       [9.99254823e-01],
       [9.99837816e-01],
       [9.30207968e-01],
       [9.99555111e-01],
       [9.99955535e-01],
       [9.99996662e-01],
       [9.99998689e-01],
       [1.00000000e+00],
       [8.39722991e-01],
       [9.99933600e-01],
       [9.95477915e-01],
       [8.66391301e-01],
       [9.99997616e-01],
       [9.99999404e-01],
       [9.99333978e-01],
       [9.99988794e-01],
       [1.00000000e+00],
       [9.99562085e-01],
       [2.08125830e-01],
       [9.99955177e-01],
       [9.98370290e-01],
       [9.99972463e-01],
       [9.99997079e-01],
       [9.99997079e-01],
       [9.99999583e-01],
       [9.99373555e-01],
       [9.99998331e-01],
       [5.10010660e-01],
       [9.99876022e-01],
       [9.99731302e-01],
       [1.00000000e+00],
       [9.99986887e-01],
       [9.99972463e-01],


In [21]:
model1.predict(pd.DataFrame(prepare_smile("c1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C")).T)[0][0]

0.96396506

<h1>API</h1>

In [19]:
import tensorflow as tf
from flask import Flask, jsonify
from prepare_data import prepare_smile

model1 = tf.keras.models.load_model('model1.keras')

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict_P1():
    smile_array = prepare_smile("c1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C")
    prediction = model1.predict(pd.DataFrame(smile_array).T)[0][0]
    if prediction > 0.5:
        P1_predicted = 1
    else:
        P1_predicted = 0
    return(jsonify({"P1_predicted": P1_predicted}))


@app.route('/', methods=['GET'])
def index():
    return("test")

if __name__ == "__main__":
    app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


In [26]:
def predict_P1():
    smile_array = prepare_smile("c1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C")
    prediction = model1.predict(pd.DataFrame(smile_array).T)[0][0]
    return({"value": prediction})

predict_P1()

{'value': 0.96396506}