In [None]:
#Import standard packages for model training

# TF_CPP_MIN_LOG_LEVEL must be in the environment before TensorFlow is
# imported; setting it afterwards does not silence the messages emitted
# while the native runtime is loading.
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as T
from tensorflow import *
from tensorflow.keras import initializers
from tensorflow.keras import regularizers
from tensorflow.keras import layers
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.losses import *
import numpy as np
import pandas as pd
import sklearn.metrics as SK
from sklearn.metrics import *
import seaborn as sns
import matplotlib.pyplot as plt
import io
from utils.GPU import *

In [None]:
#Import train, validation and test sets

# CSV locations of the three splits of fold 0
training = './data/random_split/train_fold_0.csv'
validation = './data/random_split/valid_fold_0.csv'
test = './data/random_split/test_fold_0.csv'

# Read the splits in the order train, validation, test with identical
# parser settings.
train_dataset, val_dataset, test_dataset = (
    pd.read_csv(csv_path, delimiter=',', low_memory=False)
    for csv_path in (training, validation, test)
)

# Preview the first rows of the training split
train_dataset.head()

In [None]:
#Import atom and bond featurizers

from utils.graphs import *

#Select the number of tasks here

task_start = 2                     # positional index of the first label column
tasks = 6                          # number of label columns (tasks) to use
task_index = task_start + tasks    # exclusive end of the label column slice

#Generate graphs from SMILES
# Column 1 of every split is passed to graphs_from_smiles; the label matrix
# is the slice [task_start, task_index), so changing task_start or tasks
# above is enough to select a different set of tasks.

X_train = graphs_from_smiles(train_dataset.iloc[:,1].values)
y_train = train_dataset.iloc[:,task_start:task_index].values

X_test = graphs_from_smiles(test_dataset.iloc[:,1].values)
y_test = test_dataset.iloc[:,task_start:task_index].values

X_val = graphs_from_smiles(val_dataset.iloc[:,1].values)
y_val = val_dataset.iloc[:,task_start:task_index].values

#Test graph function
# Featurize a single training molecule (row 8, column "SMILES") and print
# the shapes of the three arrays returned by graph_from_molecule.

molecule = molecule_from_smiles(train_dataset.iloc[8].SMILES)
graph = graph_from_molecule(molecule)
print("Graph (including self-loops):")
print("\tatom features\t", graph[0].shape)
print("\tbond features\t", graph[1].shape)
print("\tpair indices\t", graph[2].shape)

In [None]:
# Batch / MPNN / loss / scorer helpers and the shared size settings

from utils.batch import *
from utils.MPNN import *
from utils.utils import *

# batch_size and dense_units are used as default arguments of MPNNModel;
# batch_size is also handed to MPNNDataset when the datasets are built.
batch_size, dense_units = 48, 50


# parameters for train network

def MPNNModel(atom_dim,
    bond_dim,
    batch_size=batch_size,
    message_units=32,
    message_steps=8,
    num_attention_heads=16,
    dense_units=dense_units,
    num_tasks=None):
    """Build the multi-task regression model as a Keras functional model.

    The model takes four inputs (atom features, bond features, pair indices
    and a molecule indicator), runs them through the project layers
    ``MessagePassing`` and ``TransformerEncoderReadout`` and finishes with a
    ReLU dense layer followed by a linear output layer.

    Parameters
    ----------
    atom_dim : int
        Length of one atom feature vector.
    bond_dim : int
        Length of one bond feature vector.
    batch_size : int
        Forwarded to ``TransformerEncoderReadout``.
    message_units : int
        Forwarded to ``MessagePassing`` and ``TransformerEncoderReadout``.
    message_steps : int
        Forwarded to ``MessagePassing``.
    num_attention_heads : int
        Forwarded to ``TransformerEncoderReadout``.
    dense_units : int
        Forwarded to ``TransformerEncoderReadout`` and used as the width of
        the hidden dense layer.
    num_tasks : int, optional
        Number of regression outputs. Defaults to the notebook-level
        ``tasks`` setting.

    Returns
    -------
    Keras model with inputs ``[atom_features, bond_features, pair_indices,
    molecule_indicator]`` and a single output of width ``num_tasks``.
    """
    if num_tasks is None:
        # Keep the original behaviour: one output per configured task.
        num_tasks = tasks

    # Shapes are given as explicit tuples; "(n)" is just the integer n.
    atom_features = layers.Input((atom_dim,), dtype="float32", name="atom_features")
    bond_features = layers.Input((bond_dim,), dtype="float32", name="bond_features")
    pair_indices = layers.Input((2,), dtype="int32", name="pair_indices")
    molecule_indicator = layers.Input((), dtype="int32", name="molecule_indicator")

    x = MessagePassing(message_units, message_steps)([atom_features, bond_features, pair_indices])
    x = TransformerEncoderReadout(num_attention_heads, message_units, dense_units, batch_size)([x, molecule_indicator])
    # Hidden layer width follows dense_units (50 with the default setting).
    x = layers.Dense(dense_units, activation="relu")(x)
    x = layers.Dense(num_tasks, activation="linear")(x)

    # Use the explicit tensorflow alias instead of a star-imported `keras`.
    model = T.keras.Model(inputs=[atom_features, bond_features, pair_indices, molecule_indicator],
        outputs=[x])
    return model
    


# Optimizer used for training; get_lr_metric receives the same object so the
# metric it returns refers to the optimizer that actually trains the model.
# NOTE(review): 0.1 is a large starting learning rate for RMSprop; it
# presumably relies on ReduceLROnPlateau to bring it down - confirm.
optimizer = RMSprop(0.1)
lr_metric = get_lr_metric(optimizer)

# Feature-vector lengths are read from the first atom / bond of the first
# training molecule.
model = MPNNModel(atom_dim = X_train[0][0][0].shape[0], bond_dim = X_train[1][0][0].shape[0])
# The optimizer must be passed explicitly; without it compile() falls back to
# its default optimizer and the RMSprop instance above is never used.
model.compile(optimizer = optimizer, loss = regression_loss, metrics = [lr_metric])
model.summary()
T.keras.utils.plot_model(model)

In [None]:
# Datasets, early stopping parameters and training

# Wrap the featurized splits with MPNNDataset, which receives the batch size.
# NOTE(review): these names replace the DataFrames read from CSV earlier, so
# the graph-generation cell cannot be re-run after this one without
# reloading the data first.
train_dataset = MPNNDataset(X_train, y_train, batch_size)
val_dataset = MPNNDataset(X_val, y_val, batch_size)
test_dataset = MPNNDataset(X_test, y_test, batch_size)

# NOTE(review): both callbacks use patience=5, so the learning rate is
# halved at about the same epoch at which early stopping ends training -
# confirm this is intended.
callbacks_list = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00000001, verbose=1, mode='auto',cooldown=0),
    EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5, mode='min', verbose=1)]

# parameters for train network

epochs=2000

# batch_size is not passed to fit(): the datasets were built with it above
# (assumes MPNNDataset yields already-batched data - TODO confirm), and Keras
# documents that batch_size should not be given for dataset inputs.
history = model.fit(train_dataset,
                    epochs=epochs,
                    validation_data=val_dataset,
                    callbacks=callbacks_list)

# Make sure the target directory exists before writing the weights.
os.makedirs("./models", exist_ok=True)
model.save_weights("./models/TM-MPNN_regression_model.hdf5")

In [None]:
#Plot model history

hist = history.history

fig = plt.figure(figsize=(13, 9))

# Top-left panel: validation and training loss per epoch.
loss_ax = fig.add_subplot(221)
for label in ['val_loss','loss']:
    loss_ax.plot(hist[label], label = label)
loss_ax.legend()
loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("loss")

# Top-right panel: validation loss against the logged learning rate.
# No legend here: the single curve carries no label, so a legend call would
# only emit a warning.
lr_ax = fig.add_subplot(222)
lr_ax.plot(hist['lr'], hist['val_loss'])
lr_ax.set_xlabel("lr")
lr_ax.set_ylabel("val_loss")

fig.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25,
                    wspace=0.35)

In [None]:
#Statistical characteristics of the model without the 3-sigma rule

# assumes the datasets yield samples in the same order as y_train / y_val /
# y_test, so predictions line up row by row with the labels - TODO confirm
prediction_train = model.predict(train_dataset)
prediction_val = model.predict(val_dataset)
prediction_test = model.predict(test_dataset)


def _fold_errors(y_obs, y_pred, fold):
    """Pair observed and predicted values of one task for one fold.

    Rows with a missing observation or prediction are dropped. The returned
    frame has the columns 'y_obs', 'y_pred', 'Folds' (the fold label),
    'Folds_error' (absolute error), 'Folds error Mean' (mean absolute error)
    and 'Folds error 3*sigma' (three times the standard deviation of the
    absolute error).
    """
    frame = pd.DataFrame({'y_obs': y_obs, 'y_pred': y_pred})
    frame = frame.dropna().reset_index(drop=True)
    frame['Folds'] = fold
    frame['Folds_error'] = (frame['y_pred'] - frame['y_obs']).abs()
    frame['Folds error Mean'] = frame['Folds_error'].mean()
    frame['Folds error 3*sigma'] = frame['Folds_error'].std() * 3
    return frame


def _print_fold_metrics(frame, task_number, fold_label):
    """Print r^2, rmse, mse and mae of one fold of one task."""
    # r^2 is the squared Pearson correlation between observed and predicted
    # values (not the coefficient of determination).
    r2 = frame["y_obs"].corr(frame["y_pred"]) ** 2
    # The sklearn functions are called through the SK alias so that
    # same-named functions from other star imports cannot shadow them.
    mse = SK.mean_squared_error(frame["y_obs"], frame["y_pred"])
    mae = SK.mean_absolute_error(frame["y_obs"], frame["y_pred"])
    print(("Results for task {} ({})").format(task_number, fold_label))
    print("r^2\t%.2f" % r2)
    print("rmse\t%.2f" % np.sqrt(mse))
    print("mse\t%.2f" % mse)
    print("mae\t%.2f" % mae)


# One pass per task: the same output column is compared across the three
# folds, so a single loop index is sufficient.
for task in range(prediction_train.shape[1]):

    train_pred2 = _fold_errors(y_train[:,task], prediction_train[:,task], 'Train')
    val_pred2 = _fold_errors(y_val[:,task], prediction_val[:,task], 'val')
    test_pred2 = _fold_errors(y_test[:,task], prediction_test[:,task], 'Test')

    crossval_df = pd.concat([train_pred2, val_pred2, test_pred2], axis=0).reset_index(drop=True)

    _print_fold_metrics(train_pred2, task+1, "train")
    _print_fold_metrics(val_pred2, task+1, "validation")
    _print_fold_metrics(test_pred2, task+1, "test")

    # Scatter of observed vs. predicted values coloured by fold, with one
    # regression line fitted over all folds together.
    g = sns.lmplot(x="y_pred", y="y_obs", hue="Folds", data=crossval_df, fit_reg=False, height=7,
                   markers=["o", "o", "o"], palette="rocket",scatter_kws={"s": 50,'alpha':0.9},  aspect=30/30)
    sns.regplot(x="y_pred", y="y_obs", data=crossval_df, scatter=False, ax=g.axes[0, 0])

In [None]:
#Statistical characteristics of the model using the 3-sigma rule

def _fold_errors_within_3sigma(y_obs, y_pred, fold):
    """Pair observed and predicted values of one task and drop outliers.

    Rows with a missing observation or prediction are dropped first. The
    frame gets the columns 'y_obs', 'y_pred', 'Folds' (the fold label),
    'Folds_error' (absolute error), 'Folds error Mean' (mean absolute error)
    and 'Folds error 3*sigma' (three times the standard deviation of the
    absolute error). Only rows whose absolute error does not exceed
    'Folds error 3*sigma' are returned.
    """
    frame = pd.DataFrame({'y_obs': y_obs, 'y_pred': y_pred})
    frame = frame.dropna().reset_index(drop=True)
    frame['Folds'] = fold
    frame['Folds_error'] = (frame['y_pred'] - frame['y_obs']).abs()
    frame['Folds error Mean'] = frame['Folds_error'].mean()
    frame['Folds error 3*sigma'] = frame['Folds_error'].std() * 3
    # 3-sigma rule: keep the rows whose absolute error is at most three
    # standard deviations of the absolute error.
    return frame[frame['Folds_error'] <= frame['Folds error 3*sigma']]


def _print_fold_metrics_3sigma(frame, task_number, fold_label):
    """Print r^2, rmse, mse and mae of one filtered fold of one task."""
    # r^2 is the squared Pearson correlation between observed and predicted
    # values (not the coefficient of determination).
    r2 = frame["y_obs"].corr(frame["y_pred"]) ** 2
    # The sklearn functions are called through the SK alias so that
    # same-named functions from other star imports cannot shadow them.
    mse = SK.mean_squared_error(frame["y_obs"], frame["y_pred"])
    mae = SK.mean_absolute_error(frame["y_obs"], frame["y_pred"])
    print(("Results for task {} ({})").format(task_number, fold_label))
    print("r^2\t%.2f" % r2)
    print("rmse\t%.2f" % np.sqrt(mse))
    print("mse\t%.2f" % mse)
    print("mae\t%.2f" % mae)


# One pass per task: the same output column is compared across the three
# folds, so a single loop index is sufficient.
for task in range(prediction_train.shape[1]):

    train_pred2 = _fold_errors_within_3sigma(y_train[:,task], prediction_train[:,task], 'Train')
    val_pred2 = _fold_errors_within_3sigma(y_val[:,task], prediction_val[:,task], 'val')
    test_pred2 = _fold_errors_within_3sigma(y_test[:,task], prediction_test[:,task], 'Test')

    crossval_df = pd.concat([train_pred2, val_pred2, test_pred2], axis=0).reset_index(drop=True)

    _print_fold_metrics_3sigma(train_pred2, task+1, "train")
    _print_fold_metrics_3sigma(val_pred2, task+1, "validation")
    _print_fold_metrics_3sigma(test_pred2, task+1, "test")

    # Scatter of observed vs. predicted values coloured by fold, with one
    # regression line fitted over all folds together.
    g = sns.lmplot(x="y_pred", y="y_obs", hue="Folds", data=crossval_df, fit_reg=False, height=7,
                   markers=["o", "o", "o"], palette="rocket",scatter_kws={"s": 50,'alpha':0.9},  aspect=30/30)
    sns.regplot(x="y_pred", y="y_obs", data=crossval_df, scatter=False, ax=g.axes[0, 0])