In [18]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, \
   precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score, matthews_corrcoef

In [19]:
data = pd.read_csv(r'.\input\train_10folds.csv')
data

Unnamed: 0,MaxPartialCharge,FpDensityMorgan2,BCUT2D_CHGLO,BCUT2D_MRHI,PEOE_VSA12,PEOE_VSA6,SMR_VSA3,SlogP_VSA3,SlogP_VSA8,EState_VSA6,NumHAcceptors,NumSaturatedCarbocycles,fr_bicyclic,TARGET,Kfold
0,0.232165,2.055556,-2.414188,6.375148,5.879988,11.600940,30.133278,11.343745,33.454659,6.196844,10,0,2,1.0,8
1,0.313409,1.155172,-2.211123,7.894021,0.000000,101.109334,4.899910,51.033142,6.076020,62.378411,10,0,0,0.0,9
2,0.394239,1.740741,-2.156618,6.360204,0.000000,17.907236,0.000000,9.589074,0.000000,31.127988,5,0,0,0.0,3
3,0.259309,1.789474,-1.979236,6.347147,5.824404,28.358543,15.124620,0.000000,22.842031,42.725522,4,0,0,0.0,5
4,0.326721,1.804878,-2.225746,9.103009,0.000000,88.440639,9.883888,22.616778,27.751393,12.272864,5,0,2,0.0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2734,0.157227,1.928571,-2.406681,7.927002,0.000000,35.334614,9.967957,9.837253,11.257379,24.395945,7,0,0,0.0,6
2735,0.152046,1.958333,-2.386793,6.432995,0.000000,42.258351,9.967957,0.000000,11.257379,12.263211,5,0,0,1.0,2
2736,0.339242,1.900000,-2.136891,6.311472,6.792942,11.600940,0.000000,17.822241,10.969244,32.046576,6,0,2,0.0,5
2737,0.409243,2.000000,-2.396237,7.800537,0.000000,17.667307,14.699729,4.736863,0.000000,4.899910,4,0,0,0.0,8


In [20]:
data['TARGET'].value_counts()

0.0    1771
1.0     968
Name: TARGET, dtype: int64

In [21]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    df = dataframe.iloc[:,:-1].copy()
    labels = df.pop('TARGET')
    df = {key: value[:,tf.newaxis] for key, value in dataframe.iloc[:,:-1].items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(data))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

In [22]:
def get_normalization_layer(name, dataset):
    # Create a Normalization layer for the feature.
    normalizer = layers.Normalization(axis=None)
    
    # Prepare a Dataset that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])
    
    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)
    
    return normalizer

In [23]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    
    # create a layer that turns integer values into integer indices.
    index = layers.IntegerLookup(max_tokens=max_tokens)
    
    # Prepare a `tf.data.Dataset` that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])
    
    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)
    
    # Encode the integer indices.
    encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())
    
    # Apply multi-hot encoding to the indices. The lambda function captures the
    # layer, so you can use them, or include them in the Keras Functional model later.
    
    return lambda feature: encoder(index(feature))

In [24]:
def train_val_split(fold, batch_size):
    train = data[data.Kfold != fold].reset_index(drop=True)
    valid = data[data.Kfold == fold].reset_index(drop=True)
    train_ds = df_to_dataset(train, batch_size=batch_size)
    val_ds = df_to_dataset(valid, shuffle=False, batch_size=batch_size)
    
    return train_ds, val_ds, train, valid

In [25]:
def preprocess_dataset(numerical_features, categorical_features):
    all_inputs = []
    encoded_features = []

    # Numerical features.
    for header in numerical_features:
        numeric_col = tf.keras.Input(shape=(1,), name=header)
        normalization_layer = get_normalization_layer(header, train_ds)
        encoded_numeric_col = normalization_layer(numeric_col)
        all_inputs.append(numeric_col)
        encoded_features.append(encoded_numeric_col)
    
    hacc_col = tf.keras.Input(shape=(1,), name='NumHAcceptors', dtype='int64')

    encoding_layer = get_category_encoding_layer(name='NumHAcceptors',
                                                 dataset=train_ds,
                                                 dtype='int64',
                                                 max_tokens=20)
    encoded_hacc_col = encoding_layer(hacc_col)

    all_inputs.append(hacc_col)
    encoded_features.append(encoded_hacc_col)
    
    scc_col = tf.keras.Input(shape=(1,), name='NumSaturatedCarbocycles', dtype='int64')

    encoding_layer = get_category_encoding_layer(name='NumSaturatedCarbocycles',
                                                 dataset=train_ds,
                                                 dtype='int64',
                                                 max_tokens=5)
    encoded_scc_col = encoding_layer(scc_col)

    all_inputs.append(scc_col)
    encoded_features.append(encoded_scc_col)
    
    fbc_col = tf.keras.Input(shape=(1,), name='fr_bicyclic', dtype='int64')

    encoding_layer = get_category_encoding_layer(name='fr_bicyclic',
                                                 dataset=train_ds,
                                                 dtype='int64',
                                                 max_tokens=10)
    encoded_fbc_col = encoding_layer(fbc_col)
    all_inputs.append(fbc_col)
    encoded_features.append(encoded_fbc_col)
    
    return all_inputs, encoded_features

In [26]:
numerical_features = ['MaxPartialCharge', 'FpDensityMorgan2', 'BCUT2D_CHGLO', 'BCUT2D_MRHI',
       'PEOE_VSA12', 'PEOE_VSA6', 'SMR_VSA3', 'SlogP_VSA3', 'SlogP_VSA8', 'EState_VSA6']

categorical_features = ['NumHAcceptors', 'NumSaturatedCarbocycles', 'fr_bicyclic']

In [27]:
class config:
    hidden_layer_1 = 16
    hidden_layer_2 = 64
    hidden_layer_3 = 128
    hidden_layer_4 = 256
    hidden_layer_5 = 32
    hidden_layer_6 = 8
    initializer = tf.keras.initializers.HeNormal()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    dropout = 0.1
    batch_size = 16
    epochs = 100

In [28]:
def AnnClassifier(all_inputs, encoded_features):
    all_features = tf.keras.layers.concatenate(encoded_features)
    x = tf.keras.layers.Dense(units=config.hidden_layer_1, activation="relu", 
                              kernel_initializer=config.initializer)(all_features)
    x = tf.keras.layers.BatchNormalization(axis=-1)(x)
    #x = tf.keras.layers.Dropout(hparams['dropout'][0])(x)
    x = tf.keras.layers.Dense(units=config.hidden_layer_2, activation="relu",
                             kernel_initializer=config.initializer)(x)
    x = tf.keras.layers.BatchNormalization(axis=-1)(x)
    #x = tf.keras.layers.Dropout(hparams['dropout'][0])(x)
    x = tf.keras.layers.Dense(units=config.hidden_layer_3, activation="relu",
                             kernel_initializer=config.initializer)(x)
    x = tf.keras.layers.BatchNormalization(axis=-1)(x)
    #x = tf.keras.layers.Dropout(hparams['dropout'][0])(x)
    x = tf.keras.layers.Dense(units=config.hidden_layer_4, activation="relu",
                             kernel_initializer=config.initializer)(x)
    x = tf.keras.layers.BatchNormalization(axis=-1)(x)
    x = tf.keras.layers.Dense(units=config.hidden_layer_5, activation="relu",
                             kernel_initializer=config.initializer)(x)
    x = tf.keras.layers.BatchNormalization(axis=-1)(x)
    x = tf.keras.layers.Dense(units=config.hidden_layer_6, activation="relu",
                             kernel_initializer=config.initializer)(x)
    x = tf.keras.layers.BatchNormalization(axis=-1)(x)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(all_inputs, output)
    model.compile(optimizer = config.optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model

In [32]:
def evaluate(fold, all_inputs, encoded_features, valid, val_ds):
    
    model = AnnClassifier(all_inputs, encoded_features)
    checkpoint = f".\\model_checkpoints\\fold_{fold}"
    model.load_weights(checkpoint)
    
    y_pred_proba = model.predict(val_ds)
    y_true = valid.TARGET.values
    auc = roc_auc_score(y_true, y_pred_proba)
    y_pred = (y_pred_proba > 0.5)
    
    accuracy = accuracy_score(y_true,y_pred)
    precision_1 = precision_score(y_true,y_pred,pos_label=1)
    precision_0 = precision_score(y_true,y_pred,pos_label=0)
    recall_1 = recall_score(y_true,y_pred,pos_label=1)
    recall_0 = recall_score(y_true,y_pred,pos_label=0)
    f1score = f1_score(y_true,y_pred)
    kappa = cohen_kappa_score(y_true,y_pred)
    MCC = matthews_corrcoef(y_true,y_pred)
    
    print(f"Fold = {fold}, AUC = {auc}, Accuracy = {accuracy}, \
          Precision_1 = {precision_1}, Precision_0 = {precision_0}\
          Recall_1 = {recall_1}, Recall_0 = {recall_0}, F1Score = {f1score}, kappa = {kappa}, MCC = {MCC}")
    
    return auc, accuracy, precision_1, precision_0, recall_1, recall_0, f1score, kappa, MCC

In [34]:
aucs, accuracies, precisions_1, precisions_0, recalls_1, recalls_0, f1scores, kappas, MCCs = [], [], [], [], [], [], [], [], []

for fold_ in range(10):
    train_ds, val_ds, train, valid = train_val_split(fold_, config.batch_size)
    all_inputs, encoded_features = preprocess_dataset(numerical_features, categorical_features)
    auc, accuracy, precision_1, precision_0, recall_1, recall_0, f1score, kappa, MCC = evaluate(fold_, all_inputs, encoded_features, valid, val_ds)
    aucs.append(auc)
    accuracies.append(accuracy)
    precisions_1.append(precision_1)
    precisions_0.append(precision_0)
    recalls_1.append(recall_1)
    recalls_0.append(recall_0)
    f1scores.append(f1score)
    kappas.append(kappa)
    MCCs.append(MCC)
    
print("\n")
print(f"Mean Scores: AUC = {np.mean(np.array(aucs))}, \
      Accuracy = {np.mean(np.array(accuracies))}, \
      Precision_1 = {np.mean(np.array(precisions_1))}, Precision_0 = {np.mean(np.array(precisions_0))}\
      Recall_1 = {np.mean(np.array(recalls_1))}, Recall_0 = {np.mean(np.array(recalls_0))}\
      F1Score = {np.mean(np.array(f1scores))} \
      Kappa = {np.mean(np.array(kappas))} \
      MCC = {np.mean(np.array(MCCs))}")

  df = {key: value[:,tf.newaxis] for key, value in dataframe.iloc[:,:-1].items()}


Fold = 0, AUC = 0.9405323548255579, Accuracy = 0.8868613138686131,           Precision_1 = 0.8666666666666667, Precision_0 = 0.8967391304347826          Recall_1 = 0.8041237113402062, Recall_0 = 0.9322033898305084, F1Score = 0.8342245989304814, kappa = 0.7485345491148084, MCC = 0.7497442081114257


  df = {key: value[:,tf.newaxis] for key, value in dataframe.iloc[:,:-1].items()}


Fold = 1, AUC = 0.9812452676335255, Accuracy = 0.9452554744525548,           Precision_1 = 0.9361702127659575, Precision_0 = 0.95          Recall_1 = 0.9072164948453608, Recall_0 = 0.9661016949152542, F1Score = 0.9214659685863875, kappa = 0.8794650712651768, MCC = 0.879720731841955


  df = {key: value[:,tf.newaxis] for key, value in dataframe.iloc[:,:-1].items()}


Fold = 2, AUC = 0.9436484361348942, Accuracy = 0.8905109489051095,           Precision_1 = 0.8764044943820225, Precision_0 = 0.8972972972972973          Recall_1 = 0.8041237113402062, Recall_0 = 0.9378531073446328, F1Score = 0.8387096774193549, kappa = 0.7560686094130216, MCC = 0.7576732765519593


  df = {key: value[:,tf.newaxis] for key, value in dataframe.iloc[:,:-1].items()}


Fold = 3, AUC = 0.9532879026151785, Accuracy = 0.8978102189781022,           Precision_1 = 0.8556701030927835, Precision_0 = 0.9209039548022598          Recall_1 = 0.8556701030927835, Recall_0 = 0.9209039548022598, F1Score = 0.8556701030927835, kappa = 0.7765740578950434, MCC = 0.7765740578950434


  df = {key: value[:,tf.newaxis] for key, value in dataframe.iloc[:,:-1].items()}


Fold = 4, AUC = 0.9562874949036053, Accuracy = 0.8905109489051095,           Precision_1 = 0.819047619047619, Precision_0 = 0.9349112426035503          Recall_1 = 0.8865979381443299, Recall_0 = 0.8926553672316384, F1Score = 0.8514851485148515, kappa = 0.7649951398021614, MCC = 0.7665017515043108


  df = {key: value[:,tf.newaxis] for key, value in dataframe.iloc[:,:-1].items()}


Fold = 5, AUC = 0.961762478886365, Accuracy = 0.9124087591240876,           Precision_1 = 0.9294117647058824, Precision_0 = 0.9047619047619048          Recall_1 = 0.8144329896907216, Recall_0 = 0.9661016949152542, F1Score = 0.868131868131868, kappa = 0.802984001438073, MCC = 0.806908595817797


  df = {key: value[:,tf.newaxis] for key, value in dataframe.iloc[:,:-1].items()}


Fold = 6, AUC = 0.9507833886656183, Accuracy = 0.9233576642335767,           Precision_1 = 0.9130434782608695, Precision_0 = 0.9285714285714286          Recall_1 = 0.865979381443299, Recall_0 = 0.9548022598870056, F1Score = 0.8888888888888888, kappa = 0.8304555365666804, MCC = 0.8311330005467628


  df = {key: value[:,tf.newaxis] for key, value in dataframe.iloc[:,:-1].items()}


Fold = 7, AUC = 0.9775176189644126, Accuracy = 0.9306569343065694,           Precision_1 = 0.90625, Precision_0 = 0.9438202247191011          Recall_1 = 0.8969072164948454, Recall_0 = 0.9491525423728814, F1Score = 0.9015544041450777, kappa = 0.8480354953587483, MCC = 0.8480626211232735


  df = {key: value[:,tf.newaxis] for key, value in dataframe.iloc[:,:-1].items()}


Fold = 8, AUC = 0.9509012172284644, Accuracy = 0.9087591240875912,           Precision_1 = 0.8446601941747572, Precision_0 = 0.9473684210526315          Recall_1 = 0.90625, Recall_0 = 0.9101123595505618, F1Score = 0.8743718592964823, kappa = 0.8028776978417267, MCC = 0.804103444314595


  df = {key: value[:,tf.newaxis] for key, value in dataframe.iloc[:,:-1].items()}


Fold = 9, AUC = 0.9463865348399246, Accuracy = 0.9084249084249084,           Precision_1 = 0.9176470588235294, Precision_0 = 0.9042553191489362          Recall_1 = 0.8125, Recall_0 = 0.96045197740113, F1Score = 0.8618784530386742, kappa = 0.7937630314568036, MCC = 0.7970514840865099


Mean Scores: AUC = 0.9562352694697547,       Accuracy = 0.9094556295286222,       Precision_1 = 0.8864971591920089, Precision_0 = 0.9228628923391892      Recall_1 = 0.8553801546391753, Recall_0 = 0.9390338348251127      F1Score = 0.8696380970044851       Kappa = 0.8003753190152244       MCC = 0.8017473171793632


In [35]:
fold_metrics = pd.DataFrame(columns=['Accuracy','AUC','Precision_1','Precision_0','Recall_1','Recall_0','F1score','Kappa','MCC'])
fold_metrics['Accuracy'] = np.array(accuracies)
fold_metrics['AUC'] = np.array(aucs)
fold_metrics['Precision_1'] = np.array(precisions_1)
fold_metrics['Precision_0'] = np.array(precisions_0)
fold_metrics['Recall_1'] = np.array(recalls_1)
fold_metrics['Recall_0'] = np.array(recalls_0)
fold_metrics['F1score'] = np.array(f1scores)
fold_metrics['Kappa'] = np.array(kappas)
fold_metrics['MCC'] = np.array(MCCs)
fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
0,0.886861,0.940532,0.866667,0.896739,0.804124,0.932203,0.834225,0.748535,0.749744
1,0.945255,0.981245,0.93617,0.95,0.907216,0.966102,0.921466,0.879465,0.879721
2,0.890511,0.943648,0.876404,0.897297,0.804124,0.937853,0.83871,0.756069,0.757673
3,0.89781,0.953288,0.85567,0.920904,0.85567,0.920904,0.85567,0.776574,0.776574
4,0.890511,0.956287,0.819048,0.934911,0.886598,0.892655,0.851485,0.764995,0.766502
5,0.912409,0.961762,0.929412,0.904762,0.814433,0.966102,0.868132,0.802984,0.806909
6,0.923358,0.950783,0.913043,0.928571,0.865979,0.954802,0.888889,0.830456,0.831133
7,0.930657,0.977518,0.90625,0.94382,0.896907,0.949153,0.901554,0.848035,0.848063
8,0.908759,0.950901,0.84466,0.947368,0.90625,0.910112,0.874372,0.802878,0.804103
9,0.908425,0.946387,0.917647,0.904255,0.8125,0.960452,0.861878,0.793763,0.797051


In [36]:
fold_metrics.loc[10,:] = [np.mean(np.array(accuracies)), np.mean(np.array(aucs)), np.mean(np.array(precisions_1)),
                               np.mean(np.array(precisions_0)), np.mean(np.array(recalls_1)), np.mean(np.array(recalls_0)),
                            np.mean(np.array(f1scores)), np.mean(np.array(kappas)), np.mean(np.array(MCCs))]

fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
0,0.886861,0.940532,0.866667,0.896739,0.804124,0.932203,0.834225,0.748535,0.749744
1,0.945255,0.981245,0.93617,0.95,0.907216,0.966102,0.921466,0.879465,0.879721
2,0.890511,0.943648,0.876404,0.897297,0.804124,0.937853,0.83871,0.756069,0.757673
3,0.89781,0.953288,0.85567,0.920904,0.85567,0.920904,0.85567,0.776574,0.776574
4,0.890511,0.956287,0.819048,0.934911,0.886598,0.892655,0.851485,0.764995,0.766502
5,0.912409,0.961762,0.929412,0.904762,0.814433,0.966102,0.868132,0.802984,0.806909
6,0.923358,0.950783,0.913043,0.928571,0.865979,0.954802,0.888889,0.830456,0.831133
7,0.930657,0.977518,0.90625,0.94382,0.896907,0.949153,0.901554,0.848035,0.848063
8,0.908759,0.950901,0.84466,0.947368,0.90625,0.910112,0.874372,0.802878,0.804103
9,0.908425,0.946387,0.917647,0.904255,0.8125,0.960452,0.861878,0.793763,0.797051


In [37]:
fold_metrics.loc[11,:] = [np.std(np.array(accuracies)), np.std(np.array(aucs)), np.std(np.array(precisions_1)),
                               np.std(np.array(precisions_0)), np.std(np.array(recalls_1)), np.std(np.array(recalls_0)),
                            np.std(np.array(f1scores)), np.std(np.array(kappas)), np.std(np.array(MCCs))]

fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
0,0.886861,0.940532,0.866667,0.896739,0.804124,0.932203,0.834225,0.748535,0.749744
1,0.945255,0.981245,0.93617,0.95,0.907216,0.966102,0.921466,0.879465,0.879721
2,0.890511,0.943648,0.876404,0.897297,0.804124,0.937853,0.83871,0.756069,0.757673
3,0.89781,0.953288,0.85567,0.920904,0.85567,0.920904,0.85567,0.776574,0.776574
4,0.890511,0.956287,0.819048,0.934911,0.886598,0.892655,0.851485,0.764995,0.766502
5,0.912409,0.961762,0.929412,0.904762,0.814433,0.966102,0.868132,0.802984,0.806909
6,0.923358,0.950783,0.913043,0.928571,0.865979,0.954802,0.888889,0.830456,0.831133
7,0.930657,0.977518,0.90625,0.94382,0.896907,0.949153,0.901554,0.848035,0.848063
8,0.908759,0.950901,0.84466,0.947368,0.90625,0.910112,0.874372,0.802878,0.804103
9,0.908425,0.946387,0.917647,0.904255,0.8125,0.960452,0.861878,0.793763,0.797051


In [38]:
fold_metrics.index = ['Fold_0','Fold_1','Fold_2','Fold_3','Fold_4','Fold_5','Fold_6','Fold_7','Fold_8','Fold_9','Mean','Std']
fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
Fold_0,0.886861,0.940532,0.866667,0.896739,0.804124,0.932203,0.834225,0.748535,0.749744
Fold_1,0.945255,0.981245,0.93617,0.95,0.907216,0.966102,0.921466,0.879465,0.879721
Fold_2,0.890511,0.943648,0.876404,0.897297,0.804124,0.937853,0.83871,0.756069,0.757673
Fold_3,0.89781,0.953288,0.85567,0.920904,0.85567,0.920904,0.85567,0.776574,0.776574
Fold_4,0.890511,0.956287,0.819048,0.934911,0.886598,0.892655,0.851485,0.764995,0.766502
Fold_5,0.912409,0.961762,0.929412,0.904762,0.814433,0.966102,0.868132,0.802984,0.806909
Fold_6,0.923358,0.950783,0.913043,0.928571,0.865979,0.954802,0.888889,0.830456,0.831133
Fold_7,0.930657,0.977518,0.90625,0.94382,0.896907,0.949153,0.901554,0.848035,0.848063
Fold_8,0.908759,0.950901,0.84466,0.947368,0.90625,0.910112,0.874372,0.802878,0.804103
Fold_9,0.908425,0.946387,0.917647,0.904255,0.8125,0.960452,0.861878,0.793763,0.797051
