# Imports

In [1]:
#Imports
from modules.preamble import *
import json

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
from skfuzzy import cmeans

from modules.PFS import PFS_layer, PFS_CP
from modules.kappa import *

%matplotlib inline

# Loading the data

In [2]:
#Load the data
df_train = pd.read_hdf(os.path.join(data_base_path, 'modeling_data/df_train.hdf'))
df_val = pd.read_hdf(os.path.join(data_base_path, 'modeling_data/df_val.hdf'))
df_test = pd.read_hdf(os.path.join(data_base_path, 'modeling_data/df_test.hdf'))

X_train = pd.read_hdf(os.path.join(data_base_path, 'modeling_data/X_train.hdf')).values
X_val = pd.read_hdf(os.path.join(data_base_path, 'modeling_data/X_val.hdf')).values
X_test = pd.read_hdf(os.path.join(data_base_path, 'modeling_data/X_test.hdf')).values

y_train = pd.read_hdf(os.path.join(data_base_path, 'modeling_data/y_train.hdf')).values.ravel() #flat
y_val = pd.read_hdf(os.path.join(data_base_path, 'modeling_data/y_val.hdf')).values.ravel()
y_test = pd.read_hdf(os.path.join(data_base_path, 'modeling_data/y_test.hdf')).values.ravel()

instance_weights_train = pd.read_hdf(os.path.join(data_base_path, 'modeling_data/instance_weights_train.hdf')).values.ravel()

## Comment w.r.t. class weight
Must (awkwardly) pass these as aggregated instance + class weights (multiply them) to tensorflow. Default interface offers no option to apply instance + class weights during training and only sample weights during testing (i.e. not possible when passing both arguments "class_weight" and "sample_weight"). Tested this awkward aspect extensively due to weird first results - pretty sure TF does not offer the feature we need out-of-the-box.

In [3]:
# Adjust sample weight in training w.r.t classes (i.e. multiply by class weight).

class_weights = compute_class_weight('balanced', [0,1], y_train) #Class weights as array
class_weights = {0: class_weights[0], 1: class_weights[1]} #Convert to dictionary

instance_class_weights_train = np.array(
    [sample_weight * class_weights[y] for sample_weight, y in zip(instance_weights_train, y_train)]
)

In [4]:
#Quick checking of data shapes to see if load was correct
print(df_train.shape, df_val.shape, df_test.shape)
print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)
print(instance_weights_train.shape)

(688457, 31) (117419, 31) (118967, 31)
(688457, 26) (117419, 26) (118967, 26)
(688457,) (117419,) (118967,)
(688457,)


# Tuning the DNN with all features

NB: Set the results to be stored as raw Python types, otherwise we get issues w/ JSON writing.

In [5]:
def generate_random_dnn(input_shape):
    """
    Generate a random densely connected neural network & return the configuration + model.
    The possible value ranges for the hyperparameters are as specified in the thesis
        - they are hard-coded.
    
    Returns: n_layers, n_units_per_layer, dropouts, model
    """
    #Choose number of layers & number of units per layer
    n_layers = int(np.random.randint(1,11,1)[0])
    n_units_per_layer, l1_reg_per_layer, l2_reg_per_layer = [], [], []
    for i in range(n_layers):
        n_units_per_layer.append(int(np.random.randint(10,500,1)[0]))

    #Setup the DL model
    dropouts = []
    model = tf.keras.Sequential()
    for i, n_units in enumerate(n_units_per_layer):
        if (i==0):
            model.add(tf.keras.layers.Dense(n_units,
                                            activation='relu',
                                            input_shape=input_shape))
        else:
            #Add possible dropout and the dense layer with the random number of units
            dropout_rate = np.random.uniform(0, 0.3)
            dropouts.append(dropout_rate)
            model.add(tf.keras.layers.Dropout(dropout_rate))
            model.add(tf.keras.layers.Dense(n_units, activation='relu'))

    model.add(tf.keras.layers.Dense(1, activation='sigmoid')) #Add the output layer, which always has 1 hidden unit and sigmoid output

    model.compile(optimizer='Adam',
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC(curve='ROC', name='ROC_AUC')])
    
    return n_layers, n_units_per_layer, dropouts, model

In [6]:
#Setup
results = {}
filenr=1 #Alter filenames sometimes such that we have restore points in case the data gets corrupted
iteration = 0

In [8]:
#Automated tuning - stop this manually after your time budget has run out.
while(True):
    #Update iteration & file number
    iteration += 1
    if (iteration%5 ==0):
        print(iteration)
        filenr += 1
    
    #store start time of the fit & set random seed
    start_time = time.time()
    np.random.seed(iteration)
    
    #Create a randomized DNN & fit it to the data
    n_layers, n_units_per_layer, dropouts, model = generate_random_dnn(input_shape=(X_train.shape[1],))

    model.fit(X_train,
              y_train,
              sample_weight = instance_class_weights_train, #custom approach to include class weights here
              validation_data=(X_val, y_val),
              epochs=100,
              batch_size=512,
              verbose=0,
              callbacks= [tf.keras.callbacks.EarlyStopping(monitor='val_ROC_AUC',
                                                           mode='max',
                                                           patience=5, 
                                                           min_delta=0.005)])

    #Compute performance of the PFS-ML
    y_score = model.predict(X_val)
    roc_auc = roc_auc_score(y_val, y_score)
    auk = auk_score(y_val, y_score)

    #Store the configuration & output (to raw python types, otherwise issues w/ writing JSON)
    results[iteration] = {
        'n_layers': n_layers,
        'n_units_per_layer': n_units_per_layer,
        'dropout_rate_per_layer': dropouts,
        'roc_auc': roc_auc,
        'auk': auk,
        'fitting_time (minutes)': (time.time() - start_time)/60
    }

    # with open(os.path.join(data_base_path, 'model_tuning', 'DNN-all_feats-results-{}.json'.format(filenr)), 'w') as f:
    #     json.dump(results, f, indent=4)

5
10
15


E0522 15:25:58.779148 18608 ultratb.py:152] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\KevinReijnders\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-55fd8f9866a7>", line 28, in <module>
    min_delta=0.005)])
  File "C:\Users\KevinReijnders\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
    return method(self, *args, **kwargs)
  File "C:\Users\KevinReijnders\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
    tmp_logs = train_function(iterator)
  File "C:\Users\KevinReijnders\Anaconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
    result = self._call(*args, **kwds)
  File "C:\Users\KevinReijnders\Anaconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
    return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
  File "C:\

KeyboardInterrupt: 

# Tuning the DNN with only the best features from the PFS

In [6]:
#Load the features from the best PFS configuration
with open(os.path.join(data_base_path, 'model_tuning', 'Best-PFS-config.json'), 'r') as f:
    pfs_config = json.load(f)
indices_sel_features = pfs_config['feature_indices']

In [7]:
#Setup
results = {}
filenr=1 #Alter filenames sometimes such that we have restore points in case the data gets corrupted
iteration = 0

In [8]:
#Automated tuning - stop this manually after your time budget has run out.
while(True):
    #Update iteration & file number
    iteration += 1
    if (iteration%5 ==0):
        print(iteration)
        filenr += 1
    
    #store start time of the fit & set random seed
    start_time = time.time()
    np.random.seed(iteration+100) #Different seed than earlier
    
    #Create a randomized DNN & fit it to the data
    n_layers, n_units_per_layer, dropouts, model = generate_random_dnn(input_shape=(len(indices_sel_features),))

    model.fit(X_train[:, indices_sel_features],
              y_train,
              sample_weight = instance_class_weights_train, #custom approach to include class weights here
              validation_data=(X_val[:, indices_sel_features], y_val),
              epochs=100,
              batch_size=512,
              verbose=0,
              callbacks= [tf.keras.callbacks.EarlyStopping(monitor='val_ROC_AUC',
                                                           mode='max',
                                                           patience=5, 
                                                           min_delta=0.005)])

    #Compute performance of the PFS-ML
    y_score = model.predict(X_val[:, indices_sel_features])
    roc_auc = roc_auc_score(y_val, y_score)
    auk = auk_score(y_val, y_score)

    #Store the configuration & output (to raw python types, otherwise issues w/ writing JSON)
    results[iteration] = {
        'n_layers': n_layers,
        'n_units_per_layer': n_units_per_layer,
        'dropout_rate_per_layer': dropouts,
        'roc_auc': roc_auc,
        'auk': auk,
        'fitting_time (minutes)': (time.time() - start_time)/60
    }

    # with open(os.path.join(data_base_path, 'model_tuning', 'DNN-pfs_feats-results-{}.json'.format(filenr)), 'w') as f:
    #     json.dump(results, f, indent=4)

5
10
15
20
25


E0528 20:18:25.993536 16828 ultratb.py:152] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\KevinReijnders\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-62beb51ebcc5>", line 28, in <module>
    min_delta=0.005)])
  File "C:\Users\KevinReijnders\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
    return method(self, *args, **kwargs)
  File "C:\Users\KevinReijnders\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
    tmp_logs = train_function(iterator)
  File "C:\Users\KevinReijnders\Anaconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
    result = self._call(*args, **kwds)
  File "C:\Users\KevinReijnders\Anaconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
    return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
  File "C:\

KeyboardInterrupt: 

# Quick analysis of the results & storing the best configuration

In [29]:
df_all = pd.read_json(os.path.join(data_base_path, 'model_tuning', 'DNN-all_feats-results-4.json'),
                      orient='index')
df_pfs = pd.read_json(os.path.join(data_base_path, 'model_tuning', 'DNN-pfs_feats-results-6.json'),
                      orient='index')

In [34]:
df_all.describe()

Unnamed: 0,n_layers,roc_auc,auk,fitting_time (minutes)
count,15.0,15.0,15.0,15.0
mean,7.266667,0.739137,0.119071,3.603002
std,2.433888,0.02361,0.012293,1.281691
min,3.0,0.70647,0.099214,1.291333
25%,5.5,0.719896,0.109873,2.549833
50%,8.0,0.740079,0.118986,4.149024
75%,9.0,0.747592,0.123438,4.515132
max,10.0,0.785132,0.142987,5.616453


In [35]:
df_pfs.describe()

Unnamed: 0,n_layers,roc_auc,auk,fitting_time (minutes)
count,26.0,26.0,26.0,26.0
mean,4.076923,0.674427,0.083795,2.187101
std,2.496767,0.018851,0.009596,0.997053
min,1.0,0.648371,0.071111,0.941535
25%,2.0,0.660241,0.076181,1.267329
50%,3.5,0.671517,0.081965,2.050783
75%,6.0,0.683221,0.087315,2.730259
max,8.0,0.71104,0.102817,4.278874


In [36]:
df_all.sort_values('roc_auc', ascending=False)

Unnamed: 0,n_layers,n_units_per_layer,dropout_rate_per_layer,roc_auc,auk,fitting_time (minutes)
7,5,"[35, 77, 221, 417, 113]","[0.29339685359898005, 0.16154876112313002, 0.15033613909798102, 0.021615340007928002]",0.785132,0.142987,1.984094
2,9,"[25, 82, 32, 309, 476, 85, 370, 273, 172]","[0.046328001926799006, 0.20965880741016002, 0.035985162778916, 0.14555277287995902, 0.18982131918028103, 0.245468016406466, 0.204907798322841, 0.149568351015995]",0.776684,0.139136,3.418462
13,3,"[186, 84, 26]","[0.24728355979841002, 0.28972475941289905]",0.768721,0.133804,1.291333
5,4,"[216, 199, 496, 128]","[0.27558327238137603, 0.146523356638448, 0.183523158870793]",0.749018,0.124496,2.111435
11,10,"[201, 90, 101, 347, 449, 279, 342, 171, 81, 348]","[0.06771381500239801, 0.268902223901536, 0.24785440551148302, 0.027843747699547002, 0.266912456743339, 0.214128580187416, 0.15536956092842802, 0.23561661658545002, 0.176299754899188]",0.746166,0.12238,4.149024
9,6,"[448, 66, 261, 416, 135, 75]","[0.06556760268782301, 0.125552454153135, 0.074430350524067, 0.025217895349605002, 0.10364959201512]",0.743208,0.118986,2.65283
6,10,"[237, 478, 116, 375, 345, 372, 346, 328, 291, 11]","[0.158945208657846, 0.125642228566996, 0.100622354781998, 0.186755829658827, 0.13144242783266202, 0.22076463190460702, 0.15541092359978, 0.173657580067847, 0.193606528768678]",0.742828,0.1207,5.616453
12,7,"[263, 251, 140, 269, 461, 342, 442]","[0.27562410242996505, 0.27021445623510304, 0.010026428287903, 0.287084800882535, 0.041162796406822, 0.085148505887383]",0.740079,0.120719,4.704954
10,10,"[135, 494, 25, 330, 379, 133, 166, 483, 231, 423]","[0.05941885942788701, 0.22815921365968703, 0.05073325096876, 0.026501944252203, 0.205607945510339, 0.28601800385848103, 0.0011844798983740002, 0.15365767901573302, 0.24378628849563402]",0.731199,0.114089,4.792279
1,6,"[245, 406, 82, 265, 403, 213]","[0.29971215459724304, 0.070826693085592, 0.118974218188807, 0.116373222348602, 0.200923811041043]",0.729088,0.11482,2.939404


In [37]:
df_pfs.sort_values('roc_auc', ascending=False)

Unnamed: 0,n_layers,n_units_per_layer,dropout_rate_per_layer,roc_auc,auk,fitting_time (minutes)
5,1,[127],[],0.71104,0.102817,0.982926
18,1,[57],[],0.706301,0.100045,0.964883
2,1,[253],[],0.704983,0.100378,1.040051
17,1,[90],[],0.700566,0.09803,0.941535
7,1,[483],[],0.698401,0.094935,1.030251
10,1,[237],[],0.697966,0.096886,1.003557
22,7,"[342, 328, 282, 374, 281, 197, 236]","[0.28020264785680304, 0.276188325616364, 0.074993205492436, 0.177019209493915, 0.268192225256899, 0.28445940498646904]",0.683472,0.086693,3.846633
13,6,"[230, 172, 84, 374, 268, 160]","[0.072798210667843, 0.12150294089881501, 0.16686715149984702, 0.08885207049948, 0.23464488095621103]",0.682469,0.084337,2.779639
6,8,"[419, 457, 30, 62, 212, 413, 477, 127]","[0.299068935903381, 0.003915005603719, 0.19795684856640602, 0.05398634044720001, 0.20211287890138901, 0.2219051157207, 0.11587433020869801]",0.680697,0.087436,4.278874
14,4,"[304, 92, 390, 348]","[0.22964788155871302, 0.10854672874359, 0.039961410027859]",0.67607,0.082734,1.959259


In [39]:
#Export best configs the hyperparameters of the best DNN models
# with open(os.path.join(data_base_path, 'model_tuning', 'Best-DNN-configs.json'), 'w') as f:
#     best_dnn_config = {
#         1: {
#             'features': 'all',
#             'n_layers': 5,
#             'n_units_per_layer': [35, 77, 221, 417, 113],
#             'dropout_rates': [0.29339685359898005, 0.16154876112313002, 0.15033613909798102, 0.021615340007928002],
#         },
#         2: {
#             'features': 'best_pfs',
#             'n_layers': 1,
#             'n_units_per_layer': [127],
#             'dropout_rates': [],
#         },
#     }
#     json.dump(best_dnn_config, f, indent=4)