In [None]:
# Milestone 1 source code:

#I used OOP to streamline changing between a number of model structures. The first block below is code to
#define the Mymodel class and functions

#see the Mymodel.preprocess function for details on how the data was processed for analysis

#the second block contains the actual commands to run and execute the model construction and fit

In [2]:

import pandas as pd
import numpy as np
from tensorflow import keras as K
from sklearn.model_selection import train_test_split as tts
from tensorflow.keras import layers
import keras_tuner



def X_convert_onehot(X):
    
    for i in np.arange(X.shape[1]):
    
        if i == 0:
            #set new x matrix for onehot
            X_onehot = K.utils.to_categorical(X[:,i]-1)    
        else:
            #onehot encode
            onehot = K.utils.to_categorical(X[:,i]-1)
            #append onehot columns to matrix
            X_onehot = np.append(X_onehot, onehot, axis = 1)
            
    return X_onehot



class Mymodel():
    
    """
    A Keras model constructed on the COVID 19 data found here:
    https://www.kaggle.com/datasets/meirnizri/covid19-dataset
    
    """
    
    def __init__(self, objective = ['pred_outcome', 'pred_risk'],
                 pathname = ''):
        """
        Initialization of keras model constructed on Covid 19 model to assess risk/outcome
        
        
        objective: either pred_outcome or pred_risk. Outcome is a binary prediction problem to predict
                   whether a patient died or survived their covid infection. Risk is an 8-class prediction
                   problem which assess an individuals health risk by combing their clinical attributes
                   for ICU, INTUBED, and CLINICAL OUTCOME
                   
        pathname = /path/to/the/file/Covid Data.csv
                   
                  
        
        """
        #read in data
        self.covid_data = pd.read_csv(pathname+'/Covid Data.csv')
        #set the objective
        self.objective = objective
    
    def preprocess(self, convert_X = ['Onehot','Sum_coding'],
                   drop_features_with_high_nan = False):
        """
        This function automatically performs the preprocessing steps needed to get the covid data
        read for analysis
        
        convert_X: how to conver the categorical features (only works for Onehot right now)
        
        drop_features_with_high_nan: whether to remove features with more than 80% NaN values (ICU and INTUBED)

        """
        #replace 'missing' pregnancy values for males (97 or 98) as 2 ('no')
        self.covid_data.loc[self.covid_data.SEX == 2, 'PREGNANT'] = 2



        #replace all 97,98, and 99 values with NaNs
        covid_data_new = self.covid_data.replace([97,98,99], np.nan)


        #now we are going to add a new column called clinical outcome 
        covid_data_new = covid_data_new.assign(CLINICAL_OUTCOME = self.covid_data.DATE_DIED)
        covid_data_new.drop(columns = 'DATE_DIED', inplace = True)
        #we will use the 9999-99-99 which indicate the patient did not die as a proxy for 'lived' and
        #any other date as a proxy for 'died'
        #we will set 1 as died and 2 as lived
        covid_data_new.CLINICAL_OUTCOME.replace('9999-99-99', 2, inplace = True)
        which = np.unique(covid_data_new.CLINICAL_OUTCOME[covid_data_new.CLINICAL_OUTCOME != 2])
        covid_data_new.CLINICAL_OUTCOME.replace(which.tolist(), 1, inplace = True)
        
        
        #option to drop features with a high percentage of nan
        if (drop_features_with_high_nan == True) & (self.objective == 'pred_outcome'):
            
            for col in covid_data_new.columns:
                
                z = sum(np.isnan(covid_data_new[col]))/len(covid_data_new[col])
                print('proportion of NaNs for feature {}, is {}'.format(
                    col, z))
                
                if sum(np.isnan(covid_data_new[col]))/len(covid_data_new[col]) > 0.8:
                    #if number of nan's over 80% drop feature
                    covid_data_new.drop(columns = col, inplace = True)
                
            #remove rows with nans    
            covdata_nona = covid_data_new.dropna(axis = 0)
            print('---- size of data after column drop and nan removal: rows,cols {}, -----'.format(
            covdata_nona.shape))
                
        else:
            #remove rows with nans
            covdata_nona = covid_data_new.dropna(axis = 0)    
            print('---- size of data after column drop and nan removal: rows,cols {}, -----'.format(
                  covdata_nona.shape))
            
        #discretize age into age groups by rounding
        covdata_nona = covdata_nona.assign(AGE = np.round(covdata_nona.AGE, -1))
    

        #keep only samples with values between 1-3. values over 4 means covid test negative/inconclusive
        covdata_nona_final = covdata_nona[covdata_nona.CLASIFFICATION_FINAL < 4]
        if (drop_features_with_high_nan != True) & (self.objective == 'pred_risk'):
            #discretize age into age groups by rounding
            covdata_nona = covdata_nona.assign(AGE = np.round(covdata_nona.AGE, -1))
    

            #keep only samples with values between 1-3. values over 4 means covid test negative/inconclusive
            covdata_nona_final = covdata_nona[covdata_nona.CLASIFFICATION_FINAL < 4]
            #final dimension (108090, 22)


            #-------Createdting--Clinical--Risk--proxy--target--variable---------
            #our clinical risk is a combination of outcome (lived/died), intubed (intubated/not), and
            # icu (admitted/not)
            #convert CLINICAL_OUTCOME, ICU and INTUBED to unique strings
            outcomes_as_strings = covdata_nona_final.ICU.replace([1,2], ['Died_','Recovered_'])
            icu_as_strings = covdata_nona_final.ICU.replace([1,2], ['Admitted_ICU_','Not_Admitted_ICU_'])
            intubed_as_strings = covdata_nona_final.INTUBED.replace([1,2], ['Intubated', 'Not_Intubated'])

            #combine the strings element wise from Outcome, ICU, and Intubed
            l1 = [i + j for i, j in zip(outcomes_as_strings, icu_as_strings)]
            l2 = [i + j for i, j in zip(l1, intubed_as_strings)]
            #set new variable clinical risk as the element-wise joined list l2
            covdata_nona_final = covdata_nona_final.assign(CLINICAL_RISK = l2)
            #now we need to recode the unique strings back into numerical strings
            covdata_nona_final = covdata_nona_final.assign(CLINICAL_RISK_FINAL = covdata_nona_final.CLINICAL_RISK)
            #obtain the number of risk classes
            classes = np.unique(covdata_nona_final['CLINICAL_RISK_FINAL']).tolist()
            #recode to 0-7
            covdata_nona_final.CLINICAL_RISK_FINAL.replace(classes, np.arange(len(classes)).tolist(), inplace = True)


        #Set Y and X for model            
        if self.objective == 'pred_risk':
                
            self.Y = np.array(covdata_nona_final.CLINICAL_RISK_FINAL)
            #Drop Columns we created or used to create clinical risk
            self.covid_data_final = covdata_nona_final.drop(columns=['CLINICAL_OUTCOME', 'CLINICAL_RISK', 'CLINICAL_RISK_FINAL'])
            self.X = np.array(self.covid_data_final)
                
        if self.objective == 'pred_outcome':
            self.Y = np.array(covdata_nona_final.CLINICAL_OUTCOME)
            if drop_features_with_high_nan == True:
                self.covid_data_final = covdata_nona_final.drop(columns=['CLINICAL_OUTCOME'])
            else: 
                self.covid_data_final = covdata_nona_final.drop(columns=['CLINICAL_RISK', 'CLINICAL_RISK_FINAL', 'CLINICAL_OUTCOME'])
                
            self.X = np.array(self.covid_data_final)
        
        
        print('final size of feature matrix: rows,cols {}, with features ={}'.format(
                  self.covid_data_final.shape, self.covid_data_final.columns))
        #convert X into a one-hot matrix for all categorical variables
        if convert_X == 'Onehot':
            self.X_convert = X_convert_onehot(self.X)
            
            
            
            
    def build_model(self, conv_layers = None, mlp_layers = 3, mlp_hidden_act = ['leakyrelu'],
                    hidden_units = [50], conv_hidden_act= ['relu'],  padd_type = 'same', 
                    num_filters = [32], kernels = [3], layers_stride = 2, add_norm = False, 
                    pool_type = ['Avg', 'Max'], use_metric = 'accuracy', use_batch_norm = True, 
                    use_dropout_layers = True, dropout_rate = 0.2, use_encoding_layer = True,
                    out_mode = ['one_hot','multi_hot','count'], verbose = True):
        
        """
        this function wraps keras.model.sequential to build a keras model consisting of convolution and
        Dense Layers
        
        """
        self.encoding_layer = use_encoding_layer
        self.conv_layers = conv_layers
        self.mlp_layers = mlp_layers
        self.model = K.models.Sequential()
        
        if add_norm == True:
            self.model.add(layers.Normalization(axis=1))
            
        if use_encoding_layer == True:
            self.model.add(layers.CategoryEncoding(output_mode = out_mode))
        
        
        #convolutional layers
        if conv_layers != None:
            
            #set input shape 
            in_shape_conv = (self.X_convert.shape[0], self.X_convert.shape[1], 1)
        
            for i in np.arange(conv_layers):
                
                if i == 0:     
                    #conv 2D layer
                    self.model.add(layers.Conv1D(filters = num_filters[i], 
                                                 kernel_size = kernels[i],
                                                 padding = padd_type,
                                                 input_shape = in_shape_conv[1:],
                                                 data_format="channels_last",
                                                 strides = layers_stride))
                    #if batch normalization
                    if use_batch_norm == True:
                        self.model.add(layers.BatchNormalization())
                    
                    
                    #activate
                    self.model.add(layers.Activation(conv_hidden_act[i]))
                        
                    #pooling layer
                    if pool_type == 'Max':
                        self.model.add(layers.GlobalMaxPooling1D())
                    
                    if pool_type == 'Avg':
                        self.model.add(layers.GlobalAveragePooling1D())
                        
                else:
                    #conv 2D layer
                    self.model.add(layers.Conv1D(filters = num_filters[i], 
                                                 kernel_size = kernels[i],
                                                 padding = padd_type,
                                                 input_shape = in_shape_conv[1:],
                                                 data_format="channels_last",
                                                 strides = layers_stride))
                    #if batch normalization
                    if use_batch_norm == True:
                        self.model.add(layers.BatchNormalization())
                    
                    
                    #activate
                    self.model.add(layers.Activation(conv_hidden_act[i]))
                        
                    #pooling layer
                    if pool_type == 'Max':
                        self.model.add(layers.GlobalMaxPooling1D())
                    
                    if pool_type == 'Avg':
                        self.model.add(layers.GlobalAveragePooling1D())
                        
                    
            #flattening layer
            self.model.add(layers.Flatten())
        
        #input dimension for dense layers (only when no convolution layers are used)
        in_shape_dense = self.X_convert.shape
        
        #perceptron (dense) layers
        for i in np.arange(mlp_layers):
            
            #first hidden layer
            if i == 0:
                if conv_layers != None:
                    #if convolution layers no need to set input dim
                    #dense layer
                    self.model.add(layers.Dense(units = hidden_units[i],
                                                name = 'hidden_layer'+str(i)))
                    #if batch normalization
                    if use_batch_norm == True:
                        self.model.add(layers.BatchNormalization())
                    
                    #activate
                    self.model.add(layers.Activation(mlp_hidden_act[i]))
                    
                    #dropout layer
                    if use_dropout_layers == True:
                        self.model.add(layers.Dropout(dropout_rate))
                
                else:
                    #if no conv layers need to set input dimension to # of features
                    self.model.add(layers.Dense(units = hidden_units[i], 
                                                input_dim = in_shape_dense[1],
                                                name = 'hidden_layer'+str(i)))
                    
                    #if batch normalization
                    if use_batch_norm == True:
                        self.model.add(layers.BatchNormalization())
                    
                    #activate
                    self.model.add(layers.Activation(mlp_hidden_act[i]))
                    
                    
                    #dropout layer
                    if use_dropout_layers == True:
                        self.model.add(layers.Dropout(dropout_rate))
            
            #all other hidden layers
            else:
                #
                self.model.add(layers.Dense(units = hidden_units[i],
                                            name = 'hidden_layer'+str(i)))
                
                if use_batch_norm == True:
                        self.model.add(layers.BatchNormalization())
                        
                self.model.add(layers.Activation(mlp_hidden_act[i]))
                
                #dropout layer
                if use_dropout_layers == True:
                    self.model.add(layers.Dropout(dropout_rate))
        
        
        #Add output layer and compile model
        #set optimizer to adam
        opt = K.optimizers.Adam(learning_rate=0.001)
        
        #for binary lived/died prediction (clinical outcome)
        if self.objective == 'pred_outcome':
            #Dense output layer
            self.model.add(layers.Dense(units = 1, name = 'output_layer'))
            #if batch normalization
            if use_batch_norm == True:
                self.model.add(layers.BatchNormalization())
            
            #activate
            self.model.add(layers.Activation('sigmoid'))
            
            #compile model
            loss_func = K.losses.BinaryCrossentropy(from_logits=False)
            self.model.compile(loss=loss_func, optimizer=opt, metrics = 'accuracy') 
        
        #for multiclass 'risk' prediction -- combined lived/died+ICU+Intubed
        if self.objective == 'pred_risk':
            #dense layer
            self.model.add(layers.Dense(units = len(np.unique(self.Y)),
                                            name = 'output_layer'))
            
            
            #if batch normalization
            if use_batch_norm == True:
                self.model.add(layers.BatchNormalization())
            
            #activate
            self.model.add(layers.Activation('softmax'))
            
            
            #compile model
            loss_func = K.losses.CategoricalCrossentropy()
            self.model.compile(loss=loss_func, optimizer=opt, metrics = 'accuracy')
        
        #print final model
        if verbose == True:
            print(self.model.summary())
      
            
      
        
    def fit(self, train_size = 0.9, validation_size = 0.2, batchsize = 128,
            num_epochs = 50):
        
        if self.encoding_layer == True:
            
            self.x_train, self.x_test, self.y_train, self.y_test = tts(self.X, 
                                                                       self.Y, 
                                                                       test_size = 1 - train_size, 
                                                                       train_size = train_size)
            
        else:
            self.x_train, self.x_test, self.y_train, self.y_test = tts(self.X_convert, 
                                                                       self.Y, 
                                                                       test_size = 1 - train_size, 
                                                                       train_size = train_size)
        
        if self.conv_layers != None:
            train_shp = self.x_train.shape
            test_shp = self.x_test.shape
            self.x_train = self.x_train.reshape((train_shp[0], train_shp[1], 1))
            self.x_test = self.x_test.reshape((test_shp[0], test_shp[1], 1))
            
            
        if self.objective == 'pred_outcome':
            self.y_train_convert = self.y_train-1
            self.y_test_convert = self.y_test-1
            
        if self.objective == 'pred_risk':
            self.y_train_convert = K.utils.to_categorical(self.y_train)       
            self.y_test_convert = K.utils.to_categorical(self.y_test) 
            
        
            
        self.model.fit(self.x_train, 
                       self.y_train_convert, 
                       batch_size = batchsize, 
                       epochs = num_epochs, 
                       validation_split=validation_size, 
                       validation_batch_size= round(validation_size*batchsize))
        
        
        
    def evaluate(self, batchsize = 256):
        
        self.model.evaluate(x = self.x_test,
                            y = self.y_test_convert,
                            batch_size=batchsize)
        
        
        
        


def model_arch(hp, numlayers = 5, 
               units_in_layers = [[150,120,100], [75, 60, 50], [35, 30, 25], [20, 15, 10], [10, 8, 5]],
               use_batch_norm = True, use_dropout_layers = True, dropout_rate = 0.2):
        
    model = K.models.Sequential()
    for i in np.arange(numlayers):
        model.add(layers.Dense(hp.Choice('units',units_in_layers[i])))
        if use_batch_norm == True:
            model.add(layers.BatchNormalization())

        model.add(layers.Activation('relu'))
            
        if use_dropout_layers == True:
            model.add(layers.Dropout(dropout_rate))
            
    model.add(layers.Dense(units = 1))
    if use_batch_norm == True:
        model.add(layers.BatchNormalization())

    model.add(layers.Activation('sigmoid'))
        
    if use_dropout_layers == True:
        model.add(layers.Dropout(dropout_rate))
    
    opt = K.optimizers.Adam(learning_rate=0.001)
    loss_func = K.losses.BinaryCrossentropy(from_logits=False)
    model.compile(loss=loss_func, optimizer=opt, metrics = 'accuracy') 
    
    #print(model.summary())
        
        
    return model

def tune_arch(x, x_val, y, y_val):
            
    arch_tuner = keras_tuner.RandomSearch(
        model_arch,
        objective='val_loss',
        max_trials=5)
    
    arch_tuner.search(x, y, epochs=5, validation_data=(x_val, y_val))
    best_model = arch_tuner.get_best_models()[0]
    
    return best_model
        
        
            
            


In [3]:

#import sys
#path = '/mnt/ceph/jarredk/Assignments/Deep_Learning/DLproject/'
#sys.path.append(path)

#path to covid data file
path = r'C:\Users\Bruin\Desktop\GS Academia\PhD\SEM 3 FALL 2022\Deep Learning\DLproject'
#sys.path.append(path)
#import final_proj_pack as fpp
#from final_proj_pack import tune_arch, model_arch



#create a Mymodel object
cov_model = Mymodel(objective = 'pred_outcome', pathname = path)
#proprocess the covid data set, this includes removing nuisance features and Nans
#as well as one hot converting the categorical features
cov_model.preprocess(convert_X = 'Onehot', drop_features_with_high_nan=True)
#fit an initial architechture
cov_model.build_model(conv_layers = None, 
                      mlp_layers = 5, 
                      mlp_hidden_act = ['relu']*5,
                      hidden_units = [120, 60, 30, 20, 10], 
                      conv_hidden_act= ['relu'], 
                      padd_type = 'same', 
                      num_filters = [128], 
                      kernels = [5], 
                      add_norm = False, 
                      pool_type = 'Max',
                      layers_stride = 3,
                      use_metric = 'accuracy', 
                      use_batch_norm = True, 
                      use_dropout_layers = True, 
                      dropout_rate = 0.3,
                      use_encoding_layer=False,
                      out_mode='one_hot',
                      verbose = True)

#fit for one epoch to get the training and testing data split
cov_model.fit(num_epochs = 1)

#set the data split point
split_point = round(cov_model.x_train.shape[0]*(1-0.2))

#get the training and validation data
x_train = cov_model.x_train[0:split_point,:]
y_train = cov_model.y_train_convert[:split_point]
x_valid = cov_model.x_train[split_point:cov_model.x_train.shape[0],:]
y_valid = cov_model.y_train_convert[split_point:]

#run keras tuner to get tune for number of units in 5 layer network
tune_suggestion = tune_arch(x_train, x_valid, y_train, y_valid)




Trial 3 Complete [00h 04m 46s]
val_loss: 0.34661296010017395

Best val_loss So Far: 0.3340647220611572
Total elapsed time: 00h 12m 24s
INFO:tensorflow:Oracle triggered exit
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 hidden_layer0 (Dense)       (None, 120)               20040     
                                                                 
 batch_normalization_6 (Batc  (None, 120)              480       
 hNormalization)                                                 
                                                                 
 activation_6 (Activation)   (None, 120)               0         
                                                                 
 dropout_6 (Dropout)         (None, 120)               0         
                                                                 
 hidden_layer1 (Dense)       (None, 60)                7260      
             

Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [5]:
tune_suggestion.build(input_shape = x_train.shape)
tune_suggestion.summary

In [7]:
tune_suggestion.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (278978, 100)             16700     
                                                                 
 batch_normalization (BatchN  (278978, 100)            400       
 ormalization)                                                   
                                                                 
 activation (Activation)     (278978, 100)             0         
                                                                 
 dropout (Dropout)           (278978, 100)             0         
                                                                 
 dense_1 (Dense)             (278978, 100)             10100     
                                                                 
 batch_normalization_1 (Batc  (278978, 100)            400       
 hNormalization)                                        

In [11]:
#build final model according to tuner suggestions
cov_model.build_model(conv_layers = None, 
                      mlp_layers = 5, 
                      mlp_hidden_act = ['relu']*5,
                      hidden_units = [120, 100, 80, 50, 20], 
                      conv_hidden_act= ['relu'], 
                      padd_type = 'same', 
                      num_filters = [128], 
                      kernels = [5], 
                      add_norm = False, 
                      pool_type = 'Max',
                      layers_stride = 3,
                      use_metric = 'accuracy', 
                      use_batch_norm = True, 
                      use_dropout_layers = True, 
                      dropout_rate = 0.3,
                      use_encoding_layer=False,
                      out_mode='one_hot',
                      verbose = True)


#fit
cov_model.fit(num_epochs = 5, batchsize = 256)
#evaluate performance
cov_model.evaluate()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 hidden_layer0 (Dense)       (None, 120)               20040     
                                                                 
 batch_normalization_24 (Bat  (None, 120)              480       
 chNormalization)                                                
                                                                 
 activation_24 (Activation)  (None, 120)               0         
                                                                 
 dropout_21 (Dropout)        (None, 120)               0         
                                                                 
 hidden_layer1 (Dense)       (None, 100)               12100     
                                                                 
 batch_normalization_25 (Bat  (None, 100)              400       
 chNormalization)                                     