# GRU Model with AutoEncoder

In [158]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras import Model
from tensorflow.keras.layers import GRU, LSTM, Input, Dropout, BatchNormalization, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.metrics import AUC
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [159]:
# Load the dataset
# The data directory is named once so the notebook can be pointed at another
# checkout by editing a single line.
DATA_DIR = '/Users/mohanyang/Documents/GitHub/SUROP_time_series/SUROP/data'

train_x = pd.read_csv(f'{DATA_DIR}/train_x.csv')
train_y = pd.read_csv(f'{DATA_DIR}/train_y.csv')
test_x = pd.read_csv(f'{DATA_DIR}/test_x.csv')

In [160]:
# Preview the leading rows of the feature table, then its (rows, columns) shape
print(train_x.head(), train_x.shape, sep='\n')

   Hour  ID    Age  Gender  Unit1  Unit2     HR   O2Sat   Temp     SBP  ...  \
0     1   2  66.67     1.0    0.0    1.0  74.78  100.06  35.61  121.68  ...   
1     2   2  66.67     1.0    0.0    1.0  74.78  100.06  35.61  121.68  ...   
2     3   2  66.67     1.0    0.0    1.0  74.78  100.06  35.61  121.68  ...   
3     4   2  66.67     1.0    0.0    1.0  74.78  100.06  35.61  121.68  ...   
4     5   2  66.67     1.0    0.0    1.0  84.42   99.58  35.61  114.60  ...   

   Phosphate  Potassium  Bilirubin_total  TroponinI    Hct   Hgb  PTT   WBC  \
0        NaN       4.48              NaN        NaN  28.83  9.21  NaN  8.15   
1        NaN       4.48              NaN        NaN  28.83  9.21  NaN  8.15   
2        NaN       4.48              NaN        NaN  28.83  9.21  NaN  8.15   
3        NaN       4.48              NaN        NaN  28.83  9.21  NaN  8.15   
4        NaN       4.48              NaN        NaN  28.83  9.21  NaN  8.15   

   Fibrinogen  Platelets  
0         NaN        Na

In [161]:
# Preview the leading rows of the label table, then its (rows, columns) shape
print(train_y.head(), train_y.shape, sep='\n')

   ID  Outcome
0   2        0
1   3        0
2   5        1
3   6        0
4  13        0
(12115, 2)


In [162]:
# Number of non-null 'Hour' entries recorded for each ID; the largest count
# sets the time dimension of the padded tensor built later.
hours_per_person = train_x.groupby('ID')['Hour'].count()
max_hours = hours_per_person.max()
print(f'Maximum number of hours: {max_hours}')

Maximum number of hours: 336


In [163]:
# 'ID', 'Hour' and 'Gender' are left untouched; every other column is rescaled.
columns_to_exclude = ['ID', 'Hour', 'Gender']
columns_to_scale = list(filter(lambda name: name not in columns_to_exclude, train_x.columns))

# Min-max scaling (MinMaxScaler) -- not z-score standardization.
# NOTE(review): the scaler is fitted on all of train_x, i.e. before the later
# train/test split, and train_x is overwritten in place -- confirm both are intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(train_x[columns_to_scale])
train_x[columns_to_scale] = scaled_values
train_x

Unnamed: 0,Hour,ID,Age,Gender,Unit1,Unit2,HR,O2Sat,Temp,SBP,...,Phosphate,Potassium,Bilirubin_total,TroponinI,Hct,Hgb,PTT,WBC,Fibrinogen,Platelets
0,1,2,0.606512,1.0,0.0,1.0,0.279669,0.969675,0.686651,0.366349,...,,0.274306,,,0.384131,0.305192,,0.025537,,
1,2,2,0.606512,1.0,0.0,1.0,0.279669,0.969675,0.686651,0.366349,...,,0.274306,,,0.384131,0.305192,,0.025537,,
2,3,2,0.606512,1.0,0.0,1.0,0.279669,0.969675,0.686651,0.366349,...,,0.274306,,,0.384131,0.305192,,0.025537,,
3,4,2,0.606512,1.0,0.0,1.0,0.279669,0.969675,0.686651,0.366349,...,,0.274306,,,0.384131,0.305192,,0.025537,,
4,5,2,0.606512,1.0,0.0,1.0,0.326289,0.963829,0.686651,0.341142,...,,0.274306,,,0.384131,0.305192,,0.025537,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482005,15,21633,0.513758,1.0,1.0,0.0,0.352452,0.943003,0.724590,0.285567,...,0.158613,0.226562,,,0.394177,0.347404,0.053781,0.053237,,0.070632
482006,16,21633,0.513758,1.0,1.0,0.0,0.343312,0.968092,0.730679,0.265345,...,0.158613,0.226562,,,0.394177,0.347404,0.053781,0.053237,,0.070632
482007,17,21633,0.513758,1.0,1.0,0.0,0.338959,0.960175,0.730679,0.271255,...,0.158613,0.226562,,,0.394177,0.347404,0.053781,0.053237,,0.070632
482008,18,21633,0.513758,1.0,1.0,0.0,0.338959,0.960175,0.730679,0.278838,...,0.158613,0.226562,,,0.374085,0.347404,0.053781,0.053237,,0.070632


In [164]:
# Distinct person IDs found in the feature table
person_ids = train_x['ID'].unique()

# Tensor dimensions: one entry per person; the feature count leaves out
# the 'ID' and 'Hour' columns.
num_persons = person_ids.shape[0]
num_features = len(train_x.columns) - 2

In [165]:
# Inspect the array of unique person IDs
person_ids

array([    2,     3,     5, ..., 21630, 21632, 21633])

In [166]:
# Zero-initialised (person, hour, column) tensor; the last axis has one more
# slot than num_features because 'Hour' is stored alongside the features.
X_tensor = np.zeros(shape=(num_persons, max_hours, num_features + 1), dtype=float)

In [167]:
# Map each person ID to its row in X_tensor so rows stay aligned with person_ids.
row_of_person = {person_id: row for row, person_id in enumerate(person_ids)}

# A single groupby pass replaces one full boolean scan of train_x per person.
for person_id, person_data in train_x.groupby('ID', sort=False):
    # Chronological records with the 'ID' column removed ('Hour' is kept).
    features = person_data.sort_values('Hour').drop(columns=['ID']).values

    # Fill X_tensor: only the leading timesteps are written; the remaining
    # rows keep the zeros the tensor was initialised with.
    X_tensor[row_of_person[person_id], :len(features), :] = features

In [168]:
# Inspect the filled tensor.
# NOTE(review): values are copied from train_x as-is, so any NaNs in the source
# columns are presumably still present here -- confirm before feeding a model.
X_tensor

array([[[1.        , 0.60651227, 1.        , ..., 0.02553738,
                nan,        nan],
        [2.        , 0.60651227, 1.        , ..., 0.02553738,
                nan,        nan],
        [3.        , 0.60651227, 1.        , ..., 0.02553738,
                nan,        nan],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ]],

       [[1.        , 0.75636322, 0.        , ..., 0.02136993,
         0.1769439 , 0.11987338],
        [2.        , 0.75636322, 0.        , ..., 0.02136993,
         0.1769439 , 0.11987338],
        [3.        , 0.75636322, 0.        , ..., 0.02136993,
         0.1769439 , 0.11987338],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0. 

In [169]:
# Expected layout: (num_persons, max_hours, num_features + 1)
print(X_tensor.shape)

(12115, 336, 39)


In [170]:
# Ensure train_y is aligned with X_tensor
# Index outcomes by ID once instead of rescanning train_y for every person.
# If an ID is listed more than once, its first row is used.
outcome_by_id = train_y.drop_duplicates(subset='ID').set_index('ID')['Outcome']

# Persons that have no row in train_y keep the default label 0.
has_outcome = np.isin(person_ids, outcome_by_id.index)
labels = np.zeros(len(person_ids))
labels[has_outcome] = outcome_by_id.loc[person_ids[has_outcome]].to_numpy()

# Surface the fallback instead of applying it silently.
num_unlabelled = int((~has_outcome).sum())
if num_unlabelled:
    print(f'Warning: {num_unlabelled} IDs have no outcome in train_y; labelled 0')

print(labels.shape)

(12115,)


In [171]:
# Hold out 20% of the samples (first axis of X_tensor) for testing; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_tensor, labels, random_state=10, test_size=0.2)

In [172]:
def build_rdae(input_shape, learning_rate=0.001, dropout_rate=0.2, activation='relu'):
    """Build and compile an LSTM sequence autoencoder.

    The encoder reduces each (timesteps, features) sequence to a single
    128-unit vector; the decoder repeats that vector once per timestep and
    projects it back to the input feature dimension.

    Args:
        input_shape: (timesteps, features) of one sample, without the batch axis.
        learning_rate: Learning rate passed to the Adam optimizer. The default
            is intended to match Adam's own default so existing calls behave
            as before.
        dropout_rate: Dropout fraction applied after the first encoder LSTM
            and after the first decoder LSTM.
        activation: Activation used by every LSTM layer.

    Returns:
        A Keras Model compiled with the Adam optimizer and loss='mae'.
    """
    # NOTE(review): Keras documents that its fast cuDNN LSTM kernel requires
    # activation='tanh'. 'relu' stays the default only to preserve existing
    # behaviour -- consider passing activation='tanh' for long sequences.
    n_steps, n_features = input_shape[0], input_shape[1]
    inputs = Input(shape=input_shape)

    # Encoder: sequence -> sequence -> single vector
    encoded = LSTM(256, activation=activation, return_sequences=True)(inputs)
    encoded = BatchNormalization()(encoded)
    encoded = Dropout(dropout_rate)(encoded)

    encoded = LSTM(128, activation=activation, return_sequences=False)(encoded)

    # Decoder: the encoded vector is repeated so the decoder LSTMs receive
    # a 3-D (batch, timesteps, units) input again.
    decoded = RepeatVector(n_steps)(encoded)

    decoded = LSTM(128, activation=activation, return_sequences=True)(decoded)
    decoded = Dropout(dropout_rate)(decoded)

    decoded = LSTM(256, activation=activation, return_sequences=True)(decoded)
    decoded = BatchNormalization()(decoded)

    # Reconstruction layer: one linear projection per timestep back to the
    # original number of features.
    outputs = TimeDistributed(Dense(n_features))(decoded)

    # Define and compile the model
    autoencoder = Model(inputs, outputs)
    autoencoder.compile(optimizer=Adam(learning_rate=learning_rate), loss='mae')

    return autoencoder

In [173]:
# Per-sample shape: every axis of X_train except the leading sample axis
input_shape = tuple(X_train.shape[1:])
print(input_shape)
rdae = build_rdae(input_shape)

(336, 39)


ValueError: Input 0 of layer "lstm_34" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 128)

In [None]:
# Copy of the training tensor with every NaN replaced by zero
X_train_filled = np.nan_to_num(X_train, nan=0.0)

In [None]:
# Train the autoencoder to reproduce its own (zero-filled) input;
# 10% of the samples are held back for validation.
rdae.fit(
    x=X_train_filled,
    y=X_train_filled,
    batch_size=16,
    epochs=20,
    validation_split=0.1,
)

In [None]:
# Reconstruct the training sequences with the trained autoencoder.
# NOTE(review): the prediction is stored wholesale, so entries that were
# observed in the input are replaced by their reconstructions as well,
# not only the zero-filled gaps -- confirm this is the intended "imputation".
X_train_rdae_imputed = rdae.predict(x=X_train_filled)
X_train_rdae_imputed

In [None]:
def create_gru_model(input_shape, learning_rate=0.001, dropout_rate=0.2, regularization_factor=0.001):
    """Build and compile a single-layer GRU binary classifier.

    Args:
        input_shape: (timesteps, features) of one sample, without the batch axis.
        learning_rate: Learning rate passed to the Adam optimizer.
        dropout_rate: Dropout fraction applied after batch normalization.
        regularization_factor: L2 factor applied to the kernels of the GRU
            and of the output Dense layer.

    Returns:
        A Keras Model with one sigmoid output, compiled with binary
        cross-entropy and an AUC metric reported under the name 'AUC'.
    """
    x_input = Input(shape=input_shape)

    # GRU layer (last state only) followed by batch normalization and dropout
    gru_output = GRU(units=64, return_sequences=False, kernel_regularizer=l2(regularization_factor))(x_input)
    gru_output = BatchNormalization()(gru_output)
    gru_output = Dropout(dropout_rate)(gru_output)

    # Output layer
    output = Dense(1, activation='sigmoid', kernel_regularizer=l2(regularization_factor))(gru_output)

    # Create model
    model = Model(inputs=x_input, outputs=output)

    # Create an Adam optimizer with a custom learning rate
    optimizer = Adam(learning_rate=learning_rate)

    # Name the metric explicitly: with the bare string 'AUC' the key under
    # which Keras records it is chosen by Keras and may not be 'AUC', while
    # callers read history.history['AUC'] / ['val_AUC'].
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[AUC(name='AUC')])

    return model

In [None]:
# Classifier input: (timesteps, columns) of one sample in the padded tensor
input_shape = (max_hours, num_features + 1)
model_gru_rdae = create_gru_model(input_shape)

# Train on the autoencoder output; 20% of the samples are held back for validation.
history = model_gru_rdae.fit(
    x=X_train_rdae_imputed,
    y=y_train,
    batch_size=16,
    epochs=50,
    validation_split=0.2,
)

In [None]:
# Evaluate the model on the test set
# X_test comes straight from the train/test split: it was never zero-filled
# nor passed through the autoencoder. Apply the same two steps used for the
# training inputs so the classifier sees finite, comparably processed data.
X_test_filled = np.nan_to_num(X_test)
X_test_rdae_imputed = rdae.predict(X_test_filled)

test_loss, test_auc = model_gru_rdae.evaluate(X_test_rdae_imputed, y_test)
print(f'Test AUC: {test_auc}')

In [None]:
# Training vs. validation AUC per epoch, drawn on an explicit Axes
fig, ax = plt.subplots()
for history_key in ('AUC', 'val_AUC'):
    ax.plot(history.history[history_key])
ax.set_title('Model AUC')
ax.set_ylabel('AUC')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper left')
plt.show()