In [1]:
import os
import sys
import random
import pandas as pd
import numpy as np
from scipy.linalg import toeplitz
from copy import copy
import matplotlib.pyplot as plt
%matplotlib inline

# Geniuses that worked on hypertools did not update certain package and thus it produces warnings (they break jupyter lab)
import warnings
warnings.filterwarnings("ignore")

# Comment out if you don't want to see all of the values being printed (i.e. default)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

current_dir = os.getcwd()
# utils_path = os.path.join(current_dir, '..', 'utils')
utils_path = os.path.join(current_dir, '../')
utils_abs_path = os.path.abspath(utils_path)
if utils_abs_path not in sys.path:
    sys.path.append(utils_abs_path)

import utils.get_data as get_data
# from impute_methods import *
from utils.impute_methods import impute_linear_interpolation

DATA_PATH = get_data.get_dataset_abspath()

training_setA_path = DATA_PATH + 'training_setA'
training_setB_path = DATA_PATH + 'training_setB'

In [2]:
import os
import sys
import random
import pandas as pd
import numpy as np
from scipy.linalg import toeplitz
from copy import copy
import matplotlib.pyplot as plt
%matplotlib inline

# Geniuses that worked on hypertools did not update certain package and thus it produces warnings (they break jupyter lab)
import warnings
warnings.filterwarnings("ignore")

# Comment out if you don't want to see all of the values being printed (i.e. default)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

current_dir = os.getcwd()
# utils_path = os.path.join(current_dir, '..', 'utils')
utils_path = os.path.join(current_dir, '../')
utils_abs_path = os.path.abspath(utils_path)
if utils_abs_path not in sys.path:
    sys.path.append(utils_abs_path)

import utils.get_data as get_data
# from impute_methods import *
from utils.impute_methods import impute_linear_interpolation

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerTimeSeries(nn.Module):
    def __init__(self, input_dim=1, d_model=64, nhead=4, num_layers=2, dropout=0.2):
        super(TransformerTimeSeries, self).__init__()
        self.encoder = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.decoder = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x.transpose(0, 1))
        x = self.decoder(x.transpose(0, 1))
        return x.squeeze(-1)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerTimeSeries(nn.Module):
    def __init__(self, input_dim=1, d_model=64, nhead=4, num_layers=2, dropout=0.2):
        super(TransformerTimeSeries, self).__init__()

        self.encoder = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.decoder = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x.transpose(0, 1))
        x = self.decoder(x)
        return x.squeeze(-1)
        
        


In [5]:
# Load and preprocess data
dataset, patient_id_map = get_data.get_dataset()

   20337
   40337
Dataset loaded into a MultiIndex DataFrame.


In [6]:
dataset.shape

(1552210, 41)

In [7]:
# First lets experiment with only raw data 
# We have to however impute NaN values since Neural Networks can't (natively) handle them

columns_to_linearly_interpolate = [
    'HR', 'O2Sat', 'SBP', 'MAP', 'DBP', 'Resp'
]

# Feel free to omit this (EXPERIMENTAL)
# Normilize the dataset
if True:
    # Check if multiindex_df is indeed a MultiIndex DataFrame
    if isinstance(dataset.index, pd.MultiIndex):
        # Exclude 'SepsisLabel' from normalization
        features_to_normalize = dataset.columns.difference(['SepsisLabel'])

        # Normalize each patient's data
        # This will apply z-score normalization per patient per feature, excluding 'SepsisLabel'
        normalized_data = dataset[features_to_normalize].groupby(level=0).transform(
            lambda x: (x - x.mean()) / x.std())

        # Optionally fill NaN values if they are created by division by zero in cases where std is zero
        normalized_data = normalized_data.fillna(0)

        # Merge normalized data with the 'SepsisLabel' column
        dataset = pd.concat([normalized_data, dataset['SepsisLabel']], axis=1)
    else:
        print("The dataframe does not have a MultiIndex as expected.")

# Linear Interpolation
print("Linearly interpolating:")
for col in columns_to_linearly_interpolate:
    if col != 'SepsisLabel':  # Ensure we do not interpolate 'SepsisLabel'
        dataset = impute_linear_interpolation(dataset, col)
        print(col)
print("Done")

Linearly interpolating:
HR
O2Sat
SBP
MAP
DBP
Resp
Done


In [8]:
dataset.shape

(1552210, 41)

In [9]:
def add_nan_indicators(df):
    for column in df.columns:
        df[column + '_nan'] = df[column].isna().astype(int)
    return df

In [10]:
def downsample(X, y):
    index_0 = np.where(y == 0)[0]
    index_1 = np.where(y == 1)[0]
    print(index_0, index_1)

    if len(index_0) > len(index_1):
        index_0 = np.random.choice(index_0, size=len(index_1), replace=False)

    balanced_indices = np.concatenate([index_0, index_1])
    np.random.shuffle(balanced_indices)

    x_balanced = X.iloc[balanced_indices]
    y_balanced = y.iloc[balanced_indices]

    return x_balanced, y_balanced
    

In [125]:
import torch
from torchsummary import summary
model = TimeSeriesTransformer(input_dim=X_train.shape[1])
summary(model, input_size=(1, 64))

ModuleNotFoundError: No module named 'torchsummary'

In [11]:
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} for training.")

Using cpu for training.


In [12]:
X = dataset.drop('SepsisLabel', axis=1)
X = add_nan_indicators(X)
y = dataset['SepsisLabel']

for patient_id in X.index.get_level_values('patient_id').unique():
    print(patient_id)
    break

a = 0
for patient_id, patient_data in X.groupby(level='patient_id'):
    print(patient_data.shape)
    if a > 10:
        break
    a += 1


1.0
(28, 80)
(34, 80)
(37, 80)
(29, 80)
(47, 80)
(26, 80)
(44, 80)
(56, 80)
(37, 80)
(32, 80)
(26, 80)
(27, 80)


In [16]:

# Prepare data and add missingness indicators
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val = add_nan_indicators(X_train), add_nan_indicators(X_val)

# X_train.fillna(0, inplace=True)
# X_val.fillna(0, inplace=True)

# Convert to tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).to(device)

# Create DataLoader
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
val_data = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_data, batch_size=1, shuffle=False)

# Initialize the model
# model = TimeSeriesTransformer(num_features=X_train.shape[1])  # Assuming features were doubled to account for indicators
model = TransformerTimeSeries(input_dim=X_train.shape[1])
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# To keep all the data focus on the minority class (sepsis = 1)

# class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train.values)

class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

criterion = nn.BCELoss(weight=class_weights[1])  # Focus more on the minority class

TransformerTimeSeries(
  (encoder): Linear(in_features=120, out_features=64, bias=True)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): Linear(in_features=64, out_features=1, bias=True)
)

In [22]:
from sklearn.preprocessing import StandardScaler

def prepare_patient_data(patient_data, max_length):
    # Standardizing the data
    scaler = StandardScaler()
    features = scaler.fit_transform(patient_data)
    # Padding
    padded_features = np.zeros((max_length, features.shape[1]))
    sequence_length = min(max_length, features.shape[0])
    padded_features[:sequence_length] = features[:sequence_length]
    return torch.tensor(padded_features, dtype=torch.float32), sequence_length


In [24]:
df = dataset
X = df.drop('SepsisLabel', axis=1)
X = add_nan_indicators(X)
y = df['SepsisLabel']

# Find the maximum sequence length for padding
max_length = X.groupby('patient_id').size().max()

patient_ids = X.index.get_level_values('patient_id').unique()
train_ids, val_ids = train_test_split(patient_ids, test_size=0.2, random_state=42)

# Initialize model, criterion, and optimizer
input_dim = X.shape[1]
model = TransformerTimeSeries(input_dim=input_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for patient_id in train_ids:
        patient_data = X.loc[patient_id]
        
        X_train, sequence_length = prepare_patient_data(patient_data, max_length)
        y_train = torch.tensor(y.loc[patient_id].values, dtype=torch.float32)
        
        optimizer.zero_grad()
        outputs = model(X_train.unsqueeze(0))  # Unsqueeze to add a batch dimension
        
        loss = criterion(outputs[:sequence_length].squeeze(), y_train[:sequence_length])
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for patient_id in val_ids:
            patient_data = X.loc[patient_id]
            X_val, sequence_length = prepare_patient_data(patient_data, max_length)
            y_val = torch.tensor(y.loc[patient_id].values, dtype=torch.float32)
            val_outputs = model(X_val.unsqueeze(0))
            v_loss = criterion(val_outputs[:sequence_length], y_val[:sequence_length])  # Only consider non-padded outputs
            val_loss += v_loss.item()

    print(f'Epoch {epoch+1}, Avg Training Loss: {train_loss / len(train_ids)}, Avg Validation Loss: {val_loss / len(val_ids)}')


TransformerTimeSeries(
  (encoder): Linear(in_features=80, out_features=64, bias=True)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): Linear(in_features=64, out_features=1, bias=True)
)

KeyboardInterrupt: 

In [31]:
X = df.drop('SepsisLabel', axis=1)
print(X.shape)

max_length = X.groupby('patient_id').size().max()
print("Max length (inputs will be padded to): ", max_length)

patient_ids = X.index.get_level_values('patient_id').unique()
for patient_id in patient_ids:
    max_length_patient = X.loc[patient_id].shape[0]
    if max_length_patient == max_length:
        print(patient_id, max_length_patient)

(1552210, 40)
Max length (inputs will be padded to):  336
457.0 336
1126.0 336
7228.0 336
12849.0 336
14201.0 336
23823.0 336
25080.0 336
26436.0 336
33001.0 336
36236.0 336


In [28]:
df = dataset
X = df.drop('SepsisLabel', axis=1)
X = add_nan_indicators(X)
y = df['SepsisLabel']

# Find the maximum sequence length for padding
# Yes it's really high, 336, consider making it larger to accommodate actual test set
max_length = X.groupby('patient_id').size().max()
print("Max length (inputs will be padded to): ", max_length)

patient_ids = X.index.get_level_values('patient_id').unique()
train_ids, val_ids = train_test_split(patient_ids, test_size=0.2, random_state=42)

# Initialize model, criterion, and optimizer
input_dim = X.shape[1]
model = TransformerTimeSeries(input_dim=input_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0
    for patient_id in train_ids:
        patient_data = X.loc[patient_id]
        
        X_train, sequence_length = prepare_patient_data(patient_data, max_length)
        y_train = torch.tensor(y.loc[patient_id].values, dtype=torch.float32)
        
        optimizer.zero_grad()
        outputs = model(X_train.unsqueeze(0))  # Unsqueeze to add a batch dimension
        
        loss = criterion(outputs[:sequence_length].squeeze(), y_train[:sequence_length])
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        # Calculate accuracy
        predicted_labels = torch.round(torch.sigmoid(outputs[:sequence_length].squeeze()))
        train_correct += (predicted_labels == y_train[:sequence_length]).sum().item()
        train_total += sequence_length

    # Validation phase
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for patient_id in val_ids:
            patient_data = X.loc[patient_id]
            X_val, sequence_length = prepare_patient_data(patient_data, max_length)
            y_val = torch.tensor(y.loc[patient_id].values, dtype=torch.float32)
            val_outputs = model(X_val.unsqueeze(0))
            v_loss = criterion(val_outputs[:sequence_length], y_val[:sequence_length])  # Only consider non-padded outputs
            val_loss += v_loss.item()

            # Calculate accuracy
            val_predicted_labels = torch.round(torch.sigmoid(val_outputs[:sequence_length].squeeze()))
            val_correct += (val_predicted_labels == y_val[:sequence_length]).sum().item()
            val_total += sequence_length

    train_accuracy = train_correct / train_total
    val_accuracy = val_correct / val_total

    print(f'Epoch {epoch+1}, Avg Training Loss: {train_loss / len(train_ids)}, Training Accuracy: {train_accuracy}, Avg Validation Loss: {val_loss / len(val_ids)}, Validation Accuracy: {val_accuracy}')


Max length (inputs will be padded to):  336
Patient ID: 1.0, Max Length (inputs will be padded to): 28
Patient ID: 2.0, Max Length (inputs will be padded to): 34
Patient ID: 3.0, Max Length (inputs will be padded to): 37
Patient ID: 4.0, Max Length (inputs will be padded to): 29
Patient ID: 5.0, Max Length (inputs will be padded to): 47
Patient ID: 6.0, Max Length (inputs will be padded to): 26
Patient ID: 7.0, Max Length (inputs will be padded to): 44
Patient ID: 8.0, Max Length (inputs will be padded to): 56
Patient ID: 9.0, Max Length (inputs will be padded to): 37
Patient ID: 10.0, Max Length (inputs will be padded to): 32
Patient ID: 11.0, Max Length (inputs will be padded to): 26
Patient ID: 12.0, Max Length (inputs will be padded to): 27
Patient ID: 13.0, Max Length (inputs will be padded to): 37
Patient ID: 14.0, Max Length (inputs will be padded to): 27
Patient ID: 15.0, Max Length (inputs will be padded to): 47
Patient ID: 16.0, Max Length (inputs will be padded to): 54
Patie

KeyboardInterrupt: 