In [1]:
# Get the same number of sepsis label = 0 as there are sepsis label =  1

In [21]:
import torch
import sklearn as sk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import xgboost
from torch.utils.data import DataLoader, TensorDataset, Dataset

In [22]:
import utils.get_data as get_data
from utils.impute_methods import impute_linear_interpolation
from utils.feature_engineering import preprecess_data

In [23]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using {device}.")

Using cuda:0.


In [24]:
cmap = plt.get_cmap("viridis")

In [25]:
# Load xgboost
xgb_path = "models/saved/xgboost_model1.mdl"
xgb = xgboost.XGBClassifier()
xgb.load_model(xgb_path)

In [26]:
# Load transformers
transformers_dir = os.getcwd() + "/models/transformer"
transformer_paths = [os.path.join(transformers_dir, path) for path in os.listdir(transformers_dir)]
transformers = []
for model_path in transformer_paths[:-2]:
    model = torch.load(model_path).to("cuda")
    transformers.append(model)

In [27]:
class PatientDataset(Dataset):
    def __init__(self, patient_ids, X, y, max_length, device):
        self.patient_ids = patient_ids
        self.device = device
        self.max_length = max_length
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.patient_ids)
    
    def __getitem__(self, index):
        pid = self.patient_ids[index]
        patient_data = self.X.loc[pid]
        X_train, seq_length = prepare_patient_data(patient_data, self.max_length)
        y_train = torch.tensor(self.y.loc[pid].values, dtype=torch.float32)

        # Ensure y_train is appropriately padded or trimmed to match X_train's length
        if len(y_train) > self.max_length:
            y_train = y_train[:self.max_length]
        elif len(y_train) < self.max_length:
            y_train = pad(torch.tensor(y_train, dtype=torch.float32), (0, self.max_length - len(y_train)), value=0)
        
        return X_train, y_train, len(y_train)
        # return X_train, y_train, seq_length


def prepare_patient_data(patient_data, max_length): 
        # Standardizing the data
        scaler = StandardScaler()
        features = scaler.fit_transform(patient_data)
        # Padding
        padded_features = np.zeros((max_length, features.shape[1]))
        sequence_length = min(max_length, features.shape[0])
        padded_features[:sequence_length] = features[:sequence_length]
        return torch.tensor(padded_features, dtype=torch.float32), sequence_length

In [28]:
dataset, patient_id_map = get_data.get_dataset()

   20337
   40337
Dataset loaded into a MultiIndex DataFrame.


In [37]:
dataset.info

<bound method DataFrame.info of                        HR  O2Sat   Temp    SBP     MAP    DBP  Resp  EtCO2  \
patient_id                                                                   
20343.0    790481    84.0   96.0    NaN  142.0   89.00   69.0  16.0    NaN   
1192.0     46265     89.0   97.0    NaN  150.0   92.00   87.0  21.0    NaN   
16323.0    632249    75.0  100.0  36.39    NaN   64.00    NaN  18.0    NaN   
541.0      20936     61.0   98.0    NaN  112.0   66.67    NaN  20.0    NaN   
24136.0    933450    81.0   94.0    NaN  140.0   95.00   83.0  20.0    NaN   
...                   ...    ...    ...    ...     ...    ...   ...    ...   
18941.0    735246   115.0   95.0  37.89  138.0  122.00  112.0  32.0    NaN   
34537.0    1331595   74.0   98.0  36.40  172.0  108.00   87.0  18.0   37.0   
4642.0     180365    86.0  100.0    NaN  104.0   69.00   53.0  18.0    NaN   
6673.0     258910    79.0   99.0    NaN  150.0   93.00   57.0  23.0    NaN   
16575.0    642389    65.0   94.0

In [32]:
# Get a random sample of the non-sepsis dataset the same length as the sepsis dataset
sepsis_zero = dataset[dataset['SepsisLabel'] == 0]
sepsis_one = dataset[dataset['SepsisLabel'] == 1]
random_subset_sepsis_zero = sepsis_zero.sample(n=len(sepsis_one.index), random_state=42)

combined = pd.concat([random_subset_sepsis_zero, sepsis_one])

subset_proportion = 0.2
n = round(len(combined) * subset_proportion)

# Combine and get a random subset
dataset = combined.sample(n, random_state=42)

In [33]:
len(dataset)

11166

In [35]:
dataset.shape

(11166, 41)

In [13]:
downsampling = True
if downsampling:
    sepsis_groups = dataset.groupby(level="patient_id")["SepsisLabel"].max()
    patients_sepsis = sepsis_groups[sepsis_groups == 1].index
    patients_no_sepsis = sepsis_groups[sepsis_groups == 0].index
    min_size = len(patients_sepsis)
    sampled_no_sepsis = np.random.choice(patients_no_sepsis, min_size, replace=False)
    downsampled_dataset = dataset.loc[np.concatenate([patients_sepsis, sampled_no_sepsis])]
    print("Dataset after downsampling: ", downsampled_dataset.shape)

Dataset after downsampling:  (279625, 41)


In [14]:
columns_to_linearly_interpolate = [
    'HR', 'O2Sat', 'SBP', 'MAP', 'DBP', 'Resp'
]

# Linear Interpolation
print("Linearly interpolating:")
for col in columns_to_linearly_interpolate:
    if col != 'SepsisLabel':  # Ensure we do not interpolate 'SepsisLabel'
        dataset = impute_linear_interpolation(dataset, col)
        print(col)

Linearly interpolating:
HR
O2Sat
SBP
MAP
DBP
Resp


In [None]:
# Linear Interpolation
print("Linearly interpolating:")
for col in columns_to_linearly_interpolate:
    if col != 'SepsisLabel':  # Ensure we do not interpolate 'SepsisLabel'
        downsampled_dataset = impute_linear_interpolation(downsampled_dataset, col)
        print(col)

In [15]:
def add_nan_indicators(df):
    for column in df.columns:
        df[column + '_nan'] = df[column].isna().astype(int)
    return df

In [20]:
X = add_nan_indicators(dataset)
X_downsampled = add_nan_indicators(downsampled_dataset)

# Engineered
XX, y_engineered = preprecess_data(X)
new_feature_names = [f"new_feature_{i}" for i in range(XX.shape[1])]
XX_df = pd.DataFrame(XX, columns=new_feature_names, index=X.index)
X_engineered = pd.concat([X, XX_df], axis=1)

# Engineered downsampled
XX, y_engineered = preprecess_data(X_downsampled)
new_feature_names = [f"new_feature_{i}" for i in range(XX.shape[1])]
XX_df = pd.DataFrame(XX, columns=new_feature_names, index=X.index)
X_downsampled_engineered = pd.concat([X, XX_df], axis=1)

# Base
X = dataset.drop('SepsisLabel', axis=1) 
y = dataset['SepsisLabel']

KeyboardInterrupt: 

In [None]:
# just in case
dataset *= 0

print("Seeing if there are still any nan values or +/- infinities")
# Just trying to fix some errors I got only on a GPU
# if X.isin([np.nan, np.inf, -np.inf]).any().any():
#     print("Data contains NaN or infinite values. Handling...")
#     X.replace([np.inf, -np.inf], np.nan, inplace=True)
#     X.fillna(method='ffill', inplace=True)
if X.isin([np.nan, np.inf, -np.inf]).any().any():
    print("Data contains NaN or infinite values. Handling...")
    # Replace infinite values with NaN so they can be filled too
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # First apply forward fill
    X.fillna(method='ffill', inplace=True)
    # Then apply backward fill for any remaining NaNs
    X.fillna(method='bfill', inplace=True)

# Ensure no NaNs or infinities in the target variable as well
if y.isin([np.nan, np.inf, -np.inf]).any():
    print("Target contains NaN or infinite values. Handling...")
    y.replace([np.inf, -np.inf], np.nan, inplace=True)
    y.fillna(method='ffill', inplace=True)

In [None]:
# Find the maximum sequence length for padding
# Yes it's really high, 336, consider making it larger to accommodate actual test set
max_length = X.groupby('patient_id').size().max()

print("Max length (inputs will be padded to): ", max_length)

patient_ids = X.index.get_level_values('patient_id').unique()
train_ids, val_ids = train_test_split(patient_ids, test_size=0.2, random_state=42)

In [None]:
np.shape(dataset)

In [None]:
np.shape(X)

In [None]:
np.shape(XX)

In [None]:
np.shape(y)