In [1]:
# Get the same number of sepsis label = 0 as there are sepsis label =  1

In [2]:
import torch
import sklearn as sk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import xgboost
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.model_selection import train_test_split



In [3]:
import utils.get_data as get_data
from utils.impute_methods import impute_linear_interpolation
from utils.feature_engineering import preprecess_data

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using {device}.")

Using cuda:0.


In [5]:
cmap = plt.get_cmap("viridis")

# Data Loading

In [6]:
class PatientDataset(Dataset):
    def __init__(self, patient_ids, X, y, max_length, device):
        self.patient_ids = patient_ids
        self.device = device
        self.max_length = max_length
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.patient_ids)
    
    def __getitem__(self, index):
        pid = self.patient_ids[index]
        patient_data = self.X.loc[pid]
        X_train, seq_length = prepare_patient_data(patient_data, self.max_length)
        y_train = torch.tensor(self.y.loc[pid].values, dtype=torch.float32)

        # Ensure y_train is appropriately padded or trimmed to match X_train's length
        if len(y_train) > self.max_length:
            y_train = y_train[:self.max_length]
        elif len(y_train) < self.max_length:
            y_train = pad(torch.tensor(y_train, dtype=torch.float32), (0, self.max_length - len(y_train)), value=0)
        
        return X_train, y_train, len(y_train)
        # return X_train, y_train, seq_length


def prepare_patient_data(patient_data, max_length): 
        # Standardizing the data
        scaler = StandardScaler()
        features = scaler.fit_transform(patient_data)
        # Padding
        padded_features = np.zeros((max_length, features.shape[1]))
        sequence_length = min(max_length, features.shape[0])
        padded_features[:sequence_length] = features[:sequence_length]
        return torch.tensor(padded_features, dtype=torch.float32), sequence_length

In [7]:
dataset_raw, patient_id_map = get_data.get_dataset()

   20337
   40337
Dataset loaded into a MultiIndex DataFrame.


In [8]:
sepsis_groups = dataset_raw.groupby(level="patient_id")["SepsisLabel"].max()
patients_sepsis = sepsis_groups[sepsis_groups == 1].index
patients_no_sepsis = sepsis_groups[sepsis_groups == 0].index
patients_no_sepsis_sample = sepsis_groups.loc[patients_no_sepsis].sample(n=len(patients_sepsis)).index

In [9]:
indices = patients_no_sepsis_sample.append(patients_sepsis)

# Get the equal number of sepsis and non-sepsis patients and shuffle
dataset = dataset_raw.loc[indices].sample(frac=1)

# Check that there is equal number of sepsis and non-sepsis patients
dataset.groupby(level="patient_id")["SepsisLabel"].max().value_counts()

SepsisLabel
0    2932
1    2932
Name: count, dtype: int64

In [10]:
# subset_proportion = 0.2
# num_patients = len(dataset.index.levels[0])
# subset_size = int(subset_proportion * num_patients)
# print(f"Test set size: {subset_size} patients")

In [11]:
dataset.shape

(279461, 41)

In [12]:
columns_to_linearly_interpolate = [
    'HR', 'O2Sat', 'SBP', 'MAP', 'DBP', 'Resp'
]

# Linear Interpolation
print("Linearly interpolating:")
for col in columns_to_linearly_interpolate:
    if col != 'SepsisLabel':  # Ensure we do not interpolate 'SepsisLabel'
        dataset = impute_linear_interpolation(dataset, col)
        print(col)

Linearly interpolating:
HR
O2Sat
SBP
MAP
DBP
Resp


In [14]:
def add_nan_indicators(df):
    for column in df.columns:
        df[column + '_nan'] = df[column].isna().astype(int)
    return df

In [15]:
X = add_nan_indicators(dataset)

# Engineered
XX, y_engineered = preprecess_data(X)
new_feature_names = [f"new_feature_{i}" for i in range(XX.shape[1])]
XX_df = pd.DataFrame(XX, columns=new_feature_names, index=X.index)
X_engineered = pd.concat([X, XX_df], axis=1)

# Base
X = dataset.drop('SepsisLabel', axis=1) 
y = dataset['SepsisLabel']

In [16]:
if X_engineered.isin([np.nan, np.inf, -np.inf]).any().any():
    print("Data contains NaN or infinite values. Handling...")
    # Replace infinite values with NaN so they can be filled too
    X_engineered.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # First apply forward fill
    X_engineered.fillna(method='ffill', inplace=True)
    # Then apply backward fill for any remaining NaNs
    X_engineered.fillna(method='bfill', inplace=True)

if X.isin([np.nan, np.inf, -np.inf]).any().any():
    print("Data contains NaN or infinite values. Handling...")
    # Replace infinite values with NaN so they can be filled too
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # First apply forward fill
    X.fillna(method='ffill', inplace=True)
    # Then apply backward fill for any remaining NaNs
    X.fillna(method='bfill', inplace=True)

# Ensure no NaNs or infinities in the target variable as well
if y.isin([np.nan, np.inf, -np.inf]).any():
    print("Target contains NaN or infinite values. Handling...")
    y.replace([np.inf, -np.inf], np.nan, inplace=True)
    y.fillna(method='ffill', inplace=True)

Data contains NaN or infinite values. Handling...
Data contains NaN or infinite values. Handling...


In [31]:
# Find the maximum sequence length for padding
# Yes it's really high, 336, consider making it larger to accommodate actual test set
max_length = X.groupby('patient_id').size().max()
print("Max length (inputs will be padded to): ", max_length)

X_patient_ids = X.index.get_level_values('patient_id').unique()
_, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_engineered_patient_ids = X_engineered.index.get_level_values('patient_id').unique()
_, X_engineered_test, _, y_engineered_test = train_test_split(X_engineered, y, test_size=0.2, random_state=42)

Max length (inputs will be padded to):  336


In [33]:
np.shape(X_test)

(55893, 81)

In [34]:
np.shape(X_engineered_test)

(55893, 301)

In [35]:
np.shape(y_test)

(55893,)

In [36]:
np.shape(y_engineered_test)

(55893,)

# Model evaluation

In [20]:
# Load xgboost
xgb_path = "models/saved/xgboost_model1.mdl"
xgb = xgboost.XGBClassifier()
xgb.load_model(xgb_path)

In [37]:
xgb_pred = xgb.predict(X_test)

ValueError: Feature shape mismatch, expected: 176, got 81

In [None]:
# Load transformers
transformers_dir = os.getcwd() + "/models/transformer"
transformer_paths = [os.path.join(transformers_dir, path) for path in os.listdir(transformers_dir)]