In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from xgboost import XGBClassifier

In [5]:
df.columns

Index(['Hour', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel', 'Patient_ID'],
      dtype='str')

In [4]:
df = pd.read_csv("Dataset.csv")

# Drop useless columns
df = df.drop(columns=["Unnamed: 0", "EtCO2"], errors="ignore")

# Remove rows with missing structure
df = df.dropna(subset=["Patient_ID", "SepsisLabel"])

# Convert types
df["Patient_ID"] = df["Patient_ID"].astype(int)
df["SepsisLabel"] = df["SepsisLabel"].astype(int)

# Sort time-series correctly
df = df.sort_values(["Patient_ID", "Hour"])

print("Shape:", df.shape)

Shape: (546122, 42)


In [7]:
def create_early_label(group, shift=5):
    group = group.copy()
    group["EarlyLabel"] = 0
    
    if group["SepsisLabel"].sum() > 0:
        t = group[group["SepsisLabel"] == 1]["Hour"].min()
        group.loc[
            (group["Hour"] >= t - shift) &
            (group["Hour"] < t),
            "EarlyLabel"
        ] = 1
        
    return group

df = df.groupby("Patient_ID", group_keys=False).apply(create_early_label)

  df = df.groupby("Patient_ID", group_keys=False).apply(create_early_label)


In [8]:
patients = df["Patient_ID"].unique()

train_ids, test_ids = train_test_split(
    patients,
    test_size=0.2,
    random_state=42
)

train_df = df[df["Patient_ID"].isin(train_ids)].copy()
test_df = df[df["Patient_ID"].isin(test_ids)].copy()

print("Train patients:", len(train_ids))
print("Test patients:", len(test_ids))

Train patients: 11245
Test patients: 2812


In [9]:
train_df = (
    train_df
    .groupby("Patient_ID", as_index=False)
    .apply(lambda x: x.ffill().bfill())
    .reset_index(drop=True)
)

test_df = (
    test_df
    .groupby("Patient_ID", as_index=False)
    .apply(lambda x: x.ffill().bfill())
    .reset_index(drop=True)
)

medians = train_df.median(numeric_only=True)

train_df = train_df.fillna(medians)
test_df = test_df.fillna(medians)

  .apply(lambda x: x.ffill().bfill())
  .apply(lambda x: x.ffill().bfill())


In [10]:
df.head()

Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BaseExcess,HCO3,...,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID,EarlyLabel
132940,0,,,,,,,,,,...,,83.14,0.0,,,-0.03,1.0,0,1,0
132941,1,97.0,95.0,,98.0,75.33,,19.0,,,...,,83.14,0.0,,,-0.03,2.0,0,1,0
132942,2,89.0,99.0,,122.0,86.0,,22.0,,,...,,83.14,0.0,,,-0.03,3.0,0,1,0
132943,3,90.0,95.0,,,,,30.0,24.0,,...,,83.14,0.0,,,-0.03,4.0,0,1,0
132944,4,103.0,88.5,,122.0,91.33,,24.5,,,...,,83.14,0.0,,,-0.03,5.0,0,1,0


In [11]:
def add_features(df):
    
    df = df.sort_values(["Patient_ID", "Hour"])
    
    # Shock Index
    df["ShockIndex"] = df["HR"] / df["SBP"]
    
    # Deltas
    df["HR_delta"] = df.groupby("Patient_ID")["HR"].diff()
    df["MAP_delta"] = df.groupby("Patient_ID")["MAP"].diff()
    df["Lactate_delta"] = df.groupby("Patient_ID")["Lactate"].diff()
    
    # Rolling Means
    df["HR_roll3"] = (
        df.groupby("Patient_ID")["HR"]
          .rolling(3)
          .mean()
          .reset_index(level=0, drop=True)
    )
    
    df["MAP_roll3"] = (
        df.groupby("Patient_ID")["MAP"]
          .rolling(3)
          .mean()
          .reset_index(level=0, drop=True)
    )
    
    return df

# train_df = add_features(train_df)
# test_df = add_features(test_df)

In [12]:
def add_temporal_features(df):

    df = df.sort_values(["Patient_ID", "Hour"])

    for col in ["HR", "MAP", "Lactate"]:
        
        df[f"{col}_roll6"] = (
            df.groupby("Patient_ID")[col]
              .rolling(6)
              .mean()
              .reset_index(level=0, drop=True)
        )

        df[f"{col}_std6"] = (
            df.groupby("Patient_ID")[col]
              .rolling(6)
              .std()
              .reset_index(level=0, drop=True)
        )

    return df

train_df = add_temporal_features(train_df)
test_df = add_temporal_features(test_df)

In [13]:
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [14]:
EXCLUDE = ["Patient_ID", "Hour", "SepsisLabel", "EarlyLabel"]

features = [col for col in train_df.columns if col not in EXCLUDE]

print("Total features:", len(features))

Total features: 45


In [15]:
pos = train_df["EarlyLabel"].sum()
neg = len(train_df) - pos

scale_weight = neg / pos

print("Positive:", pos)
print("Negative:", neg)

Positive: 4234
Negative: 435148


In [16]:
model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    scale_pos_weight=scale_weight,
    eval_metric="aucpr",
    random_state=42
)

model.fit(train_df[features], train_df["EarlyLabel"])

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [17]:
probs = model.predict_proba(test_df[features])[:, 1]

auroc = roc_auc_score(test_df["EarlyLabel"], probs)
auprc = average_precision_score(test_df["EarlyLabel"], probs)

print("AUROC:", auroc)
print("AUPRC:", auprc)

AUROC: 0.7739725663338535
AUPRC: 0.038443575601797175


In [18]:
baseline = test_df["EarlyLabel"].mean()
print("Baseline prevalence:", baseline)

Baseline prevalence: 0.009115608019486602


In [45]:
sequence_length = 15

In [46]:
LSTM_FEATURES = [
    "HR",
    "SBP",
    "DBP",
    "MAP",
    "Resp",
    "O2Sat",
    "Temp",
    "Lactate",
    "Creatinine",
    "WBC",
    "Platelets"
]

In [47]:
import numpy as np

def create_sequences(df, features, target, seq_len=6):
    
    X, y = [], []
    
    for pid in df["Patient_ID"].unique():
        
        patient_data = df[df["Patient_ID"] == pid]
        patient_data = patient_data.sort_values("Hour")
        
        values = patient_data[features].values
        labels = patient_data[target].values
        
        for i in range(seq_len, len(patient_data)):
            X.append(values[i-seq_len:i])
            y.append(labels[i])
    
    return np.array(X), np.array(y)

In [48]:
X_train, y_train = create_sequences(train_df, LSTM_FEATURES, "EarlyLabel", seq_len=6)
X_test, y_test = create_sequences(test_df, LSTM_FEATURES, "EarlyLabel", seq_len=6)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (371912, 6, 11)
Test shape: (89868, 6, 11)


In [49]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# reshape to 2D
X_train_2d = X_train.reshape(-1, X_train.shape[-1])
X_test_2d = X_test.reshape(-1, X_test.shape[-1])

scaler.fit(X_train_2d)

X_train_scaled = scaler.transform(X_train_2d).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test_2d).reshape(X_test.shape)

In [50]:
import torch
import torch.nn as nn

class SepsisLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=64):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # take last time step
        out = self.fc(out)
        return self.sigmoid(out)

In [51]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [52]:
import torch
from torch.utils.data import Dataset, DataLoader

class SepsisDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [53]:
train_dataset = SepsisDataset(X_train_scaled, y_train)
test_dataset = SepsisDataset(X_test_scaled, y_test)

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512)

In [54]:
class SepsisLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=64):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

In [58]:
model = SepsisLSTM(input_size=len(LSTM_FEATURES)).to(device)

pos_weight = torch.tensor([neg/pos], dtype=torch.float32).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
# criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [59]:
for epoch in range(30):
    model.train()
    total_loss = 0

    for X_batch, y_batch in train_loader:

        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(X_batch).squeeze()
        loss = criterion(outputs, y_batch)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    print(f"Epoch {epoch}, Loss: {total_loss/len(train_loader)}")

Epoch 0, Loss: 1.2961189801788067
Epoch 1, Loss: 1.2414790394381612
Epoch 2, Loss: 1.197407513733907
Epoch 3, Loss: 1.1479569619426702
Epoch 4, Loss: 1.094899061457491
Epoch 5, Loss: 1.0337656922074754
Epoch 6, Loss: 0.975691263826546
Epoch 7, Loss: 0.915822208296646
Epoch 8, Loss: 0.8569638604884955
Epoch 9, Loss: 0.7986685936191878
Epoch 10, Loss: 0.7449367006182507
Epoch 11, Loss: 0.6947185489793591
Epoch 12, Loss: 0.6449511801360234
Epoch 13, Loss: 0.6081267456732885
Epoch 14, Loss: 0.5620166698299052
Epoch 15, Loss: 0.5242440291415218
Epoch 16, Loss: 0.49709318859049184
Epoch 17, Loss: 0.45778922816582196
Epoch 18, Loss: 0.42997223499552584
Epoch 19, Loss: 0.4075733370966236
Epoch 20, Loss: 0.39898478683530053
Epoch 21, Loss: 0.3531062772710189
Epoch 22, Loss: 0.3463765571337127
Epoch 23, Loss: 0.3214526803449227
Epoch 24, Loss: 0.3119223543444067
Epoch 25, Loss: 0.30219835758291214
Epoch 26, Loss: 0.2683861074864127
Epoch 27, Loss: 0.26378346596328234
Epoch 28, Loss: 0.2606377125

In [60]:
from sklearn.metrics import roc_auc_score, average_precision_score
import torch.nn.functional as F

model.eval()
all_probs = []

with torch.no_grad():
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch).squeeze()
        probs = torch.sigmoid(outputs)
        all_probs.extend(probs.cpu().numpy())

auroc = roc_auc_score(y_test, all_probs)
auprc = average_precision_score(y_test, all_probs)

print("AUROC:", auroc)
print("AUPRC:", auprc)

AUROC: 0.6232559304035981
AUPRC: 0.014687033366751652
