In [53]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os

import pandas as pd

# Define the file path
data_path = "D:/Datathon 3/Datathon3_GOSSIS_MIT.csv"
dictionary_path = "D:/Datathon 3/Datathon3_GOSSIS_MIT_Data Dictionary.csv"
# Load the dataset
df = pd.read_csv(data_path)

# load the data dictionary 
dictionary_df = pd.read_csv(dictionary_path)


# Confirm successful load
print("Cleaned dataset loaded successfully.")
print(f"Dataset Shape: {df.shape}")  # Check rows & columns
print("\nData Preview:")
print(df.head())  # Display first 5 rows

# Check data types and missing values
print("\nData Info:")
print(df.info())

# Confirm successful load
print("Cleaned dataset loaded successfully.")
print(f"Dictionary Shape: {dictionary_df.shape}")  # Check rows & columns
print("\nData Preview:")
print(dictionary_df.head())  # Display first 5 rows

# Check data types and missing values
print("\nData Info:")
print(dictionary_df.info())

Cleaned dataset loaded successfully.
Dataset Shape: (91713, 186)

Data Preview:
   encounter_id  patient_id  hospital_id  hospital_death   age    bmi  \
0         66154       25312          118               0  68.0  22.73   
1        114252       59342           81               0  77.0  27.42   
2        119783       50777          118               0  25.0  31.95   
3         79267       46918          118               0  81.0  22.64   
4         92056       34377           33               0  19.0    NaN   

   elective_surgery  ethnicity gender  height  ... aids cirrhosis  \
0                 0  Caucasian      M   180.3  ...  0.0       0.0   
1                 0  Caucasian      F   160.0  ...  0.0       0.0   
2                 0  Caucasian      F   172.7  ...  0.0       0.0   
3                 1  Caucasian      F   165.1  ...  0.0       0.0   
4                 0  Caucasian      M   188.0  ...  0.0       0.0   

   diabetes_mellitus hepatic_failure immunosuppression  leukemia  

In [55]:
unique_vals = dictionary_df['Category'].unique()
print(unique_vals)


['identifier' 'demographic' 'APACHE covariate' 'vitals' 'labs'
 'labs blood gas' 'APACHE prediction' 'APACHE comorbidity'
 'APACHE grouping' 'GOSSIS example prediction']


In [57]:
# Keep all categories of coulmns except APACHE 'APACHE prediction' 'APACHE comorbidity'
# 'APACHE grouping' 'GOSSIS example prediction'

categories_to_keep = ["identifier","demographic", "vitals","labs","labs blood gas"]

# Get all columns that have categories not in categories_to_keep
cols_to_drop = dictionary_df.loc[
    ~dictionary_df["Category"].isin(categories_to_keep),  # "~" means "NOT in"
    "Variable Name"
].tolist()

df = df.drop(columns=cols_to_drop, errors="ignore")
print("df shape after keep:", df.shape)


df shape after keep: (91713, 146)


In [59]:
 
print("\nData Preview:")
print(df.head())  # Display first 5 rows

# Check data types and missing values
print("\nData Info:")
print(df.info())


Data Preview:
   encounter_id  patient_id  hospital_id  hospital_death   age    bmi  \
0         66154       25312          118               0  68.0  22.73   
1        114252       59342           81               0  77.0  27.42   
2        119783       50777          118               0  25.0  31.95   
3         79267       46918          118               0  81.0  22.64   
4         92056       34377           33               0  19.0    NaN   

   elective_surgery  ethnicity gender  height  ... d1_pao2fio2ratio_max  \
0                 0  Caucasian      M   180.3  ...                  NaN   
1                 0  Caucasian      F   160.0  ...                 54.8   
2                 0  Caucasian      F   172.7  ...                  NaN   
3                 1  Caucasian      F   165.1  ...                342.5   
4                 0  Caucasian      M   188.0  ...                  NaN   

  d1_pao2fio2ratio_min  h1_arterial_pco2_max h1_arterial_pco2_min  \
0                  NaN    

In [61]:
import pandas as pd
import numpy as np
import torch as t
import torch.nn as nn
from torch.nn.functional import sigmoid, tanh
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# For building a quick neural network using nn.Sequential
from torch.nn import Linear, Sequential, Tanh, Sigmoid, Dropout

import matplotlib.pyplot as plt


In [63]:
# 1. Keep numeric features only (excluding the target)
numeric_cols = [col for col in df.columns
                if col != target_col
                and pd.api.types.is_numeric_dtype(df[col])]

# 2. Build X and Y
X_data = df[numeric_cols].to_numpy()
Y_data = df[target_col].to_numpy().astype(np.float32)

print("Number of numeric features:", len(numeric_cols))
print("Example numeric columns:", numeric_cols[:10])


Number of numeric features: 139
Example numeric columns: ['encounter_id', 'patient_id', 'hospital_id', 'age', 'bmi', 'elective_surgery', 'height', 'icu_id', 'pre_icu_los_days', 'readmission_status']


In [65]:
# 3. Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data,
    Y_data,
    test_size=0.3,
    random_state=42,
    stratify=Y_data  # Keep the same 0/1 ratio in train/test
)

# 4. Scale numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [67]:
X_train_t = t.tensor(X_train, dtype=t.float32)
Y_train_t = t.tensor(Y_train, dtype=t.float32)

X_test_t = t.tensor(X_test, dtype=t.float32)
Y_test_t = t.tensor(Y_test, dtype=t.float32)


In [69]:
train_dataset = TensorDataset(X_train_t, Y_train_t)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


In [71]:
#  Define your model
input_dim = X_train.shape[1]
hidden_units = 16  # Adjust as needed

model = Sequential(
    Linear(input_dim, hidden_units),
    Tanh(),        # or ReLU()
    Dropout(0.2),  # 20% dropout, tune as appropriate
    Linear(hidden_units, 1),
    Sigmoid()      # final activation for binary classification
)

print(model)


Sequential(
  (0): Linear(in_features=139, out_features=16, bias=True)
  (1): Tanh()
  (2): Dropout(p=0.2, inplace=False)
  (3): Linear(in_features=16, out_features=1, bias=True)
  (4): Sigmoid()
)


In [73]:
# S  Define the loss function and optimizer

loss_fn = nn.BCELoss()            # for binary classification, expects outputs in [0,1]
learning_rate = 0.01
optimizer = Adam(model.parameters(), lr=learning_rate)

print("Loss function:", loss_fn)
print("Optimizer:", optimizer)


Loss function: BCELoss()
Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0
)


In [75]:
# Step 3: Set up training parameters and metric lists

epochs = 50  # You can adjust this
train_loss_list = []
train_acc_list = []
val_acc_list = []

print("Set epochs =", epochs)
print("Initialized train_loss_list, train_acc_list, val_acc_list.")


Set epochs = 50
Initialized train_loss_list, train_acc_list, val_acc_list.


In [77]:
# S  Partial Training Loop (First 5 Batches)

model.train()  # set to training mode
epoch_losses = []  # We'll store the per-batch losses temporarily here

for i, (X_batch, Y_batch) in enumerate(train_loader):
    # 1) Forward pass
    probs = model(X_batch).view(-1)  # shape: (batch_size,)

    # 2) Compute loss
    loss = loss_fn(probs, Y_batch)

    # 3) Backward & optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # 4) Store batch loss
    epoch_losses.append(loss.item())

    # Just process the first 5 batches, then stop
    if i == 4:
        break

print(f"Processed 5 batches. Collected {len(epoch_losses)} losses.")
print("Losses from these batches:", epoch_losses)


RuntimeError: all elements of input should be between 0 and 1

In [79]:
#1) Rerun the Partial Loop with Diagnostics 
model.train()  # ensure we are in training mode
epoch_losses = []

# We'll do 5 batches again, but with the debug prints.
for i, (X_batch, Y_batch) in enumerate(train_loader):
    # 1) Forward pass
    probs = model(X_batch).view(-1)

    # 1a) DEBUG: Print ranges
    print(f"\n=== Batch {i} ===")
    print("Model output range -> min:", probs.min().item(), "max:", probs.max().item())
    print("Target unique values:", Y_batch.unique())

    # 2) Compute loss
    loss = loss_fn(probs, Y_batch)  # <-- might throw error if out of range

    # 3) Backward & optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    epoch_losses.append(loss.item())

    # Stop after 5 batches
    if i == 4:
        break

print(f"\nCollected {len(epoch_losses)} losses:", epoch_losses)



=== Batch 0 ===
Model output range -> min: nan max: nan
Target unique values: tensor([0.])


RuntimeError: all elements of input should be between 0 and 1

In [81]:
# Check if there are any NaNs in X_train or X_test
import numpy as np

nan_train = np.isnan(X_train)
nan_test = np.isnan(X_test)

print("Total NaNs in X_train:", nan_train.sum())
print("Total NaNs in X_test:", nan_test.sum())

# If you want to see which columns have NaNs:
cols_with_nan = np.where(np.isnan(X_train).any(axis=0))[0]
print("Columns with NaNs in X_train:", cols_with_nan) 


Total NaNs in X_train: 3536117
Total NaNs in X_test: 1515142
Columns with NaNs in X_train: [  3   4   6  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24
  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42
  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60
  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78
  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96
  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
 133 134 135 136 137 138]


In [83]:
from sklearn.impute import SimpleImputer

# Step A1: Create an imputer that replaces NaN with the mean of each column
imputer = SimpleImputer(strategy='median')

# Step A2: Fit imputer on X_train (and transform X_train)
X_train_imputed = imputer.fit_transform(X_train)

# Step A3: Transform X_test using the same statistics
X_test_imputed = imputer.transform(X_test)

print("Done mean imputation on X_train, X_test.")


Done mean imputation on X_train, X_test.


In [99]:
X_train = X_train_imputed
X_test = X_test_imputed


In [101]:
import numpy as np

print("NaNs in X_train_imputed:", np.isnan(X_train_imputed).sum())
print("NaNs in X_test_imputed:", np.isnan(X_test_imputed).sum())


NaNs in X_train_imputed: 0
NaNs in X_test_imputed: 0


In [103]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [89]:
# Step D.1: Convert data to PyTorch tensors and build a DataLoader

import torch

# Convert arrays to float32 tensors
X_train_t = torch.tensor(X_train, dtype=torch.float32)
Y_train_t = torch.tensor(Y_train, dtype=torch.float32)

X_test_t = torch.tensor(X_test, dtype=torch.float32)
Y_test_t = torch.tensor(Y_test, dtype=torch.float32)

# Create a TensorDataset for training data
train_dataset = TensorDataset(X_train_t, Y_train_t)

# Define batch size and DataLoader
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

print("Step D.1 done: Tensors & train_loader created.")
print("X_train_t shape:", X_train_t.shape)
print("Y_train_t shape:", Y_train_t.shape)


Step D.1 done: Tensors & train_loader created.
X_train_t shape: torch.Size([64199, 139])
Y_train_t shape: torch.Size([64199])


In [105]:
print("NaNs in final X_train:", np.isnan(X_train).sum())
# This must be 0


NaNs in final X_train: 0


In [107]:
# Suppose X_train_imputed is your fully imputed data
stds = X_train_imputed.std(axis=0)
zero_var_cols = np.where(stds == 0)[0]
print("Zero variance columns:", zero_var_cols)

# If there are any zero-variance columns, drop them:
X_train_imputed = np.delete(X_train_imputed, zero_var_cols, axis=1)
X_test_imputed = np.delete(X_test_imputed, zero_var_cols, axis=1)


Zero variance columns: [9]


In [109]:
# Final check
print("NaN in X_train after impute & scale:", np.isnan(X_train).sum())
print("NaN in X_test after impute & scale:", np.isnan(X_test).sum())


NaN in X_train after impute & scale: 0
NaN in X_test after impute & scale: 0


In [111]:
# Step D.2: Define or re-initialize the model, loss, and optimizer

input_dim = X_train_t.shape[1]  # number of features
hidden_units = 16               # you can tweak this

model = Sequential(
    Linear(input_dim, hidden_units),
    Tanh(),
    Dropout(0.2),
    Linear(hidden_units, 1),
    Sigmoid()
)

print("Model redefined:\n", model)

# Define BCELoss for binary classification
loss_fn = nn.BCELoss()

# Choose an optimizer, e.g. Adam
learning_rate = 0.01
optimizer = Adam(model.parameters(), lr=learning_rate)

print("\nLoss function:", loss_fn)
print("Optimizer:", optimizer)


Model redefined:
 Sequential(
  (0): Linear(in_features=139, out_features=16, bias=True)
  (1): Tanh()
  (2): Dropout(p=0.2, inplace=False)
  (3): Linear(in_features=16, out_features=1, bias=True)
  (4): Sigmoid()
)

Loss function: BCELoss()
Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0
)


In [113]:
# Step D.3: Partial training loop (first 5 batches)

model.train()  # set to training mode
epoch_losses = []

for i, (X_batch, Y_batch) in enumerate(train_loader):
    # 1) Forward pass
    probs = model(X_batch).view(-1)  # shape: (batch_size,)

    # 2) Compute loss
    loss = loss_fn(probs, Y_batch)

    # 3) Zero gradients, backward pass, update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # 4) Store batch loss
    epoch_losses.append(loss.item())

    # Just do the first 5 batches
    if i == 4:
        break

print(f"Processed {len(epoch_losses)} batches. Losses:\n", epoch_losses)


RuntimeError: all elements of input should be between 0 and 1

In [97]:
# Step D.4: Forward-pass debug snippet (no loss computation)

model.train()  # or model.eval(), doesn't matter for this quick check
for i, (X_batch, Y_batch) in enumerate(train_loader):
    with torch.no_grad():
        # Check the input batch stats
        print(f"\n=== Debugging batch {i} ===")
        print("X_batch min:", X_batch.min().item(), 
              "X_batch max:", X_batch.max().item(), 
              "X_batch has_nan:", X_batch.isnan().any().item())
        
        # Forward pass
        out = model(X_batch)
        
        # Check the model output stats
        print("Model output shape:", out.shape)
        print("Model output min:", out.min().item(), 
              "max:", out.max().item(), 
              "has_nan:", out.isnan().any().item())
        
        # Check the target
        print("Y_batch unique:", Y_batch.unique())
        
    break  # stop after first batch




=== Debugging batch 0 ===
X_batch min: nan X_batch max: nan X_batch has_nan: True
Model output shape: torch.Size([32, 1])
Model output min: nan max: nan has_nan: True
Y_batch unique: tensor([0., 1.])
