In [51]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os

import pandas as pd

# Define the file path
data_path = "D:/Datathon 3/Datathon3_GOSSIS_MIT.csv"
dictionary_path = "D:/Datathon 3/Datathon3_GOSSIS_MIT_Data Dictionary.csv"
# Load the dataset
df = pd.read_csv(data_path)

# load the data dictionary 
dictionary_df = pd.read_csv(dictionary_path)


# Confirm successful load
print("Cleaned dataset loaded successfully.")
print(f"Dataset Shape: {df.shape}")  # Check rows & columns
print("\nData Preview:")
print(df.head())  # Display first 5 rows

# Check data types and missing values
print("\nData Info:")
print(df.info())

# Confirm successful load
print("Cleaned dataset loaded successfully.")
print(f"Dictionary Shape: {dictionary_df.shape}")  # Check rows & columns
print("\nData Preview:")
print(dictionary_df.head())  # Display first 5 rows

# Check data types and missing values
print("\nData Info:")
print(dictionary_df.info())

Cleaned dataset loaded successfully.
Dataset Shape: (91713, 186)

Data Preview:
   encounter_id  patient_id  hospital_id  hospital_death   age    bmi  \
0         66154       25312          118               0  68.0  22.73   
1        114252       59342           81               0  77.0  27.42   
2        119783       50777          118               0  25.0  31.95   
3         79267       46918          118               0  81.0  22.64   
4         92056       34377           33               0  19.0    NaN   

   elective_surgery  ethnicity gender  height  ... aids cirrhosis  \
0                 0  Caucasian      M   180.3  ...  0.0       0.0   
1                 0  Caucasian      F   160.0  ...  0.0       0.0   
2                 0  Caucasian      F   172.7  ...  0.0       0.0   
3                 1  Caucasian      F   165.1  ...  0.0       0.0   
4                 0  Caucasian      M   188.0  ...  0.0       0.0   

   diabetes_mellitus hepatic_failure immunosuppression  leukemia  

In [53]:
# Keep all categories of coulmns except APACHE 'APACHE prediction' 'APACHE comorbidity'
# 'APACHE grouping' 'GOSSIS example prediction'

categories_to_keep = ["identifier","demographic", "vitals","labs","labs blood gas"]

# Get all columns that have categories not in categories_to_keep
cols_to_drop = dictionary_df.loc[
    ~dictionary_df["Category"].isin(categories_to_keep),  # "~" means "NOT in"
    "Variable Name"
].tolist()

df = df.drop(columns=cols_to_drop, errors="ignore")
print("df shape after keep:", df.shape)


df shape after keep: (91713, 146)


In [41]:
# Numeric Pipeline Step 1: Identify numeric columns (excluding target)

target_col = 'hospital_death'  # Adjust if your outcome is named differently

numeric_cols = []
for col in df.columns:
    # We'll keep only columns that are numeric (float/int)
    # and are not the target column
    if col != target_col and pd.api.types.is_numeric_dtype(df[col]):
        numeric_cols.append(col)

print("Total numeric columns (excluding target):", len(numeric_cols))
print("First 10 numeric columns:", numeric_cols[:10])


# Now let's see how many NaNs each numeric column has
na_counts = df[numeric_cols].isna().sum().sort_values(ascending=False)
print("\nTop 10 columns with the most NaNs:\n", na_counts.head(10))


Total numeric columns (excluding target): 139

Top 10 columns with the most NaNs:
 h1_bilirubin_min        84619
h1_bilirubin_max        84619
h1_lactate_max          84369
h1_lactate_min          84369
h1_albumin_min          83824
h1_albumin_max          83824
h1_pao2fio2ratio_min    80195
h1_pao2fio2ratio_max    80195
h1_arterial_ph_max      76424
h1_arterial_ph_min      76424
dtype: int64


In [55]:
# Numeric Pipeline Step 2: Drop columns with >80% missingness

nrows = len(df)
threshold = 0.8  # i.e., 80%
cols_to_drop = []

for col in numeric_cols:
    missing_frac = df[col].isna().mean()
    if missing_frac > threshold:
        cols_to_drop.append(col)

print("Columns exceeding 80% missingness:\n", cols_to_drop)

df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# Update numeric_cols to remove dropped columns
numeric_cols = [c for c in numeric_cols if c not in cols_to_drop]

print(f"\nAfter dropping high-NaN columns, numeric_cols count = {len(numeric_cols)}")


Columns exceeding 80% missingness:
 []

After dropping high-NaN columns, numeric_cols count = 104


In [57]:
# Numeric Pipeline Step 3: Mean-impute remaining NaNs

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')

df_numeric = df[numeric_cols]  # Sub-DataFrame of numeric features

# Fit & transform for imputation
df_numeric_imputed = imputer.fit_transform(df_numeric)

# Convert back to DataFrame with original column names
df_numeric_imputed = pd.DataFrame(df_numeric_imputed, columns=numeric_cols, index=df.index)

# Check how many NaNs remain
na_count = df_numeric_imputed.isna().sum().sum()
print("After mean imputation, total NaNs in numeric DataFrame:", na_count)

# Update your main df
df[numeric_cols] = df_numeric_imputed

print("Step: Mean imputation done. No more NaNs in numeric columns.")


After mean imputation, total NaNs in numeric DataFrame: 0
Step: Mean imputation done. No more NaNs in numeric columns.


In [47]:
unique_vals = dictionary_df['Category'].unique()
print(unique_vals)


['identifier' 'demographic' 'APACHE covariate' 'vitals' 'labs'
 'labs blood gas' 'APACHE prediction' 'APACHE comorbidity'
 'APACHE grouping' 'GOSSIS example prediction']


In [61]:
# Numeric Pipeline Step 4: Drop zero-variance columns

df_numeric_imputed = df[numeric_cols]  # The numeric features after imputation

stds = df_numeric_imputed.std(axis=0)
zero_var_cols = stds[stds == 0].index.tolist()

print("Zero-variance columns:", zero_var_cols)

if zero_var_cols:
    df.drop(columns=zero_var_cols, inplace=True)
    numeric_cols = [c for c in numeric_cols if c not in zero_var_cols]

print("\nAfter removing zero-variance cols, numeric_cols count =", len(numeric_cols))


Zero-variance columns: []

After removing zero-variance cols, numeric_cols count = 104


In [63]:
 
print("\nData Preview:")
print(df.head())  # Display first 5 rows

# Check data types and missing values
print("\nData Info:")
print(df.info())


Data Preview:
   encounter_id  patient_id  hospital_id  hospital_death   age        bmi  \
0       66154.0     25312.0        118.0               0  68.0  22.730000   
1      114252.0     59342.0         81.0               0  77.0  27.420000   
2      119783.0     50777.0        118.0               0  25.0  31.950000   
3       79267.0     46918.0        118.0               0  81.0  22.640000   
4       92056.0     34377.0         33.0               0  19.0  29.185818   

   elective_surgery  ethnicity gender  height  ... d1_pao2fio2ratio_max  \
0               0.0  Caucasian      M   180.3  ...           285.667079   
1               0.0  Caucasian      F   160.0  ...            54.800000   
2               0.0  Caucasian      F   172.7  ...           285.667079   
3               1.0  Caucasian      F   165.1  ...           342.500000   
4               0.0  Caucasian      M   188.0  ...           285.667079   

  d1_pao2fio2ratio_min  h1_arterial_pco2_max h1_arterial_pco2_min  \
0 

In [67]:
# Numeric Pipeline Step 5: Split into X, y, then train-test split

from sklearn.model_selection import train_test_split

target_col = 'hospital_death'  # Adjust if needed

# 1) Build X, y from the DataFrame
X = df[numeric_cols].to_numpy()
y = df[target_col].to_numpy().astype('float32')

print("X shape:", X.shape)
print("y shape:", y.shape)

# 2) Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,      # 30% test set
    random_state=42,
    stratify=y
)

print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_test shape:",  X_test.shape,  "y_test shape:",  y_test.shape)



X shape: (91713, 104)
y shape: (91713,)
X_train shape: (64199, 104) y_train shape: (64199,)
X_test shape: (27514, 104) y_test shape: (27514,)


In [69]:
# Numeric Pipeline Step 6: Scale X_train, X_test

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Done scaling. Checking for any NaNs in scaled sets...")
print("NaNs in X_train_scaled:", np.isnan(X_train_scaled).sum())
print("NaNs in X_test_scaled:", np.isnan(X_test_scaled).sum())

# (Optional) Overwrite X_train, X_test with scaled versions
X_train = X_train_scaled
X_test = X_test_scaled

print("Scaling complete. X_train, X_test updated.")


Done scaling. Checking for any NaNs in scaled sets...
NaNs in X_train_scaled: 0
NaNs in X_test_scaled: 0
Scaling complete. X_train, X_test updated.


In [71]:
# Step 7: Convert scaled data to PyTorch tensors & build DataLoader

import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert numpy arrays to float32 tensors
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32)

X_test_t = torch.tensor(X_test, dtype=torch.float32)
y_test_t = torch.tensor(y_test, dtype=torch.float32)

# Create a TensorDataset for the training set
train_dataset = TensorDataset(X_train_t, y_train_t)

# Define a batch size and create a DataLoader
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

print("Shapes:")
print("X_train_t:", X_train_t.shape, "y_train_t:", y_train_t.shape)
print("X_test_t:", X_test_t.shape, "y_test_t:", y_test_t.shape)
print("DataLoader created with batch_size =", batch_size)


Shapes:
X_train_t: torch.Size([64199, 104]) y_train_t: torch.Size([64199])
X_test_t: torch.Size([27514, 104]) y_test_t: torch.Size([27514])
DataLoader created with batch_size = 32


In [79]:
# Step 8: Define model, loss, and optimizer
# (Re)define the model, loss, optimizer
import torch.nn as nn
from torch.nn import Sequential, Linear, Tanh, Dropout, Sigmoid
from torch.optim import Adam

input_dim = X_train_t.shape[1]
hidden_units = 16

model = Sequential(
    Linear(input_dim, hidden_units),
    Tanh(),
    Dropout(0.2),
    Linear(hidden_units, 1),
    Sigmoid()
)

loss_fn = nn.BCELoss()
learning_rate = 0.01
optimizer = Adam(model.parameters(), lr=learning_rate)

print("Model, loss, optimizer re-initialized.")

# Now do the partial training loop on the first 5 batches
model.train()
epoch_losses = []

for i, (X_batch, Y_batch) in enumerate(train_loader):
    probs = model(X_batch).view(-1)
    loss = loss_fn(probs, Y_batch)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    epoch_losses.append(loss.item())

    if i == 4:
        break

print(f"Processed {len(epoch_losses)} batches. Partial losses:\n", epoch_losses)



Model, loss, optimizer re-initialized.
Processed 5 batches. Partial losses:
 [0.6956530213356018, 0.68333500623703, 0.7190823554992676, 0.6102806329727173, 0.6172333359718323]


In [83]:
# Step 10: Full training loop for multiple epochs

epochs = 5  # you can increase this as needed
train_loss_history = []

for epoch in range(epochs):
    model.train()
    batch_losses = []

    for X_batch, Y_batch in train_loader:
        # Forward
        probs = model(X_batch).view(-1)
        loss = loss_fn(probs, Y_batch)

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Average loss for this epoch
    avg_loss = sum(batch_losses) / len(batch_losses)
    train_loss_history.append(avg_loss)

    print(f"Epoch [{epoch+1}/{epochs}] - Avg training loss: {avg_loss:.4f}")



Epoch [1/5] - Avg training loss: 0.2343
Epoch [2/5] - Avg training loss: 0.2236
Epoch [3/5] - Avg training loss: 0.2224
Epoch [4/5] - Avg training loss: 0.2222
Epoch [5/5] - Avg training loss: 0.2210


In [85]:
# Evaluating Accuracy and Loss on the Test Set
model.eval()  # set to evaluation mode
with torch.no_grad():
    # Forward pass on entire X_test
    test_probs = model(X_test_t).view(-1)
    
    # If test_probs is in [0,1], we can do a threshold at 0.5
    test_preds = (test_probs >= 0.5).float()
    
    # Compute binary accuracy
    correct = (test_preds == y_test_t).sum().item()
    total = y_test_t.shape[0]
    test_accuracy = correct / total
    
    # If we want to compute the average loss
    test_loss = loss_fn(test_probs, y_test_t).item()
    
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")


Test Accuracy: 0.9226
Test Loss: 0.2173


In [87]:
# Generate Additional Metrics (Precision, Recall, F1)
from sklearn.metrics import classification_report, confusion_matrix

# Convert PyTorch tensors to NumPy for scikit-learn
test_preds_np = test_preds.numpy()
y_test_np = y_test_t.numpy()

print("Confusion Matrix:")
print(confusion_matrix(y_test_np, test_preds_np))

print("\nClassification Report:")
print(classification_report(y_test_np, test_preds_np, target_names=['Survived', 'Died']))


Confusion Matrix:
[[24965   174]
 [ 1955   420]]

Classification Report:
              precision    recall  f1-score   support

    Survived       0.93      0.99      0.96     25139
        Died       0.71      0.18      0.28      2375

    accuracy                           0.92     27514
   macro avg       0.82      0.58      0.62     27514
weighted avg       0.91      0.92      0.90     27514



# Results
the overall accuracy is high (about 92%), the model is missing many of the “Died” cases:

Recall for “Died” is only 0.18. This indicates the model correctly identifies only 18% of actual deaths.
Precision for “Died” is 0.71, meaning when it does predict death, it's often correct—but it simply doesn't predict death very often.
This imbalance between high performance on the majority class (Survived) and poor performance on the minority class (Died) is common in medical classification tasks where one outcome is rarer.

In [89]:
#XGB Next 
# Step XGB.1

# If needed  
# !pip install xgboost

import xgboost as xgb

print("xgboost imported successfully.")


xgboost imported successfully.


In [91]:
# Step XGB.2: Define and train an XGBClassifier

from xgboost import XGBClassifier

# Create an XGBClassifier instance
# You can tune hyperparameters like n_estimators, max_depth, learning_rate, etc.
xgb_clf = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
)

# Fit on the training data
xgb_clf.fit(X_train, y_train)

print("XGBClassifier trained successfully.")


XGBClassifier trained successfully.


In [93]:
# Step XGB.3: Evaluate XGBClassifier

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the test set
xgb_test_preds = xgb_clf.predict(X_test)

# Compute accuracy
xgb_accuracy = accuracy_score(y_test, xgb_test_preds)

print(f"XGBClassifier Test Accuracy: {xgb_accuracy:.4f}")

# Confusion Matrix
cm = confusion_matrix(y_test, xgb_test_preds)
print("\nConfusion Matrix:")
print(cm)

# Classification Report


XGBClassifier Test Accuracy: 0.9290

Confusion Matrix:
[[24864   275]
 [ 1678   697]]


In [95]:
from sklearn.metrics import classification_report

print("\nClassification Report:")
print(classification_report(y_test, xgb_test_preds, target_names=["Survived", "Died"]))



Classification Report:
              precision    recall  f1-score   support

    Survived       0.94      0.99      0.96     25139
        Died       0.72      0.29      0.42      2375

    accuracy                           0.93     27514
   macro avg       0.83      0.64      0.69     27514
weighted avg       0.92      0.93      0.92     27514



#XGBoost model has boosted the recall for the “Died” class to 29% (up from 18% in the neural network), though it is still missing many positive cases.

In [97]:
# Adjust class imbalance
from xgboost import XGBClassifier

negative_count = (y_train == 0).sum()
positive_count = (y_train == 1).sum()
scale_pos_weight = negative_count / positive_count

xgb_clf = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    scale_pos_weight=scale_pos_weight
)
xgb_clf.fit(X_train, y_train)


In [99]:
#threshold tuning 
import numpy as np
probs = xgb_clf.predict_proba(X_test)[:,1]  # probabilities for class=1
threshold = 0.3  # e.g., a lower threshold

preds_custom = (probs >= threshold).astype('float32')

# Evaluate
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, preds_custom))
print(classification_report(y_test, preds_custom, target_names=['Survived','Died']))


[[17447  7692]
 [  308  2067]]
              precision    recall  f1-score   support

    Survived       0.98      0.69      0.81     25139
        Died       0.21      0.87      0.34      2375

    accuracy                           0.71     27514
   macro avg       0.60      0.78      0.58     27514
weighted avg       0.92      0.71      0.77     27514



In [103]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [4,6,8],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100,200],
    'scale_pos_weight': [scale_pos_weight, scale_pos_weight*1.5]
}
xgb_clf = XGBClassifier(random_state=42)
grid = GridSearchCV(xgb_clf, param_grid, scoring='f1', cv=3, n_jobs=-1)
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)



Best params: {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 200, 'scale_pos_weight': 10.58826714801444}


Model	Accuracy	Survived_Precision	Survived_Recall	Survived_F1	Died_Precision	Died_Recall	Died_F1
XGB	    0.93	0.94	0.99	0.96	0.72	0.29	0.42
XGB2	0.71	0.98	0.69	0.81	0.21	0.87	0.34
DL NN	0.92	0.93	0.99	0.96	0.71	0.18	0.28