# Women in Data Science Dataset

**Dataset:** Women in Data Science (91,713 encounters)  
**Microskill:** Model Comparison  
**Date:** February 13, 2025  
**Authors:** Jeremy Balch & Mackenzie Meni

### To Do List

- [ ] Upsampling and downsampling and assess performance by category  
- [ ] Debiasing strategies  
- [ ] Switch model architecture (RF, XGBoost, etc.)  
- [ ] Switch feature set (use all features, use subset of features)  
- [ ] Switch evaluation metric (AUROC, Accuracy, etc.)  
- [ ] Bootstrap for confidence intervals 


## Import Libraries

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tableone import TableOne
import warnings
from sklearn.model_selection import train_test_split


import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

warnings.filterwarnings("ignore")


## Load our preprocessed data from notebook '1_data_handling.ipynb'

In [20]:
# Read the preprocessed table produced by '1_data_handling.ipynb' into a DataFrame.
preprocessed_csv_path = '../preprocessed_data.csv'
data = pd.read_csv(preprocessed_csv_path)

## Creating the train test split for the Random Forest model

In [21]:
# Remove two indicator columns before modelling. The exact column names are
# 'gender_M' and 'ethnicity_Other/Unknown'.
columns_to_drop = ['gender_M', 'ethnicity_Other/Unknown']
data = data.drop(columns=columns_to_drop)

# 'hospital_death' is the label; every remaining column is used as a feature.
y = data['hospital_death']
X = data.drop('hospital_death', axis=1)

# Hold out 20% of the rows for testing. The fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each piece of the split.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape:", part.shape)



# Shapes should be:
# X_train shape: (62060, 38)
# X_test shape: (15515, 38)
# y_train shape: (62060,)
# y_test shape: (15515,)


X_train shape: (62060, 38)
X_test shape: (15515, 38)
y_train shape: (62060,)
y_test shape: (15515,)


### Random Forest Model

In [7]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
import joblib

# Build a 100-tree forest with a fixed seed and fit it on the training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class labels for the held-out rows, and the fraction predicted correctly.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Persist the fitted forest to disk.
joblib.dump(rf_model, "../original_data_random_forest_model.pkl")

# AUROC is computed from the second predict_proba column (the score for the
# positive label), not from the hard labels.
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
auroc = roc_auc_score(y_test, y_pred_proba)
print(f"AUROC: {auroc:.2f}")

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Counts of true vs. predicted labels.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.92
AUROC: 0.84
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     14214
           1       0.59      0.21      0.31      1301

    accuracy                           0.92     15515
   macro avg       0.76      0.60      0.63     15515
weighted avg       0.90      0.92      0.90     15515

Confusion Matrix:
[[14027   187]
 [ 1032   269]]


# Training a MLP Model:

What's Inside the Model?
- Three Key Processing Steps:

    - The first layer extracts important patterns from patient data.
    - The second layer refines the information further.
    - The final layer makes the prediction: high risk or low risk.

- Learning with Feedback:
    - The model improves over time by comparing its predictions to actual outcomes and adjusting accordingly.


In [None]:

# Rebuild the label vector and feature matrix from `data`.
y = data['hospital_death']
X = data.drop(columns=['hospital_death'])

# Same 80/20 split settings and seed as the Random Forest cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NOTE(review): feature values are passed to the network exactly as stored in
# `data`; no scaling happens in this cell. Confirm whether the preprocessing
# notebook standardizes them — unscaled inputs can destabilize MLP training.

# Features become float32 tensors.
X_train_tensor, X_test_tensor = (
    torch.tensor(frame.astype(np.float32).values, dtype=torch.float32)
    for frame in (X_train, X_test)
)
# Labels become int64 (long) tensors of class indices.
y_train_tensor, y_test_tensor = (
    torch.tensor(labels.values, dtype=torch.long)
    for labels in (y_train, y_test)
)

# Pair features with labels so they are batched together.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Training batches are reshuffled on each pass; test batches keep row order so
# predictions line up with y_test.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define MLP model
class MLP(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    Layer widths are
    ``input_size -> hidden_size -> hidden_size // 2 -> num_classes``.
    """

    def __init__(self, input_size, hidden_size=64, num_classes=2):
        super().__init__()
        half_width = hidden_size // 2
        # Layers are created in forward order: fc1, fc2, fc3.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, half_width)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(half_width, num_classes)

    def forward(self, x):
        """Return raw class scores (logits) for the batch ``x``; no softmax is applied."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)

# Seed PyTorch's global RNG so the weight initialisation and the shuffled
# batch order are repeatable from run to run, in line with the
# random_state=42 used for the train/test split and the Random Forest.
torch.manual_seed(42)

# Size the network from the data: one input per feature column and one output
# per distinct label value in the training labels.
input_size = X_train.shape[1]
num_classes = len(set(y_train))
model = MLP(input_size, hidden_size=64, num_classes=num_classes)

# Adam optimizer with cross-entropy loss on the raw logits.
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Train for a fixed number of passes over the training loader.
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()               # clear gradients from the previous step
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Put the network in evaluation mode and collect, for every test row, the
# predicted label and the softmax probability of class index 1.
model.eval()
y_pred_list = []
y_pred_proba_list = []
with torch.no_grad():  # gradients are not needed for scoring
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        probabilities = outputs.softmax(dim=1)
        predictions = probabilities.argmax(dim=1)

        y_pred_list.extend(predictions.cpu().numpy())
        y_pred_proba_list.extend(probabilities[:, 1].cpu().numpy())

# Accuracy uses the hard labels; AUROC uses the class-1 probabilities.
accuracy = accuracy_score(y_test, y_pred_list)
auroc = roc_auc_score(y_test, y_pred_proba_list)

print(f"Accuracy: {accuracy:.2f}")
print(f"AUROC: {auroc:.2f}")

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred_list))

# Counts of true vs. predicted labels.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_list))


Epoch 1/20, Loss: 9.9698
Epoch 2/20, Loss: 3.1315
Epoch 3/20, Loss: 1.0413
Epoch 4/20, Loss: 0.4193
Epoch 5/20, Loss: 0.3042
Epoch 6/20, Loss: 0.2995
Epoch 7/20, Loss: 0.2888
Epoch 8/20, Loss: 0.2889
Epoch 9/20, Loss: 0.2889
Epoch 10/20, Loss: 0.2888
Epoch 11/20, Loss: 0.2887
Epoch 12/20, Loss: 0.2887
Epoch 13/20, Loss: 0.2887
Epoch 14/20, Loss: 0.2887
Epoch 15/20, Loss: 0.2888
Epoch 16/20, Loss: 0.2887
Epoch 17/20, Loss: 0.2887
Epoch 18/20, Loss: 0.2889
Epoch 19/20, Loss: 0.2887
Epoch 20/20, Loss: 0.2887
Accuracy: 0.92
AUROC: 0.50
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     14214
           1       0.00      0.00      0.00      1301

    accuracy                           0.92     15515
   macro avg       0.46      0.50      0.48     15515
weighted avg       0.84      0.92      0.88     15515

Confusion Matrix:
[[14214     0]
 [ 1301     0]]


# Why Does the MLP Have an AUROC of 0.5 Despite High Accuracy?

- **AUROC (Area Under the Receiver Operating Characteristic Curve)** measures how well the model separates positive and negative cases. A value of 0.5 means the model's scores rank positive and negative cases no better than random guessing.

- **Accuracy** is misleading in imbalanced datasets because a model can achieve high accuracy simply by predicting the majority class most of the time.

#### Since the **dataset has very few instances of the minority class** (e.g., hospital deaths), the MLP (neural network) likely learned to predict the majority class nearly all the time. This results in:

 - High accuracy (because it's correct most of the time on the dominant class). 
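 - Low AUROC (because it's not distinguishing between classes well). In the run above the network never predicts class 1, and an AUROC of 0.50 means its predicted probabilities do not rank deaths above survivors at all.
 - Note that class imbalance by itself does not force an AUROC of 0.5: a model can favor the majority class and still rank patients sensibly. The very large first-epoch loss (about 10, versus roughly 0.69 for an untrained two-class model on well-scaled inputs) suggests that unscaled input features may also have destabilized training; standardizing the features before training is worth checking.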

### Why Does the Random Forest (RF) Perform Better?
-  Tree-based models like Random Forest are more robust to **imbalanced data** because they can focus on the minority class better, using techniques like bootstrap aggregation and feature splits that naturally find small but important patterns in the data.

-   RF has an AUROC of 0.84, meaning it is significantly better at distinguishing between classes, likely because it is making more meaningful predictions for the minority class.


# Retrain your Random Forest model with your altered data

In [8]:
# Read the two altered copies of the dataset. Judging by the file names, one
# changes the share of African American patients and the other the share of
# female patients — confirm against the notebook that generated them.
data_altered_20_percent_african_american = pd.read_csv(
    '../data_altered_20_percent_african_american.csv'
)
data_altered_80_percent_female = pd.read_csv(
    '../data_altered_80_percent_female.csv'
)

In [9]:
# Choose which altered dataset to experiment with.
data_altered = data_altered_20_percent_african_american

# Drop the 'gender_M' and 'ethnicity_Other/Unknown' columns, as was done for
# the original data.
data_altered = data_altered.drop(columns=['gender_M', 'ethnicity_Other/Unknown'])

# 'hospital_death' is the label; every remaining column is a feature.
X = data_altered.drop('hospital_death', axis=1)  # Features
y = data_altered['hospital_death']               # Target variable

# Same 80/20 split settings and seed as for the original data.
X_train_altered, X_test_altered, y_train_altered, y_test_altered = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the shapes of the resulting datasets. The last line reports the
# altered test labels; it previously printed y_test from the original data.
print("X_train shape:", X_train_altered.shape)
print("X_test shape:", X_test_altered.shape)
print("y_train shape:", y_train_altered.shape)
print("y_test shape:", y_test_altered.shape)

### Retrain Model on Altered Data

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a forest with the same hyperparameters and seed as the original-data
# model, this time on the altered training split.
rf_model_altered = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_altered, y_train_altered
)

# Persist the refitted forest to disk.
joblib.dump(rf_model_altered, '../altered_rf_model.pkl')

# Hard class labels for the altered held-out rows.
y_pred_altered = rf_model_altered.predict(X_test_altered)


X_train shape: (62060, 38)
X_test shape: (15515, 38)
y_train shape: (62060,)
y_test shape: (15515,)
