# **Notebook 1: Exploratory Data Analysis (EDA)**

## **Welcome to the Bias-Athon 2025 - Exploratory Data Analysis!**

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tableone import TableOne
import warnings
warnings.filterwarnings("ignore")

### Load Train Data

In [None]:
# Location of the training split, relative to the notebook's working directory.
train_csv = 'data_split/wids_train.csv'

# Read the training split into a DataFrame used by every cell below.
data = pd.read_csv(train_csv)

# Preview the top rows; as the last expression it is rendered as the cell output.
data.head()

### Summary Statistics (Table One)

In [None]:
# Configuration consumed by the TableOne call in the next cell.

# Every variable that should appear in the table.
columns = [
    'age',
    'bmi',
    'ethnicity',
    'gender',
    'hospital_death',
    'd1_spo2_min',
    'd1_spo2_min_new',
    'd1_lactate_max',
    'd1_lactate_max_new',
]

# Variables passed as categorical.
categorical = ['gender', 'ethnicity']

# Stratification variable.
groupby = ['hospital_death']

# Variables passed as non-normally distributed.
nonnormal = ['bmi']

# Display names: raw column name -> label shown in the table.
labels = {
    'hospital_death': "Mortality",
    'age': "Age",
    'ethnicity': "Race and Ethnicity",
    'bmi': "BMI",
    'gender': "Sex",
    'd1_spo2_min': "Original SpO2 Min",
    'd1_spo2_min_new': "Biased SpO2 Min",
    'd1_lactate_max': "Original Lactate Max",
    'd1_lactate_max_new': "Biased Lactate Max",
}

In [None]:
# Build the summary table from the configuration defined in the previous cell
# (columns, categorical, groupby, nonnormal, labels) and print it as text.
table1 = TableOne(
    data,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    nonnormal=nonnormal,
    rename=labels,
    overall=True,
    missing=True,
    pval=True,
    decimals=2,
)
print(table1)


### Visualize Mortality per Ethnic Group (Normalized)

In [None]:
# Average of `hospital_death` within each ethnicity group
# (presumably a 0/1 outcome, making this a mortality rate; confirm against the data dictionary).
mortality_per_ethnicity = data.groupby('ethnicity')['hospital_death'].mean()

# Bar chart of the per-group means on a fresh 10x6 figure.
plt.figure(figsize=(10, 6))
mortality_per_ethnicity.plot.bar(color='skyblue')
plt.title('Mean Mortality per Ethnic Group')
plt.xlabel('Ethnicity')
plt.ylabel('Mean Hospital Deaths')
plt.xticks(rotation=45)  # tilt the group names so they do not overlap
plt.show()

### Visualize Original vs Biased SpO2 Distributions

In [None]:

# Overlay the original and biased SpO2-min distributions on one figure.
plt.figure(figsize=(10, 6))
# Drop missing values before binning, matching the lactate comparison cell,
# so NaNs never reach the histogram computation.
plt.hist(data['d1_spo2_min'].dropna(), bins=20, alpha=0.5, label="Original SpO2 Min", color='blue')
plt.hist(data['d1_spo2_min_new'].dropna(), bins=20, alpha=0.5, label="Biased SpO2 Min", color='orange')
plt.xlabel('SpO2 Min')
plt.ylabel('Frequency')
plt.title('Distribution of Original vs Biased SpO2 Min')
plt.legend()
plt.show()


### Visualize Original vs Biased Lactate Distributions

In [None]:

# Overlay the original and biased lactate-max distributions on one figure.
plt.figure(figsize=(10, 6))

# (column, legend label, colour) for each series, drawn in this order.
lactate_series = (
    ('d1_lactate_max', "Original Lactate Max", 'green'),
    ('d1_lactate_max_new', "Biased Lactate Max", 'red'),
)
for col_name, legend_label, colour in lactate_series:
    # Missing measurements are removed before binning.
    observed = data[col_name].dropna()
    plt.hist(observed, bins=20, alpha=0.5, label=legend_label, color=colour)

plt.title('Distribution of Original vs Biased Lactate Max')
plt.xlabel('Lactate Max')
plt.ylabel('Frequency')
plt.legend()
plt.show()

### Explore additional plots: Add your team's visualizations below
 **Example: Distribution of BMI by Ethnicity**

In [None]:
# Overlay one BMI histogram per ethnicity group on a single figure.
plt.figure(figsize=(10, 6))
# Drop missing ethnicity first: `unique()` would otherwise yield NaN, and since
# `== NaN` matches no rows that would draw an empty histogram with a "nan"
# legend entry.
for ethnicity in data['ethnicity'].dropna().unique():
    subset = data[data['ethnicity'] == ethnicity]
    # Remove missing BMI values before binning.
    plt.hist(subset['bmi'].dropna(), alpha=0.5, label=ethnicity)
plt.legend()
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.title('BMI Distribution by Ethnicity')
plt.show()
