# Project

## Data loading

### Create Parquet file Q1.1

In [1]:
import os
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt

# Define paths
data_dir = "../ml4h_data/p1/"#"data"

pd.set_option('display.max_columns', None)

def read_patient_data(file_path):
    df = pd.read_csv(file_path, sep=",")
    return df

# Process each patient file
def process_patient_data(patient_file):
    df = read_patient_data(patient_file)

    df['Time'] = pd.to_timedelta(df['Time'] + ':00')
    df['Time'] = df['Time'].dt.ceil('h')
    df['Time'] = df['Time'].dt.total_seconds() // 3600
    
    return df

def generate_parquet(letter:str, keepICUType=False):

    folder_path = data_dir + "set-" + letter
    static_columns = ["RecordID", "Age", "Gender", "Height", "ICUType", "Weight"] #ICU-Type can be dropped later
    df_processed = pd.DataFrame()

    for file_name in tqdm(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        try:
            df = process_patient_data(file_path)
        except:
            print("Could not read file"+file_name)
            continue
        df = df.pivot_table(index="Time", columns="Parameter", values="Value", aggfunc="last").reset_index()
        df = df.reindex(columns= ["Time"] + static_columns + [col for col in df.columns if col not in static_columns + ["Time"]])
        df.set_index('Time', inplace=True)
        df = df.reindex(range(49), fill_value=np.nan)
        df[static_columns] = df[static_columns].ffill().bfill()
        df.reset_index(drop=False, inplace=True)
        df_processed = pd.concat([df_processed, df], ignore_index=True)

    if not keepICUType:
        df_processed = df_processed.drop(columns=["ICUType"])

    df_target = read_patient_data(data_dir+f"Outcomes-{letter}.txt")
    df_target = df_target[['RecordID', 'In-hospital_death']]
    df_processed = df_processed.merge(df_target, on="RecordID", how="left")

    df_processed.to_parquet(f"data_{letter}_raw.parquet", engine="pyarrow", index=False)

    return df_processed



In [2]:
for letter in ["a", "b", "c"]:
    generate_parquet(letter,keepICUType=True)


 89%|████████▉ | 3571/4001 [01:03<00:10, 42.14it/s]

Could not read file.ipynb_checkpoints


100%|██████████| 4001/4001 [01:14<00:00, 53.53it/s]
100%|██████████| 4000/4000 [01:14<00:00, 53.73it/s]
100%|██████████| 4000/4000 [01:14<00:00, 53.55it/s]


### TBD: Generate nice Plots Q1.2

In [8]:
letter = "a"
df = pd.read_parquet(f"data_{letter}_cleaned.parquet")
for letter in ["b","c"]:
    df = pd.concat([df, pd.read_parquet(f"data_{letter}_cleaned.parquet")]) 

In [9]:
bins = [36]*df.columns.shape[0]
bins[0] = 49
bins[11] = 12
bins[10] = 10
bins[29] = 10

In [None]:
log_df = np.log(df + 1e-9)
for i, var in enumerate(list(df.columns)):
    fig, ax = plt.subplots()  # Create a figure and an axes object
    ax.hist(df[var], bins=bins[i])  # Plot the histogram on the axes
    ax.set_title(f"{i}: Histogram of {var}")  # Set the title
    ax.set_xlabel(var)  # Label the x-axis
    ax.set_ylabel("Frequency")  # Label the y-axis
    #ax.set_ylim(0,100)
    plt.show()
    plt.close(fig) # Closes the figure

In [11]:
vars = list(df.columns)
print(vars)
dem_vars = vars[2:4]
print(dem_vars)
ts_vars = vars[4:42].remove('ICUType')
print(ts_vars)

['Time', 'RecordID', 'Age', 'Gender', 'Height', 'ICUType', 'Weight', 'BUN', 'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3', 'HCT', 'HR', 'K', 'MAP', 'MechVent', 'Mg', 'NIDiasABP', 'NIMAP', 'NISysABP', 'Na', 'PaCO2', 'PaO2', 'Platelets', 'SaO2', 'SysABP', 'Temp', 'Urine', 'WBC', 'pH', 'Lactate', 'ALP', 'ALT', 'AST', 'Albumin', 'Bilirubin', 'Cholesterol', 'TroponinT', 'RespRate', 'TroponinI', 'In-hospital_death']
['Age', 'Gender']
None


In [None]:
ts_vars = ts_vars = [var for var in list(df.columns)[4:42] if var != 'ICUType']
for i, var in enumerate(ts_vars):
    fig, ax = plt.subplots(1,2)  # Create a figure and an axes object
    for j in range(0,7):
        binstart = j*10 + 16
        agegroup = df.loc[(df["Age"] >= binstart) & (df["Age"] < binstart + 10), var]
        ax[0].hist(agegroup, alpha=0.4, bins=bins[i])  # Plot the histogram on the axes
    ax[0].set_title(f"{i}: Histogram of {var} per age group")  # Set the title
    ax[0].set_xlabel(var)  # Label the x-axis
    ax[0].set_ylabel("Frequency")  # Label the y-axis

    for j in range(0,2):
        gendergroup = df.loc[(df["Gender"] == j), var]
        ax[1].hist(gendergroup, bins=bins[i])  # Plot the histogram on the axes
    ax[1].set_title(f"{i}: Histogram of {var} per gender group")  # Set the title
    ax[1].set_xlabel(var)  # Label the x-axis
    ax[1].set_ylabel("Frequency")  # Label the y-axis
    plt.show()
    plt.close(fig) # Closes the figure

### Data imputation Q1.3

In [7]:
## Forward fill TODO: maybe later different imputation and fill values that never come up (mean value)
from sklearn.preprocessing import StandardScaler

for letter in ["a", "b", "c"]:
    ##Forwardfill and impute
    df = pd.read_parquet(f"data_{letter}_raw.parquet")

    ##Remove negatives
    df['Height'] = df['Height'].replace(-1.0, np.nan)
    df['Weight'] = df['Weight'].replace(-1.0, np.nan)
    df['NIDiasABP'] = df['NIDiasABP'].replace(-1.0, np.nan)
    df['DiasABP'] = df['DiasABP'].replace(-1.0, np.nan)
    df['Gender'] = df['Gender'].replace(-1.0, np.nan)
    
    #Definetely wrong
    df['Height'] = df['Height'].replace(0, np.nan)
    df.loc[df['Height'] < 20, 'Height'] = np.nan
    df.loc[df['Height'] > 300, 'Height'] = np.nan
    df.loc[df['pH'] > 14, 'pH'] = np.nan
    df['Weight'] = df['Weight'].replace(0, np.nan)
    df.loc[df['Temp'] < 1, 'Temp'] = np.nan
    
    #Maybe wrong?
    df['Weight'] = df['Weight'].replace(472.0, np.nan)

    #Definetely dead if this is right
    df['NIDiasABP'] = df['NIDiasABP'].replace(0, np.nan)
    df['DiasABP'] = df['DiasABP'].replace(0, np.nan)
    df['NIMAP'] = df['NIMAP'].replace(0, np.nan)
    df['SysABP'] = df['SysABP'].replace(0, np.nan)
    df['NISysABP'] = df['NISysABP'].replace(0, np.nan)
    df['HR'] = df['HR'].replace(0, np.nan)
    df['MAP'] = df['MAP'].replace(0, np.nan)
    df['PaCO2'] = df['PaCO2'].replace(0, np.nan)
    df['SaO2'] = df['SaO2'].replace(0, np.nan)
    df.loc[df['pH'] < 6, 'pH'] = np.nan
    df.loc[df['Mg'] > 15, 'Mg'] = np.nan
    df.loc[df['K'] > 20, 'K'] = np.nan
    
    df.update(df.groupby('RecordID').ffill())

    df.to_parquet(f"data_{letter}_cleaned.parquet", engine="pyarrow", index=False)

In [20]:
for var in list(df.columns):
    print(var+":"+str(df[var].min())+"-"+str(df[var].max()))

Time:0-48
RecordID:152871.0-163037.0
Age:16.0-90.0
Gender:0.0-1.0
Height:121.9-210.8
Weight:6.0-278.5
BUN:1.0-198.0
Creatinine:0.1-18.8
DiasABP:5.0-247.0
GCS:3.0-15.0
Glucose:21.0-1591.0
HCO3:5.0-49.0
HCT:9.0-58.0
HR:3.4-198.0
K:1.5-13.0
Lactate:0.0-27.6
MAP:6.0-298.0
Mg:0.4-9.4
Na:104.0-180.0
PaCO2:13.0-100.0
PaO2:18.0-500.0
Platelets:6.0-1073.0
RespRate:0.0-98.0
SysABP:3.0-271.0
Temp:1.2-41.5
Urine:0.0-7400.0
WBC:0.1-486.5
pH:6.76-7.72
FiO2:0.21-1.0
MechVent:1.0-1.0
NIDiasABP:10.0-171.0
NIMAP:18.67-228.0
NISysABP:0.15-246.0
SaO2:1.06-100.0
Albumin:1.0-4.9
ALP:8.0-4695.0
ALT:1.0-16780.0
AST:5.0-36400.0
Bilirubin:0.1-82.8
Cholesterol:52.0-362.0
TroponinT:0.01-29.91
TroponinI:0.1-48.7
In-hospital_death:0-1


In [None]:
for letter in ["a", "b", "c"]:
    df = pd.read_parquet(f"data_{letter}_cleaned.parquet")
    ##Scale
    scaler = StandardScaler()
    columns_to_scale = ['Age', 'Height', 'Weight', 'BUN', 'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3', 'HCT', 'HR', 'K', 'Lactate', 'MAP', 'Mg', 'NIDiasABP', 'NIMAP', 'NISysABP', 'Na', 'PaCO2', 'PaO2', 'Platelets', 'RespRate', 'SaO2', 'SysABP', 'Temp', 'Urine', 'WBC', 'pH', 'MechVent', 'TroponinT', 'ALP', 'ALT', 'AST', 'Albumin', 'Bilirubin', 'Cholesterol', 'TroponinI']
    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

    df.to_parquet(f"data_{letter}_scaled.parquet", engine="pyarrow", index=False)

In [91]:
df = pd.read_parquet(f"data_a_cleaned.parquet")
df.head()

Unnamed: 0,Time,RecordID,Age,Gender,Height,Weight,BUN,Creatinine,GCS,Glucose,HCO3,HCT,HR,K,Mg,NIDiasABP,NIMAP,NISysABP,Na,Platelets,RespRate,Temp,TroponinT,Urine,WBC,ALP,ALT,AST,Albumin,Bilirubin,FiO2,MechVent,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI,In-hospital_death
0,0,132592.0,-1.665694,0.0,,-0.444181,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
1,1,132592.0,-1.665694,0.0,,-0.444181,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
2,2,132592.0,-1.665694,0.0,,-0.444181,,,0.831874,,,,1.425634,,,-0.941302,-0.494435,0.121859,,,0.422332,-0.314854,,,,,,,,,,,,,,,,,,,,,0
3,3,132592.0,-1.665694,0.0,,-0.444181,1.98615,0.633894,0.831874,8.374957,-2.854564,-1.215369,1.481737,1.946572,1.961486,-0.293028,0.020733,0.281725,0.232021,0.768255,0.243402,-0.314854,-0.349712,-0.064763,0.376511,,,,,,,,,,,,,,,,,,0
4,4,132592.0,-1.665694,0.0,,-0.444181,1.98615,0.633894,0.831874,8.374957,-2.854564,-1.215369,1.425634,1.946572,1.961486,-0.617165,-0.323141,0.041925,0.232021,0.768255,0.780193,-0.314854,-0.349712,-0.382261,0.376511,,,,,,,,,,,,,,,,,,0


In [78]:
df.shape

(196000, 43)

### Q 1.4 Train Simple Model

In [None]:
## TODO get X and y