# Project

## Data loading

### Create Parquet file Q1.1

In [54]:
import os
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm

# Define paths
data_dir = "data"

pd.set_option('display.max_columns', None)

def read_patient_data(file_path):
    df = pd.read_csv(file_path, sep=",")
    return df

# Process each patient file
def process_patient_data(patient_file):
    df = read_patient_data(patient_file)

    df['Time'] = pd.to_timedelta(df['Time'] + ':00')
    df['Time'] = df['Time'].dt.ceil('h')
    df['Time'] = df['Time'].dt.total_seconds() // 3600
    
    return df

def generate_parquet(letter:str, keepICUType=False):

    folder_path = "data/set-" + letter
    static_columns = ["RecordID", "Age", "Gender", "Height", "ICUType", "Weight"] #ICU-Type can be dropped later
    df_processed = pd.DataFrame()

    for file_name in tqdm(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        df = process_patient_data(file_path)
        df = df.pivot_table(index="Time", columns="Parameter", values="Value", aggfunc="last").reset_index()
        df = df.reindex(columns= ["Time"] + static_columns + [col for col in df.columns if col not in static_columns + ["Time"]])
        df.set_index('Time', inplace=True)
        df = df.reindex(range(49), fill_value=np.nan)
        df[static_columns] = df[static_columns].ffill().bfill()
        df.reset_index(drop=False, inplace=True)
        df_processed = pd.concat([df_processed, df], ignore_index=True)

    if not keepICUType:
        df_processed = df_processed.drop(columns=["ICUType"])

    df_target = read_patient_data(f"Outcomes-{letter}.txt")
    df_target = df_target[['RecordID', 'In-hospital_death']]
    df_processed = df_processed.merge(df_target, on="RecordID", how="left")

    df_processed.to_parquet(f"data_{letter}_raw.parquet", engine="pyarrow", index=False)

    return df_processed



In [55]:
for letter in ["a", "b", "c"]:
    generate_parquet(letter)


100%|██████████| 4000/4000 [00:28<00:00, 142.66it/s]
100%|██████████| 4000/4000 [00:30<00:00, 130.87it/s]
100%|██████████| 4000/4000 [00:30<00:00, 129.94it/s]


### TBD: Generate nice Plots Q1.2

In [61]:
letter = "a"
df = pd.read_parquet(f"data_{letter}_raw.parquet")
df.head(100)

Unnamed: 0,Time,RecordID,Age,Gender,Height,Weight,BUN,Creatinine,GCS,Glucose,HCO3,HCT,HR,K,Mg,NIDiasABP,NIMAP,NISysABP,Na,Platelets,RespRate,Temp,TroponinT,Urine,WBC,ALP,ALT,AST,Albumin,Bilirubin,FiO2,MechVent,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI,In-hospital_death
0,0,132592.0,35.0,0.0,-1.0,71.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
1,1,132592.0,35.0,0.0,-1.0,71.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
2,2,132592.0,35.0,0.0,-1.0,71.8,,,15.0,,,,112.0,,,43.0,68.67,120.0,,,22.0,36.6,,,,,,,,,,,,,,,,,,,,,0
3,3,132592.0,35.0,0.0,-1.0,71.8,68.0,2.3,,603.0,11.0,25.5,113.0,5.3,2.8,53.0,76.67,124.0,140.0,287.0,21.0,,0.15,120.0,15.3,,,,,,,,,,,,,,,,,,0
4,4,132592.0,35.0,0.0,-1.0,71.8,,,,,,,112.0,,,48.0,71.33,118.0,,,24.0,,,60.0,,,,,,,,,,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,46,140662.0,42.0,1.0,-1.0,138.1,,,15.0,,,,119.0,,,82.0,,129.0,,,,38.0,,60.0,,,,,,,,1.0,,,,,,,,,,,0
96,47,140662.0,42.0,1.0,-1.0,138.1,,,,,,,116.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
97,48,140662.0,42.0,1.0,-1.0,138.1,,,,,,,114.0,,,87.0,,139.0,,,,,,80.0,,,,,,,,,,,,,,,,,,,0
98,0,140104.0,61.0,1.0,188.0,80.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0


### Data imputation Q1.3

In [62]:
## Forward fill TODO: maybe later different imputation and fill values that never come up (mean value)
from sklearn.preprocessing import StandardScaler

for letter in ["a", "b", "c"]:
    ##Forwardfill and impute
    df = pd.read_parquet(f"data_{letter}_raw.parquet")
    df['Height'] = df['Height'].replace(-1.0, np.nan)
    df['Weight'] = df['Weight'].replace(-1.0, np.nan)
    df['Gender'] = df['Gender'].replace(-1.0, np.nan)
    df['NIDiasABP'] = df['NIDiasABP'].replace(-1.0, np.nan)
    df['DiasABP'] = df['DiasABP'].replace(-1.0, np.nan)
    df['Temp'] = df['Temp'].replace(-0.5, np.nan)
    df['Temp'] = df['Temp'].replace(-17.8, np.nan) # What to do with negative temp? Theoretically possible but not likely
    df.update(df.groupby('RecordID').ffill())

    ##Scale
    scaler = StandardScaler()
    columns_to_scale = ['Age', 'Height', 'Weight', 'BUN', 'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3', 'HCT', 'HR', 'K', 'Lactate', 'MAP', 'Mg', 'NIDiasABP', 'NIMAP', 'NISysABP', 'Na', 'PaCO2', 'PaO2', 'Platelets', 'RespRate', 'SaO2', 'SysABP', 'Temp', 'Urine', 'WBC', 'pH', 'MechVent', 'TroponinT', 'ALP', 'ALT', 'AST', 'Albumin', 'Bilirubin', 'Cholesterol', 'TroponinI']
    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])



    df.to_parquet(f"data_{letter}_cleaned.parquet", engine="pyarrow", index=False)

In [63]:
df = pd.read_parquet(f"data_a_cleaned.parquet")
df.head()

Unnamed: 0,Time,RecordID,Age,Gender,Height,Weight,BUN,Creatinine,GCS,Glucose,HCO3,HCT,HR,K,Mg,NIDiasABP,NIMAP,NISysABP,Na,Platelets,RespRate,Temp,TroponinT,Urine,WBC,ALP,ALT,AST,Albumin,Bilirubin,FiO2,MechVent,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI,In-hospital_death
0,0,132592.0,-1.665694,0.0,,-0.444181,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
1,1,132592.0,-1.665694,0.0,,-0.444181,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
2,2,132592.0,-1.665694,0.0,,-0.444181,,,0.831874,,,,1.425634,,,-0.941302,-0.494435,0.121859,,,0.422332,-0.314854,,,,,,,,,,,,,,,,,,,,,0
3,3,132592.0,-1.665694,0.0,,-0.444181,1.98615,0.633894,0.831874,8.374957,-2.854564,-1.215369,1.481737,1.946572,1.961486,-0.293028,0.020733,0.281725,0.232021,0.768255,0.243402,-0.314854,-0.349712,-0.064763,0.376511,,,,,,,,,,,,,,,,,,0
4,4,132592.0,-1.665694,0.0,,-0.444181,1.98615,0.633894,0.831874,8.374957,-2.854564,-1.215369,1.425634,1.946572,1.961486,-0.617165,-0.323141,0.041925,0.232021,0.768255,0.780193,-0.314854,-0.349712,-0.382261,0.376511,,,,,,,,,,,,,,,,,,0


In [64]:
df.shape

(196000, 43)