# Project

## Data loading

### Create Parquet file Q1.1

In [1]:
import os
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm

# Define paths
data_dir = "data"

pd.set_option('display.max_columns', None)

def read_patient_data(file_path):
    df = pd.read_csv(file_path, sep=",")
    return df

# Process each patient file
def process_patient_data(patient_file):
    df = read_patient_data(patient_file)

    df['Time'] = pd.to_timedelta(df['Time'] + ':00')
    df['Time'] = df['Time'].dt.ceil('h')
    df['Time'] = df['Time'].dt.total_seconds() // 3600
    
    return df

def generate_parquet(letter:str, keepICUType=False):

    folder_path = "data/set-" + letter
    static_columns = ["RecordID", "Age", "Gender", "Height", "ICUType", "Weight"] #ICU-Type can be dropped later
    df_processed = pd.DataFrame()

    for file_name in tqdm(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        df = process_patient_data(file_path)
        df = df.pivot_table(index="Time", columns="Parameter", values="Value", aggfunc="last").reset_index()
        df = df.reindex(columns= ["Time"] + static_columns + [col for col in df.columns if col not in static_columns + ["Time"]])
        df.set_index('Time', inplace=True)
        df = df.reindex(range(49), fill_value=np.nan)
        df[static_columns] = df[static_columns].ffill().bfill()
        df.reset_index(drop=False, inplace=True)
        df_processed = pd.concat([df_processed, df], ignore_index=True)

    if not keepICUType:
        df_processed = df_processed.drop(columns=["ICUType"])

    df_processed.to_parquet(f"data_{letter}_raw.parquet", engine="pyarrow", index=False)

    return df_processed



In [12]:
for letter in ["a", "b", "c"]:
    generate_parquet(letter)


100%|██████████| 4000/4000 [00:23<00:00, 167.34it/s]
100%|██████████| 4000/4000 [00:30<00:00, 130.16it/s]
100%|██████████| 4000/4000 [00:31<00:00, 128.70it/s]


### TBD: Generate nice Plots Q1.2

### Data imputation Q1.3

In [22]:
## Forward fill TODO: maybe later different imputation and fill values that never come up (mean value)

for letter in ["a", "b", "c"]:
    df = pd.read_parquet(f"data_{letter}_raw.parquet")
    df['Height'] = df['Height'].replace(-1.0, np.nan)
    df['Weight'] = df['Weight'].replace(-1.0, np.nan)
    df['Gender'] = df['Gender'].replace(-1.0, np.nan)
    df['NIDiasABP'] = df['NIDiasABP'].replace(-1.0, np.nan)
    df['DiasABP'] = df['DiasABP'].replace(-1.0, np.nan)
    df['Temp'] = df['Temp'].replace(-0.5, np.nan)
    df['Temp'] = df['Temp'].replace(-17.8, np.nan) # What to do with negative temp? Theoretically possible but not likely
    df.update(df.groupby('RecordID').ffill())
    df.to_parquet(f"data_{letter}_imputed.parquet", engine="pyarrow", index=False)

In [16]:
df = pd.read_parquet(f"data_a_imputed.parquet")
df.head()

Unnamed: 0,Time,RecordID,Age,Gender,Height,Weight,BUN,Creatinine,GCS,Glucose,HCO3,HCT,HR,K,Mg,NIDiasABP,NIMAP,NISysABP,Na,Platelets,RespRate,Temp,TroponinT,Urine,WBC,ALP,ALT,AST,Albumin,Bilirubin,FiO2,MechVent,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI
0,0,132592.0,35.0,0.0,,71.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,132592.0,35.0,0.0,,71.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,132592.0,35.0,0.0,,71.8,,,15.0,,,,112.0,,,43.0,68.67,120.0,,,22.0,36.6,,,,,,,,,,,,,,,,,,,,
3,3,132592.0,35.0,0.0,,71.8,68.0,2.3,15.0,603.0,11.0,25.5,113.0,5.3,2.8,53.0,76.67,124.0,140.0,287.0,21.0,36.6,0.15,120.0,15.3,,,,,,,,,,,,,,,,,
4,4,132592.0,35.0,0.0,,71.8,68.0,2.3,15.0,603.0,11.0,25.5,112.0,5.3,2.8,48.0,71.33,118.0,140.0,287.0,24.0,36.6,0.15,60.0,15.3,,,,,,,,,,,,,,,,,
