# Project

## Data loading

### Create Parquet file

In [150]:
import os
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm

# Define paths
# NOTE(review): data_dir is not referenced anywhere in the visible code;
# generate_parquet builds its input path from the literal "data/set-" instead.
data_dir = "data"

def read_patient_data(file_path):
    """Load one comma-separated patient file into a DataFrame.

    Args:
        file_path: Path of the CSV-formatted file to read.

    Returns:
        The parsed file as a pandas DataFrame.
    """
    return pd.read_csv(file_path, sep=",")

# Load one patient file and convert its timestamps to hour numbers
def process_patient_data(patient_file):
    """Read a patient file and replace its 'Time' column with hour numbers.

    The 'Time' strings get ':00' appended before being parsed as timedeltas,
    each timedelta is rounded up to the next full hour, and the result is
    stored as the number of whole hours.

    Args:
        patient_file: Path passed on to ``read_patient_data``.

    Returns:
        The DataFrame from ``read_patient_data`` with 'Time' overwritten.
    """
    records = read_patient_data(patient_file)

    # Parse, then round every timestamp up to the hour it falls into.
    elapsed = pd.to_timedelta(records['Time'] + ':00').dt.ceil('h')
    records['Time'] = elapsed.dt.total_seconds() // 3600

    return records

def generate_parquet(letter:str, keepICUType=False):
    """Build one hourly table for a data set and save it as a Parquet file.

    Every file in ``data/set-<letter>`` is parsed with
    ``process_patient_data``, pivoted to one row per hour and one column per
    parameter, padded to the hours 0..48 (49 rows per file) and stacked into
    a single frame. The frame is written to ``data_<letter>.parquet`` in the
    current working directory.

    Args:
        letter: Suffix of the input folder name and of the output file name.
        keepICUType: When false (the default) the ``ICUType`` column is
            dropped before the frame is written and returned.

    Returns:
        The combined DataFrame that was written to disk. It is empty when the
        folder contains no files.
    """
    folder_path = "data/set-" + letter
    static_columns = ["RecordID", "Age", "Gender", "Height", "ICUType", "Weight"] #ICU-Type can be dropped later
    leading_columns = ["Time"] + static_columns

    # Collect the per-file frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated frame every time.
    patient_frames = []

    # Sorted so the row order of the output does not depend on the arbitrary
    # order in which the filesystem lists the directory.
    for file_name in tqdm(sorted(os.listdir(folder_path))):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories (e.g. notebook checkpoint folders) are not data.
            continue

        df = process_patient_data(file_path)
        # One row per hour; if a parameter occurs several times within the
        # same hour, the last value is kept.
        df = df.pivot_table(index="Time", columns="Parameter", values="Value", aggfunc="last").reset_index()
        # Put Time and the static columns first; static columns missing from
        # this file are created as NaN by the reindex.
        df = df.reindex(columns=leading_columns + [col for col in df.columns if col not in leading_columns])
        df.set_index('Time', inplace=True)
        # Force exactly the hours 0..48 so every file contributes 49 rows.
        df = df.reindex(range(49), fill_value=np.nan)
        # Spread the static values over all rows of this file.
        df[static_columns] = df[static_columns].ffill().bfill()
        df.reset_index(drop=False, inplace=True)
        patient_frames.append(df)

    if patient_frames:
        df_processed = pd.concat(patient_frames, ignore_index=True)
    else:
        # pd.concat raises on an empty list; fall back to an empty frame.
        df_processed = pd.DataFrame()

    if not keepICUType:
        # errors="ignore" only matters for the empty frame, which has no columns.
        df_processed = df_processed.drop(columns=["ICUType"], errors="ignore")

    df_processed.to_parquet(f"data_{letter}.parquet", engine="pyarrow", index=False)

    return df_processed



In [151]:
# Build and save the Parquet file for each of the three data sets.
for letter in ("a", "b", "c"):
    generate_parquet(letter)


100%|██████████| 4000/4000 [00:19<00:00, 203.25it/s]
100%|██████████| 4000/4000 [00:19<00:00, 205.38it/s]
100%|██████████| 4000/4000 [00:19<00:00, 203.14it/s]


In [152]:
# Read the Parquet file for set "b" back into a DataFrame.
df = pd.read_parquet("data_b.parquet")

In [153]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(196000, 42)

In [None]:
fglök