# MIMIC Time Windowing Example
The [MIMIC Pre-Processing Example](./MimicPreprocessingExample.ipynb) notebook produces a series of irregularly spaced, timestamped dynamic data features.  This notebook demonstrates how to group those dynamic features into one or more regularly spaced time windows.

In [None]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import os
from sklearn.preprocessing import LabelEncoder 

from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
# Data locations.  Each path can be overridden through an environment variable
# (MIMIC_DATA_DIR / MIMIC_CACHE_DIR) so the notebook runs on other machines
# without editing this cell; when a variable is unset the original local path
# is used.  os.path.join(..., '') guarantees the trailing separator that the
# later string concatenation (cacheDirStr + '<file name>') relies on.
dataDirStr = os.path.join(
    os.environ.get(
        'MIMIC_DATA_DIR',
        '/Users/MusaJTaib/Desktop/Education/Semesters/Faith_JobWork/MIMIC_Codes/MIMIC_3/',
    ),
    '',
)  # MIMIC CSV file location.
cacheDirStr = os.path.join(
    os.environ.get('MIMIC_CACHE_DIR', os.path.join(dataDirStr, 'cache')),
    '',
)  # Cache directory for intermediate files.

## Load Data
---

In [None]:
events = pd.read_parquet(cacheDirStr + 'ElapsedTimeEvents.parquet')

In [None]:
# Fill missing values in the three Gsc* columns with 0.
# NOTE(review): the following cell replaces 0 with NaN in the same columns, so
# this fill has no lasting effect on its own -- confirm whether it is needed.
for gcsCol in ('GscEyes', 'GscVerbal', 'GscMotor'):
    events[gcsCol] = events[gcsCol].fillna(0)

In [None]:
# Mark a value of 0 in the three Gsc* columns as missing (NaN).
for gcsCol in ('GscEyes', 'GscVerbal', 'GscMotor'):
    events[gcsCol] = events[gcsCol].replace(0, np.nan)

In [None]:
# Distinct values present in the GscEyes column (inspection aid).
unique_values = pd.unique(events['GscEyes'])

In [None]:
# Display the loaded event table for a visual check.
events

### Feature Types

In [None]:
# Categorical features: one-hot encoded further down.
catFtrs = [
    'ServiceType',
    'AdmitType',
    'GscVerbal',
    'GscMotor',
    'GscEyes',
]

# Continuous features: averaged within each time bin further down.
contFtrs = [
    'SystolicBloodPressure',
    'HeartRate',
    'Temperature',
    'BloodO2',
    'Urine',
]

# Static features: copied unchanged onto every time bin further down.
staticFtrs = [
    'Age',
    'Aids',
    'Cancer',
    'Death',
]

## Define Observation Window
---

In [None]:
# Observation window length, T_O: 48 hours.
# Built from keyword arguments rather than the string '172800S', because the
# upper-case 'S' unit alias is deprecated in recent pandas releases.
tO = pd.Timedelta(hours=48)

# Keep only the events that fall strictly inside the observation window.
events = events.loc[events['Time'] < tO]

In [None]:
# Move the index levels of `events` into ordinary columns; the next cell reads
# 'SubjectId' as a column of `df`.
df = events.reset_index(drop=False)

In [None]:
# Ensure the time column is timedelta-typed so it can be merged with the grid.
df['Time'] = pd.to_timedelta(df['Time'])

# Regular 30-minute grid over the 2-day span.  closed='left' drops the end
# point (exactly 2 days): the events were filtered to Time < 48 h, and a grid
# row at 48 h would open an extra, empty bin after the last real one when the
# data is resampled.  '30min' replaces the deprecated '30T' alias.
time_range = pd.timedelta_range(start='0 days', end='2 days', freq='30min', closed='left')

# One (SubjectId, Time) row per subject per grid point, built as a single
# Cartesian product instead of concatenating one small frame per subject
# (which also raised on an empty subject list).
new_rows_df = pd.MultiIndex.from_product(
    [df['SubjectId'].unique(), time_range],
    names=['SubjectId', 'Time'],
).to_frame(index=False)

# Outer merge keeps every original event and adds the grid rows that have no
# matching event; the added rows are NaN in all non-key columns.
merged_df = (
    pd.merge(df, new_rows_df, on=['SubjectId', 'Time'], how='outer')
    .sort_values(by=['SubjectId', 'Time'])
    .reset_index(drop=True)
)

In [None]:
# Continue with the grid-augmented table, indexed by subject again.
# merged_df carries a plain 0..n-1 row index (reset_index(drop=True)), but the
# cells below group with groupby(level=0); without restoring SubjectId as the
# index they would treat every single row as its own group instead of
# producing one set of time bins per subject.
events = merged_df.set_index('SubjectId')

## One Hot Encode Categorical Features
---

In [None]:
# Empty frame sharing the index of `events`; the one-hot columns of each
# categorical feature are appended to it column-wise in the cells below.
oneHot = pd.DataFrame(index=events.index)

In [None]:
# Inspection aid: list the columns available before encoding.
print(events.columns)

In [None]:
# Build the indicator columns of every categorical feature (prefixed with the
# feature name) and append them to `oneHot` in a single column-wise concat.
oneHot = pd.concat(
    [oneHot] + [pd.get_dummies(events[ftr], prefix=ftr) for ftr in tqdm(catFtrs)],
    axis=1,
)

In [None]:
# Names of all generated indicator columns, reused when resampling.
oneHotCols = oneHot.columns.tolist()

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
# Note: this cell cannot be re-run on its own, since the categorical columns
# are gone from `events` after the first run.
events = pd.concat(
    [events.drop(catFtrs, axis='columns'), oneHot],
    axis='columns',
)

## Resample Into Time Bins
---

In [None]:
# Number of time bins the 48-hour span is divided into (sets the bin size
# below and appears in the output file name).
NW = 12

In [None]:
# Bin size (seconds): the 48-hour span divided into NW equal bins, expressed
# as a pandas frequency string.  The lower-case 's' unit is used because the
# upper-case 'S' alias is deprecated in recent pandas releases.
tB = f'{(48*60*60)/NW}s'

### One Hot Encoded Features
Sum up how many events fall into each time bin.

In [None]:
# Per index-level-0 group, resample on 'Time' into bins of width tB and sum
# the indicator columns, i.e. count occurrences of each category per bin.
eventsBin = (
    events[['Time', *oneHotCols]]
    .groupby(level=0)
    .resample(rule=tB, on='Time')
    .sum()
)

### Continuous Features
Take the average of the feature value for each time bin.

In [None]:
# Per index-level-0 group, resample on 'Time' into bins of width tB and take
# the mean of each continuous feature within the bin.
eventsBinNew = (
    events[['Time', *contFtrs]]
    .groupby(level=0)
    .resample(rule=tB, on='Time')
    .mean()
)

In [None]:
# Place the binned continuous features next to the binned one-hot counts.
# Note: re-running this cell alone appends the continuous columns again.
eventsBin = pd.concat([eventsBin, eventsBinNew], axis='columns')

### Static Features
Copy static features across all time bins since they don't change.

In [None]:
# One row of static features per index-level-0 group.
# GroupBy.first() returns the first non-null value of each column, whereas
# taking row 0 of the group (x.iloc[0, :]) picks up NaN whenever the group's
# first row is one of the grid rows added by the outer merge, which carry no
# static values.
staticTbl = events[staticFtrs].groupby(level=0).first()

In [None]:
# Outer index level of every binned row, used to look up its static features.
idx = eventsBin.index.get_level_values(level=0)

In [None]:
# Attach each static feature to every binned row by looking up the row's
# outer index value in staticTbl.
eventsBin = eventsBin.assign(
    **{ftr: staticTbl.loc[idx, ftr].values for ftr in staticFtrs}
)

## Save Windowed Data

In [None]:
# Display the final windowed table for a visual check before saving.
eventsBin

In [None]:
# Write the windowed table to MIMIC/DATA_<NW>.csv.  The output directory is
# created first so the export does not fail at the very end of the run when
# the folder does not exist yet.
os.makedirs('MIMIC', exist_ok=True)
eventsBin.to_csv('MIMIC/DATA_' + str(NW) + '.csv')