In [38]:
import pandas as pd
import numpy as np
import sklearn

In [39]:
df = pd.read_csv("data/processed_all.csv")
df = df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])

In [40]:
to_drop = df[(df.current_hypotensive == 1) & (df.hypotensive_in_15 == 1.0)]

In [41]:
new_df = df.drop(to_drop.index)

In [42]:
new_df["hypotensive_in_15"] = new_df["hypotensive_in_15"].astype(int)

In [43]:
# Add training labels
from src.utils.get_labels import get_training_labels

training_labels = get_training_labels()

new_df["training_label"] = new_df["wave"].apply(lambda x: training_labels[x])

In [44]:
new_df.to_csv("src/data/processed_cleaned.csv", index=False)

In [45]:
## Convert wave data to numerical
new_df.wave, mapping = pd.factorize(new_df.wave)


In [46]:

def split_by_patient(df, train_split=0.6, test_split=0.2):
    """Split into training and testing DFs by patients"""
    # Drop the pressor patients to begin with
    #df = df.drop(df[df.training_label == 'H1'].index)
    
    # Initialize training and testing sets
    train_df = pd.DataFrame().reindex_like(df)
    test_df = pd.DataFrame().reindex_like(df)
    valid_df = pd.DataFrame().reindex_like(df)
    train_idx = 0
    test_idx = 0
    valid_idx = 0
    for label in ("H2", "C1", "C2", "H1"):
        subset = df[df.training_label == label]
        wave_ids = list(set(subset.wave))
        n_ids = len(wave_ids)
        n_train = round(n_ids * train_split)
        n_test = round(n_ids * test_split)
        n_valid = n_ids - n_train - n_test
        train_ids = wave_ids[0:n_train]
        test_ids = wave_ids[n_train:n_train+n_test]
        valid_ids = wave_ids[n_train+n_test:n_ids]
        train_data = subset[subset.wave.isin(train_ids)]
        test_data = subset[subset.wave.isin(test_ids)]
        valid_data = subset[subset.wave.isin(valid_ids)]
        train_df.iloc[train_idx:(train_idx+len(train_data))] = train_data
        test_df.iloc[test_idx:(test_idx+len(test_data))] = test_data
        valid_df.iloc[valid_idx:(valid_idx+len(valid_data))] = valid_data 
        
        train_idx += len(train_data)
        test_idx += len(test_data)
        valid_idx += len(valid_data)
        
    # Drop NAs
    train_df = train_df.dropna()
    test_df = test_df.dropna()
    valid_df = valid_df.dropna()
    
    return train_df, test_df, valid_df

In [51]:
train_df, test_df, valid_df = split_by_patient(new_df)

train_df[['wave', 'start_window', 'end_window', 'avg_sys', 'avg_dias', 'avg_map',
       'current_hypotensive', 'hypotensive_in_15']].to_csv('data/dnn/training_data_cleaned.csv',header=None,index = False)

test_df[['wave', 'start_window', 'end_window', 'avg_sys', 'avg_dias', 'avg_map',
       'current_hypotensive', 'hypotensive_in_15']].to_csv('data/dnn/test_data_cleaned.csv',header=None,index = False)

valid_df[['wave', 'start_window', 'end_window', 'avg_sys', 'avg_dias', 'avg_map',
       'current_hypotensive', 'hypotensive_in_15']].to_csv('data/dnn/valid_data_cleaned.csv',header=None,index = False)

Unnamed: 0,wave,start_window,end_window,avg_sys,avg_dias,avg_map,current_hypotensive,hypotensive_in_15
0,5.0,0.0,7500.0,97.481739,69.718261,78.972754,0.0,0.0
1,5.0,7500.0,15000.0,98.726957,69.940870,79.536232,0.0,0.0
2,5.0,15000.0,22500.0,100.919298,70.800000,80.839766,0.0,0.0
3,5.0,22500.0,30000.0,102.196491,71.038596,81.424561,0.0,0.0
4,5.0,30000.0,37500.0,100.400000,71.075862,80.850575,0.0,0.0
...,...,...,...,...,...,...,...,...
255270,46.0,74842500.0,74850000.0,114.988281,45.882371,68.917675,0.0,1.0
255271,46.0,74850000.0,74857500.0,119.725762,48.171448,72.022886,0.0,1.0
255272,46.0,74857500.0,74865000.0,115.018228,46.000018,69.006088,0.0,1.0
255273,46.0,74865000.0,74872500.0,108.581862,44.109109,65.600026,0.0,1.0
