In [17]:
import pandas as pd
import numpy as np


In [18]:
# Names of the prediction tasks; each name is also the suffix of that task's
# .npy files on disk.
task_list = [
    'Weaning',
    'Weaning_successful',
    'SBT_start',
    'SBT_successful',
    'Mortality_30d',
    'Mortality_60d',
    'Mortality_90d',
    'Vasopressor',
]

# Individual aliases, kept so each task can still be referred to by position.
(
    task1_name,
    task2_name,
    task3_name,
    task4_name,
    task5_name,
    task6_name,
    task7_name,
    task8_name,
) = task_list

In [19]:
data_path = "./data/sample"

# all_task_data[task][split] -> {'X', 'X_with_id', 'Y'} for split in
# train / validation / test.
all_task_data = {}

# NOTE(security): allow_pickle=True lets np.load unpickle object arrays, and
# unpickling can execute arbitrary code. Only load files from a trusted source.
for task_name in task_list:
    task_splits = {}

    for split_name in ('train', 'validation', 'test'):
        # read
        split_X = np.load(f"{data_path}/{split_name}_X_{task_name}.npy", allow_pickle=True)
        split_Y = np.load(f"{data_path}/{split_name}_Y_{task_name}.npy", allow_pickle=True)

        # last day: keep only the final entry along axis 1. Index -1 is taken
        # on each array's own axis, so a validation/test array whose axis-1
        # length differs from the training array still yields its own last entry
        # instead of reusing the training array's index.
        split_X = split_X[:, -1, :]

        task_splits[split_name] = {
            # 'X' omits column 0, which 'X_with_id' retains
            # (presumably a record id, going by the key name -- TODO confirm).
            'X': split_X[:, 1:],
            'X_with_id': split_X,
            'Y': split_Y,
        }

    all_task_data[task_name] = task_splits
    

In [20]:
# Take the feature count from the first task's training matrix instead of from
# a loop variable left over from an earlier cell, so this cell does not depend
# on hidden kernel state.
num_features = all_task_data[task_list[0]]['train']['X'].shape[1]

# Stack the training rows of every task in a single call rather than growing
# the array inside a loop (which re-copies all accumulated rows each time).
# The empty (0, num_features) float block is kept as the first operand so the
# resulting dtype is promoted exactly as in the incremental version.
combined_train_X = np.vstack(
    [np.empty((0, num_features))]
    + [all_task_data[task_name]['train']['X'] for task_name in task_list]
)

print(combined_train_X.shape)

(47318, 101)


In [21]:
# Shape of the pooled training matrix with its first column dropped.
# NOTE(review): 'X' was already stored without column 0 of the loaded array, so
# this slice removes one further column; the scaler fitted in the next section
# uses all columns of combined_train_X, not this slice -- confirm this check is
# still needed.
print(combined_train_X[:,1:].shape)

(47318, 100)


# Normalization (min-max scaling to [0, 1])

In [22]:
from sklearn.preprocessing import MinMaxScaler
import joblib

# Fit a single min-max scaler (target range [0, 1]) on the pooled training
# features of all tasks; validation and test data are not used for fitting.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(combined_train_X)

# Save the fitted scaler to disk so the same transform can be reloaded later.
joblib.dump(scaler, './data/scaler_model.joblib')

['./data/scaler_model.joblib']

In [23]:
def scale_data(data, scaler):
    """Apply an already-fitted scaler to ``data``.

    Args:
        data: Input passed unchanged to ``scaler.transform``.
        scaler: Any object exposing a ``transform(data)`` method.

    Returns:
        Whatever ``scaler.transform(data)`` returns.
    """
    transformed = scaler.transform(data)
    return transformed

In [24]:
# Scale the training, validation and test features of every task with the
# shared scaler and store the result next to the raw features as 'scalar_X'.
for task_name in task_list:
    task_splits = all_task_data[task_name]
    for split_name in ('train', 'validation', 'test'):
        split = task_splits[split_name]
        split['scalar_X'] = scale_data(split['X'], scaler)
    

In [25]:
for task_name in task_list:
    train_y = all_task_data[task_name]['train']['Y']
    val_y = all_task_data[task_name]['validation']['Y']
    test_y = all_task_data[task_name]['test']['Y']
    
    total = np.concatenate((train_y, val_y), axis=0)
    total = np.concatenate((total, test_y), axis=0)
    
    one_count = np.count_nonzero(total == 1)
    zero_count = np.count_nonzero(total == 0)
    
    print(f'Task:{task_name} => 1:{one_count}({round(one_count*100/(one_count+zero_count),2)}%)   0:{zero_count}({round(zero_count*100/(one_count+zero_count),2)}%)')

Task:Weaning => 1:1182(32.72%)   0:2430(67.28%)
Task:Weaning_successful => 1:1122(31.16%)   0:2479(68.84%)
Task:SBT_start => 1:1893(37.34%)   0:3177(62.66%)
Task:SBT_successful => 1:1708(34.36%)   0:3263(65.64%)
Task:Mortality_30d => 1:3411(32.43%)   0:7107(67.57%)
Task:Mortality_60d => 1:4043(37.23%)   0:6817(62.77%)
Task:Mortality_90d => 1:4361(39.53%)   0:6670(60.47%)
Task:Vasopressor => 1:1925(20.29%)   0:7564(79.71%)


# Save

In [26]:
# Directory that the save loop below joins with each output file name.
save_path = './data/sample/standard_data/'

In [27]:
import os

# np.save does not create missing parent directories, so make sure the output
# directory exists; exist_ok=True keeps re-runs of this cell from failing.
os.makedirs(save_path, exist_ok=True)

for task_name in task_list:  # e.g. Weaning, SBT_start, ...
    for data_type, arrays in all_task_data[task_name].items():  # train, validation, test
        for data_name, data in arrays.items():  # X, X_with_id, Y, plus scalar_X once scaled
            file_name = f'{data_type}_{data_name}_{task_name}'
            file_stem = os.path.join(save_path, file_name)

            # 2-D arrays are written as (rows, 1, columns); arrays of any other
            # rank are written unchanged.
            if data.ndim == 2:
                data = data.reshape(data.shape[0], 1, data.shape[1])
            np.save(f'{file_stem}.npy', data)
print('finish')

finish
