In [2]:
import statsmodels.api as sm


def smooth(y):
    """Return a smoothed version of the 1-D sequence ``y``.

    Smoothing is delegated to :func:`lowess` with its default fraction.
    A Savitzky-Golay filter (``window_size=2001``, ``order=3``) was used
    here previously and has been replaced by LOWESS.
    """
    trend = lowess(y)
    return trend

# f=0.2 did the best job of removing it (presumably the baseline trend; chosen empirically)
def lowess(y, f=0.2):
    """Fit a LOWESS curve to ``y`` against its sample index.

    Args:
        y: 1-D sequence of signal samples.
        f: fraction of the samples used for each local fit (passed to
            statsmodels as ``frac``).

    Returns:
        1-D array with the fitted value at each sample position.
    """
    sample_idx = np.arange(len(y))
    # it=0 disables the residual-based re-weighting passes.
    fitted = sm.nonparametric.lowess(y, sample_idx, frac=f, it=0)
    # The second column holds the fitted values (the first column is x).
    return fitted[:, 1]

import numpy as np
import pandas as pd
import os, pickle, sys
from scipy import signal
#sys.path.append('DL_model')


### Input configuration
SRATE = 250        # sampling rate of the model input, in Hz
LEN_INPUT = 20     # length of one input window, in seconds
LEN_PER_NRS = 300  # length of the vital recording kept for each NRS, in seconds
OVERLAP = 5        # offset between the starts of consecutive windows, in seconds
# Number of windows (data augmentations) that fit into one recording.
n_aug = (LEN_PER_NRS - LEN_INPUT) // OVERLAP + 1


# NOTE(review): pickle executes arbitrary code while loading. Every pickle read
# below comes from a project-local path; only run this on trusted files.
def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def _load_npz_array(path, **load_kwargs):
    """Return the ``arr_0`` entry of the ``.npz`` archive at ``path``."""
    with np.load(path, **load_kwargs) as archive:
        return archive['arr_0']


def _fill_gaps(frame):
    """Forward-fill then backward-fill NaNs in ``frame``; return a flat array."""
    return frame.ffill(axis=0).bfill(axis=0).values.flatten()


def _detrended_window(samples, start, stop):
    """Copy ``samples[start:stop]``, fill NaN gaps and subtract its smoothed trend."""
    window = np.copy(samples[start:stop])
    if np.isnan(window).any():
        window = _fill_gaps(pd.DataFrame(window))
    return window - smooth(window)


# File paths listed here are never served from the cache; they are always
# recomputed (see the cache check in the window loop below).
vital_list = _load_pickle('../Preprocessing/201227_modified_vital')
input_path = '../DL_model/dataset/preprocess5/'
# makedirs also creates missing parent directories and accepts an existing one.
os.makedirs(input_path[:-1], exist_ok=True)

if os.path.exists(input_path+'x_train_pacu.npz'):
    # A previously assembled dataset exists: load it instead of rebuilding.
    print('loading train...', flush=True, end='')
    x_train_pacu = _load_npz_array(input_path+'x_train_pacu.npz', allow_pickle=True)
    y_train_pacu = _load_npz_array(input_path+'y_train_pacu.npz')
    x_test_pacu = _load_npz_array(input_path+'x_test_pacu.npz', allow_pickle=True)
    y_test_pacu = _load_npz_array(input_path+'y_test_pacu.npz')
    print('done', flush=True)

else:
    false_row_list_preop = []

    vital_path = '../../cranberry2/Preprocessing/vital_data/PPG_100Hz_ECG_100Hz_pacu_5min/'
    ecg_path = '../../cranberry2/Preprocessing/vital_data/ECG_250Hz_pacu_5min/'
    df_preprocess_pacu = _load_pickle('../DL_model/cache/preprocess5/df_preprocess_pacu_agender')
    df_demograph = _load_pickle('../DL_model/df_caseids+age+gender')

    ### case ids belonging to the test / train / validation splits
    caseid_test = _load_pickle('../DL_model/caseid_test_new')
    caseid_train = _load_pickle('../DL_model/caseid_train_new')
    caseid_val = _load_pickle('../DL_model/caseid_val_new')

    # Accumulators for the dataset-assembly step, which is currently disabled
    # (it lives in the string literal that follows this block).
    non_lis = []
    x_train_pacu, y_train_pacu = [], []
    x_test_pacu, y_test_pacu = [], []
    x_val_pacu, y_val_pacu = [], []
    age_train_pacu, gender_train_pacu = [], []
    age_test_pacu, gender_test_pacu = [], []
    age_val_pacu, gender_val_pacu = [], []

    # Only this slice of rows is processed in one run (1-based rows 7001..8000).
    ROW_START, ROW_STOP = 7000, 8000
    # Expected number of samples in one full recording.
    LEN = LEN_PER_NRS*SRATE
    n_rows = len(df_preprocess_pacu)

    # Slicing first avoids materialising the ROW_START skipped rows one by one.
    batch = df_preprocess_pacu.iloc[ROW_START:ROW_STOP]
    for cnt, (_, row) in enumerate(batch.iterrows(), start=ROW_START + 1):
        print('loading data {}/{} ...'.format(cnt, n_rows), end='')
        # Raises IndexError when the case has no demographic record.
        row_demo = df_demograph[df_demograph['caseids']==row['caseids']].iloc[0]

        # vital data - PPG: gaps filled, then resampled to exactly LEN samples
        df_vital = _load_pickle(vital_path+row['file_path']).reset_index()
        pleth_samp = _fill_gaps(df_vital[['PPG']])
        pleth_resamp = signal.resample(pleth_samp, LEN)
        ppg_per_NRS = pleth_resamp

        # vital data - ECG: used at its stored rate, only gaps are filled
        ecg_samp = _fill_gaps(_load_pickle(ecg_path+row['file_path']).reset_index()[['ECG']])
        ecg_per_NRS = ecg_samp

        # A recording of unexpected length stops the whole run after printing
        # the offending case id.
        if len(pleth_resamp) != LEN or len(ecg_samp) != LEN:
            print(row['caseids'])
            break

        save_path = '../../cranberry2/Preprocessing/cache/lowess_filtered/preprocess5/pacu_'+row['file_path']
        # Windows 23..56 of this recording; their flags are stored in the
        # row under the 1-based column names '24'..'57'.
        for i in range(23,57):
            # Only windows whose flag marks them as having passed preprocessing
            if row[str(i+1)][0]:
                cache_file = save_path+'_{}'.format(i)
                if os.path.exists(cache_file) and row['file_path'] not in vital_list:
                    ppg_inp, ecg_inp = _load_pickle(cache_file)

                else:
                    # Window i starts i*OVERLAP seconds into the recording
                    # and spans LEN_INPUT seconds (both converted to samples).
                    start_idx = i*OVERLAP*SRATE
                    end_idx = (i*OVERLAP+LEN_INPUT)*SRATE

                    # Subtract the LOWESS trend from each signal
                    ppg_inp = _detrended_window(ppg_per_NRS, start_idx, end_idx)
                    ecg_inp = _detrended_window(ecg_per_NRS, start_idx, end_idx)

                    # Save to the pickle cache; `with` guarantees flush and close
                    with open(cache_file, 'wb') as fh:
                        pickle.dump([ppg_inp, ecg_inp], fh)
                    print('{}...'.format(i+1), end='')
        print('done')
'''                
                # normalization
                pleth_inp = ppg_inp - np.nanmean(ppg_inp)
                ecg_inp2 = (ecg_inp - np.nanmean(ecg_inp)) / np.nanstd(ecg_inp)
                
                
                # 해당 caseid가 test set에 속하는 경우
                if row['caseids'] in caseid_test:
                    age_test_pacu.append(int(row_demo['age']))
                    if row_demo['gender']=='F':
                        gender_test_pacu.append(1)
                    else:
                        gender_test_pacu.append(0)
                    x_test_pacu.append([pleth_inp, ecg_inp2])
                    y_test_pacu.append(int(float(row['NRS'])))

                # 해당 caseid가 train set에 해당하는 경우
                elif row['caseids'] in caseid_val:
                    age_val_pacu.append(int(row_demo['age']))
                    if row_demo['gender']=='F':
                        gender_val_pacu.append(1)
                    else:
                        gender_val_pacu.append(0)                    
                    x_val_pacu.append([pleth_inp, ecg_inp2])
                    y_val_pacu.append(int(float(row['NRS'])))
                    
                elif row['caseids'] in caseid_train:
                    age_train_pacu.append(int(row_demo['age']))
                    if row_demo['gender']=='F':
                        gender_train_pacu.append(1)
                    else:
                        gender_train_pacu.append(0)                    
                    x_train_pacu.append([pleth_inp, ecg_inp2])
                    y_train_pacu.append(int(float(row['NRS'])))
                    
                else:
                    non_lis.append(row['caseids'])
                    
        print('completed')
        
        
    x_train_pacu = np.array(x_train_pacu, np.float32)
    x_test_pacu = np.array(x_test_pacu, np.float32)
    y_train_pacu = np.array(y_train_pacu, int)
    y_test_pacu = np.array(y_test_pacu, int)
    x_val_pacu = np.array(x_val_pacu, np.float32)
    y_val_pacu = np.array(y_val_pacu, int)
    
    age_train_pacu = np.array(age_train_pacu, int)
    age_test_pacu = np.array(age_test_pacu, int)
    age_val_pacu = np.array(age_val_pacu, int)
    gender_train_pacu = np.array(gender_train_pacu, int)
    gender_test_pacu = np.array(gender_test_pacu, int)
    gender_val_pacu = np.array(gender_val_pacu, int)
    
        
    # 저장하기
    print('saving...', end='', flush=True)
    np.savez_compressed(input_path+'x_train_pacu2.npz', x_train_pacu)
    np.savez_compressed(input_path+'x_test_pacu2.npz', x_test_pacu)
    np.savez_compressed(input_path+'x_val_pacu2.npz', x_val_pacu)
    np.savez_compressed(input_path+'y_train_pacu2.npz', y_train_pacu)
    np.savez_compressed(input_path+'y_test_pacu2.npz', y_test_pacu)
    np.savez_compressed(input_path+'y_val_pacu2.npz', y_val_pacu)
    
    np.savez_compressed(input_path+'age_train_pacu2.npz', age_train_pacu)
    np.savez_compressed(input_path+'age_test_pacu2.npz', age_test_pacu)
    np.savez_compressed(input_path+'age_val_pacu2.npz', age_val_pacu)    
    np.savez_compressed(input_path+'gender_train_pacu2.npz', gender_train_pacu)
    np.savez_compressed(input_path+'gender_test_pacu2.npz', gender_test_pacu)
    np.savez_compressed(input_path+'gender_val_pacu2.npz', gender_val_pacu)    
    
    print('done', flush=True)

    
    
    
print('size of training set(pacu):', len(x_train_pacu))
print('size of validation set(pacu):', len(x_val_pacu))
print('size of test set(pacu):', len(x_test_pacu))
'''

loading data 7001/9949 ...done
loading data 7002/9949 ...done
loading data 7003/9949 ...done
loading data 7004/9949 ...done
loading data 7005/9949 ...done
loading data 7006/9949 ...done
loading data 7007/9949 ...done
loading data 7008/9949 ...done
loading data 7009/9949 ...done
loading data 7010/9949 ...done
loading data 7011/9949 ...done
loading data 7012/9949 ...done
loading data 7013/9949 ...30...done
loading data 7014/9949 ...done
loading data 7015/9949 ...done
loading data 7016/9949 ...done
loading data 7017/9949 ...done
loading data 7018/9949 ...done
loading data 7019/9949 ...done
loading data 7020/9949 ...done
loading data 7021/9949 ...done
loading data 7022/9949 ...done
loading data 7023/9949 ...done
loading data 7024/9949 ...done
loading data 7025/9949 ...done
loading data 7026/9949 ...done
loading data 7027/9949 ...done
loading data 7028/9949 ...done
loading data 7029/9949 ...done
loading data 7030/9949 ...done
loading data 7031/9949 ...done
loading data 7032/9949 ...done
loa

loading data 7264/9949 ...done
loading data 7265/9949 ...done
loading data 7266/9949 ...done
loading data 7267/9949 ...done
loading data 7268/9949 ...29...30...31...32...33...38...done
loading data 7269/9949 ...done
loading data 7270/9949 ...done
loading data 7271/9949 ...done
loading data 7272/9949 ...done
loading data 7273/9949 ...32...33...40...41...42...43...44...45...46...done
loading data 7274/9949 ...done
loading data 7275/9949 ...done
loading data 7276/9949 ...done
loading data 7277/9949 ...done
loading data 7278/9949 ...done
loading data 7279/9949 ...done
loading data 7280/9949 ...done
loading data 7281/9949 ...done
loading data 7282/9949 ...done
loading data 7283/9949 ...done
loading data 7284/9949 ...done
loading data 7285/9949 ...done
loading data 7286/9949 ...done
loading data 7287/9949 ...done
loading data 7288/9949 ...done
loading data 7289/9949 ...done
loading data 7290/9949 ...done
loading data 7291/9949 ...done
loading data 7292/9949 ...done
loading data 7293/9949 ...

loading data 7517/9949 ...24...25...26...27...28...29...30...31...32...33...34...35...36...37...38...39...done
loading data 7518/9949 ...done
loading data 7519/9949 ...done
loading data 7520/9949 ...done
loading data 7521/9949 ...done
loading data 7522/9949 ...done
loading data 7523/9949 ...done
loading data 7524/9949 ...done
loading data 7525/9949 ...24...25...26...27...28...34...35...36...37...38...39...40...41...42...43...44...45...done
loading data 7526/9949 ...done
loading data 7527/9949 ...done
loading data 7528/9949 ...24...25...26...27...28...29...30...31...32...33...34...35...36...37...38...39...40...41...42...43...44...45...46...47...48...49...50...51...52...53...done
loading data 7529/9949 ...done
loading data 7530/9949 ...done
loading data 7531/9949 ...done
loading data 7532/9949 ...done
loading data 7533/9949 ...done
loading data 7534/9949 ...done
loading data 7535/9949 ...done
loading data 7536/9949 ...done
loading data 7537/9949 ...done
loading data 7538/9949 ...done
loa

loading data 7760/9949 ...36...37...38...42...43...44...done
loading data 7761/9949 ...done
loading data 7762/9949 ...done
loading data 7763/9949 ...done
loading data 7764/9949 ...done
loading data 7765/9949 ...done
loading data 7766/9949 ...done
loading data 7767/9949 ...done
loading data 7768/9949 ...done
loading data 7769/9949 ...done
loading data 7770/9949 ...done
loading data 7771/9949 ...done
loading data 7772/9949 ...done
loading data 7773/9949 ...done
loading data 7774/9949 ...done
loading data 7775/9949 ...done
loading data 7776/9949 ...done
loading data 7777/9949 ...done
loading data 7778/9949 ...done
loading data 7779/9949 ...done
loading data 7780/9949 ...done
loading data 7781/9949 ...done
loading data 7782/9949 ...done
loading data 7783/9949 ...done
loading data 7784/9949 ...done
loading data 7785/9949 ...done
loading data 7786/9949 ...done
loading data 7787/9949 ...done
loading data 7788/9949 ...done
loading data 7789/9949 ...done
loading data 7790/9949 ...done
loading d

"                \n                # normalization\n                pleth_inp = ppg_inp - np.nanmean(ppg_inp)\n                ecg_inp2 = (ecg_inp - np.nanmean(ecg_inp)) / np.nanstd(ecg_inp)\n                \n                \n                # 해당 caseid가 test set에 속하는 경우\n                if row['caseids'] in caseid_test:\n                    age_test_pacu.append(int(row_demo['age']))\n                    if row_demo['gender']=='F':\n                        gender_test_pacu.append(1)\n                    else:\n                        gender_test_pacu.append(0)\n                    x_test_pacu.append([pleth_inp, ecg_inp2])\n                    y_test_pacu.append(int(float(row['NRS'])))\n\n                # 해당 caseid가 train set에 해당하는 경우\n                elif row['caseids'] in caseid_val:\n                    age_val_pacu.append(int(row_demo['age']))\n                    if row_demo['gender']=='F':\n                        gender_val_pacu.append(1)\n                    else:\n          