In [4]:
import os, sys
import pickle
import pandas as pd
import numpy as np

# Helper used to resample every peak-to-peak wave to the same length
def linear_connection(list, idx):
    """Linearly interpolate a sequence at a fractional position.

    Parameters
    ----------
    list : sequence of numbers
        Sampled values. The parameter name shadows the builtin ``list``; it is
        kept unchanged so existing callers are not affected.
    idx : int or float
        Position to evaluate, where ``0`` is the first sample and
        ``len(list) - 1`` is the last one.

    Returns
    -------
    number
        ``list[i] + (list[i+1] - list[i]) * (idx - i)`` with ``i = int(idx)``.

    Raises
    ------
    IndexError
        If ``idx`` lies beyond the last sample.
    """
    int_idx = int(idx)
    frac = idx - int_idx
    # Exactly on the last sample there is no right-hand neighbour to read;
    # the interpolated value is simply that sample.
    if frac == 0 and int_idx == len(list) - 1:
        return list[int_idx]
    return list[int_idx] + (list[int_idx+1] - list[int_idx]) * frac


# Input configuration
LEN_INPUT = 10     # length of one input window, in seconds
LEN_PER_NRS = 120  # vital length for each NRS
OVERLAP = 2        # multiplied by the window index to place each window start
# Number of windows (augmented samples) cut from one record
n_aug = (LEN_PER_NRS - LEN_INPUT)//OVERLAP + 1


# Directory that holds the vital data files
vital_path = '../../cranberry2/Preprocessing/preop_vital/preop'
f_vital_list = os.listdir(vital_path)

# DataFrame that collects the preprocessing information:
# a 'file_path' column followed by one column per window, named '1' .. str(n_aug)
column_list = ['file_path', *map(str, range(1, n_aug + 1))]
df_preprocess = pd.DataFrame(columns=column_list)


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed even if
    unpickling fails.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; only use
    # this on cache files produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _interval_noise_ratio(seg, idx_peak, len_wave=200):
    """Return the fraction of peak-to-peak intervals of one signal judged noisy.

    Parameters
    ----------
    seg : list of float
        Samples of one window (may contain NaN).
    idx_peak : sequence of int
        Peak positions relative to the start of ``seg``; needs at least two.
    len_wave : int
        Number of points every interval is resampled to before comparison.

    An interval is flagged when either
    (2.1) its length is not strictly between 40 and 200 samples
          (HR 30~150 -> beat interval 0.4 s ~ 2 s, times the sampling rate), or
    (2.2) its resampled shape correlates below 0.8 with the mean shape of all
          intervals in the window.
    """
    n_interval = len(idx_peak) - 1

    # 2.1 implausible peak spacing
    is_noise = [not 40 < idx_peak[k+1] - idx_peak[k] < 200 for k in range(n_interval)]

    # 2.2 abnormal shape: resample every interval to len_wave points
    norm_waves = []
    for k in range(n_interval):
        len_interval = idx_peak[k+1] - idx_peak[k]
        # Slice once per interval; the slice includes both bounding peaks.
        beat = seg[idx_peak[k]:idx_peak[k+1]+1]
        norm_waves.append([linear_connection(beat, n/len_wave*len_interval) for n in range(len_wave)])

    # Mean shape over all intervals, then fill remaining NaN forward/backward
    mean_wave = np.nanmean(norm_waves, axis=0)
    mean_wave = pd.DataFrame(mean_wave).ffill(axis=0).bfill(axis=0).values.flatten()
    norm_waves = pd.DataFrame(norm_waves).ffill(axis=1).bfill(axis=1).values

    for k in range(n_interval):
        # NOTE(review): a NaN correlation (constant or all-NaN wave) compares
        # False here, so such an interval is NOT flagged by this test.
        if np.corrcoef(norm_waves[k], mean_wave)[0,1] < 0.8:
            is_noise[k] = True

    return np.sum(is_noise) / n_interval


SRATE, f_num = 100, 0

# Output file for the preprocessing table; make sure its directory exists so a
# long run does not fail only when the result is written.
cache_path = 'cache/input2/df_preprocess_preop_0-1000'
os.makedirs(os.path.dirname(cache_path), exist_ok=True)

n_sample = LEN_INPUT*SRATE  # samples per window

for f_vital in f_vital_list[0:1000]:
    f_num += 1

    print('###Input', f_num,'/ '+str(len(f_vital_list))+': '+f_vital+'###')

    # Locate the cached PPG / ECG peaks.
    # Files without cached peaks are cases that have no ECG or PPG data.
    ppg_peak_path = '../../cranberry2/Preprocessing/cache/PPG_peaks/'+f_vital
    ecg_peak_path = '../../cranberry2/Preprocessing/cache/ECG_peaks/'+f_vital
    if not os.path.exists(ppg_peak_path):
        print('no existing PPG peaks: ', f_vital)
        continue
    if not os.path.exists(ecg_peak_path):
        print('no existing ECG peaks: ', f_vital)
        continue

    # Load the vital data
    df_vital = _load_pickle(vital_path+'/'+f_vital).reset_index()

    # Create a new row in the result table
    df_preprocess.loc[f_num-1,'file_path'] = f_vital

    ppg_min, ppg_peak = _load_pickle(ppg_peak_path)
    ecg_peak = _load_pickle(ecg_peak_path)

    # Wrapped into 2-D arrays; the boolean masks below flatten the selection
    # back to 1-D.
    ppg_min, ppg_peak = np.array([ppg_min]), np.array([ppg_peak])
    ecg_peak = np.array([ecg_peak])

    # Cut the record into windows of LEN_INPUT seconds
    for i in range(n_aug):
        start_idx = i*OVERLAP*SRATE                # window start, in samples
        end_idx = (i*OVERLAP + LEN_INPUT)*SRATE    # window end (exclusive)

        # Pre-fill with NaN so records shorter than the window count as missing
        seg_ppg, seg_ecg = [np.nan]*n_sample, [np.nan]*n_sample
        df_vital_input = df_vital.loc[start_idx:end_idx-1]
        seg_ppg[0:len(df_vital_input)] = df_vital_input['Pleth'].tolist()
        seg_ecg[0:len(df_vital_input)] = df_vital_input['ECG'].tolist()

        ### 1. Missing values ###
        # df.isnull().sum() would be simpler, but it would not count samples
        # that are absent because the record is shorter than the window.
        nan_ppg_list = np.isnan(seg_ppg)
        nan_ecg_list = np.isnan(seg_ecg)
        nan_ppg_perc = np.sum(nan_ppg_list) / LEN_INPUT / SRATE
        nan_ecg_perc = np.sum(nan_ecg_list) / LEN_INPUT / SRATE

        # Fraction of samples where ECG and PPG are both missing
        nan_both_perc = int(np.sum(nan_ppg_list & nan_ecg_list)) / (LEN_INPUT*SRATE)

        # Missing-value ratios of this window
        nan_info = [nan_ppg_perc, nan_ecg_perc, nan_both_perc]

        # Too many missing values: reject without checking noise;
        # noise_info is recorded as -1 in this case.
        if nan_ppg_perc > 0.3 or nan_ecg_perc > 0.3 or nan_both_perc > 0.2:
            df_preprocess.loc[f_num-1,str(i+1)] = (False, nan_info, [-1, -1])
            continue

        ### 2. Noise ###
        # Peak positions inside this window, relative to the window start
        idx_ppg_peak = ppg_peak[(start_idx<=ppg_peak) & (ppg_peak<end_idx)] - start_idx
        idx_ecg_peak = ecg_peak[(start_idx<=ecg_peak) & (ecg_peak<end_idx)] - start_idx

        # Too few peaks means the window is dominated by noise: reject without
        # the detailed noise computation; noise_info is recorded as -2.
        if len(idx_ppg_peak)<=4 or len(idx_ecg_peak)<=4:
            df_preprocess.loc[f_num-1,str(i+1)] = (False, nan_info, [-2, -2])
            continue

        noise_ppg_perc = _interval_noise_ratio(seg_ppg, idx_ppg_peak)
        noise_ecg_perc = _interval_noise_ratio(seg_ecg, idx_ecg_peak)

        # Noise ratios of this window
        noise_info = [noise_ppg_perc, noise_ecg_perc]

        # Whether this window may be used as a model input
        bool_pass = bool(nan_ppg_perc < 0.3 and nan_ecg_perc < 0.3 and nan_both_perc < 0.2
                         and noise_ppg_perc < 0.5 and noise_ecg_perc < 0.5)

        # Store the result for this window
        df_preprocess.loc[f_num-1,str(i+1)] = (bool_pass, nan_info, noise_info)

    # Periodic checkpoint
    if f_num%1000 == 0:
        print('dumping cache of d_preprocess -', f_num, '/ '+str(len(f_vital_list)))
        with open(cache_path, 'wb') as f:
            pickle.dump(df_preprocess, f)

# Final dump; the with-block guarantees the file is flushed and closed
print('dumping cache of d_preprocess -', f_num, '/ '+str(len(f_vital_list)))
with open(cache_path, 'wb') as f:
    pickle.dump(df_preprocess, f)


###Input 1 / 3888: 0.0,7304,PACU1_3_200909_132401.csv###
###Input 2 / 3888: 0.0,5131,PACU1_1_200514_104348.csv###
###Input 3 / 3888: 0.0,2222,PACU1_7_190906_142446.csv###
###Input 4 / 3888: 0.0,758,PACU1_5_190531_124635.csv###
###Input 5 / 3888: 0.0,5,PACU1_1_190409_133948.csv###
###Input 6 / 3888: 0.0,7270,PACU1_6_200902_103630.csv###
###Input 7 / 3888: 0.0,6780,PACU1_1_200730_115233.csv###
###Input 8 / 3888: 0.0,5525,PACU1_1_200601_131308.csv###
###Input 9 / 3888: 0.0,5722,PACU1_1_200608_192708.csv###
###Input 10 / 3888: 0.0,2707,PACU1_2_200102_132423.csv###
###Input 11 / 3888: 0.0,3390,PACU1_7_200131_095818.csv###
###Input 12 / 3888: 0.0,1480,PACU1_8_190816_110423.csv###
###Input 13 / 3888: 0.0,5219,PACU1_3_200518_114244.csv###
###Input 14 / 3888: 0.0,3601,PACU1_5_200210_104932.csv###
###Input 15 / 3888: 0.0,5763,PACU1_7_200610_091333.csv###
###Input 16 / 3888: 0.0,2646,PACU1_7_191219_165512.csv###
###Input 17 / 3888: 0.0,2034,PACU1_11_190902_153658.csv###
###Input 18 / 3888: 0.0,40

###Input 142 / 3888: 0.0,1211,PACU1_12_190715_163915.csv###
###Input 143 / 3888: 0.0,8199,PACU1_7_201023_154625.csv###
###Input 144 / 3888: 0.0,2053,PACU1_10_190903_102809.csv###
###Input 145 / 3888: 0.0,1405,PACU1_3_190721_160005.csv###
###Input 146 / 3888: 0.0,5562,PACU1_12_200602_163444.csv###
###Input 147 / 3888: 0.0,1210,PACU1_7_190715_163630.csv###
###Input 148 / 3888: 0.0,2079,PACU1_7_190903_162904.csv###
###Input 149 / 3888: 0.0,6238,PACU1_4_200701_212954.csv###
###Input 150 / 3888: 0.0,3892,PACU1_8_200220_100522.csv###
###Input 151 / 3888: 0.0,2077,PACU1_4_190903_164828.csv###
###Input 152 / 3888: 0.0,7930,PACU1_1_201008_183613.csv###
###Input 153 / 3888: 0.0,2518,PACU1_7_191023_123318.csv###
###Input 154 / 3888: 0.0,501,PACU1_7_190516_110712.csv###
###Input 155 / 3888: 0.0,3148,PACU1_9_200120_155532.csv###
###Input 156 / 3888: 0.0,4085,PACU1_8_200228_093013.csv###
###Input 157 / 3888: 0.0,6631,PACU1_11_200722_125355.csv###
###Input 158 / 3888: 0.0,6317,PACU1_10_200707_113409.

###Input 281 / 3888: 0.0,5546,PACU1_4_200602_114919.csv###
###Input 282 / 3888: 0.0,2429,PACU1_11_191017_095605.csv###
###Input 283 / 3888: 0.0,5893,PACU1_1_200615_161528.csv###
###Input 284 / 3888: 0.0,3620,PACU1_6_200210_172042.csv###
###Input 285 / 3888: 0.0,6164,PACU1_5_200629_112456.csv###
###Input 286 / 3888: 0.0,5292,PACU1_10_200521_103234.csv###
###Input 287 / 3888: 0.0,642,PACU1_4_190527_165508.csv###
###Input 288 / 3888: 0.0,1361,PACU1_7_190719_095819.csv###
###Input 289 / 3888: 0.0,3312,PACU1_5_200129_095120.csv###
###Input 290 / 3888: 0.0,3183,PACU1_2_200121_164252.csv###
###Input 291 / 3888: 0.0,5577,PACU1_4_200602_212449.csv###
###Input 292 / 3888: 0.0,3246,PACU1_10_200122_170924.csv###
###Input 293 / 3888: 0.0,5233,PACU1_5_200519_103309.csv###
###Input 294 / 3888: 0.0,4781,PACU1_9_200325_105720.csv###
###Input 295 / 3888: 0.0,6539,PACU1_5_200717_092906.csv###
###Input 296 / 3888: 0.0,5242,PACU1_8_200519_150417.csv###
###Input 297 / 3888: 0.0,1984,PACU1_2_190830_090741.cs

###Input 420 / 3888: 0.0,7081,PACU1_4_200818_130212.csv###
###Input 421 / 3888: 0.0,7331,PACU1_7_200910_180619.csv###
###Input 422 / 3888: 0.0,7241,PACU1_3_200828_085619.csv###
###Input 423 / 3888: 0.0,5127,PACU1_4_200513_235541.csv###
###Input 424 / 3888: 0.0,7919,PACU1_6_201008_164118.csv###
###Input 425 / 3888: 0.0,972,PACU1_5_190704_215924.csv###
###Input 426 / 3888: 0.0,2632,PACU1_12_191219_155006.csv###
###Input 427 / 3888: 0.0,6,PACU1_1_190409_144536.csv###
###Input 428 / 3888: 0.0,7954,PACU1_1_201012_174400.csv###
###Input 429 / 3888: 0.0,5474,PACU1_3_200529_001932.csv###
###Input 430 / 3888: 0.0,7150,PACU1_10_200820_154629.csv###
###Input 431 / 3888: 0.0,1802,PACU1_6_190822_191832.csv###
###Input 432 / 3888: 0.0,8185,PACU1_9_201022_124809.csv###
###Input 433 / 3888: 0.0,4807,PACU1_3_200325_194331.csv###
###Input 434 / 3888: 0.0,7406,PACU1_4_200915_155928.csv###
###Input 435 / 3888: 0.0,4286,PACU1_7_200305_121327.csv###
###Input 436 / 3888: 0.0,3394,PACU1_5_200131_105732.csv###

###Input 559 / 3888: 0.0,7373,PACU1_3_200911_195111.csv###
###Input 560 / 3888: 0.0,1307,PACU1_10_190718_120140.csv###
###Input 561 / 3888: 0.0,7116,PACU1_2_200819_152037.csv###
###Input 562 / 3888: 0.0,1516,PACU1_10_190816_154308.csv###
###Input 563 / 3888: 0.0,4963,PACU1_9_200507_091727.csv###
###Input 564 / 3888: 0.0,5765,PACU1_1_200610_093053.csv###
###Input 565 / 3888: 0.0,3373,PACU1_6_200130_192428.csv###
###Input 566 / 3888: 0.0,3086,PACU1_1_200116_203852.csv###
###Input 567 / 3888: 0.0,553,PACU1_8_190517_130825.csv###
###Input 568 / 3888: 0.0,310,PACU1_2_190508_215132.csv###
###Input 569 / 3888: 0.0,334,PACU1_3_190509_152745.csv###
###Input 570 / 3888: 0.0,5438,PACU1_1_200528_121801.csv###
###Input 571 / 3888: 0.0,825,PACU1_8_190611_101343.csv###
###Input 572 / 3888: 0.0,7362,PACU1_3_200911_163759.csv###
###Input 573 / 3888: 0.0,4383,PACU1_4_200309_171129.csv###
###Input 574 / 3888: 0.0,2974,PACU1_5_200113_175259.csv###
###Input 575 / 3888: 0.0,5810,PACU1_1_200611_160159.csv###

###Input 699 / 3888: 0.0,4874,PACU1_3_200330_104140.csv###
###Input 700 / 3888: 0.0,4998,PACU1_9_200508_093204.csv###
###Input 701 / 3888: 0.0,4681,PACU1_4_200320_001621.csv###
###Input 702 / 3888: 0.0,1907,PACU1_1_190828_130048.csv###
###Input 703 / 3888: 0.0,8370,PACU1_8_201030_152421.csv###
###Input 704 / 3888: 0.0,5172,PACU1_3_200515_122708.csv###
###Input 705 / 3888: 0.0,5140,PACU1_6_200514_123829.csv###
###Input 706 / 3888: 0.0,7840,PACU1_3_201007_022913.csv###
###Input 707 / 3888: 0.0,7068,PACU1_2_200817_175241.csv###
###Input 708 / 3888: 0.0,638,PACU1_5_190527_163035.csv###
###Input 709 / 3888: 0.0,144,PACU1_8_190429_142254.csv###
###Input 710 / 3888: 0.0,1433,PACU1_5_190723_113444.csv###
###Input 711 / 3888: 0.0,2890,PACU1_6_200109_170925.csv###
###Input 712 / 3888: 0.0,1498,PACU1_3_190816_133726.csv###
###Input 713 / 3888: 0.0,6324,PACU1_9_200707_140037.csv###
###Input 714 / 3888: 0.0,4497,PACU1_3_200313_140242.csv###
###Input 715 / 3888: 0.0,1644,PACU1_11_190820_164140.csv##

###Input 837 / 3888: 0.0,3677,PACU1_2_200213_120622.csv###
###Input 838 / 3888: 0.0,7344,PACU1_7_200911_102430.csv###
###Input 839 / 3888: 0.0,7667,PACU1_2_200924_131336.csv###
###Input 840 / 3888: 0.0,2739,PACU1_4_200103_103109.csv###
###Input 841 / 3888: 0.0,3405,PACU1_7_200131_165556.csv###
###Input 842 / 3888: 0.0,4190,PACU1_3_200302_180745.csv###
###Input 843 / 3888: 0.0,5813,PACU1_3_200611_163230.csv###
###Input 844 / 3888: 0.0,6464,PACU1_1_200715_114908.csv###
###Input 845 / 3888: 0.0,5894,PACU1_7_200615_164432.csv###
###Input 846 / 3888: 0.0,5404,PACU1_7_200527_123142.csv###
###Input 847 / 3888: 0.0,8122,PACU1_9_201020_170023.csv###
###Input 848 / 3888: 0.0,7643,PACU1_2_200923_213528.csv###
###Input 849 / 3888: 0.0,4721,PACU1_3_200322_124737.csv###
###Input 850 / 3888: 0.0,3716,PACU1_2_200214_030259.csv###
###Input 851 / 3888: 0.0,4562,PACU1_1_200317_163439.csv###
###Input 852 / 3888: 0.0,5752,PACU1_8_200609_160132.csv###
###Input 853 / 3888: 0.0,6264,PACU1_11_200703_143402.csv

###Input 976 / 3888: 0.0,7277,PACU1_5_200902_151322.csv###
###Input 977 / 3888: 0.0,2070,PACU1_2_190903_151956.csv###
###Input 978 / 3888: 0.0,4261,PACU1_6_200304_150340.csv###
###Input 979 / 3888: 0.0,8207,PACU1_2_201023_191347.csv###
###Input 980 / 3888: 0.0,7395,PACU1_2_200915_112039.csv###
###Input 981 / 3888: 0.0,22,PACU1_2_190410_084530.csv###
###Input 982 / 3888: 0.0,6650,PACU1_7_200723_093047.csv###
###Input 983 / 3888: 0.0,7351,PACU1_3_200911_121317.csv###
###Input 984 / 3888: 0.0,7028,PACU1_1_200813_171930.csv###
###Input 985 / 3888: 0.0,5016,PACU1_2_200508_130035.csv###
###Input 986 / 3888: 0.0,1023,PACU1_7_190708_144853.csv###
###Input 987 / 3888: 0.0,2231,PACU1_1_190906_153832.csv###
###Input 988 / 3888: 0.0,5255,PACU1_4_200519_173556.csv###
###Input 989 / 3888: 0.0,1929,PACU1_5_190828_204204.csv###
###Input 990 / 3888: 0.0,802,PACU1_12_190607_134925.csv###
###Input 991 / 3888: 0.0,647,PACU1_3_190528_000714.csv###
###Input 992 / 3888: 0.0,506,PACU1_6_190516_113813.csv###
#

In [5]:
import os, sys
import pickle
import pandas as pd
import numpy as np

# Helper used to resample every peak-to-peak wave to the same length
def linear_connection(list, idx):
    """Linearly interpolate a sequence at a fractional position.

    Parameters
    ----------
    list : sequence of numbers
        Sampled values. The parameter name shadows the builtin ``list``; it is
        kept unchanged so existing callers are not affected.
    idx : int or float
        Position to evaluate, where ``0`` is the first sample and
        ``len(list) - 1`` is the last one.

    Returns
    -------
    number
        ``list[i] + (list[i+1] - list[i]) * (idx - i)`` with ``i = int(idx)``.

    Raises
    ------
    IndexError
        If ``idx`` lies beyond the last sample.
    """
    int_idx = int(idx)
    frac = idx - int_idx
    # Exactly on the last sample there is no right-hand neighbour to read;
    # the interpolated value is simply that sample.
    if frac == 0 and int_idx == len(list) - 1:
        return list[int_idx]
    return list[int_idx] + (list[int_idx+1] - list[int_idx]) * frac


# Input configuration
LEN_INPUT = 20     # length of one input window, in seconds
LEN_PER_NRS = 120  # vital length for each NRS
OVERLAP = 2        # multiplied by the window index to place each window start
# Number of windows (augmented samples) cut from one record
n_aug = (LEN_PER_NRS - LEN_INPUT)//OVERLAP + 1


# Directory that holds the vital data files
vital_path = '../../cranberry2/Preprocessing/preop_vital/preop'
f_vital_list = os.listdir(vital_path)

# DataFrame that collects the preprocessing information:
# a 'file_path' column followed by one column per window, named '1' .. str(n_aug)
column_list = ['file_path', *map(str, range(1, n_aug + 1))]
df_preprocess = pd.DataFrame(columns=column_list)


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed even if
    unpickling fails.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; only use
    # this on cache files produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _interval_noise_ratio(seg, idx_peak, len_wave=200):
    """Return the fraction of peak-to-peak intervals of one signal judged noisy.

    Parameters
    ----------
    seg : list of float
        Samples of one window (may contain NaN).
    idx_peak : sequence of int
        Peak positions relative to the start of ``seg``; needs at least two.
    len_wave : int
        Number of points every interval is resampled to before comparison.

    An interval is flagged when either
    (2.1) its length is not strictly between 40 and 200 samples
          (HR 30~150 -> beat interval 0.4 s ~ 2 s, times the sampling rate), or
    (2.2) its resampled shape correlates below 0.8 with the mean shape of all
          intervals in the window.
    """
    n_interval = len(idx_peak) - 1

    # 2.1 implausible peak spacing
    is_noise = [not 40 < idx_peak[k+1] - idx_peak[k] < 200 for k in range(n_interval)]

    # 2.2 abnormal shape: resample every interval to len_wave points
    norm_waves = []
    for k in range(n_interval):
        len_interval = idx_peak[k+1] - idx_peak[k]
        # Slice once per interval; the slice includes both bounding peaks.
        beat = seg[idx_peak[k]:idx_peak[k+1]+1]
        norm_waves.append([linear_connection(beat, n/len_wave*len_interval) for n in range(len_wave)])

    # Mean shape over all intervals, then fill remaining NaN forward/backward
    mean_wave = np.nanmean(norm_waves, axis=0)
    mean_wave = pd.DataFrame(mean_wave).ffill(axis=0).bfill(axis=0).values.flatten()
    norm_waves = pd.DataFrame(norm_waves).ffill(axis=1).bfill(axis=1).values

    for k in range(n_interval):
        # NOTE(review): a NaN correlation (constant or all-NaN wave) compares
        # False here, so such an interval is NOT flagged by this test.
        if np.corrcoef(norm_waves[k], mean_wave)[0,1] < 0.8:
            is_noise[k] = True

    return np.sum(is_noise) / n_interval


SRATE, f_num = 100, 0

# Output file for the preprocessing table; make sure its directory exists so a
# long run does not fail only when the result is written.
cache_path = 'cache/input3/df_preprocess_preop_0-1000'
os.makedirs(os.path.dirname(cache_path), exist_ok=True)

n_sample = LEN_INPUT*SRATE  # samples per window

for f_vital in f_vital_list[0:1000]:
    f_num += 1

    print('###Input', f_num,'/ '+str(len(f_vital_list))+': '+f_vital+'###')

    # Locate the cached PPG / ECG peaks.
    # Files without cached peaks are cases that have no ECG or PPG data.
    ppg_peak_path = '../../cranberry2/Preprocessing/cache/PPG_peaks/'+f_vital
    ecg_peak_path = '../../cranberry2/Preprocessing/cache/ECG_peaks/'+f_vital
    if not os.path.exists(ppg_peak_path):
        print('no existing PPG peaks: ', f_vital)
        continue
    if not os.path.exists(ecg_peak_path):
        print('no existing ECG peaks: ', f_vital)
        continue

    # Load the vital data
    df_vital = _load_pickle(vital_path+'/'+f_vital).reset_index()

    # Create a new row in the result table
    df_preprocess.loc[f_num-1,'file_path'] = f_vital

    ppg_min, ppg_peak = _load_pickle(ppg_peak_path)
    ecg_peak = _load_pickle(ecg_peak_path)

    # Wrapped into 2-D arrays; the boolean masks below flatten the selection
    # back to 1-D.
    ppg_min, ppg_peak = np.array([ppg_min]), np.array([ppg_peak])
    ecg_peak = np.array([ecg_peak])

    # Cut the record into windows of LEN_INPUT seconds
    for i in range(n_aug):
        start_idx = i*OVERLAP*SRATE                # window start, in samples
        end_idx = (i*OVERLAP + LEN_INPUT)*SRATE    # window end (exclusive)

        # Pre-fill with NaN so records shorter than the window count as missing
        seg_ppg, seg_ecg = [np.nan]*n_sample, [np.nan]*n_sample
        df_vital_input = df_vital.loc[start_idx:end_idx-1]
        seg_ppg[0:len(df_vital_input)] = df_vital_input['Pleth'].tolist()
        seg_ecg[0:len(df_vital_input)] = df_vital_input['ECG'].tolist()

        ### 1. Missing values ###
        # df.isnull().sum() would be simpler, but it would not count samples
        # that are absent because the record is shorter than the window.
        nan_ppg_list = np.isnan(seg_ppg)
        nan_ecg_list = np.isnan(seg_ecg)
        nan_ppg_perc = np.sum(nan_ppg_list) / LEN_INPUT / SRATE
        nan_ecg_perc = np.sum(nan_ecg_list) / LEN_INPUT / SRATE

        # Fraction of samples where ECG and PPG are both missing
        nan_both_perc = int(np.sum(nan_ppg_list & nan_ecg_list)) / (LEN_INPUT*SRATE)

        # Missing-value ratios of this window
        nan_info = [nan_ppg_perc, nan_ecg_perc, nan_both_perc]

        # Too many missing values: reject without checking noise;
        # noise_info is recorded as -1 in this case.
        if nan_ppg_perc > 0.3 or nan_ecg_perc > 0.3 or nan_both_perc > 0.2:
            df_preprocess.loc[f_num-1,str(i+1)] = (False, nan_info, [-1, -1])
            continue

        ### 2. Noise ###
        # Peak positions inside this window, relative to the window start
        idx_ppg_peak = ppg_peak[(start_idx<=ppg_peak) & (ppg_peak<end_idx)] - start_idx
        idx_ecg_peak = ecg_peak[(start_idx<=ecg_peak) & (ecg_peak<end_idx)] - start_idx

        # Too few peaks means the window is dominated by noise: reject without
        # the detailed noise computation; noise_info is recorded as -2.
        # NOTE(review): the original comments justified this threshold for a
        # 10 s window; confirm it is still intended for LEN_INPUT-second windows.
        if len(idx_ppg_peak)<=4 or len(idx_ecg_peak)<=4:
            df_preprocess.loc[f_num-1,str(i+1)] = (False, nan_info, [-2, -2])
            continue

        noise_ppg_perc = _interval_noise_ratio(seg_ppg, idx_ppg_peak)
        noise_ecg_perc = _interval_noise_ratio(seg_ecg, idx_ecg_peak)

        # Noise ratios of this window
        noise_info = [noise_ppg_perc, noise_ecg_perc]

        # Whether this window may be used as a model input
        bool_pass = bool(nan_ppg_perc < 0.3 and nan_ecg_perc < 0.3 and nan_both_perc < 0.2
                         and noise_ppg_perc < 0.5 and noise_ecg_perc < 0.5)

        # Store the result for this window
        df_preprocess.loc[f_num-1,str(i+1)] = (bool_pass, nan_info, noise_info)

    # Periodic checkpoint
    if f_num%1000 == 0:
        print('dumping cache of d_preprocess -', f_num, '/ '+str(len(f_vital_list)))
        with open(cache_path, 'wb') as f:
            pickle.dump(df_preprocess, f)

# Final dump; the with-block guarantees the file is flushed and closed
print('dumping cache of d_preprocess -', f_num, '/ '+str(len(f_vital_list)))
with open(cache_path, 'wb') as f:
    pickle.dump(df_preprocess, f)


###Input 1 / 3888: 0.0,7304,PACU1_3_200909_132401.csv###
###Input 2 / 3888: 0.0,5131,PACU1_1_200514_104348.csv###
###Input 3 / 3888: 0.0,2222,PACU1_7_190906_142446.csv###
###Input 4 / 3888: 0.0,758,PACU1_5_190531_124635.csv###
###Input 5 / 3888: 0.0,5,PACU1_1_190409_133948.csv###
###Input 6 / 3888: 0.0,7270,PACU1_6_200902_103630.csv###
###Input 7 / 3888: 0.0,6780,PACU1_1_200730_115233.csv###
###Input 8 / 3888: 0.0,5525,PACU1_1_200601_131308.csv###
###Input 9 / 3888: 0.0,5722,PACU1_1_200608_192708.csv###
###Input 10 / 3888: 0.0,2707,PACU1_2_200102_132423.csv###
###Input 11 / 3888: 0.0,3390,PACU1_7_200131_095818.csv###
###Input 12 / 3888: 0.0,1480,PACU1_8_190816_110423.csv###
###Input 13 / 3888: 0.0,5219,PACU1_3_200518_114244.csv###
###Input 14 / 3888: 0.0,3601,PACU1_5_200210_104932.csv###
###Input 15 / 3888: 0.0,5763,PACU1_7_200610_091333.csv###
###Input 16 / 3888: 0.0,2646,PACU1_7_191219_165512.csv###
###Input 17 / 3888: 0.0,2034,PACU1_11_190902_153658.csv###
###Input 18 / 3888: 0.0,40

###Input 142 / 3888: 0.0,1211,PACU1_12_190715_163915.csv###
###Input 143 / 3888: 0.0,8199,PACU1_7_201023_154625.csv###
###Input 144 / 3888: 0.0,2053,PACU1_10_190903_102809.csv###
###Input 145 / 3888: 0.0,1405,PACU1_3_190721_160005.csv###
###Input 146 / 3888: 0.0,5562,PACU1_12_200602_163444.csv###
###Input 147 / 3888: 0.0,1210,PACU1_7_190715_163630.csv###
###Input 148 / 3888: 0.0,2079,PACU1_7_190903_162904.csv###
###Input 149 / 3888: 0.0,6238,PACU1_4_200701_212954.csv###
###Input 150 / 3888: 0.0,3892,PACU1_8_200220_100522.csv###
###Input 151 / 3888: 0.0,2077,PACU1_4_190903_164828.csv###
###Input 152 / 3888: 0.0,7930,PACU1_1_201008_183613.csv###
###Input 153 / 3888: 0.0,2518,PACU1_7_191023_123318.csv###
###Input 154 / 3888: 0.0,501,PACU1_7_190516_110712.csv###
###Input 155 / 3888: 0.0,3148,PACU1_9_200120_155532.csv###
###Input 156 / 3888: 0.0,4085,PACU1_8_200228_093013.csv###
###Input 157 / 3888: 0.0,6631,PACU1_11_200722_125355.csv###
###Input 158 / 3888: 0.0,6317,PACU1_10_200707_113409.

###Input 281 / 3888: 0.0,5546,PACU1_4_200602_114919.csv###
###Input 282 / 3888: 0.0,2429,PACU1_11_191017_095605.csv###
###Input 283 / 3888: 0.0,5893,PACU1_1_200615_161528.csv###
###Input 284 / 3888: 0.0,3620,PACU1_6_200210_172042.csv###
###Input 285 / 3888: 0.0,6164,PACU1_5_200629_112456.csv###
###Input 286 / 3888: 0.0,5292,PACU1_10_200521_103234.csv###
###Input 287 / 3888: 0.0,642,PACU1_4_190527_165508.csv###
###Input 288 / 3888: 0.0,1361,PACU1_7_190719_095819.csv###
###Input 289 / 3888: 0.0,3312,PACU1_5_200129_095120.csv###
###Input 290 / 3888: 0.0,3183,PACU1_2_200121_164252.csv###
###Input 291 / 3888: 0.0,5577,PACU1_4_200602_212449.csv###
###Input 292 / 3888: 0.0,3246,PACU1_10_200122_170924.csv###
###Input 293 / 3888: 0.0,5233,PACU1_5_200519_103309.csv###
###Input 294 / 3888: 0.0,4781,PACU1_9_200325_105720.csv###
###Input 295 / 3888: 0.0,6539,PACU1_5_200717_092906.csv###
###Input 296 / 3888: 0.0,5242,PACU1_8_200519_150417.csv###
###Input 297 / 3888: 0.0,1984,PACU1_2_190830_090741.cs

###Input 420 / 3888: 0.0,7081,PACU1_4_200818_130212.csv###
###Input 421 / 3888: 0.0,7331,PACU1_7_200910_180619.csv###
###Input 422 / 3888: 0.0,7241,PACU1_3_200828_085619.csv###
###Input 423 / 3888: 0.0,5127,PACU1_4_200513_235541.csv###
###Input 424 / 3888: 0.0,7919,PACU1_6_201008_164118.csv###
###Input 425 / 3888: 0.0,972,PACU1_5_190704_215924.csv###
###Input 426 / 3888: 0.0,2632,PACU1_12_191219_155006.csv###
###Input 427 / 3888: 0.0,6,PACU1_1_190409_144536.csv###
###Input 428 / 3888: 0.0,7954,PACU1_1_201012_174400.csv###
###Input 429 / 3888: 0.0,5474,PACU1_3_200529_001932.csv###
###Input 430 / 3888: 0.0,7150,PACU1_10_200820_154629.csv###
###Input 431 / 3888: 0.0,1802,PACU1_6_190822_191832.csv###
###Input 432 / 3888: 0.0,8185,PACU1_9_201022_124809.csv###
###Input 433 / 3888: 0.0,4807,PACU1_3_200325_194331.csv###
###Input 434 / 3888: 0.0,7406,PACU1_4_200915_155928.csv###
###Input 435 / 3888: 0.0,4286,PACU1_7_200305_121327.csv###
###Input 436 / 3888: 0.0,3394,PACU1_5_200131_105732.csv###

###Input 559 / 3888: 0.0,7373,PACU1_3_200911_195111.csv###
###Input 560 / 3888: 0.0,1307,PACU1_10_190718_120140.csv###
###Input 561 / 3888: 0.0,7116,PACU1_2_200819_152037.csv###
###Input 562 / 3888: 0.0,1516,PACU1_10_190816_154308.csv###
###Input 563 / 3888: 0.0,4963,PACU1_9_200507_091727.csv###
###Input 564 / 3888: 0.0,5765,PACU1_1_200610_093053.csv###
###Input 565 / 3888: 0.0,3373,PACU1_6_200130_192428.csv###
###Input 566 / 3888: 0.0,3086,PACU1_1_200116_203852.csv###
###Input 567 / 3888: 0.0,553,PACU1_8_190517_130825.csv###
###Input 568 / 3888: 0.0,310,PACU1_2_190508_215132.csv###
###Input 569 / 3888: 0.0,334,PACU1_3_190509_152745.csv###
###Input 570 / 3888: 0.0,5438,PACU1_1_200528_121801.csv###
###Input 571 / 3888: 0.0,825,PACU1_8_190611_101343.csv###
###Input 572 / 3888: 0.0,7362,PACU1_3_200911_163759.csv###
###Input 573 / 3888: 0.0,4383,PACU1_4_200309_171129.csv###
###Input 574 / 3888: 0.0,2974,PACU1_5_200113_175259.csv###
###Input 575 / 3888: 0.0,5810,PACU1_1_200611_160159.csv###

###Input 699 / 3888: 0.0,4874,PACU1_3_200330_104140.csv###
###Input 700 / 3888: 0.0,4998,PACU1_9_200508_093204.csv###
###Input 701 / 3888: 0.0,4681,PACU1_4_200320_001621.csv###
###Input 702 / 3888: 0.0,1907,PACU1_1_190828_130048.csv###
###Input 703 / 3888: 0.0,8370,PACU1_8_201030_152421.csv###
###Input 704 / 3888: 0.0,5172,PACU1_3_200515_122708.csv###
###Input 705 / 3888: 0.0,5140,PACU1_6_200514_123829.csv###
###Input 706 / 3888: 0.0,7840,PACU1_3_201007_022913.csv###
###Input 707 / 3888: 0.0,7068,PACU1_2_200817_175241.csv###
###Input 708 / 3888: 0.0,638,PACU1_5_190527_163035.csv###
###Input 709 / 3888: 0.0,144,PACU1_8_190429_142254.csv###
###Input 710 / 3888: 0.0,1433,PACU1_5_190723_113444.csv###
###Input 711 / 3888: 0.0,2890,PACU1_6_200109_170925.csv###
###Input 712 / 3888: 0.0,1498,PACU1_3_190816_133726.csv###
###Input 713 / 3888: 0.0,6324,PACU1_9_200707_140037.csv###
###Input 714 / 3888: 0.0,4497,PACU1_3_200313_140242.csv###
###Input 715 / 3888: 0.0,1644,PACU1_11_190820_164140.csv##

###Input 837 / 3888: 0.0,3677,PACU1_2_200213_120622.csv###
###Input 838 / 3888: 0.0,7344,PACU1_7_200911_102430.csv###
###Input 839 / 3888: 0.0,7667,PACU1_2_200924_131336.csv###
###Input 840 / 3888: 0.0,2739,PACU1_4_200103_103109.csv###
###Input 841 / 3888: 0.0,3405,PACU1_7_200131_165556.csv###
###Input 842 / 3888: 0.0,4190,PACU1_3_200302_180745.csv###
###Input 843 / 3888: 0.0,5813,PACU1_3_200611_163230.csv###
###Input 844 / 3888: 0.0,6464,PACU1_1_200715_114908.csv###
###Input 845 / 3888: 0.0,5894,PACU1_7_200615_164432.csv###
###Input 846 / 3888: 0.0,5404,PACU1_7_200527_123142.csv###
###Input 847 / 3888: 0.0,8122,PACU1_9_201020_170023.csv###
###Input 848 / 3888: 0.0,7643,PACU1_2_200923_213528.csv###
###Input 849 / 3888: 0.0,4721,PACU1_3_200322_124737.csv###
###Input 850 / 3888: 0.0,3716,PACU1_2_200214_030259.csv###
###Input 851 / 3888: 0.0,4562,PACU1_1_200317_163439.csv###
###Input 852 / 3888: 0.0,5752,PACU1_8_200609_160132.csv###
###Input 853 / 3888: 0.0,6264,PACU1_11_200703_143402.csv

###Input 976 / 3888: 0.0,7277,PACU1_5_200902_151322.csv###
###Input 977 / 3888: 0.0,2070,PACU1_2_190903_151956.csv###
###Input 978 / 3888: 0.0,4261,PACU1_6_200304_150340.csv###
###Input 979 / 3888: 0.0,8207,PACU1_2_201023_191347.csv###
###Input 980 / 3888: 0.0,7395,PACU1_2_200915_112039.csv###
###Input 981 / 3888: 0.0,22,PACU1_2_190410_084530.csv###
###Input 982 / 3888: 0.0,6650,PACU1_7_200723_093047.csv###
###Input 983 / 3888: 0.0,7351,PACU1_3_200911_121317.csv###
###Input 984 / 3888: 0.0,7028,PACU1_1_200813_171930.csv###
###Input 985 / 3888: 0.0,5016,PACU1_2_200508_130035.csv###
###Input 986 / 3888: 0.0,1023,PACU1_7_190708_144853.csv###
###Input 987 / 3888: 0.0,2231,PACU1_1_190906_153832.csv###
###Input 988 / 3888: 0.0,5255,PACU1_4_200519_173556.csv###
###Input 989 / 3888: 0.0,1929,PACU1_5_190828_204204.csv###
###Input 990 / 3888: 0.0,802,PACU1_12_190607_134925.csv###
###Input 991 / 3888: 0.0,647,PACU1_3_190528_000714.csv###
###Input 992 / 3888: 0.0,506,PACU1_6_190516_113813.csv###
#

In [7]:
import os, sys
import pickle
import pandas as pd
import numpy as np

# Helper used to resample every peak-to-peak wave to the same length
def linear_connection(list, idx):
    """Linearly interpolate a sequence at a fractional position.

    Parameters
    ----------
    list : sequence of numbers
        Sampled values. The parameter name shadows the builtin ``list``; it is
        kept unchanged so existing callers are not affected.
    idx : int or float
        Position to evaluate, where ``0`` is the first sample and
        ``len(list) - 1`` is the last one.

    Returns
    -------
    number
        ``list[i] + (list[i+1] - list[i]) * (idx - i)`` with ``i = int(idx)``.

    Raises
    ------
    IndexError
        If ``idx`` lies beyond the last sample.
    """
    int_idx = int(idx)
    frac = idx - int_idx
    # Exactly on the last sample there is no right-hand neighbour to read;
    # the interpolated value is simply that sample.
    if frac == 0 and int_idx == len(list) - 1:
        return list[int_idx]
    return list[int_idx] + (list[int_idx+1] - list[int_idx]) * frac


# Windowing configuration (durations in seconds).
LEN_PER_NRS = 120  # vital length for each NRS
LEN_INPUT = 20  # length of one input window
OVERLAP = 2  # offset between the starts of consecutive windows
# Number of windows (augmented samples) cut from one LEN_PER_NRS stretch.
n_aug = 1 + int((LEN_PER_NRS - LEN_INPUT) / OVERLAP)


# Directory where the vital data is stored, and the names of its entries.
vital_path = '../../cranberry2/Preprocessing/preop_vital/preop'
f_vital_list = os.listdir(vital_path)

# DataFrame that collects the preprocessing information: the file name plus
# one column per window, labelled '1' .. str(n_aug).
column_list = ['file_path'] + list(map(str, range(1, n_aug + 1)))  # + ['NRS']
df_preprocess = pd.DataFrame(columns=column_list)


# Grade every sliding window of every recording from list position 3000 on.
# SRATE converts seconds into sample indices; f_num is the 1-based position
# of the current file within f_vital_list.
SRATE, f_num = 100, 3000
for f_vital in f_vital_list[3000:]:
    f_num += 1

    print('###Input', f_num,'/ '+str(len(f_vital_list))+': '+f_vital+'###')

    # Locate the cached PPG / ECG peaks.
    # Cases without a peak cache are cases without ECG or PPG data.
    ppg_peak_path = '../../cranberry2/Preprocessing/cache/PPG_peaks/'+f_vital
    ecg_peak_path = '../../cranberry2/Preprocessing/cache/ECG_peaks/'+f_vital
    if not os.path.exists(ppg_peak_path):
        print('no existing PPG peaks: ', f_vital)
        continue
    if not os.path.exists(ecg_peak_path):
        print('no existing ECG peaks: ', f_vital)
        continue

    # Load the vital data.
    # NOTE: pickle executes arbitrary code on load; only use on trusted,
    # locally generated cache files.
    with open(vital_path+'/'+f_vital, 'rb') as fh:
        df_vital = pickle.load(fh).reset_index()

    # Create a new row in the dataframe
    df_preprocess.loc[f_num-1,'file_path'] = f_vital

    with open(ppg_peak_path, 'rb') as fh:
        ppg_min, ppg_peak = pickle.load(fh)
    with open(ecg_peak_path, 'rb') as fh:
        ecg_peak = pickle.load(fh)

    # The extra list wrapper adds a leading axis; the boolean masks below
    # flatten the selection back to 1-D (assumes each cache is a flat
    # sequence of sample indices - TODO confirm).
    ppg_min, ppg_peak = np.array([ppg_min]), np.array([ppg_peak])
    ecg_peak = np.array([ecg_peak])

    # Cut LEN_INPUT-second windows whose starts are OVERLAP seconds apart
    for i in range(n_aug):
        start_idx = i*OVERLAP*SRATE  # first sample of window i
        end_idx = (i*OVERLAP + LEN_INPUT)*SRATE  # one past its last sample

        # Start from all-NaN buffers so that samples missing from a short
        # recording are counted as NaN below.
        seg_ppg = [np.nan] * (LEN_INPUT*SRATE)
        seg_ecg = [np.nan] * (LEN_INPUT*SRATE)
        df_vital_input = df_vital.loc[start_idx:end_idx-1]
        seg_ppg[0:len(df_vital_input)] = df_vital_input['Pleth'].tolist()
        seg_ecg[0:len(df_vital_input)] = df_vital_input['ECG'].tolist()

        ### 1. Missing values ###
        # df.isnull().sum() would be simpler, but it would not cover vital
        # data that is shorter than 120 s for its NRS in the first place.
        nan_ppg_list = np.isnan(seg_ppg)
        nan_ecg_list = np.isnan(seg_ecg)
        nan_ppg_perc = np.sum(nan_ppg_list) / LEN_INPUT / SRATE
        nan_ecg_perc = np.sum(nan_ecg_list) / LEN_INPUT / SRATE

        # Fraction of samples where ECG and PPG are both missing
        nan_both_perc = np.count_nonzero(nan_ppg_list & nan_ecg_list) / (LEN_INPUT*SRATE)

        # NaN-ratio information of this segment
        nan_info = [nan_ppg_perc, nan_ecg_perc, nan_both_perc]

        # Too many NaNs: reject without looking at noise - noise_info is
        # recorded as -1 in this case
        if nan_ppg_perc > 0.3 or nan_ecg_perc > 0.3 or nan_both_perc > 0.2:
            df_preprocess.loc[f_num-1,str(i+1)] = (False, nan_info, [-1, -1])
            continue

        ### 2. Noise ###
        # PPG / ECG peak indices inside the window, relative to its start
        idx_ppg_peak = ppg_peak[(start_idx<=ppg_peak) & (ppg_peak<end_idx)] - start_idx
        idx_ecg_peak = ecg_peak[(start_idx<=ecg_peak) & (ecg_peak<end_idx)] - start_idx

        # Too few peaks means a very noisy window: reject without the
        # detailed noise computation - noise_info is recorded as -2.
        # NOTE(review): the original rationale (peaks must cover at least
        # 7 s, i.e. at least 7/2 = 3.5 peaks) was written for a 10 s window;
        # confirm the threshold for the current LEN_INPUT.
        if len(idx_ppg_peak)<=4 or len(idx_ecg_peak)<=4:
            df_preprocess.loc[f_num-1,str(i+1)] = (False, nan_info, [-2, -2])
            continue

        # 2.1 Abnormal peak spacing (HR 30~150 -> HBI 0.4 s ~ 2 s, times
        # SRATE gives 40~200 samples). One flag per peak-to-peak interval,
        # so each list has len(peaks) - 1 entries.
        bool_noise_ppg = [not 40 < gap < 200 for gap in np.diff(idx_ppg_peak)]
        bool_noise_ecg = [not 40 < gap < 200 for gap in np.diff(idx_ecg_peak)]

        # 2.2 Abnormal wave shape
        # Resample every peak-to-peak wave to the same length (2 s = 200)
        len_wave = 200
        norm_seg_ppg, norm_seg_ecg = [], []

        for k in range(len(bool_noise_ppg)):
            len_interval_ppg = idx_ppg_peak[k+1] - idx_ppg_peak[k]
            # Slice once per beat instead of once per resampled point
            beat_ppg = seg_ppg[idx_ppg_peak[k]:idx_ppg_peak[k+1]+1]
            norm_seg_ppg.append([linear_connection(beat_ppg, n/len_wave*len_interval_ppg) for n in range(len_wave)])

        for k in range(len(bool_noise_ecg)):
            len_interval_ecg = idx_ecg_peak[k+1] - idx_ecg_peak[k]
            beat_ecg = seg_ecg[idx_ecg_peak[k]:idx_ecg_peak[k+1]+1]
            norm_seg_ecg.append([linear_connection(beat_ecg, n/len_wave*len_interval_ecg) for n in range(len_wave)])

        # Correlation of each wave with the mean wave - PPG
        # (NaNs are forward- then backward-filled before correlating)
        mean_wave_ppg = np.nanmean(norm_seg_ppg, axis = 0)
        mean_wave_ppg = pd.DataFrame(mean_wave_ppg).ffill(axis=0).bfill(axis=0).values.flatten()
        norm_seg_ppg = pd.DataFrame(norm_seg_ppg).ffill(axis=1).bfill(axis=1).values
        for k in range(len(bool_noise_ppg)):
            if np.corrcoef(norm_seg_ppg[k], mean_wave_ppg)[0,1] < 0.8:
                bool_noise_ppg[k] = True
        noise_ppg_perc = np.sum(bool_noise_ppg) / len(bool_noise_ppg)

        # Correlation of each wave with the mean wave - ECG
        mean_wave_ecg = np.nanmean(norm_seg_ecg, axis = 0)
        mean_wave_ecg = pd.DataFrame(mean_wave_ecg).ffill(axis=0).bfill(axis=0).values.flatten()
        norm_seg_ecg = pd.DataFrame(norm_seg_ecg).ffill(axis=1).bfill(axis=1).values
        for k in range(len(bool_noise_ecg)):
            if np.corrcoef(norm_seg_ecg[k], mean_wave_ecg)[0,1] < 0.8:
                bool_noise_ecg[k] = True
        noise_ecg_perc = np.sum(bool_noise_ecg) / len(bool_noise_ecg)

        # Noise-ratio information of this segment
        noise_info = [noise_ppg_perc, noise_ecg_perc]

        # Whether the segment may be used as model input
        # (bool() keeps the stored flag a plain Python bool)
        bool_pass = bool(
            nan_ppg_perc < 0.3 and nan_ecg_perc < 0.3 and nan_both_perc < 0.2
            and noise_ppg_perc < 0.5 and noise_ecg_perc < 0.5
        )

        # Store this segment's verdict in the dataframe
        df_preprocess.loc[f_num-1,str(i+1)] = (bool_pass, nan_info, noise_info)

    # Periodic checkpoint. Files skipped above via `continue` never reach
    # this point.
    if f_num%1000 == 0:
        print('dumping cache of d_preprocess -', f_num, '/', len(f_vital_list))
        with open('cache/input3/df_preprocess_preop_3000-4000', 'wb') as fh:
            pickle.dump(df_preprocess, fh)
        
# Final save of the collected preprocessing table. The context manager
# guarantees the file is flushed and closed once the dump finishes.
print('dumping cache of d_preprocess -', f_num, '/', len(f_vital_list))
with open('cache/input3/df_preprocess_preop_3000-4000', 'wb') as fh:
    pickle.dump(df_preprocess, fh)


###Input 3001 / 3888: 0.0,8028,PACU1_4_201014_160858.csv###
###Input 3002 / 3888: 0.0,511,PACU1_4_190516_123713.csv###
###Input 3003 / 3888: 0.0,3844,PACU1_10_200219_113830.csv###
###Input 3004 / 3888: 0.0,4048,PACU1_8_200225_172129.csv###
###Input 3005 / 3888: 0.0,4933,PACU1_2_200331_204748.csv###
###Input 3006 / 3888: 0.0,2959,PACU1_5_200113_095557.csv###
###Input 3007 / 3888: 0.0,6956,PACU1_9_200811_103423.csv###
###Input 3008 / 3888: 0.0,2446,PACU1_3_191018_110127.csv###
###Input 3009 / 3888: 0.0,4483,PACU1_1_200313_094749.csv###
###Input 3010 / 3888: 0.0,2223,PACU1_8_190906_145350.csv###
###Input 3011 / 3888: 0.0,8251,PACU1_6_201027_095822.csv###
###Input 3012 / 3888: 0.0,247,PACU1_7_190503_125635.csv###
###Input 3013 / 3888: 0.0,1902,PACU1_2_190828_105401.csv###
###Input 3014 / 3888: 0.0,4237,PACU1_4_200304_005917.csv###
###Input 3015 / 3888: 0.0,4627,PACU1_1_200318_152543.csv###
###Input 3016 / 3888: 0.0,1940,PACU1_5_190829_110931.csv###
###Input 3017 / 3888: 0.0,4111,PACU1_9_20

###Input 3138 / 3888: 0.0,5978,PACU1_10_200617_155547.csv###
###Input 3139 / 3888: 0.0,4437,PACU1_7_200311_131721.csv###
###Input 3140 / 3888: 0.0,6783,PACU1_1_200730_124247.csv###
###Input 3141 / 3888: 0.0,1965,PACU1_7_190829_163443.csv###
###Input 3142 / 3888: 0.0,1320,PACU1_8_190718_142340.csv###
###Input 3143 / 3888: 0.0,3575,PACU1_1_200207_144731.csv###
###Input 3144 / 3888: 0.0,4692,PACU1_9_200320_114058.csv###
###Input 3145 / 3888: 0.0,6901,PACU1_6_200806_094740.csv###
###Input 3146 / 3888: 0.0,4077,PACU1_3_200228_001842.csv###
###Input 3147 / 3888: 0.0,7865,PACU1_2_201007_141545.csv###
###Input 3148 / 3888: 0.0,5961,PACU1_8_200617_093624.csv###
###Input 3149 / 3888: 0.0,4031,PACU1_7_200225_113451.csv###
###Input 3150 / 3888: 0.0,8294,PACU1_3_201028_014517.csv###
###Input 3151 / 3888: 0.0,1636,PACU1_2_190820_162428.csv###
###Input 3152 / 3888: 0.0,1053,PACU1_6_190709_121303.csv###
###Input 3153 / 3888: 0.0,5029,PACU1_12_200508_164704.csv###
###Input 3154 / 3888: 0.0,6861,PACU1_3

###Input 3275 / 3888: 0.0,6579,PACU1_3_200717_175105.csv###
###Input 3276 / 3888: 0.0,6795,PACU1_10_200730_165305.csv###
###Input 3277 / 3888: 0.0,3742,PACU1_4_200214_171031.csv###
###Input 3278 / 3888: 0.0,6987,PACU1_2_200812_001450.csv###
###Input 3279 / 3888: 0.0,1389,PACU1_11_190719_154255.csv###
###Input 3280 / 3888: 0.0,1169,PACU1_3_190712_110616.csv###
###Input 3281 / 3888: 0.0,1979,PACU1_1_190829_193244.csv###
###Input 3282 / 3888: 0.0,4140,PACU1_3_200302_112011.csv###
###Input 3283 / 3888: 0.0,95,PACU1_1_190424_104547.csv###
###Input 3284 / 3888: 0.0,4024,PACU1_1_200224_200759.csv###
###Input 3285 / 3888: 0.0,7565,PACU1_9_200921_164342.csv###
###Input 3286 / 3888: 0.0,2687,PACU1_1_191231_164630.csv###
###Input 3287 / 3888: 0.0,519,PACU1_7_190516_152710.csv###
###Input 3288 / 3888: 0.0,7466,PACU1_4_200916_210147.csv###
###Input 3289 / 3888: 0.0,1826,PACU1_4_190823_152946.csv###
###Input 3290 / 3888: 0.0,7268,PACU1_3_200902_091802.csv###
###Input 3291 / 3888: 0.0,6846,PACU1_1_20

###Input 3412 / 3888: 0.0,7250,PACU1_4_200830_205923.csv###
###Input 3413 / 3888: 0.0,7263,PACU1_3_200901_163338.csv###
###Input 3414 / 3888: 0.0,6202,PACU1_8_200701_092447.csv###
###Input 3415 / 3888: 0.0,753,PACU1_4_190531_102830.csv###
###Input 3416 / 3888: 0.0,6983,PACU1_4_200811_211336.csv###
###Input 3417 / 3888: 0.0,1127,PACU1_1_190711_085134.csv###
###Input 3418 / 3888: 0.0,7495,PACU1_1_200917_151922.csv###
###Input 3419 / 3888: 0.0,1296,PACU1_1_190718_083748.csv###
###Input 3420 / 3888: 0.0,70,PACU1_3_190417_234040.csv###
###Input 3421 / 3888: 0.0,1231,PACU1_5_190716_105345.csv###
###Input 3422 / 3888: 0.0,870,PACU1_2_190612_153210.csv###
###Input 3423 / 3888: 0.0,1858,PACU1_4_190827_105221.csv###
###Input 3424 / 3888: 0.0,7854,PACU1_9_201007_114705.csv###
###Input 3425 / 3888: 0.0,2876,PACU1_7_200109_121459.csv###
###Input 3426 / 3888: 0.0,1442,PACU1_10_190723_125732.csv###
###Input 3427 / 3888: 0.0,276,PACU1_3_190503_200820.csv###
###Input 3428 / 3888: 0.0,127,PACU1_8_190429

###Input 3549 / 3888: 0.0,5061,PACU1_2_200511_214458.csv###
###Input 3550 / 3888: 0.0,6920,PACU1_3_200806_214508.csv###
###Input 3551 / 3888: 0.0,3948,PACU1_6_200221_114317.csv###
###Input 3552 / 3888: 0.0,7926,PACU1_7_201008_181134.csv###
###Input 3553 / 3888: 0.0,1467,PACU1_1_190724_091731.csv###
###Input 3554 / 3888: 0.0,6055,PACU1_6_200622_171038.csv###
###Input 3555 / 3888: 0.0,1096,PACU1_2_190710_124742.csv###
###Input 3556 / 3888: 0.0,1119,PACU1_5_190710_164522.csv###
###Input 3557 / 3888: 0.0,6058,PACU1_4_200622_202654.csv###
###Input 3558 / 3888: 0.0,6917,PACU1_1_200806_160551.csv###
###Input 3559 / 3888: 0.0,5642,PACU1_9_200604_141315.csv###
###Input 3560 / 3888: 0.0,2402,PACU1_3_190917_173755.csv###
###Input 3561 / 3888: 0.0,1209,PACU1_7_190715_154508.csv###
###Input 3562 / 3888: 0.0,5942,PACU1_10_200616_161900.csv###
###Input 3563 / 3888: 0.0,895,PACU1_1_190617_192522.csv###
###Input 3564 / 3888: 0.0,1013,PACU1_7_190708_094819.csv###
###Input 3565 / 3888: 0.0,541,PACU1_7_19

###Input 3686 / 3888: 0.0,3786,PACU1_4_200217_163223.csv###
###Input 3687 / 3888: 0.0,6995,PACU1_8_200812_095401.csv###
###Input 3688 / 3888: 0.0,4941,PACU1_1_200504_112829.csv###
###Input 3689 / 3888: 0.0,8165,PACU1_1_201022_090538.csv###
###Input 3690 / 3888: 0.0,7003,PACU1_9_200812_123754.csv###
###Input 3691 / 3888: 0.0,2856,PACU1_3_200108_172624.csv###
###Input 3692 / 3888: 0.0,7991,PACU1_4_201014_085629.csv###
###Input 3693 / 3888: 0.0,5952,PACU1_4_200617_000155.csv###
###Input 3694 / 3888: 0.0,4108,PACU1_12_200228_143735.csv###
###Input 3695 / 3888: 0.0,493,PACU1_1_190516_090804.csv###
###Input 3696 / 3888: 0.0,742,PACU1_2_190530_220623.csv###
###Input 3697 / 3888: 0.0,7828,PACU1_4_201006_151048.csv###
###Input 3698 / 3888: 0.0,1033,PACU1_4_190708_200649.csv###
###Input 3699 / 3888: 0.0,4015,PACU1_8_200224_170327.csv###
###Input 3700 / 3888: 0.0,3335,PACU1_11_200129_153633.csv###
###Input 3701 / 3888: 0.0,6433,PACU1_5_200714_165941.csv###
###Input 3702 / 3888: 0.0,2065,PACU1_11_

###Input 3823 / 3888: 0.0,4379,PACU1_1_200309_160055.csv###
###Input 3824 / 3888: 0.0,5589,PACU1_5_200603_104036.csv###
###Input 3825 / 3888: 0.0,507,PACU1_3_190516_120110.csv###
###Input 3826 / 3888: 0.0,7100,PACU1_6_200819_115941.csv###
###Input 3827 / 3888: 0.0,6327,PACU1_3_200707_172721.csv###
###Input 3828 / 3888: 0.0,5475,PACU1_4_200529_003558.csv###
###Input 3829 / 3888: 0.0,1370,PACU1_6_190719_123845.csv###
###Input 3830 / 3888: 0.0,2375,PACU1_3_190916_111008.csv###
###Input 3831 / 3888: 0.0,4443,PACU1_7_200311_143000.csv###
###Input 3832 / 3888: 0.0,4661,PACU1_6_200319_115128.csv###
###Input 3833 / 3888: 0.0,4613,PACU1_8_200318_132838.csv###
###Input 3834 / 3888: 0.0,7009,PACU1_3_200812_160536.csv###
###Input 3835 / 3888: 0.0,467,PACU1_11_190515_150348.csv###
###Input 3836 / 3888: 0.0,5930,PACU1_11_200616_142848.csv###
###Input 3837 / 3888: 0.0,3511,PACU1_5_200205_165602.csv###
###Input 3838 / 3888: 0.0,7090,PACU1_5_200818_182106.csv###
###Input 3839 / 3888: 0.0,729,PACU1_6_19