Outline:

~2,000,000 samples for the training set + ~80,000 for the validation set (no augmentation)

1. Annotation:

   1)  Read through ann.txt. For each R-wave point, store its position in the corresponding set (R/A/V). If the type is X, skip it.
   (annProcess)


2. Sampling:

   1) For each record in lydhdb, choose 18 segments (768 points each). For each segment, use a sliding window (width 512) with a step of 8 points to obtain 256 / 8 = 32 samples, and create a label for each sample.
   
   2) Obtain true noise from each record. Create label
   
   3) Obtain regular noise(sine/square/triangle) with frequency ranging from 1 to 10 and different phases. Create label.


3. Data Augmentation: 

   1) Take 5 segments (512 points each) from the em and ma noise records separately. Apply 2 different coefficients to each of the 10 segments -> obtain 5*2*2 = 20 noise segments in total. (noise_segment)


4. Wavelet decomposition + Store as TFRecord
    
   1) Add each noise segment to each sampled sequence.

In [1]:
import os
import numpy as np
import tensorflow as tf
from Stationary_transform import *
from sklearn.model_selection import train_test_split

### Annotation Preprocessing

In [3]:
def annProcess(path):
    '''Preprocess the annotation file of one record.

    Reads ``path + '/ann.txt'``, where every line is comma separated with the
    beat time in milliseconds in the first field and the beat type in the
    second field.

    Returns three sets of sample indices (time converted with 256 samples/s):
        R: all annotated beats except those of type 'X'
        A: beats of type 'A' (atrial premature beats)
        V: beats of type 'V' (ventricular premature beats)
    '''
    R = set()  # output initialize
    A = set()
    V = set()

    with open(path + '/ann.txt', 'r') as f:  # read annotation
        for line in f:
            fields = line.split(',')
            if len(fields) < 2:  # blank or malformed line: nothing to record
                continue
            # strip so that a type sitting at the end of the line ('A\n')
            # is still recognised
            pos, typ = fields[0], fields[1].strip()
            if typ == 'X':
                continue
            pos = round(int(pos) / 1000 * 256)  # convert time(ms) to index
            R.add(pos)  # R contains all type R waves (except X)
            if typ == 'A':
                A.add(pos)
            elif typ == 'V':
                V.add(pos)

    return R, A, V

### Label Creator Helper

In [7]:
def create_normal_label(sample, R):
    '''Create the label for a sample that may contain R waves.

    The 512-point window is divided into 8 grids of 64 points. ``sample[-1]``
    is taken as the position of the window centre, so the window starts at
    ``sample[-1] - 256``.

    Returns a float32 array of shape (8, 2); each row is
    ``[1, relative position in the grid, in [0, 1)]`` when a position of R
    falls in that grid (the last one wins if there are several), and
    ``[0, -1]`` otherwise (-1 = no meaning).
    '''
    label = np.empty((8, 2), 'float32')
    label[:] = [0, -1]  # default: no R wave in the grid
    for offset in range(512):
        grid, within = divmod(offset, 64)
        if offset + sample[-1] - 256 in R:
            label[grid] = [1, within / 64]
    return label


def create_noise_label():
    '''Create the label for a true-noise or regular-noise sample.

    Returns an int array of shape (8, 2) in which every grid row is [0, -1],
    i.e. no R wave in any grid.
    '''
    return np.tile(np.array([0, -1], 'int'), (8, 1))

### Sampling period with R waves

In [9]:
def validPeriod(n, R):
    '''Return True if R contains a position within 255 points of n (either side).'''
    return any(n - dist in R or n + dist in R for dist in range(256))

def sampling_normal(path,R,record,train_size,val_size):
    '''Sample periods without added noise from one record.

    Each sample row has 514 values: 512 ECG points followed by the record id
    and the position of the window centre.

    Training samples: the ECG is split into ``train_size // 32`` equal parts
    and, starting from the middle of each part, a 512-wide window is moved
    32 times with a step of 8 points.
    Validation samples: window centres are drawn at random (seed 40); a centre
    is kept only if it has not been used before and validPeriod() accepts it.

    Returns x_train, x_val, y_train, y_val.
    '''
    ecg = np.fromfile(path + '/ecg.dat', '>i2')  # load ecg
    total = len(ecg)

    # initialize output
    x_train = np.empty((train_size, 514), 'float32')
    y_train = np.empty((train_size, 8, 2), 'float32')
    x_val = np.empty((val_size, 514), 'float32')
    y_val = np.empty((val_size, 8, 2), 'float32')

    selected = set()  # every window centre that has already been considered
    times = train_size // 32

    # Split the ECG into `times` equal parts and slide a window inside each
    # part to obtain 32 samples per part (train set)
    row = 0
    for anchor in range(total // times // 2, total, total // times):
        for center in range(anchor, anchor + 32 * 8, 8):  # 32 windows
            x = np.empty((514), 'float32')
            x[:512] = ecg[center - 256:center + 256]
            x[-2:] = [record, center]
            x_train[row] = x
            y_train[row] = create_normal_label(x, R)
            selected.add(center)
            row += 1

    # Randomly pick unused, valid window centres (val set)
    np.random.seed(40)
    row = 0
    while row < val_size:
        center = np.random.randint(256, total - 256)
        if center not in selected and validPeriod(center, R):
            x = np.empty((514), 'float32')
            x[:512] = ecg[center - 256:center + 256]
            x[-2:] = [record, center]
            x_val[row] = x
            y_val[row] = create_normal_label(x, R)
            row += 1
        selected.add(center)

    return x_train, x_val, y_train, y_val

In [10]:
def sampling_A(path,R,A,record,train_size,val_size):
    '''Sample periods around atrial premature beats (set A) of one record.

    For every chosen beat n, 512-point windows starting at n-384 ... n-257
    are taken: 16 windows (step 8) for the train set, 8 windows (step 16) for
    the val set. A beat is never used twice, so train and val beats are
    disjoint. Each sample row has 514 values: 512 ECG points followed by the
    record id and the position of the window centre.

    Beats whose windows would run outside the record are skipped.
    Raises ValueError when A does not hold enough usable beats to fill the
    requested sizes (instead of looping forever).

    Returns x_train, x_val, y_train, y_val.
    '''
    ecg = np.fromfile(path + '/ecg.dat', '>i2')  # load ecg

    # initialize output
    x_train, y_train = np.empty((train_size,514),'float32'), np.empty((train_size,8,2),'float32')
    x_val, y_val = np.empty((val_size,514),'float32'), np.empty((val_size,8,2),'float32')

    np.random.seed(40)
    A = np.fromiter(A, 'int')
    selected = set()  # beats already used (shared by train and val)

    def fill(x_out, y_out, size, step):
        count = 0
        while count < size:
            if len(selected) >= len(A):
                raise ValueError('not enough A beats to draw %d samples' % size)
            # np.random explicitly, so that the seed set above applies
            n = int(np.random.choice(A))
            if n in selected:
                continue
            selected.add(n)
            starts = range(n - 256 - 128, n - 256, step)
            if starts[0] < 0 or starts[-1] + 512 > len(ecg):
                continue  # window would leave the record
            for i in starts:
                if count >= size:  # never write past the output buffer
                    break
                x_out[count, :512] = ecg[i:i + 512]
                x_out[count, -2:] = [record, i + 256]
                y_out[count] = create_normal_label(x_out[count], R)
                count += 1

    fill(x_train, y_train, train_size, 8)   # 16 windows per beat
    fill(x_val, y_val, val_size, 16)        # 8 windows per beat

    return x_train, x_val, y_train, y_val

In [11]:
def sampling_V(path,R,V,record,train_size,val_size):
    '''Sample periods around ventricular premature beats (set V) of one record.

    For every chosen beat n, 512-point windows are taken: for the train set
    32 windows starting at n-384 ... n-129 (step 8); for the val set 8 windows
    starting at n-384 ... n-257 (step 16). A beat is never used twice, so
    train and val beats are disjoint. Each sample row has 514 values: 512 ECG
    points followed by the record id and the position of the window centre.

    Beats whose windows would run outside the record are skipped.
    Raises ValueError when V does not hold enough usable beats to fill the
    requested sizes (instead of looping forever).

    Returns x_train, x_val, y_train, y_val.
    '''
    ecg = np.fromfile(path + '/ecg.dat', '>i2')  # load ecg

    # initialize output
    x_train, y_train = np.empty((train_size,514),'float32'), np.empty((train_size,8,2),'float32')
    x_val, y_val = np.empty((val_size,514),'float32'), np.empty((val_size,8,2),'float32')

    np.random.seed(40)
    V = np.fromiter(V, 'int')
    selected = set()  # beats already used (shared by train and val)

    def fill(x_out, y_out, size, stop_offset, step):
        count = 0
        while count < size:
            if len(selected) >= len(V):
                raise ValueError('not enough V beats to draw %d samples' % size)
            # np.random explicitly, so that the seed set above applies
            n = int(np.random.choice(V))
            if n in selected:
                continue
            selected.add(n)
            starts = range(n - 256 - 128, n - stop_offset, step)
            if starts[0] < 0 or starts[-1] + 512 > len(ecg):
                continue  # window would leave the record
            for i in starts:
                if count >= size:  # never write past the output buffer
                    break
                x_out[count, :512] = ecg[i:i + 512]
                x_out[count, -2:] = [record, i + 256]
                y_out[count] = create_normal_label(x_out[count], R)
                count += 1

    fill(x_train, y_train, train_size, 128, 8)   # 32 windows per beat
    fill(x_val, y_val, val_size, 256, 16)        # 8 windows per beat

    return x_train, x_val, y_train, y_val

### Sampling True Noise

In [None]:
def sampling_noise(path,R,record):
    '''Sample true noise from one record.

    Whenever two consecutive annotated positions in R are more than 1064
    points apart, 512-point windows are taken between them (centres from
    previous+257 up to next-257, step 25), so no window contains an annotated
    beat. At most 150000 samples are returned.

    Returns (x_noise, y_noise, count):
        x_noise: float32 (count, 514) - 512 ECG points, record id, centre
        y_noise: int (count, 8, 2)    - every grid labelled [0, -1], the same
                                        label create_noise_label() produces
    or ([], [], 0) when the record has no such gap.
    '''
    max_samples = 150000
    peaks = sorted(R)
    ecg = np.fromfile(path + '/ecg.dat', '>i2')

    # Collect the window centres first; the exact number of noise samples is
    # unknown beforehand, and this avoids a huge scratch buffer that a single
    # long gap could overflow.
    centers = []
    for prev, loc in zip(peaks, peaks[1:]):
        if len(centers) >= max_samples:
            break
        if loc - prev > 1064:  # 1064 considered +-20 of the R peak
            centers.extend(range(prev + 257, loc - 257, 25))
    del centers[max_samples:]

    count = len(centers)
    if count == 0:  # when no noise in the record
        return [],[],0

    x_noise = np.empty((count, 514), 'float32')
    for row, n in enumerate(centers):
        x_noise[row, :512] = ecg[n - 256:n + 256]
        x_noise[row, -2:] = [record, n]

    y_noise = np.empty((count, 8, 2), 'int')
    y_noise[:, :, 0] = 0
    y_noise[:, :, 1] = -1

    return x_noise, y_noise, count

### Sampling Regular Noise

In [13]:
def get_sin(freq_low, freq_high, size=747):
    '''Create sine waves as regular noise.

    For every integer frequency in [freq_low, freq_high] and every phase
    shift phi in range(int(256/freq)), one 512-point sine wave is generated.
    ``size`` must equal the total number of waves (747 for frequencies
    1..10).

    Returns (data_x, data_y):
        data_x: float32 (size, 514) - 512 points followed by two zeros
        data_y: int (size, 8, 2)    - every grid labelled [0, -1], the same
                                      label create_noise_label() produces
    '''
    data_x = np.empty((size, 514), 'float32')  # initialize output
    data_y = np.empty((size, 8, 2), 'int')
    data_y[:, :, 0] = 0   # the label is the same for every sample,
    data_y[:, :, 1] = -1  # so fill it once instead of per iteration

    count = 0
    time_axis = np.arange(512)
    for freq in range(freq_low, freq_high + 1):  # get sine with multiple frequency and phase
        B = 2 * np.pi * freq / 256
        for phi in range(int(256 / freq)):
            print('Get sine '+str(count)+'/ '+str(size), end = '\r')
            data_x[count, :512] = np.sin(B * time_axis - B * phi)
            data_x[count, 512:] = 0  # no record id / position for synthetic noise
            count += 1
    return data_x, data_y
    
    
def get_square(freq_low, freq_high, size=747):
    '''Create square waves as regular noise.

    For every integer frequency in [freq_low, freq_high] a square wave
    alternating between +1 and -1 is drawn, and one 512-point crop is taken
    for every phase shift phi in range(int(256/freq)).

    Returns (data_x, data_y): data_x is float32 (size, 514) with two trailing
    zeros per row, data_y is int (size, 8, 2) filled by create_noise_label().
    '''
    data_x = np.empty((size, 514), 'float32')  # initialize output
    data_y = np.empty((size, 8, 2), 'int')
    row = 0

    for freq in range(freq_low, freq_high + 1):
        # generate a square wave of length 1024; only 512-point crops are used
        square = np.zeros(1024, 'float32')
        half = int(256 / (2 * freq))  # length of one half period
        sign = 1
        for k in range(4 * 2 * freq):
            square[k * half:(k + 1) * half] = sign
            sign = -sign
        # sliding window: crop 512 points from the wave according to the phase
        for phi in range(int(256 / freq)):
            print('Get square '+str(row)+'/ '+str(size), end = '\r')
            data_x[row, :512] = square[phi:phi + 512]
            data_x[row, 512:] = 0
            data_y[row] = create_noise_label()
            row += 1
    return data_x, data_y
    
    
def get_triangle(freq_low, freq_high,size=747):
    '''Create triangle waves as regular noise.

    For every integer frequency in [freq_low, freq_high] one period of a
    triangle wave is drawn, repeated 4*freq times, and one 512-point crop is
    taken for every phase shift phi in range(int(256/freq)).

    Returns (data_x, data_y): data_x is float32 (size, 514) with two trailing
    zeros per row, data_y is int (size, 8, 2) filled by create_noise_label().
    '''
    data_x = np.empty((size, 514), 'float32')  # initialize output
    data_y = np.empty((size, 8, 2), 'int')
    row = 0

    for freq in range(freq_low, freq_high + 1):
        # generate a triangle wave of length 1024; only 512-point crops are used
        tran = np.zeros(1024, 'float32')
        part = 256 / (4 * freq)   # length of a quarter period (float)
        slope = 1 / part
        period = 256 // freq

        # draw one period, quarter by quarter
        for i in range(period):
            quarter = (i // part) % 4
            if quarter == 0:      # rising from 0
                value = slope * i
            elif quarter == 3:    # rising from -1
                value = slope * i + (-1 - slope * (3 * part))
            else:                 # falling through the middle two quarters
                value = -slope * i + slope * (part * 2)
            tran[i] = value

        # repeat the first period so that 4*freq periods are available
        tran[:4 * freq * period] = np.tile(tran[:period], 4 * freq)

        # sliding window: crop 512 points from the triangle wave according to the phase
        for phi in range(int(256 / freq)):
            data_x[row, :512] = tran[phi:phi + 512]
            data_x[row, 512:] = 0
            data_y[row] = create_noise_label()
            row += 1
    return data_x, data_y

### Augmentation and TFRecord Storage

In [None]:
def noise_segment(coefficient, em_path = 'Record/em.dat', ma_path = 'Record/ma.dat'):
    '''Obtain noise segments from the em and ma noise records.

    Five 512-point segments are cut at evenly spaced positions from each
    record and stored interleaved (em1, ma1, ..., em5, ma5). Every segment i
    is then multiplied by each value in ``coefficient[i]``.

    Returns a float32 array of shape (20, 512) ordered
    [em1_1, em1_2, ma1_1, ma1_2, ..., em5_1, em5_2, ma5_1, ma5_2].
    '''
    em = np.fromfile(em_path, '>i2')  # load noise records
    ma = np.fromfile(ma_path, '>i2')

    orig_noise = np.empty((10, 512), 'float32')    # [em1,ma1,em2,ma2,...,em5,ma5]
    noise_sample = np.empty((20, 512), 'float32')  # scaled copies of orig_noise

    # cut the segments around the middle of each fifth of the em record
    # (the same positions are used for ma)
    first, stride = len(em) // 5 // 2, len(em) // 5
    for k, center in enumerate(range(first, len(em), stride)):
        window = slice(center - 256, center + 256)
        orig_noise[2 * k] = em[window]
        orig_noise[2 * k + 1] = ma[window]

    # apply every coefficient of a segment to it, in order
    scaled = (noise * e
              for i, noise in enumerate(orig_noise)
              for e in coefficient[i])
    for slot, segment in enumerate(scaled):
        noise_sample[slot] = segment

    return noise_sample

In [14]:
def putSample(writer, size, data_x, data_y):
    '''Do stationary wavelet decomposition and save samples to a tfrecord file.

    For each of the first ``size`` rows, the signal part of ``data_x`` (all
    but the last two values) is passed to decomp() with 'db2' and reshaped to
    4096 values; the last two values of the row are appended unchanged. The
    result and the flattened 16-value label are written to ``writer`` as one
    tf.train.Example with features 'ecg' and 'label'.
    '''
    for idx in range(size):
        row = data_x[idx]
        y = data_y[idx].reshape(16)

        # stationary wavelet decomposition using db wavelet
        coeffs = decomp(row[:-2], 'db2', (512, 8)).reshape(4096)
        x = np.empty((4098), 'float32')
        x[:4096] = coeffs
        x[-2:] = row[-2:]  # keep the two trailing metadata values

        # create example and write to tfrecord
        features = tf.train.Features(feature={
            'ecg': tf.train.Feature(float_list=tf.train.FloatList(value=x)),
            'label': tf.train.Feature(float_list=tf.train.FloatList(value=y)),
        })
        example = tf.train.Example(features=features)
        writer.write(example.SerializeToString())

In [15]:
def putSample_AddingNoise(writer, size, data_x, data_y, noise_sample):
    '''Add em/ma noise to each sample, decompose it and save it to a tfrecord file.

    For each of the first ``size`` rows and each segment in ``noise_sample``,
    the segment is added to the signal part of the row (all but the last two
    values), the sum is passed to decomp() with 'db2' and reshaped to 4096
    values, and the last two values of the row are appended unchanged. One
    tf.train.Example with features 'ecg' and 'label' is written per
    (row, noise segment) pair; the label is the same for all noise variants.
    '''
    for idx in range(size):
        row = data_x[idx]
        signal, meta = row[:-2], row[-2:]
        y = data_y[idx].reshape(16)
        for noise in noise_sample:
            # stationary wavelet decomposition using db wavelet
            coeffs = decomp(signal + noise, 'db2', (512, 8)).reshape(4096)
            x = np.empty((4098), 'float32')
            x[:4096] = coeffs
            x[-2:] = meta

            # create example and write to tfrecord
            features = tf.train.Features(feature={
                'ecg': tf.train.Feature(float_list=tf.train.FloatList(value=x)),
                'label': tf.train.Feature(float_list=tf.train.FloatList(value=y)),
            })
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())

### Other helper

In [16]:
def shuffle_and_split(data_x,data_y,seed = 10,size = 0.03):
    '''Shuffle x and y together, then split them into train set / val set.

    data_x and data_y are shuffled IN PLACE; the generator is re-seeded with
    ``seed`` before each shuffle so both arrays receive the same permutation.
    ``size`` is passed to train_test_split as ``test_size`` (fraction or
    number of samples that go to the val set).

    Returns x_train, x_val, y_train, y_val.
    '''
    np.random.seed(seed)
    np.random.shuffle(data_x)
    np.random.seed(seed)  # same seed -> same permutation for the labels
    np.random.shuffle(data_y)
    # honour the requested split size instead of a hard-coded 0.03
    x_train, x_val, y_train, y_val = train_test_split(data_x, data_y, test_size=size)
    return x_train, x_val, y_train, y_val


def findSubdir(path):
    '''Get the name of each record folder directly under ``path``.

    The 'lorenz_plots' folder is left out when it is present. Only the top
    level is listed; sub-folders of the record folders are not visited.

    Raises FileNotFoundError when ``path`` is not a directory.
    '''
    if not os.path.isdir(path):
        raise FileNotFoundError('record directory not found: ' + str(path))
    # the first item produced by os.walk describes `path` itself
    _, dirnames, _ = next(os.walk(path), (path, [], []))
    return [name for name in dirnames if name != 'lorenz_plots']

### Main

In [20]:
# Two scaling coefficients for each em / ma noise segment
em1, ma1 = [0.2, 0.5], [0.3, 0.7]
em2, ma2 = [0.1, 0.15], [0.5, 1.3]
em3, ma3 = [0.2, 0.5], [0.5, 1.4]
em4, ma4 = [0.1, 0.3], [1, 4]
em5, ma5 = [0.1, 0.2], [0.5, 1.3]

# rows interleaved as em1, ma1, em2, ma2, ..., em5, ma5
coefficient = np.array([
    em1, ma1,
    em2, ma2,
    em3, ma3,
    em4, ma4,
    em5, ma5,
])

In [24]:
def _store_split(train_writer, val_writer, x_train, x_val, y_train, y_val, noise_sample=None):
    '''Write one train/val split to the tfrecord writers.

    When ``noise_sample`` is given, the noise-augmented copies of the train
    samples are written first. Returns (len(x_train), len(x_val)).
    '''
    if noise_sample is not None:
        putSample_AddingNoise(train_writer, len(x_train), x_train, y_train, noise_sample)
    putSample(train_writer, len(x_train), x_train, y_train)
    putSample(val_writer, len(x_val), x_val, y_val)
    return len(x_train), len(x_val)


total_val = 0 # validate set/ train set/ noise counter
total_train = 0
total_noise = {}

lydhdb = findSubdir('db/lydhdb') # get all record folder name
lydhdb = sorted(lydhdb)

# The noise segments depend only on the coefficient table and the noise
# records, so they are read once instead of once per record.
noise_sample = noise_segment(coefficient)

# The writers are closed by their `with` blocks.
with tf.io.TFRecordWriter('/tmpdata/val.tfrecords') as val_writer:
    for i in range(0,9):
        with tf.io.TFRecordWriter('/tmpdata/train_'+str(i+1)+'.tfrecords') as train_writer:
            # 8 records per train file; the last file takes all remaining records
            records = lydhdb[i*8:(i+1)*8] if i!=8 else lydhdb[i*8:]
            for record in records:
                path = 'db/lydhdb/'+record

                R,A,V = annProcess(path)

                x_train, x_val, y_train, y_val = sampling_normal(path,R,record,576,560) # sampling
                print(record + ' normal sampling finished')

                n_train, n_val = _store_split(train_writer, val_writer,
                                              x_train, x_val, y_train, y_val, noise_sample)
                total_train += n_train
                total_val += n_val
                print('      normal store finished       ')

                if len(A)>=14: # sampling period with A
                    x_train, x_val, y_train, y_val = sampling_A(path,R,A,record,int(len(A)*0.075)*16,int(len(A)*0.148)*8)
                    n_train, n_val = _store_split(train_writer, val_writer,
                                                  x_train, x_val, y_train, y_val, noise_sample)
                    total_train += n_train
                    total_val += n_val

                if len(V)>=50: # sampling period with V
                    x_train, x_val, y_train, y_val = sampling_V(path,R,V,record,int(len(V)*0.02)*32,int(len(V)*0.078)*8)
                    n_train, n_val = _store_split(train_writer, val_writer,
                                                  x_train, x_val, y_train, y_val, noise_sample)
                    total_train += n_train
                    total_val += n_val
                print('      premature beat store finished       ')

                x_noise, y_noise,count_noise = sampling_noise(path,R,record) # sampling true noise sample
                total_noise[record] = count_noise
                print('      noise sampling finished')

                if len(x_noise)!=0:
                    x_train, x_val, y_train, y_val = shuffle_and_split(x_noise,y_noise,seed = 10,size = 0.03)

                    # true noise is stored as is, without em/ma augmentation
                    n_train, n_val = _store_split(train_writer, val_writer,
                                                  x_train, x_val, y_train, y_val)
                    total_train += n_train
                    total_val += n_val
                    print('     noise store finished       ')

            if i==8: # get regular noise (stored once, in the last train file)
                x_sine,y_sine = get_sin(1,10)
                x_square, y_square = get_square(1,10)
                x_tran, y_tran = get_triangle(1,10)
                putSample(train_writer, len(x_sine), x_sine,y_sine)
                putSample(train_writer, len(x_square), x_square, y_square)
                putSample(train_writer, len(x_tran), x_tran, y_tran)
                total_train+=len(x_sine)
                total_train+=len(x_square)
                total_train+=len(x_tran)
                print('sine, square, traingle finished')

20000 normal sampling finished
      normal store finished       
      premature beat store finished       
      noise sampling finished
20001 normal sampling finished
      normal store finished       
      premature beat store finished       
      noise sampling finished
     noise store finished       
20002 normal sampling finished
      normal store finished       
      premature beat store finished       
      noise sampling finished
20003 normal sampling finished
      normal store finished       
      premature beat store finished       
      noise sampling finished
     noise store finished       
20004 normal sampling finished
      normal store finished       
      premature beat store finished       
      noise sampling finished
     noise store finished       
20005 normal sampling finished
      normal store finished       
      premature beat store finished       
      noise sampling finished
20006 normal sampling finished
      normal store finished       
 

In [25]:
# Number of validation / training samples counted while storing; the
# noise-augmented copies written by putSample_AddingNoise are not counted.
total_val,total_train

(89021, 382048)

In [26]:
# total number of true-noise samples over all records
noise = sum(total_noise.values())
noise

307684