In [29]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
import wfdb
import copy as cp
import scipy.signal as signal
from sklearn import preprocessing
from tqdm import tqdm
import os
import re
import pandas as pd
import pickle
import csv

# Extract information from MIT-BIH raw Files

## Get Record Names from the RECORDS File

In [30]:
rlist = []  # Names of every record listed in the RECORDS file
records = 'afib-svsm/mit-bih-raw/RECORDS'  # Path to the RECORDS index file inside mit-bih-raw/
with open(records) as rfile:  # 'with' closes the file automatically when the block ends
    for record in rfile:  # Each line of the file holds one record name
        # strip() removes the line ending ('\n' or '\r\n') and stray whitespace.
        # Slicing off the last character instead would corrupt the final name
        # whenever the file does not end with a newline.
        record = record.strip()
        if record:  # Skip blank lines so no empty record name reaches wfdb later
            rlist.append(record)

## Extract all info using the WaveForm DataBase (WFDB)

In [31]:

###### Step 1: Initialize all Arrays
# Every list below (except bad_list) gets exactly one entry per successfully
# extracted subject, so position i in each list refers to good_list[i].
samples = []    # (signal data, header dict) tuple returned by wfdb.rdsamp for each subject
good_list = []  # names of the subjects we successfully extracted
bad_list = []   # names of the subjects we failed to extract
qrs = []        # indices of R-Peaks for each subject
atr_label = []  # labels of each rhythm annotation for each subject
atr_locs = []   # sample locations corresponding to the rhythm annotation labels


###### Step 2: Extract Information
# NOTE(review): the RECORDS file is read from 'afib-svsm/mit-bih-raw/' while the
# records themselves are read from 'mit-bih-raw/' — confirm both paths are intended.
for x in tqdm(rlist):  # iterate through the record names found above
    try:
        # Do ALL of the reading first and only store the results once every read
        # has succeeded. Appending as we go would leave `samples`/`qrs` with an
        # extra entry when a later read fails, which would shift every following
        # subject out of line with good_list.

        # rdsamp reads the signal & header data and returns a 2-value tuple:
        #   samp[0] - the raw ECG signal data; each row is one sample
        #   samp[1] - the header data, e.g. 'fs' (sampling frequency) and
        #             'n_sig' (number of signals)
        samp = wfdb.rdsamp('mit-bih-raw/'+x)

        # The 'qrs' annotation file holds the R-Peak locations
        qrs_tmp = wfdb.rdann('mit-bih-raw/'+x, extension='qrs')
        qrs_locs = np.array(qrs_tmp.sample, dtype='int')  # keep just the R-Peak sample indices

        # The 'atr' annotation file holds the rhythm type(s) over the whole signal
        atr = wfdb.rdann('mit-bih-raw/'+x, extension='atr')
        # Append the signal length so the last rhythm also has an end location
        rhythm_locs = np.append(atr.sample, len(samp[0]))
    except Exception as exep:
        print(exep)  # Alert the user of the exception
        bad_list.append(x)  # remember which record failed
        continue

    samples.append(samp)
    qrs.append(qrs_locs)
    atr_label.append(atr.aux_note)  # aux_note stores the rhythm type - main two are '(N' for normal and '(AFIB' for AFIB
    atr_locs.append(rhythm_locs)
    good_list.append(x)  # everything was extracted, so the record is usable
        

  0%|          | 0/25 [00:00<?, ?it/s]

sampto must be greater than sampfrom
sampto must be greater than sampfrom


100%|██████████| 25/25 [01:02<00:00,  2.48s/it]


In [32]:
# `samp` is a leftover from the extraction loop above: it still holds the
# (signal data, header dict) tuple from the last wfdb.rdsamp call that succeeded.
print(samp)

# Now, in this code block use wfdb to extract the sample info, QRS info, and atr info
# Print some stuff out and see if you can figure out how to manipulate it

(array([[-0.275, -0.37 ],
       [-0.245, -0.45 ],
       [-0.305, -0.43 ],
       ...,
       [-0.24 , -0.52 ],
       [-0.3  , -0.52 ],
       [-0.27 , -0.565]]), {'fs': 250, 'sig_len': 9205760, 'n_sig': 2, 'base_date': None, 'base_time': None, 'units': ['mV', 'mV'], 'sig_name': ['ECG1', 'ECG2'], 'comments': []})


### Extracting Rhythm Data

Next, I am going to reformat the rhythm annotations into a different format that is more understandable.

The current format of an individual's rhythm annotations is as follows:
- `labels = ['(N', '(AFIB', '(N', '(O', ...]`
- `locs   = [  10,    1000, 1234, 1983, ...]`

Each label's corresponding location is the sample at which that rhythm begins.

The code below converts this into the following format, using Python's dictionary data type.

```python
rhythm_annotations = {
    '(N':    [ [10,   999],
               [1234, 1982]
             ], 
    '(AFIB': [ [1000, 1233]
             ],
    '(O':    [ [1983, ...]
             ]    
}
```


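This data is now formatted as a 2-dimensional array in which each pair of values represents a range of samples in which a specific rhythm is present. Note that in the dictionaries built by the code below, the second value of each pair is the sample at which the next rhythm begins (for the last pair, the length of the signal), i.e. one more than the last sample shown in this example.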

The data can be accessed like so: 
```python
  rhythm_annotations['(N']         = [ [10,   999],
                                       [1234, 1982]
                                     ]
    
  rhythm_annotations['(N'][0]      = [10,   999]

  rhythm_annotations['(N'][0][0]   = 10

```

In [33]:
atr_dics = []  # One dictionary per subject, mapping rhythm label -> list of [start, end] pairs

for subject_idx, labels in enumerate(atr_label):
    bounds = atr_locs[subject_idx]  # Annotation locations for this subject
    rhythm_ranges = {}  # Filled in label by label for the current subject
    for pos, label in enumerate(labels):
        # A rhythm spans from its own annotation location to the next location in the list;
        # setdefault creates the list the first time a label is seen.
        rhythm_ranges.setdefault(label, []).append([bounds[pos], bounds[pos + 1]])
    atr_dics.append(rhythm_ranges)

## Other Ways To Format

Here we are going to format each subject's data into a DataFrame using pandas. Many statistical tools are built to be used with DataFrames.

Also, it allows for a one-stop shop for our data where we can save all data for each subject in one file instead of having multiple files per subject.

Our data frame is going to be formatted like so:

|     | Signal 1 | Signal 2 | R-Peak | Normal | AFIB  | Other |
|-----|----------|----------|--------|--------|-------|-------|
| ... | ...      | ...      | ...    | ...    | ...   | ...   |
| 234 | 0.123    | -0.312   | True   | True   | False | False |
| ... | ...      | ...      | ...    | ...    | ...   | ...   |


- Column 1: Index
    - the index is the value of each row and represents the sample value
- Column 2: Signal 1
    - a float (or decimal) value which represents the value of the first signal in the reading at the given sample value
- Column 3: Signal 2
    - a float (or decimal) value which represents the value of the second signal in the reading at the given sample value
- Column 4: R-Peak
    - a boolean value (```True``` or ```False```) which represents if there is a R-Peak at the given sample value
- Column 5: Normal
    - a boolean value (```True``` or ```False```) which represents if the sample is in a pattern of Normal beats
- Column 6: AFIB
    - a boolean value (```True``` or ```False```) which represents if the sample is in a pattern of AFIB beats
- Column 7: Other
    - a boolean value (```True``` or ```False```) which represents if the sample is in a pattern of other beats

In [34]:
def _ranges_to_mask(ranges, length):
    """Return a boolean array of `length` that is True inside every [start, stop) pair of `ranges`."""
    mask = np.zeros(length, dtype=bool)
    for start, stop in ranges:
        # Clamp at 0 so a negative bound is ignored instead of wrapping around
        # the way a negative slice index normally would.
        mask[max(start, 0):max(stop, 0)] = True
    return mask


subject_dataframes = []  # Will hold one DataFrame per subject, in the same order as good_list

for s in tqdm(range(len(good_list))):  # Iterate through all of the subjects that we have complete data of
    signals = np.asarray(samples[s][0])  # Signal data: one row per sample, one column per signal
    n_samples = len(signals)

    # Mark each rhythm by filling slices of a boolean array. This gives the same
    # columns as listing every sample index and calling isin(), but without
    # building Python lists that are millions of elements long.
    normal_mask = _ranges_to_mask(atr_dics[s].get('(N', []), n_samples)
    afib_mask = _ranges_to_mask(atr_dics[s].get('(AFIB', []), n_samples)

    subj = pd.DataFrame({  # Column order matches the table outlined above
        'Signal 1': signals[:, 0],
        'Signal 2': signals[:, 1],
    })
    # isin() on the DataFrame index is True for every row whose index appears in the R-Peak list
    subj['R-Peak'] = subj.index.isin(qrs[s])
    subj['Normal'] = normal_mask
    subj['AFIB'] = afib_mask
    # Because we are classifying AFIB specifically, 'Other' is any sample in neither a Normal nor an AFIB range
    subj['Other'] = ~(normal_mask | afib_mask)

    subject_dataframes.append(subj)  # Add the finished DataFrame to the list of all subjects

100%|██████████| 23/23 [09:40<00:00, 25.25s/it]


In [35]:
subject_dataframes[1]

Unnamed: 0,Signal 1,Signal 2,R-Peak,Normal,AFIB,Other
0,-0.415,-0.395,False,False,False,True
1,-0.415,-0.260,False,False,False,True
2,-0.430,-0.185,False,False,False,True
3,-0.445,-0.135,False,False,False,True
4,-0.460,-0.080,False,False,False,True
...,...,...,...,...,...,...
9205755,0.220,-0.130,False,True,False,False
9205756,0.200,-0.160,False,True,False,False
9205757,0.175,-0.140,False,True,False,False
9205758,0.080,-0.160,False,True,False,False


## Saving Extracted Information 

Saving the information that we have used processing power to extract is important because:
1. It makes our data easier to access in the future
    - Easy access in new files
2. It creates static information for us to use and reference
3. By saving it in a CSV we make it more accessible for others to use
    - The data can now be used in an excel sheet and more
    
We will generally always extract to a CSV file unless the data is too complex. If that is the case then we have another option. 

'pickle' is a Python package which will save much more complex Data types for future use. 

For example - if you want to save a statistical model, pickle will be able to do that more effectively than the CSV format.

In [36]:
# Controls whether the save cells below overwrite existing output.
# Each save is guarded by `not os.path.exists(path) or reload_flag`, so:
#   True  -> every file is written again, even if the existence check finds one
#   False -> a file is only written when its existence check finds nothing
reload_flag = False

In [None]:
dataframe_dir = 'afib-svsm/mit-bih-dataframes/'  # Folder the per-subject CSV files are written to

for idx, x in enumerate(tqdm(good_list)):
    # Build the path once and use it for BOTH the existence check and the write.
    # Checking one folder while writing to another means the "already saved"
    # guard never sees the file that was actually produced.
    csv_path = dataframe_dir + x + '.csv'
    if not os.path.exists(csv_path) or reload_flag:
        subject_dataframes[idx].to_csv(csv_path)  # Pandas DataFrames have a built-in to_csv() function which saves to the passed path

# We save the complete list of subjects as well so that the file names can easily be recreated.
# NOTE(review): this list goes to 'mit-bih-dataframes/' while the dataframes above go to
# 'afib-svsm/mit-bih-dataframes/' — confirm whether these are meant to be the same folder.
np.savetxt("mit-bih-dataframes/subject_list.csv", good_list, delimiter=",",  fmt='%s')

 83%|████████▎ | 19/23 [27:56<09:23, 140.86s/it]

In [None]:
def _should_write(path):
    """Return True when `path` does not exist yet or a full re-save was requested via reload_flag."""
    return not os.path.exists(path) or reload_flag


# Save the subject names in the folder so the per-subject file names can be rebuilt later
np.savetxt("mit-bih-extracted/subject_list.csv", good_list, delimiter=",",  fmt='%s')

for subject_idx, name in enumerate(tqdm(good_list)):  # One set of four files per subject
    stem = "mit-bih-extracted/" + name  # Common prefix of this subject's files

    # Signal data -> CSV (np.savetxt with ',' as the delimiter acts like a to_csv())
    signals_path = stem + "_signals.csv"
    if _should_write(signals_path):
        np.savetxt(signals_path, np.array(samples[subject_idx][0]), delimiter=",")

    # R-Peak locations -> CSV
    rpeaks_path = stem + "_rpeaks.csv"
    if _should_write(rpeaks_path):
        np.savetxt(rpeaks_path, np.array(qrs[subject_idx]), delimiter=",")

    # Header data -> pickle (it is a dictionary, which np.savetxt cannot write)
    headers_path = stem + "_headers.pkl"
    if _should_write(headers_path):
        with open(headers_path, 'wb') as header_file:
            pickle.dump(samples[subject_idx][1], header_file)

    # Rhythm-range dictionary -> pickle
    rhythms_path = stem + "_rhythms.pkl"
    if _should_write(rhythms_path):
        with open(rhythms_path, 'wb') as rhythm_file:
            pickle.dump(atr_dics[subject_idx], rhythm_file)
