In [1]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
import wfdb
import copy as cp
import scipy.signal as signal
import pickle
from sklearn import preprocessing
from tqdm import tqdm
import os
import re
import pandas as pd
import csv

In [2]:
record_list = [] # Initialize the array that will hold the list of our records

records = 'mit-bih-dataframes/subject_list.csv' # Get our record list like we did in the initial extraction
with open(records) as rfile:# Load our records into the array
    for record in rfile:
        record = record[0:-1] # The -1 removes the newline ("\n") character from the string
        record_list.append(record)

In [14]:
subject_dataframes = [] # Initialize our dataframe array

for x in tqdm(record_list): # Extract our dataframes from the CSVs we saved previously
    subject_dataframes.append(pd.read_csv('mit-bih-dataframes/'+x+'.csv', index_col=0))
            # 'index_col=0' loads the index that is saved into the csv as the index rather than another column


100%|██████████| 23/23 [02:37<00:00,  6.83s/it]


[         Signal 1  Signal 2  R-Peak  Normal   AFIB  Other
0          -0.275    -0.210   False   False  False   True
1          -0.295    -0.200   False   False  False   True
2          -0.310    -0.220   False   False  False   True
3          -0.315    -0.225   False   False  False   True
4          -0.300    -0.215   False   False  False   True
...           ...       ...     ...     ...    ...    ...
9205755    -0.765    -1.810   False    True  False  False
9205756    -0.680    -1.560   False    True  False  False
9205757    -0.680    -1.345   False    True  False  False
9205758    -0.670    -1.230   False    True  False  False
9205759    -0.665    -1.165   False    True  False  False

[9205760 rows x 6 columns],          Signal 1  Signal 2  R-Peak  Normal   AFIB  Other
0          -0.415    -0.395   False   False  False   True
1          -0.415    -0.260   False   False  False   True
2          -0.430    -0.185   False   False  False   True
3          -0.445    -0.135   False   Fals

In [4]:

# Now we get the indexes of the R-Peaks from our dataframes
qrs = [] # Initialize the array that will hold our list of R-Peaks for each subject
qrs_colors= []
for subject in tqdm(subject_dataframes): # Load each subject
    qrs.append([idx for idx, x in enumerate(subject['R-Peak']) if x]) # Then we use list comprehension to get our r peaks for each subject
                                                                # Recall, the indices that the R-Peak column is true is where there is an r-peak
                                                                # So, we iterate through and identify which indices that is for each subject
for idx, subj in enumerate(tqdm(qrs)):    
    df_color = []
    for x in subj: 
        if subject_dataframes[idx].loc[x, 'Normal']: 
            df_color.append('Normal')
        elif subject_dataframes[idx].loc[x, 'AFIB']:
            df_color.append('AFIB')
        else: 
            df_color.append('Other')
    qrs_colors.append(df_color)

100%|██████████| 23/23 [00:54<00:00,  2.36s/it]
100%|██████████| 23/23 [02:07<00:00,  5.53s/it]


In [5]:
print(len(qrs_colors[0]),len(qrs[0]))

44005 44005


In [12]:
rr_ints = [] # Initialize the array that will hold all of our subjects' RR-Intervals
rhythms = []
for idx, subj in enumerate(tqdm(qrs)): # Iterate through our subjects data
    rrlabels = qrs_colors[idx]
    s_labels = []
    rr1 = [] # Initialize a temporary array that will store a single subject's RR-Intervals
    for idxs, r in enumerate(subj): # Iterate through the subject's R-Peaks
        if idxs == 0: # If it is the first peak we have no interval so go on to the next R-Peak
            next
        else:
            rr1.append(r - subj[idxs-1]) # Find the interval by taking the difference of the location of one R-Peak with the Location of the R-Peak before it
            s_labels.append(rrlabels[idxs])
    rr_ints.append(rr1) # Add it to our master array
    rhythms.append(s_labels)

100%|██████████| 23/23 [00:00<00:00, 33.08it/s]


In [7]:
outlier_list=[]
rrs=[]
for idx, subj in tqdm(enumerate(rr_ints)):
    outlier = [[],[]] # I want to store the index and the outlier
    for idx2, rr in enumerate(subj):
        if rr > 500:
            outlier[0].append(rr) # add the rr int to our list
            outlier[1].append(idx2) # Add its index to our list
    subj = np.delete(subj, outlier[1]) # Remove all found outliers from our subject 
    rhythms[idx] = np.delete(rhythms[idx], outlier[1])
    rrs.append(subj) # Add it to our new rr interval list
    outlier_list.append(outlier)

23it [00:02, 11.42it/s]


In [8]:
 # Setup subset dictionary
subset_list = {}
subset_rhythm_labels = {}
for x in record_list:
    subset_list[x] = []
    subset_rhythm_labels[x] = []

In [9]:
subset_len_sec = 25 # Set the time we are going to subset by
subset_len_samp = subset_len_sec*250 # Get that timme in samplse

for idx, subj in tqdm(enumerate(rrs)):
    samp = 0
    while samp < len(subj):
        subs_len = 0
        subs = []
        rhythm_list = []
        while subs_len < subset_len_samp and samp<len(subj):
            rr = subj[samp]
            subs.append(rr)
            rhythm_list.append(rhythms[idx][samp])
            subs_len+=rr
            samp+=1
        majority_rhythm = np.unique(rhythm_list)[np.argmax(np.unique(rhythm_list, return_counts=True)[1])]
        subset_list[record_list[idx]].append(subs)
        subset_rhythm_labels[record_list[idx]].append(majority_rhythm)        

23it [00:10,  2.15it/s]


In [10]:
subset_record_list = []
reload_flag=True
for idx, x in enumerate(tqdm(record_list)): 
    subset_record_list = []
    for num, subset in enumerate(subset_list[x]):
        if not os.path.exists('mit_bih_subset/'+x+ '-'+str(num)+'.csv') or reload_flag:
            np.savetxt('mit_bih_subset/'+x+ '-'+str(num)+'.csv', subset, delimiter=",",  fmt='%s') 
            
            subset_record_list.append(x+ '-'+str(num)+'.csv')
    if not os.path.exists('mit_bih_subset/'+x +'.csv') or reload_flag:
        pd.DataFrame({'subsetID': subset_record_list, 'rhythmLabel': subset_rhythm_labels[x]}).to_csv('mit_bih_subset/'+x+'_subset_dataframe.csv')
            # We'll load the complete list of subjects as well so that we can easily recreate the file names

100%|██████████| 23/23 [00:48<00:00,  2.10s/it]
