In [1]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
import wfdb
import copy as cp
import scipy.signal as signal
import pickle
from sklearn import preprocessing
from tqdm import tqdm
import os
import re
import pandas as pd
import csv

In [2]:
record_list = [] # Initialize the array that will hold the list of our records

records = 'mit-bih-dataframes/subject_list.csv' # Get our record list like we did in the initial extraction
with open(records) as rfile:# Load our records into the array
    for record in rfile:
        record = record[0:-1] # The -1 removes the newline ("\n") character from the string
        record_list.append(record)

In [50]:
subject_dataframes = {}

for x in tqdm(record_list):
    subject_dataframes[x] = pd.read_csv('mit_bih_subset/' + x + '_subset_dataframe.csv')

100%|██████████| 23/23 [00:00<00:00, 24.45it/s]


In [63]:
print(subject_dataframes['04015'])

      Unnamed: 0        subsetID rhythmLabel
0              0     04015-0.csv      Normal
1              1     04015-1.csv      Normal
2              2     04015-2.csv      Normal
3              3     04015-3.csv      Normal
4              4     04015-4.csv      Normal
...          ...             ...         ...
1413        1413  04015-1413.csv      Normal
1414        1414  04015-1414.csv      Normal
1415        1415  04015-1415.csv      Normal
1416        1416  04015-1416.csv      Normal
1417        1417  04015-1417.csv      Normal

[1418 rows x 3 columns]


In [3]:
rr_ints = [] # Initialize the array that will hold all of our subjects' RR-Intervals

for idx, subj in enumerate(record_list): # Iterate through our subject ids
    rr_ints.append(np.genfromtxt('mit-bih-rrintervals/'+str(subj)+'_rr-intervals.csv',delimiter=',')) # Add to master array
       #NOTE = change 'mit-bih-rr-intervals/' to the appropriate location


In [4]:
outlier_list=[]
rrs=[]
for idx, subj in tqdm(enumerate(rr_ints)):
    outlier = [[],[]] # I want to store the index and the outlier
    for idx2, rr in enumerate(subj):
        if rr > 500:
            outlier[0].append(rr) # add the rr int to our list
            outlier[1].append(idx2) # Add its index to our list
    subj = np.delete(subj, outlier[1]) # Remove all found outliers from our subject 
    rrs.append(subj) # Add it to our new rr interval list
    outlier_list.append(outlier)

23it [00:00, 37.89it/s]


In [5]:
subset_list = {}
for x in record_list:
    subset_list[x] = []
    


{'04015': [], '04043': [], '04048': [], '04126': [], '04746': [], '04908': [], '04936': [], '05091': [], '05121': [], '05261': [], '06426': [], '06453': [], '06995': [], '07162': [], '07859': [], '07879': [], '07910': [], '08215': [], '08219': [], '08378': [], '08405': [], '08434': [], '08455': []}


In [6]:
subset_len_sec = 25 # Set the time we are going to subset by
subset_len_samp = subset_len_sec*250 # Get that timme in samplse

for idx, subj in tqdm(enumerate(rrs)):
    samp = 0
    while samp < len(subj):
        subs_len = 0
        subs = []
        while subs_len < subset_len_samp and samp<len(subj):
            rr = subj[samp]
            subs.append(rr)
            subs_len+=rr
            samp+=1
        subset_list[record_list[idx]].append(subs)

23it [00:01, 18.05it/s]


In [7]:
#Classifying transitions between RR-Ints as short, medium, or long using running mean
int_types = []

for idx, subject in tqdm(enumerate(record_list)):
    subject_types = []
    for i, subset in enumerate(subset_list[subject]):
        sum1 = 0
        subset_types = []
        for j, rr_ints in enumerate(subset):
            sum1=sum1+rr_ints
            if j==0:
                rrmean=sum1
            runningmean = 0.75*rrmean + 0.25*rr_ints
            rrmean= sum1/(j+1)
            if rr_ints < 0.85 * runningmean:
                subset_types.append("Short")
            elif rr_ints > 1.15 * runningmean:
                subset_types.append("Long")
            else:
                subset_types.append("Regular")
        subject_types.append(subset_types)
    int_types.append(subject_types)



23it [00:01, 12.31it/s]


In [8]:
#Classifying transitions between RR-Ints as short, medium, or long using running mean
transitions_list=[]

for idx, subject in enumerate(int_types): 
    subject_transitions = []
    for j, subset in enumerate(subject):
        StoS, StoR, StoL, RtoS, RtoR, RtoL, LtoS, LtoR, LtoL = [0]*9
        count = 0
        for i, rr_int in enumerate(subset):
            if i<len(subset)-1:
                count += 1
                if subset[i]=="Short" and subset[i+1]=="Short":
                    StoS += 1
                elif subset[i]=="Short" and subset[i+1]=="Regular":
                    StoR += 1
                elif subset[i]=="Short" and subset[i+1]=="Long":
                    StoL += 1
                elif subset[i]=="Regular" and subset[i+1]=="Short":
                    RtoS += 1
                elif subset[i]=="Regular" and subset[i+1]=="Regular":
                    RtoR += 1
                elif subset[i]=="Regular" and subset[i+1]=="Long":
                    RtoL += 1
                elif subset[i]=="Long" and subset[i+1]=="Short":
                    LtoS += 1
                elif subset[i]=="Long" and subset[i+1]=="Regular":
                    LtoR += 1
                elif subset[i]=="Long" and subset[i+1]=="Long":
                    LtoL += 1
        subject_transitions.append([StoS/count, StoR/count, StoL/count, RtoS/count, RtoR/count, RtoL/count, LtoS/count, LtoR/count, LtoL/count])
    transitions_list.append(subject_transitions)

In [34]:
print(transitions_list[0][0])

[0.2, 0.05714285714285714, 0.02857142857142857, 0.05714285714285714, 0.22857142857142856, 0.14285714285714285, 0.05714285714285714, 0.11428571428571428, 0.11428571428571428]


In [16]:
#Calculating root mean square of the successive differences
rms_list = []
for idx, subject in enumerate(record_list):
    rmssubject_list = []
    for i, subset  in enumerate(subset_list[subject]):
        sum_of_squares=0
        for j, rr_int in enumerate(subset):
              if j>0:
                square_difference = (subset[j] - subset[j-1])**2
                sum_of_squares = sum_of_squares + square_difference
        mean_square = sum_of_squares/(len(subject)-1)
        root_mean_square = np.sqrt(mean_square)
        rmssubject_list.append(root_mean_square)
    rms_list.append(rmssubject_list)
print(rms_list[0])

[156.276997667603, 197.78397306151982, 151.63855050744846, 157.6118650355994, 226.85568099564975, 220.93324783744072, 222.38142908075756, 105.17485440921703, 92.4040042422405, 190.58397099441495, 214.81154531356083, 222.9585163208618, 244.438642607915, 209.65805493708083, 106.94157283301944, 86.64006001844643, 132.49433950173116, 93.98138113477584, 143.71673528159482, 193.30287116336373, 150.68178390236824, 132.53584420827445, 158.25217218098462, 114.33940703012239, 108.04975705664498, 96.80134296589071, 98.25222643787774, 276.4719696461108, 118.91909014115437, 88.21281086100817, 230.31880079576655, 136.68028387444915, 49.79708826829135, 18.887826767524103, 21.11279233071741, 93.29389047520743, 98.63189139421387, 139.7596150538488, 81.38181615078395, 28.442925306655784, 22.02839077191069, 22.929238975596203, 90.6545641432355, 70.25667228100119, 20.512191496766015, 34.528973341239094, 33.64149223800871, 24.14021540914662, 20.062402647738878, 22.02839077191069, 58.670691149840735, 26.641

In [28]:
#Standard deviation
Std_list = []
for i, subject in enumerate(record_list):
    Standard_devs = []
    for j, subset in enumerate(subset_list[subject]):
        standard_dev = np.std(subset)
        Standard_devs.append(standard_dev)
    Std_list.append(Standard_devs)

In [12]:
#Coefficient of variance(std/mean)
CoefVar_list = []
for i, subject in enumerate(record_list):
    CoefVarSubjects = []
    for j, subset in enumerate(subset_list[subject]):
        standard_dev = np.std(subset)
        mean = np.mean(subset)
        CoefVarSubjects.append(standard_dev/mean)
    CoefVar_list.append(CoefVarSubjects)

In [61]:
#Range
Range_list = []
for i, subject in enumerate(record_list):
    Ranges = []
    for j, subset in enumerate(subset_list[subject]):
        range = max(subset) - min(subset)
        Ranges.append(range)
    Range_list.append(Ranges)
    



In [None]:
#Interquartile Range
IQR_list = []
for i, subject in enumerate(record_list):
    IQRs = []
    for j, subset in enumerate(subset_list[subject]):
        intqrange = np.percentile(subset, 75) - np.percentile(subset, 25)
        IQRs.append(intqrange)
    IQR_list.append(IQRs)

In [62]:
#Median Absolute Deviation
MAD_list = []
for i, subject in enumerate(record_list):
    MADs = []
    for j, subset in enumerate(subset_list[subject]):
        for k, rr_int in enumerate(subset):
          medianAbsDev = np.median(np.abs(subset[k]-np.median(subset)))
        MADs.append(medianAbsDev)
    MAD_list.append(MADs)


In [52]:
dfdic = {}
for idx, subj in tqdm(enumerate(record_list)):
    dfdic[subj] = pd.DataFrame(
        data = np.transpose(np.array([
                                    [x[0] for x in transitions_list[idx]],
                                    [x[1] for x in transitions_list[idx]],
                                    [x[2] for x in transitions_list[idx]],
                                    [x[3] for x in transitions_list[idx]],
                                    [x[4] for x in transitions_list[idx]],
                                    [x[5] for x in transitions_list[idx]],
                                    [x[6] for x in transitions_list[idx]],
                                    [x[7] for x in transitions_list[idx]],
                                    [x[8] for x in transitions_list[idx]],
                                    rms_list[idx],
                                    Std_list[idx],
                                    CoefVar_list[idx],
                                    Range_list[idx],
                                    IQR_list[idx],
                                    MAD_list[idx]
                                    ])
                           ),
        columns = ['StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL', 'RMS', 'STD', 'CoefVar', 'Range', 'IQR', 'MAD']
    )


23it [00:00, 58.55it/s]


In [57]:
for idx, x in enumerate(record_list):
    dfdic[x] = pd.concat([subject_dataframes[x], dfdic[x]], axis=1)

In [58]:
dfdic['04015']

Unnamed: 0.1,Unnamed: 0,subsetID,rhythmLabel,StoS,StoR,StoL,RtoS,RtoR,RtoL,LtoS,LtoR,LtoL,RMS,STD,CoefVar,Range,IQR,MAD
0,0,04015-0.csv,Normal,0.200000,0.057143,0.028571,0.057143,0.228571,0.142857,0.057143,0.114286,0.114286,156.276998,43.660906,0.248152,155.0,78.00,32.5
1,1,04015-1.csv,Normal,0.000000,0.052632,0.000000,0.052632,0.184211,0.236842,0.000000,0.210526,0.263158,197.783973,57.233572,0.355262,213.0,93.50,55.0
2,2,04015-2.csv,Normal,0.333333,0.083333,0.055556,0.027778,0.305556,0.083333,0.111111,0.000000,0.000000,151.638551,43.820505,0.253179,138.0,72.00,56.0
3,3,04015-3.csv,Normal,0.300000,0.175000,0.000000,0.125000,0.025000,0.125000,0.075000,0.050000,0.125000,157.611865,48.827831,0.318730,167.0,79.00,59.0
4,4,04015-4.csv,Normal,0.090909,0.151515,0.090909,0.030303,0.060606,0.212121,0.212121,0.060606,0.090909,226.855681,53.260256,0.289227,237.0,95.75,66.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1413,1413,04015-1413.csv,Normal,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,20.688161,4.756725,0.023406,22.0,4.00,1.0
1414,1414,04015-1414.csv,Normal,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,21.788759,4.269535,0.021069,19.0,3.00,1.0
1415,1415,04015-1415.csv,Normal,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,26.594172,6.170151,0.030419,24.0,6.00,2.0
1416,1416,04015-1416.csv,Normal,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,23.119256,5.017348,0.024803,22.0,3.00,9.0


In [60]:
reload_flag = False
for idx, x in enumerate(record_list):
    if not os.path.exists('mit-bih-features/'+x+ '.csv') or reload_flag:
        dfdic[x].to_csv('mit-bih-features/'+x+'.csv') 
