### Retrieving Normal sinus rythms, atrial fibrillation rythms and atrial flutter rythms
### From Physionet Databases AFDB, LTAFDB, MITDB and NSRDB

In [1]:
# pip install tqdm, wfdb
import wfdb
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm, trange

# Functions

In [2]:
def get_data(data):
    
    data_path = "data/" + str(data) + "/*.dat" ### "data/AFDB/*.dat"
    records = []
    properties = []
    annot = []
    AnnSymb = []
    AnnSamp = []
    AnnRhythm = []
    Rpeak_Samp = []
    Rpeak_Symb = []
    
    
    ################################ AFDB OR LTAFDB ################################
    if str(data) == "AFDB" or data == "LTAFDB": ### str(data) == ("AFDB") or ("LTAFDB")
        for f in tqdm(glob.glob(data_path)): ##### change the path to own directory.       
            sig, fields = wfdb.rdsamp(f[:-4], channels=[1])  #### In this function, pass 
                                                             #### "channels=[0]" or "channels=[1]" to select channel 1 or 2.
            ann = wfdb.rdann(f[:-4], 'atr')
            QRS = wfdb.rdann(f[:-4], 'qrs')
            Symb = pd.Series(ann.symbol)
            Samp = pd.Series(ann.sample)
            QRS_Symb = pd.Series(QRS.symbol)
            QRS_Samp = pd.Series(QRS.sample)
            Rhythm = pd.Series(ann.aux_note)
            records.append(sig)
            properties.append(fields)
            annot.append(ann)
            AnnSymb.append(Symb)
            AnnSamp.append(Samp)
            Rpeak_Symb.append(QRS_Symb)
            Rpeak_Samp.append(QRS_Samp)
            AnnRhythm.append(Rhythm)
        
        
    elif str(data) == "MITDB": ### str(data) == "MITDB"
        for f in tqdm(glob.glob(data_path)):             #### change the path to own directory.       
            sig, fields = wfdb.rdsamp(f[:-4], channels=[1])  #### In this function, pass "channels=[0]" or "channels=[1]" to select channel 1 or 2.

            ann = wfdb.rdann(f[:-4], 'atr')
            QRS = wfdb.rdann(f[:-4], 'xws')
            Symb = pd.Series(ann.symbol)
            Samp = pd.Series(ann.sample)

            QRS_Symb = pd.Series(QRS.symbol)
            QRS_Samp = pd.Series(QRS.sample)
            Rhythm = pd.Series(ann.aux_note)
            records.append(sig)
            properties.append(fields)
            annot.append(ann)
            AnnSymb.append(Symb)
            AnnSamp.append(Samp)
            Rpeak_Symb.append(QRS_Symb)
            Rpeak_Samp.append(QRS_Samp)
            AnnRhythm.append(Rhythm)
         
        ################################ NSRDB ################################
    elif str(data) == "NSRDB": ### str(data) == "NSRDB"
        for f in tqdm(glob.glob(data_path)):
            sig, fields = wfdb.rdsamp(f[:-4], channels=[1])  #### In this function, you have the opportunity to pass "channels=[0]" or "channels=[1]" to select channel 1 or 2.

            ann = wfdb.rdann(f[:-4], 'atr')
            Symb = pd.Series(ann.symbol)
            Samp = pd.Series(ann.sample)
            Rhythm = pd.Series(ann.aux_note)
            records.append(sig)
            properties.append(fields)
            annot.append(ann)
            AnnSymb.append(Symb)
            AnnSamp.append(Samp)
            Rpeak_Samp = -1 #### NO R-peak_Samp
            AnnRhythm.append(Rhythm)

    
    AnnSymb = pd.Series(AnnSymb).values
    AnnSamp = pd.Series(AnnSamp).values
    
    return AnnSamp, Rpeak_Samp, AnnRhythm

In [3]:
def create_dataframe(data, AnnSamp, Rpeak_Samp, AnnRhythm):
    labeled_Rpeaks = []
    appended_data = []
    k=1
    
    ################################ AFDB ################################
    if str(data) == "AFDB": 
        for i in trange(23): ## 23 records
            for j in range(len(AnnSamp[i])-1): ## AnnSamp or AnnRhythm -- both same dimensions
                df = pd.DataFrame(Rpeak_Samp[i][(Rpeak_Samp[i] > AnnSamp[i][j]) & (Rpeak_Samp[i] < AnnSamp[i][j+1])])
                df['Label'] = AnnRhythm[i][j]
                df['ids'] = k
                appended_data.append(df)
            k+=1    

        labeled_Rpeaks = pd.concat(appended_data)

        final_df = labeled_Rpeaks.loc[(labeled_Rpeaks.Label == '(N') | (labeled_Rpeaks.Label == '(AFIB') | (labeled_Rpeaks.Label == '(AFL'), :]
        final_df['Label'] = final_df['Label'].map({'(N':0 ,'(AFIB':1, '(AFL':2})
        final_df['Label'] = final_df['Label'].astype(int)
        final_df.rename(columns={0: 'Rpeaks'},inplace=True, errors='raise')
        
    ################################ LTAFDB ################################    
    elif str(data) == "LTAFDB" or str(data) == "MITDB":
        ### Rescale_value = 2 for 128 Hz samples (LTAFDB) into 250 Hz samples (AFDB)
        ### Rescale_value = 0.694 for 360 Hz (MITDB) into 250 Hz samples (AFDB)
        rescale_value = 2 if str(data) == "LTAFDB" else 0.694
        
        appended_AnnSamp = [item for i in range(len(AnnSamp)) for item in AnnSamp[i]]
        appended_AnnSymb = [item for i in range(len(AnnRhythm)) for item in AnnRhythm[i]]
        appended_ids = []
        
        for i in range(len(AnnSamp)): #48 records
            for item in AnnSamp[i]: #loop trough each record
                appended_ids.append(k)
            k+=1
        
        df = pd.DataFrame({'Rpeaks':appended_AnnSamp, 'Label':appended_AnnSymb, 'ids':appended_ids})
        
        df['Rpeaks'] = df['Rpeaks']*rescale_value
        df = df.replace(r'^\s*$', np.nan, regex=True) ### Replace empty places with NaN
        df = df.fillna(method='ffill') ### Perform "forward fill" - filling instances (NaN) in front row with preceeeding value
        
        arrhythmia_list = ['(N', '(N\x00', '(AFIB', '(AFIB\x00', '(AFL', '(AFL\x00']
        final_df = df[df['Label'].isin(arrhythmia_list)]
        
        final_df.loc[df['Label'].astype(str) == '(N\x00', 'Label'] = '(N'
        final_df.loc[df['Label'].astype(str) == '(AFIB\x00', 'Label'] = '(AFIB'
        final_df.loc[df['Label'].astype(str) == '(AFL\x00', 'Label'] = '(AFL'
        
        final_df['Label'] = final_df['Label'].map({'(N':0 ,'(AFIB':1, '(AFL':2})
        final_df['Label'] = final_df['Label'].astype(int)
        
    elif str(data) == "NSRDB":
#         ### Rescaling frequency sampling of 128 Hz for NSRDB into 250 Hz (AFDB) and preprocess the dataset.
        appended_data = []
        appended_ids = []
        
        for i in trange(18): ## 18 records
            for item in AnnSamp[i]:
                appended_data.append(item*2)
                appended_ids.append(k)
            k+=1
        
        print(len(appended_data))
        print(len(appended_ids))
        final_df = pd.DataFrame({'Rpeaks':appended_data, 'Label':appended_data, 'ids':appended_ids})
        final_df['Label'] = 0
    
    
    return final_df

# AFDB

In [4]:
AnnSamp_AFDB, Rpeak_Samp_AFDB, AnnRhythm_AFDB = get_data("AFDB")

df_AFDB = create_dataframe("AFDB", AnnSamp_AFDB, Rpeak_Samp_AFDB, AnnRhythm_AFDB)

100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:25<00:00,  1.10s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 32.91it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['Label'] = final_df['Label'].map({'(N':0 ,'(AFIB':1, '(AFL':2})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['Label'] = final_df['Label'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: ht

In [5]:
df_AFDB

Unnamed: 0,Rpeaks,Label,ids
0,61,0,1
1,200,0,1
2,358,0,1
3,584,0,1
4,729,0,1
...,...,...,...
15373,2849597,0,23
15374,2849785,0,23
15375,2849975,0,23
15376,2850083,0,23


# LTAFDB

In [6]:
AnnSamp_LTAFDB, Rpeak_Samp_LTAFDB, AnnRhythm_LTAFDB = get_data("LTAFDB")

df_LTAFDB = create_dataframe("LTAFDB", AnnSamp_LTAFDB, Rpeak_Samp_LTAFDB, AnnRhythm_LTAFDB)

100%|██████████████████████████████████████████████████████████████████████████████████| 84/84 [05:39<00:00,  4.04s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['Label'] = final_df['Label'].map({'(N':0 ,'(AFIB':1, '(AFL':2})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['Label'] = final_df['Label'].astype(int)


In [7]:
df_LTAFDB

Unnamed: 0,Rpeaks,Label,ids
0,167214,0,1
1,167256,0,1
2,167472,0,1
3,167688,0,1
4,167906,0,1
...,...,...,...
8992738,8395432,1,84
8992739,8395532,1,84
8992740,8395694,1,84
8992741,8395876,1,84


# MITDB

In [8]:
AnnSamp_MITDB, Rpeak_Samp_MITDB, AnnRhythm_MITDB = get_data("MITDB")

df_MITDB = create_dataframe("MITDB", AnnSamp_MITDB, Rpeak_Samp_MITDB, AnnRhythm_MITDB)

100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:04<00:00, 10.58it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['Label'] = final_df['Label'].map({'(N':0 ,'(AFIB':1, '(AFL':2})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['Label'] = final_df['Label'].astype(int)


In [9]:
df_MITDB

Unnamed: 0,Rpeaks,Label,ids
0,12.492,0,1
1,53.438,0,1
2,256.780,0,1
3,459.428,0,1
4,656.524,0,1
...,...,...,...
112642,450265.118,0,48
112643,450433.760,0,48
112644,450608.648,0,48
112645,450777.984,0,48


# NSRDB

In [10]:
AnnSamp_NSRDB, Rpeak_Samp_NSRDB, AnnRhythm_NSRDB = get_data("NSRDB")

df_NSRDB = create_dataframe("NSRDB", AnnSamp_NSRDB, Rpeak_Samp_NSRDB, AnnRhythm_NSRDB)

100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:45<00:00,  2.55s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 34.05it/s]


1806792
1806792


In [11]:
df_NSRDB

Unnamed: 0,Rpeaks,Label,ids
0,2,0,1
1,104,0,1
2,258,0,1
3,414,0,1
4,568,0,1
...,...,...,...
1806787,19446442,0,18
1806788,19446610,0,18
1806789,19446776,0,18
1806790,19446938,0,18


In [12]:
frames = [df_AFDB, df_LTAFDB, df_MITDB, df_NSRDB] ### df_LTAFDB

final_df = pd.concat(frames)

In [13]:
final_df

Unnamed: 0,Rpeaks,Label,ids
0,61.0,0,1
1,200.0,0,1
2,358.0,0,1
3,584.0,0,1
4,729.0,0,1
...,...,...,...
1806787,19446442.0,0,18
1806788,19446610.0,0,18
1806789,19446776.0,0,18
1806790,19446938.0,0,18


In [14]:
# final_df.to_csv('Physionet_DataFrame.csv')

### Checking for wrong rythms in db

In [15]:
values_to_keep = [0,1,2] ### 713954 (total) - 708567 (NOT AFL) = 5387 AFL labels;   = 253862 AFIB labels;   = 454705 N labels
bool_result = final_df['Label'].isin(values_to_keep)
wrong_rythms = [i for i in bool_result if i is False]
len(wrong_rythms)

0