# <center> Improved Arrhythmia Classification Using Select  </center>
# <center>Morphological and Heart Rate Variability ECG Features. </center>
## <center> Mark James Dunbar </center>

<center>School of Electronic Engineering and Computer Science </center>
<center>Queen Mary University of London </center>
<center>London, United Kingdom </center>
<center>ec21896@qmul.ac.uk </center>

## <center> MSc Big Data Science - Final Project </center>



---

# <center>MSc Final Project Data Pre-Processing Notebook</center>



In this notebook, I will extract and pre-process the ECG data to be used in model experimentation and the final pipeline. The dataset can be found here (https://www.kaggle.com/datasets/nelsonsharma/ecg-lead-2-dataset-physionet-open-access), and is open access.

## Imports

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# dataset paths used below (drive/MyDrive/...) can be read and written.
# force_remount=True asks Colab to mount again even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import datetime
import os
import random
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from scipy.ndimage import label
from tqdm import tqdm, notebook

def linebreak():
  """Print a single horizontal rule of underscores to visually separate output sections."""
  rule = '________________________________________________________________________________'
  print(rule)

## Data Preprocessing

Set the sample rate as a global variable, and the path of the raw ECG data to be extracted and pre-processed.

In [None]:
# Dataset sample rate in Hz (samples per second)
sample_rate = 128

# Root folder of the raw .npy data on the mounted Google Drive; the
# annotations file and the per-database sub-folders are joined onto this path.
dataset_path = 'drive/MyDrive/data/db_npy'

Check out the standard PhysioNet annotations provided in the original dataset.

In [None]:
# Location of the tab-separated annotation table shipped with the dataset
standard_annotations = os.path.join(dataset_path, 'annotations.txt')

# Read the table as a matrix of strings (one row per annotation symbol)
annotations_txt = np.loadtxt(standard_annotations, dtype='str', delimiter="\t")

# Rows whose second column is 'b' are beat annotations; everything else is non-beat
annotations_b, annotations_n = [], []
for n in annotations_txt:
  (annotations_b if n[1] == 'b' else annotations_n).append(n)

# Report how many annotations fall into each group
print("Beat Annotations:     ",len(annotations_b))
print("Non-Beat Annotations: ",len(annotations_n))

Beat Annotations:      19
Non-Beat Annotations:  22


AAMI recommended classes. We'll map the PhysioNet standard annotations to these classes. PhysioNet non-beat annotations will be replaced with the "No feature present" label: `_`

In [None]:
# Note: seven labels are defined here (the six classes below plus the
# "no feature present" label), but the F, X and Q classes will be removed from
# the dataset later due to the small number of beats in those classes.

N = 'N' # Normal (Non-Ectopic) Beats
S = 'S' # Supra Ventricular Ectopic Beats
V = 'V' # Ventricular Ectopic Beats
F = 'F' # Fusion Beats
Q = 'Q' # Unknown Beats
X = 'X' # Unmapped

# "No feature present" label. It is spelled out as an explicit string constant:
# a bare `_` is not a defined name in a plain Python module, and inside
# IPython/Jupyter it refers to the previous cell output rather than to '_'.
NO_FEATURE = '_'

# PhysioNet beat annotation symbol -> label used in this project
remapping_dict = {
              'N': N,
              'R': N,
              'B': N,
              'L': N,
              'A': S,
              'a': S,
              'n': S,
              'J': S,
              'S': S,
              'j': S,
              'e': S,
              'V': V,
              'r': V,
              'E': V,
              'F': F,
              'f': NO_FEATURE,
              '/': NO_FEATURE,
              'Q': Q,
              '?': Q,
             }

Class for extracting the data. Each record is saved as a pandas dataframe with the column 'voltage' for the actual ECG signal, and 'label' for the ECG AAMI labels corresponding to the indexes of the ECG signal.

In [None]:
class extractData:
  """Extract every record of one database into a pickled pandas dataframe.

  Constructing an instance performs the whole job: it reads the database's
  RECORDS file, removes previously extracted files for those records from
  ``extract_to`` and then writes one pickle per record. Each dataframe has a
  float ``voltage`` column (the ECG signal) and a ``label`` column holding the
  remapped beat label at annotated samples and '_' everywhere else.

  Args:
    database_name: Sub-folder of ``database_path`` holding the database.
    database_path: Root folder containing the database sub-folders.
    drop: Record names to skip.
    remapping_dict: Maps a raw annotation symbol to the label to store.
    extract_to: Folder the pickled dataframes are written to.

  Relies on ``linebreak`` and ``tqdm`` being available in the notebook.
  """

  # Samples kept per record (the original code documents this as exactly
  # 30 minutes; 230400 = 30 * 60 * 128).
  MAX_SAMPLES = 230400
  # Label given to samples without a beat annotation.
  NO_BEAT_LABEL = '_'

  def __init__(self, database_name, database_path, drop, remapping_dict, extract_to):
    self.database_name = database_name
    self.database_path = database_path
    self.remapping_dict = remapping_dict
    self.extract_to = extract_to
    # Drop unwanted records
    records_file = os.path.join(database_path, database_name, 'RECORDS')
    self.record_list = self._read_records(records_file, drop)
    self._tidy()
    self._extract()

  @staticmethod
  def _read_records(records_file, drop):
    """Return the record names listed one per line in ``records_file``, minus ``drop``.

    The file is read with plain Python instead of ``np.loadtxt`` with a
    newline delimiter: newer NumPy releases reject a newline as delimiter, and
    loadtxt returns a non-iterable 0-d array for a single-line file.
    """
    unwanted = set(drop or ())
    with open(records_file, encoding='utf-8') as handle:
      names = [line.strip() for line in handle]
    return [name for name in names if name and name not in unwanted]

  @staticmethod
  def _build_labels(n_samples, annotations, remapping_dict, fill='_'):
    """Return a length-``n_samples`` array of single-character labels.

    Every position starts as ``fill``. Each annotation is a pair
    ``(sample_index, symbol)``; the symbol is looked up once in
    ``remapping_dict`` (unknown symbols are kept unchanged) and stored one
    sample before ``sample_index``, as the original implementation did.
    Annotations whose index lies outside the signal are ignored.
    """
    labels = np.full(n_samples, fill, dtype='U1')
    for entry in annotations:
      idx = int(entry[0])
      if not 0 <= idx < n_samples:
        continue
      # The label array stores one character, so only the first one matters.
      symbol = str(entry[1])[:1]
      # NOTE(review): the one-sample shift is inherited from the original
      # code - confirm whether the annotation indexes are 1-based. The max()
      # stops an annotation at index 0 from wrapping round to the last sample.
      labels[max(idx - 1, 0)] = remapping_dict.get(symbol, symbol)
    return labels

  def _tidy(self):
    """Delete files in ``extract_to`` left over from an earlier extraction of these records."""
    linebreak()
    print(f"Beginning data extraction from {self.database_name}")
    # Make sure the target folder exists before listing / writing into it.
    os.makedirs(self.extract_to, exist_ok=True)
    stale_names = set(self.record_list)
    for f in tqdm(os.listdir(self.extract_to), desc='Old Data Removal'):
      # Pickles are saved under the bare record name, so match on that.
      if f in stale_names:
        os.remove(os.path.join(self.extract_to, f))

  def _extract(self):
    """Load voltages and beat annotations of every record and pickle them as one dataframe each."""
    source_dir = os.path.join(self.database_path, self.database_name)

    # Iterate over every record in the record list, extract voltages and labels as a single pandas dataframe
    for record_index in tqdm(self.record_list, desc='Data Extraction '):
      # Voltages
      voltages_filename = f"{record_index}_SIG_II.npy"
      voltages = np.ravel(np.load(os.path.join(source_dir, voltages_filename)))

      # Annotations
      annotations_filename = f"{record_index}_BEAT.npy"
      annotations = np.load(os.path.join(source_dir, annotations_filename))

      # One label per sample: remapped beat label at annotated samples, '_' elsewhere
      labels = self._build_labels(voltages.size, annotations, self.remapping_dict, self.NO_BEAT_LABEL)

      # Combine the data into a single pandas dataframe, truncated to MAX_SAMPLES.
      # Columns are built separately so the voltages are never converted to
      # strings and back.
      ecg_dataframe = pd.DataFrame({
          'voltage': voltages[:self.MAX_SAMPLES].astype(float),
          'label': labels[:self.MAX_SAMPLES],
      })
      if len(ecg_dataframe) < self.MAX_SAMPLES:
        print(f"ECG record too short for record {record_index}")

      # Pickle the dataframe in the extraction path
      ecg_dataframe.to_pickle(os.path.join(self.extract_to, f"{record_index}"))

    print(f"\nData successfully extracted from {self.database_name}")

## Extracting, Preprocessing and Saving the Data

In [None]:
# Path to extract data to
extract_to = "drive/MyDrive/data/ECG_extracted_data"

# Databases to extract from (each is a sub-folder of dataset_path)
databases = ['mitdb_npy', 'incartdb_npy', 'svdb_npy']

# Records to drop due to unclassified beats
# (the same list is passed to every database; records are matched by name)
records_to_drop = ['102', '104', '107', '207', '217']

# Constructing extractData runs the full extraction for that database:
# its __init__ calls _tidy() and then _extract().
for database in databases:
  extraction = extractData(database, dataset_path, records_to_drop, remapping_dict, extract_to)
  linebreak()
# Number of entries now present in the extraction folder
print(len(os.listdir(extract_to)))