# AF classification from 9s ECG waveforms

#### Jiaxing (Joy) Qiu



Reference:
https://physionet.org/content/challenge-2017/1.0.0/

Objective:
1. In real-world use, given a "10s" ECG chart, the model aims to predict the risk of AF as a clinician would (the segments generated in this notebook are 9s long);
2. Interpretable CNN -- compare vertical / horizontal patterns in 10s ECGs from AF patients versus others.

Data Description:
- **data/REFERENCE.csv** contains outcome labels.
- **data** folder contains data for this project.
- **data/raw** subfolder contains the raw PhysioNet 2017 Challenge data in the original MATLAB file format: 8528 single-lead ECG recordings lasting from 9s to just over 60s. The ECG recordings were sampled at 300 Hz; the value unit is 0.1 millivolts.

- **data/image** subfolder contains the image-format dataset used to develop the image classifier. Within each outcome label, the records are split as follows --  
    + train has 70%
    + valid has 20%
    + test has 10%



In [None]:
import pandas as pd
import numpy as np
import random as rd
import scipy.io
import matplotlib.pyplot as plt
import os
from PIL import Image
import re

### prepare file paths

In [None]:
# original matlab file directory (input: the .mat ECG records are read from here)
mat_path = './data/raw'
# image file directory (output: the rendered ECG images are written below it)
img_path = './data/image'
# makedirs also creates a missing parent folder ('./data') and, with
# exist_ok=True, does not fail when the folder already exists, so this cell
# can be re-run safely.
os.makedirs(img_path, exist_ok=True)

In [None]:
# Outcome labels of all records; the code below relies on its 'fin' and 'label' columns.
df = pd.read_csv(filepath_or_buffer='./data/REFERENCE.csv')
def assign_train_valid_test(df, random_state=333, cuts=(.7, .9)):
    """Randomly assign every row of ``df`` to the "train", "valid" or "test" group.

    The rows are shuffled reproducibly, then the first ``cuts[0]`` share of the
    shuffled rows becomes "train", the rows up to ``cuts[1]`` become "valid"
    and the remainder becomes "test" (70% / 20% / 10% with the defaults).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``fin`` column holding the record identifier; group
        membership is matched on that column.
    random_state : int, default 333
        Seed passed to ``DataFrame.sample`` so the split is reproducible.
    cuts : tuple of two floats, default (.7, .9)
        Cumulative fractions at which the shuffled rows are cut.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (same rows, same order) with an extra column ``g``
        holding the group name. The input frame itself is left untouched.
    """
    shuffled = df.sample(frac=1, replace=False, random_state=random_state)
    train_end = int(cuts[0]*len(df))
    valid_end = int(cuts[1]*len(df))
    # Positional slicing yields the same pieces as np.split(shuffled, [train_end, valid_end])
    # without handing a DataFrame to np.split (believed to warn in newer pandas -- TODO confirm).
    valid_fins = shuffled['fin'].iloc[train_end:valid_end].tolist()
    test_fins = shuffled['fin'].iloc[valid_end:].tolist()

    # Work on a copy so the caller's frame (e.g. a groupby slice) is not mutated.
    out = df.copy()
    out['g'] = "train"  # everything that is neither valid nor test stays "train"
    out.loc[out['fin'].isin(valid_fins), 'g'] = "valid"
    out.loc[out['fin'].isin(test_fins), 'g'] = "test"
    return out

# Stratified split: run the train/valid/test assignment separately inside every outcome label.
# The groups are concatenated explicitly instead of going through groupby.apply, because the
# index and columns apply returns for this pattern are believed to differ between pandas
# versions (TODO confirm); iterating the groups always hands over the complete sub-frame.
df_new = pd.concat([assign_train_valid_test(grp) for _, grp in df.groupby('label')])
print(df_new.groupby(['label','g'])['fin'].nunique())
# index=False keeps label.csv to the data columns only, so reading it back does not pick up
# extra index columns.
df_new.to_csv("./data/label.csv", index=False) # save locally

label  g    
A      test       74
       train     516
       valid     148
N      test      505
       train    3535
       valid    1010
O      test      246
       train    1719
       valid     491
~      test       29
       train     198
       valid      57
Name: fin, dtype: int64


In [None]:
# Record ids = names of the .mat files in mat_path without the extension.
fl = os.listdir(mat_path)
# Slice the suffix off instead of re.sub(".mat", ...): that pattern is an unanchored regex
# whose "." matches any character, so it could also remove text from the middle of a name.
# Sorting makes the processing order reproducible (os.listdir returns an arbitrary order).
fins = sorted(fin[:-len('.mat')] for fin in fl if fin.endswith('.mat'))
# Per-record outcome label and train/valid/test group, as saved to ./data/label.csv.
label = pd.read_csv("./data/label.csv")

In [None]:
# pip install opencv-python
import cv2
from tqdm.auto import tqdm

# Cut every recording into consecutive, non-overlapping 9 s pieces and store each piece twice:
#   * as one row of `tbl` (samples followed by label, group, record id, piece index) for 1d conv
#   * as a grayscale .jpg under <img_path>/<group>/<label>/ for 2d conv
# A trailing remainder shorter than one piece is dropped.

sub_rcd_len = 9*300 # samples per piece: 9 s at 300 Hz

# image geometry: one pixel column per sample, 100 pixel rows
w_pixel = sub_rcd_len
h_pixel = 100
dpi = 100
w_fig = w_pixel/dpi # figure size in inches, so that savefig(dpi=dpi) gives w_pixel x h_pixel
h_fig = h_pixel/dpi

# (label, group) per record id, built once instead of filtering `label` for every piece.
# drop_duplicates keeps the first row per record, like the former `.values[0]` lookup.
first_rows = label.drop_duplicates('fin')
fin_info = dict(zip(first_rows['fin'], zip(first_rows['label'], first_rows['g'])))

rows = [] # rows are collected here and stacked once at the end (vstack per piece is quadratic)

for fin in tqdm(fins, desc="processing..."):
    rcd = scipy.io.loadmat(mat_path+'/'+fin+'.mat')
    rcd1d = rcd['val'].flatten() # 1D array
    n_sub = len(rcd1d)//sub_rcd_len
    if n_sub == 0:
        continue # recording shorter than one piece: nothing to export
    sub_rcds = [rcd1d[i*sub_rcd_len:(i+1)*sub_rcd_len] for i in range(n_sub)] # split into equal length pieces

    ### find label l and group g (identical for all pieces of one record)
    if fin not in fin_info:
        raise KeyError(f"record {fin} has no entry in ./data/label.csv")
    l, g = fin_info[fin]

    # create model group folder and outcome label folder in one go
    im_path = img_path+'/'+str(g)+'/'+str(l)+'/'
    os.makedirs(im_path, exist_ok=True)

    # loop through each sub record piece
    for i, sub_rcd in enumerate(sub_rcds):
        ### add to one table for 1d conv ###
        rows.append(np.append(sub_rcd, [l, g, fin, i]))

        ### convert to 2d image for 2d conv ###
        img_file = im_path+str(fin)+'_'+str(i)+'.jpg'
        fig = plt.figure(figsize=(w_fig,h_fig))
        plt.plot(sub_rcd)
        plt.axis('off')
        plt.savefig(img_file, dpi=dpi) # temporarily save as RGB
        plt.close(fig) # close explicitly so figures do not pile up in memory

        # Load the saved image as grayscale and overwrite the file with it
        image = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE)
        cv2.imwrite(img_file, image)

# Always a 2D array, also when only one piece (or none at all) was produced.
tbl = np.vstack(rows) if rows else np.empty((0, sub_rcd_len+4), dtype=str)


processing...:   0%|          | 0/8528 [00:00<?, ?it/s]

In [None]:
# Write the collected 9 s pieces (one row per piece) to disk without the row index.
pd.DataFrame(data=tbl).to_csv(path_or_buf="./data/data_9s.csv", index=False)

### Preprocess image version ECG data

In [1]:
# #
# fl = os.listdir(mat_path)
# fins = [re.sub(".mat","",fin) for fin in fl if fin.endswith('.mat')]
# label = pd.read_csv('./training2017/REFERENCE.csv')

# for fin in fins:
#     # load in record of ECG one by one
#     rcd = scipy.io.loadmat(mat_path+'/'+fin+'.mat')
#     rcd2d = rcd['val'] # 2D array
#     rcd1d = rcd2d.flatten() # 1D array

#     # find corresponding label
#     l = label.loc[label['fin']==fin, 'label'].values[0]
#     img_path_l = img_path+'/'+l
#     if not os.path.exists(img_path_l):
#         os.mkdir(img_path_l)
#     # create RGB image and save by label
#     plt.figure(figsize=(30,5))
#     plt.plot(rcd1d)
#     plt.axis('off');
#     plt.savefig(img_path_l+'/'+fin+'.png', dpi=100)


# #     # convert to grayscale image
#     image = Image.open(img_path+'/'+fin+'.png').convert("L")
#     arr = np.asarray(image)
#     plt.figure(figsize=(50,5))
#     plt.imshow(arr, cmap='gray', vmin=0, vmax=255)
#     plt.axis('off');
#     plt.savefig(img_path+'/'+fin+'.png', dpi=100)