## Imports

In [1]:
import pandas as pd
import numpy as np
import wfdb
import ast

from helpers import load_raw_data

## Load data

In [2]:
#path = '../input/ptb-xl-dataset/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1/'
path = 'ptb-xl_ecg/'

Y = pd.read_csv(path + 'ptbxl_database.csv', index_col='ecg_id')
Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))

X = load_raw_data(df = Y, path=path)

100%|██████████| 21837/21837 [10:09<00:00, 35.81it/s]


## Initial analysis

In [3]:
print('X shape:', X.shape)
print('Y shape:', Y.shape, '\n')

print(Y.columns, '\n')

print('Unique classes:', Y['scp_codes'].explode().unique(), '\n')

print(Y.head())

X shape: (21837, 1000, 12)
Y shape: (21837, 27) 

Index(['patient_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site',
       'device', 'recording_date', 'report', 'scp_codes', 'heart_axis',
       'infarction_stadium1', 'infarction_stadium2', 'validated_by',
       'second_opinion', 'initial_autogenerated_report', 'validated_by_human',
       'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems',
       'extra_beats', 'pacemaker', 'strat_fold', 'filename_lr', 'filename_hr'],
      dtype='object') 

Unique classes: ['NORM' 'LVOLT' 'SR' 'SBRAD' 'IMI' 'ABQRS' 'SARRH' 'AFLT' 'AFIB' 'NDT'
 'NST_' 'DIG' 'LVH' 'LPFB' 'LNGQT' 'LAFB' 'IRBBB' 'RAO/RAE' 'RVH' 'IVCD'
 'LMI' 'ASMI' 'AMI' 'ISCAL' '1AVB' 'STACH' 'ISC_' 'PACE' 'ISCLA' 'SEHYP'
 'ISCIL' 'ILMI' 'PVC' 'CRBBB' 'CLBBB' 'ALMI' 'ANEUR' 'ISCAS' 'TAB_'
 'HVOLT' 'PAC' 'LOWT' 'STD_' 'EL' 'NT_' 'QWAVE' 'INVT' 'LPR' 'VCLVH'
 'LAO/LAE' 'ILBBB' 'ISCIN' 'SVTAC' 'INJAS' 'INJAL' 'IPMI' 'WPW' 'ISCAN'
 'INJLA' 'BIGU' 'TRIGU' 'IPLMI' '

We obviously have some columns here that we cannot use for features. The ones we can remove on the go are:
- patient_id: the unique identifier for each patient, not useful as a feature
- recording_date: the date of the recording, not useful as a feature in this format
- report: the text of the report, which we will not use directly as a feature
- filename_lr: the filename of the left eye image, not useful as a feature
- filename_hr: the filename of the right eye image, not useful as a feature

Let's look at the remaining columns:

## Heart Beat Diagnostic data

In [None]:
# Load scp_statements.csv for diagnostic aggregation
agg_df = pd.read_csv(path+'scp_statements.csv', index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]
print(agg_df.shape)
agg_df.head()

(44, 12)


Unnamed: 0,description,diagnostic,form,rhythm,diagnostic_class,diagnostic_subclass,Statement Category,SCP-ECG Statement Description,AHA code,aECG REFID,CDISC Code,DICOM Code
NDT,non-diagnostic T abnormalities,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,non-diagnostic T abnormalities,,,,
NST_,non-specific ST changes,1.0,1.0,,STTC,NST_,Basic roots for coding ST-T changes and abnorm...,non-specific ST changes,145.0,MDC_ECG_RHY_STHILOST,,
DIG,digitalis-effect,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,suggests digitalis-effect,205.0,,,
LNGQT,long QT-interval,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,long QT-interval,148.0,,,
NORM,normal ECG,1.0,,,NORM,NORM,Normal/abnormal,normal ECG,1.0,,,F-000B7


- NOTE: only 44 of the original 71 statements are diagnostics and will be used

### Applying diagnostic superclass

In [13]:
def aggregate_superclass_diagnostic(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df.index:
            tmp.append(agg_df.loc[key].diagnostic_class)
    return list(set(tmp))
    
# Apply diagnostic superclass
Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_superclass_diagnostic)
# Apply length of diagnostic superclass
Y['diagnostic_superclass_len'] = Y['diagnostic_superclass'].apply(len)
Y.loc[Y.diagnostic_superclass_len > 1, 'diagnostic_superclass']

ecg_id
39       [MI, STTC]
45        [CD, HYP]
50         [MI, CD]
77         [MI, CD]
87       [STTC, CD]
            ...    
21815      [MI, CD]
21821    [CD, NORM]
21827    [MI, STTC]
21828    [MI, STTC]
21829    [CD, NORM]
Name: diagnostic_superclass, Length: 5158, dtype: object

In [11]:
# Find the distribution of the diagnostic superclasses
print('Distribution of diagnostic superclasses:')
print(Y['diagnostic_superclass'].explode().value_counts(), '\n')


Distribution of diagnostic superclasses:
diagnostic_superclass
NORM    9528
MI      5486
STTC    5250
CD      4907
HYP     2655
Name: count, dtype: int64 



### Brief superclass description
- NORM: Normal ECG
- MI: Miocardial Infarction (heart attack)
- STTC: ST-T wave changes
- CD: Conduction disturbances. Disturbances in the electrical conduction system of the heart.
- HYP: Hypertrophy. Thickening of the heart muscle.

In [16]:
# Find the distribution of the Length of the diagnostic superclasses
print('Distribution of the Length of the diagnostic superclasses:')
print(Y['diagnostic_superclass_len'].value_counts(), '\n')

Distribution of the Length of the diagnostic superclasses:
diagnostic_superclass_len
1    16272
2     4079
3      920
0      407
4      159
Name: count, dtype: int64 



- Only 407 unclassified samples
- Up to 4 superclasses on the same ECG record

## Diagnostic Subclasses

In [17]:
def aggregate_suberclass_diagnostic(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df.index:
            tmp.append(agg_df.loc[key].diagnostic_subclass)
    ret = list(set(tmp))
    ret = ['sub_'+r for r in ret]
    return ret

# Apply diagnostic subclass
Y['diagnostic_subclass'] = Y.scp_codes.apply(aggregate_suberclass_diagnostic)
# Apply length of diagnostic subclass
Y['diagnostic_subclass_len'] = Y['diagnostic_subclass'].apply(len)
Y.loc[Y.diagnostic_subclass_len > 1, 'diagnostic_subclass']


ecg_id
26                    [sub_STTC, sub_NST_]
28                    [sub_STTC, sub_NST_]
39           [sub_STTC, sub_IMI, sub_NST_]
45       [sub_RVH, sub_IRBBB, sub_RAO/RAE]
50                     [sub_IVCD, sub_LMI]
                       ...                
21821                [sub_IRBBB, sub_NORM]
21827                  [sub_IMI, sub_ISCA]
21828                  [sub_IMI, sub_ISCA]
21829                [sub_IRBBB, sub_NORM]
21832            [sub_LAFB/LPFB, sub_IVCD]
Name: diagnostic_subclass, Length: 6191, dtype: object

In [18]:
# Find the distribution of the diagnostic subclasses
print('Distribution of diagnostic subclasses:')
print(Y['diagnostic_subclass'].explode().value_counts(), '\n')


Distribution of diagnostic subclasses:
diagnostic_subclass
sub_NORM         9528
sub_IMI          3281
sub_AMI          3086
sub_STTC         2244
sub_LVH          2137
sub_LAFB/LPFB    1800
sub_ISC_         1275
sub_IRBBB        1118
sub_ISCA          944
sub__AVB          827
sub_IVCD          789
sub_NST_          770
sub_CRBBB         542
sub_CLBBB         536
sub_LAO/LAE       427
sub_ISCI          398
sub_LMI           201
sub_RVH           126
sub_RAO/RAE        99
sub_WPW            80
sub_ILBBB          77
sub_SEHYP          30
sub_PMI            17
Name: count, dtype: int64 



In [19]:
# Find the distribution of the Length of the diagnostic subclasses
print('Distribution of the Length of the diagnostic subclasses:')
print(Y['diagnostic_subclass_len'].value_counts(), '\n')


Distribution of the Length of the diagnostic subclasses:
diagnostic_subclass_len
1    15239
2     4171
3     1439
4      475
0      407
5      102
6        4
Name: count, dtype: int64 



## Data reformatting

- As per now, X includes only the 12 ECG signals for all patients
- Y includes

Let's 

In [22]:
Y.columns

Index(['patient_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site',
       'device', 'recording_date', 'report', 'scp_codes', 'heart_axis',
       'infarction_stadium1', 'infarction_stadium2', 'validated_by',
       'second_opinion', 'initial_autogenerated_report', 'validated_by_human',
       'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems',
       'extra_beats', 'pacemaker', 'strat_fold', 'filename_lr', 'filename_hr',
       'diagnostic_superclass', 'diagnostic_superclass_len',
       'diagnostic_subclass', 'diagnostic_subclass_len'],
      dtype='object')