In [1]:
import ast

import numpy as np
import pandas as pd

# Load the metadata file
metadata = pd.read_csv('ptbxl_database.csv')

# Preview the first five rows (nothing is dropped or modified here)
metadata.head()

Unnamed: 0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,...,True,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr
1,2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,...,True,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr
2,3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,...,True,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr
3,4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,...,True,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr
4,5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,...,True,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr


In [2]:
# List the column names of the metadata table
metadata.columns

Index(['ecg_id', 'patient_id', 'age', 'sex', 'height', 'weight', 'nurse',
       'site', 'device', 'recording_date', 'report', 'scp_codes', 'heart_axis',
       'infarction_stadium1', 'infarction_stadium2', 'validated_by',
       'second_opinion', 'initial_autogenerated_report', 'validated_by_human',
       'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems',
       'extra_beats', 'pacemaker', 'strat_fold', 'filename_lr', 'filename_hr'],
      dtype='object')

ecg_id: A unique identifier for each ECG recording. <br>
patient_id: A unique identifier for each patient.<br>
age: The age of the patient in years.<br>
sex: The sex of the patient, stored as an integer code (0 or 1).<br>
height: The height of the patient in centimeters (if available).<br>
weight: The weight of the patient in kilograms (if available).<br>
nurse: A numeric identifier for the nurse or technician who performed the ECG recording.<br>
site: A numeric identifier for the hospital or clinic where the ECG recording was performed.<br>
device: The type of ECG device used to record the data.<br>
recording_date: The date on which the ECG recording was performed.<br>
report: The diagnostic report prepared by a physician based on the ECG recording and other clinical data.<br>
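scp_codes: A dictionary, stored as a string, that maps each SCP-ECG statement code assigned to the recording (the cardiac diagnoses and clinical findings observed in the ECG) to its likelihood score, e.g. {'NORM': 100.0, 'SR': 0.0}. The codes are described in scp_statements.csv.<br>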
heart_axis: The mean electrical axis of the heart as determined from the ECG recording.<br>
infarction_stadium1: The stage of myocardial infarction, if present.<br>
infarction_stadium2: A more detailed description of the stage of myocardial infarction, if present.<br>
validated_by: The name of the expert physician who validated the diagnostic report.<br>
second_opinion: Whether a second expert physician provided a second opinion on the diagnostic report.<br>
initial_autogenerated_report: Whether the diagnostic report was initially generated automatically by a computer algorithm.<br>
validated_by_human: Whether the diagnostic report was validated by a human expert.<br>
baseline_drift: The leads in which baseline drift was noted in the ECG recording (empty if none was noted).<br>
static_noise: The leads in which static noise was noted in the ECG recording (empty if none was noted).<br>
burst_noise: Whether burst noise was present in the ECG recording.<br>
electrodes_problems: Whether any problems were present with the ECG electrodes during the recording.<br>
extra_beats: Whether any extra beats were present in the ECG recording.<br>
pacemaker: Whether the patient had a pacemaker at the time of the recording.<br>
strat_fold: The fold number used for cross-validation in the original study.<br>
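filename_lr: The relative path (without file extension) of the low-resolution version of the ECG recording, stored under records100/ (100 Hz sampling rate).<br>
filename_hr: The relative path (without file extension) of the high-resolution version of the ECG recording, stored under records500/ (500 Hz sampling rate).<br>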

In [96]:
# Read the SCP statement definitions. The CSV's first column has no header
# (pandas calls it 'Unnamed: 0') and holds the statement code, so name it.
scp = pd.read_csv('scp_statements.csv').rename(columns={'Unnamed: 0': 'name'})

In [97]:
# Display the SCP statement table (pandas truncates the middle rows when rendering)
scp

Unnamed: 0,name,description,diagnostic,form,rhythm,diagnostic_class,diagnostic_subclass,Statement Category,SCP-ECG Statement Description,AHA code,aECG REFID,CDISC Code,DICOM Code
0,NDT,non-diagnostic T abnormalities,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,non-diagnostic T abnormalities,,,,
1,NST_,non-specific ST changes,1.0,1.0,,STTC,NST_,Basic roots for coding ST-T changes and abnorm...,non-specific ST changes,145.0,MDC_ECG_RHY_STHILOST,,
2,DIG,digitalis-effect,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,suggests digitalis-effect,205.0,,,
3,LNGQT,long QT-interval,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,long QT-interval,148.0,,,
4,NORM,normal ECG,1.0,,,NORM,NORM,Normal/abnormal,normal ECG,1.0,,,F-000B7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,BIGU,"bigeminal pattern (unknown origin, SV or Ventr...",,,1.0,,,Statements related to ectopic rhythm abnormali...,"bigeminal pattern (unknown origin, SV or Ventr...",,,,
67,AFLT,atrial flutter,,,1.0,,,Statements related to impulse formation (abnor...,atrial flutter,51.0,MDC_ECG_RHY_ATR_FLUT,,
68,SVTAC,supraventricular tachycardia,,,1.0,,,Statements related to impulse formation (abnor...,supraventricular tachycardia,55.0,MDC_ECG_RHY_SV_TACHY,,D3-31290
69,PSVT,paroxysmal supraventricular tachycardia,,,1.0,,,Statements related to impulse formation (abnor...,paroxysmal supraventricular tachycardia,,MDC_ECG_RHY_SV_TACHY_PAROX,,


In [98]:
# Map each coarse category name to the SCP statement codes assigned to it.
# NOTE(review): 'ISCIN', 'INJAS', 'IPMI', 'ILMI', 'ASMI' and 'ALMI' are listed under
# both 'Ventricular abnormalities' and 'Ischemic heart disease'. get_label returns
# the first category (in insertion order) that contains a code, so these six always
# resolve to 'Ventricular abnormalities' -- confirm this is intended.
label={
    'Normal ECG findings': ['NDT', 'NST_', 'NORM'],
    'Atrial abnormalities': ['LAO/LAE', 'AFIB', 'AFLT', 'SR', 'PSVT', 'SVTAC', 'SVARR', 'BIGU', 'SARRH', 'SBRAD'],
    'Ventricular abnormalities': ['VCLVH','LVH', 'IMI', 'ASMI', 'ILMI', 'ALMI', 'ISCIN', 'LMI', 'ANEUR', 'RVH', 'IPMI', 'INJAS', 'INJAL', 'INJIL', 'SEHYP', 'INJIN'],
    'Conduction abnormalities': ['LAFB', 'IRBBB', 'IVCD', 'CRBBB', 'CLBBB', 'LPFB', 'ILBBB', 'ISCAL', 'ISCLA', 'ISCIL', 'ISCAS', 'ISCAN', 'ISC_', 'ABQRS', 'PVC', 'LPR','1AVB'],
    'Ischemic heart disease': ['STD_','AMI', 'ISCIN', 'INJAS', 'IPMI', 'ILMI', 'ASMI', 'ALMI', 'PMI','IPLMI','INJLA'],
    'Miscellaneous abnormalities': ['QWAVE','DIG', 'LVOLT', 'HVOLT', 'LNGQT', 'LOWT', 'NT_', 'TAB_', 'STE_', 'PRC(S)', 'STACH', 'PACE', 'PAC', 'INVT'],
    'Other abnormalities': ['RAO/RAE', 'EL', 'WPW', '3AVB', '2AVB','TRIGU']
}


In [134]:
# Show the category names next to their code lists (a dict iterates over its keys)
(list(label), list(label.values()))

(['Normal ECG findings',
  'Atrial abnormalities',
  'Ventricular abnormalities',
  'Conduction abnormalities',
  'Ischemic heart disease',
  'Miscellaneous abnormalities',
  'Other abnormalities'],
 [['NDT', 'NST_', 'NORM'],
  ['LAO/LAE',
   'AFIB',
   'AFLT',
   'SR',
   'PSVT',
   'SVTAC',
   'SVARR',
   'BIGU',
   'SARRH',
   'SBRAD'],
  ['VCLVH',
   'LVH',
   'IMI',
   'ASMI',
   'ILMI',
   'ALMI',
   'ISCIN',
   'LMI',
   'ANEUR',
   'RVH',
   'IPMI',
   'INJAS',
   'INJAL',
   'INJIL',
   'SEHYP',
   'INJIN'],
  ['LAFB',
   'IRBBB',
   'IVCD',
   'CRBBB',
   'CLBBB',
   'LPFB',
   'ILBBB',
   'ISCAL',
   'ISCLA',
   'ISCIL',
   'ISCAS',
   'ISCAN',
   'ISC_',
   'ABQRS',
   'PVC',
   'LPR',
   '1AVB'],
  ['STD_',
   'AMI',
   'ISCIN',
   'INJAS',
   'IPMI',
   'ILMI',
   'ASMI',
   'ALMI',
   'PMI',
   'IPLMI',
   'INJLA'],
  ['QWAVE',
   'DIG',
   'LVOLT',
   'HVOLT',
   'LNGQT',
   'LOWT',
   'NT_',
   'TAB_',
   'STE_',
   'PRC(S)',
   'STACH',
   'PACE',
   'PAC',
   'INVT']

In [99]:
# Keep only the recording id and its raw SCP codes. The explicit copy makes
# meta_final an independent frame, so overwriting one of its columns later
# neither touches `metadata` nor triggers pandas' SettingWithCopyWarning.
meta_final = metadata.loc[:, ['ecg_id', 'scp_codes']].copy()

In [137]:
def get_label(dic, lbl):
    """Return the first key of ``dic`` whose value contains ``lbl``.

    Parameters
    ----------
    dic : dict
        Maps a category name to a collection of codes.
    lbl : object
        The code to look up.

    Returns
    -------
    The key of the first entry (in insertion order) whose value contains
    ``lbl``, or ``None`` when no entry contains it.
    """
    # Walk the key/value pairs directly rather than rebuilding the key and
    # value lists on every iteration.
    for category, codes in dic.items():
        if lbl in codes:
            return category
    return None

In [139]:
def _dominant_scp_code(raw_codes):
    """Return the SCP code with the highest likelihood in one ``scp_codes`` cell.

    Parameters
    ----------
    raw_codes : str
        A dict literal stored as text, e.g. ``"{'NORM': 100.0, 'SR': 0.0}"``.

    Returns
    -------
    The code whose likelihood is largest, or ``None`` when the dict is empty.
    When several codes share the largest likelihood, the one listed first wins.
    """
    # Parse the literal safely instead of stripping braces and quotes by hand.
    likelihoods = ast.literal_eval(raw_codes)
    if not likelihoods:
        return None
    # max() keeps the first maximal element it encounters, and a dict iterates
    # in insertion order, so ties resolve to the first-listed code.
    return max(likelihoods, key=lambda code: float(likelihoods[code]))


# One category label per recording, in the same order as the rows of meta_final.
# A recording with no codes, or whose top code is in no category, gets None.
label_list = [
    get_label(label, _dominant_scp_code(raw_codes))
    for raw_codes in meta_final['scp_codes']
]

    
        

In [142]:
# Overwrite the raw SCP code strings with the category label derived for each recording
meta_final['scp_codes']=label_list

In [143]:
# Display the relabelled table
meta_final

Unnamed: 0,ecg_id,scp_codes
0,1,Normal ECG findings
1,2,Normal ECG findings
2,3,Normal ECG findings
3,4,Normal ECG findings
4,5,Normal ECG findings
...,...,...
21794,21833,Normal ECG findings
21795,21834,Normal ECG findings
21796,21835,Conduction abnormalities
21797,21836,Normal ECG findings


In [144]:
# Write the labelled table to disk; index=False keeps the row index out of the CSV
meta_final.to_csv('meta_final.csv', index=False)