# Pre-Processing Pipelines

## Clinical Risk Factor Pre-Processing

In [60]:
import pandas as pd
import numpy as np

# Relative path of the clinical risk factor (CRF) CSV file that is loaded with pd.read_csv below.
crf_path = "CRFs.csv"

In [61]:
# Read the clinical risk factor table into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer=crf_path)
df.head(n=5)

Unnamed: 0,Record,Gender,Age,Weight,Height,BSA,BMI,Smoker,SBP,DBP,IMT MAX,LVMi,EF,Vascular event
0,1911,M,56,105,180,2.29,32.41,yes,140.0,80.0,4.0,123.0,66.0,none
1,2012,M,72,83,169,1.97,29.06,no,130.0,75.0,,121.0,69.0,none
2,2019,F,80,80,165,1.91,29.38,no,177.0,75.0,2.5,164.0,56.0,none
3,2020,M,77,88,178,2.09,27.77,no,140.0,85.0,2.7,115.0,67.0,none
4,2025,F,66,80,174,1.97,26.42,no,110.0,65.0,1.5,98.0,66.0,none


In [62]:
# Remove the record identifier and the IMT MAX / LVMi / EF measurements; none of
# them is referenced by the remaining cells of this section.
# errors='ignore' keeps the cell re-runnable: `df` is rebound to the reduced
# frame, so without it a second execution would raise KeyError for the columns
# that were already removed.
df = df.drop(columns=['Record', 'IMT MAX', 'LVMi', 'EF'], errors='ignore')
df.head()

Unnamed: 0,Gender,Age,Weight,Height,BSA,BMI,Smoker,SBP,DBP,Vascular event
0,M,56,105,180,2.29,32.41,yes,140.0,80.0,none
1,M,72,83,169,1.97,29.06,no,130.0,75.0,none
2,F,80,80,165,1.91,29.38,no,177.0,75.0,none
3,M,77,88,178,2.09,27.77,no,140.0,85.0,none
4,F,66,80,174,1.97,26.42,no,110.0,65.0,none


Gender, Smoker and Vascular event values need to be encoded

In [63]:
# Upper-case the categorical labels so that, e.g., 'yes' and 'YES' end up in the
# same category when the columns are encoded.
for column in ('Gender', 'Smoker'):
    df[column] = df[column].str.upper()

One-Hot encoding of Gender and Smoker values

In [64]:
# Build indicator columns (Gender_*, Smoker_*) for the two categorical features.
one_hot = pd.get_dummies(df[['Gender', 'Smoker']])

# Swap the original text columns for their indicator columns, appended on the right.
df_multi = pd.concat(
    [df.drop(columns=['Gender', 'Smoker']), one_hot],
    axis=1,
)
df_multi.head()

Unnamed: 0,Age,Weight,Height,BSA,BMI,SBP,DBP,Vascular event,Gender_F,Gender_M,Smoker_NO,Smoker_YES
0,56,105,180,2.29,32.41,140.0,80.0,none,False,True,False,True
1,72,83,169,1.97,29.06,130.0,75.0,none,False,True,True,False
2,80,80,165,1.91,29.38,177.0,75.0,none,True,False,True,False
3,77,88,178,2.09,27.77,140.0,85.0,none,False,True,True,False
4,66,80,174,1.97,26.42,110.0,65.0,none,True,False,True,False


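We need to choose between binary and multi-class classification for the vascular event label. The next cell builds the binary version: any value other than 'none' counts as an event.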

In [65]:
# Binary target: work on a copy so df_multi keeps the original event labels.
df_bin_class = df_multi.copy()

# True for every label other than the literal 'none', False for 'none'.
# Note: a missing (NaN) label is not equal to 'none' and is therefore also marked True.
df_bin_class['Vascular event'] = df_bin_class['Vascular event'].ne('none')
df_bin_class.head()

Unnamed: 0,Age,Weight,Height,BSA,BMI,SBP,DBP,Vascular event,Gender_F,Gender_M,Smoker_NO,Smoker_YES
0,56,105,180,2.29,32.41,140.0,80.0,False,False,True,False,True
1,72,83,169,1.97,29.06,130.0,75.0,False,False,True,True,False
2,80,80,165,1.91,29.38,177.0,75.0,False,True,False,True,False
3,77,88,178,2.09,27.77,140.0,85.0,False,False,True,True,False
4,66,80,174,1.97,26.42,110.0,65.0,False,True,False,True,False


BSA Calculation (Mosteller formula) is $\sqrt{\frac{Weight (kg) \times Height (cm)}{3600}}$ : https://www.registerednursern.com/body-surface-area-calculations-nursing-review/

BMI Calculation is $\frac{Weight (kg)}{Height (m)^{2}}$ (the Height column is in cm, so divide it by 100 first) : https://www.registerednursern.com/bmi-calculation-formula-explained/

In [67]:
# Convert the boolean indicator columns and the boolean target to 0/1 integers.
indicator_columns = ['Gender_F', 'Gender_M', 'Smoker_NO', 'Smoker_YES', 'Vascular event']
df_bin_class[indicator_columns] = df_bin_class[indicator_columns].astype(int)
df_bin_class.head()

Unnamed: 0,Age,Weight,Height,BSA,BMI,SBP,DBP,Vascular event,Gender_F,Gender_M,Smoker_NO,Smoker_YES
0,56,105,180,2.29,32.41,140.0,80.0,0,0,1,0,1
1,72,83,169,1.97,29.06,130.0,75.0,0,0,1,1,0
2,80,80,165,1.91,29.38,177.0,75.0,0,1,0,1,0
3,77,88,178,2.09,27.77,140.0,85.0,0,0,1,1,0
4,66,80,174,1.97,26.42,110.0,65.0,0,1,0,1,0


## ECG Signal Pre-Processing

WFDB Documentation: https://wfdb.readthedocs.io/en/latest/index.html

In [None]:
import wfdb

# Relative directory path; presumably where the ECG records read via wfdb live —
# its use is outside this view, TODO confirm.
ecg_data_path = "dataset/"