In [1]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

from sklearn import set_config
set_config(display='diagram')

import pandas as pd

# Seed NumPy's global RNG so later steps that draw from it are repeatable.
np.random.seed(0)

## Load the LongIsland_Heart_Data Set
heart_df = pd.read_csv('LongIsland_Heart_Data.csv')

# A notebook cell only echoes its *last* bare expression, so a mid-cell
# `heart_df.describe()` is computed and then thrown away. Print the summary
# explicitly so it actually appears, then show the (truncated) frame itself.
print(heart_df.describe())
print(heart_df)

      age  sex   cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0    63.0  1.0  4.0      10.0  60.0  2.0      1.0     12.0    3.0     11.0   
1    44.0  1.0  4.0       3.0   NaN  2.0      1.0      8.0    NaN     14.0   
2    60.0  1.0  4.0       5.0  27.0  2.0      1.0     19.0    3.0      6.0   
3    55.0  1.0  4.0      11.0  39.0  2.0      1.0     25.0    3.0     10.0   
4    66.0  1.0  3.0      33.0  22.0  3.0      2.0     53.0    3.0      5.0   
..    ...  ...  ...       ...   ...  ...      ...      ...    ...      ...   
195  54.0  0.0  4.0      41.0  95.0  3.0      1.0      NaN    NaN     14.0   
196  62.0  1.0  1.0       1.0  30.0  2.0      1.0      1.0    1.0      1.0   
197  55.0  1.0  4.0      37.0  33.0  3.0      1.0      4.0    2.0      NaN   
198  58.0  1.0  NaN       1.0   3.0  NaN      NaN      1.0    1.0      1.0   
199  62.0  1.0  2.0      34.0   NaN  2.0      NaN     47.0    3.0     14.0   

     slope   ca  thal  diagnosis of heart disease  
0      3.0 

In [11]:
# Show every sample (row) that has at least 4 missing values.
# isna() flags missing cells; summing across the columns counts them per row.
heart_df[heart_df.isna().sum(axis="columns") >= 4]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,diagnosis of heart disease
69,63.0,1.0,2.0,1.0,26.0,3.0,1.0,,1.0,,,1.0,,2
120,62.0,1.0,3.0,1.0,,,1.0,1.0,,,1.0,1.0,1.0,3
130,,,3.0,,29.0,3.0,1.0,,,,1.0,1.0,3.0,2
156,64.0,1.0,,15.0,7.0,2.0,1.0,,3.0,,3.0,,1.0,3
175,58.0,1.0,4.0,,22.0,2.0,1.0,55.0,,14.0,,,1.0,1
186,61.0,,3.0,,,2.0,0.0,51.0,3.0,14.0,1.0,1.0,,4


In [13]:
# Restrict the data to four selected features: chol, thalach, sex and cp.
X_reduced = heart_df.loc[
    :,
    ['chol', 'thalach', 'sex', 'cp'],
]

# Preview the first rows of the reduced frame.
X_reduced.head(5)

Unnamed: 0,chol,thalach,sex,cp
0,60.0,12.0,1.0,4.0
1,,8.0,1.0,4.0
2,27.0,19.0,1.0,4.0
3,39.0,25.0,1.0,4.0
4,22.0,53.0,1.0,3.0


In [15]:
# Names of the columns handled as numeric vs. categorical.
numeric_features = ['chol', 'thalach']
categorical_features = ['sex', 'cp']

# Numeric columns: fill each gap with the median of that column.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: fill each gap with the constant -1 so that
# "missing" stays visible as its own value.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value=-1, strategy='constant')),
])

# Route each column group to its own imputer.
preprocessor = ColumnTransformer([
    ('numerical_imputer', numeric_transformer, numeric_features),
    ('categorical_imputer', categorical_transformer, categorical_features),
])



In [16]:
# Apply the preprocessor pipeline to the reduced frame.
clf = Pipeline(steps=[('preprocessor', preprocessor)])
new_X_reduced = clf.fit_transform(X_reduced)

# Wrap the transformed array back into a DataFrame.
# - The column labels are passed as a *flat* list: wrapping the list in another
#   list (`[names]`) makes pandas build a one-level MultiIndex, after which
#   `df['chol']` returns a DataFrame instead of a Series.
# - The label order (numeric first, then categorical) relies on the
#   ColumnTransformer emitting its outputs in the order its transformers
#   are listed.
# - Reusing X_reduced's index keeps the rows aligned with the source frame.
new_X_reduced_df = pd.DataFrame(
    new_X_reduced,
    columns=numeric_features + categorical_features,
    index=X_reduced.index,
)

print(new_X_reduced_df)

     chol thalach  sex   cp
0    60.0    12.0  1.0  4.0
1    29.0     8.0  1.0  4.0
2    27.0    19.0  1.0  4.0
3    39.0    25.0  1.0  4.0
4    22.0    53.0  1.0  3.0
..    ...     ...  ...  ...
195  95.0    19.0  0.0  4.0
196  30.0     1.0  1.0  1.0
197  33.0     4.0  1.0  4.0
198   3.0     1.0  1.0 -1.0
199  29.0    47.0  1.0  2.0

[200 rows x 4 columns]
