In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Load the raw stroke dataset from the CSV file (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer='healthcare-dataset-stroke-data.csv')

# Feature Engineering

In [3]:
# Drop the 'id' column: it looks like a row identifier rather than a feature (confirm against the dataset description).

In [4]:
# Remove the 'id' column by rebinding `df` instead of mutating it in place.
# errors='ignore' keeps this cell re-runnable: a second execution finds no
# 'id' column and leaves the frame unchanged rather than raising KeyError.
df = df.drop(columns='id', errors='ignore')

### Let's set up a DataFrame selector
*As originally seen in "Hands-On Machine Learning with Scikit-Learn, Keras & TensorFlow", 2nd Edition.*

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

In [13]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a fixed set of columns out of a DataFrame.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index ``X`` in ``transform`` (in this
        notebook, a list of column names).
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, X):
        """Index ``X`` with ``attribute_names`` and return its ``.values``."""
        selected = X[self.attribute_names]
        return selected.values

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import MinMaxScaler

In [15]:
# Preview the first five rows to see which columns remain.
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [19]:
# Numeric branch: select the numeric columns, fill missing values with the
# column mean, then min-max scale each column.
num_pip = Pipeline(
    steps=[
        (
            "gets_numeric",
            DataFrameSelector(
                ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
            ),
        ),
        ("imputer", SimpleImputer(strategy='mean')),
        ("scaler", MinMaxScaler()),
    ]
)

In [20]:
# Fit the numeric pipeline on the full frame and display the transformed array.
num_pip.fit_transform(X=df)

array([[0.81689453, 0.        , 1.        , 0.80126489, 0.30126002],
       [0.74365234, 0.        , 0.        , 0.67902317, 0.21298095],
       [0.97558594, 0.        , 1.        , 0.23451205, 0.25429553],
       ...,
       [0.42626953, 0.        , 0.        , 0.12865848, 0.2325315 ],
       [0.62158203, 0.        , 0.        , 0.51320284, 0.17525773],
       [0.53613281, 0.        , 0.        , 0.13922999, 0.18213058]])

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
# Categorical branch: select the string-valued columns and one-hot encode them.
# The encoder is left on its default sparse output. The explicit `sparse=True`
# argument was dropped because `sparse` was deprecated in scikit-learn 1.2 and
# removed in 1.4 (renamed `sparse_output`); relying on the default behaves the
# same on both old and new releases.
# NOTE(review): 'ever_married' appears as a Yes/No column in df.head() but is
# not selected here or in the numeric pipeline — confirm the omission is intended.
cat_pip = Pipeline(steps=[
    ("gets_cats", DataFrameSelector(['gender', 'work_type', 'Residence_type', 'smoking_status'])),
    ("cat_encoder", OneHotEncoder()),
])

In [23]:
# Fit the categorical pipeline on the full frame and display the encoded matrix.
cat_pip.fit_transform(X=df)

<5110x14 sparse matrix of type '<class 'numpy.float64'>'
	with 20440 stored elements in Compressed Sparse Row format>

In [24]:
from sklearn.pipeline import FeatureUnion

In [26]:
# Run both branches on the same input and concatenate their outputs:
# numeric features first, then the one-hot encoded categoricals.
full_pip = FeatureUnion(
    [
        ("num_pip", num_pip),
        ("cat_pip", cat_pip),
    ]
)

In [27]:
# Fit every branch on the full frame and build the combined feature matrix.
X_train = full_pip.fit_transform(X=df)

In [28]:
# Display the repr of the combined feature matrix.
X_train

<5110x19 sparse matrix of type '<class 'numpy.float64'>'
	with 36540 stored elements in Compressed Sparse Row format>

In [None]:
# Target vector: the 'stroke' column of the frame.
y_train = df.loc[:, 'stroke']