In [1]:
import pandas as pd

stroke = pd.read_csv('healthcare-dataset-stroke-data.csv', encoding='utf-8')

stroke.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [2]:
stroke_ = stroke.copy()

stroke_ = stroke_.drop(['id'], axis=1)

In [3]:
stroke_ = stroke_[stroke_['gender'] != 'Other']

In [4]:
from sklearn.preprocessing import LabelEncoder

def df_labeling(df, columns):
    le_list = []
    for column in columns:
        le = LabelEncoder()
        le.fit(df[column])
        df[column] = le.transform(df[column])
        le_list.append(le)
    return le_list

columns = ['gender','ever_married','work_type','Residence_type','smoking_status']
le_list = df_labeling(stroke_, columns)

In [5]:
stroke_1_mean = stroke_[stroke_['stroke'] == 1]['bmi'].mean()
stroke_0_mean = stroke_[stroke_['stroke'] == 0]['bmi'].mean()

In [6]:
stroke_.loc[ stroke_['bmi'].isnull() & (stroke_['stroke'] == 0), 'bmi' ] = stroke_0_mean
stroke_.loc[ stroke_['bmi'].isnull() & (stroke_['stroke'] == 1), 'bmi' ] = stroke_1_mean

In [7]:
le_list[0].classes_, le_list[1].classes_, le_list[2].classes_

(array(['Female', 'Male'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children'],
       dtype=object))

In [8]:
len(stroke_[ stroke_['stroke'] == 0]) / len(stroke_)

0.9512624779800353

---

In [9]:
y = stroke_[['stroke']]
X = stroke_[stroke_.columns.difference(['stroke'])]

In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

train_data, test_data, train_label, test_label = \
        train_test_split(X, y, test_size=0.3, random_state=0)

In [11]:
from imblearn.over_sampling import SMOTE

sm = SMOTE()

X_over_data, y_over_data = sm.fit_resample(train_data, train_label)

In [18]:
import tensorflow as tf
from tensorflow.keras import datasets, utils
from tensorflow.keras import (models, layers, activations, 
                                initializers, losses, optimizers, metrics)

model = models.Sequential() 

model.add(
    layers.Dense(
        input_dim=10, units=256, activation=None, kernel_initializer=initializers.he_uniform()
    )
) 
model.add(layers.Activation('elu')) # elu or relu

model.add(
    layers.Dense(
        units=512, activation=None, kernel_initializer=initializers.he_uniform()
    )
) 
model.add(layers.Activation('elu')) 

model.add(
    layers.Dense(
        units=512, activation=None, kernel_initializer=initializers.he_uniform()
    )
) 
model.add(layers.Activation('elu'))

model.add(
    layers.Dense(
        units=256, activation=None, kernel_initializer=initializers.he_uniform()
    )
) 
model.add(layers.Activation('elu')) 
model.add(layers.Dropout(rate=0.5))

model.add(layers.Dense(units=2, activation='softmax')) # One-hot vector for 0 & 1


model.compile(optimizer=optimizers.Adam(), 
              loss=losses.sparse_categorical_crossentropy, 
              metrics=[metrics.categorical_accuracy])

history = model.fit(X_over_data, y_over_data, batch_size=100, epochs=20, validation_split=0.3, verbose=0)
result = model.evaluate(test_data, test_label)

print('loss (cross-entropy) :', result[0])
print('test accuracy :', result[1])

loss (cross-entropy) : 0.799070654122251
test accuracy : 1.0


In [20]:
tf.keras.__version__

'2.2.4-tf'

In [21]:
import imblearn

imblearn.__version__

'0.8.0'