In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib as plt
import tensorflow as tf

In [2]:
stroke_df = pd.read_csv('Resources/healthcare-dataset-stroke-data.csv')
stroke_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
stroke_df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [4]:
stroke_df.dropna(inplace=True)

In [5]:
stroke_df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [6]:
stroke_df.drop(columns='id', inplace=True)
stroke_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [7]:
stroke1_df = stroke_df.loc[stroke_df['stroke'] == 1]
stroke1_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
243,Female,68.0,1,1,Yes,Private,Urban,247.51,40.5,formerly smoked,1
244,Male,57.0,0,0,Yes,Private,Rural,84.96,36.7,Unknown,1
245,Female,14.0,0,0,No,children,Rural,57.93,30.9,Unknown,1
246,Female,75.0,0,0,Yes,Self-employed,Rural,78.80,29.3,formerly smoked,1


In [8]:
stroke0_df = stroke_df.loc[stroke_df['stroke'] == 0]
stroke0ran_df = stroke0_df.sample(n=209)
stroke0ran_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1871,Male,56.00,1,0,Yes,Govt_job,Urban,72.79,23.8,smokes,0
504,Female,76.00,0,0,Yes,Govt_job,Urban,96.29,25.4,smokes,0
2685,Male,80.00,1,0,Yes,Self-employed,Urban,178.89,27.4,Unknown,0
4749,Female,55.00,0,0,Yes,Private,Urban,59.36,34.1,smokes,0
415,Female,33.00,0,0,Yes,Private,Rural,71.16,46.5,smokes,0
...,...,...,...,...,...,...,...,...,...,...,...
1733,Female,45.00,0,0,Yes,Private,Rural,77.19,37.2,smokes,0
296,Female,66.00,0,0,Yes,Private,Rural,141.24,28.5,never smoked,0
3447,Female,1.88,0,0,No,children,Rural,97.26,16.7,Unknown,0
1582,Female,78.00,0,0,No,Private,Rural,87.70,29.6,never smoked,0


In [9]:
frames = [stroke1_df, stroke0ran_df]
eqStroke_df = pd.concat(frames)
eqStroke_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.00,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.00,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.00,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.00,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.00,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
1733,Female,45.00,0,0,Yes,Private,Rural,77.19,37.2,smokes,0
296,Female,66.00,0,0,Yes,Private,Rural,141.24,28.5,never smoked,0
3447,Female,1.88,0,0,No,children,Rural,97.26,16.7,Unknown,0
1582,Female,78.00,0,0,No,Private,Rural,87.70,29.6,never smoked,0


In [10]:
stroke_features_df = eqStroke_df.drop(columns='stroke')
stroke_features_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.00,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
2,Male,80.00,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.00,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.00,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked
5,Male,81.00,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked
...,...,...,...,...,...,...,...,...,...,...
1733,Female,45.00,0,0,Yes,Private,Rural,77.19,37.2,smokes
296,Female,66.00,0,0,Yes,Private,Rural,141.24,28.5,never smoked
3447,Female,1.88,0,0,No,children,Rural,97.26,16.7,Unknown
1582,Female,78.00,0,0,No,Private,Rural,87.70,29.6,never smoked


In [11]:
stroke_features_df = pd.get_dummies(stroke_features_df)
stroke_features_df.head()
stroke_features_df.nunique()

age                                79
hypertension                        2
heart_disease                       2
avg_glucose_level                 414
bmi                               212
gender_Female                       2
gender_Male                         2
ever_married_No                     2
ever_married_Yes                    2
work_type_Govt_job                  2
work_type_Never_worked              2
work_type_Private                   2
work_type_Self-employed             2
work_type_children                  2
Residence_type_Rural                2
Residence_type_Urban                2
smoking_status_Unknown              2
smoking_status_formerly smoked      2
smoking_status_never smoked         2
smoking_status_smokes               2
dtype: int64

In [21]:
X=stroke_features_df
y= eqStroke_df['stroke']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state = 42)

In [22]:
scaler = StandardScaler()
scaler.fit(Xtrain)

X_train_scaled = scaler.transform(Xtrain)
X_test_scaled = scaler.transform(Xtest)


In [23]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=20))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [24]:
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

INFO:tensorflow:Reloading Oracle from existing project .\untitled_project\oracle.json


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [16]:
tuner.search(X_train_scaled, ytrain,epochs=20,validation_data=(X_test_scaled,ytest))

INFO:tensorflow:Oracle triggered exit


In [17]:
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'tanh',
 'first_units': 7,
 'num_layers': 3,
 'units_0': 9,
 'units_1': 7,
 'units_2': 7,
 'units_3': 3,
 'units_4': 5,
 'units_5': 5,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [18]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(Xtest, ytest, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

ValueError: Shapes (20, 7) and (21, 7) are incompatible

In [None]:
predictions = best_model.predict(Xtest)
rounded_predictions = np.argmax(predictions, axis=-1)

In [None]:
%matplotlib inline
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt

In [None]:
cm = confusion_matrix(y_true=ytest, y_pred=rounded_predictions)


In [None]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    
    if normalize:
        cm = cm.astype('float') / cm.sum(asix=1)[:, mp.newaxis]
        print('Normalized confusion matrix')
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
cm_plot_labels = ['No Stroke','Stroke']

In [None]:
plot_confusion_matrix(cm=cm, classes=cm_plot_labels, title='Confusion Matrix')