In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB #Modeling using Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import LabelEncoder, StandardScaler
from io import StringIO
import seaborn
from pydot import graph_from_dot_data
%matplotlib inline

# Location of the stroke-prediction CSV inside the Kaggle input directory
filename = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'

# Load the raw dataset and preview its first rows (head() defaults to 5)
data = pd.read_csv(filename)
data.head()


In [2]:
# Count the missing values in every column
data.isnull().sum()

In [3]:
# Fill null values of bmi with the column mean
bmi_mean = data['bmi'].mean()
data['bmi'] = data['bmi'].fillna(bmi_mean)
data.head(5)

In [4]:
# Check the data columns and their types
# (info() prints a per-column summary of the frame)
data.info()

In [5]:
# The row identifier carries no predictive signal, so remove it before analysis
data = data.drop('id', axis=1)
# Summary statistics of the remaining numeric columns
data.describe()

In [6]:
# Categorical columns to chart; the target 'stroke' itself is the last panel
categories = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
    'stroke',
]

# 4 x 2 grid: one axis per entry in `categories`
fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(20, 25))
axs = axs.flatten()

# One count plot per category, with the bars split by stroke outcome
for ax, col_name in zip(axs, categories):
    seaborn.countplot(
        x=col_name,
        data=data,
        ax=ax,
        hue=data['stroke'],
        palette=['#432371', '#FAAE7B'],
    )
    ax.set_xlabel(col_name, weight='bold')
    ax.set_ylabel('Count', weight='bold')

In [7]:
# Age distribution of male patients who had a stroke.
# 'gender' still holds its raw string labels at this point in the notebook
# (label encoding only happens in a later cell), so the filter must compare
# against 'Male'. Comparing against an encoded integer matches no rows here
# and draws an empty histogram on a fresh top-to-bottom run.
age_bins = [20, 30, 40, 50, 60, 70, 80, 90, 100]
male_stroke_age = data.loc[(data.stroke == 1) & (data.gender == 'Male'), 'age']
ax = male_stroke_age.plot(kind='hist', bins=age_bins, rwidth=0.8)
ax.set(title='Age of male patients with a stroke', xlabel='age')
plt.show()

In [8]:
# Age distribution of female patients who had a stroke.
# 'gender' still holds its raw string labels at this point in the notebook
# (label encoding only happens in a later cell), so the filter must compare
# against 'Female'. Comparing against an encoded integer matches no rows here
# and draws an empty histogram on a fresh top-to-bottom run.
age_bins = [20, 30, 40, 50, 60, 70, 80, 90, 100]
female_stroke_age = data.loc[(data.stroke == 1) & (data.gender == 'Female'), 'age']
ax = female_stroke_age.plot(kind='hist', bins=age_bins, rwidth=0.8)
ax.set(title='Age of female patients with a stroke', xlabel='age')
plt.show()

In [9]:
# Transform string-valued columns to integer codes for classification.

# Continuous columns; they are left untouched here. The name `columns` is kept
# because the scaling cell further down reads it.
# (The former `data.drop(columns=columns, axis=1)` call was removed: its result
# was never assigned, so it had no effect on `data`.)
columns = ['avg_glucose_level','bmi','age']

labelEncoder = LabelEncoder()

# fit_transform re-fits the shared encoder for every column, so after the loop
# the encoder only remembers the classes of the last column processed.
for encoded_col in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    data[encoded_col] = labelEncoder.fit_transform(data[encoded_col])

In [None]:
# Fit a decision tree on the whole dataset and render it to decision_tree.png
y = data['stroke']
x = data.drop(columns='stroke')
features = x.columns

model_dt = DecisionTreeClassifier(random_state=1234)
trained_model = model_dt.fit(x, y)

# Export the fitted tree as DOT text into an in-memory buffer
dotfile = StringIO()
export_graphviz(
    trained_model,
    out_file=dotfile,
    feature_names=features,
    class_names=['[y=0]', '[y=1]'],  # ascending numerical order of the labels
    filled=True,
    rounded=True,
)

# The DOT text is expected to describe exactly one graph; unpacking enforces that
[graph] = graph_from_dot_data(dotfile.getvalue())
graph.write_png("decision_tree.png")

In [None]:
# Replace the 'Other' gender category with the most frequent gender.
# By this point 'gender' has already been label-encoded to integers, so a
# string match on 'Other' alone never hits. LabelEncoder assigns codes in
# sorted class order (Female=0, Male=1, Other=2), so the encoded value is
# matched too; the string match is kept in case the column is still raw.
OTHER_GENDER_CODE = 2
gender_mode = data['gender'].mode().iloc[0]
is_other = (data['gender'] == 'Other') | (data['gender'] == OTHER_GENDER_CODE)
data.loc[is_other, 'gender'] = gender_mode

# Only the last expression of a cell is rendered automatically, so the first
# table is shown explicitly.
display(data.gender.value_counts())
data.heart_disease.value_counts()

In [None]:
# Stroke outcome counts among patients that have heart disease
data.loc[data['heart_disease'] == 1, 'stroke'].value_counts()

In [None]:
# Mean / median BMI per gender, shown on the raw values before standardization.
# (Previously this expression sat mid-cell and its result was silently dropped.)
display(data.groupby(['gender'])['bmi'].agg(['mean', 'median']))

# Standardize the continuous features.
stdScaler = StandardScaler()
numeric_cols = ['avg_glucose_level', 'bmi', 'age']
stand_scaled = pd.DataFrame(
    stdScaler.fit_transform(data[numeric_cols]),
    columns=numeric_cols,
    index=data.index,
)

# Overwrite the raw columns with their scaled versions. Concatenating instead
# would leave two columns under each name (raw + scaled), so the unscaled
# values would still be fed to the models alongside the scaled ones.
data[numeric_cols] = stand_scaled
data.head(5)

In [None]:
# Prepare the predictors (x) and the target (y)
y = data['stroke']
x = data.drop(columns='stroke')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=200,
)

def printMatrix(arr):
    """Print a 2-D iterable, one row per line, with cells separated by tabs.

    Args:
        arr: iterable of rows, each row an iterable of values; every value is
            rendered with ``str()``.
    """
    for row in arr:
        print(*row, sep='\t')

In [None]:
# K-nearest neighbours: fit on the training split, score on the held-out split
model_knn = KNeighborsClassifier()
y_predict = model_knn.fit(x_train, y_train).predict(x_test)
arg_test = dict(y_true=y_test, y_pred=y_predict)

# Compute both summaries up front, then print them
knn_confusion = confusion_matrix(**arg_test)
knn_report = classification_report(**arg_test)

print("Confusion Matrix:")
printMatrix(knn_confusion)

print("\nClassification Report:\n")
print(knn_report)

In [None]:
# Random forest: fit on the training split, score on the held-out split.
# The forest is built with random bootstrapping and feature sampling, so the
# seed is fixed (same value as the decision tree above) to keep the reported
# metrics identical across Restart & Run All.
model_rf = RandomForestClassifier(random_state=1234)
model_rf.fit(x_train, y_train)
y_predict = model_rf.predict(x_test)
arg_test = {'y_true':y_test, 'y_pred':y_predict}

print("Confusion Matrix:")
printMatrix(confusion_matrix(**arg_test))

print("\nClassification Report:\n")
print(classification_report(**arg_test))

In [None]:
# Decision tree: re-fit the existing `model_dt` estimator on the training split
# only, then score it on the held-out split
y_predict = model_dt.fit(x_train, y_train).predict(x_test)
arg_test = dict(y_true=y_test, y_pred=y_predict)

# Compute both summaries up front, then print them
dt_confusion = confusion_matrix(**arg_test)
dt_report = classification_report(**arg_test)

print("Confusion Matrix:")
printMatrix(dt_confusion)

print("\nClassification Report:\n")
print(dt_report)

In [None]:
# Gaussian naive Bayes: fit on the training split, score on the held-out split
model_nb = GaussianNB()
y_predict = model_nb.fit(x_train, y_train).predict(x_test)
arg_test = dict(y_true=y_test, y_pred=y_predict)

# Compute both summaries up front, then print them
nb_confusion = confusion_matrix(**arg_test)
nb_report = classification_report(**arg_test)

print("Confusion Matrix:")
printMatrix(nb_confusion)

print("\nClassification Report:\n")
print(nb_report)