In [None]:
# Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Handling missing values

missing_values = ["?","-"," ", ""]
df = pd.read_csv("healthcare-dataset-stroke-data.csv", na_values=missing_values)

In [None]:
# Checking for NA values

df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [None]:
# Checking for Unique Values

df.nunique()

id                   5110
gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

In [None]:
# Dropping identifier variables

df = df.dropna()
df = df.drop(["id"],axis=1)


In [None]:
# Checking total number of records

len(df)

4909

In [None]:
# Seperating dependant and independant variables

Y = df['stroke']
len(Y)

4909

In [None]:
# Removing the independant varible from dependant variable

df = df.drop(['stroke'], axis=1)

In [None]:
# Seperating continuos variables

conti_var = ["bmi","avg_glucose_level","age"]
conti_df = df[conti_var]
conti_df.head()

Unnamed: 0,bmi,avg_glucose_level,age
0,36.6,228.69,67.0
2,32.5,105.92,80.0
3,34.4,171.23,49.0
4,24.0,174.12,79.0
5,29.0,186.21,81.0


In [36]:
len(conti_df)

4909

In [None]:
# Checking for clear high correlation

variables = []
correlation = []
for var1 in conti_var:
    for var2 in conti_var:
        if var1 == var2:
            pass
        else:
            cor_val, _ = spearmanr(conti_df[var1], conti_df[var2])
            print("Correlation between " + var1 + " and " + var2 + "is: ", cor_val)
            if abs(cor_val) > 0.5:
                variables.append({var1,var2})
                correlation.append(cor_val)

print(variables, correlation)

Correlation between bmi and avg_glucose_levelis:  0.11437029781765023
Correlation between bmi and ageis:  0.37564955990962323
Correlation between avg_glucose_level and bmiis:  0.11437029781765025
Correlation between avg_glucose_level and ageis:  0.14080904558032697
Correlation between age and bmiis:  0.3756495599096232
Correlation between age and avg_glucose_levelis:  0.14080904558032695
[] []


In [None]:
# Seperating categorical variables

cat_var = list(set(df.columns) - set(conti_var)) 

In [39]:
cat_var

['hypertension',
 'ever_married',
 'Residence_type',
 'smoking_status',
 'heart_disease',
 'work_type',
 'gender']

In [40]:
cat_df = df[cat_var]

In [None]:
# Checking for clear high correlation 

variables = []
correlation = []
for var1 in cat_var:
    for var2 in cat_var:
        if var1 == var2:
            pass
        else:
            cor_val, _ = spearmanr(cat_df[var1], cat_df[var2])
            print("Correlation between " + var1 + " and " + var2 + "is: ", cor_val)
            if abs(cor_val) > 0.5:
                variables.append({var1,var2})
                correlation.append(cor_val)
            

Correlation between hypertension and ever_marriedis:  0.16240626040283884
Correlation between hypertension and Residence_typeis:  -0.0010741461750661688
Correlation between hypertension and smoking_statusis:  0.10824293024583871
Correlation between hypertension and heart_diseaseis:  0.11599099148786933
Correlation between hypertension and work_typeis:  -0.03580569389875176
Correlation between hypertension and genderis:  0.021669875718188767
Correlation between ever_married and hypertensionis:  0.16240626040283884
Correlation between ever_married and Residence_typeis:  0.004989171140676837
Correlation between ever_married and smoking_statusis:  0.25824125177832397
Correlation between ever_married and heart_diseaseis:  0.11124512057147098
Correlation between ever_married and work_typeis:  -0.32898598198089946
Correlation between ever_married and genderis:  -0.03692569248470313
Correlation between Residence_type and hypertensionis:  -0.0010741461750661688
Correlation between Residence_typ

In [None]:
#printing all significant correlations

print(variables, correlation)

[] []


In [None]:
# Onehot Encoding for categorical variables

cat_df = pd.get_dummies(cat_df)
cat_df.head()

Unnamed: 0,hypertension,heart_disease,ever_married_No,ever_married_Yes,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,gender_Female,gender_Male,gender_Other
0,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0
2,0,1,0,1,1,0,0,0,1,0,0,0,1,0,0,0,1,0
3,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0
4,1,0,0,1,1,0,0,0,1,0,0,0,0,1,0,1,0,0
5,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0


In [44]:
len(cat_df)

4909

In [None]:
# Applying standard scaling to continuos variables

scaler = StandardScaler()
conti_df = scaler.fit_transform(conti_df)
conti_df = pd.DataFrame(conti_df)

In [46]:
len(conti_df)

4909

In [None]:
# Creating final training data 

conti_df = np.array(conti_df)
cat_df = np.array(cat_df)

X = np.hstack((cat_df,conti_df))

In [48]:
len(X)

4909

In [None]:
X = pd.DataFrame(X)

In [None]:
# Training with a Decision Tree classifier

clf = DecisionTreeClassifier(random_state=1)
cross_val = cross_val_score(clf, X, Y, cv=100)

In [None]:
# Getting accuarcy metrics

average_cross_val = cross_val.sum()/(len(cross_val)+1)
accuracy = round(average_cross_val, 3)*100 

In [57]:
print("Model Accuracy is: " + str(accuracy) + "%")

Model Accuracy is: 90.3%
