# Preparing

Import neccesary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

Get data

In [2]:
df = pd.read_csv("D:\\Learn\\Uni\\ML\\CapstoneProject\\data\\healthcare-dataset-stroke-data.csv")
df = df.drop('id', axis=1)
print(df.shape)

(5104, 11)


In [3]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
#check for missing data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5104 entries, 0 to 5103
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5104 non-null   object 
 1   age                5104 non-null   float64
 2   hypertension       5104 non-null   int64  
 3   heart_disease      5104 non-null   int64  
 4   ever_married       5104 non-null   object 
 5   work_type          5104 non-null   object 
 6   Residence_type     5104 non-null   object 
 7   avg_glucose_level  5104 non-null   float64
 8   bmi                4903 non-null   float64
 9   smoking_status     5104 non-null   object 
 10  stroke             5104 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 438.8+ KB


# Data preprocessing


In [5]:
# fill na with mean value
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5104 entries, 0 to 5103
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5104 non-null   object 
 1   age                5104 non-null   float64
 2   hypertension       5104 non-null   int64  
 3   heart_disease      5104 non-null   int64  
 4   ever_married       5104 non-null   object 
 5   work_type          5104 non-null   object 
 6   Residence_type     5104 non-null   object 
 7   avg_glucose_level  5104 non-null   float64
 8   bmi                5104 non-null   float64
 9   smoking_status     5104 non-null   object 
 10  stroke             5104 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 438.8+ KB


In [6]:
# label encode
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df['ever_married'] = le.fit_transform(df['ever_married'])
df['work_type'] = le.fit_transform(df['work_type'])
df['Residence_type'] = le.fit_transform(df['Residence_type'])
df['smoking_status'] = le.fit_transform(df['smoking_status'])

df.head()


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,0,61.0,0,0,1,3,0,202.21,28.891862,2,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1


In [7]:
from sklearn.model_selection import train_test_split
features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status']
y = df['stroke']        
X = df[features]        

implementing oversampling with smote

In [8]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()        
X, y = oversample.fit_resample(X, y)

split dataset into training dataset and testing dataset

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
count = y_test.value_counts().get(0,0)
print(count)

(7771, 10) (1943, 10) (7771,) (1943,)
981


#  Training model

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
model = RandomForestClassifier()

In [13]:
model.fit(X_train,y_train)

In [14]:
print("Training accuracy on Decision Tree: ", model.score(X_train, y_train)*100)
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

cm = confusion_matrix(y_test, model.predict(X_test))         
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
print("Confusion Matrix: \n", cm)
print("Model testing accuracy:", ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]))*100, "%", sep=' ')
print(f'K-Fold Validation Mean Accuracy: {accuracies.mean()*100:.2f}%')
print(f'Precision Score: {precision_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'Recall Score: {recall_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'F1 Score: {f1_score(y_test, model.predict(X_test))*100:.2f}%')
print()

Training accuracy on Decision Tree:  100.0
Confusion Matrix: 
 [[914  67]
 [ 35 927]]
Model testing accuracy: 94.75038600102934 %
K-Fold Validation Mean Accuracy: 94.21%
Precision Score: 93.26%
Recall Score: 96.36%
F1 Score: 94.79%



In [15]:
import joblib 
joblib.dump(model, "rf.pkg")
print("Model saved successfully")

Model saved successfully


Finding best parameter for model by using GridSearch

In [14]:

# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
#     'criterion' :[ 'gini' , 'entropy','log_loss'],
#     'max_features': ['auto', 'sqrt', 'log2']
# }

# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
# grid_search.fit(X_train, y_train)
# best = grid_search.best_params_
# print(best)

In [15]:
model = RandomForestClassifier(criterion='log_loss', max_features='sqrt', max_depth=100)

In [16]:
model.fit(X_train,y_train)

# Score

In [17]:
print("Training accuracy on Decision Tree: ", model.score(X_train, y_train)*100)

Training accuracy on Decision Tree:  100.0


In [18]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

cm = confusion_matrix(y_test, model.predict(X_test))         
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
print("Confusion Matrix: \n", cm)
print("Model testing accuracy:", ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]))*100, "%", sep=' ')
print(f'K-Fold Validation Mean Accuracy: {accuracies.mean()*100:.2f}%')
print(f'Precision Score: {precision_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'Recall Score: {recall_score(y_test, model.predict(X_test))*100:.2f}%')
print(f'F1 Score: {f1_score(y_test, model.predict(X_test))*100:.2f}%')
print()

Confusion Matrix: 
 [[891  73]
 [ 35 944]]
Model testing accuracy: 94.44158517756047 %
K-Fold Validation Mean Accuracy: 94.07%
Precision Score: 92.82%
Recall Score: 96.42%
F1 Score: 94.59%



# Conclusion

- the score of this model is pretty good with this problem and with this dataset as well
- the implementation of random forest model is significant better than decision tree with this dataset
- althought there are fluctuations in the score of the model due to the randomly spliting the dataset into training set and testing set and the process of create tree and decide feature in the model, the fluctions is negligible and in the acceptable range. 