In [15]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from collections import Counter

In [16]:
# Data Import
file_path = Path("Resources/healthcare-dataset-stroke-data.csv")


stroke_df = pd.read_csv(file_path)

stroke_df.drop(columns=['id', 'gender', 'ever_married', 'work_type', 'Residence_type'], inplace=True)

stroke_df = stroke_df.dropna()

stroke_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,67.0,0,1,228.69,36.6,formerly smoked,1
2,80.0,0,1,105.92,32.5,never smoked,1
3,49.0,0,0,171.23,34.4,smokes,1
4,79.0,1,0,174.12,24.0,never smoked,1
5,81.0,0,0,186.21,29.0,formerly smoked,1


In [17]:
stroke_df=pd.get_dummies(stroke_df, columns=['smoking_status'])
stroke_df.drop_duplicates(inplace=True)
y = stroke_df['stroke']
stroke_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0
2,80.0,0,1,105.92,32.5,1,0,0,1,0
3,49.0,0,0,171.23,34.4,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,0,0,1,0
5,81.0,0,0,186.21,29.0,1,0,1,0,0


In [18]:
X = stroke_df.drop(columns="stroke", axis=1)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [20]:
scaler=MinMaxScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

#### Model Raw Data (Medical Only)

In [21]:
# Create model instance
rf_model = RandomForestClassifier(n_estimators=512) 

In [22]:
# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [23]:
# Making predictions
predictions = rf_model.predict(X_test)

In [24]:
# confusion matrix.
cm = confusion_matrix(y_test, predictions)

cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [25]:
# accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [26]:
# results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1155,15
Actual 1,56,2


Accuracy Score : 0.9421824104234527
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1170
           1       0.12      0.03      0.05        58

    accuracy                           0.94      1228
   macro avg       0.54      0.51      0.51      1228
weighted avg       0.91      0.94      0.93      1228



In [27]:
importances = rf_model.feature_importances_
importances

array([0.24245976, 0.0294893 , 0.02522736, 0.34730873, 0.28623027,
       0.01418913, 0.01630577, 0.02126681, 0.01752286])

In [28]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.347308728630153, 'avg_glucose_level'),
 (0.28623027453428834, 'bmi'),
 (0.24245976487491738, 'age'),
 (0.029489297654200237, 'hypertension'),
 (0.025227362927663036, 'heart_disease'),
 (0.021266805899216527, 'smoking_status_never smoked'),
 (0.017522856761750166, 'smoking_status_smokes'),
 (0.016305774980962917, 'smoking_status_formerly smoked'),
 (0.014189133736848518, 'smoking_status_Unknown')]