In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from collections import Counter

In [18]:
# Data import: locate the pre-split CSVs (balanced with SMOTEENN and scaled,
# per the file names) and load them into feature frames and label series.
x_train_file = Path("Resources/X_train_SMOTEENN_balanced_scaled_train_dataset.csv")
y_train_file = Path("Resources/y_train_SMOTEENN_balanced_scaled_train_dataset.csv")
x_test_file = Path("Resources/X_test_SMOTEENN_balanced_scaled_train_dataset.csv")
y_test_file = Path("Resources/y_test_SMOTEENN_balanced_scaled_train_dataset.csv")

# Feature matrices are used exactly as stored.
X_train = pd.read_csv(x_train_file)
X_test = pd.read_csv(x_test_file)

# Labels: keep a single column from each label file, yielding a Series.
# NOTE(review): the train labels are read from column '0' while the test
# labels are read from 'stroke' — presumably an artifact of how the two files
# were written; confirm both columns hold the same target.
y_train = pd.read_csv(y_train_file)['0']
y_test = pd.read_csv(y_test_file)['stroke']

# Preview the training features.
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,-0.723988,0.328522,-0.300722,-0.215283,0.692615,-0.033583,-0.872666,-0.265636,1.626449,1.702739
1,1.379145,-2.099458,-0.300722,-0.215283,-1.443803,1.897353,-0.872666,-0.045597,-1.154692,-1.344075
2,1.379145,0.497602,-0.300722,-0.215283,0.692615,-0.033583,1.145914,1.800204,1.218347,1.702739
3,-0.723988,-0.516875,-0.300722,-0.215283,0.692615,-0.033583,-0.872666,-0.195650,2.805411,1.702739
4,-0.723988,-1.193192,-0.300722,-0.215283,-1.443803,-0.033583,1.145914,-0.954830,-1.230266,-1.344075
...,...,...,...,...,...,...,...,...,...,...
11667,-0.723988,0.762262,-0.300722,-0.215283,0.692615,-0.033583,-0.872666,2.231287,1.133651,-0.328471
11668,-0.723988,0.363244,-0.300722,-0.215283,0.692615,-0.999051,1.145914,-0.060650,0.445599,-1.344075
11669,1.379145,0.663229,-0.300722,-0.215283,0.692615,-0.999051,-0.872666,-0.732008,-1.092665,-1.344075
11670,-0.723988,0.994758,-0.300722,-0.215283,0.692615,-0.033583,-0.872666,2.275768,0.896119,-0.328471


#### Random Forest Model (SMOTEENN-Balanced, Scaled Data)

In [8]:
# Create model instance: a random forest with 512 trees.
# The seed is pinned because the forest is stochastic (bootstrap samples and
# per-split feature sub-sampling); without it the confusion matrix, accuracy
# and feature importances change on every Restart & Run All.
rf_model = RandomForestClassifier(n_estimators=512, random_state=42)

In [9]:
# Train the forest on the training features and labels.
# `fit` hands back the estimator itself, so the rebinding leaves `rf_model`
# pointing at the same, now fitted, object.
rf_model = rf_model.fit(X=X_train, y=y_train)

In [10]:
# Predict a class label for every row of the held-out test features.
predictions = rf_model.predict(X=X_test)

In [11]:
# Confusion matrix: rows are the true test labels, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the raw counts in a labelled frame so the table is readable on display.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [12]:
# Fraction of test rows whose predicted label matches the true label.
# The test split is heavily imbalanced (924 vs 42 rows per class in the report
# output), so read this number together with the per-class precision/recall.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [13]:
# Evaluation summary: labelled confusion matrix, overall accuracy, then the
# per-class precision / recall / f1 report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
# Heading and report are separated by a newline, exactly as two prints would be.
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,872,52
Actual 1,1,41


Accuracy Score : 0.9451345755693582
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       924
           1       0.44      0.98      0.61        42

    accuracy                           0.95       966
   macro avg       0.72      0.96      0.79       966
weighted avg       0.97      0.95      0.95       966



In [14]:
# Importance scores exposed by the fitted forest, one value per feature it was
# trained on (10 values for the 10 columns of X_train).
importances = rf_model.feature_importances_
# Bare expression so the array is shown as the cell's output.
importances

array([0.02385713, 0.46086017, 0.01208493, 0.00789193, 0.02163122,
       0.06934564, 0.02602369, 0.20186821, 0.12990296, 0.04653413])

In [15]:
# Pair every importance score with its column name, then order the pairs from
# the most to the least important feature (ties fall back to the name).
feature_ranking = list(zip(rf_model.feature_importances_, X_train.columns))
feature_ranking.sort(reverse=True)
feature_ranking

[(0.46086017316953587, 'age'),
 (0.20186820833841693, 'avg_glucose_level'),
 (0.12990296168662088, 'bmi'),
 (0.06934563590255742, 'work_type'),
 (0.046534130229486624, 'smoking_status'),
 (0.026023685249889086, 'Residence_type'),
 (0.023857133585690693, 'gender'),
 (0.02163121776860005, 'ever_married'),
 (0.012084926387788927, 'hypertension'),
 (0.00789192768141359, 'heart_disease')]