In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from collections import Counter

In [2]:
# Data Import
# Pre-split feature/label CSVs (SMOTEENN-balanced and scaled, per the file names).
x_train_file = Path("Resources/X_train_SMOTEENN_balanced_scaled_train_dataset.csv")
x_test_file = Path("Resources/X_test_SMOTEENN_balanced_scaled_train_dataset.csv")
y_train_file = Path("Resources/y_train_SMOTEENN_balanced_scaled_train_dataset.csv")
y_test_file = Path("Resources/y_test_SMOTEENN_balanced_scaled_train_dataset.csv")

# Columns excluded from this experiment; declared once so the train and test
# frames are guaranteed to drop exactly the same features.
NON_MEDICAL_COLUMNS = ['work_type', 'ever_married', 'Residence_type']

# drop() returns a new frame, so nothing is mutated in place.
X_train = pd.read_csv(x_train_file).drop(columns=NON_MEDICAL_COLUMNS)
X_test = pd.read_csv(x_test_file).drop(columns=NON_MEDICAL_COLUMNS)

# Keep only the target column, as a Series.
y_train = pd.read_csv(y_train_file)['stroke']
y_test = pd.read_csv(y_test_file)['stroke']

# Fail fast if features and labels are misaligned or the feature sets diverge.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

y_train

0       0
1       0
2       0
3       0
4       0
       ..
6568    1
6569    1
6570    1
6571    1
6572    1
Name: stroke, Length: 6573, dtype: int64

#### Random Forest on SMOTEENN-Balanced Data (Medical Features Only)

In [3]:
# Create model instance
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so the metrics and importances below are reproducible on re-run.
rf_model = RandomForestClassifier(n_estimators=512, random_state=42)

In [4]:
# Train the forest on the training split (fit() hands back the estimator itself).
rf_model = rf_model.fit(X=X_train, y=y_train)

In [5]:
# Predict a class label for every row of the held-out test features.
predictions = rf_model.predict(X=X_test)

In [6]:
# confusion matrix.
# Pin the class order explicitly so the matrix is always 2x2 and its rows and
# columns line up with the labels below, even if a class is missing from
# y_test or from the predictions.
class_labels = [0, 1]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Rows are the true classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

In [7]:
# Fraction of test rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [8]:
# Evaluation summary: confusion matrix, overall accuracy, per-class metrics.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,830,94
Actual 1,20,22


Accuracy Score : 0.8819875776397516
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.90      0.94       924
           1       0.19      0.52      0.28        42

    accuracy                           0.88       966
   macro avg       0.58      0.71      0.61       966
weighted avg       0.94      0.88      0.91       966



In [9]:
# Importance score of each feature, read from the fitted random forest.
# The next cell pairs these values positionally with X_train.columns.
importances = rf_model.feature_importances_
importances

array([0.02869446, 0.44537928, 0.03623954, 0.02067725, 0.15668653,
       0.12841732, 0.18390562])

In [10]:
# Pair every importance with its column name, then list them largest first.
importance_pairs = zip(rf_model.feature_importances_, X_train.columns)
sorted(importance_pairs, reverse=True)

[(0.44537928359278056, 'age'),
 (0.1839056210524201, 'smoking_status'),
 (0.15668652650124315, 'avg_glucose_level'),
 (0.12841732159648817, 'bmi'),
 (0.036239538386845294, 'hypertension'),
 (0.02869446080752477, 'gender'),
 (0.020677248062698106, 'heart_disease')]