In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from collections import Counter

In [11]:
# Data Import
x_train_file = Path("Resources/X_train_SMOTEENN_balanced_scaled_train_dataset.csv")
x_test_file = Path("Resources/X_test_SMOTEENN_balanced_scaled_train_dataset.csv")
y_train_file = Path("Resources/y_train_SMOTEENN_balanced_scaled_train_dataset.csv")
y_test_file = Path("Resources/y_test_SMOTEENN_balanced_scaled_train_dataset.csv")

X_train = pd.read_csv(x_train_file)
X_test = pd.read_csv(x_test_file)
y_train = pd.read_csv(y_train_file)
y_test = pd.read_csv(y_test_file)

X_train.drop(columns=['work_type','ever_married','Residence_type'], inplace=True)
X_test.drop(columns=['work_type','ever_married','Residence_type'], inplace=True)

y_train = y_train['0']
y_test = y_test['stroke']

y_train

0        0
1        0
2        0
3        0
4        0
        ..
11667    1
11668    1
11669    1
11670    1
11671    1
Name: 0, Length: 11672, dtype: int64

#### Model Raw Data (Medical Only)

In [3]:
# Create model instance
rf_model = RandomForestClassifier(n_estimators=512) 

In [4]:
# Fit the model
rf_model = rf_model.fit(X_train, y_train)

In [5]:
# Making predictions
predictions = rf_model.predict(X_test)

In [6]:
# confusion matrix.
cm = confusion_matrix(y_test, predictions)

cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [7]:
# accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [8]:
# results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,865,59
Actual 1,1,41


Accuracy Score : 0.937888198757764
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       924
           1       0.41      0.98      0.58        42

    accuracy                           0.94       966
   macro avg       0.70      0.96      0.77       966
weighted avg       0.97      0.94      0.95       966



In [9]:
importances = rf_model.feature_importances_
importances

array([0.02573371, 0.49752656, 0.01064918, 0.00786326, 0.23733118,
       0.16636798, 0.05452813])

In [10]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)

[(0.49752656327291045, 'age'),
 (0.23733117504398857, 'avg_glucose_level'),
 (0.1663679798962429, 'bmi'),
 (0.05452813157412993, 'smoking_status'),
 (0.02573371260485835, 'gender'),
 (0.010649182041730647, 'hypertension'),
 (0.007863255566139244, 'heart_disease')]