In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from collections import Counter

In [2]:
# Data Import
# Locations of the pre-split feature (X) and label (y) CSV files.
x_train_file = Path("Resources/X_train_actual_balanced_unscaled_train_dataset.csv")
x_test_file = Path("Resources/X_test_actual_balanced_unscaled_train_dataset.csv")
y_train_file = Path("Resources/y_train_actual_balanced_unscaled_train_dataset.csv")
y_test_file = Path("Resources/y_test_actual_balanced_unscaled_train_dataset.csv")

# Columns removed from both feature sets before modelling.
dropped_columns = ['work_type', 'ever_married', 'Residence_type']

# Read each feature file and discard the unwanted columns in one chained step.
X_train = pd.read_csv(x_train_file).drop(columns=dropped_columns)
X_test = pd.read_csv(x_test_file).drop(columns=dropped_columns)

# Read each label file and keep only its 'stroke' column.
y_train = pd.read_csv(y_train_file)['stroke']
y_test = pd.read_csv(y_test_file)['stroke']

# Show the training features as the cell output.
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status
0,1,78.0,0,0,133.19,23.6,1
1,0,69.0,0,0,103.73,34.7,2
2,0,41.0,0,0,91.93,24.7,3
3,0,56.0,0,0,162.23,27.3,0
4,1,12.0,0,0,76.26,20.5,2
...,...,...,...,...,...,...,...
327,1,54.0,0,0,71.22,28.5,2
328,0,70.0,0,0,221.58,47.5,2
329,0,51.0,0,0,85.59,30.5,2
330,0,38.0,0,0,196.20,32.8,2


#### Model Raw Data (Medical Only)

In [3]:
# Create model instance
# A forest of 512 trees. random_state is fixed so that a fresh
# "Restart & Run All" reproduces the same fitted model, and therefore the
# same metrics and feature importances, instead of varying from run to run.
rf_model = RandomForestClassifier(n_estimators=512, random_state=42)

In [4]:
# Train the forest on the training features and labels.
# fit() hands back the estimator, which is bound to rf_model again.
rf_model = rf_model.fit(X=X_train, y=y_train)

In [5]:
# Predict a class label for every row of the held-out test features.
predictions = rf_model.predict(X=X_test)

In [6]:
# Confusion matrix.
# Pin the class order explicitly: without `labels`, confusion_matrix infers
# the classes from the data, so if only one class occurs in both y_test and
# predictions the result is 1x1 and no longer matches the 2x2 row/column
# names below. With both classes present the result is unchanged.
class_labels = [0, 1]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Wrap the counts in a labelled frame: rows are actual classes,
# columns are predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

In [7]:
# Accuracy of the predictions against the test labels.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [8]:
# Show the evaluation: confusion-matrix table first, then the accuracy
# and the per-class classification report.
print("Confusion Matrix")
display(cm_df)

report_text = classification_report(y_test, predictions)
# One print with a newline separator emits the same three lines of output.
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    report_text,
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,35,7
Actual 1,7,35


Accuracy Score : 0.8333333333333334
Classification Report
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        42
           1       0.83      0.83      0.83        42

    accuracy                           0.83        84
   macro avg       0.83      0.83      0.83        84
weighted avg       0.83      0.83      0.83        84



In [9]:
# Feature-importance scores of the fitted forest, as an array with one
# entry per feature column the model was trained on (X_train column order).
importances = rf_model.feature_importances_
# Bare last expression so the notebook displays the array.
importances

array([0.02357239, 0.41608423, 0.05918891, 0.03040652, 0.23364764,
       0.17898438, 0.05811593])

In [10]:
# Pair every importance score with its feature name, then order the pairs
# from most to least important (tuples compare on the score first).
ranked_features = list(zip(rf_model.feature_importances_, X_train.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.4160842271816528, 'age'),
 (0.23364763765761026, 'avg_glucose_level'),
 (0.1789843835503861, 'bmi'),
 (0.05918891105638497, 'hypertension'),
 (0.05811592662872331, 'smoking_status'),
 (0.0304065190228745, 'heart_disease'),
 (0.02357239490236805, 'gender')]