In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from collections import Counter

In [2]:
# Data import: locations of the four CSV splits under Resources/.
# NOTE(review): the file names suggest the data was already SMOTEENN-resampled
# and scaled upstream -- confirm against the notebook that produced them.
x_train_file = Path("Resources/X_train_SMOTEENN_balanced_scaled_train_dataset.csv")
x_test_file = Path("Resources/X_test_SMOTEENN_balanced_scaled_train_dataset.csv")
y_train_file = Path("Resources/y_train_SMOTEENN_balanced_scaled_train_dataset.csv")
y_test_file = Path("Resources/y_test_SMOTEENN_balanced_scaled_train_dataset.csv")

# Feature matrices are read as-is (train first, then test).
X_train, X_test = (pd.read_csv(csv_path) for csv_path in (x_train_file, x_test_file))

# Targets: keep only the 'stroke' column so each target is a Series
# rather than a one-column DataFrame.
y_train, y_test = (
    pd.read_csv(csv_path)["stroke"] for csv_path in (y_train_file, y_test_file)
)

# Bare trailing expression so the notebook renders the training features.
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,-0.829948,-0.479290,-0.317082,-0.233018,0.730297,-0.171871,0.981537,-0.000772,-0.828217,0.580955
1,-0.829948,-1.670513,-0.317082,-0.233018,-1.369306,1.662687,-1.018810,-1.024157,-1.610809,-1.295192
2,1.202320,1.197246,-0.317082,-0.233018,0.730297,-0.171871,0.981537,-0.915742,0.139351,1.519028
3,1.202320,0.050142,-0.317082,-0.233018,0.730297,-0.171871,0.981537,-0.126803,0.665822,0.580955
4,-0.829948,-1.582275,-0.317082,-0.233018,-1.369306,1.662687,0.981537,-0.977403,-1.710412,-1.295192
...,...,...,...,...,...,...,...,...,...,...
6568,1.202320,0.942155,-0.317082,-0.233018,0.730297,-0.171871,0.981537,-0.326108,0.006928,-0.683453
6569,-0.829948,1.489543,3.153756,-0.233018,0.730297,0.114769,-1.018810,-0.684412,-0.013604,0.874093
6570,1.202320,1.561426,-0.317082,-0.233018,0.730297,-0.171871,-1.018810,-0.467821,-0.122891,-0.357119
6571,1.202320,-0.013532,-0.317082,-0.233018,0.730297,-0.171871,-1.018810,-0.485303,-0.370126,-1.295192


#### Random Forest Model — All Features (SMOTEENN-balanced, scaled data)

In [3]:
# Create model instance: a random forest of 512 trees.
# random_state is pinned so the bootstrap samples and per-split feature
# subsets -- and therefore the metrics and feature importances reported
# below -- are reproducible under "Restart Kernel -> Run All".
rf_model = RandomForestClassifier(n_estimators=512, random_state=42)

In [4]:
# Train the forest on the training split. The result of fit() is bound back
# to rf_model, so later cells keep using the same name for the fitted model.
rf_model = rf_model.fit(X=X_train, y=y_train)

In [5]:
# Predict a class label for every row of the test features.
predictions = rf_model.predict(X=X_test)

In [6]:
# Confusion matrix (rows = actual class, columns = predicted class).
# The class order is pinned explicitly: without `labels`, confusion_matrix
# infers the label set from the data, so the matrix shape/order would only
# match the hard-coded "0"/"1" axis names by coincidence of what appears in
# y_test and predictions.
class_labels = [0, 1]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Axis names are derived from the same label list, so they cannot drift
# out of sync with the matrix layout.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

In [7]:
# Overall accuracy: fraction of test rows whose predicted label equals the
# true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [8]:
# Evaluation summary: confusion matrix, accuracy, then per-class metrics.
print("Confusion Matrix")
display(cm_df)  # rich (HTML) rendering of the labelled matrix

print(f"Accuracy Score : {acc_score}")

print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,850,74
Actual 1,25,17


Accuracy Score : 0.8975155279503105
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.92      0.94       924
           1       0.19      0.40      0.26        42

    accuracy                           0.90       966
   macro avg       0.58      0.66      0.60       966
weighted avg       0.94      0.90      0.91       966



In [9]:
# Importance score of each feature as reported by the fitted forest; the
# values are positional, matching the columns of X_train the model was fit on.
importances = rf_model.feature_importances_
# Bare trailing expression so the notebook displays the array.
importances

array([0.02771587, 0.40639505, 0.02609713, 0.01270278, 0.07845719,
       0.09188434, 0.02531149, 0.11078383, 0.08222307, 0.13842925])

In [10]:
# Pair every importance score with its column name, then rank the pairs from
# most to least important (descending tuple order, score first).
feature_ranking = list(zip(rf_model.feature_importances_, X_train.columns))
feature_ranking.sort(reverse=True)
feature_ranking

[(0.40639505309505636, 'age'),
 (0.13842925204798245, 'smoking_status'),
 (0.11078383234354384, 'avg_glucose_level'),
 (0.0918843423359233, 'work_type'),
 (0.08222307341996211, 'bmi'),
 (0.07845718577582757, 'ever_married'),
 (0.027715867952303408, 'gender'),
 (0.0260971297823888, 'hypertension'),
 (0.025311486376403933, 'Residence_type'),
 (0.012702776870608236, 'heart_disease')]