In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from collections import Counter

In [3]:
# Data Import
# Paths to the train/test feature (X) and target (y) CSV files.
x_train_file = Path("Resources/X_train_actual_balanced_unscaled_train_dataset.csv")
x_test_file = Path("Resources/X_test_actual_balanced_unscaled_train_dataset.csv")
y_train_file = Path("Resources/y_train_actual_balanced_unscaled_train_dataset.csv")
y_test_file = Path("Resources/y_test_actual_balanced_unscaled_train_dataset.csv")

# Feature tables are loaded exactly as stored.
X_train = pd.read_csv(x_train_file)
X_test = pd.read_csv(x_test_file)

# Targets: read each file and keep only its 'stroke' column, so y_train / y_test
# are Series from the start instead of being a DataFrame first and a Series later.
y_train = pd.read_csv(y_train_file)['stroke']
y_test = pd.read_csv(y_test_file)['stroke']

# Bare last expression: the notebook renders the training features as the cell output.
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,78.0,0,0,1,3,1,133.19,23.6,1
1,0,69.0,0,0,1,2,0,103.73,34.7,2
2,0,41.0,0,0,1,0,1,91.93,24.7,3
3,0,56.0,0,0,1,0,0,162.23,27.3,0
4,1,12.0,0,0,0,4,1,76.26,20.5,2
...,...,...,...,...,...,...,...,...,...,...
327,1,54.0,0,0,1,2,1,71.22,28.5,2
328,0,70.0,0,0,1,2,1,221.58,47.5,2
329,0,51.0,0,0,1,0,0,85.59,30.5,2
330,0,38.0,0,0,1,2,0,196.20,32.8,2


#### Model Raw Data (Medical Only)

In [4]:
# Create model instance
# A random forest draws random bootstrap samples and random feature subsets.
# Pinning the seed makes the confusion matrix, accuracy and feature importances
# computed from this model identical on every Restart & Run All.
RANDOM_STATE = 42
rf_model = RandomForestClassifier(n_estimators=512, random_state=RANDOM_STATE)

In [5]:
# Train the forest on the training features and labels.
# fit() trains the estimator in place and returns that same object, so rf_model
# does not need to be rebound; the trailing semicolon stops the notebook from
# echoing the estimator repr as the cell output.
rf_model.fit(X_train, y_train);

In [6]:
# Predict one class label per row of the test features with the fitted forest.
predictions = rf_model.predict(X=X_test)

In [7]:
# confusion matrix.
# Take the class labels (and their order) from the fitted model and pass them
# explicitly. Without `labels`, the matrix only contains classes that actually
# occur in y_test/predictions, so a missing class would shrink it and the
# hard-coded 2x2 headers would no longer fit. With `labels`, the matrix always
# has one row and one column per trained class, in the same order as the headers.
class_labels = list(rf_model.classes_)
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Rows are the true classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

In [8]:
# Share of test samples whose predicted label equals the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [9]:
# results
# 1) Confusion matrix, rendered as a rich table via display().
print("Confusion Matrix")
display(cm_df)

# 2) Overall accuracy on the test set.
print(f"Accuracy Score : {acc_score}")

# 3) Per-class precision / recall / f1-score / support.
print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,32,10
Actual 1,6,36


Accuracy Score : 0.8095238095238095
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.76      0.80        42
           1       0.78      0.86      0.82        42

    accuracy                           0.81        84
   macro avg       0.81      0.81      0.81        84
weighted avg       0.81      0.81      0.81        84



In [10]:
# Importance score of each feature according to the fitted forest; the values
# line up positionally with X_train's columns.
importances = rf_model.feature_importances_
# Bare last expression so the notebook displays the array.
importances

array([0.0230881 , 0.38383474, 0.05183667, 0.02501743, 0.03035735,
       0.048949  , 0.02325657, 0.20606934, 0.1539157 , 0.0536751 ])

In [11]:
# Pair every importance score with its column name, then sort the
# (importance, column name) tuples in descending order so the most
# influential feature is listed first.
sorted(
    zip(rf_model.feature_importances_, X_train.columns),
    reverse=True,
)

[(0.38383473783126404, 'age'),
 (0.20606934258216755, 'avg_glucose_level'),
 (0.1539157028543954, 'bmi'),
 (0.05367509736395524, 'smoking_status'),
 (0.051836670286554455, 'hypertension'),
 (0.04894899963820411, 'work_type'),
 (0.03035734758065237, 'ever_married'),
 (0.025017428477786624, 'heart_disease'),
 (0.02325657412045375, 'Residence_type'),
 (0.02308809926456656, 'gender')]