In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from collections import Counter

In [2]:
# Data import: load the pre-split train/test feature and target CSVs
# (balanced and scaled upstream, judging by the file names).
x_train_file = Path("Resources/X_train_actual_balanced_scaled_train_dataset.csv")
x_test_file = Path("Resources/X_test_actual_balanced_scaled_train_dataset.csv")
y_train_file = Path("Resources/y_train_actual_balanced_scaled_train_dataset.csv")
y_test_file = Path("Resources/y_test_actual_balanced_scaled_train_dataset.csv")

# Columns excluded from this model. Declared once so the train and test
# splits can never drift apart in which features they keep.
excluded_columns = ['work_type', 'ever_married', 'Residence_type']

# Build each feature frame in a single expression instead of mutating it with
# inplace=True, so every name is bound exactly once to its final value.
X_train = pd.read_csv(x_train_file).drop(columns=excluded_columns)
X_test = pd.read_csv(x_test_file).drop(columns=excluded_columns)

# Keep only the 'stroke' target column as a 1-D Series; the names are never
# bound to the intermediate DataFrame.
y_train = pd.read_csv(y_train_file)['stroke']
y_test = pd.read_csv(y_test_file)['stroke']

# Show the training features that will be fed to the model.
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status
0,1.193046,1.054871,-0.488678,-0.343286,0.261246,-0.893721,-0.588710
1,-0.838191,0.657670,-0.488678,-0.343286,-0.261658,0.768508,0.413607
2,-0.838191,-0.578070,-0.488678,-0.343286,-0.471103,-0.728995,1.415924
3,-0.838191,0.083933,-0.488678,-0.343286,0.776694,-0.339645,-1.591027
4,1.193046,-1.857942,-0.488678,-0.343286,-0.749239,-1.357947,0.413607
...,...,...,...,...,...,...,...
327,1.193046,-0.004334,-0.488678,-0.343286,-0.838697,-0.159944,0.413607
328,-0.838191,0.701803,-0.488678,-0.343286,1.830134,2.685312,0.413607
329,-0.838191,-0.136734,-0.488678,-0.343286,-0.583635,0.139556,0.413607
330,-0.838191,-0.710470,-0.488678,-0.343286,1.379649,0.483982,0.413607


#### Model Raw Data (Medical Only)

In [3]:
# Create model instance.
# A fixed seed makes the bootstrap sampling and feature sub-sampling of the
# forest repeatable, so the metrics and feature importances computed from this
# model are the same on every Restart & Run All.
RANDOM_STATE = 42
rf_model = RandomForestClassifier(n_estimators=512, random_state=RANDOM_STATE)

In [4]:
# Train the forest on the training split, rebinding rf_model to the
# estimator that fit() hands back.
rf_model = rf_model.fit(X=X_train, y=y_train)

In [5]:
# Predict the class of every row in the held-out test split.
predictions = rf_model.predict(X=X_test)

In [6]:
# Confusion matrix.
# Pin the row/column order to the classes the model was trained on. Without an
# explicit `labels`, confusion_matrix infers the classes from the data, so a
# class missing from both y_test and predictions would shrink the matrix and
# make the labelled DataFrame below fail with a shape mismatch.
class_labels = list(rf_model.classes_)
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Rows are the true classes, columns the predicted classes; the headers are
# derived from the same label list so they always line up with the matrix.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

In [7]:
# Fraction of test rows whose predicted class equals the true class.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [8]:
# Results: the confusion matrix table first, then accuracy and the
# per-class precision/recall/F1 report.
print("Confusion Matrix")
display(cm_df)

summary_lines = [
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, predictions),
]
# One print call, one line per entry: same text as printing each separately.
print(*summary_lines, sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,34,8
Actual 1,7,35


Accuracy Score : 0.8214285714285714
Classification Report
              precision    recall  f1-score   support

           0       0.83      0.81      0.82        42
           1       0.81      0.83      0.82        42

    accuracy                           0.82        84
   macro avg       0.82      0.82      0.82        84
weighted avg       0.82      0.82      0.82        84



In [9]:
# Importance score of each feature according to the fitted forest; there is one
# value per column of X_train, in column order (the next cell pairs them up).
importances = rf_model.feature_importances_
# Bare last expression so the notebook renders the array.
importances

array([0.02349461, 0.41710416, 0.05492538, 0.02842996, 0.23619688,
       0.18101173, 0.05883728])

In [10]:
# Pair every importance score with its feature name, then list the pairs from
# most to least important (tuples sort by score first, name second).
importance_name_pairs = zip(rf_model.feature_importances_, X_train.columns)
sorted(importance_name_pairs, reverse=True)

[(0.41710416407034695, 'age'),
 (0.2361968813807764, 'avg_glucose_level'),
 (0.18101173494305559, 'bmi'),
 (0.05883728028498999, 'smoking_status'),
 (0.05492537719175291, 'hypertension'),
 (0.028429956183012296, 'heart_disease'),
 (0.023494605946065835, 'gender')]