In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from collections import Counter

In [11]:
# Data import: the train/test splits are read from pre-built CSV files.
# Feature files.
x_train_file = Path("Resources/X_train_SMOTEENN_balanced_scaled_train_dataset.csv")
x_test_file = Path("Resources/X_test_SMOTEENN_balanced_scaled_train_dataset.csv")
# Target files.
y_train_file = Path("Resources/y_train_SMOTEENN_balanced_scaled_train_dataset.csv")
y_test_file = Path("Resources/y_test_SMOTEENN_balanced_scaled_train_dataset.csv")

# Feature matrices are kept as DataFrames.
X_train, X_test = pd.read_csv(x_train_file), pd.read_csv(x_test_file)

# Only the 'stroke' column of each target file is kept, so y_train / y_test
# end up as Series rather than DataFrames.
y_train = pd.read_csv(y_train_file)['stroke']
y_test = pd.read_csv(y_test_file)['stroke']

# Show the training features as the cell output.
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1.484478,-2.196615,-0.301993,-0.208833,-1.520465,2.044734,-0.787626,-0.437241,-1.722847,-1.398197
1,-0.672839,-1.981487,-0.301993,-0.208833,-1.520465,0.036963,1.269638,-0.154305,-1.785652,-1.398197
2,1.484478,-1.723333,-0.301993,-0.208833,-1.520465,-0.966922,-0.787626,0.749761,-1.550134,-1.398197
3,-0.672839,1.073336,-0.301993,-0.208833,0.657693,-1.970808,1.269638,-0.760903,0.946358,-1.398197
4,-0.672839,-0.733742,-0.301993,-0.208833,0.657693,0.036963,-0.787626,0.779903,1.637211,0.741489
...,...,...,...,...,...,...,...,...,...,...
7844,-0.672839,-0.080750,-0.301993,-0.208833,0.657693,-1.970808,-0.787626,-0.280547,-0.262635,-0.328354
7845,-0.672839,-0.335967,-0.301993,-0.208833,0.657693,0.036963,-0.787626,-0.997505,0.089083,-0.328354
7846,-0.672839,1.106870,-0.301993,-0.208833,0.657693,0.036963,-0.787626,-0.236449,0.524386,0.741489
7847,-0.672839,1.202413,-0.301993,4.788515,0.657693,-1.970808,-0.787626,1.710479,-0.063265,-1.398197


#### Random Forest Trained on SMOTEENN-Balanced, Scaled Data (All Features)

In [12]:
# Create model instance
# A random forest is stochastic (bootstrap sampling of rows and random feature
# subsets at each split), so the seed is fixed here: without it, every
# Restart & Run All yields a different forest, different metrics and different
# feature importances.
RANDOM_STATE = 42
rf_model = RandomForestClassifier(n_estimators=512, random_state=RANDOM_STATE)

In [13]:
# Train the forest on the training split. The result of fit() is bound back
# to rf_model, exactly as before.
rf_model = rf_model.fit(X=X_train, y=y_train)

In [14]:
# Predict a class label for every row of the held-out test features.
predictions = rf_model.predict(X=X_test)

In [15]:
# Confusion matrix of true test labels vs. model predictions.
# The class labels are passed explicitly so the matrix is always 2x2 and
# ordered [0, 1], matching the hard-coded row/column names below. Without
# `labels`, the classes are inferred from the data and the matrix shrinks if a
# class is absent from both y_test and predictions, which would make the
# DataFrame construction fail.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])

# Rows are the actual classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [16]:
# Overall accuracy of the predictions against the true test labels.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [17]:
# Results summary: confusion matrix, accuracy, then the per-class report.
print("Confusion Matrix")
display(cm_df)  # rich (HTML) rendering of the labelled matrix

print(f"Accuracy Score : {acc_score}")

print("Classification Report")
# Per-class precision / recall / f1 / support, built as text and then printed.
report_text = classification_report(y_test, predictions)
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,851,73
Actual 1,1,41


Accuracy Score : 0.9233954451345756
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.92      0.96       924
           1       0.36      0.98      0.53        42

    accuracy                           0.92       966
   macro avg       0.68      0.95      0.74       966
weighted avg       0.97      0.92      0.94       966



In [18]:
# Feature-importance scores taken from the fitted forest; the printed array has
# ten entries, one per column of X_train.
importances = rf_model.feature_importances_
# Echo the raw array as the cell output.
importances

array([0.01737856, 0.52466762, 0.00792369, 0.00477202, 0.01944346,
       0.06638197, 0.02467315, 0.17313986, 0.12333567, 0.03828398])

In [19]:
# Pair each importance score with its feature name, then order the pairs from
# most to least important (tuples compare on the score first).
ranked_importances = list(zip(rf_model.feature_importances_, X_train.columns))
ranked_importances.sort(reverse=True)
ranked_importances

[(0.5246676216853897, 'age'),
 (0.173139864637067, 'avg_glucose_level'),
 (0.1233356690656475, 'bmi'),
 (0.0663819741165727, 'work_type'),
 (0.03828398174310104, 'smoking_status'),
 (0.024673154313702216, 'Residence_type'),
 (0.01944346087710613, 'ever_married'),
 (0.017378560926044995, 'gender'),
 (0.00792369250602575, 'hypertension'),
 (0.004772020129342923, 'heart_disease')]