In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from collections import Counter

In [8]:
# Data import: locate the train/test CSV files under Resources/.
x_train_file = Path("Resources/X_train_actual_balanced_scaled_train_dataset.csv")
y_train_file = Path("Resources/y_train_actual_balanced_scaled_train_dataset.csv")
x_test_file = Path("Resources/X_test_actual_balanced_scaled_train_dataset.csv")
y_test_file = Path("Resources/y_test_actual_balanced_scaled_train_dataset.csv")

# Feature tables are kept as full DataFrames.
X_train, X_test = (
    pd.read_csv(csv_file) for csv_file in (x_train_file, x_test_file)
)

# Targets are reduced to the single 'stroke' column, giving one Series each.
y_train, y_test = (
    pd.read_csv(csv_file)['stroke'] for csv_file in (y_train_file, y_test_file)
)

# Preview the training labels.
y_train

0      1
1      0
2      0
3      1
4      0
      ..
327    1
328    1
329    0
330    0
331    1
Name: stroke, Length: 332, dtype: int64

#### Model Raw Data (Medical Only)

In [9]:
# Create model instance: a random forest of 512 trees.
# A fixed random_state seeds the forest's internal randomness (bootstrap
# sampling and per-split feature selection), so the metrics and feature
# importances below are reproducible on a fresh kernel run. Re-run the
# notebook to refresh the recorded outputs after changing the seed.
rf_model = RandomForestClassifier(n_estimators=512, random_state=42)

In [10]:
# Train the forest on the training features and 'stroke' labels; the value
# returned by fit() is bound back to rf_model, as before.
rf_model = rf_model.fit(X=X_train, y=y_train)

In [11]:
# Predict a class label for every row of the held-out test features.
predictions = rf_model.predict(X=X_test)

In [12]:
# Confusion matrix of test labels vs. model predictions.
# `labels` is passed explicitly so the row/column order (and the 2x2 shape)
# is fixed by this list rather than inferred from whichever classes happen
# to appear in y_test/predictions; the display names are derived from the
# same list so they can never drift out of sync with the matrix.
class_labels = [0, 1]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

In [13]:
# Accuracy of the predictions against the true test labels.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [14]:
# Report the evaluation results: confusion matrix table first, then the
# accuracy and the per-class classification report as plain text.
report = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", report, sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,33,9
Actual 1,5,37


Accuracy Score : 0.8333333333333334
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.79      0.82        42
           1       0.80      0.88      0.84        42

    accuracy                           0.83        84
   macro avg       0.84      0.83      0.83        84
weighted avg       0.84      0.83      0.83        84



In [15]:
# Feature importance scores exposed by the fitted forest. The array is
# positional: it is paired with X_train.columns in the following cell to
# attach feature names to each score.
importances = rf_model.feature_importances_
# Bare expression so the notebook displays the raw array.
importances

array([0.02364544, 0.37726349, 0.04999569, 0.02513882, 0.03105617,
       0.0516164 , 0.02240673, 0.21158538, 0.15317452, 0.05411737])

In [17]:
# Pair each importance score with its feature name and rank the pairs from
# most to least important (tuples compare on the score first).
ranked_features = list(zip(rf_model.feature_importances_, X_train.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.3772634852141376, 'age'),
 (0.2115853760756451, 'avg_glucose_level'),
 (0.15317451733345622, 'bmi'),
 (0.0541173679311521, 'smoking_status'),
 (0.05161640336187801, 'work_type'),
 (0.049995690318026434, 'hypertension'),
 (0.031056171482313738, 'ever_married'),
 (0.025138819584997453, 'heart_disease'),
 (0.02364543705958039, 'gender'),
 (0.02240673163881297, 'Residence_type')]