## Single Random Forest for Multi-Class Classification

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# 1. Load the dataset
df = pd.read_csv("data/symptoms.csv")

In [3]:
df.duplicated()

0      False
1       True
2       True
3       True
4       True
       ...  
352     True
353     True
354     True
355     True
356     True
Length: 357, dtype: bool

In [4]:
df.isna()

Unnamed: 0,Confusion/Hallucination,Blurred vision,Shortness of breath,Chest pain,Excessive hunger,Fatigue,Headache,Increased appetite,Lethargy,Obesity,Frequent urination,Insomnia/Restlessness,Sweating,Vomiting,Weakness on one side of body,Weight loss,prognosis
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
353,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
354,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
355,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [5]:
# df = df.drop_duplicates()
df = df.dropna()
df

Unnamed: 0,Confusion/Hallucination,Blurred vision,Shortness of breath,Chest pain,Excessive hunger,Fatigue,Headache,Increased appetite,Lethargy,Obesity,Frequent urination,Insomnia/Restlessness,Sweating,Vomiting,Weakness on one side of body,Weight loss,prognosis
0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,Stroke
1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,Stroke
2,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,Stroke
3,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,Stroke
4,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,Stroke
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,Stroke
353,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,Stroke
354,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,Stroke
355,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,Stroke


In [6]:
# 2. Separate features (X) from target (y)
#    - We assume the target column is named 'Prognosis'
X = df.drop("prognosis", axis=1)
y = df["prognosis"]

In [7]:
# 3. Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [8]:
# 4. Train a single Random Forest classifier
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)
rf_model.fit(X_train, y_train)

In [14]:
# 5. Evaluate
y_pred = rf_model.predict(X_test)
print("=== Single Random Forest (Multi-Class) ===")
print(f'Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%')
print("Classification Report:\n", classification_report(y_test, y_pred))

=== Single Random Forest (Multi-Class) ===
Accuracy: 100.00%
Classification Report:
                precision    recall  f1-score   support

    Diabetes        1.00      1.00      1.00        24
Heart disease       1.00      1.00      1.00        25
       Stroke       1.00      1.00      1.00        23

     accuracy                           1.00        72
    macro avg       1.00      1.00      1.00        72
 weighted avg       1.00      1.00      1.00        72



In [10]:
# 6. Get overall feature importance
feature_importances = rf_model.feature_importances_
# Print each feature with its importance
print("\n=== Feature Importances (Single Random Forest) ===")
for feat, imp in zip(X.columns, feature_importances):
    print(f"{feat}: {imp:.4f}")


=== Feature Importances (Single Random Forest) ===
Confusion/Hallucination: 0.1032
Blurred vision: 0.0162
Shortness of breath: 0.0840
Chest pain: 0.1173
Excessive hunger: 0.0387
Fatigue: 0.0290
Headache: 0.0617
Increased appetite: 0.1367
Lethargy: 0.0115
Obesity: 0.0435
Frequent urination: 0.1355
Insomnia/Restlessness: 0.0350
Sweating: 0.0839
Vomiting: 0.0053
Weakness on one side of body: 0.0676
Weight loss: 0.0309


In [11]:
import os
import joblib

# Save model and Feature Importance
# define the path to the storage directory
models_dir = os.path.join(r'C:\\Users\\echo\\Documents\\fyp\\backend', 'models')
os.makedirs(models_dir, exist_ok=True)

# Save the Random Forest model
symptoms_model_path = os.path.join(models_dir, 'symptoms_model.joblib')
joblib.dump(rf_model, symptoms_model_path)
print(f"Symptom model saved: {os.path.exists(symptoms_model_path)}")

Symptom model saved: True


##### One-vs-Rest Random Forests

In [12]:
# 4. Identify the distinct prognoses
# classes = y.unique()
classes = sorted(y.unique())  # or y_train.unique()
print("Classes to process:", classes)

Classes to process: ['Diabetes ', 'Heart disease', 'Stroke']


In [13]:
# 5. For each prognosis, train a one-vs-rest Random Forest
for target_class in classes:
    print(f"\n=== One-vs-Rest Model for {target_class} ===")
    
    # Convert labels to binary: 1 if target_class, 0 otherwise
    y_train_bin = (y_train == target_class).astype(int)
    y_test_bin = (y_test == target_class).astype(int)
    
    # Create and train the model
    ovr_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    ovr_rf.fit(X_train, y_train_bin)
    
    # Evaluate
    y_pred_bin = ovr_rf.predict(X_test)
    accuracy = accuracy_score(y_test_bin, y_pred_bin)
    print("Accuracy (OvR):", accuracy)
    print("Classification Report (OvR):")
    print(classification_report(y_test_bin, y_pred_bin, zero_division=0))
    
    # Get feature importance for this OvR model
    feature_importances = ovr_rf.feature_importances_
    print("--- Feature Importances ---")
    for feat, imp in zip(X.columns, feature_importances):
        print(f"{feat}: {imp:.4f}")


=== One-vs-Rest Model for Diabetes  ===
Accuracy (OvR): 1.0
Classification Report (OvR):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        48
           1       1.00      1.00      1.00        24

    accuracy                           1.00        72
   macro avg       1.00      1.00      1.00        72
weighted avg       1.00      1.00      1.00        72

--- Feature Importances ---
Confusion/Hallucination: 0.0000
Blurred vision: 0.0357
Shortness of breath: 0.0000
Chest pain: 0.0000
Excessive hunger: 0.0854
Fatigue: 0.0904
Headache: 0.0000
Increased appetite: 0.2550
Lethargy: 0.0403
Obesity: 0.0879
Frequent urination: 0.2366
Insomnia/Restlessness: 0.0881
Sweating: 0.0000
Vomiting: 0.0221
Weakness on one side of body: 0.0000
Weight loss: 0.0586

=== One-vs-Rest Model for Heart disease ===
Accuracy (OvR): 1.0
Classification Report (OvR):
              precision    recall  f1-score   support

           0       1.00      1.00     