In [3]:
# 04_supervised_learning_fixed.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, accuracy_score

# -----------------------------
# 1. Load cleaned dataset
# -----------------------------
DATA_PATH = "data/processed_cleaned.csv"
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
X = df.drop(columns=['target']).values
y = df['target'].values

# -----------------------------
# 2. Train-test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 3. Define models
# -----------------------------
models = {
    "LogisticRegression": Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000, multi_class='auto'))
    ]),
    "DecisionTree": Pipeline([
        ('scaler', StandardScaler()),
        ('clf', DecisionTreeClassifier(random_state=42))
    ]),
    "RandomForest": Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(n_estimators=200, random_state=42))
    ]),
    "SVM": Pipeline([
        ('scaler', StandardScaler()),
        ('clf', SVC(probability=True, kernel='rbf', random_state=42))
    ])
}

# -----------------------------
# 4. Train, evaluate, plot ROC
# -----------------------------
results = {}
binary_class = len(np.unique(y)) == 2
plt.figure(figsize=(8,6)) if binary_class else None

for name, pipeline in models.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # AUC and ROC handling
    if hasattr(pipeline.named_steps['clf'], "predict_proba"):
        y_proba = pipeline.predict_proba(X_test)
        if binary_class:
            auc = roc_auc_score(y_test, y_proba[:,1])
            y_plot = y_proba[:,1]
        else:
            auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            y_plot = None  # skip ROC curve for multi-class
    else:  # decision_function (SVM)
        y_scores = pipeline.decision_function(X_test)
        if binary_class:
            auc = roc_auc_score(y_test, y_scores)
            y_plot = y_scores
        else:
            auc = roc_auc_score(y_test, y_scores, multi_class='ovr')
            y_plot = None

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    results[name] = {'accuracy': acc, 'report': report, 'auc': auc}

    print("----", name, "----")
    print("Accuracy:", acc)
    print("AUC:", auc)
    print(report)

    # ROC curve (only for binary classification)
    if binary_class and y_plot is not None:
        fpr, tpr, _ = roc_curve(y_test, y_plot)
        plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.2f})")

if binary_class:
    plt.plot([0,1],[0,1],'k--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curves")
    plt.legend()
    plt.grid()
    plt.show()

# -----------------------------
# 5. Save model metrics
# -----------------------------
os.makedirs("results", exist_ok=True)
with open("results/model_metrics.json", "w") as f:
    json.dump(results, f, indent=2)
print("Saved metrics to results/model_metrics.json")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


---- LogisticRegression ----
Accuracy: 0.5901639344262295
AUC: 0.8056484052346121
              precision    recall  f1-score   support

           0       0.78      0.88      0.83        33
           1       0.33      0.27      0.30        11
           2       0.17      0.14      0.15         7
           3       0.43      0.43      0.43         7
           4       0.00      0.00      0.00         3

    accuracy                           0.59        61
   macro avg       0.34      0.34      0.34        61
weighted avg       0.55      0.59      0.57        61

---- DecisionTree ----
Accuracy: 0.45901639344262296
AUC: 0.5242207792207793
              precision    recall  f1-score   support

           0       0.74      0.76      0.75        33
           1       0.17      0.27      0.21        11
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         3

    accuracy              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
