 Explore Support Vector Machines (SVM)

In [None]:
# Step 1: Load Libraries
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import json
import os

# Step 2: Load Cleaned Dataset
# The input is a gzip-compressed CSV; it is read into memory in one call.
cleaned_file = "D:/MIMIC-IV-Data-Pipeline/processed_data/mimic_cleaned_v8.csv.gz"
df = pd.read_csv(cleaned_file, compression="gzip")

# Step 3: Define Target Variable and Split Data
# Pull the label column out as `y`; `X` is every remaining column.
# `df` itself is not modified (drop returns a new frame).
target = "delirium"
y = df.loc[:, target]
X = df.drop(target, axis="columns")

# Identify categorical variables
low_cardinality_cols = ["admission_type", "admission_location", "discharge_location",
                        "insurance", "marital_status", "race", "gender", "age_group"]
high_cardinality_cols = ["primary_diagnosis", "drug"]

# One-Hot Encoding for Low-Cardinality Features
# Done on the full frame so train and test end up with identical dummy columns;
# this uses no label or frequency information.
X = pd.get_dummies(X, columns=low_cardinality_cols, drop_first=True)

# Train-Test Split
# Split BEFORE frequency encoding so the encoding statistics can be learned
# from the training rows only. Row selection depends only on `y`, the sample
# count and `random_state`, not on the feature values.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Work on explicit copies so the column assignments below never write into a
# slice of `X`.
X_train = X_train.copy()
X_test = X_test.copy()

# Frequency Encoding for High-Cardinality Features
# The category -> relative-frequency map is fitted on the training split only
# (fitting it on all rows would leak test-set information into the features).
# Categories missing from the training split, and missing values, map to 0.0
# rather than NaN, which the SVM cannot handle.
for col in high_cardinality_cols:
    freq_map = X_train[col].value_counts(normalize=True)
    X_train[col] = X_train[col].map(freq_map).fillna(0.0)
    X_test[col] = X_test[col].map(freq_map).fillna(0.0)
    # Keep the full feature frame numeric as well, using the same
    # training-derived map.
    X[col] = X[col].map(freq_map).fillna(0.0)

# Standard Scaling
# The scaler's statistics are estimated from the training split only and the
# same fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train SVM Model
# RBF-kernel classifier with balanced class weights; probability=True enables
# predict_proba, which the ROC computation below relies on.
svm_model = SVC(
    kernel="rbf",
    class_weight="balanced",
    probability=True,
    random_state=42,
).fit(X_train_scaled, y_train)

# Step 5: Make Predictions
# Hard labels for the threshold-based metrics, plus the probability column at
# index 1 (presumably the delirium = 1 class -- confirm via svm_model.classes_).
y_pred_svm = svm_model.predict(X_test_scaled)
y_pred_proba_svm = svm_model.predict_proba(X_test_scaled)[:, 1]

# Step 6: Evaluate Model Performance
# The confusion matrix is converted to nested lists and the report is requested
# as a dict so both can be written to JSON later.
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm).tolist()
report_svm = classification_report(y_test, y_pred_svm, output_dict=True)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

# Compute ROC AUC Score
# Both the curve points and the area are derived from the predicted
# probabilities, not from the hard labels.
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_pred_proba_svm)
roc_auc_svm = roc_auc_score(y_test, y_pred_proba_svm)

# Save ROC Curve Plot
# Destination of the rendered figure; the same path is recorded alongside the
# metrics so the plot can be located later.
roc_plot_path_svm = "D:/MIMIC-IV-Data-Pipeline/ROC_SVM.png"

# Draw on an explicit figure/axes pair: the model's ROC curve plus the dashed
# diagonal as the chance-level reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_svm, tpr_svm, label=f"ROC Curve (AUC = {roc_auc_svm:.4f})")
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - SVM")
ax.legend()

# Write the image, then close the figure so it is neither displayed nor kept
# open in memory.
fig.savefig(roc_plot_path_svm)
plt.close(fig)

# Step 7: Save Model Performance Metrics
# One JSON-serialisable record for this model: overall accuracy, the report's
# metrics for the class labelled "1", the confusion matrix, and the ROC results.
performance_metrics_svm = {
    "Model": "SVM",
    "Accuracy": accuracy_svm,
    "Precision (Delirium = 1)": report_svm["1"]["precision"],
    "Recall (Delirium = 1)": report_svm["1"]["recall"],
    "F1-Score (Delirium = 1)": report_svm["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_svm,
    "ROC AUC Score": roc_auc_svm,
    "ROC Curve Path": roc_plot_path_svm
}

performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# Load existing performance data if available
# A missing or empty file simply means no results have been recorded yet.
# Malformed JSON is deliberately NOT swallowed: json.loads raises, so existing
# results are never silently overwritten.
try:
    with open(performance_file, "r") as file:
        existing_text = file.read()
except FileNotFoundError:
    existing_text = ""

model_performance = json.loads(existing_text) if existing_text.strip() else []
if isinstance(model_performance, dict):
    # A single record stored without the enclosing list: wrap it so that the
    # append below works and the record is preserved.
    model_performance = [model_performance]
elif not isinstance(model_performance, list):
    raise ValueError(
        f"Expected a JSON list of model results in {performance_file}, "
        f"found {type(model_performance).__name__}."
    )

# Append new results
model_performance.append(performance_metrics_svm)

# Save to file
# Serialise before opening the file: mode "w" truncates immediately, so any
# serialisation error must surface while the previous contents are still intact.
serialized_performance = json.dumps(model_performance, indent=4)
with open(performance_file, "w") as file:
    file.write(serialized_performance)

print("SVM model performance saved successfully.")

# Step 8: Display Performance Summary
# Print the headline accuracy, then the text-formatted per-class report
# (regenerated here because the earlier report was requested as a dict).
print("SVM Model Performance:")
print("Accuracy:", accuracy_svm)
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred_svm),
    sep="\n",
)
