In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
import os

# Load the dataset
def load_data(path='Cardiovascular_Disease_Dataset.csv', min_cholesterol=100, min_resting_bp=None):
    """Read the cardiovascular CSV and drop rows that fail the value filters.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the file name this notebook has always used.
    min_cholesterol : float, optional
        Rows whose ``serumcholestrol`` is below this value are removed
        (default 100, the original hard-coded threshold).
    min_resting_bp : float or None, optional
        When given, rows whose ``restingBP`` is below this value are removed
        too. ``None`` (default) leaves ``restingBP`` unfiltered, matching the
        previous behaviour where this filter was commented out.

    Returns
    -------
    pandas.DataFrame
        The surviving rows in their original order, re-indexed 0..n-1 so that
        later positional and label-based operations agree with each other.
    """
    data = pd.read_csv(path)

    # Comparisons against NaN are False, so rows with a missing value in a
    # filtered column are dropped as well (same as the original filter).
    keep = data['serumcholestrol'] >= min_cholesterol
    if min_resting_bp is not None:
        keep &= data['restingBP'] >= min_resting_bp

    return data.loc[keep].reset_index(drop=True)

# Preprocess the data
def preprocess_data(df, exclude_slope=True):
    """Build the scaled feature matrix and the label vector from the raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``patientid`` and ``target`` columns; every other
        column is treated as a feature.
    exclude_slope : bool, optional
        When True (default) the ``slope`` column is removed from the features
        if it is present.

    Returns
    -------
    tuple
        ``(X_scaled, y, df, imputer, scaler)`` where ``X_scaled`` is the
        median-imputed, standardised feature frame, ``y`` is ``df['target']``,
        ``df`` is the input frame unchanged, and ``imputer`` / ``scaler`` are
        the fitted transformers.

    Notes
    -----
    The imputer and scaler are fitted on every row of ``df``. When the result
    is split into train/test/holdout afterwards (as ``train_svm`` does), the
    evaluation rows have already influenced the medians, means and variances.
    """
    X = df.drop(['patientid', 'target'], axis=1)
    if exclude_slope:
        X = X.drop(['slope'], axis=1, errors='ignore')
    y = df['target']

    # fit_transform returns a bare ndarray; re-attach both the column names and
    # the original row index so X stays label-aligned with y and df.
    imputer = SimpleImputer(strategy='median')
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X.columns, index=X.index)

    return X_scaled, y, df, imputer, scaler

# Train and evaluate the SVM model
def train_svm(X, y, df, feature_ablation=False):
    """Fit an RBF-kernel SVC and evaluate it on train, test and holdout splits.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (already imputed/scaled by the caller).
    y : pandas.Series
        Binary labels, positionally aligned with ``X``.
    df : pandas.DataFrame
        Passed through unchanged in the return value.
    feature_ablation : bool, optional
        When True, the ``noofmajorvessels`` and ``oldpeak`` columns are removed
        (if present) before any splitting, fitting or cross-validation.

    Returns
    -------
    tuple
        ``(svm_model, metrics, X_train, X_test, y_train, y_test, X_holdout,
        y_holdout, df, fpr, tpr, perm_importance)``. ``metrics`` is a dict of
        scalar scores plus the three confusion matrices; ``fpr`` / ``tpr`` are
        the test-set ROC curve; ``perm_importance`` is computed on the test set.
    """
    if feature_ablation:
        # Ablate once, up front, so the splits AND the cross-validation below
        # all see the same reduced feature set. (Previously only the splits
        # were ablated and the CV score was computed on the full feature set.)
        # DataFrame.drop returns a new frame, so the caller's X is untouched.
        X = X.drop(['noofmajorvessels', 'oldpeak'], axis=1, errors='ignore')

    # 10% holdout first; then 0.222 of the remaining 90% (~20% of all rows)
    # becomes the test set, leaving ~70% for training.
    X_temp, X_holdout, y_temp, y_holdout = train_test_split(
        X, y, test_size=0.1, random_state=42, stratify=y)
    X_train, X_test, y_train, y_test = train_test_split(
        X_temp, y_temp, test_size=0.222, random_state=42, stratify=y_temp)

    svm_model = SVC(probability=True, random_state=42, class_weight='balanced',
                    C=0.1, gamma='scale', kernel='rbf')
    svm_model.fit(X_train, y_train)

    # Hard predictions and positive-class probabilities for each split.
    y_train_pred = svm_model.predict(X_train)
    y_train_pred_proba = svm_model.predict_proba(X_train)[:, 1]
    y_pred = svm_model.predict(X_test)
    y_pred_proba = svm_model.predict_proba(X_test)[:, 1]
    y_holdout_pred = svm_model.predict(X_holdout)
    y_holdout_pred_proba = svm_model.predict_proba(X_holdout)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

    perm_importance = permutation_importance(svm_model, X_test, y_test, n_repeats=10, random_state=42)

    # Cross-validation runs over the whole (possibly ablated) data set passed
    # in, i.e. it also covers the rows assigned to the test and holdout splits.
    cv_scores = cross_val_score(svm_model, X, y, cv=5, scoring='accuracy')

    metrics = {
        'Training Accuracy': accuracy_score(y_train, y_train_pred),
        'Training ROC AUC': roc_auc_score(y_train, y_train_pred_proba),
        'Test Accuracy': accuracy_score(y_test, y_pred),
        'Test Precision': precision_score(y_test, y_pred),
        'Test Recall': recall_score(y_test, y_pred),
        'Test F1 Score': f1_score(y_test, y_pred),
        'Test ROC AUC': roc_auc_score(y_test, y_pred_proba),
        'Holdout Accuracy': accuracy_score(y_holdout, y_holdout_pred),
        'Holdout ROC AUC': roc_auc_score(y_holdout, y_holdout_pred_proba),
        '5-Fold CV Accuracy': cv_scores.mean(),
        '5-Fold CV Accuracy Std': cv_scores.std(),
        'Confusion Matrix Train': confusion_matrix(y_train, y_train_pred),
        'Confusion Matrix Test': confusion_matrix(y_test, y_pred),
        'Confusion Matrix Holdout': confusion_matrix(y_holdout, y_holdout_pred),
    }

    return svm_model, metrics, X_train, X_test, y_train, y_test, X_holdout, y_holdout, df, fpr, tpr, perm_importance

# Plot metrics, ROC curve, correlation heatmap, and confusion matrices
def plot_metrics_and_roc(metrics, fpr, tpr, df, model_name='Slope Excluded', output_dir='static'):
    """Save the evaluation figures for one model as PNG files.

    Writes a metrics bar chart, the test-set ROC curve, one confusion-matrix
    heatmap per split, and a feature correlation heatmap.

    Parameters
    ----------
    metrics : dict
        Must provide the accuracy / ROC AUC / CV keys and the three
        ``'Confusion Matrix ...'`` entries read below.
    fpr, tpr : array-like or None
        Test-set ROC curve points. If either is None, the points ``[0, 1]``
        are plotted for that axis instead.
    df : pandas.DataFrame
        Frame used for the correlation heatmap; ``patientid``, ``target`` and
        ``slope`` are dropped first when present.
    model_name : str, optional
        Used in figure titles and, lower-cased with spaces replaced by
        underscores, in the output file names.
    output_dir : str, optional
        Directory the PNG files are written to (created if missing).
        Defaults to ``'static'``.
    """
    os.makedirs(output_dir, exist_ok=True)
    slug = model_name.lower().replace(" ", "_")

    # --- Accuracy / ROC AUC bar chart -------------------------------------
    labels = ['Training', 'Test', 'Holdout', '5-Fold CV']
    accuracy = [metrics['Training Accuracy'], metrics['Test Accuracy'],
                metrics['Holdout Accuracy'], metrics['5-Fold CV Accuracy']]
    roc_auc = [metrics['Training ROC AUC'], metrics['Test ROC AUC'], metrics['Holdout ROC AUC']]

    x = np.arange(len(labels))
    width = 0.35
    # The CV group has an accuracy bar only; it sits in the left-hand slot.
    cv_bar_x = x[3] - width/2

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(x[:3] - width/2, accuracy[:3], width, label='Accuracy', color='skyblue')
    ax.bar(x[:3] + width/2, roc_auc, width, label='ROC AUC', color='lightcoral')
    ax.bar(cv_bar_x, accuracy[3], width, color='skyblue')

    ax.set_xlabel('Dataset')
    ax.set_ylabel('Score')
    ax.set_title(f'SVM Model Performance Metrics ({model_name})')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()
    ax.set_ylim(0, 1)

    # Centre the +/- one-std whisker on the CV bar itself (it was previously
    # drawn at x[3], i.e. on the bar's right edge).
    ax.errorbar(cv_bar_x, accuracy[3], yerr=metrics['5-Fold CV Accuracy Std'],
                fmt='none', c='black', capsize=5)

    fig.tight_layout()
    fig.savefig(f'{output_dir}/svm_metrics_plot_{slug}.png')
    plt.close(fig)

    # --- Test-set ROC curve (falls back to [0, 1] when no points are given) ---
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr if fpr is not None else [0, 1], tpr if tpr is not None else [0, 1],
            label=f'ROC Curve (AUC = {metrics["Test ROC AUC"]:.4f})', color='skyblue')
    ax.plot([0, 1], [0, 1], label='Random Guess', linestyle='--', color='lightcoral')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'SVM ROC Curve (Test Set, {model_name})')
    ax.legend()
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)

    fig.tight_layout()
    fig.savefig(f'{output_dir}/svm_roc_curve_{slug}.png')
    plt.close(fig)

    # --- Confusion-matrix heatmaps, one file per split ---------------------
    for dataset in ('Train', 'Test', 'Holdout'):
        cm = metrics[f'Confusion Matrix {dataset}']
        fig, ax = plt.subplots(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.set_title(f'Confusion Matrix ({model_name} - {dataset})')
        fig.tight_layout()
        fig.savefig(f'{output_dir}/confusion_matrix_{slug}_{dataset.lower()}.png')
        plt.close(fig)

    # --- Feature correlation heatmap ---------------------------------------
    # The file name does not include the model slug, so each call overwrites
    # the previous correlation heatmap in output_dir.
    fig, ax = plt.subplots(figsize=(10, 8))
    corr_matrix = df.drop(['patientid', 'target', 'slope'], axis=1, errors='ignore').corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1, ax=ax)
    ax.set_title('Correlation Heatmap of Features (Slope Excluded)')
    fig.tight_layout()
    fig.savefig(f'{output_dir}/correlation_heatmap.png')
    plt.close(fig)

# Main function
def main():
    """Run the whole experiment: load, preprocess, train two SVMs, report, plot, save.

    Trains one model on all features except ``slope`` and a second one that
    additionally excludes ``noofmajorvessels`` and ``oldpeak``, prints their
    metrics and several data diagnostics, writes the figures through
    ``plot_metrics_and_roc`` and pickles the first model together with the
    fitted imputer and scaler into the current working directory.
    """
    import pickle

    df = load_data()
    X, y, df, imputer, scaler = preprocess_data(df, exclude_slope=True)

    print("Training SVM with Slope Excluded:")
    (model, metrics, _, X_test, _, y_test,
     X_holdout, y_holdout, df, fpr, tpr, perm_importance) = train_svm(X, y, df)
    _print_model_report(metrics, 'Slope Excluded')

    print("\nPermutation Importance:")
    importance_table = pd.DataFrame({
        'Feature': X_test.columns,
        'Importance': perm_importance.importances_mean,
    })
    print(importance_table.sort_values(by='Importance', ascending=False))

    print("\nClass Distribution:")
    print(df['target'].value_counts(normalize=True))

    print("\nFeature Correlations with Target (Slope Excluded):")
    correlations = df.drop(['patientid', 'slope'], axis=1, errors='ignore').corr()['target'].sort_values(ascending=False)
    print(correlations)

    print("\nZero Values in Features:")
    zero_values = (df.drop(['patientid', 'target', 'slope'], axis=1, errors='ignore') == 0).sum()
    print(zero_values)

    print("\nTest Set Size:", X_test.shape[0])
    print("Test Set Class Distribution:")
    print(pd.Series(y_test).value_counts(normalize=True))

    print("\nHoldout Set Size:", X_holdout.shape[0])
    print("Holdout Set Class Distribution:")
    print(pd.Series(y_holdout).value_counts(normalize=True))

    print("\nTest Set ROC Curve Data (Slope Excluded):")
    print("False Positive Rate:", fpr.tolist())
    print("True Positive Rate:", tpr.tolist())

    print("\nTraining SVM with Ablated Features (Excluding noofmajorvessels, oldpeak):")
    ablated_result = train_svm(X, y, df, feature_ablation=True)
    # Positions in train_svm's return tuple: 1 = metrics, 9 = fpr, 10 = tpr.
    metrics_ablated, fpr_ablated, tpr_ablated = ablated_result[1], ablated_result[9], ablated_result[10]
    _print_model_report(metrics_ablated, 'Ablated Features')

    # Plot metrics and ROC curves
    plot_metrics_and_roc(metrics, fpr, tpr, df, model_name='Slope Excluded')
    plot_metrics_and_roc(metrics_ablated, fpr_ablated, tpr_ablated, df, model_name='Ablated Features')

    # Persist the slope-excluded model along with the transformers fitted for it.
    artifacts = {
        'svm_cardiovascular_model.pkl': model,
        'imputer.pkl': imputer,
        'scaler.pkl': scaler,
    }
    for filename, artifact in artifacts.items():
        with open(filename, 'wb') as f:
            pickle.dump(artifact, f)


def _print_model_report(metrics, label):
    """Print one model's scalar metrics followed by its three confusion matrices."""
    print(f"Model Evaluation Metrics ({label}):")
    for metric, value in metrics.items():
        if 'Confusion Matrix' not in metric:
            print(f"{metric}: {value}")

    for split in ('Train', 'Test', 'Holdout'):
        print(f"\nConfusion Matrix ({label} - {split}):")
        print(metrics[f'Confusion Matrix {split}'])

# Entry-point guard: run the full pipeline only when this code is executed as
# the top-level module, not when it is imported.
if __name__ == '__main__':
    main()

Training SVM with Slope Excluded:
Model Evaluation Metrics (Slope Excluded):
Training Accuracy: 0.9164133738601824
Training ROC AUC: 0.972630260895567
Test Accuracy: 0.8936170212765957
Test Precision: 0.8818181818181818
Test Recall: 0.9326923076923077
Test F1 Score: 0.9065420560747663
Test ROC AUC: 0.9621108058608059
Holdout Accuracy: 0.851063829787234
Holdout ROC AUC: 0.9358974358974359
5-Fold CV Accuracy: 0.8957446808510638
5-Fold CV Accuracy Std: 0.02745422957566795

Confusion Matrix (Slope Excluded - Train):
[[261  33]
 [ 22 342]]

Confusion Matrix (Slope Excluded - Test):
[[71 13]
 [ 7 97]]

Confusion Matrix (Slope Excluded - Holdout):
[[35  7]
 [ 7 45]]

Permutation Importance:
              Feature  Importance
2           chestpain    0.093617
3           restingBP    0.048404
10   noofmajorvessels    0.045745
4     serumcholestrol    0.018085
6     restingrelectro    0.015957
5   fastingbloodsugar    0.010106
8       exerciseangia    0.006915
1              gender    0.005319
0

In [28]:
import pandas as pd

# Load the raw dataset (no row filtering is applied in this cell)
df = pd.read_csv("Cardiovascular_Disease_Dataset.csv")

# Drop the identifier column, then keep only numeric/boolean columns so that
# .corr() cannot fail on a non-numeric column
df_corr = (
    df.drop(columns=['patientid'], errors='ignore')
      .select_dtypes(include=['number', 'bool'])
)

# Pairwise correlation between the remaining columns
correlation_matrix = df_corr.corr()

# Print each column's correlation with the target, highest first
print(correlation_matrix['target'].sort_values(ascending=False))


target               1.000000
slope                0.797358
chestpain            0.554228
noofmajorvessels     0.489866
restingBP            0.482387
restingrelectro      0.426837
fastingbloodsugar    0.303233
maxheartrate         0.228343
serumcholestrol      0.195340
oldpeak              0.098053
gender               0.015769
age                  0.008356
exerciseangia       -0.039874
Name: target, dtype: float64


NameError: name 'X' is not defined