In [19]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score
from scipy.stats import zscore
from sklearn.feature_selection import SelectKBest, chi2

# Step 1: Load the dataset
df = pd.read_csv('heart_disease_uci.csv')

# Step 2: Handle missing values (only for numeric columns)
df.fillna(df.select_dtypes(include='number').mean(), inplace=True)
# Step 2.1: Remove outliers using Z-score method

# Calculate Z-scores for numeric columns
# numeric_cols = df.select_dtypes(include=['number']).columns
# z_scores = np.abs(zscore(df[numeric_cols]))

# # Define a threshold for Z-scores
# threshold = 3

# # Filter rows where all Z-scores are below the threshold
# df = df[(z_scores < threshold).all(axis=1)]

# Step 2.2: Feature selection using correlation
# Calculate the correlation matrix
correlation_matrix = df.select_dtypes(include=['number']).corr()

# Select features with high correlation to the target variable
correlation_threshold = 0.1  # Define a threshold for correlation
correlated_features = correlation_matrix['num'][abs(correlation_matrix['num']) > correlation_threshold].index

# Step 2.3: Feature selection using Chi-square test

# Prepare data for Chi-square test (only non-negative values)
X_chi2 = df.drop(columns=['num']).select_dtypes(include=['number']).abs()
y_chi2 = df['num']

# Apply Chi-square test
chi2_selector = SelectKBest(chi2, k='all')
chi2_selector.fit(X_chi2, y_chi2)

# Get selected features based on Chi-square scores
chi2_scores = pd.Series(chi2_selector.scores_, index=X_chi2.columns)
chi2_threshold = chi2_scores.mean()  # Define a threshold for Chi-square scores
chi2_selected_features = chi2_scores[chi2_scores > chi2_threshold].index

# Combine features selected by correlation and Chi-square
selected_features = list(set(correlated_features).union(set(chi2_selected_features)))

# Keep only the selected features in the dataset
df = df[selected_features + ['num']]

# Step 3: Separate features and target
X = df.drop(columns=['num'])  # Replace 'num' with the actual target column name
y = df['num']  # Replace with the actual target column name

# Step 4: Encode categorical variables
# Identify columns that need encoding
categorical_cols = X.select_dtypes(include=['object']).columns

# Use one-hot encoding for categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Step 5: Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Step 6: Map your target to binary if needed
y_binary = np.where(y > 0, 1, 0)  # Assuming 'target_column_name' has values where 0 is one class and 1 or greater is another class

# Step 7: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_binary, test_size=0.2, random_state=42)

# Step 8: Build and train the SVM model
svm_model = SVC(kernel='linear', probability=True, random_state=42)
# svm_model = SVC(kernel='rbf', probability=True, random_state=42)
# svm_model = SVC(kernel='sigmoid', probability=True, random_state=42)
# svm_model = SVC(kernel='poly', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Step 9: Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Step 10: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Plot the confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title("Confusion Matrix")
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

# Step 11: ROC Curve and AUC score
y_probs = svm_model.predict_proba(X_test)[:, 1]

# Compute ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = roc_auc_score(y_test, y_probs)

# Plot the ROC curve
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color='darkorange')
plt.plot([0, 1], [0, 1], 'k--', label="Random Guessing")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

ValueError: y should be a 1d array, got an array of shape (736, 2) instead.