In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
# Load the dataset
data = pd.read_csv('/content/originalheart.csv')

# Display the first few rows of the dataset to understand its structure
print(data.head())

# Display the column names
print(data.columns)

# Separate features (X) and target (y)
X = data.drop(columns=['target'])  # Replace 'target' with the correct name if different
y = data['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

print("Data Preparation Complete.")


    age  sex   cp  tresbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0    145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0    160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0    120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0    130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0    130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca thal  target  
0    3.0  0.0  6.0       0  
1    2.0  3.0  3.0       2  
2    2.0  2.0  7.0       1  
3    3.0  0.0  3.0       0  
4    1.0  0.0  3.0       0  
Index(['age', 'sex', 'cp', 'tresbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')
Data Preparation Complete.


In [20]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Handle non-numeric values by replacing '?' with NaN (before converting to NumPy array)
X_train = pd.DataFrame(X_train).replace('?', np.nan)
X_test = pd.DataFrame(X_test).replace('?', np.nan)

# Convert all columns to numeric, coercing errors to NaN
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Fill missing values with the median of each column (often better than mean)
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Optimize Decision Tree hyperparameters
dt_model = DecisionTreeClassifier(
    criterion='gini',          # 'entropy' can also be tried
    max_depth=10,              # Depth of the tree, reducing overfitting
    min_samples_split=10,      # Minimum samples required to split a node
    min_samples_leaf=5,        # Minimum samples required at each leaf node
    random_state=42
)

# Train the model
dt_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Improved Decision Tree Accuracy: {dt_accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))


Improved Decision Tree Accuracy: 62.30%
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.87      0.83        30
           1       0.36      0.31      0.33        13
           2       0.50      0.38      0.43         8
           3       0.56      0.56      0.56         9
           4       0.00      0.00      0.00         1

    accuracy                           0.62        61
   macro avg       0.44      0.42      0.43        61
weighted avg       0.61      0.62      0.62        61

Confusion Matrix:
 [[26  4  0  0  0]
 [ 4  4  1  2  2]
 [ 1  3  3  1  0]
 [ 2  0  2  5  0]
 [ 0  0  0  1  0]]


In [27]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Random Forest Accuracy: 59.02%
Classification Report:
               precision    recall  f1-score   support

           0       0.77      1.00      0.87        30
           1       0.14      0.08      0.10        13
           2       0.50      0.50      0.50         8
           3       0.17      0.11      0.13         9
           4       0.00      0.00      0.00         1

    accuracy                           0.59        61
   macro avg       0.32      0.34      0.32        61
weighted avg       0.50      0.59      0.53        61

Confusion Matrix:
 [[30  0  0  0  0]
 [ 6  1  3  3  0]
 [ 2  0  4  2  0]
 [ 1  5  1  1  1]
 [ 0  1  0  0  0]]


In [25]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Handle non-numeric values by replacing '?' with NaN
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)
X_train_df.replace('?', np.nan, inplace=True)
X_test_df.replace('?', np.nan, inplace=True)

# Convert all columns to numeric, coercing errors to NaN
X_train_df = X_train_df.apply(pd.to_numeric, errors='coerce')
X_test_df = X_test_df.apply(pd.to_numeric, errors='coerce')

# Fill missing values with the mean of each column
X_train_df.fillna(X_train_df.mean(), inplace=True)
X_test_df.fillna(X_test_df.mean(), inplace=True)

# Feature scaling to normalize the data for SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_df)
X_test_scaled = scaler.transform(X_test_df)

# Initialize the SVM model with tuned parameters
svm_model = SVC(random_state=42, kernel='rbf', C=1.0, gamma='scale')

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_svm = svm_model.predict(X_test_scaled)

# Evaluate the model
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"Improved SVM Accuracy: {svm_accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


Improved SVM Accuracy: 60.66%
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.97      0.85        30
           1       0.36      0.31      0.33        13
           2       0.38      0.38      0.38         8
           3       0.25      0.11      0.15         9
           4       0.00      0.00      0.00         1

    accuracy                           0.61        61
   macro avg       0.35      0.35      0.34        61
weighted avg       0.54      0.61      0.56        61

Confusion Matrix:
 [[29  0  1  0  0]
 [ 6  4  1  2  0]
 [ 2  2  3  1  0]
 [ 1  4  3  1  0]
 [ 0  1  0  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# Dictionary to store the accuracy of each model
model_accuracies = {
    "Decision Tree": dt_accuracy,
    "Random Forest": rf_accuracy,
    "SVM": svm_accuracy
}

# Determine the best model based on accuracy
best_model_name = max(model_accuracies, key=model_accuracies.get)
best_model_accuracy = model_accuracies[best_model_name]

# Print the accuracy of each model
print(f"Decision Tree Accuracy: {dt_accuracy * 100:.2f}%")
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print(f"SVM Accuracy: {svm_accuracy * 100:.2f}%")

# Print the best model and its accuracy
print(f"\nBest Model: {best_model_name} with Accuracy: {best_model_accuracy * 100:.2f}%")


Decision Tree Accuracy: 62.30%
Random Forest Accuracy: 59.02%
SVM Accuracy: 60.66%

Best Model: Decision Tree with Accuracy: 62.30%
