In [1]:
# Attach Google Drive to the Colab filesystem; the dataset and the saved model
# used later in this notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


**Import Required Libraries**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report


In [3]:
# Import classification models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [4]:
# Read the feature-selected heart disease table from Google Drive
PROCESSED_DATA_PATH = "/content/drive/MyDrive/Heart_Disease_Prediction/data/processed/heart_disease_features_selected.csv"
df = pd.read_csv(PROCESSED_DATA_PATH)

In [5]:
# Separate the label column from the predictor columns
target_column = 'num'
y = df[target_column]                  # Target
X = df.drop(columns=[target_column])   # Features (every other column)

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Standardizing the dataset
# The scaler is fit on the training rows only and then applied to the test rows.
# The scaled arrays are stored under their own names instead of being written
# back into X_train / X_test: rebinding the raw names made this cell
# non-idempotent (re-running it scaled scaled data), discarded the raw split,
# and caused the following "Feature Scaling" cell to standardize data that was
# already standardized.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data preprocessing completed. Ready for model training!")

Data preprocessing completed. Ready for model training!


**Feature Scaling**

In [9]:
# Fit a StandardScaler on the training features, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Train & Evaluate Classification Models**

In [10]:
# Dictionary to store model results (display name -> accuracy on the test split).
# Entries are inserted in the order the models are trained below.
model_results = {}


def fit_and_score(model, train_features, test_features):
    """Fit ``model`` and measure its accuracy on the held-out split.

    The labels are taken from the notebook-level ``y_train`` / ``y_test``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        train_features: Feature matrix used for fitting.
        test_features: Feature matrix used for prediction.

    Returns:
        tuple: ``(predictions, accuracy)`` where ``predictions`` is the output of
        ``model.predict(test_features)`` and ``accuracy`` is
        ``accuracy_score(y_test, predictions)``.
    """
    model.fit(train_features, y_train)
    predictions = model.predict(test_features)
    return predictions, accuracy_score(y_test, predictions)


### 5.1 Logistic Regression (trained on the standardized features)

log_reg = LogisticRegression()
y_pred_log, model_results['Logistic Regression'] = fit_and_score(
    log_reg, X_train_scaled, X_test_scaled
)

### 5.2 Decision Tree Classifier

# random_state is fixed so the tree, and therefore its reported accuracy,
# is the same on every run.
dt = DecisionTreeClassifier(random_state=42)
y_pred_dt, model_results['Decision Tree'] = fit_and_score(dt, X_train, X_test)

### 5.3 Random Forest Classifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf, model_results['Random Forest'] = fit_and_score(rf, X_train, X_test)

### 5.4 Support Vector Machine (SVM) (trained on the standardized features)

svm = SVC()
y_pred_svm, model_results['SVM'] = fit_and_score(svm, X_train_scaled, X_test_scaled)

### 5.5 K-Nearest Neighbors (KNN) (trained on the standardized features)

knn = KNeighborsClassifier()
y_pred_knn, model_results['KNN'] = fit_and_score(knn, X_train_scaled, X_test_scaled)

### 5.6 Gradient Boosting Classifier

# Seeded for the same reason as the decision tree: without random_state the
# score can change between runs, which also changes which model is "best".
gb = GradientBoostingClassifier(random_state=42)
y_pred_gb, model_results['Gradient Boosting'] = fit_and_score(gb, X_train, X_test)

**Compare Model Performance**

In [12]:
# Print model accuracy results
for name, acc in model_results.items():
    print(f"{name}: {acc:.4f}")

# Identify the best-performing model.
# Only the winner's *name* is known at this point, so it is stored as
# best_model_name; the name best_model is reserved for the fitted estimator
# object, which is what gets pickled later. Binding a string to best_model here
# would let the save step persist a plain string if cells ran out of order.
# On ties, max() keeps the first maximal entry in insertion order.
best_model_name = max(model_results, key=model_results.get)
print(f"Best Model: {best_model_name} with accuracy {model_results[best_model_name]:.4f}")

Logistic Regression: 0.4837
Decision Tree: 0.5000
Random Forest: 0.5000
SVM: 0.5272
KNN: 0.5272
Gradient Boosting: 0.5217
Best Model: SVM with accuracy 0.5272


**Identify Best Model**

In [14]:
# Resolve the highest-accuracy entry of model_results to its fitted estimator
best_model_name = max(model_results, key=model_results.get)

# Display name -> estimator object trained earlier in the notebook
fitted_models = {
    'Logistic Regression': log_reg,
    'Decision Tree': dt,
    'Random Forest': rf,
    'SVM': svm,
    'KNN': knn,
    'Gradient Boosting': gb,
}
best_model = fitted_models.get(best_model_name)  # None when the name is not listed

print(f"Best Model: {best_model_name} with accuracy {model_results[best_model_name]:.4f}")

Best Model: SVM with accuracy 0.5272


**Save Best Model**

In [17]:
# Persist the selected estimator to Google Drive
import os
import joblib

# Refuse to persist anything that is not an estimator. best_model may be None
# when the selection step found no matching name, or may hold something other
# than an estimator if cells were executed out of order; joblib would pickle
# either one without complaint, leaving an unusable best_model.pkl behind.
if not hasattr(best_model, "predict"):
    raise TypeError(
        f"best_model must be a fitted estimator, got {type(best_model).__name__}; "
        "re-run the model selection cell before saving."
    )

# Create models directory if it doesn't exist
models_dir = "/content/drive/MyDrive/Heart_Disease_Prediction/models/"
os.makedirs(models_dir, exist_ok=True)

# Save the best model
# NOTE: Logistic Regression, SVM and KNN are fit on the standardized features
# (X_train_scaled), so when one of them is selected the pickled estimator
# expects inputs scaled the same way. The fitted scaler is not saved here.
best_model_path = os.path.join(models_dir, "best_model.pkl")
joblib.dump(best_model, best_model_path)
print(f"Best model saved at: {best_model_path}")


Best model saved at: /content/drive/MyDrive/Heart_Disease_Prediction/models/best_model.pkl
