In [12]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("heart_disease.csv")

# Drop unnamed columns (if any)
df = df.loc[:, ~df.columns.str.contains('Unnamed')]

# Drop rows with missing values
df.dropna(inplace=True)

# Identify non-numeric columns
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Encode categorical variables
for col in non_numeric_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Standardize numerical features
target_col = 'num'  # Ensure target variable is not scaled
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols.remove(target_col)

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Check class distribution
print("Class distribution before balancing:", Counter(df[target_col]))

# Splitting dataset
X = df.drop(columns=[target_col])
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("Class distribution after balancing:", Counter(y_train))

# Train models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(class_weight="balanced"),
    "Random Forest": RandomForestClassifier(class_weight="balanced"),
    "AdaBoost": AdaBoostClassifier()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"\n{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred, zero_division=1))

# Ensemble Voting Classifier
voting_clf = VotingClassifier(
    estimators=[("LR", LogisticRegression(max_iter=1000)),
                ("DT", DecisionTreeClassifier(class_weight="balanced")),
                ("RF", RandomForestClassifier(class_weight="balanced")),
                ("AB", AdaBoostClassifier())],
    voting="hard"
)
voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)

# Evaluate ensemble model
ensemble_accuracy = accuracy_score(y_test, y_pred_voting)
results["Voting Classifier"] = ensemble_accuracy
print(f"\nVoting Classifier Accuracy: {ensemble_accuracy:.4f}")
print(classification_report(y_test, y_pred_voting, zero_division=1))

# Save the best model
best_model_name = max(results, key=results.get)
best_model = models[best_model_name] if best_model_name != "Voting Classifier" else voting_clf
joblib.dump(best_model, "heart_disease_model.pkl")
print(f"\n✅ Best model '{best_model_name}' saved as 'heart_disease_model.pkl'")


Class distribution before balancing: Counter({0: 160, 1: 56, 2: 35, 3: 35, 4: 13})
Class distribution after balancing: Counter({1: 128, 0: 128, 2: 128, 4: 128, 3: 128})

Logistic Regression Accuracy: 0.5833
              precision    recall  f1-score   support

           0       0.87      0.84      0.86        32
           1       0.45      0.45      0.45        11
           2       0.00      0.00      0.00         7
           3       0.43      0.43      0.43         7
           4       0.00      0.00      0.00         3

    accuracy                           0.58        60
   macro avg       0.35      0.35      0.35        60
weighted avg       0.60      0.58      0.59        60


Decision Tree Accuracy: 0.5000
              precision    recall  f1-score   support

           0       0.84      0.81      0.83        32
           1       0.21      0.27      0.24        11
           2       0.00      0.00      0.00         7
           3       0.10      0.14      0.12         7
 