In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.feature_selection import SelectFromModel

In [7]:
file_path = '../datasets/heart.csv'
data = pd.read_csv(file_path)

In [8]:
X = data.drop('HeartDisease', axis=1)
y = data['HeartDisease']

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

# Preprocessing for numerical data: scaling
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
# Preprocessing for categorical data: one-hot encoding
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocess data
X_processed = preprocessor.fit_transform(X)

Categorical columns: ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
Numerical columns: ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']


In [9]:
# Split the data into training, validation, and test sets (70%, 15%, 15%)
X_train, X_temp, y_train, y_temp = train_test_split(X_processed, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [10]:
# Define models
log_reg = LogisticRegression(max_iter=1000)
mlp = MLPClassifier(max_iter=1000, random_state=42)
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)

# Parameter grids
param_grid_log_reg = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 100)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

In [11]:
# Grid Search with StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search_log_reg = GridSearchCV(log_reg, param_grid_log_reg, cv=cv, scoring='accuracy')
grid_search_mlp = GridSearchCV(mlp, param_grid_mlp, cv=cv, scoring='accuracy')
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=cv, scoring='accuracy')
grid_search_gb = GridSearchCV(gb, param_grid_gb, cv=cv, scoring='accuracy')

# Fit models
grid_search_log_reg.fit(X_train, y_train)
grid_search_mlp.fit(X_train, y_train)
grid_search_rf.fit(X_train, y_train)
grid_search_gb.fit(X_train, y_train)

best_log_reg = grid_search_log_reg.best_estimator_
best_mlp = grid_search_mlp.best_estimator_
best_rf = grid_search_rf.best_estimator_
best_gb = grid_search_gb.best_estimator_



In [12]:
# Ensemble Methods
models = [
    ('Logistic Regression', best_log_reg),
    ('Neural Network', best_mlp),
    ('Random Forest', best_rf),
    ('Gradient Boosting', best_gb)
]

# Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=models,
    final_estimator=LogisticRegression()
)

# Voting Classifier
voting_clf = VotingClassifier(
    estimators=models,
    voting='soft'
)

# Fit ensemble models
stacking_clf.fit(X_train, y_train)
voting_clf.fit(X_train, y_train)



In [13]:
# Evaluate models
models.append(('Stacking Classifier', stacking_clf))
models.append(('Voting Classifier', voting_clf))

for name, model in models:
    y_pred = model.predict(X_val)
    print(f"{name} Validation Accuracy: {accuracy_score(y_val, y_pred)}")
    print(f"{name} Classification Report:\n{classification_report(y_val, y_pred)}")
    print(f"{name} ROC AUC Score: {roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])}\n")

# Test set evaluation
best_model = max(models, key=lambda m: accuracy_score(y_val, m[1].predict(X_val)))[1]
y_test_pred = best_model.predict(X_test)
print(f"Best Model Test Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Best Model Test Classification Report:\n{classification_report(y_test, y_test_pred)}")
print(f"Best Model Test ROC AUC Score: {roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}")

Logistic Regression Validation Accuracy: 0.8695652173913043
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.88      0.85        57
           1       0.91      0.86      0.89        81

    accuracy                           0.87       138
   macro avg       0.86      0.87      0.87       138
weighted avg       0.87      0.87      0.87       138

Logistic Regression ROC AUC Score: 0.9395711500974658

Neural Network Validation Accuracy: 0.8478260869565217
Neural Network Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.86      0.82        57
           1       0.89      0.84      0.87        81

    accuracy                           0.85       138
   macro avg       0.84      0.85      0.84       138
weighted avg       0.85      0.85      0.85       138

Neural Network ROC AUC Score: 0.9443361490145116

Random Forest Validation Accuracy: 0.88405797