In [42]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("andrewmvd/heart-failure-clinical-data")

print("Path to dataset files:", path)


Path to dataset files: /Users/anton/.cache/kagglehub/datasets/andrewmvd/heart-failure-clinical-data/versions/1


## Model Explanation
In this project, I’m leveraging advanced ensemble methods, specifically boosting algorithms like CatBoost, XGBoost, and LightGBM, combined into a stacking ensemble to predict heart failure outcomes with high accuracy. Boosting algorithms are particularly effective because they iteratively focus on the data points that are hardest to predict, which helps reduce errors and create a model that generalizes well. By combining these individual models in a stacking ensemble, I’m taking advantage of their unique strengths—CatBoost’s ability to handle categorical data, XGBoost’s speed and efficiency, and LightGBM’s capacity to handle large datasets—resulting in a model that’s not just accurate but also robust. To address the class imbalance in the data (e.g., fewer instances of heart failure), I used SMOTE, a synthetic oversampling technique, to balance the dataset and ensure the model treats both classes fairly. This approach works so well because it captures complex patterns in the data while mitigating overfitting, ultimately achieving a high accuracy of 92.68% and a stellar ROC-AUC of 98%, making it both predictive and reliable for real-world applications.

In [43]:
import pandas as pd

# Load the dataset
dataset_path = f"{path}/heart_failure_clinical_records_dataset.csv"  # Replace with actual filename if different
data = pd.read_csv(dataset_path)

# Display the first few rows
print(data.head())


    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           116    0   

   smoking  time  DEATH_EVENT  
0        0     4            1  
1        0     6            1  
2       

In [44]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Load dataset
# Assuming 'data' is already loaded as a pandas DataFrame
# Replace 'data.csv' with the path to your dataset


# Separate features and target
X = data.drop(columns=['DEATH_EVENT'])
y = data['DEATH_EVENT']

# Feature selection: Drop low-importance features
X = X.drop(columns=['smoking', 'anaemia', 'high_blood_pressure'])

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# Initialize base models
catboost_model = CatBoostClassifier(
    iterations=500, learning_rate=0.1, depth=4, l2_leaf_reg=3, random_seed=42, verbose=0
)
xgb_model = XGBClassifier(
    n_estimators=500, learning_rate=0.1, max_depth=4, reg_lambda=3, random_state=42
)
lgbm_model = LGBMClassifier(
    n_estimators=500, learning_rate=0.1, max_depth=4, reg_lambda=3, random_state=42
)

# Initialize Stacking Classifier
estimators = [
    ('catboost', catboost_model),
    ('xgb', xgb_model),
    ('lgbm', lgbm_model),
]
stacking_model = StackingClassifier(
    estimators=estimators, final_estimator=CatBoostClassifier(
        iterations=200, learning_rate=0.05, depth=6, random_seed=42, verbose=0
    ), cv=3
)

# Train the stacking model
stacking_model.fit(X_train, y_train)

# Make predictions
y_pred = stacking_model.predict(X_test)
y_pred_proba = stacking_model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"ROC-AUC: {roc_auc:.2f}")

# Print a detailed classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[LightGBM] [Info] Number of positive: 162, number of negative: 162
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 411
[LightGBM] [Info] Number of data points in the train set: 324, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 108, number of negative: 108
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 299
[LightGBM] [Info] Number of data points in the train set: 216, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 108, number of negative: 108
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead 