In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [2]:
# Load the dataset
df = pd.read_csv('heart_failure_clinical_records_dataset.csv')

In [3]:
# Basic info
print(df.info())
print(df.describe())
print(df.isnull().sum())  # Check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB
None
             

In [4]:
# Check class distribution
print(df['DEATH_EVENT'].value_counts())
# 0    203
# 1     96
# The dataset is imbalanced (about 2:1 ratio)

DEATH_EVENT
0    203
1     96
Name: count, dtype: int64


In [5]:
# Feature engineering
# Let's create some interaction terms that might be relevant
df['age_ejection_fraction'] = df['age'] * df['ejection_fraction']
df['creatinine_phosphokinase_serum_creatinine'] = df['creatinine_phosphokinase'] * df['serum_creatinine']

In [6]:
# Split into features and target
X = df.drop('DEATH_EVENT', axis=1)
y = df['DEATH_EVENT']

In [7]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
# Scale numerical features
scaler = StandardScaler()
num_cols = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
            'serum_creatinine', 'serum_sodium', 'time', 'age_ejection_fraction',
            'creatinine_phosphokinase_serum_creatinine']

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [9]:
# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [10]:
# Initialize models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'),
    'SVM': SVC(random_state=42, probability=True),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

In [11]:
# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print("\n")

Random Forest Accuracy: 0.8333
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        41
           1       0.74      0.74      0.74        19

    accuracy                           0.83        60
   macro avg       0.81      0.81      0.81        60
weighted avg       0.83      0.83      0.83        60

[[36  5]
 [ 5 14]]


XGBoost Accuracy: 0.8000
              precision    recall  f1-score   support

           0       0.84      0.88      0.86        41
           1       0.71      0.63      0.67        19

    accuracy                           0.80        60
   macro avg       0.77      0.75      0.76        60
weighted avg       0.80      0.80      0.80        60

[[36  5]
 [ 7 12]]


SVM Accuracy: 0.8000
              precision    recall  f1-score   support

           0       0.84      0.88      0.86        41
           1       0.71      0.63      0.67        19

    accuracy                           0.80        60
   macro 

In [12]:
# Select best model
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"Best model is {best_model_name} with accuracy {results[best_model_name]:.4f}")

Best model is Random Forest with accuracy 0.8333


In [13]:
# Feature importance for the best model (if applicable)
if hasattr(best_model, 'feature_importances_'):
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    print(feature_importances)

                                      Feature  Importance
11                                       time    0.304205
7                            serum_creatinine    0.154598
4                           ejection_fraction    0.110514
12                      age_ejection_fraction    0.076428
13  creatinine_phosphokinase_serum_creatinine    0.073805
2                    creatinine_phosphokinase    0.063470
0                                         age    0.060442
6                                   platelets    0.057003
8                                serum_sodium    0.053773
9                                         sex    0.012645
5                         high_blood_pressure    0.008694
10                                    smoking    0.008684
3                                    diabetes    0.008132
1                                     anaemia    0.007606


In [14]:
import joblib

# Save the best model and scaler
joblib.dump(best_model, 'heart_failure_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']