In [71]:
# Environment setup.
# The cells visible in this notebook use pandas, NumPy and scikit-learn only; nothing here
# imports TensorFlow, so the pinned install is disabled. Re-enable the line below only if a
# TensorFlow-based model is added back to the notebook.
# !pip install tensorflow==2.12.0

# Standard library.
import os
import pickle  # Used for saving the fitted scikit-learn objects.

# Core data manipulation libraries.
import numpy as np
import pandas as pd

# scikit-learn: data splitting, feature scaling, the Gradient Boosting model, and metrics.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

print("Libraries imported successfully! ✅")

Libraries imported successfully! ✅


In [72]:
# --- Section 1: Data Loading ---

# Path to the dataset CSV file, relative to the notebook's working directory.
# Ensure 'heart_failure_clinical_records_dataset (1).csv' is accessible in your Colab environment.
file_path = 'heart_failure_clinical_records_dataset (1).csv'

# Read the CSV file into a pandas DataFrame. `df` is the raw table used by the later sections.
df = pd.read_csv(file_path)

# Quick inspection: a preview of the rows, the overall shape, and per-column dtypes/null counts.
n_rows, n_cols = df.shape

print(f"\nDataset '{file_path}' loaded successfully! 📊")
print("First 5 rows of the dataset for initial review:")
print(df.head())
print(f"\nDataset shape: {n_rows} rows, {n_cols} columns. This indicates the number of samples and features.")
print("\nDataset information (including data types and non-null counts):")
df.info()


Dataset 'heart_failure_clinical_records_dataset (1).csv' loaded successfully! 📊
First 5 rows of the dataset for initial review:
    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00              

In [73]:
# --- Section 2: Data Preprocessing ---

# Separate the features (independent variables) from the target (dependent variable).
# 'DEATH_EVENT' is the binary target column (labels 0 and 1); every other column is a feature.
X = df.drop(columns='DEATH_EVENT')  # Features: all columns except 'DEATH_EVENT'.
y = df['DEATH_EVENT']               # Target: the 'DEATH_EVENT' column.

print("\nFeatures (X) and Target (y) separated. ✨")
print(f"Shape of X (features): {X.shape}")
print(f"Shape of y (target): {y.shape}")

# Split the dataset into training and testing sets.
# test_size=0.2    -> 20% of the rows are held out for testing, 80% are used for training.
# random_state=42  -> fixes the shuffle so the split is reproducible.
# stratify=y       -> keeps the proportion of 'DEATH_EVENT' classes the same in both sets,
#                     which matters when the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("\nData split into training and testing sets: ➡️")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# Initialize the StandardScaler, which standardizes each feature to mean 0 and standard deviation 1.
# The fitted scaler is pickled together with the model in Section 6, so inference code can apply
# exactly the same transformation.
scaler = StandardScaler()
print("\nStandardScaler initialized. ⚙️")

# Fit the scaler on the training data only, then transform it.
# Learning the scaling parameters from the training split alone prevents test-set leakage.
X_train_scaled = scaler.fit_transform(X_train)
print("\nX_train data scaled. ✅")
# Sanity check: these are computed over the whole scaled matrix (all features pooled together).
print(f"Mean of X_train_scaled (should be close to 0): {np.mean(X_train_scaled):.4f}")
print(f"Standard Deviation of X_train_scaled (should be close to 1): {np.std(X_train_scaled):.4f}")
print("First 5 rows of scaled X_train (example of transformed data):")
print(X_train_scaled[:5])

# Transform (do NOT re-fit) the test data with the scaler fitted on the training data,
# so training and evaluation see identically scaled features.
X_test_scaled = scaler.transform(X_test)
print("\nX_test data scaled using the same scaler. ✅")
print("First 5 rows of scaled X_test (example of transformed data):")
print(X_test_scaled[:5])


Features (X) and Target (y) separated. ✨
Shape of X (features): (299, 12)
Shape of y (target): (299,)

Data split into training and testing sets: ➡️
X_train shape: (239, 12), y_train shape: (239,)
X_test shape: (60, 12), y_test shape: (60,)

StandardScaler initialized. ⚙️

X_train data scaled. ✅
Mean of X_train_scaled (should be close to 0): 0.0000
Standard Deviation of X_train_scaled (should be close to 1): 1.0000
First 5 rows of scaled X_train (example of transformed data):
[[-0.26905031  1.11069566 -0.20073472 -0.90033664  0.17652783 -0.77028133
  -1.00472172 -0.36043709  0.55991522 -1.33381774 -0.68283063 -0.46784708]
 [-0.70688258 -0.90033664 -0.53431791  1.11069566  1.84742492 -0.77028133
   1.0516855  -0.54446714 -0.34580213  0.74972762 -0.68283063 -1.35916712]
 [ 1.2195794  -0.90033664 -0.02058    -0.90033664 -1.49436926  1.29822697
   0.01340146  0.46769812 -1.47794881  0.74972762 -0.68283063 -1.59168539]
 [ 0.25634841 -0.90033664 -0.45512902 -0.90033664 -1.076644

In [74]:
# --- Section 3: Model Definition (Gradient Boosting Classifier) ---

# Gradient Boosting builds an ensemble of weak learners (decision trees) sequentially;
# each new tree is fitted to correct the errors of the trees built before it.
#
# Hyperparameters set explicitly here (everything else is left at the scikit-learn default):
#   n_estimators  - number of boosting stages (trees). More stages can improve the fit but
#                   increase training time and the risk of overfitting.
#   learning_rate - shrinks the contribution of each tree; lower values usually need more stages.
#   max_depth     - maximum depth of each individual tree; controls per-tree complexity.
#   random_state  - fixes the randomness so results are reproducible.
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

print("\nGradient Boosting Classifier model defined. 🌳")
print("Initial model parameters:")
print(model.get_params())


Gradient Boosting Classifier model defined. 🌳
Initial model parameters:
{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 42, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [75]:
# --- Section 4: Model Training (Gradient Boosting Classifier) ---

print("\nTraining the Gradient Boosting Classifier... 🚀")
# Fit the model on the scaled training features and their labels.
# scikit-learn estimators train in a single .fit() call; the fitted state is stored on `model`.
model.fit(X_train_scaled, y_train)

print("\nModel training complete. 🎉")


Training the Gradient Boosting Classifier... 🚀

Model training complete. 🎉


In [76]:
# --- Section 5: Model Evaluation (Gradient Boosting Classifier) ---

print("\nEvaluating model on the test set... 🔍")
# Predict class labels for the held-out test set (scaled with the training-fitted scaler).
y_pred = model.predict(X_test_scaled)

# Overall accuracy: fraction of test samples whose predicted label matches the true label.
# `accuracy` is also read by Section 6 to decide whether the model is saved.
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Per-class precision, recall, F1-score and support.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix: rows are true classes, columns are predicted classes,
# giving the counts of true/false negatives and positives.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Evaluating model on the test set... 🔍
Test Accuracy: 0.8333

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.93      0.88        41
           1       0.80      0.63      0.71        19

    accuracy                           0.83        60
   macro avg       0.82      0.78      0.79        60
weighted avg       0.83      0.83      0.83        60


Confusion Matrix:
[[38  3]
 [ 7 12]]


In [77]:
# --- Section 6: Conditional Model and Scaler Saving (Gradient Boosting Classifier) ---

# Minimum test accuracy (exclusive) required before anything is written to disk.
# The model and scaler are saved ONLY if this condition is met, as per project requirements.
# NOTE(review): gating on test-set accuracy makes the test set part of model selection, so the
# reported accuracy is an optimistic estimate if training is repeated until the gate passes.
ACCURACY_THRESHOLD = 0.80

if accuracy > ACCURACY_THRESHOLD:
    # Filenames for the trained Gradient Boosting model and the fitted StandardScaler.
    # Both are written to the current working directory as pickle (.pkl) files.
    model_filename = 'gradient_boosting_model.pkl'
    scaler_filename = 'scaler.pkl'

    # Serialize the trained model. Pickle turns the Python object into a byte stream;
    # only unpickle these files in an environment you trust, with a compatible scikit-learn version.
    with open(model_filename, 'wb') as file:
        pickle.dump(model, file)
    print(f"\nModel saved successfully as '{model_filename}'! ✅")

    # Serialize the scaler as well, so inference applies the same scaling used in training.
    with open(scaler_filename, 'wb') as file:
        pickle.dump(scaler, file)
    print(f"Scaler saved successfully as '{scaler_filename}'! ✅")

    # Instructions for downloading the files from Google Colab's file browser.
    print("\nTo download these files from Google Colab:")
    print("1. Go to the 'Files' icon on the left sidebar (folder icon).")
    print(f"2. Locate '{model_filename}' and '{scaler_filename}' in the file list.")
    print("3. Click the three dots (⋮) next to each file and select 'Download'.")
    print("Remember to replace your old files in your local project with these new ones.")
else:
    # Accuracy requirement not met: nothing is written, and the user is told how to proceed.
    # The percentage in the message is derived from ACCURACY_THRESHOLD so the two cannot drift apart.
    print(f"\nTest Accuracy is not yet over {ACCURACY_THRESHOLD:.0%}. ❌ Please re-run the training (Section 4) after adjusting parameters, or consider further model tuning (e.g., n_estimators, learning_rate) before saving.")
    print("No model or scaler files were saved as the accuracy target was not met.")


Model saved successfully as 'gradient_boosting_model.pkl'! ✅
Scaler saved successfully as 'scaler.pkl'! ✅

To download these files from Google Colab:
1. Go to the 'Files' icon on the left sidebar (folder icon).
2. Locate 'gradient_boosting_model.pkl' and 'scaler.pkl' in the file list.
3. Click the three dots (⋮) next to each file and select 'Download'.
Remember to replace your old files in your local project with these new ones.
