In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
df = pd.read_csv("heart_train.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             734 non-null    int64  
 1   Sex             734 non-null    object 
 2   ChestPainType   734 non-null    object 
 3   RestingBP       734 non-null    int64  
 4   Cholesterol     734 non-null    int64  
 5   FastingBS       734 non-null    int64  
 6   RestingECG      734 non-null    object 
 7   MaxHR           734 non-null    int64  
 8   ExerciseAngina  734 non-null    object 
 9   Oldpeak         734 non-null    float64
 10  ST_Slope        734 non-null    object 
 11  HeartDisease    734 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 68.9+ KB


In [4]:
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']
# classify the variables between numerical and categorical
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

In [5]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [6]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

In [7]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

In [8]:
model_pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss'))
])

In [9]:
y.value_counts(normalize=True)

HeartDisease
1    0.553134
0    0.446866
Name: proportion, dtype: float64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
y_train.value_counts(normalize=True)

HeartDisease
1    0.553663
0    0.446337
Name: proportion, dtype: float64

In [13]:
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")


Training set size: 587
Validation set size: 147


In [14]:
model_pipeline.fit(X_train, y_train)
print("Model training complete.")

Model training complete.


In [15]:
print("\nEvaluating model on the validation set...")
y_pred_val = model_pipeline.predict(X_val)
y_pred_proba_val = model_pipeline.predict_proba(X_val)[:, 1]


Evaluating model on the validation set...


In [16]:
from sklearn.metrics import ( # Changed imports for classification metrics
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, precision_score, recall_score, f1_score
)

In [17]:
accuracy = accuracy_score(y_val, y_pred_val)
precision = precision_score(y_val, y_pred_val)
recall = recall_score(y_val, y_pred_val)
f1 = f1_score(y_val, y_pred_val)
roc_auc = roc_auc_score(y_val, y_pred_proba_val)
conf_matrix = confusion_matrix(y_val, y_pred_val)
class_report = classification_report(y_val, y_pred_val)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Accuracy: 0.8707
Precision: 0.8523
Recall: 0.9259
F1 Score: 0.8876
ROC AUC Score: 0.9076

Confusion Matrix:
 [[53 13]
 [ 6 75]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.80      0.85        66
           1       0.85      0.93      0.89        81

    accuracy                           0.87       147
   macro avg       0.88      0.86      0.87       147
weighted avg       0.87      0.87      0.87       147



In [19]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
        "eval_metric": "logloss"
    }

    model = Pipeline(steps=[
        ('pre', preprocessor),
        ('clf', XGBClassifier(**params))
    ])

    # Cross-validation AUC
    auc = cross_val_score(model, X, y, cv=3, scoring="roc_auc", n_jobs=-1).mean()
    return auc

# Create and run the study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Best params
print("Best trial:")
print(study.best_trial)

# Train with best params
best_model = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', XGBClassifier(**study.best_params, eval_metric='logloss', use_label_encoder=False))
])
best_model.fit(X_train, y_train)

[I 2025-05-23 19:42:39,121] A new study created in memory with name: no-name-36bf4d2d-9c62-46d8-ba1f-c0ea160a0b32
[I 2025-05-23 19:42:40,178] Trial 0 finished with value: 0.905844143297822 and parameters: {'n_estimators': 114, 'max_depth': 12, 'learning_rate': 0.0020865733845126344, 'subsample': 0.7670417427860745, 'colsample_bytree': 0.9038918492889119, 'gamma': 3.676703188746886, 'reg_alpha': 3.109459281126612, 'reg_lambda': 2.9628013733498797}. Best is trial 0 with value: 0.905844143297822.
[I 2025-05-23 19:42:40,723] Trial 1 finished with value: 0.9184308789156783 and parameters: {'n_estimators': 268, 'max_depth': 11, 'learning_rate': 0.0633596878845108, 'subsample': 0.8515451299648213, 'colsample_bytree': 0.5311870877582178, 'gamma': 3.490683224812448, 'reg_alpha': 4.935440193197688, 'reg_lambda': 2.895250046457588}. Best is trial 1 with value: 0.9184308789156783.
[I 2025-05-23 19:42:41,211] Trial 2 finished with value: 0.9221174371889429 and parameters: {'n_estimators': 149, 'max

Best trial:
FrozenTrial(number=14, state=1, values=[0.934009922938687], datetime_start=datetime.datetime(2025, 5, 23, 19, 42, 42, 30839), datetime_complete=datetime.datetime(2025, 5, 23, 19, 42, 42, 69736), params={'n_estimators': 139, 'max_depth': 3, 'learning_rate': 0.07365836199226374, 'subsample': 0.7207363238851062, 'colsample_bytree': 0.544577960332076, 'gamma': 1.6026311335477408, 'reg_alpha': 1.1626260496430427, 'reg_lambda': 0.16858282091017252}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=300, log=False, low=50, step=1), 'max_depth': IntDistribution(high=15, log=False, low=3, step=1), 'learning_rate': FloatDistribution(high=0.3, log=True, low=0.001, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'reg_alpha': FloatDistribution(hig

In [21]:
best_params = study.best_params
print(best_params)

{'n_estimators': 139, 'max_depth': 3, 'learning_rate': 0.07365836199226374, 'subsample': 0.7207363238851062, 'colsample_bytree': 0.544577960332076, 'gamma': 1.6026311335477408, 'reg_alpha': 1.1626260496430427, 'reg_lambda': 0.16858282091017252}


In [22]:
best_model = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', XGBClassifier(**best_params, eval_metric='logloss'))
])
best_model.fit(X_train, y_train)

In [24]:
print("\nEvaluating model on the validation set...")
y_pred_val = best_model.predict(X_val)
y_pred_proba_val = best_model.predict_proba(X_val)[:, 1]


Evaluating model on the validation set...


In [25]:
accuracy = accuracy_score(y_val, y_pred_val)
precision = precision_score(y_val, y_pred_val)
recall = recall_score(y_val, y_pred_val)
f1 = f1_score(y_val, y_pred_val)
roc_auc = roc_auc_score(y_val, y_pred_proba_val)
conf_matrix = confusion_matrix(y_val, y_pred_val)
class_report = classification_report(y_val, y_pred_val)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Accuracy: 0.8571
Precision: 0.8488
Recall: 0.9012
F1 Score: 0.8743
ROC AUC Score: 0.9177

Confusion Matrix:
 [[53 13]
 [ 8 73]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.80      0.83        66
           1       0.85      0.90      0.87        81

    accuracy                           0.86       147
   macro avg       0.86      0.85      0.85       147
weighted avg       0.86      0.86      0.86       147

