In [67]:
import pandas as pd
from pathlib import Path
import os
# NOTE: the earlier debug print of MLFLOW_TRACKING_USERNAME was removed so
# that credential-related environment values never end up in saved outputs.

# Location of the input CSV, relative to the parent of the working directory.
PROJECT_ROOT = Path("..")
DATA_PATH = PROJECT_ROOT / "data" / "processed" / "heart_disease_cleaned.csv"

# Load the table and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [68]:
# Separate the label column from the predictors.
TARGET_COLUMN = "target"
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

In [69]:
# Columns that are one-hot encoded by the preprocessing step.
categorical_features = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "slope",
    "thal",
]

# Columns that are standardised by the preprocessing step.
numerical_features = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
    "ca",
]

print("Feature separation completed.")

Feature separation completed.


In [70]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones (handle_unknown="ignore" makes the encoder
# tolerate categories at transform time that were not present during fit).
numeric_step = ("num", StandardScaler(), numerical_features)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [71]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is reproducible.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Data split into training and testing sets.")

Data split into training and testing sets.


In [72]:
from sklearn.linear_model import LogisticRegression

from sklearn.base import clone

# Preprocessing + logistic regression.
# The pipeline gets its own unfitted copy of the preprocessing step: Pipeline
# fits its steps in place, so handing the same ColumnTransformer instance to
# several pipelines would make them share (and overwrite) one fitted state.
log_reg_pipeline = Pipeline(steps=[
    ("preprocessing", clone(preprocessor)),
    ("model", LogisticRegression(max_iter=1000))
])

log_reg_pipeline.fit(X_train, y_train)


In [73]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.base import clone

# Preprocessing + random forest (200 trees, fixed seed).
# The pipeline gets its own unfitted copy of the preprocessing step: Pipeline
# fits its steps in place, so handing the same ColumnTransformer instance to
# several pipelines would make them share (and overwrite) one fitted state.
rf_pipeline = Pipeline(steps=[
    ("preprocessing", clone(preprocessor)),
    ("model", RandomForestClassifier(
        n_estimators=200,
        random_state=42
    ))
])

rf_pipeline.fit(X_train, y_train)


In [74]:
from sklearn.model_selection import cross_validate

# Metrics collected on every cross-validation fold.
scoring = [
    "accuracy",
    "precision",
    "recall",
    "roc_auc",
]

# 5-fold cross-validation of the logistic-regression pipeline on the
# training split only.
lr_cv = cross_validate(
    estimator=log_reg_pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=5,
)


In [75]:
# 5-fold cross-validation of the random-forest pipeline on the training
# split only, scored with the metrics listed in `scoring`.
rf_cv = cross_validate(
    estimator=rf_pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=5,
)


In [76]:
import numpy as np

def summarize_cv_results(cv_results, metrics=None):
    """Average the per-fold test scores of a cross-validation result.

    Parameters
    ----------
    cv_results : mapping
        Mapping that holds one sequence of per-fold scores under each
        ``"test_<metric>"`` key; other keys are ignored.
    metrics : iterable of str, optional
        Metric names to summarise. When omitted, every key of ``cv_results``
        that starts with ``"test_"`` is used, so the function no longer
        depends on a notebook-level ``scoring`` variable being defined.

    Returns
    -------
    dict
        ``{metric: mean of cv_results["test_<metric>"]}`` in the order the
        metrics were given (or found).

    Raises
    ------
    KeyError
        If a requested metric has no ``"test_<metric>"`` entry.
    """
    prefix = "test_"
    if metrics is None:
        metrics = [
            key[len(prefix):] for key in cv_results if key.startswith(prefix)
        ]
    return {
        metric: np.mean(cv_results[f"{prefix}{metric}"])
        for metric in metrics
    }

# Collapse each model's per-fold scores into a single row of averages and
# show both models side by side.
lr_results = summarize_cv_results(lr_cv)
rf_results = summarize_cv_results(rf_cv)

model_names = ["Logistic Regression", "Random Forest"]
cv_summary = pd.DataFrame([lr_results, rf_results], index=model_names)
cv_summary


Unnamed: 0,accuracy,precision,recall,roc_auc
Logistic Regression,0.82619,0.843436,0.765217,0.899506
Random Forest,0.784949,0.775921,0.747036,0.88834


In [77]:
import mlflow
from pathlib import Path

# Absolute project root, so the tracking location does not depend on later
# changes of the working directory.
PROJECT_ROOT = Path("..").resolve()

# Build the file URI with Path.as_uri() instead of string formatting: the
# hand-built f"file:///{path}" form yields mixed separators on Windows,
# a fourth slash on POSIX, and leaves special characters unescaped.
mlflow.set_tracking_uri((PROJECT_ROOT / "mlruns").as_uri())
mlflow.set_experiment("Heart Disease Classification")


from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    RocCurveDisplay
)
import matplotlib.pyplot as plt


In [78]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : estimator exposing ``predict`` and ``predict_proba``
    X_test : features passed to both prediction methods
    y_test : true labels the predictions are compared against

    Returns
    -------
    tuple
        ``(metrics, y_prob)`` where ``metrics`` maps "accuracy", "precision",
        "recall" and "roc_auc" to their scores, and ``y_prob`` is column
        index 1 of the ``predict_proba`` output.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Metrics computed from hard label predictions.
    label_scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    results = {}
    for metric_name, scorer in label_scorers:
        results[metric_name] = scorer(y_test, predicted_labels)

    # ROC AUC is computed from the scores, not the labels.
    results["roc_auc"] = roc_auc_score(y_test, positive_scores)

    return results, positive_scores


In [79]:
# Track one MLflow run for the logistic-regression pipeline: parameters,
# test-set metrics, the ROC curve image and the fitted pipeline itself.
with mlflow.start_run(run_name="Logistic Regression"):
    # Log parameters. max_iter is read from the estimator inside the pipeline
    # so the recorded value cannot drift from the one actually used.
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param(
        "max_iter", log_reg_pipeline.named_steps["model"].max_iter
    )

    # Train model
    log_reg_pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split
    metrics, y_prob = evaluate_model(
        log_reg_pipeline, X_test, y_test
    )

    # Log metrics
    for key, value in metrics.items():
        mlflow.log_metric(key, value)

    # ROC curve: work on the display's own figure/axes rather than the
    # implicit "current figure" so an unrelated open figure is never touched.
    roc_display = RocCurveDisplay.from_estimator(
        log_reg_pipeline, X_test, y_test
    )
    roc_display.ax_.set_title("ROC Curve - Logistic Regression")
    roc_display.figure_.savefig("roc_logistic.png")
    mlflow.log_artifact("roc_logistic.png")
    plt.close(roc_display.figure_)

    # Log model
    mlflow.sklearn.log_model(
        log_reg_pipeline,
        artifact_path="model"
    )




In [80]:
# Track one MLflow run for the random-forest pipeline: parameters,
# test-set metrics, the ROC curve image and the fitted pipeline itself.
with mlflow.start_run(run_name="Random Forest"):
    # Log parameters. n_estimators is read from the estimator inside the
    # pipeline so the recorded value cannot drift from the one actually used.
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param(
        "n_estimators", rf_pipeline.named_steps["model"].n_estimators
    )

    # Train model
    rf_pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split
    metrics, y_prob = evaluate_model(
        rf_pipeline, X_test, y_test
    )

    # Log metrics
    for key, value in metrics.items():
        mlflow.log_metric(key, value)

    # ROC curve: work on the display's own figure/axes rather than the
    # implicit "current figure" so an unrelated open figure is never touched.
    roc_display = RocCurveDisplay.from_estimator(
        rf_pipeline, X_test, y_test
    )
    roc_display.ax_.set_title("ROC Curve - Random Forest")
    roc_display.figure_.savefig("roc_rf.png")
    mlflow.log_artifact("roc_rf.png")
    plt.close(roc_display.figure_)

    # Log model
    mlflow.sklearn.log_model(
        rf_pipeline,
        artifact_path="model"
    )




In [81]:
# Look up the experiment by name and display its metadata.
experiment = mlflow.get_experiment_by_name("Heart Disease Classification")
experiment

<Experiment: artifact_location='file:///C:\\Users\\SESA661370\\Documents\\MLOps/mlruns/557730518286305649', creation_time=1766995521113, experiment_id='557730518286305649', last_update_time=1766995521113, lifecycle_stage='active', name='Heart Disease Classification', tags={}>