# Install libraries

In [None]:
# Install the dependencies listed in requirements.txt for this notebook.
# %pip (rather than !pip) targets the environment of the running kernel.
%pip install -r requirements.txt

# Please fill in the following cell

In [None]:
# Fill in your nickname (lowercase, no special characters).
# It prefixes every resource name defined in this cell.
NICKNAME = ""

ENDPOINT_NAME = f"{NICKNAME}-api-endpoint"
EXPERIMENT_NAME = f"{NICKNAME}-HeartConditionClassifier-experiment"

# One entry per model trained in this notebook: the name it is registered
# under and a short label for its type.
MODELS = {}
MODELS["xgboost"] = {
    "name": f"{NICKNAME}-HeartConditionClassifier-model-XGBoost",
    "type": "XGBoost",
}
MODELS["lr"] = {
    "name": f"{NICKNAME}-HeartConditionClassifier-model-LogisticRegression",
    "type": "LogisticRegression",
}

# 1. Load data (heart dataset)

In [None]:
import pandas as pd

# Read the heart dataset as CSV directly from its raw GitHub URL
HEART_CSV_URL = "https://raw.githubusercontent.com/HooverCz/ML-API/dev/heart.csv"
df = pd.read_csv(HEART_CSV_URL)

# Report the (rows, columns) shape, then preview the first rows as the cell output
print(f"Shape of DF: {df.shape}")
df.head()

# 2. Train simple ML model

### Step 1: Importing Libraries

We start by importing necessary libraries for data preprocessing, model selection, and evaluation. These include numpy for numerical operations, scikit-learn for machine learning tasks, and XGBoost for gradient boosting.

### Step 2: Data Splitting

Next, we split our dataset into training and testing sets using the `train_test_split()` function from scikit-learn. This step is crucial to evaluate the model's performance on unseen data.

### Step 3: Preprocessing

We define transformers for categorical and continuous columns and combine them using `ColumnTransformer`. 

- **Categorical Preprocessing:** Categorical columns are one-hot encoded using the `OneHotEncoder` transformer to convert them into a numerical format. One-hot encoding is essential for algorithms that require numerical inputs, as it creates a binary column per category; here the encoder is configured with `drop='first'`, so the first category of each column is dropped and acts as the baseline.

- **Continuous Preprocessing:** Continuous columns are scaled using `RobustScaler`, which centers each feature on its median and scales it by its interquartile range. This makes the scaling less sensitive to outliers and brings the features onto comparable scales, preventing certain features from dominating the model training process.

### Step 4: Pipeline Construction

We construct pipelines for logistic regression and XGBoost classifiers, combining preprocessing with the respective classifiers. Pipelines allow us to chain together multiple processing steps into a single object, making it easier to manage and reproduce.

### Step 5: Hyperparameter Tuning

We define a hyperparameter grid for grid search to find the best hyperparameters for logistic regression. Grid search is a technique used to find the optimal combination of hyperparameters for a given model, improving its performance.

This pipeline setup enables efficient model training and hyperparameter tuning for classification tasks.




In [None]:
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible.
# 'output' is the target column, everything else is used as a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['output'], axis=1),
    df['output'],
    test_size=0.2,
    random_state=42)


# Columns to one-hot encode (categorical) and columns to scale (continuous)
cat_cols = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
con_cols = ["age", "trtbps", "chol", "thalachh", "oldpeak"]

# Per-column-type preprocessing steps
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

continuous_transformer = Pipeline(steps=[
    ('scaler', RobustScaler())
])

# Unfitted template that routes each column group to its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, cat_cols),
        ('con', continuous_transformer, con_cols)
    ])

# Each pipeline gets its own clone of the preprocessor. Sharing one instance
# would mean that fitting one pipeline silently overwrites the fitted
# preprocessing state used by the other.
# The final step names ('clf' / 'model') are referenced elsewhere in the
# notebook (the `clf__*` grid keys below, `get_params()["model"]` later), so
# they are kept as they are.
lr_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                              ('clf', LogisticRegression())])

xgboost_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                   ('model', XGBClassifier(use_label_encoder=False, eval_metric="logloss"))])


# Hyperparameter grid for the logistic regression step ('clf'):
# both penalties, 20 log-spaced values of C, liblinear solver
param_grid = {
    'clf__penalty' : ['l1', 'l2'],
    'clf__C' : np.logspace(-4, 4, 20),
    'clf__solver' : ['liblinear']
    }

# 5-fold cross-validated search scored on accuracy, using all available cores
lr_grid_search = GridSearchCV(lr_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

## 2.1. Train logistic regression model

### Step 1: Fit Grid Search to Data

The grid search is fitted to the training data inside an MLflow run, so the experiment is tracked. This allows us to efficiently search for the best hyperparameters for the logistic regression model.

### Step 2: Get the Best Model

The best model from the grid search is obtained using the `best_estimator_` attribute.

### Step 3: Prediction

Using the best model, predictions are made on the test data (`X_test`). Both class predictions (`y_pred`) and predicted probabilities (`y_pred_proba`) are calculated.

### Step 4: Calculate Test Metrics

Several test metrics such as accuracy, precision, recall, and F1-score are calculated using scikit-learn metrics functions. These metrics help evaluate the performance of the model on the test set.

### Step 5: Generate ROC Curve

An ROC curve is generated to visualize the trade-off between true positive rate (sensitivity) and false positive rate (1-specificity) of the model. This curve helps assess the model's predictive performance across different probability thresholds.

### Step 6: Log Parameters and Metrics to MLflow

The best hyperparameters and test metrics are logged to MLflow. Additionally, the ROC curve plot is saved as an artifact and logged to MLflow for later reference.

### Step 7: Register the Final Model

The final model is logged to MLflow together with an input example and an inferred signature, and is then registered in the model registry with the run's tags and test metrics attached as tags. This allows for easy model reproducibility and deployment in the future.

### Step 8: Print Test Accuracy Score

Finally, the test accuracy score of the logistic regression model is printed for easy reference.

This comprehensive process enables efficient model training, evaluation, and tracking using MLflow for seamless experimentation and reproducibility.


In [None]:
import mlflow
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve
import matplotlib.pyplot as plt


mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)
with mlflow.start_run(run_name=MODELS["lr"]["type"]) as run:
    lr_model_name = MODELS["lr"]["name"]

    # Run the cross-validated grid search on the training split and keep the winner
    lr_grid_search.fit(X_train, y_train)
    best_model = lr_grid_search.best_estimator_

    # Class predictions and class probabilities on the held-out test split
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)

    # Test metrics, computed from one table of scorer functions
    scorers = {
        "test_accuracy": accuracy_score,
        "test_precision": precision_score,
        "test_recall": recall_score,
        "test_f1": f1_score,
    }
    metrics = {metric_name: scorer(y_test, y_pred) for metric_name, scorer in scorers.items()}

    tags = {
        "owner": NICKNAME,
        "project": "heart-condition-classification",
        "business_unit": "18AAD",
        "model_type": MODELS["lr"]["type"],
    }

    # ROC curve from the probability of the positive class (second column)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1])
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot([0, 1], [0, 1], 'k--')  # diagonal 50% reference line
    ax.plot(fpr, tpr)               # FPR/TPR achieved by our model
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    fig.savefig("ROC-Curve.png")

    # Record the winning hyperparameters, metrics, ROC plot and tags on the run
    mlflow.log_params(lr_grid_search.best_params_)
    mlflow.log_metrics(metrics)
    mlflow.log_artifact("ROC-Curve.png")
    mlflow.set_tags(tags)

    # Signature inferred from the training inputs and the model's predictions on them
    signature = mlflow.models.infer_signature(X_train, best_model.predict(X_train))

    # Log the fitted pipeline as a run artifact, then register it from that run
    mlflow.sklearn.log_model(
        best_model,
        artifact_path=lr_model_name,
        input_example=X_train.iloc[:1],
        signature=signature,
    )
    mlflow.register_model(
        model_uri=f"runs:/{run.info.run_id}/{lr_model_name}",
        name=lr_model_name,
        tags={**tags, **metrics},
    )

    print(f"The test accuracy score of {MODELS['lr']['name']} is {metrics['test_accuracy']}")


## 2.2. Train XGBoost classifier model

In [None]:
import mlflow
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve
import matplotlib.pyplot as plt


mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)
with mlflow.start_run(run_name=MODELS["xgboost"]["type"]) as run:
    xgboost_model_name = MODELS["xgboost"]["name"]

    # Fit the XGBoost pipeline directly on the training split (no grid search here)
    xgboost_pipeline.fit(X_train, y_train)

    # Class predictions and class probabilities on the held-out test split
    y_pred = xgboost_pipeline.predict(X_test)
    y_pred_proba = xgboost_pipeline.predict_proba(X_test)

    # Test metrics, computed from one table of scorer functions
    scorers = {
        "test_accuracy": accuracy_score,
        "test_precision": precision_score,
        "test_recall": recall_score,
        "test_f1": f1_score,
    }
    metrics = {metric_name: scorer(y_test, y_pred) for metric_name, scorer in scorers.items()}
    print(metrics)

    tags = {
        "owner": NICKNAME,
        "project": "heart-condition-classification",
        "business_unit": "18AAD",
        "model_type": MODELS["xgboost"]["type"],
    }

    # ROC curve from the probability of the positive class (second column)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1])
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot([0, 1], [0, 1], 'k--')  # diagonal 50% reference line
    ax.plot(fpr, tpr)               # FPR/TPR achieved by our model
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    fig.savefig("ROC-Curve.png")

    # Record the classifier step's XGBoost parameters, metrics, ROC plot and tags on the run
    xgb_classifier = xgboost_pipeline.named_steps["model"]
    mlflow.log_params(xgb_classifier.get_xgb_params())
    mlflow.log_metrics(metrics)
    mlflow.log_artifact("ROC-Curve.png")
    mlflow.set_tags(tags)

    # Signature inferred from the training inputs and the pipeline's predictions on them
    signature = mlflow.models.infer_signature(X_train, xgboost_pipeline.predict(X_train))

    # Log the fitted pipeline as a run artifact, then register it from that run
    mlflow.sklearn.log_model(
        xgboost_pipeline,
        artifact_path=xgboost_model_name,
        input_example=X_train.iloc[:1],
        signature=signature,
    )
    mlflow.register_model(
        model_uri=f"runs:/{run.info.run_id}/{xgboost_model_name}",
        name=xgboost_model_name,
        tags={**tags, **metrics},
    )

    print(f"The test accuracy score of {MODELS['xgboost']['type']} is {metrics['test_accuracy']}")


## 2.3. MLflow autolog

In [None]:
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

autolog_run_name = f'{MODELS["lr"]["type"]}-autolog'
with mlflow.start_run(run_name=autolog_run_name):
    # Enable MLflow's scikit-learn autologging before fitting;
    # this cell makes no explicit log_* calls of its own.
    mlflow.sklearn.autolog()
    lr_grid_search.fit(X_train, y_train)

    # Keep the best estimator found by the search and predict on the test split
    best_model = lr_grid_search.best_estimator_
    y_pred = best_model.predict(X_test)


In [None]:
from mlflow.tracking.client import MlflowClient

mlflow_client = MlflowClient()
model_name = MODELS['lr']['name']

# Print every version the registry returns for the logistic-regression model name
lr_model_versions = mlflow_client.search_model_versions(f"name = '{model_name}'")
for version_info in lr_model_versions:
    print(version_info)

In [None]:
# Look up this notebook's experiment by name and list its runs.
# `experiment_ids` is documented as a list of IDs, so the single ID is wrapped
# in a list rather than passed as a bare string.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
mlflow_client.search_runs(experiment_ids=[experiment.experiment_id])

# 3. Batch Inference

In [None]:

# Fetch the registered versions of the model and pick the highest version number
# explicitly, rather than relying on the backend returning the newest one first.
registered_versions = mlflow_client.search_model_versions(f"name = '{model_name}'")
if not registered_versions:
    # Fail with a clear message instead of an IndexError when nothing is registered yet
    raise ValueError(f"No registered versions found for model: {model_name}")

# Versions are compared numerically; the selected value keeps its original type
model_version = max(registered_versions, key=lambda mv: int(mv.version)).version
print(f"Newest version for model: {model_name} is version: {model_version}")

# Load that version from the model registry as a scikit-learn model
model = mlflow.sklearn.load_model(model_uri=f"models:/{model_name}/{model_version}")
type(model)

In [None]:
# Batch-score the held-out test features with the model loaded from the registry
model.predict(X_test)

# 4. Real-time (online) inference

## 4.1. Create API inference endpoint

In [None]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

import json
import requests
import mlflow

In [None]:
# Build an MLClient for working with the Azure ML workspace from code.
# The workspace comes from the local config; authentication uses DefaultAzureCredential.
azure_credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=azure_credential)
ml_client

In [None]:
from mlflow.deployments import get_deploy_client

# Create a deployment client for the target given by the current MLflow tracking URI
tracking_uri = mlflow.get_tracking_uri()
deployment_client = get_deploy_client(tracking_uri)

# Show the client's attributes as the cell output
vars(deployment_client)

In [None]:
endpoint_config_path = "endpoint_config.json"

# Endpoint settings: key-based auth and a system-assigned identity
endpoint_config = {
    "auth_mode": "key",
    "identity": {"type": "system_assigned"},
}

# Persist the settings as JSON so they can be passed by file path
with open(endpoint_config_path, "w") as config_file:
    json.dump(endpoint_config, config_file)

In [None]:
# Create an empty endpoint (no model deployed to it yet), configured from the JSON file
endpoint_create_config = {"endpoint-config-file": endpoint_config_path}
endpoint = deployment_client.create_endpoint(
    name=ENDPOINT_NAME,
    config=endpoint_create_config,
)

endpoint

In [None]:
# Deployment settings: the compute instance type and how many instances to run
deployment_name = "default"
deployment_config_path = "deployment_config.json"

deploy_config = {"instance_type": "Standard_DS2_v2", "instance_count": 1}

# Persist the settings as JSON so they can be passed by file path
with open(deployment_config_path, "w") as config_file:
    json.dump(deploy_config, config_file)

In [None]:
# Deploy the selected registered model version to the endpoint (takes between 6-10 minutes)
registered_model_uri = f"models:/{model_name}/{model_version}"
deployment_create_config = {"deploy-config-file": deployment_config_path}

deployment = deployment_client.create_deployment(
    name=deployment_name,
    endpoint=ENDPOINT_NAME,
    model_uri=registered_model_uri,
    config=deployment_create_config,
)

In [None]:
# Route 100 % of the endpoint's traffic to our model deployment
traffic_config_path = "traffic_config.json"
traffic_config = {"traffic": {deployment_name: 100}}

# Persist the traffic split as JSON so it can be passed by file path
with open(traffic_config_path, "w") as config_file:
    json.dump(traffic_config, config_file)

endpoint_update_config = {"endpoint-config-file": traffic_config_path}
deployment_client.update_endpoint(
    endpoint=ENDPOINT_NAME,
    config=endpoint_update_config,
)

In [None]:
import os

import requests


# Get URL and credentials to the endpoint

scoring_uri = deployment_client.get_endpoint(endpoint=ENDPOINT_NAME)["properties"]["scoringUri"]
print(scoring_uri)

# Either paste the key below or, to keep the secret out of the saved notebook,
# export it as the ENDPOINT_SECRET_KEY environment variable before starting the kernel.
endpoint_secret_key = ""  # <- please fill; navigate to Endpoints (left menu) -> Select your endpoint -> Consume
endpoint_secret_key = endpoint_secret_key or os.environ.get("ENDPOINT_SECRET_KEY", "")
if not endpoint_secret_key:
    # Fail early with a clear message instead of sending an unauthenticated request
    raise ValueError(
        "endpoint_secret_key is empty: fill it in above or set the ENDPOINT_SECRET_KEY environment variable"
    )

# Make API call to the endpoint

headers = {
    "Content-Type": "application/json",
    "Authorization": ("Bearer " + endpoint_secret_key),
    # Target the deployment created earlier instead of repeating its name as a literal
    "azureml-model-deployment": deployment_name,
}

# Five test rows in pandas "split" orientation; the fixed seed makes the request reproducible
sample_data = X_test.sample(5, random_state=42).to_json(orient="split", index=False)

sample_request = {
    "input_data": json.loads(sample_data)
}

print(f"Sample request: {sample_request}")

# The timeout stops the cell from hanging on an unresponsive endpoint.
# raise_for_status() surfaces HTTP errors (e.g. a wrong key) before the body is parsed.
response = requests.post(scoring_uri, json=sample_request, headers=headers, timeout=60)
response.raise_for_status()


print(f"Response: {response.json()}")