In [1]:
# MLflow, Quick Start

In [2]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.16.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.16.0 (from mlflow)
  Downloading mlflow_skinny-2.16.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.16.0->mlflow)
  Downloading databricks_sdk-0.32.0-py3-none-any.whl.metadata (37 kB)
Collecting gitpython<4,>=3.1.9 (from mlflow-skinny==2.16.0->mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==2.16.0->mlflow)
  Downloading opentelemetry_api-1.2

In [None]:
mlflow server --host 127.0.0.1 --port 8080

In [3]:
# Import sklearn library
from sklearn.linear_model import Ridge


# Define parameters
params = {
    "alpha": 1.0,
    "fit_intercept": True,
    "solver": "auto",
}

# Instantiate the Ridge model with parameters
ridge = Ridge(**params)

# Train the model
ridge.fit(X_train, y_train)

# Predict
predictions = ridge.predict(X_test)

# Evaluate your model
mae, mse, rmse, r_squared = evaluation(y_test, predictions)

NameError: name 'X_train' is not defined

In [None]:
# Import the library
import mlflow

# Connect to the MLflow server (in this case, we are using our own computer)
mlflow.set_tracking_uri(uri="http://localhost:8080")

# Set the tracking experiment (in this case, House Prices is going to be our experiment name)
mlflow.set_experiment("House Prices")

In [None]:
# Start an MLflow run
with mlflow.start_run(run_name=run_name):

  ####### The format has to be json #######
  ''' Selected data can be uploaded at the same time'''

  # Log the hyperparameters
  mlflow.log_params(params)

  # Log the loss metric
  mlflow.log_metrics(metric_eval)


  ####### The format has to be int or float #######
  ''' Selected data can be uploaded one by one'''

  # Log a hyperparameter
  mlflow.log_param("lr", 0.001)

  # Log a loss metric
  mlflow.log_metric("val_loss", val_loss)


  ####### Log the model #######

  # Infer the model signature
  signature = infer_signature(X_train, model.predict(X_train))

  # Log the model
  model_info = mlflow.sklearn.log_model( # Depends on the framework -> mlflow.pytorch, mlflow.spark...
      sk_model=model,
      artifact_path="hose-price",
      signature=signature,
      input_example=X_train,
  )


  # Set a tag that we can use to remind ourselves what this run was for
  mlflow.set_tag("Training Info", "Basic Ridge model for house prices")

In [None]:
mlflow.enable_system_metrics_logging()

In [None]:
################ log image ######################

# Read the image file back into a variable
correlation_matrix_image = Image.open('correlation_matrix.png')

# Log the correlation matrix image as an artifact
mlflow.log_image(correlation_matrix_image, "correlation_matrix.png")


############### log dataset #####################

dataset_train = mlflow.data.from_pandas(X_train, "training_data")
mlflow.log_input(dataset_train, context="training")

In [None]:
# optuna
def register_model_mlflow(run_name, params, model, X_train, X_test, y_train, y_test):
    """
    Registers a trained machine learning model and its associated artifacts with MLflow.

    Parameters:
        run_name (str): Name of the MLflow run.
        params (dict): Hyperparameters used for training the model.
        model (sklearn.base.BaseEstimator): Trained machine learning model.
        X_train (pandas.DataFrame): Features of the training dataset.
        X_test (pandas.DataFrame): Features of the testing dataset.
        y_train (pandas.Series): Target variable
	 of the training dataset.
        y_test (pandas.Series): Target variable of the testing dataset.

    Returns:
        tuple: Trained model instance and evaluation metrics.

    """
    with mlflow.start_run(run_name=run_name):
        # Instantiate the model with specified hyperparameters
        model_instance = model(**params)
        model_instance.fit(X_train, y_train)
        predictions = model_instance.predict(X_test)

        # Evaluate the model
        mae, mse, rmse, r_squared = evaluation(y_test, predictions)

        # Log predictions as a table
        prediction_table = X_test.copy()
        prediction_table["ground_truth"] = y_test
        prediction_table["predictions"] = predictions
        mlflow.log_table(data=prediction_table, artifact_file="predictions.csv")

        # Log evaluation metrics
        metric_eval = {"MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared}
        mlflow.log_metrics(metric_eval)

        # Log hyperparameters
        mlflow.log_params(params)

        # Set a tag to describe the training
        mlflow.set_tag("Training Info", "Basic Ridge model for house prices")

        # Log the trained model
        signature = infer_signature(X_train, model_instance.predict(X_train))
        model_artifact_path = "ridge_model"
        mlflow.sklearn.log_model(
            sk_model=model_instance,
            artifact_path=model_artifact_path,
            signature=signature,
            input_example=X_train,
        )

        # Log an image as an artifact
        correlation_matrix_image = Image.open('correlation_matrix.png')
        mlflow.log_image(correlation_matrix_image, "correlation_matrix.png")

        # Log the datasets
        train_dataset = mlflow.data.from_pandas(X_train, "training_data")
        mlflow.log_input(train_dataset, context="training")
        test_dataset = mlflow.data.from_pandas(X_test, "test_data")
        mlflow.log_input(test_dataset, context="test")

        return model_instance, metric_eval

In [None]:
# hypertuning
def objective(trial):
    """
    Objective function for hyperparameter optimization.

    Args:
        trial (optuna.trial.Trial): A single optimization trial.

    Returns:
        float: Root mean squared error (RMSE) metric for the Ridge regression model.
    """
    # Define hyperparameters to be optimized
    params = {
        "alpha": trial.suggest_float('alpha', 1e-10, 1, log=True),
        "fit_intercept": trial.suggest_categorical('fit_intercept', [True, False]),
        "solver": trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])
    }

    # Generate a unique identifier for the run
    new_uuid = uuid.uuid4()

    # Register the model and evaluate its performance
    model, metric_eval = register_model_mlflow(f"Ridge_{new_uuid}", params, Ridge, X_train, X_test, y_train, y_test)

    # Select the metric to be maximized (RMSE)
    rmse = metric_eval["RMSE"]

    return rmse

In [None]:
# Create a study object and optimize the objective function.
study = optuna.create_study(direction='minimize') # Minimaize or maximize, depends on the metric
study.optimize(objective, n_trials=50) # You have to specify how much trials you want to test

In [None]:
import mlflow
import mlflow.pytorch
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient

# Set the MLflow tracking URI
mlflow.set_tracking_uri("http://localhost:8080")

# Specify the experiment name
EXPERIMENT_NAME = "House Prices"
mlflow.set_experiment(EXPERIMENT_NAME)

# Initialize the MLflow client
client = MlflowClient()

# Name of the model to be used
MODEL_NAME = 'tracking-quickstart'

# Search for the latest version of the model in the model registry
max_version = 0
for mv in client.search_model_versions(f"name='{MODEL_NAME}'"):
    current_stage = dict(mv)['aliases']
    if current_stage == ['champion']: # champion is the tag we selected for deploy the model
        model_deploy = mv

# Download the model
model = mlflow.sklearn.load_model(model_deploy.source, dst_path=None)

# Use the model
model.predict(X_test[:1])