In [0]:
# Import packages
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import mlflow
from math import sqrt
from mlflow.client import MlflowClient
from datetime import datetime as dt

# Make "/mlflow_sdk_test" the active experiment for the runs started in this notebook
mlflow.set_experiment(experiment_name="/mlflow_sdk_test")

# Tracking client handle for lower-level MLflow API calls
client = MlflowClient()



In [0]:
# Fetch scikit-learn's bundled diabetes dataset and wrap its feature matrix in a
# DataFrame for inspection (no column names are supplied, so pandas numbers them)
diabetes = datasets.load_diabetes()
diabetespd = pd.DataFrame(diabetes.data)

In [0]:
# Preview the first five rows of the feature table
diabetespd.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [0]:
# Train a Ridge model inside a fresh MLflow run

# Close any run that may still be active in this kernel
mlflow.end_run()

with mlflow.start_run() as run:
    # Autologging stores the fitted model, its parameters, requirements, etc.
    mlflow.autolog(log_models=True)

    print(run.info.run_id)

    diabetes_X, diabetes_y = diabetes.data, diabetes.target

    # Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
    diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
        diabetes_X, diabetes_y, test_size=0.25, random_state=42
    )

    alpha, solver = 0.9, "cholesky"
    regr = linear_model.Ridge(alpha=alpha, solver=solver)
    regr.fit(diabetes_X_train, diabetes_y_train)

    diabetes_y_pred = regr.predict(diabetes_X_test)

    # Record the test-set errors explicitly. The metric function is called
    # separately for each logged value.
    mlflow.log_metric(
        key="mse", value=mean_squared_error(diabetes_y_test, diabetes_y_pred)
    )
    mlflow.log_metric(
        key="rmse", value=sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
    )

2023/02/15 14:52:40 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/02/15 14:52:40 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2023/02/15 14:52:41 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.
a517d3dce91041a49fdf34efd41ae772


In [0]:
# Re-open the run created in the previous cell and attach one more metric to it.
# The run is addressed through `run.info.run_id` instead of a pasted ID, so the
# R² score (computed from that cell's test split and predictions) always lands
# on the run that produced it, including after Restart & Run All.
with mlflow.start_run(run_id=run.info.run_id) as run:
    mlflow.log_metric("r2", r2_score(diabetes_y_test, diabetes_y_pred))

In [0]:
# Select the experiment again, this time keeping the returned Experiment object
# so that its ID can be read
my_experiment = mlflow.set_experiment(experiment_name="/mlflow_sdk_test")
print(my_experiment.experiment_id)

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/386932948801159', experiment_id='386932948801159', lifecycle_stage='active', name='/mlflow_sdk_test', tags={'mlflow.experiment.sourceName': '/mlflow_sdk_test',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'matt.collins@coeo.com',
 'mlflow.ownerId': '4719763231603709'}>


In [0]:
# Start a run in an explicitly chosen experiment

# End any existing runs
mlflow.end_run()

# `my_experiment` is the Experiment object returned by mlflow.set_experiment(...)
# earlier in the notebook; its ID selects the experiment this run is created in.
# Binding the run with `as run` makes the printed ID belong to *this* run rather
# than to whichever run an earlier cell left in the `run` variable.
with mlflow.start_run(experiment_id=my_experiment.experiment_id) as run:
    # Turn autolog on to save model artifacts, requirements, etc.
    mlflow.autolog(log_models=True)

    print(run.info.run_id)

    diabetes_X = diabetes.data
    diabetes_y = diabetes.target

    # Split data into test training sets, 3:1 ratio
    (
        diabetes_X_train,
        diabetes_X_test,
        diabetes_y_train,
        diabetes_y_test,
    ) = train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)

    alpha = 0.8
    solver = "cholesky"
    regr = linear_model.Ridge(alpha=alpha, solver=solver)

    regr.fit(diabetes_X_train, diabetes_y_train)

    diabetes_y_pred = regr.predict(diabetes_X_test)

    # Log desired metrics
    mlflow.log_metric("mse", mean_squared_error(diabetes_y_test, diabetes_y_pred))
    mlflow.log_metric(
        "rmse", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
    )
    mlflow.log_metric("r2", r2_score(diabetes_y_test, diabetes_y_pred))

2023/02/11 22:26:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/02/11 22:26:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2023/02/11 22:26:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.
3fcf403e1566422493cd6e625693829d


In [0]:
# Train a Ridge model inside an explicitly named MLflow run

# Close any run that may still be active in this kernel
mlflow.end_run()

# The run name carries the current timestamp
today = dt.today()
run_name = f"Ridge Regression {today}"

with mlflow.start_run(run_name=run_name) as run:
    # Autologging stores the fitted model, its parameters, requirements, etc.
    mlflow.autolog(log_models=True)

    print(run.info.run_id)

    diabetes_X, diabetes_y = diabetes.data, diabetes.target

    # Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
    diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
        diabetes_X, diabetes_y, test_size=0.25, random_state=42
    )

    alpha, solver = 0.5, "cholesky"
    regr = linear_model.Ridge(alpha=alpha, solver=solver)
    regr.fit(diabetes_X_train, diabetes_y_train)

    diabetes_y_pred = regr.predict(diabetes_X_test)

    # Record the test-set metrics explicitly. The MSE function is called
    # separately for each logged value.
    mlflow.log_metric(
        key="mse", value=mean_squared_error(diabetes_y_test, diabetes_y_pred)
    )
    mlflow.log_metric(
        key="rmse", value=sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
    )
    mlflow.log_metric(key="r2", value=r2_score(diabetes_y_test, diabetes_y_pred))

2023/02/11 22:30:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/02/11 22:30:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2023/02/11 22:30:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.
08adf2e12e3f49f4bc8235f3e373a81f


In [0]:
# Create a parent run and train a first child run nested beneath it

# Close any run that may still be active in this kernel
mlflow.end_run()

# Name given to the parent run
run_name = "Ridge Regression Nested"

with mlflow.start_run(run_name=run_name) as parent_run:
    print(parent_run.info.run_id)

    with mlflow.start_run(run_name="Child Run: alpha 0.1", nested=True):
        # Autologging stores the fitted model, its parameters, requirements, etc.
        mlflow.autolog(log_models=True)

        diabetes_X, diabetes_y = diabetes.data, diabetes.target

        # Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
        diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
            diabetes_X, diabetes_y, test_size=0.25, random_state=42
        )

        alpha, solver = 0.1, "cholesky"
        regr = linear_model.Ridge(alpha=alpha, solver=solver)
        regr.fit(diabetes_X_train, diabetes_y_train)

        diabetes_y_pred = regr.predict(diabetes_X_test)

        # Record the test-set metrics explicitly. The MSE function is called
        # separately for each logged value.
        mlflow.log_metric(
            key="mse", value=mean_squared_error(diabetes_y_test, diabetes_y_pred)
        )
        mlflow.log_metric(
            key="rmse", value=sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
        )
        mlflow.log_metric(key="r2", value=r2_score(diabetes_y_test, diabetes_y_pred))

In [0]:
# Add a second child run under the existing "Ridge Regression Nested" parent

# End any existing runs
mlflow.end_run()

# Resume the parent by the ID of the run object left in `parent_run` by the cell
# that created it, instead of a pasted literal, so the child attaches to the
# parent from this session. No run name is passed because the run is looked up
# by ID.
parent_run_id = parent_run.info.run_id

with mlflow.start_run(run_id=parent_run_id) as parent_run:
    print(parent_run.info.run_id)
    with mlflow.start_run(run_name="Child Run: alpha 0.2", nested=True):
        # Turn autolog on to save model artifacts, requirements, etc.
        mlflow.autolog(log_models=True)

        diabetes_X = diabetes.data
        diabetes_y = diabetes.target

        # Split data into test training sets, 3:1 ratio
        (
            diabetes_X_train,
            diabetes_X_test,
            diabetes_y_train,
            diabetes_y_test,
        ) = train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)

        alpha = 0.2
        solver = "cholesky"
        regr = linear_model.Ridge(alpha=alpha, solver=solver)

        regr.fit(diabetes_X_train, diabetes_y_train)

        diabetes_y_pred = regr.predict(diabetes_X_test)

        # Log desired metrics
        mlflow.log_metric("mse", mean_squared_error(diabetes_y_test, diabetes_y_pred))
        mlflow.log_metric(
            "rmse", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
        )
        mlflow.log_metric("r2", r2_score(diabetes_y_test, diabetes_y_pred))

2023/02/12 18:41:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/02/12 18:41:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2023/02/12 18:41:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.


In [0]:
# Collect the runs of the *current* experiment into a DataFrame, newest first
df = mlflow.search_runs(order_by=["start_time DESC"])

# Show which columns are available
print(df.columns.tolist())

['run_id', 'experiment_id', 'status', 'artifact_uri', 'start_time', 'end_time', 'metrics.training_mae', 'metrics.training_r2_score', 'metrics.training_rmse', 'metrics.mean_squared_error_diabetes_X_test', 'metrics.rmse', 'metrics.training_mse', 'metrics.mean_squared_error-2_diabetes_X_test', 'metrics.training_score', 'metrics.mse', 'metrics.r2', 'metrics.r2_score_diabetes_X_test', 'params.selection', 'params.random_state', 'params.tol', 'params.copy_X', 'params.warm_start', 'params.normalize', 'params.positive', 'params.precompute', 'params.alpha', 'params.fit_intercept', 'params.max_iter', 'params.n_jobs', 'params.n_iter', 'params.alpha_1', 'params.lambda_init', 'params.verbose', 'params.compute_score', 'params.lambda_2', 'params.lambda_1', 'params.alpha_2', 'params.alpha_init', 'params.solver', 'tags.mlflow.databricks.cluster.id', 'tags.mlflow.databricks.notebookRevisionID', 'tags.mlflow.databricks.workspaceID', 'tags.mlflow.source.name', 'tags.mlflow.databricks.notebookPath', 'tags.m

In [0]:
# Work on an independent copy holding only a subset of the run columns
selected_columns = [
    "run_id",
    "experiment_id",
    "status",
    "start_time",
    "metrics.mse",
    "tags.mlflow.source.type",
    "tags.estimator_name",
    "tags.mlflow.rootRunId",
]
runs_df = df.loc[:, selected_columns].copy()
runs_df.head()

Unnamed: 0,run_id,experiment_id,status,start_time,metrics.mse,tags.mlflow.source.type,tags.estimator_name,tags.mlflow.rootRunId
0,a517d3dce91041a49fdf34efd41ae772,386932948801159,FINISHED,2023-02-15 14:52:40.757000+00:00,2817.598955,NOTEBOOK,Lasso,
1,792d316d13a14507a8db2451047f14d3,386932948801159,FINISHED,2023-02-15 14:52:33.633000+00:00,2769.32368,NOTEBOOK,Lasso,
2,670cb457254f4da5ae328932855550c8,386932948801159,FINISHED,2023-02-15 14:52:25.424000+00:00,2753.918884,NOTEBOOK,Lasso,
3,6b642e794f6546e7b8e3c8e6e7080fcc,386932948801159,FINISHED,2023-02-14 23:33:50.705000+00:00,2848.295308,NOTEBOOK,LinearRegression,
4,94e597abc0854c14adafb80447b2e1a4,386932948801159,FINISHED,2023-02-14 23:33:38.894000+00:00,2826.186241,NOTEBOOK,BayesianRidge,


In [0]:
# Feature engineering to create some additional columns

# Split the start timestamp into separate date and time-of-day columns
runs_df["start_date"] = runs_df["start_time"].dt.date
runs_df["start_timestamp"] = runs_df["start_time"].dt.time

# Classify runs by comparing each run ID with its root-run tag:
#   parent -> the tag is present and equals the run's own ID
#   child  -> the tag is present and points at a different run
# `notna()` treats both None and NaN as "tag missing", so a run without the tag
# is never counted as a child, whichever missing-value marker the column holds.
root_run_id = runs_df["tags.mlflow.rootRunId"]
has_root = root_run_id.notna()
is_own_root = runs_df["run_id"].eq(root_run_id)

runs_df["is_nested_parent"] = (has_root & is_own_root).astype(int)
runs_df["is_nested_child"] = (has_root & ~is_own_root).astype(int)

runs_df[["run_id", "start_date", "start_timestamp", "is_nested_parent", "is_nested_child"]].head()

Unnamed: 0,run_id,start_date,start_timestamp,is_nested_parent,is_nested_child
0,a517d3dce91041a49fdf34efd41ae772,2023-02-15,14:52:40.757000,0,0
1,792d316d13a14507a8db2451047f14d3,2023-02-15,14:52:33.633000,0,0
2,670cb457254f4da5ae328932855550c8,2023-02-15,14:52:25.424000,0,0
3,6b642e794f6546e7b8e3c8e6e7080fcc,2023-02-14,23:33:50.705000,0,0
4,94e597abc0854c14adafb80447b2e1a4,2023-02-14,23:33:38.894000,0,0


In [0]:
# Number of runs started on each calendar day
runs_df.groupby("start_date")["run_id"].count().reset_index()

Unnamed: 0,start_date,run_id
0,2023-02-11,7
1,2023-02-12,4
2,2023-02-14,4
3,2023-02-15,3


In [0]:
# Number of runs recorded for each estimator type
runs_df.groupby("tags.estimator_name")["run_id"].count().reset_index()

Unnamed: 0,tags.estimator_name,run_id
0,BayesianRidge,2
1,Lasso,3
2,LinearRegression,1
3,Ridge,10
