In [0]:
# Import packages
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import mlflow
from math import sqrt
from mlflow.client import MlflowClient
from datetime import datetime as dt

# Point MLflow at the experiment in the shared experiments folder; every run
# started below is recorded under it unless a cell says otherwise.
EXPERIMENT_PATH = "/mlflow_sdk_test"
mlflow.set_experiment(EXPERIMENT_PATH)

# Tracking client handle, created once for the notebook.
client = MlflowClient()



In [0]:
# Fetch the scikit-learn diabetes dataset and mirror its feature matrix in a
# DataFrame (default integer column labels) for quick inspection.
diabetes = datasets.load_diabetes()
diabetespd = pd.DataFrame(diabetes.data)

In [0]:
# Preview the first five rows of the feature frame.
diabetespd.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [0]:
# Train a Ridge model (alpha=0.9) inside a new MLflow run in the current experiment.

# Close whatever run may still be active from an earlier cell.
mlflow.end_run()

with mlflow.start_run() as run:
    # Autologging stores model artifacts, requirements, etc. alongside the run.
    mlflow.autolog(log_models=True)

    print(run.info.run_id)

    diabetes_X, diabetes_y = diabetes.data, diabetes.target

    # 3:1 train/test split with a fixed seed.
    diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = (
        train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)
    )

    alpha, solver = 0.9, "cholesky"
    regr = linear_model.Ridge(alpha=alpha, solver=solver)
    regr.fit(diabetes_X_train, diabetes_y_train)

    diabetes_y_pred = regr.predict(diabetes_X_test)

    # Explicitly logged metrics. mean_squared_error is deliberately called once
    # per metric, exactly as before.
    # NOTE(review): with autologging on, each sklearn metric call appears to be
    # recorded as its own metric (see the `mean_squared_error-2_...` column in
    # the search_runs output further down) - confirm before merging the calls.
    test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
    mlflow.log_metric("mse", test_mse)
    test_rmse = sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
    mlflow.log_metric("rmse", test_rmse)

2023/02/15 14:52:40 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/02/15 14:52:40 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2023/02/15 14:52:41 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.
a517d3dce91041a49fdf34efd41ae772


In [0]:
# Re-open the run created in the previous cell and attach an extra metric to it.
# The run id is taken from the `run` object rather than pasted in as a literal:
# a hard-coded id only exists in the tracking server it was copied from, and it
# can silently attach this model's r2 to a different model's run.
with mlflow.start_run(run_id=run.info.run_id) as run:
    mlflow.log_metric("r2", r2_score(diabetes_y_test, diabetes_y_pred))

In [0]:
# set_experiment returns the Experiment object; keep it so later cells can
# read its id, and print that id here.
my_experiment = mlflow.set_experiment(experiment_name="/mlflow_sdk_test")
print(my_experiment.experiment_id)

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/386932948801159', experiment_id='386932948801159', lifecycle_stage='active', name='/mlflow_sdk_test', tags={'mlflow.experiment.sourceName': '/mlflow_sdk_test',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'matt.collins@coeo.com',
 'mlflow.ownerId': '4719763231603709'}>


In [0]:
# Train a Ridge model (alpha=0.8) in a run created under an explicitly chosen
# experiment id.

# End any existing runs
mlflow.end_run()

# The experiment id comes from `my_experiment` (set in the cell above); the
# previous bare name `experiment_id` was never defined in this notebook.
# `as run` binds this cell's own run so the printed id is the new run's id,
# not a stale one left over from an earlier cell.
with mlflow.start_run(experiment_id=my_experiment.experiment_id) as run:
    # Turn autolog on to save model artifacts, requirements, etc.
    mlflow.autolog(log_models=True)

    print(run.info.run_id)

    diabetes_X = diabetes.data
    diabetes_y = diabetes.target

    # Split data into test training sets, 3:1 ratio
    (
        diabetes_X_train,
        diabetes_X_test,
        diabetes_y_train,
        diabetes_y_test,
    ) = train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)

    alpha = 0.8
    solver = "cholesky"
    regr = linear_model.Ridge(alpha=alpha, solver=solver)

    regr.fit(diabetes_X_train, diabetes_y_train)

    diabetes_y_pred = regr.predict(diabetes_X_test)

    # Log desired metrics
    mlflow.log_metric("mse", mean_squared_error(diabetes_y_test, diabetes_y_pred))
    mlflow.log_metric(
        "rmse", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
    )
    mlflow.log_metric("r2", r2_score(diabetes_y_test, diabetes_y_pred))

2023/02/11 22:26:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/02/11 22:26:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2023/02/11 22:26:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.
3fcf403e1566422493cd6e625693829d


In [0]:
# Train a Ridge model (alpha=0.5) in a run with an explicit, timestamped name.

# Close whatever run may still be active from an earlier cell.
mlflow.end_run()

# Run name = fixed prefix + the current local date/time.
today = dt.today()
run_name = f"Ridge Regression {today}"

with mlflow.start_run(run_name=run_name) as run:
    # Autologging stores model artifacts, requirements, etc. alongside the run.
    mlflow.autolog(log_models=True)

    print(run.info.run_id)

    diabetes_X, diabetes_y = diabetes.data, diabetes.target

    # 3:1 train/test split with a fixed seed.
    split = train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)
    diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = split

    alpha, solver = 0.5, "cholesky"
    regr = linear_model.Ridge(alpha=alpha, solver=solver)
    regr.fit(diabetes_X_train, diabetes_y_train)

    diabetes_y_pred = regr.predict(diabetes_X_test)

    # Explicit metrics, computed and logged one at a time in the original order.
    mlflow.log_metric(
        key="mse", value=mean_squared_error(diabetes_y_test, diabetes_y_pred)
    )
    mlflow.log_metric(
        key="rmse", value=sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
    )
    mlflow.log_metric(key="r2", value=r2_score(diabetes_y_test, diabetes_y_pred))

2023/02/11 22:30:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/02/11 22:30:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2023/02/11 22:30:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.
08adf2e12e3f49f4bc8235f3e373a81f


In [0]:
# Create a named parent run and train a Ridge model (alpha=0.1) in a child run
# nested beneath it.

# Close whatever run may still be active from an earlier cell.
mlflow.end_run()

# Name of the parent run.
run_name = "Ridge Regression Nested"

with mlflow.start_run(run_name=run_name) as parent_run:
    print(parent_run.info.run_id)

    # nested=True opens the child while the parent run is still active.
    with mlflow.start_run(run_name="Child Run: alpha 0.1", nested=True):
        # Autologging stores model artifacts, requirements, etc. alongside the run.
        mlflow.autolog(log_models=True)

        diabetes_X, diabetes_y = diabetes.data, diabetes.target

        # 3:1 train/test split with a fixed seed.
        diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = (
            train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)
        )

        alpha, solver = 0.1, "cholesky"
        regr = linear_model.Ridge(alpha=alpha, solver=solver)
        regr.fit(diabetes_X_train, diabetes_y_train)

        diabetes_y_pred = regr.predict(diabetes_X_test)

        # Explicit metrics, computed and logged one at a time in the original order.
        child_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
        mlflow.log_metric("mse", child_mse)
        child_rmse = sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
        mlflow.log_metric("rmse", child_rmse)
        child_r2 = r2_score(diabetes_y_test, diabetes_y_pred)
        mlflow.log_metric("r2", child_r2)

In [0]:
# Add a second child run (alpha=0.2) under the parent run created in the
# previous cell.

# End any existing runs
mlflow.end_run()

# Resume the parent through the `parent_run` object from the previous cell
# instead of a pasted run-id literal, which only resolves against the tracking
# server and session it was copied from. The unused `run_name` assignment was
# dropped: this cell resumes an existing run by id and never passed a name.
with mlflow.start_run(run_id=parent_run.info.run_id) as parent_run:
    print(parent_run.info.run_id)
    with mlflow.start_run(run_name="Child Run: alpha 0.2", nested=True):
        # Turn autolog on to save model artifacts, requirements, etc.
        mlflow.autolog(log_models=True)

        diabetes_X = diabetes.data
        diabetes_y = diabetes.target

        # Split data into test training sets, 3:1 ratio
        (
            diabetes_X_train,
            diabetes_X_test,
            diabetes_y_train,
            diabetes_y_test,
        ) = train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)

        alpha = 0.2
        solver = "cholesky"
        regr = linear_model.Ridge(alpha=alpha, solver=solver)

        regr.fit(diabetes_X_train, diabetes_y_train)

        diabetes_y_pred = regr.predict(diabetes_X_test)

        # Log desired metrics
        mlflow.log_metric("mse", mean_squared_error(diabetes_y_test, diabetes_y_pred))
        mlflow.log_metric(
            "rmse", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
        )
        mlflow.log_metric("r2", r2_score(diabetes_y_test, diabetes_y_pred))

2023/02/12 18:41:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/02/12 18:41:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2023/02/12 18:41:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.


In [0]:
# Pull every run of the *current* experiment into a DataFrame, newest first.
newest_first = ["start_time DESC"]
df = mlflow.search_runs(order_by=newest_first)

# `list(df.columns)` shows every available metrics.* / params.* / tags.* column.

df.head()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.training_mae,metrics.training_r2_score,metrics.training_rmse,metrics.mean_squared_error_diabetes_X_test,metrics.rmse,metrics.training_mse,metrics.mean_squared_error-2_diabetes_X_test,metrics.training_score,metrics.mse,metrics.r2,metrics.r2_score_diabetes_X_test,params.selection,params.random_state,params.tol,params.copy_X,params.warm_start,params.normalize,params.positive,params.precompute,params.alpha,params.fit_intercept,params.max_iter,params.n_jobs,params.n_iter,params.alpha_1,params.lambda_init,params.verbose,params.compute_score,params.lambda_2,params.lambda_1,params.alpha_2,params.alpha_init,params.solver,tags.mlflow.databricks.cluster.id,tags.mlflow.databricks.notebookRevisionID,tags.mlflow.databricks.workspaceID,tags.mlflow.source.name,tags.mlflow.databricks.notebookPath,tags.mlflow.log-model.history,tags.mlflow.databricks.notebook.commandID,tags.mlflow.source.type,tags.mlflow.databricks.webappURL,tags.mlflow.databricks.cluster.libraries,tags.mlflow.user,tags.mlflow.databricks.workspaceURL,tags.mlflow.runName,tags.estimator_class,tags.mlflow.databricks.cluster.info,tags.mlflow.databricks.notebookID,tags.estimator_name,tags.version,tags.type,tags.algorithm,tags.mlflow.parentRunId,tags.mlflow.rootRunId
0,a517d3dce91041a49fdf34efd41ae772,386932948801159,FINISHED,dbfs:/databricks/mlflow-tracking/3869329488011...,2023-02-15 14:52:40.757000+00:00,2023-02-15 14:52:43.831000+00:00,47.025294,0.476581,56.248308,2817.598955,53.08106,3163.872148,2817.598955,0.476581,2817.598955,,,cyclic,,0.0001,True,False,False,False,False,0.3,True,1000.0,,,,,,,,,,,,1009-152753-v0fw7fw8,1676472764021,7157766864159089,/Users/matt.collins@coeo.com/Diabetes Test/MLf...,/Users/matt.collins@coeo.com/Diabetes Test/MLf...,"[{""artifact_path"":""model"",""signature"":{""inputs...",3025108289180260401_6717441847375294816_f2db63...,NOTEBOOK,https://ukwest.azuredatabricks.net,"{""installable"":[{""pypi"":{""package"":""azureml-sd...",matt.collins@coeo.com,adb-7157766864159089.9.azuredatabricks.net,dapper-sponge-324,sklearn.linear_model._coordinate_descent.Lasso,"{""cluster_name"":""matt.collins@coeo.com's Clust...",68298173970105,Lasso,,,,,
1,792d316d13a14507a8db2451047f14d3,386932948801159,FINISHED,dbfs:/databricks/mlflow-tracking/3869329488011...,2023-02-15 14:52:33.633000+00:00,2023-02-15 14:52:36.666000+00:00,46.02722,0.492605,55.380586,2769.32368,52.624364,3067.009277,2769.32368,0.492605,2769.32368,,,cyclic,,0.0001,True,False,False,False,False,0.2,True,1000.0,,,,,,,,,,,,1009-152753-v0fw7fw8,1676472756873,7157766864159089,/Users/matt.collins@coeo.com/Diabetes Test/MLf...,/Users/matt.collins@coeo.com/Diabetes Test/MLf...,"[{""artifact_path"":""model"",""signature"":{""inputs...",3025108289180260401_8031251311194488415_b313d3...,NOTEBOOK,https://ukwest.azuredatabricks.net,"{""installable"":[{""pypi"":{""package"":""azureml-sd...",matt.collins@coeo.com,adb-7157766864159089.9.azuredatabricks.net,grandiose-squid-647,sklearn.linear_model._coordinate_descent.Lasso,"{""cluster_name"":""matt.collins@coeo.com's Clust...",68298173970105,Lasso,,,,,
2,670cb457254f4da5ae328932855550c8,386932948801159,FINISHED,dbfs:/databricks/mlflow-tracking/3869329488011...,2023-02-15 14:52:25.424000+00:00,2023-02-15 14:52:29.500000+00:00,44.931393,0.507827,54.543546,2753.918884,52.477794,2974.998424,2753.918884,0.507827,2753.918884,,,cyclic,,0.0001,True,False,False,False,False,0.1,True,1000.0,,,,,,,,,,,,1009-152753-v0fw7fw8,1676472749719,7157766864159089,/Users/matt.collins@coeo.com/Diabetes Test/MLf...,/Users/matt.collins@coeo.com/Diabetes Test/MLf...,"[{""artifact_path"":""model"",""signature"":{""inputs...",3025108289180260401_5677277739369238272_987b36...,NOTEBOOK,https://ukwest.azuredatabricks.net,"{""installable"":[{""pypi"":{""package"":""azureml-sd...",matt.collins@coeo.com,adb-7157766864159089.9.azuredatabricks.net,peaceful-panda-85,sklearn.linear_model._coordinate_descent.Lasso,"{""cluster_name"":""matt.collins@coeo.com's Clust...",68298173970105,Lasso,,,,,
3,6b642e794f6546e7b8e3c8e6e7080fcc,386932948801159,FINISHED,dbfs:/databricks/mlflow-tracking/3869329488011...,2023-02-14 23:33:50.705000+00:00,2023-02-14 23:33:54.276000+00:00,44.054842,0.519035,53.918956,2848.295308,53.369423,2907.253864,2848.295308,0.519035,2848.295308,,,,,,True,,False,False,,,True,,,,,,,,,,,,,1009-152753-v0fw7fw8,1676417634462,7157766864159089,/Users/matt.collins@coeo.com/Diabetes Test/MLf...,/Users/matt.collins@coeo.com/Diabetes Test/MLf...,"[{""artifact_path"":""model"",""signature"":{""inputs...",2467510099193316612_5728511315957524811_e74e1c...,NOTEBOOK,https://ukwest.azuredatabricks.net,"{""installable"":[{""pypi"":{""package"":""azureml-sd...",matt.collins@coeo.com,adb-7157766864159089.9.azuredatabricks.net,mercurial-ram-182,sklearn.linear_model._base.LinearRegression,"{""cluster_name"":""matt.collins@coeo.com's Clust...",68298173970105,LinearRegression,,,,,
4,94e597abc0854c14adafb80447b2e1a4,386932948801159,FINISHED,dbfs:/databricks/mlflow-tracking/3869329488011...,2023-02-14 23:33:38.894000+00:00,2023-02-14 23:33:42.679000+00:00,44.337345,0.515208,54.133034,2826.186241,53.161887,2930.38538,2826.186241,0.515208,2826.186241,,,,,0.1,True,,False,,,,True,,,300.0,1e-06,,False,False,1e-06,1e-06,1e-06,,,1009-152753-v0fw7fw8,1676417622874,7157766864159089,/Users/matt.collins@coeo.com/Diabetes Test/MLf...,/Users/matt.collins@coeo.com/Diabetes Test/MLf...,"[{""artifact_path"":""model"",""signature"":{""inputs...",2467510099193316612_5248635080929826775_31d71e...,NOTEBOOK,https://ukwest.azuredatabricks.net,"{""installable"":[{""pypi"":{""package"":""azureml-sd...",matt.collins@coeo.com,adb-7157766864159089.9.azuredatabricks.net,vaunted-auk-290,sklearn.linear_model._bayes.BayesianRidge,"{""cluster_name"":""matt.collins@coeo.com's Clust...",68298173970105,BayesianRidge,,,,,


In [0]:
# Narrow the runs table to the columns used by the summaries below.
summary_columns = [
    "run_id",
    "experiment_id",
    "status",
    "start_time",
    "metrics.mse",
    "tags.mlflow.source.type",
    "tags.mlflow.user",
    "tags.estimator_name",
    "tags.mlflow.rootRunId",
]

# .copy() detaches the subset from `df` so columns can be added to it later.
runs_df = df.loc[:, summary_columns].copy()
runs_df.head()

Unnamed: 0,run_id,experiment_id,status,start_time,metrics.mse,tags.mlflow.source.type,tags.mlflow.user,tags.estimator_name,tags.mlflow.rootRunId
0,a517d3dce91041a49fdf34efd41ae772,386932948801159,FINISHED,2023-02-15 14:52:40.757000+00:00,2817.598955,NOTEBOOK,matt.collins@coeo.com,Lasso,
1,792d316d13a14507a8db2451047f14d3,386932948801159,FINISHED,2023-02-15 14:52:33.633000+00:00,2769.32368,NOTEBOOK,matt.collins@coeo.com,Lasso,
2,670cb457254f4da5ae328932855550c8,386932948801159,FINISHED,2023-02-15 14:52:25.424000+00:00,2753.918884,NOTEBOOK,matt.collins@coeo.com,Lasso,
3,6b642e794f6546e7b8e3c8e6e7080fcc,386932948801159,FINISHED,2023-02-14 23:33:50.705000+00:00,2848.295308,NOTEBOOK,matt.collins@coeo.com,LinearRegression,
4,94e597abc0854c14adafb80447b2e1a4,386932948801159,FINISHED,2023-02-14 23:33:38.894000+00:00,2826.186241,NOTEBOOK,matt.collins@coeo.com,BayesianRidge,


In [0]:
# Feature engineering: add a calendar start date plus two 0/1 flags describing
# each run's place in a nested-run hierarchy.
runs_df["start_date"] = runs_df["start_time"].dt.date

# Whole-column comparisons replace the row-wise apply(): they also work on an
# empty frame, and notna() treats both None and NaN as "no root run", where the
# old `is not None` test would have flagged NaN rows as children.
root_run_id = runs_df["tags.mlflow.rootRunId"]
is_root = runs_df["run_id"].eq(root_run_id)
has_root = root_run_id.notna()

# Parent: the run is its own root. Child: it has a root that is a different run.
runs_df["is_nested_parent"] = is_root.astype(int)
runs_df["is_nested_child"] = (has_root & ~is_root).astype(int)
runs_df

Unnamed: 0,run_id,experiment_id,status,start_time,metrics.mse,tags.mlflow.source.type,tags.mlflow.user,tags.estimator_name,tags.mlflow.rootRunId,start_date,is_nested_parent,is_nested_child
0,a517d3dce91041a49fdf34efd41ae772,386932948801159,FINISHED,2023-02-15 14:52:40.757000+00:00,2817.598955,NOTEBOOK,matt.collins@coeo.com,Lasso,,2023-02-15,0,0
1,792d316d13a14507a8db2451047f14d3,386932948801159,FINISHED,2023-02-15 14:52:33.633000+00:00,2769.32368,NOTEBOOK,matt.collins@coeo.com,Lasso,,2023-02-15,0,0
2,670cb457254f4da5ae328932855550c8,386932948801159,FINISHED,2023-02-15 14:52:25.424000+00:00,2753.918884,NOTEBOOK,matt.collins@coeo.com,Lasso,,2023-02-15,0,0
3,6b642e794f6546e7b8e3c8e6e7080fcc,386932948801159,FINISHED,2023-02-14 23:33:50.705000+00:00,2848.295308,NOTEBOOK,matt.collins@coeo.com,LinearRegression,,2023-02-14,0,0
4,94e597abc0854c14adafb80447b2e1a4,386932948801159,FINISHED,2023-02-14 23:33:38.894000+00:00,2826.186241,NOTEBOOK,matt.collins@coeo.com,BayesianRidge,,2023-02-14,0,0
5,88649d5069c748d8bb25a230c4ee3185,386932948801159,FINISHED,2023-02-14 23:33:24.883000+00:00,2826.186254,NOTEBOOK,matt.collins@coeo.com,BayesianRidge,,2023-02-14,0,0
6,35dbe9fa63a24ecca68cbaa3d63c931f,386932948801159,FAILED,2023-02-14 23:32:32.301000+00:00,,NOTEBOOK,matt.collins@coeo.com,,,2023-02-14,0,0
7,60ce90bc59c44a039d5dc4d4b6617661,386932948801159,FINISHED,2023-02-12 18:47:50.248000+00:00,3105.468751,NOTEBOOK,matt.collins@coeo.com,Ridge,,2023-02-12,0,0
8,52dc3209057748f49f9d1ccf36f53214,386932948801159,FINISHED,2023-02-12 18:41:44.216000+00:00,2811.930024,NOTEBOOK,matt.collins@coeo.com,Ridge,61d34b13649c45699e7f05290935747c,2023-02-12,0,1
9,e458e0b918b946e4a64daf09bc03f94c,386932948801159,FINISHED,2023-02-12 18:39:04.411000+00:00,2810.031462,NOTEBOOK,matt.collins@coeo.com,Ridge,61d34b13649c45699e7f05290935747c,2023-02-12,0,1


In [0]:
# Number of runs started on each calendar date.
runs_df.groupby("start_date")["run_id"].count().reset_index()

Unnamed: 0,start_date,run_id
0,2023-02-11,7
1,2023-02-12,4
2,2023-02-14,4
3,2023-02-15,3


In [0]:
# Number of runs per estimator name.
runs_df.groupby("tags.estimator_name")["run_id"].count().to_frame().reset_index()

Unnamed: 0,tags.estimator_name,run_id
0,BayesianRidge,2
1,Lasso,3
2,LinearRegression,1
3,Ridge,10


In [0]:
# Number of runs per MLflow user.
runs_per_user = runs_df.groupby("tags.mlflow.user")["run_id"].count()
runs_per_user.reset_index()

Unnamed: 0,tags.mlflow.user,run_id
0,matt.collins@coeo.com,18
