# Examples for scikit-learn Autologging

| File                                         | Description                                         |
| :------------------------------------------- | :-------------------------------------------------- |
| [linear_regression.py](linear_regression.py) | Train a [LinearRegression][lr] model                |
| [pipeline.py](pipeline.py)                   | Train a [Pipeline][pipe] model                      |
| [grid_search_cv.py](grid_search_cv.py)       | Perform a parameter search using [GridSearchCV][gs] |

[lr]: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
[pipe]: https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
[gs]: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.18.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.18.0 (from mlflow)
  Downloading mlflow_skinny-2.18.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.18.0->mlflow)
  Downloading databricks_sdk-0.38.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.5-py3-none-any.whl.metadata (10 kB)
Colle

In [None]:
from pprint import pprint
import numpy as np
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from pprint import pprint
import pandas as pd
from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV

import mlflow
from mlflow.tracking import MlflowClient

In [None]:
def yield_artifacts(run_id, path=None):
    """
    Recursively yield all artifact paths for a specified MLflow run.

    Args:
        run_id (str): The unique identifier of the MLflow run.
        path (str, optional): A specific path within the artifact repository.
            It is forwarded unchanged to ``MlflowClient.list_artifacts``;
            when omitted (``None``) the listing starts at the run's root
            artifact directory.

    Yields:
        str: The path of every non-directory artifact found at or below
        ``path``, in depth-first order following the listing order
        returned by the client.

    A single ``MlflowClient`` is created per traversal and shared by every
    directory level, rather than constructing a new client for each nested
    directory that is visited.
    """
    client = MlflowClient()

    def _walk(current_path):
        # Depth-first walk: descend into a directory as soon as it is met so
        # that its files are yielded before later siblings.
        for item in client.list_artifacts(run_id, current_path):
            if item.is_dir:
                yield from _walk(item.path)
            else:
                yield item.path

    yield from _walk(path)


def fetch_logged_data(run_id):
    """
    Collect what was logged to an MLflow run into a single dictionary.

    Args:
        run_id (str): The unique identifier of the MLflow run.

    Returns:
        dict: A dictionary with four entries:
            - "params": the ``params`` attribute of the run's data.
            - "metrics": the ``metrics`` attribute of the run's data.
            - "tags": the run's tags, minus every tag whose key starts
              with "mlflow." (MLflow system tags).
            - "artifacts": a list of artifact paths produced by
              ``yield_artifacts(run_id)``.
    """
    run_data = MlflowClient().get_run(run_id).data

    # Keep only user-facing tags; keys prefixed with "mlflow." are system tags:
    # https://www.mlflow.org/docs/latest/tracking.html#system-tags
    user_tags = {}
    for tag_name, tag_value in run_data.tags.items():
        if tag_name.startswith("mlflow."):
            continue
        user_tags[tag_name] = tag_value

    # Materialise the generator so the caller receives a plain list.
    artifact_paths = [*yield_artifacts(run_id)]

    return {
        "params": run_data.params,
        "metrics": run_data.metrics,
        "tags": user_tags,
        "artifacts": artifact_paths,
    }


In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
import mlflow
import mlflow.sklearn
from pprint import pprint

# Turn on MLflow autologging for scikit-learn before any estimator is fitted,
# so that the `fit` call below is recorded in an MLflow run.
mlflow.sklearn.autolog()

# Toy regression data: four samples with two features each.
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# Targets follow the exact linear relation y = 1*x1 + 2*x2 + 3.
y = X @ np.array([1, 2]) + 3

# Fit an ordinary least-squares model on the toy data.
model = LinearRegression()
model.fit(X, y)

# The autologged run is the most recent active run; keep its ID so the
# logged data can be queried back from the tracking store.
run_id = mlflow.last_active_run().info.run_id
print(f"Logged data and model in run {run_id}")

# Print each section returned by `fetch_logged_data`:
# - params:    estimator hyperparameters
# - metrics:   training metrics (e.g., R² score)
# - tags:      non-system tags attached to the run
# - artifacts: paths of files stored in the run's artifact repository
logged = fetch_logged_data(run_id)
for key, data in logged.items():
    print(f"\n---------- logged {key} ----------")
    pprint(data)


2024/11/28 17:31:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '30cab0dbed3f4ebc9a0dc0acc94a0ad1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Logged data and model in run 30cab0dbed3f4ebc9a0dc0acc94a0ad1

---------- logged params ----------
{'copy_X': 'True',
 'fit_intercept': 'True',
 'n_jobs': 'None',
 'positive': 'False'}

---------- logged metrics ----------
{'training_mean_absolute_error': 2.220446049250313e-16,
 'training_mean_squared_error': 1.9721522630525295e-31,
 'training_r2_score': 1.0,
 'training_root_mean_squared_error': 4.440892098500626e-16,
 'training_score': 1.0}

---------- logged tags ----------
{'estimator_class': 'sklearn.linear_model._base.LinearRegression',
 'estimator_name': 'LinearRegression'}

---------- logged artifacts ----------
['estimator.html',
 'model/MLmodel',
 'model/conda.yaml',
 'model/model.pkl',
 'model/python_env.yaml',
 'model/requirements.txt']


In [None]:
import mlflow
import mlflow.sklearn
from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV
from pprint import pprint
import pandas as pd

# Turn on MLflow autologging for scikit-learn before the search is fitted.
mlflow.sklearn.autolog()

# Iris dataset: `iris.data` holds the features, `iris.target` the class labels.
iris = datasets.load_iris()

# Hyperparameter grid explored by the search (kernel x C).
parameters = {"kernel": ("linear", "rbf"), "C": [1, 10]}

# Wrap a Support Vector Classifier in a grid search over `parameters`.
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)

# Fitting evaluates the parameter combinations in the grid.
clf.fit(iris.data, iris.target)

# The run created by autologging for this fit acts as the parent run.
run_id = mlflow.last_active_run().info.run_id

# Dump everything logged on the parent run (the search as a whole).
print("========== parent run ==========")
logged = fetch_logged_data(run_id)
for key, data in logged.items():
    print(f"\n---------- logged {key} ----------")
    pprint(data)

# Child runs are found through their `mlflow.parentRunId` tag, which points
# back at the parent run.
filter_child_runs = f"tags.mlflow.parentRunId = '{run_id}'"
runs = mlflow.search_runs(filter_string=filter_child_runs)

# Columns to display for each child run:
# - `params.kernel` / `params.C`: the hyperparameters of that run
# - `metrics.mean_test_score`: its mean cross-validated test score
param_cols = ["params." + name for name in parameters]
metric_cols = ["metrics.mean_test_score"]

print("\n========== child runs ==========\n")
# Show every column instead of letting pandas elide the middle ones.
pd.set_option("display.max_columns", None)
print(runs[["run_id"] + param_cols + metric_cols])


2024/11/28 17:32:06 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e685c79ffb5f43b99211606b38d93be2', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/11/28 17:32:16 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.



---------- logged params ----------
{'best_C': '1',
 'best_kernel': 'linear',
 'cv': 'None',
 'error_score': 'nan',
 'estimator': 'SVC()',
 'n_jobs': 'None',
 'param_grid': "{'kernel': ('linear', 'rbf'), 'C': [1, 10]}",
 'pre_dispatch': '2*n_jobs',
 'refit': 'True',
 'return_train_score': 'False',
 'scoring': 'None',
 'verbose': '0'}

---------- logged metrics ----------
{'best_cv_score': 0.9800000000000001,
 'training_accuracy_score': 0.9933333333333333,
 'training_f1_score': 0.9933326665999933,
 'training_precision_score': 0.9934640522875816,
 'training_recall_score': 0.9933333333333333,
 'training_score': 0.9933333333333333}

---------- logged tags ----------
{'estimator_class': 'sklearn.model_selection._search.GridSearchCV',
 'estimator_name': 'GridSearchCV'}

---------- logged artifacts ----------
['best_estimator/MLmodel',
 'best_estimator/conda.yaml',
 'best_estimator/model.pkl',
 'best_estimator/python_env.yaml',
 'best_estimator/requirements.txt',
 'cv_results.csv',
 'estimat

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import mlflow
import mlflow.sklearn
from pprint import pprint

# Turn on MLflow autologging for scikit-learn before the pipeline is fitted,
# so that the `fit` call below is recorded in an MLflow run.
mlflow.sklearn.autolog()

# Toy regression data: four samples with two features each.
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# Targets follow the exact linear relation y = 1*x1 + 2*x2 + 3.
y = X @ np.array([1, 2]) + 3

# Two-step pipeline:
#   "scaler" - StandardScaler, applied to the features first
#   "lr"     - LinearRegression, fitted on the scaled features
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("lr", LinearRegression()),
    ]
)
pipe.fit(X, y)

# The autologged run is the most recent active run; keep its ID so the
# logged data can be queried back from the tracking store.
run_id = mlflow.last_active_run().info.run_id
print(f"Logged data and model in run: {run_id}")

# Print each section returned by `fetch_logged_data`:
# - params:    hyperparameters of the pipeline and its steps
# - metrics:   training metrics (e.g., R² score)
# - tags:      non-system tags (e.g., estimator class and name)
# - artifacts: paths of files such as the serialized model
logged = fetch_logged_data(run_id)
for key, data in logged.items():
    print(f"\n---------- logged {key} ----------")
    pprint(data)


2024/11/28 17:32:17 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '41827cf62c8c4901ae5862a18242d2d8', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Logged data and model in run: 41827cf62c8c4901ae5862a18242d2d8

---------- logged params ----------
{'lr': 'LinearRegression()',
 'lr__copy_X': 'True',
 'lr__fit_intercept': 'True',
 'lr__n_jobs': 'None',
 'lr__positive': 'False',
 'memory': 'None',
 'scaler': 'StandardScaler()',
 'scaler__copy': 'True',
 'scaler__with_mean': 'True',
 'scaler__with_std': 'True',
 'steps': "[('scaler', StandardScaler()), ('lr', LinearRegression())]",
 'verbose': 'False'}

---------- logged metrics ----------
{'training_mean_absolute_error': 2.220446049250313e-16,
 'training_mean_squared_error': 1.9721522630525295e-31,
 'training_r2_score': 1.0,
 'training_root_mean_squared_error': 4.440892098500626e-16,
 'training_score': 1.0}

---------- logged tags ----------
{'estimator_class': 'sklearn.pipeline.Pipeline', 'estimator_name': 'Pipeline'}

---------- logged artifacts ----------
['estimator.html',
 'model/MLmodel',
 'model/conda.yaml',
 'model/model.pkl',
 'model/python_env.yaml',
 'model/requirements.tx