# How to use mlflow to log runs directly into s3 and an mysql-database
For all functionality in src.models.mlflow_logging, you need to have a aws_credentials.yaml in the root of your project directory.


# Use Autolog for sklearn and pytorch models

In [1]:
# Imports for model
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import mlflow

# Import logging
import src.models.mlflow_logging as ml_logging

# This connects to s3 & the mysql-instance and starts an mlflow-experiment with the chosen name
ml_logging.start_auto_logging("test_logging_local", "sklearn")

# Define some training data
X = np.linspace(0, 1, 11)
y = X * 2
X = np.array([[x] for x in X])
y = np.array([[i] for i in y])

# Define a model for demo purposes
model = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# sklearn's autolog will log training metrics and model parameters automatically when calling fit
model.fit(X_train, y_train)

# sklearns fit will not log anything
y_pred = model.predict(X_test)

my_own_mse = float(sum([np.sqrt((i - j) ** 2) for i, j in zip(y_pred, y_test)]))
# If you want to log other metrics which are not covered by autologging:
mlflow.log_metric("my_own_mse", my_own_mse)



An experiment with that name already exists, logging new run into it.


2022/10/18 11:40:59 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e8938b6b096a4606b6441a7d823b2070', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


# Alternative: Manual logging of run data
Use this if you use a model library not directly supported by mlflow.
Beware, this also creates a local mlflow folder and pickles the model locally, then uploads it.

In [3]:
# For Manual logging (if you use a model library not directly supported by mlflow, create an experiment, start a run and log manually.

# Imports for model
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import mlflow
import pickle

# Import logging
import src.models.mlflow_logging as ml_logging

# Connect to AWS and create/set experiment
ml_logging.create_or_set_experiment("test_logging_local_manual")

# Start a new run
with mlflow.start_run():
    # Define some training data
    X = np.linspace(0, 1, 11)
    y = X * 2
    X = np.array([[x] for x in X])
    y = np.array([[i] for i in y])

    # Define a model for demo purposes
    model = LinearRegression()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # This will not log anything without autologging
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Manual logging
    my_own_mse = float(sum([np.sqrt((i - j) ** 2) for i, j in zip(y_pred, y_test)]))
    mlflow.log_metric("my_own_mse", my_own_mse)

    # Log model by pickling (you could use mlflow.sklearn.log_model(model, "model"), but this cell is to show unsupported libraries)
    with open("model.pkl", "wb") as f:
        pickle.dump(model, f)
    mlflow.log_artifact("model.pkl")

    # Log params of your model
    mlflow.log_param("Layers", 0)

An experiment with that name already exists, logging new run into it.


# Reading run data from the mlflow ui

In [5]:
import subprocess

credentials = ml_logging.get_aws_credentials()
conn_str = f"mysql+pymysql://{credentials['mysql']['user']}:{credentials['mysql']['password']}@mlflow-backend.chf6ry9cdkyl.eu-central-1.rds.amazonaws.com:3306/mlflowbackend"

# Calling the mlflow ui from here does not work.
# subprocess.run(["mlflow", "ui", "--backend-store-uri", f"mysql+pymysql://{credentials['mysql']['user']}:{credentials['mysql']['password']}@mlflow-backend.chf6ry9cdkyl.eu-central-1.rds.amazonaws.com:3306/mlflowbackend"])

# If you want to see the Mlflow ui to compare models, use this in your console:
# mlflow ui --backend-store-uri connstr
# To have access to artifacts in s3, you need to set your s3-credentials in a another credentials-file, as described in https://docs.aws.amazon.com/sdkref/latest/guide/file-location.html.
# Calling "mlflow ui" does not use env-variables set in python, so ml_logging.set_s3_credentials() does not work here.