In [None]:
import mlflow

import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip list

Package                       Version
----------------------------- -----------
aiofiles                      22.1.0
aiosqlite                     0.19.0
alembic                       1.10.4
altair                        4.2.2
anyio                         3.6.2
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
asttokens                     2.2.1
async-generator               1.10
attrs                         22.2.0
Babel                         2.12.1
backcall                      0.2.0
backports.functools-lru-cache 1.6.4
beautifulsoup4                4.12.2
bleach                        6.0.0
blinker                       1.6.2
bokeh                         3.1.0
boltons                       23.0.0
Bottleneck                    1.3.7
brotlipy                      0.7.0
cached-property               1.5.2
certifi                       2022.12.7
certipy                       0.1.3
cffi                          1.15.1
charset-normalize

In [None]:
print(mlflow.__version__)

2.3.1


I like to create a virtual environment 
- python -m venv /path/to/new/virtual/environment
- source python /path/to/new/virtual/environment/bin/activate

# First simple example

## MLflow Tracking

Keep track of things like the hyperparameters and the metrics you’ve got. 

Open the tunnel with mlflow command in the command line and specify where you want to save your artifacts
if you want also the database add     --backend-store-uri sqlite:///mlflow.db \

mlflow server \
    --host 0.0.0.0 \
    --artifacts-destination /home/nuno/Desktop/NovaIMS/mlflow_project/ml_artifacts\
    --default-artifact-root /home/nuno/Desktop/NovaIMS/mlflow_project/ml_artifacts

In [None]:
# load dataset
db = load_wine()

# define train and test dataset
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

In [None]:
# descrition that will be used as metadata
description = "the simplest possible example"

# Mlflow tracking server
mlflow.set_tracking_uri("http://mlflow-starter-server:5000")

In [None]:
mlflow.set_experiment("mlflow_first_example")

2023/05/03 23:56:00 INFO mlflow.tracking.fluent: Experiment with name 'mlflow_first_example' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/jovyan/mlruns/616838658831017469', creation_time=1683158160326, experiment_id='616838658831017469', last_update_time=1683158160326, lifecycle_stage='active', name='mlflow_first_example', tags={}>

In [None]:
# executes the run
with mlflow.start_run(run_name="tracking experiment_1", description=description) as run:
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
    rf.fit(X_train, y_train)
mlflow.end_run()

Runs are grouped into experiments. Each run can contain, for example, a different set of hyperparameters. Also, if you don’t specify an experiment name, the run you’re currently executing will be recorded under the Default experiment, which is created automatically by MLflow for you. Let's change to another name

In [None]:
mlflow.set_experiment("mlflow_first_example")

with mlflow.start_run(run_name="params_no_artifacts_logged") as run:

    params = {"n_estimators":100, "max_depth":6, "max_features":3}

    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    explained_variance = explained_variance_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mlflow.log_params(params)
    mlflow.log_param("test", "test")
    mlflow.log_metric("explained_variance", explained_variance)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("my_metric", 0.8)
    mlflow.set_tag("tag", "this_is_a_tag")
mlflow.end_run()

In [None]:
with mlflow.start_run(run_name="params_no_artifacts_logged") as run:

    params = {"n_estimators":120, "max_depth":3, "max_features":6}

    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    explained_variance = explained_variance_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mlflow.log_params(params)
    mlflow.log_param("test", "test2")
    mlflow.log_metric("explained_variance", explained_variance)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("my_metric", 0.9)
    mlflow.set_tag("tag", "this_is_a_tag_2")
mlflow.end_run()

In [None]:
with mlflow.start_run() as run:

    params = {"n_estimators":120, "max_depth":3, "max_features":6}

    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    explained_variance = explained_variance_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mlflow.log_params(params)
    mlflow.log_param("test", "test2")
    mlflow.log_metric("explained_variance", explained_variance)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("my_metric", 0.9)
    mlflow.set_tag("tag", "this_is_a_tag_2")
mlflow.end_run()

And we can save the model

In [None]:
with mlflow.start_run(run_name="real_model_to_save") as run:
    params = {"n_estimators":100, "max_depth":6, "max_features":3}

    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)

    mlflow.log_params(params)
    mlflow.sklearn.log_model(
    sk_model=rf,
    artifact_path="real_model_to_save",
  )
mlflow.end_run()

You now have not only your trained model managed for you (the model.pkl file), but you also have it’s dependencies automatically captured in three different flavours, i.e. conda.yaml, python_env.yaml and requirements.txt

We can also log an input example alongside the artifacts so that, for example, anyone could test a deployment:

In [None]:
from mlflow.models import infer_signature

In [None]:
signature = infer_signature(X_train, rf.predict(X_test))

In [None]:
with mlflow.start_run(run_name="artifact_run_sign") as run:
    params = {"n_estimators":100, "max_depth":6, "max_features":3}
    
    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)
    
    signature = infer_signature(X_train, rf.predict(X_test))
    input_example = X_train[0]
    mlflow.log_params(params)
    mlflow.sklearn.log_model(
        sk_model=rf,
        artifact_path="random_forest_regressor",
        input_example=input_example,
        signature=signature
  )
mlflow.end_run()

Instead of explicitily giving the arguments for mlflow log, there is a nice function to the autolog:

In [None]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# load dataset
db = load_wine()

# define train and test dataset
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

# connect to mlflow
mlflow.set_tracking_uri("http://mlflow-starter-server:5000")
mlflow.set_experiment("mlflow_tracking_with_autolog")


mlflow.autolog(log_model_signatures=True, log_input_examples=True)


# train the model
rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
rf.fit(X_train, y_train)

with mlflow.start_run(run_name="run_2") as run:
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
    rf.fit(X_train, y_train)
mlflow.end_run()

2023/05/03 23:56:11 INFO mlflow.tracking.fluent: Experiment with name 'mlflow_tracking_with_autolog' does not exist. Creating a new experiment.
2023/05/03 23:56:12 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/05/03 23:56:12 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'bcd260fe9f96403f852ae85d92bd38be', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Librarires that support autologging:

- Scikit-learn

- Keras

- Gluon

- XGBoost

- LightGBM

- Statsmodels

- Spark

- Fastai

- Pytorch

If we want to test different parameters, we can use a nested run:

In [None]:
with mlflow.start_run(run_name="main_run_for_nested") as run:
    for estimators in range(20, 100, 20):
        with mlflow.start_run(run_name=f"nested_{estimators}_estimators", nested=True) as nested:
            rf = RandomForestRegressor(n_estimators=estimators, max_depth=6, max_features=3)
            rf.fit(X_train, y_train)

Improving our code with  hyperparameter fine-tunning:

In [None]:
from skopt import BayesSearchCV

# Bayesian
n_iter = 5

params = {
  "n_estimators": [33, 66, 200],
  "max_depth": [2, 4, 6],
  "max_features": [3, 4, 5]
    }

rf = RandomForestRegressor()
searcher_bayes = BayesSearchCV(estimator=rf,
                    search_spaces=params,
                    n_iter=n_iter,
                    random_state=123)

with mlflow.start_run(run_name="autolog_with_grid_search") as run:
    searcher_bayes.fit(X_train, y_train)
mlflow.end_run()



KeyboardInterrupt: 

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# load dataset
db = load_wine()

# define train and test dataset
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

# connect to mlflow
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("mlflow_tracking_with_autolog")


mlflow.autolog(log_model_signatures=True, log_input_examples=True)


params = {
  "n_estimators": [33, 66, 200],
  "max_depth": [2, 4, 6],
  "max_features": [3, 4, 5]
}

rf = RandomForestRegressor()
searcher = GridSearchCV(estimator=rf, param_grid=params)

with mlflow.start_run(run_name="run_3") as run:
    searcher.fit(X_train, y_train)
mlflow.end_run() 

2023/05/04 00:01:50 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/05/04 00:02:05 INFO mlflow.sklearn.utils: Logging the 5 best runs, 22 runs will be omitted.


Let's execute a model stored as an artifact inside a run

In [20]:
mlflow.autolog(log_model_signatures=True, log_input_examples=True)

with mlflow.start_run(run_name="model_to_predict") as run:
    rf = RandomForestRegressor(n_estimators=100, max_depth=10, max_features=10)
    rf.fit(X_train, y_train)

    run_id = run.info.run_id
mlflow.end_run()

2023/05/04 00:02:07 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


We could also use the UI or the API to get the relevant information for the model.

In [21]:
# get model path from run id 

model_path = f"runs:/{run_id}/model"
print(f"Loading model from: {model_path}")

# load using sklearn flavor
loaded_model = mlflow.sklearn.load_model(model_path)

print("Showing predictions")
print(loaded_model.predict(X_test))

Loading model from: runs:/5278c8b0350c44ee83d5bfb19c520f77/model
Showing predictions
[0.   0.   1.   1.02 0.99 0.   1.98 0.01 0.   1.83 2.   0.05 0.01 0.
 0.05 0.   0.   1.88 1.55 0.99 0.   1.13 0.99 1.66 1.11 1.98 1.88 0.04
 0.99 2.   0.01 1.   0.01 1.9  0.23 0.   1.02 0.04 1.   0.01 1.   0.17
 0.86 1.17 1.02]


What if the frameork where we are going to use our model is not scikit, e.g. could be PyTorch or Tensorflow. We can call the model using an abstract function, so that if you now change the flavour of your model, you don't need to use any if statement or change the method of your class

In [22]:
# get model path from run id 

model_path = f"runs:/{run_id}/model"
print(f"Loading model from: {model_path}")

# load using abstract flavor
loaded_model = mlflow.pyfunc.load_model(model_path)

print("Showing predictions")
print(loaded_model.predict(X_test))

Loading model from: runs:/5278c8b0350c44ee83d5bfb19c520f77/model
Showing predictions
[0.   0.   1.   1.02 0.99 0.   1.98 0.01 0.   1.83 2.   0.05 0.01 0.
 0.05 0.   0.   1.88 1.55 0.99 0.   1.13 0.99 1.66 1.11 1.98 1.88 0.04
 0.99 2.   0.01 1.   0.01 1.9  0.23 0.   1.02 0.04 1.   0.01 1.   0.17
 0.86 1.17 1.02]


Now, what if you want to use an unsupported framework or any different logic of prediction? 

We can creat a custom model by extending mlflow.pyfunc.PythonModel, with 2 methods: load_context (responsible for loading the ML artifacts) and predict. 

I am going to use one example from MLFlow community the VADER sentiment analysis (ADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media)

In [29]:
import mlflow
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# create an MLflow-compliant model by extending PythonModel
class TextAnalyzerModel(mlflow.pyfunc.PythonModel):
    
    def __init__(self):
        super().__init__()
        self._analyser = SentimentIntensityAnalyzer()

    def _preprocess(self):
        pass

    def _score(self, txt):
        prediction_scores = self._analyser.polarity_scores(txt)
        return prediction_scores

    def predict(self, context, model_input):
        model_output = model_input.apply(lambda col: self._score(col))
        return model_output

# connect to mlflow and set experiment
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("sentiment_analysis")

# enable autolog
mlflow.autolog(log_model_signatures=True, log_input_examples=True)

model_artifact_path = "vader_model"
model = TextAnalyzerModel()

# execute run
with mlflow.start_run(run_name="Vader Sentiment Analysis") as run:
    mlflow.log_param("algorithm", "VADER")
    mlflow.pyfunc.log_model(artifact_path=model_artifact_path, 
                          python_model=model)
    run_id = run.info.run_id
mlflow.end_run()

2023/05/04 00:02:36 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


After creating the the new experiment and run, we can load the model and make a prediction.

In [24]:
model_uri = f"runs:/{run_id}/vader_model"

loaded_model = mlflow.pyfunc.load_model(model_uri)

queries = ["This is a bad class. I hate MLOps and the professor! :-C. But our campus is good.",
           "Lovely weather during the weekend.",
           "LOL, this guy fell off a chair while listening the professor.",
           "This is INSANE! How can you do such TERRIBLE thing?????"]

for q in queries:
    m_input = pd.DataFrame([q])
    scores = loaded_model.predict(m_input)
    print(f"<{q}> -- {str(scores[0])}")

<This is a bad class. I hate MLOps and the professor! :-C. But our campus is good.> -- {'neg': 0.215, 'neu': 0.616, 'pos': 0.169, 'compound': 0.1386}
<Lovely weather during the weekend.> -- {'neg': 0.0, 'neu': 0.513, 'pos': 0.487, 'compound': 0.5859}
<LOL, this guy fell off a chair while listening the professor.> -- {'neg': 0.0, 'neu': 0.739, 'pos': 0.261, 'compound': 0.5473}
<This is INSANE! How can you do such TERRIBLE thing?????> -- {'neg': 0.516, 'neu': 0.484, 'pos': 0.0, 'compound': -0.8597}


Final step: let's registry our model. Why ? If you keep evolving a model, you’ll need to know which version is in production. You have to implement here the logic of champion and challenger, tested model, ready for deploy, etc. 

In [25]:
with mlflow.start_run(run_name="log_and_register") as run:
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
    rf.fit(X_train, y_train)

    mlflow.sklearn.log_model(
    sk_model=rf,
    artifact_path="sklearn-model",
    registered_model_name="my_registered_model_1")
mlflow.end_run()

Successfully registered model 'my_registered_model_1'.
2023/05/04 00:02:23 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: my_registered_model_1, version 1
Created version '1' of model 'my_registered_model_1'.


To use this version of the model, directly from the model registry, you only need to change the path, i.e. change the source from runs to models :

In [26]:
model_name = "my_registered_model_1"
model_version = 1
model_path = f"models:/{model_name}/{model_version}"

model = mlflow.pyfunc.load_model(model_path)

In [27]:
print("Showing predictions")
print(model.predict(X_test))

Showing predictions
[0.01       0.01333333 0.98       1.01       1.01       0.07
 1.98       0.15       0.01       1.74       2.         0.06666667
 0.19       0.08333333 0.28       0.08333333 0.23333333 1.9
 1.83       1.03       0.05       1.33       0.99       1.45
 1.26       1.85       1.92       0.16333333 1.05       1.94
 0.         0.99       0.09       1.86       0.29666667 0.07
 1.09       0.05       1.         0.02       1.04       0.33333333
 0.55       1.1        1.07      ]


We can clearly distinguish the version of the model: 
- which is in production (Production); 
- which is being tested (Staging); 
- which has been decommissioned (Archived);
- which has just been generated (None).

In [30]:
model_name = "my_registered_model_1"
model_path = f"models:/{model_name}/production"

model = mlflow.pyfunc.load_model(model_path)

MlflowException: No versions of model with name 'my_registered_model_1' and stage 'production' found

In [31]:
model_name = "my_registered_model_1"
model_path = f"models:/{model_name}/staging"

model = mlflow.pyfunc.load_model(model_path)

MlflowException: No versions of model with name 'my_registered_model_1' and stage 'staging' found