In [0]:
## Learning sources:
## Azure: https://learn.microsoft.com/en-us/azure/databricks/machine-learning/tutorial/
## Databricks: https://docs.databricks.com/machine-learning/index.html
## Reference for this code: https://learn.microsoft.com/en-us/azure/databricks/_extras/notebooks/source/mlflow/ml-quickstart-training.html
## The code is copied from the reference; the objective is to learn how machine learning works in Databricks, not to optimize the model.
## Some setup and configuration is still needed before all of the code from the reference source can run.

In [0]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.4.0-py3-none-any.whl (18.1 MB)
[?25l[K     |                                | 10 kB 23.9 MB/s eta 0:00:01[K     |                                | 20 kB 28.4 MB/s eta 0:00:01[K     |                                | 30 kB 35.4 MB/s eta 0:00:01[K     |                                | 40 kB 25.0 MB/s eta 0:00:01[K     |                                | 51 kB 28.4 MB/s eta 0:00:01[K     |                                | 61 kB 32.0 MB/s eta 0:00:01[K     |▏                               | 71 kB 23.8 MB/s eta 0:00:01[K     |▏                               | 81 kB 25.3 MB/s eta 0:00:01[K     |▏                               | 92 kB 27.4 MB/s eta 0:00:01[K     |▏                               | 102 kB 24.5 MB/s eta 0:00:01[K     |▏                               | 112 kB 24.5 MB/s eta 0:00:01[K     |▏                               | 122 kB 24.5 MB/s eta 0:00:01[K     |▎                               | 133 kB 24.5 MB/s eta 

In [0]:
!pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[?25l[K     |▏                               | 10 kB 18.3 MB/s eta 0:00:01[K     |▍                               | 20 kB 15.2 MB/s eta 0:00:01[K     |▋                               | 30 kB 19.8 MB/s eta 0:00:01[K     |▉                               | 40 kB 12.5 MB/s eta 0:00:01[K     |█                               | 51 kB 11.3 MB/s eta 0:00:01[K     |█▎                              | 61 kB 13.1 MB/s eta 0:00:01[K     |█▌                              | 71 kB 13.4 MB/s eta 0:00:01[K     |█▋                              | 81 kB 13.2 MB/s eta 0:00:01[K     |█▉                              | 92 kB 14.4 MB/s eta 0:00:01[K     |██                              | 102 kB 14.7 MB/s eta 0:00:01[K     |██▎                             | 112 kB 14.7 MB/s eta 0:00:01[K     |██▌                             | 122 kB 14.7 MB/s eta 0:00:01[K     |██▊                             | 133 kB 14.7 MB

In [0]:
import mlflow
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.ensemble
 
from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK
from hyperopt.pyll import scope

# Read the two semicolon-delimited wine-quality files and tag each row with its colour
# (is_red = 0.0 for white wines, 1.0 for red wines), then stack them into one frame.
white_wine = pd.read_csv(
    "/dbfs/databricks-datasets/wine-quality/winequality-white.csv", sep=';'
).assign(is_red=0.0)
red_wine = pd.read_csv(
    "/dbfs/databricks-datasets/wine-quality/winequality-red.csv", sep=';'
).assign(is_red=1.0)
data_df = pd.concat([white_wine, red_wine], axis=0)

# Binary target: True when the quality score is 7 or above.
# The raw score is then removed so it cannot be used as a feature.
data_labels = data_df['quality'].ge(7)
data_df = data_df.drop(columns=['quality'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    data_df, data_labels, test_size=0.2, random_state=1
)

In [0]:
# Switch on MLflow autologging for the supported libraries before any model is trained;
# the log output of this cell lists the integrations that were enabled (sklearn, spark, pyspark, pyspark.ml).
mlflow.autolog()

2023/06/07 04:15:09 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/06/07 04:15:09 INFO mlflow._spark_autologging: Autologging successfully enabled for spark.
2023/06/07 04:15:09 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2023/06/07 04:15:09 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.


In [0]:
# Write a [DEFAULT] profile to /root/.databrickscfg on the driver's local filesystem.
# Presumably this lets MLflow / Databricks clients authenticate against the workspace — confirm.
#
# Both the workspace URL and the API token are read from the notebook context, so the cell
# no longer depends on one hard-coded workspace host and works unchanged in any workspace.
ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
host = ctx.apiUrl().get()
token = ctx.apiToken().get()

# NOTE(review): the token is stored in plaintext on the driver; never print it and
# clear this cell's output before sharing the notebook.
dbutils.fs.put(
  'file:///root/.databrickscfg',
  '[DEFAULT]\nhost=' + host + '\ntoken = ' + token,
  overwrite=True
)

Wrote 111 bytes.
Out[17]: True

In [0]:
# Baseline: gradient boosting with default hyperparameters, tracked as run 'gradient_boost'
with mlflow.start_run(run_name='gradient_boost') as run:
    model = sklearn.ensemble.GradientBoostingClassifier(random_state=0)

    # Autologging captures the model, its parameters and the training metrics during fit()
    model.fit(X_train, y_train)

    # Score the held-out rows; column 1 is the probability of the second class (label True)
    predicted_probs = model.predict_proba(X_test)
    roc_auc = sklearn.metrics.roc_auc_score(y_true=y_test, y_score=predicted_probs[:, 1])

    # Test-set AUC is not covered by autologging, so it is recorded explicitly
    mlflow.log_metric(key="test_auc", value=roc_auc)
    print(f"Test AUC of: {roc_auc}")



Test AUC of: 0.8834365701533531


In [0]:
# Second tracked run (same run_name as the baseline), this time with more boosting stages
with mlflow.start_run(run_name='gradient_boost') as run:
    # n_estimators=200 is the only setting that differs from the baseline run
    model_2 = sklearn.ensemble.GradientBoostingClassifier(random_state=0, n_estimators=200)
    model_2.fit(X_train, y_train)

    # Evaluate on the held-out rows and log the AUC next to the autologged metrics
    predicted_probs = model_2.predict_proba(X_test)
    roc_auc = sklearn.metrics.roc_auc_score(y_true=y_test, y_score=predicted_probs[:, 1])
    mlflow.log_metric(key="test_auc", value=roc_auc)
    print(f"Test AUC of: {roc_auc}")

Test AUC of: 0.8914761673151751


In [0]:
# Run the model from another notebook.
# The run ID can be found under the MLflow experiments.
#------------------------------------------
# model_loaded = mlflow.pyfunc.load_model(
#   'runs:/{run_id}/model'.format(
#     run_id=run.info.run_id
#   )
# )
 
# predictions_loaded = model_loaded.predict(X_test)
# predictions_original = model_2.predict(X_test)

Out[20]: '69224792f0d04ec9bc90ffe902bccd18'

In [0]:
# Hyperparameter ranges that hyperopt samples from:
#   n_estimators  - whole numbers from 20 to 1000 (quniform with step 1, cast to int)
#   learning_rate - log-uniform, i.e. exp(u) with u drawn uniformly from [-3, 0]
#   max_depth     - whole numbers from 2 to 5 (quniform with step 1, cast to int)
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 20, 1000, 1)),
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    max_depth=scope.int(hp.quniform('max_depth', 2, 5, 1)),
)
 
def train_model(params):
    """Train one gradient-boosting candidate and report its score to hyperopt.

    Args:
        params: keyword arguments forwarded to GradientBoostingClassifier
            (one sample drawn from the search space).

    Returns:
        dict with 'status' set to STATUS_OK and 'loss' set to -1 * test AUC,
        so that minimising the loss maximises the AUC.
    """
    # Autologging must be switched on in whichever worker executes this trial
    mlflow.autolog()
    with mlflow.start_run(nested=True):
        classifier = sklearn.ensemble.GradientBoostingClassifier(random_state=0, **params)
        classifier.fit(X_train, y_train)

        # Tuning is driven by the test AUC here; a production setup
        # could use a separate validation set instead
        positive_scores = classifier.predict_proba(X_test)[:, 1]
        test_auc = sklearn.metrics.roc_auc_score(y_test, positive_scores)
        mlflow.log_metric('test_auc', test_auc)

    # fmin minimises, hence the sign flip
    return {'status': STATUS_OK, 'loss': -1 * test_auc}
 
# Trials are farmed out to Spark workers through SparkTrials.
# More parallelism finishes sooner, but each trial then starts with less information
# from completed trials. On small clusters or Databricks Community Edition, try parallelism=2.
spark_trials = SparkTrials(parallelism=8)

# Parent run that groups the nested per-trial runs created inside train_model
with mlflow.start_run(run_name='gb_hyperopt') as run:
    # 32 TPE-guided evaluations; fmin returns the parameters with the lowest loss (highest AUC)
    best_params = fmin(
        train_model,
        search_space,
        algo=tpe.suggest,
        max_evals=32,
        trials=spark_trials,
    )

  0%|          | 0/32 [00:00<?, ?trial/s, best loss=?]  3%|▎         | 1/32 [00:17<08:48, 17.04s/trial, best loss: -0.8809921463721676]  6%|▋         | 2/32 [00:20<04:23,  8.78s/trial, best loss: -0.8884988698786909]  9%|▉         | 3/32 [00:22<02:44,  5.69s/trial, best loss: -0.908227712291142]  16%|█▌        | 5/32 [00:31<02:15,  5.02s/trial, best loss: -0.9101982719157703] 19%|█▉        | 6/32 [00:44<03:10,  7.32s/trial, best loss: -0.9101982719157703] 22%|██▏       | 7/32 [00:47<02:31,  6.06s/trial, best loss: -0.9101982719157703] 25%|██▌       | 8/32 [00:50<02:03,  5.17s/trial, best loss: -0.9101982719157703] 28%|██▊       | 9/32 [00:51<01:30,  3.93s/trial, best loss: -0.9101982719157703] 31%|███▏      | 10/32 [00:58<01:46,  4.85s/trial, best loss: -0.9101982719157703] 38%|███▊      | 12/32 [01:05<01:24,  4.23s/trial, best loss: -0.9101982719157703] 41%|████      | 13/32 [01:09<01:19,  4.18s/trial, best loss: -0.9101982719157703] 47%|████▋     | 15/32 [01:17<01:09,  4

INFO:hyperopt-spark:Total Trials: 32: 32 succeeded, 0 failed, 0 cancelled.


In [0]:
# Rank runs by test AUC (most recent first among ties) and keep the top row
best_run = mlflow.search_runs(
    order_by=['metrics.test_auc DESC', 'start_time DESC'],
    max_results=10,
).iloc[0]

# Summarise the winning run
print('Best Run')
print(f"AUC: {best_run['metrics.test_auc']}")
print(f"Num Estimators: {best_run['params.n_estimators']}")
print(f"Max Depth: {best_run['params.max_depth']}")
print(f"Learning Rate: {best_run['params.learning_rate']}")
print(f"Run id: {best_run.run_id}")

# Reload the model logged by that run as a generic pyfunc model and
# spot-check it on the first five test rows
best_model_pyfunc = mlflow.pyfunc.load_model(f'runs:/{best_run.run_id}/model')
best_model_predictions = best_model_pyfunc.predict(X_test.iloc[:5])
print(f"Test Predictions: {best_model_predictions}")

Best Run
AUC: 0.9166642824444954
Num Estimators: 943
Max Depth: 5
Learning Rate: 0.3864233776522892
Run id: 484f6f23fc5e4d6f9d3f9c941a7034f0
Test Predictions: [False False False  True  True]
