In [2]:
import os
import tempfile

import pytorch_lightning as pl
from pl_bolts.datamodules import MNISTDataModule

import mlflow

from ray import air, tune
from ray.tune.integration.mlflow import mlflow_mixin
from ray.tune.integration.pytorch_lightning import TuneReportCallback

import torch
import torch.nn.functional as F

from torchmetrics import Accuracy

class LightningMNISTClassifier(pl.LightningModule):
    """Fully connected MNIST classifier whose sizes come from a config dict.

    The network is three ``Linear`` layers (784 -> layer_1 -> layer_2 -> 10)
    with ReLU between them and a log-softmax output, trained with NLL loss
    and the Adam optimizer.

    Args:
        config: Mapping that must provide ``"lr"`` (Adam learning rate),
            ``"layer_1"`` and ``"layer_2"`` (hidden layer widths) and
            ``"batch_size"`` (stored on the module, not used by it directly).
        data_dir: Directory stored as ``self.data_dir``; falls back to the
            current working directory when falsy.
    """

    def __init__(self, config, data_dir=None):
        super().__init__()

        self.data_dir = data_dir or os.getcwd()
        self.lr = config["lr"]
        layer_1, layer_2 = config["layer_1"], config["layer_2"]
        self.batch_size = config["batch_size"]

        # MNIST images are 1x28x28, i.e. 28 * 28 features once flattened.
        self.layer_1 = torch.nn.Linear(28 * 28, layer_1)
        self.layer_2 = torch.nn.Linear(layer_1, layer_2)
        self.layer_3 = torch.nn.Linear(layer_2, 10)
        # NOTE(review): this single metric instance is called from both the
        # training and validation steps; only the value returned by each call
        # is used here. Confirm that sharing it is intended.
        self.accuracy = Accuracy()

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Everything after the first (batch) dimension is flattened, so the
        input only needs ``28 * 28`` elements per sample.
        """
        x = x.view(x.size(0), -1)
        x = torch.relu(self.layer_1(x))
        x = torch.relu(self.layer_2(x))
        x = self.layer_3(x)
        return torch.log_softmax(x, dim=1)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def _loss_and_accuracy(self, batch):
        """Run the model on an ``(inputs, targets)`` batch; return (loss, acc)."""
        x, y = batch
        logits = self.forward(x)
        loss = F.nll_loss(logits, y)
        acc = self.accuracy(logits, y)
        return loss, acc

    def training_step(self, train_batch, batch_idx):
        """Log the batch loss and accuracy, and return the loss."""
        loss, acc = self._loss_and_accuracy(train_batch)
        self.log("ptl/train_loss", loss)
        self.log("ptl/train_accuracy", acc)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Return this batch's loss and accuracy for epoch-end aggregation."""
        loss, acc = self._loss_and_accuracy(val_batch)
        return {"val_loss": loss, "val_accuracy": acc}

    def validation_epoch_end(self, outputs):
        """Log the unweighted mean of the per-batch validation metrics.

        ``outputs`` is the list of dicts returned by ``validation_step``.
        """
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean()
        self.log("ptl/val_loss", avg_loss)
        self.log("ptl/val_accuracy", avg_acc)

@mlflow_mixin
def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
    """Train one MNIST classifier for a single trial and report to Tune.

    Args:
        config: Trial configuration; ``"lr"``, ``"layer_1"``, ``"layer_2"``
            and ``"batch_size"`` are read here or by the model.
        data_dir: Directory handed to both the model and the data module.
        num_epochs: Value passed to the trainer as ``max_epochs``.
        num_gpus: Value passed to the trainer as ``gpus``.
    """
    model = LightningMNISTClassifier(config, data_dir)
    dm = MNISTDataModule(
        data_dir=data_dir,
        num_workers=1,
        batch_size=config["batch_size"],
    )

    # Turn on MLflow autologging, then record the architecture parameters
    # explicitly.
    mlflow.pytorch.autolog()
    for param_name in ("layer_1", "layer_2", "batch_size"):
        mlflow.log_param(param_name, config[param_name])

    # Maps the name reported to Tune -> the name logged by the LightningModule.
    reported_metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"}
    report_callback = TuneReportCallback(reported_metrics, on="validation_end")

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        progress_bar_refresh_rate=0,
        callbacks=[report_callback],
    )
    trainer.fit(model, dm)


def tune_mnist(
    num_samples=10,
    num_epochs=10,
    gpus_per_trial=0,
    tracking_uri=None,
    experiment_name="mnist",
):
    """Run a Ray Tune hyperparameter search for the MNIST classifier.

    Downloads the data into a ``mnist_data_`` folder under the system temp
    directory, points MLflow at ``tracking_uri`` / ``experiment_name``, runs
    ``num_samples`` trials minimizing the reported ``"loss"``, and prints the
    configuration of the best result.

    Args:
        num_samples: Number of configurations to sample.
        num_epochs: Epochs each trial trains for.
        gpus_per_trial: GPUs requested per trial and passed to the trainer.
        tracking_uri: MLflow tracking URI handed to ``mlflow.set_tracking_uri``.
        experiment_name: MLflow experiment to log under.
    """
    data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_")
    # Fetch the dataset once up front.
    MNISTDataModule(data_dir=data_dir).prepare_data()

    # Select the MLflow experiment (it is created if it does not exist yet).
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

    mlflow_settings = {
        "experiment_name": experiment_name,
        "tracking_uri": mlflow.get_tracking_uri(),
    }
    search_space = {
        "layer_1": tune.choice([32, 64, 128]),
        "layer_2": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
        "mlflow": mlflow_settings,
        "data_dir": data_dir,
        "num_epochs": num_epochs,
    }

    # Bind the fixed arguments, then attach the per-trial resource request.
    bound_trainable = tune.with_parameters(
        train_mnist_tune,
        data_dir=data_dir,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )
    trial_resources = {"cpu": 1, "gpu": gpus_per_trial}
    resourced_trainable = tune.with_resources(
        bound_trainable, resources=trial_resources
    )

    search_settings = tune.TuneConfig(
        metric="loss",
        mode="min",
        num_samples=num_samples,
    )
    run_settings = air.RunConfig(name="mnist")

    tuner = tune.Tuner(
        resourced_trainable,
        tune_config=search_settings,
        run_config=run_settings,
        param_space=search_space,
    )
    results = tuner.fit()

    best_result = results.get_best_result()
    print("Best hyperparameters found were: ", best_result.config)

# Launch a short search: 8 sampled configurations, 3 epochs each, no GPUs.
tune_mnist(
    num_samples=8,
    num_epochs=3,
    gpus_per_trial=0,
)

  from .autonotebook import tqdm as notebook_tqdm
2022/11/16 19:19:14 INFO mlflow.tracking.fluent: Experiment with name 'mnist' does not exist. Creating a new experiment.
2022-11-16 19:19:16,427	INFO worker.py:1528 -- Started a local Ray instance.


0,1
Current time:,2022-11-16 19:20:47
Running for:,00:01:29.48
Memory:,5.9/15.6 GiB

Trial name,status,loc,batch_size,layer_1,layer_2,lr,iter,total time (s),loss,acc
train_mnist_tune_909a4_00000,TERMINATED,192.168.2.11:31501,64,32,128,0.00279034,3,60.399,0.13781,0.959192
train_mnist_tune_909a4_00001,TERMINATED,192.168.2.11:31533,64,64,128,0.00135698,3,61.412,0.126337,0.9626
train_mnist_tune_909a4_00002,TERMINATED,192.168.2.11:31535,64,64,64,0.000800728,3,60.0352,0.170907,0.949801
train_mnist_tune_909a4_00003,TERMINATED,192.168.2.11:31537,128,128,128,0.0107236,3,48.4096,0.153693,0.957807
train_mnist_tune_909a4_00004,TERMINATED,192.168.2.11:31539,128,128,256,0.00408494,3,49.4064,0.120173,0.967559
train_mnist_tune_909a4_00005,TERMINATED,192.168.2.11:31541,64,64,64,0.0723153,3,60.1377,0.583836,0.852892
train_mnist_tune_909a4_00006,TERMINATED,192.168.2.11:31543,128,128,128,0.000268052,3,48.976,0.236286,0.931322
train_mnist_tune_909a4_00007,TERMINATED,192.168.2.11:31547,32,32,256,0.0133476,3,71.8085,0.255765,0.931833


[2m[36m(ImplicitFunc pid=31501)[0m The git executable must be specified in one of the following ways:
[2m[36m(ImplicitFunc pid=31501)[0m     - be included in your $PATH
[2m[36m(ImplicitFunc pid=31501)[0m     - be set via $GIT_PYTHON_GIT_EXECUTABLE
[2m[36m(ImplicitFunc pid=31501)[0m     - explicitly set via git.refresh()
[2m[36m(ImplicitFunc pid=31501)[0m 
[2m[36m(ImplicitFunc pid=31501)[0m All git commands will error until this is rectified.
[2m[36m(ImplicitFunc pid=31501)[0m 
[2m[36m(ImplicitFunc pid=31501)[0m $GIT_PYTHON_REFRESH environment variable. Use one of the following values:
[2m[36m(ImplicitFunc pid=31501)[0m     - error|e|raise|r|2: for a raised exception
[2m[36m(ImplicitFunc pid=31501)[0m 
[2m[36m(ImplicitFunc pid=31501)[0m Example:
[2m[36m(ImplicitFunc pid=31501)[0m     export GIT_PYTHON_REFRESH=quiet
[2m[36m(ImplicitFunc pid=31501)[0m 
[2m[36m(train_mnist_tune pid=31501)[0m   rank_zero_deprecation(
[2m[36m(train_mnist_tune pid=3

Trial name,acc,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_mnist_tune_909a4_00000,0.959192,2022-11-16_19-20-22,True,,7e66b62dcdc44db0869e6f2914d12da4,"0_batch_size=64,layer_1=32,layer_2=128,lr=0.0028",MaxUbuntuDesktop,3,0.13781,192.168.2.11,31501,60.399,19.8846,60.399,1668626422,0,,3,909a4_00000,0.0162296
train_mnist_tune_909a4_00001,0.9626,2022-11-16_19-20-30,True,,791319d8c9654459a713337c4195f1c7,"1_batch_size=64,layer_1=64,layer_2=128,lr=0.0014",MaxUbuntuDesktop,3,0.126337,192.168.2.11,31533,61.412,19.2996,61.412,1668626430,0,,3,909a4_00001,0.0517948
train_mnist_tune_909a4_00002,0.949801,2022-11-16_19-20-29,True,,8d3fd665b6d64746bad46aca411f4bcc,"2_batch_size=64,layer_1=64,layer_2=64,lr=0.0008",MaxUbuntuDesktop,3,0.170907,192.168.2.11,31535,60.0352,18.8297,60.0352,1668626429,0,,3,909a4_00002,0.0255485
train_mnist_tune_909a4_00003,0.957807,2022-11-16_19-20-17,True,,b4f859cb9f6441b1864e7faadb9fdf89,"3_batch_size=128,layer_1=128,layer_2=128,lr=0.0107",MaxUbuntuDesktop,3,0.153693,192.168.2.11,31537,48.4096,16.3549,48.4096,1668626417,0,,3,909a4_00003,0.047677
train_mnist_tune_909a4_00004,0.967559,2022-11-16_19-20-18,True,,a0f11e4dce514981811817f664bc8ef0,"4_batch_size=128,layer_1=128,layer_2=256,lr=0.0041",MaxUbuntuDesktop,3,0.120173,192.168.2.11,31539,49.4064,17.0806,49.4064,1668626418,0,,3,909a4_00004,0.0365021
train_mnist_tune_909a4_00005,0.852892,2022-11-16_19-20-29,True,,41dcdc392252400d9a5c699569cb714b,"5_batch_size=64,layer_1=64,layer_2=64,lr=0.0723",MaxUbuntuDesktop,3,0.583836,192.168.2.11,31541,60.1377,18.7456,60.1377,1668626429,0,,3,909a4_00005,0.0587533
train_mnist_tune_909a4_00006,0.931322,2022-11-16_19-20-17,True,,3ba2851e7bbb47d1adcacbf56ce12f4a,"6_batch_size=128,layer_1=128,layer_2=128,lr=0.0003",MaxUbuntuDesktop,3,0.236286,192.168.2.11,31543,48.976,16.51,48.976,1668626417,0,,3,909a4_00006,0.0252337
train_mnist_tune_909a4_00007,0.931833,2022-11-16_19-20-41,True,,b93b7c2f418043fbafdd3ccf363ac169,"7_batch_size=32,layer_1=32,layer_2=256,lr=0.0133",MaxUbuntuDesktop,3,0.255765,192.168.2.11,31547,71.8085,13.0043,71.8085,1668626441,0,,3,909a4_00007,0.0456395


2022-11-16 19:20:47,242	INFO tune.py:777 -- Total run time: 89.62 seconds (89.47 seconds for the tuning loop).


Best hyperparameters found were:  {'layer_1': 128, 'layer_2': 256, 'lr': 0.004084936760385664, 'batch_size': 128, 'mlflow': {'experiment_name': 'mnist', 'tracking_uri': 'file:///mlruns'}, 'data_dir': '/tmp/mnist_data_', 'num_epochs': 3}
