diff --git a/examples/advanced/experiment-tracking/mlflow/jobs/hello-pt-mlflow/app/config/config_fed_server.conf b/examples/advanced/experiment-tracking/mlflow/jobs/hello-pt-mlflow/app/config/config_fed_server.conf index 5b0f694a6a..9bc187c8ab 100644 --- a/examples/advanced/experiment-tracking/mlflow/jobs/hello-pt-mlflow/app/config/config_fed_server.conf +++ b/examples/advanced/experiment-tracking/mlflow/jobs/hello-pt-mlflow/app/config/config_fed_server.conf @@ -48,7 +48,6 @@ "id": "mlflow_receiver_with_tracking_uri", "path": "nvflare.app_opt.tracking.mlflow.mlflow_receiver.MLflowReceiver", "args": { - tracking_uri = "file:///{WORKSPACE}/{JOB_ID}/mlruns" "kwargs": { "experiment_name": "hello-pt-experiment", "run_name": "hello-pt-with-mlflow", diff --git a/examples/hello-world/step-by-step/cifar10/code/fl/train_with_mlflow.py b/examples/hello-world/step-by-step/cifar10/code/fl/train_with_mlflow.py index 3e05f715b2..1d43b88d4d 100644 --- a/examples/hello-world/step-by-step/cifar10/code/fl/train_with_mlflow.py +++ b/examples/hello-world/step-by-step/cifar10/code/fl/train_with_mlflow.py @@ -139,7 +139,7 @@ def evaluate(input_weights): running_loss += loss.item() if i % 2000 == 1999: # print every 2000 mini-batches print(f"({client_id}) [{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}") - global_step = input_model.current_round * steps + epoch * len(trainloader) + i + global_step = input_model.current_round * local_epochs * batch_size + epoch * batch_size + i mlflow.log_metric("loss", running_loss / 2000, global_step) running_loss = 0.0 diff --git a/examples/hello-world/step-by-step/cifar10/sag/sag.ipynb b/examples/hello-world/step-by-step/cifar10/sag/sag.ipynb index 6d8b8ca76e..dbd515b0f0 100644 --- a/examples/hello-world/step-by-step/cifar10/sag/sag.ipynb +++ b/examples/hello-world/step-by-step/cifar10/sag/sag.ipynb @@ -232,8 +232,8 @@ "source": [ "! nvflare job create -j /tmp/nvflare/jobs/cifar10_sag_pt -w sag_pt_in_proc \\\n", "-f meta.conf min_clients=2 \\\n", - "-f config_fed_client.conf app_script=train_with_mlflow.py app_config=\"--batch_size 4 --dataset_path {CIFAR10_ROOT} --num_workers 2\" \\\n", - "-f config_fed_server.conf num_rounds=2 \\\n", + "-f config_fed_client.conf app_script=train.py app_config=\"--batch_size 4 --dataset_path {CIFAR10_ROOT} --num_workers 2\" \\\n", + "-f config_fed_server.conf num_rounds=5 \\\n", "-sd ../code/fl \\\n", "-force" ] @@ -289,48 +289,6 @@ "The next 5 examples will use the same ScatterAndGather workflow, but will demonstrate different execution APIs and feature.\n", "In the next example [sag_deploy_map](../sag_deploy_map/sag_deploy_map.ipynb), we will learn about the deploy_map configuration for deployment of apps to different sites." ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a49b430b-a65b-4b1e-8793-9b3befcfcfd9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!tree /tmp/nvflare/jobs/cifar10_sag_pt_workspace/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50594df7-b4c9-4e5e-944a-403b5a105c27", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!mlflow ui --port 5000 --backend-store-uri /tmp/nvflare/jobs/cifar10_sag_pt_workspace/server/simulate_job/mlruns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af2b6628-61af-4bc8-84d4-a9876a27c7c2", - "metadata": {}, - "outputs": [], - "source": [ - "!tensorboard --logdir=/tmp/nvflare/jobs/cifar10_sag_pt_workspace/server/simulate_job/tb_events" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3ad11c3-6ef7-46cd-8778-0090505b14e1", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/job_templates/sag_pt_in_proc/config_fed_server.conf b/job_templates/sag_pt_in_proc/config_fed_server.conf index deb678189f..ab5691c4b7 100644 --- a/job_templates/sag_pt_in_proc/config_fed_server.conf +++ b/job_templates/sag_pt_in_proc/config_fed_server.conf @@ -107,7 +107,7 @@ path = "nvflare.app_opt.tracking.mlflow.mlflow_receiver.MLflowReceiver" args { # tracking_uri = "http://0.0.0.0:5000" - tracking_uri = "file:///{WORKSPACE}/{JOB_ID}/mlruns" + tracking_uri = "" kwargs { experiment_name = "nvflare-sag-pt-experiment" run_name = "nvflare-sag-pt-with-mlflow"