Skip to content

Commit

Permalink
fix MLFLOW example (#2575)
Browse files Browse the repository at this point in the history
Co-authored-by: Yuan-Ting Hsieh (謝沅廷) <yuantingh@nvidia.com>
  • Loading branch information
chesterxgchen and YuanTingHsieh committed May 11, 2024
1 parent 297a079 commit 27726f1
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 4 deletions.
Expand Up @@ -48,6 +48,7 @@
"id": "mlflow_receiver_with_tracking_uri",
"path": "nvflare.app_opt.tracking.mlflow.mlflow_receiver.MLflowReceiver",
"args": {
tracking_uri = "file:///{WORKSPACE}/{JOB_ID}/mlruns"
"kwargs": {
"experiment_name": "hello-pt-experiment",
"run_name": "hello-pt-with-mlflow",
Expand Down
Expand Up @@ -139,7 +139,7 @@ def evaluate(input_weights):
running_loss += loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print(f"({client_id}) [{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
global_step = input_model.current_round * local_epochs * batch_size + epoch * batch_size + i
global_step = input_model.current_round * steps + epoch * len(trainloader) + i
mlflow.log_metric("loss", running_loss / 2000, global_step)
running_loss = 0.0

Expand Down
46 changes: 44 additions & 2 deletions examples/hello-world/step-by-step/cifar10/sag/sag.ipynb
Expand Up @@ -232,8 +232,8 @@
"source": [
"! nvflare job create -j /tmp/nvflare/jobs/cifar10_sag_pt -w sag_pt_in_proc \\\n",
"-f meta.conf min_clients=2 \\\n",
"-f config_fed_client.conf app_script=train.py app_config=\"--batch_size 4 --dataset_path {CIFAR10_ROOT} --num_workers 2\" \\\n",
"-f config_fed_server.conf num_rounds=5 \\\n",
"-f config_fed_client.conf app_script=train_with_mlflow.py app_config=\"--batch_size 4 --dataset_path {CIFAR10_ROOT} --num_workers 2\" \\\n",
"-f config_fed_server.conf num_rounds=2 \\\n",
"-sd ../code/fl \\\n",
"-force"
]
Expand Down Expand Up @@ -289,6 +289,48 @@
"The next 5 examples will use the same ScatterAndGather workflow, but will demonstrate different execution APIs and features.\n",
"In the next example [sag_deploy_map](../sag_deploy_map/sag_deploy_map.ipynb), we will learn about the deploy_map configuration for deployment of apps to different sites."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a49b430b-a65b-4b1e-8793-9b3befcfcfd9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"!tree /tmp/nvflare/jobs/cifar10_sag_pt_workspace/"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "50594df7-b4c9-4e5e-944a-403b5a105c27",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"!mlflow ui --port 5000 --backend-store-uri /tmp/nvflare/jobs/cifar10_sag_pt_workspace/server/simulate_job/mlruns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af2b6628-61af-4bc8-84d4-a9876a27c7c2",
"metadata": {},
"outputs": [],
"source": [
"!tensorboard --logdir=/tmp/nvflare/jobs/cifar10_sag_pt_workspace/server/simulate_job/tb_events"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3ad11c3-6ef7-46cd-8778-0090505b14e1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion job_templates/sag_pt_in_proc/config_fed_server.conf
Expand Up @@ -107,7 +107,7 @@
path = "nvflare.app_opt.tracking.mlflow.mlflow_receiver.MLflowReceiver"
args {
# tracking_uri = "http://0.0.0.0:5000"
tracking_uri = ""
tracking_uri = "file:///{WORKSPACE}/{JOB_ID}/mlruns"
kwargs {
experiment_name = "nvflare-sag-pt-experiment"
run_name = "nvflare-sag-pt-with-mlflow"
Expand Down

0 comments on commit 27726f1

Please sign in to comment.