## Distributed Deep Learning with Ray Train

### 1. Run distributed DL in interactive mode

In [None]:
from azureml.core import Workspace, Experiment, Environment,ScriptRunConfig
from ray_on_aml.core import Ray_On_AML
import time
# Load the workspace from the local Azure ML config file.
ws = Workspace.from_config()

# Extra libraries for the cluster nodes are requested through the
# additional_pip_packages and additional_conda_packages parameters.
ray_on_aml = Ray_On_AML(
    ws=ws,
    compute_cluster="gpunc6",
    additional_pip_packages=['torch', 'torchvision==0.8.1'],
    maxnode=2,
    exp_name='distributed_dl',
)

# By default ci_is_head=True: the compute instance acts as the head node and
# every node of the remote compute cluster is a worker.
# To use one node of the remote AML compute cluster as the head node and the
# remaining nodes as workers, call ray_on_aml.getRay(ci_is_head=False) instead.
ray = ray_on_aml.getRay(gpu_support=True)

# Fixed pause before querying the cluster (presumably to let the worker nodes
# join -- confirm whether 50 seconds is enough for your cluster).
time.sleep(50)
ray.cluster_resources()

### Train Multi-GPU Distributed PyTorch

In [None]:

import ray.train.torch
from ray import train
from ray.train import Trainer

import torch
import torch.nn as nn
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
from torch.optim import Adam
import numpy as np
from ray.train import Trainer, TrainingCallback
from typing import List, Dict

class PrintingCallback(TrainingCallback):
    """Training callback that prints a fixed marker whenever it is invoked."""

    def handle_result(self, results: List[Dict], **info):
        """Print ``hello``; ``results`` and ``info`` are accepted but not used."""
        # Replace the line below with print(results) to inspect the reported
        # results instead of the placeholder message.
        print('hello')
def train_func(config):
    """Train a one-layer toy regression model inside a Ray Train worker.

    Builds a random dataset, prepares an ``nn.Linear(4, 1)`` model with
    ``ray.train.torch.prepare_model`` and runs ``config["num_epochs"]``
    full-batch Adam steps, saving a checkpoint after every epoch.

    Args:
        config: Mapping that must contain the integer key ``"num_epochs"``.
    """
    n = 100
    # create a toy dataset
    # data   : X - dim = (n, 4)
    # target : Y - dim = (n, 1)
    X = torch.Tensor(np.random.normal(0, 1, size=(n, 4)))
    Y = torch.Tensor(np.random.uniform(0, 1, size=(n, 1)))

    # toy neural network : 1-layer
    # prepare_model sets the model up for distributed training (it is wrapped
    # in DDP when running with multiple workers).
    model = ray.train.torch.prepare_model(nn.Linear(4, 1))

    # Keep the data on whatever device the prepared model lives on, so the
    # same function works for CPU and GPU workers.
    device = next(model.parameters()).device
    X, Y = X.to(device), Y.to(device)

    criterion = nn.MSELoss()
    optimizer = Adam(model.parameters(), lr=3e-4)

    for epoch in range(config["num_epochs"]):
        # Call the module itself rather than .forward() so hooks are run.
        y = model(X)
        # compute loss
        loss = criterion(y, Y)
        # .item() prints the scalar instead of the tensor repr with grad_fn.
        print("epoch ", epoch, " loss ", loss.item())

        # back-propagate loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Checkpoint the underlying model's weights: a DDP wrapper prefixes
        # every key with "module.", which would not load into a plain model.
        # See: https://github.com/ray-project/ray/issues/20915
        if isinstance(model, nn.parallel.DistributedDataParallel):
            unwrapped = model.module
        else:
            unwrapped = model
        state_dict = unwrapped.state_dict()
        train.save_checkpoint(epoch=epoch, model_weights=state_dict)



# NOTE(review): use_gpu is not passed to Trainer, so the workers presumably
# run on CPU even though this section is titled multi-GPU -- confirm and pass
# use_gpu=True if GPU training is intended.
trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
try:
    trainer.run(train_func, config={"num_epochs": 5}, callbacks=[PrintingCallback()])
finally:
    # Always release the training workers, even when the run raises.
    trainer.shutdown()


In [None]:
# Shut down the ray_on_aml session created in the first cell (presumably this
# also releases the remote cluster nodes -- confirm against the ray-on-aml docs).
ray_on_aml.shutdown()

### 2. Run distributed Deep Learning in job mode 

#### Check out the distributed_ml.py, job.yml and conda.yml files. This setup is for Azure ML CLI v2.
https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli

Run the following command to submit the job to your AML workspace:
````az ml job create -f job.yml --resource-group azureml --workspace-name ws01ent````

You can also submit the job with the v1 SDK, as shown below:

In [None]:
from azureml.core import Workspace, Experiment, Environment,ScriptRunConfig
# from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DockerConfiguration,RunConfiguration

# Remember: the AML job must have distributed settings (MPI type) for
# ray-on-aml to work correctly.
ws = Workspace.from_config()
# This can be a different cluster from the one used in interactive mode.
compute_cluster = 'gpunc6'
ray_cluster = ComputeTarget(workspace=ws, name=compute_cluster)

# Run configuration with the OpenMpi communicator, plus Docker settings.
aml_run_config_ml = RunConfiguration(communicator='OpenMpi')
docker_config = DockerConfiguration(use_docker=True, shm_size='48gb')

# Environment built from the local conda specification, on a CUDA/OpenMPI base image.
rayEnv = Environment.from_conda_specification(
    name="RLEnv",
    file_path="conda.yml",
)
rayEnv.docker.base_image = (
    "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu18.04:20220412.v1"
)

# Attach the target cluster, node count, environment and Docker settings.
aml_run_config_ml.target = ray_cluster
aml_run_config_ml.node_count = 2
aml_run_config_ml.environment = rayEnv
aml_run_config_ml.docker = docker_config

# Package the training script from the current directory and submit it.
src = ScriptRunConfig(
    source_directory='.',
    script='distributed_ml.py',
    run_config=aml_run_config_ml,
)

run = Experiment(ws, "distributed_ml").submit(src)


In [None]:
from azureml.widgets import RunDetails
# Show the azureml.widgets RunDetails view for the `run` object returned by
# the job submission cell.
RunDetails(run).show()