# DONUT TRAINING

## 1. Prepare workspace, environment, and cluster

In [None]:
from azure.ai.ml.entities import AmlCompute
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml.entities import Environment
from azure.ai.ml import command
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes

Prepare the workspace

In [1]:
# Pick a credential: try the default credential chain first and fall back to
# an interactive browser login if it cannot be used.
try:
    credential = DefaultAzureCredential()
    # Check that the credential can actually obtain a token.
    credential.get_token("https://management.azure.com/.default")
except Exception:
    # Deliberately broad: any failure of DefaultAzureCredential triggers the
    # fallback to InteractiveBrowserCredential.
    credential = InteractiveBrowserCredential()

# Get a handle to the workspace
ml_client = MLClient(
    credential=credential,
    subscription_id="",  # enter credentials in Step 5
    resource_group_name="",  # enter credentials in Step 5
    workspace_name="",  # enter credentials in Step 5
)

Create the environment to be used later by the compute cluster

In [None]:
# Environment definition: a Docker base image combined with the conda
# specification stored in environment.yaml.
env_docker_conda = Environment(
    name="donut",
    description="Donut",
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04",
    conda_file="environment.yaml",
)

# Register (or update) the environment in the workspace; the job cell refers
# to it by name as "donut@latest".
ml_client.environments.create_or_update(env_docker_conda)

Create the compute resource. Change the values of `min_instances` and `max_instances` if you want a cluster of a different size.

In [None]:
# Name of the GPU cluster; used both to look up an existing cluster and to
# create a new one, so the two can never drift apart.
gpu_compute_target = "gpu-cluster"

try:
    # Reuse the compute target if it already exists in the workspace.
    gpu_cluster = ml_client.compute.get(gpu_compute_target)
    print(
        f"You already have a cluster named {gpu_compute_target}, we'll reuse it as is."
    )

except Exception:
    # The lookup failed, so the cluster is created from scratch.
    print("Creating a new gpu compute target...")

    gpu_cluster = AmlCompute(
        name=gpu_compute_target,
        type="amlcompute",
        size="STANDARD_NC6",  # 1 x NVIDIA Tesla K80 GPU per node
        min_instances=0,
        max_instances=4,
        idle_time_before_scale_down=120,  # idle time before scale-down (seconds, per the AmlCompute docs)
        tier="Dedicated",
    )

    # begin_create_or_update starts a long-running operation and returns a
    # poller; .result() waits for completion and returns the compute entity,
    # which is what the print below needs (.name / .size).
    gpu_cluster = ml_client.begin_create_or_update(gpu_cluster).result()

print(
    f"AMLCompute with name {gpu_cluster.name} is created, the compute size is {gpu_cluster.size}"
)

## 2. Create and submit job

In [2]:
# Define the training job: run train.py on the GPU cluster with the PyTorch
# distribution, 4 instances and 1 process per instance.
job = command(
    display_name="main_job",
    experiment_name="mrz",
    # Location of the source code.
    code="./",
    command="python train.py --dataset_name_or_paths ${{inputs.data}} --config ${{inputs.config}} --exp_name ${{inputs.exp_name}} ",
    inputs={
        "data": Input(
            type="uri_folder",
            path="",  # replace with the data URI from step 4
            mode="ro_mount",
        ),
        # This file contains the hyperparameters and configuration for the training.
        "config": "config/train_passport_terminal.yaml",
        # Experiment name passed to train.py; change it as you like.
        "exp_name": "mrz",
    },
    environment="donut@latest",  # the environment created above
    compute="gpu-cluster",  # the compute cluster created above
    instance_count=4,
    distribution={
        "type": "PyTorch",
        "process_count_per_instance": 1,  # number of GPUs per node
    },
)

In [None]:
After the job is submitted, click the link in the cell output to open its page under "Jobs" in Azure Machine Learning studio.

In [3]:
# Submit the job defined above to the workspace. As the last expression in the
# cell, the returned job is displayed as the output, including a link to its
# details page in Azure Machine Learning studio.
ml_client.create_or_update(job)

Your file exceeds 100 MB. If you experience low speeds, latency, or broken connections, we recommend using the AzCopyv10 tool for this file transfer.

Example: azcopy copy '/mnt/batch/tasks/shared/LS_root/mounts/clusters/script-launch/code/Users/thithuyduyen.pham/donut_training/donut' 'https://safetravel.blob.core.windows.net/azureml-blobstore-61ed3287-b3b7-4ed7-8306-4e23b936201e/LocalUpload/49b805722c10f089f08ac1c4b979aba3/donut' 

See https://docs.microsoft.com/azure/storage/common/storage-use-azcopy-v10 for more information.
[32mUploading donut (143.05 MBs): 100%|██████████| 143052127/143052127 [00:09<00:00, 14928318.03it/s]
[39m



Experiment,Name,Type,Status,Details Page
mrz,honest_stamp_5th95nc8zj,command,Starting,Link to Azure Machine Learning studio
