In [1]:
# Check core SDK version number
import azureml.core

print("SDK version:", azureml.core.VERSION)

SDK version: 1.36.0


In [2]:
from azureml.telemetry import set_diagnostics_collection

set_diagnostics_collection(send_diagnostics=True)

Turning diagnostics collection on. 


In [3]:
from azureml.core.workspace import Workspace

ws = Workspace.from_config()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep='\n')

Workspace name: pytorchml
Azure region: westus2
Subscription id: 68fed1ed-5c6f-4d1d-995e-b7918c1cb057
Resource group: mod8rg


In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# choose a name for your cluster
cluster_name = "hp-cluster"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target.')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', 
                                                           max_nodes=1)

    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)

# use get_status() to get a detailed status for the current cluster. 
print(compute_target.get_status().serialize())

Creating a new compute target...
InProgress.
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Resizing', 'allocationStateTransitionTime': '2022-01-11T23:56:35.927000+00:00', 'errors': None, 'creationTime': '2022-01-11T23:56:34.918646+00:00', 'modifiedTime': '2022-01-11T23:56:39.132793+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 1, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS11_V2'}


In [5]:
#create project directory
import os

project_folder = './pytorch-birds'
os.makedirs(project_folder, exist_ok=True)

In [6]:
import shutil

shutil.copy('pytorch_train.py', project_folder)

'./pytorch-birds/pytorch_train.py'

In [7]:
from azureml.core import Experiment

experiment_name = 'pytorch-birds'
experiment = Experiment(ws, name=experiment_name)

In [8]:
%%writefile conda_dependencies.yml

channels:
- conda-forge
dependencies:
- python=3.6.2
- pip:
  - azureml-defaults
  - torch==1.6.0
  - torchvision==0.7.0
  - future==0.17.1
  - pillow

Overwriting conda_dependencies.yml


In [9]:
from azureml.core import Environment

pytorch_env = Environment.from_conda_specification(name = 'pytorch-1.6-gpu', file_path = './conda_dependencies.yml')

# Specify a GPU base image
pytorch_env.docker.enabled = True
pytorch_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04'

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


In [10]:
from azureml.core import ScriptRunConfig

src = ScriptRunConfig(source_directory=project_folder,
                      script='pytorch_train.py',
                      arguments=['--num_epochs', 30, '--output_dir', './outputs'],
                      compute_target=compute_target,
                      environment=pytorch_env)

In [11]:
run = experiment.submit(src)
print(run)

Run(Experiment: pytorch-birds,
Id: pytorch-birds_1641945401_f38a5ce1,
Type: azureml.scriptrun,
Status: Starting)


In [12]:
# to get more details of your run
print(run.get_details())

{'runId': 'pytorch-birds_1641945401_f38a5ce1', 'target': 'hp-cluster', 'status': 'Preparing', 'services': {}, 'properties': {'_azureml.ComputeTargetType': 'amlcompute', 'ContentSnapshotId': '336fb628-a18b-4993-992d-aa1cbc8b2836'}, 'inputDatasets': [], 'outputDatasets': [], 'runDefinition': {'script': 'pytorch_train.py', 'command': '', 'useAbsolutePath': False, 'arguments': ['--num_epochs', '30', '--output_dir', './outputs'], 'sourceDirectoryDataStore': None, 'framework': 'Python', 'communicator': 'None', 'target': 'hp-cluster', 'dataReferences': {}, 'data': {}, 'outputData': {}, 'datacaches': [], 'jobName': None, 'maxRunDurationSeconds': 2592000, 'nodeCount': 1, 'instanceTypes': [], 'priority': None, 'credentialPassthrough': False, 'identity': None, 'environment': {'name': 'pytorch-1.6-gpu', 'version': 'Autosave_2022-01-11T13:15:06Z_9da23315', 'python': {'interpreterPath': 'python', 'userManagedDependencies': False, 'condaDependencies': {'channels': ['conda-forge'], 'dependencies': ['p

In [13]:
#monitor your run
from azureml.widgets import RunDetails

RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…

In [14]:
run.wait_for_completion(show_output=True)

RunId: pytorch-birds_1641945401_f38a5ce1
Web View: https://ml.azure.com/runs/pytorch-birds_1641945401_f38a5ce1?wsid=/subscriptions/68fed1ed-5c6f-4d1d-995e-b7918c1cb057/resourcegroups/mod8rg/workspaces/pytorchml&tid=7058e5f5-ec9d-453b-85ca-6635eb7ccd32


In [None]:
#tune model hyperparameters
#start a hyperparameter sweep

from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, uniform, PrimaryMetricGoal

param_sampling = RandomParameterSampling( {
        'learning_rate': uniform(0.0005, 0.005),
        'momentum': uniform(0.9, 0.99)
    }
)

early_termination_policy = BanditPolicy(slack_factor=0.15, evaluation_interval=1, delay_evaluation=10)

hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=param_sampling, 
                                     policy=early_termination_policy,
                                     primary_metric_name='best_val_acc',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=8,
                                     max_concurrent_runs=4)

In [None]:
# start the HyperDrive run
hyperdrive_run = experiment.submit(hyperdrive_config)

In [None]:
#monitor hyperdrive runs

RunDetails(hyperdrive_run).show()

In [None]:
#block until the HyperDrive sweep has completed:

hyperdrive_run.wait_for_completion(show_output=True)

In [None]:
assert(hyperdrive_run.get_status() == "Completed")

In [None]:
#Warm start a Hyperparameter Tuning experiment and resuming child runs
#Find and register the best model


best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
print(best_run)

In [None]:
print('Best Run is:\n  Validation accuracy: {0:.5f} \n  Learning rate: {1:.5f} \n  Momentum: {2:.5f}'.format(
        best_run_metrics['best_val_acc'][-1],
        best_run_metrics['lr'],
        best_run_metrics['momentum'])
     )

In [None]:
model = best_run.register_model(model_name = 'pytorch-birds', model_path = 'outputs/model.pt')
print(model.name, model.id, model.version, sep = '\t')

In [None]:
#Deploy model as web service
#Deploy to ACI container


from azureml.core.webservice import AciWebservice
from azureml.core.model import InferenceConfig
from azureml.core.model import Model

inference_config = InferenceConfig(entry_script="pytorch_score.py", environment=pytorch_env)

aciconfig = AciWebservice.deploy_configuration(cpu_cores=2, 
                                               memory_gb=2, 
                                               tags={'data': 'birds',  'method':'transfer learning', 'framework':'pytorch'},
                                               description='Classify turkey/chickens using transfer learning with PyTorch')

service = Model.deploy(workspace=ws, 
                           name='aci-birds', 
                           models=[model], 
                           inference_config=inference_config, 
                           deployment_config=aciconfig)
service.wait_for_deployment(True)
print(service.state)

In [None]:
service.get_logs()

In [None]:
print(service.scoring_uri)

In [None]:
#Test the web service

import json
from PIL import Image
import matplotlib.pyplot as plt

%matplotlib inline
plt.imshow(Image.open('test_img.jpg'))

In [None]:
import torch
from torchvision import transforms
    
def preprocess(image_file):
    """Preprocess the input image."""
    data_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    image = Image.open(image_file)
    image = data_transforms(image).float()
    image = torch.tensor(image)
    image = image.unsqueeze(0)
    return image.numpy()

In [None]:
input_data = preprocess('test_img.jpg')
result = service.run(input_data=json.dumps({'data': input_data.tolist()}))
print(result)

In [None]:
#Clean up

service.delete()