# DatabricksのclusterをAzure Machine Learning (AML) にAttachする

In [11]:
import os

from azureml.core import Workspace
from azureml.core.compute import DatabricksCompute, ComputeTarget

# Load the AzureML workspace described by the local "./config" path.
ws = Workspace.from_config("./config")

# Databricks workspace to attach, and the name the compute target gets in AzureML.
db_resource_group = "dp100"
db_workspace_name = "Databricks01"
db_compute_name = "mydbcluster001"

# The access token is created from User Settings in Databricks.
# It is a credential, so it is read from the DATABRICKS_ACCESS_TOKEN
# environment variable instead of being stored in the notebook.
# NOTE(review): a literal token used to be committed on this line; it should
# be revoked in Databricks and replaced with a newly issued one.
db_access_token = os.environ.get("DATABRICKS_ACCESS_TOKEN")

if db_compute_name not in ws.compute_targets:
    # The token is only needed for a first-time attach, so it is validated
    # here rather than at the top of the cell.
    if not db_access_token:
        raise RuntimeError(
            "Set the DATABRICKS_ACCESS_TOKEN environment variable before "
            "attaching the Databricks workspace.")

    attach_config = DatabricksCompute.attach_configuration(
        resource_group=db_resource_group,
        workspace_name=db_workspace_name,
        access_token=db_access_token)

    db_cluster = ComputeTarget.attach(ws, db_compute_name, attach_config)
    db_cluster.wait_for_completion(show_output=True)

else:
    # Already attached: reuse the existing compute target.
    db_cluster = ws.compute_targets[db_compute_name]

# パイプライン実行に必要な設定を行う（Workspace・Environment・Compute・データ参照）

In [20]:
# ------------------------------------------------------
# Workspace handle used by every later section of this cell
# (the DatabricksStep is run as an AzureML Pipeline step)
# ------------------------------------------------------
from azureml.core import Workspace

# Load the workspace from the local "./config" path
ws = Workspace.from_config(path="./config")


# -----------------------------------------------------------------
# Custom environment: conda packages, registered in the workspace
# -----------------------------------------------------------------
from azureml.core import Environment
from azureml.core.environment import CondaDependencies

# Conda dependencies installed into the environment
myenv_dep = CondaDependencies.create(
    conda_packages=[
        'scikit-learn==0.22.1',
        'joblib',
        'pandas',
    ]
)

# Environment definition that carries those dependencies
myenv = Environment(name="MyEnvironment")
myenv.python.conda_dependencies = myenv_dep

# Register the environment in the workspace
myenv.register(ws)



# -----------------------------------------------------------------
# Compute cluster for the pipeline: reuse it if present, else create it
# -----------------------------------------------------------------
from azureml.core.compute import AmlCompute

cluster_name = "pipeline-cluster"

print("Accessing the compute cluster...")

if cluster_name in ws.compute_targets:
    # A compute target with this name is already in the workspace.
    compute_cluster = ws.compute_targets[cluster_name]
    print(cluster_name, ", compute cluster found. Using it...")
else:
    print("Creating the compute cluster with name: ", cluster_name)

    # Provisioning configuration: VM size and an upper bound of two nodes.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D11_V2",
        max_nodes=2,
    )

    compute_cluster = AmlCompute.create(ws, cluster_name, compute_config)
    compute_cluster.wait_for_completion()


# -----------------------------------------------------------------
# Run configuration shared by the script steps
# -----------------------------------------------------------------
from azureml.core.runconfig import RunConfiguration

run_config = RunConfiguration()

# Use the custom environment, and run on the pipeline compute cluster.
run_config.environment = myenv
run_config.target = compute_cluster



# -----------------------------------------------------------------
# Attach the Databricks Cluster as an attached compute target
# -----------------------------------------------------------------
import os

from azureml.core.compute    import DatabricksCompute
from azureml.core.compute    import ComputeTarget

# Initialize the attach config parameters
db_resource_group = "dp100"
db_workspace_name = "Databricks01"
db_compute_name = "mydbcluster001"

# The access token is created from User Settings in Databricks.
# It is a credential, so it is read from the DATABRICKS_ACCESS_TOKEN
# environment variable instead of being stored in the notebook.
# NOTE(review): a literal token used to be committed on this line; it should
# be revoked in Databricks and replaced with a newly issued one.
db_access_token = os.environ.get("DATABRICKS_ACCESS_TOKEN")


# Attach the Databricks compute target
if db_compute_name not in ws.compute_targets:
    # The token is only needed for a first-time attach, so it is validated
    # here rather than at the top of the section.
    if not db_access_token:
        raise RuntimeError(
            "Set the DATABRICKS_ACCESS_TOKEN environment variable before "
            "attaching the Databricks workspace.")

    print("Creating attach config for Databricks...")
    attach_config = DatabricksCompute.attach_configuration(
                            resource_group = db_resource_group,
                            workspace_name = db_workspace_name,
                            access_token = db_access_token)

    print("Attaching Databricks Cluster to AzureML workspace..")
    db_cluster = ComputeTarget.attach(ws,
                                      db_compute_name,
                                      attach_config)

    db_cluster.wait_for_completion(show_output=True)

else:
    # Already attached: reuse the existing compute target.
    print('Compute target already exists')
    db_cluster = ws.compute_targets[db_compute_name]


# -----------------------------------------------------------------
# Create/pass data reference of Input and Output
# -----------------------------------------------------------------
from azureml.data.data_reference import DataReference
from azureml.pipeline.core   import PipelineData

# Look the datastore up by name.
# (ws.get_default_datastore() would select the workspace default instead.)
datastore_name = 'azure_sdk_blob01'
data_store = ws.datastores.get(datastore_name)

# dict.get() yields None for an unknown name; stop here with a clear message
# instead of passing None on to the data references below.
if data_store is None:
    raise ValueError(
        "Datastore '{}' is not registered in the workspace.".format(datastore_name))

# Input: a reference into the datastore, named 'input'
input_data = DataReference(datastore = data_store,
                           data_reference_name = 'input')

# Output: pipeline data named 'testdata', backed by the same datastore
output_data1 = PipelineData('testdata', datastore=data_store)

Accessing the compute cluster...
pipeline-cluster , compute cluster found. Using it...
Compute target already exists


# DatabricksStepを作成する

In [21]:
# Build the Databricks step of the pipeline
from azureml.pipeline.steps import DatabricksStep
from azureml.core.databricks import PyPiLibrary

# The notebook path can be looked up in Databricks => Cluster => Notebook
notebook_path = r"/Users/nakamukaiya@gmail.com/demo001"

# PyPI libraries passed to the step
scikit_learn = PyPiLibrary(package='scikit-learn==0.22.1')
joblib = PyPiLibrary(package='joblib')

db_step01 = DatabricksStep(
    name="db_step01",
    inputs=[input_data],
    outputs=[output_data1],
    num_workers=1,
    notebook_path=notebook_path,
    run_name="db_notebook_demo",
    compute_target=db_cluster,
    pypi_libraries=[scikit_learn, joblib],
    allow_reuse=False,  # set to True to allow a previous run to be reused
)

In [22]:
# The Databricks step performs the AI prediction and saves the result to storage;
# the eval step then reads that result and evaluates it.
# -----------------------------------------------------------------
# Pipeline step that runs the evaluation Python script
# -----------------------------------------------------------------
from azureml.pipeline.steps import PythonScriptStep

eval_step = PythonScriptStep(
    name='Evaluate',
    source_directory='./script',
    script_name='630 - Evaluate.py',
    inputs=[output_data1],
    runconfig=run_config,
    # The Databricks step's output is handed to the script as --testdata
    arguments=['--testdata', output_data1],
)

# -----------------------------------------------------------------
# Assemble the pipeline, submit it as an experiment run, and wait
# -----------------------------------------------------------------
from azureml.pipeline.core import Pipeline
from azureml.core import Experiment

# Databricks step first, evaluation step second
steps = [db_step01, eval_step]
new_pipeline = Pipeline(workspace=ws, steps=steps)

# Submit under the 'DB_Notebook_exp001' experiment
experiment = Experiment(ws, 'DB_Notebook_exp001')
new_pipeline_run = experiment.submit(new_pipeline)

# Block until the run finishes, streaming its output
new_pipeline_run.wait_for_completion(show_output=True)

Created step db_step01 [4549581a][9d56fde7-1c2b-4095-a4e7-ea2ef981f5e3], (This step will run and generate new outputs)
Created step Evaluate [4cdb7e87][a444d72f-1eca-4210-b2fc-343472bfd354], (This step is eligible to reuse a previous run's output)
Using data reference input for StepId [e4b09270][5b0dfa0b-94df-4882-add4-43770e66ecb3], (Consumers of this data are eligible to reuse prior runs.)
Submitted PipelineRun 6a918073-2a09-49cd-abfc-910e3a68651c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/6a918073-2a09-49cd-abfc-910e3a68651c?wsid=/subscriptions/3467f739-a57b-4612-9de8-72a6616c01b3/resourcegroups/AzuremlSDKRG00/workspaces/Azureml-SDK-WS01&tid=bcd8db96-8bb9-4f0d-af35-e471bf92c072
PipelineRunId: 6a918073-2a09-49cd-abfc-910e3a68651c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/6a918073-2a09-49cd-abfc-910e3a68651c?wsid=/subscriptions/3467f739-a57b-4612-9de8-72a6616c01b3/resourcegroups/AzuremlSDKRG00/workspaces/Azureml-SDK-WS01&tid=bcd8db96-8bb9-4

Collecting package metadata (repodata.json): ...working... 
done
Solving environment: ...working... 
done

Downloading and Extracting Packages

scikit-learn-0.22.1  | 7.3 MB    |            |   0% 
scikit-learn-0.22.1  | 7.3 MB    |            |   0% 
scikit-learn-0.22.1  | 7.3 MB    | 2          |   2% 
scikit-learn-0.22.1  | 7.3 MB    | 4          |   5% 
scikit-learn-0.22.1  | 7.3 MB    | 7          |   7% 
scikit-learn-0.22.1  | 7.3 MB    | #2         |  12% 
scikit-learn-0.22.1  | 7.3 MB    | #8         |  19% 
scikit-learn-0.22.1  | 7.3 MB    | ###        |  30% 
scikit-learn-0.22.1  | 7.3 MB    | ####4      |  45% 
scikit-learn-0.22.1  | 7.3 MB    | #####9     |  59% 
scikit-learn-0.22.1  | 7.3 MB    | #########  |  91% 
scikit-learn-0.22.1  | 7.3 MB    | ########## | 100% 

blas-1.0             | 6 KB      |            |   0% 
blas-1.0             | 6 KB      | ########## | 100% 

zlib-1.2.12          | 130 KB    |            |   0% 
zlib-1.2.12          | 130 KB    | #########


done
[91m

  current version: 4.11.0
  latest version: 22.9.0

Please update conda by running

    $ conda update -n base -c defaults conda


[0m#
# To activate this environment, use
#
#     $ conda activate /azureml-envs/azureml_5facbb09c99e77ed568632b31e323952
#
# To deactivate an active environment, use
#
#     $ conda deactivate

Removing intermediate container 607dd6b4d817
 ---> 3bb469ac5de7
Step 9/21 : ENV PATH /azureml-envs/azureml_5facbb09c99e77ed568632b31e323952/bin:$PATH
 ---> Running in 5e526332c7e4
Removing intermediate container 5e526332c7e4
 ---> 139c64903dac
Step 10/21 : COPY azureml-environment-setup/send_conda_dependencies.py azureml-environment-setup/send_conda_dependencies.py
 ---> ca5d0975ab3a
Step 11/21 : RUN echo "Copying environment context"
 ---> Running in 9fca6ea6093e
Copying environment context
Removing intermediate container 9fca6ea6093e
 ---> a74b1df8b741
Step 12/21 : COPY azureml-environment-setup/environment_context.json azureml-environment-setup/envir

Run ID: cea was successful after 5m25s


Unable to stream download: ("Connection broken: ConnectionResetError(10054, '既存の接続はリモート ホストに強制的に切断されました。', None, 10054, None)", ConnectionResetError(10054, '既存の接続はリモート ホストに強制的に切断されました。', None, 10054, None))



StepRun(Evaluate) Execution Summary
StepRun( Evaluate ) Status: Finished
{'runId': '22de851a-d558-47e2-847b-e066855b1518', 'target': 'pipeline-cluster', 'status': 'Completed', 'startTimeUtc': '2022-10-08T09:17:42.214714Z', 'endTimeUtc': '2022-10-08T09:19:28.579657Z', 'services': {}, 'properties': {'ContentSnapshotId': '91a371df-af0a-4894-986b-953a5e3ab8de', 'StepType': 'PythonScriptStep', 'ComputeTargetType': 'AmlCompute', 'azureml.moduleid': 'a444d72f-1eca-4210-b2fc-343472bfd354', 'azureml.moduleName': 'Evaluate', 'azureml.runsource': 'azureml.StepRun', 'azureml.nodeid': '4cdb7e87', 'azureml.pipelinerunid': '6a918073-2a09-49cd-abfc-910e3a68651c', 'azureml.pipeline': '6a918073-2a09-49cd-abfc-910e3a68651c', 'azureml.pipelineComponent': 'masterescloud', '_azureml.ComputeTargetType': 'amlctrain', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json'}, 'inputDatasets': [], 'outputDatasets': [], 'runDefinition': {'script': '630 - Evalu



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '6a918073-2a09-49cd-abfc-910e3a68651c', 'status': 'Completed', 'startTimeUtc': '2022-10-08T09:01:36.080197Z', 'endTimeUtc': '2022-10-08T09:19:29.613995Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineComponent': 'pipelinerun'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://azuremlsstorage74437b7d7.blob.core.windows.net/azureml/ExperimentRun/dcid.6a918073-2a09-49cd-abfc-910e3a68651c/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=sUTL3ueHk6hGpI6dvUwIQ0CyreRAgIamCVSPVvp9gqw%3D&skoid=285719f7-ab93-4fe7-a7ba-e7b47a2d53bd&sktid=bcd8db96-8bb9-4f0d-af35-e471bf92c072&skt=2022-10-08T05%3A06%3A45Z&ske=2022-10-09T13%3A16%3A45Z&sks=b&skv=2019-07-07&st=2022-10-08T

'Finished'

In [23]:
# The output of the Databricks step is stored in a folder named after the run ID, which can be looked up from the Databricks run
# --AZUREML_RUN_ID 5192c843-7a9c-4c25-be32-8e48da275e39