In [1]:
import azureml.core
from azureml.core import Workspace
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.36.0 to work with diabetesml


In [2]:
from azureml.core import Dataset

default_ds = ws.get_default_datastore()

if 'diabetes dataset' not in ws.datasets:
    default_ds.upload_files(files=['./data/diabetes.csv', './data/diabetes2.csv'], # Upload the diabetes csv files in /data
                        target_path='diabetes-data/', # Put it in a folder path in the datastore
                        overwrite=True, # Replace existing files of the same name
                        show_progress=True)

    #Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws, 
                                name='diabetes dataset',
                                description='diabetes data',
                                tags = {'format':'CSV'},
                                create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

Dataset already registered.


In [3]:
import os
# Create a folder for the pipeline step files
experiment_folder = 'diabetes_pipeline'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

diabetes_pipeline


In [4]:
%%writefile $experiment_folder/prep_diabetes.py
# Import libraries
import os
import argparse
import pandas as pd
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
diabetes = run.input_datasets['raw_data'].to_pandas_dataframe()

# Log raw row count
row_count = (len(diabetes))
run.log('raw_rows', row_count)

# remove nulls
diabetes = diabetes.dropna()

# Normalize the numeric columns
scaler = MinMaxScaler()
num_cols = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']
diabetes[num_cols] = scaler.fit_transform(diabetes[num_cols])

# Log processed rows
row_count = (len(diabetes))
run.log('processed_rows', row_count)

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
diabetes.to_csv(save_path, index=False, header=True)

# End the run
run.complete()

Overwriting diabetes_pipeline/prep_diabetes.py


In [5]:
%%writefile $experiment_folder/train_diabetes.py
# Import libraries
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-folder", type=str, dest='training_folder', help='training data folder')
args = parser.parse_args()
training_folder = args.training_folder

# Get the experiment run context
run = Run.get_context()

# load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_folder,'data.csv')
diabetes = pd.read_csv(file_path)

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train adecision tree model
print('Training a decision tree model...')
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
plt.show()

# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'diabetes_model.pkl')
joblib.dump(value=model, filename=model_file)

# Register the model
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'diabetes_model',
               tags={'Training context':'Pipeline'},
               properties={'AUC': np.float(auc), 'Accuracy': np.float(acc)})


run.complete()

Overwriting diabetes_pipeline/train_diabetes.py


In [6]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "dp100-cluster"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Found existing cluster, use it.


In [7]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# Create a Python environment for the experiment
diabetes_env = Environment("diabetes-pipeline-env")
diabetes_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies
diabetes_env.docker.enabled = True # Use a docker container

# Create a set of package dependencies
diabetes_packages = CondaDependencies.create(conda_packages=['scikit-learn','ipykernel','matplotlib','pandas','pip'],
                                             pip_packages=['azureml-defaults','azureml-dataprep[pandas]','pyarrow'])

# Add the dependencies to the environment
diabetes_env.python.conda_dependencies = diabetes_packages

# Register the environment 
diabetes_env.register(workspace=ws)
registered_env = Environment.get(ws, 'diabetes-pipeline-env')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above. 
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


Run configuration created.


In [8]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")

# Create a PipelineData (temporary Data Reference) for the model folder
prepped_data_folder = PipelineData("prepped_data_folder", datastore=ws.get_default_datastore())

# Step 1, Run the data prep script
train_step = PythonScriptStep(name = "Prepare Data",
                                source_directory = experiment_folder,
                                script_name = "prep_diabetes.py",
                                arguments = ['--input-data', diabetes_ds.as_named_input('raw_data'),
                                             '--prepped-data', prepped_data_folder],
                                outputs=[prepped_data_folder],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

# Step 2, run the training script
register_step = PythonScriptStep(name = "Train and Register Model",
                                source_directory = experiment_folder,
                                script_name = "train_diabetes.py",
                                arguments = ['--training-folder', prepped_data_folder],
                                inputs=[prepped_data_folder],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


In [9]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps = [train_step, register_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'mslearn-diabetes-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Prepare Data [f22041f1][2fe8887e-bc17-4c0e-8f65-1ce87d40f7da], (This step will run and generate new outputs)
Created step Train and Register Model [7ff9f56b][c6f4af96-d1ae-420e-bd3a-f7d816bf6455], (This step will run and generate new outputs)
Submitted PipelineRun 42e1233e-013f-4407-9bcd-31b92b1edd62
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/42e1233e-013f-4407-9bcd-31b92b1edd62?wsid=/subscriptions/42987aff-66bf-4e3f-97b2-6489495e6f17/resourcegroups/mlrg/workspaces/diabetesml&tid=44ce2c82-ed0c-4def-9a56-03ce54252d6e
Pipeline submitted for execution.
PipelineRunId: 42e1233e-013f-4407-9bcd-31b92b1edd62
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/42e1233e-013f-4407-9bcd-31b92b1edd62?wsid=/subscriptions/42987aff-66bf-4e3f-97b2-6489495e6f17/resourcegroups/mlrg/workspaces/diabetesml&tid=44ce2c82-ed0c-4def-9a56-03ce54252d6e
PipelineRun Status: Running


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …



StepRunId: 3661b751-a252-4231-91da-54af04ed70e6
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3661b751-a252-4231-91da-54af04ed70e6?wsid=/subscriptions/42987aff-66bf-4e3f-97b2-6489495e6f17/resourcegroups/mlrg/workspaces/diabetesml&tid=44ce2c82-ed0c-4def-9a56-03ce54252d6e
StepRun( Prepare Data ) Status: NotStarted
StepRun( Prepare Data ) Status: Running

StepRun(Prepare Data) Execution Summary
StepRun( Prepare Data ) Status: Finished

This run might be using a new job runtime with improved performance and error reporting. The logs from your script are in user_logs/std_log.txt. Please let us know if you run into any issues, and if you would like to opt-out, please add the environment variable AZUREML_COMPUTE_USE_COMMON_RUNTIME to the environment variables section of the job and set its value to the string "false"




StepRunId: 0cefe1a2-fdaf-472f-9263-d676eab94b83
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/0cefe1a2-fdaf-472f-9263-d676eab94b83?wsi

'Finished'

In [10]:
for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name in metrics:
        print('\t',metric_name, ":", metrics[metric_name])

Train and Register Model :
	 Accuracy : 0.8886666666666667
	 AUC : 0.8734165878899447
	 ROC : aml://artifactId/ExperimentRun/dcid.0cefe1a2-fdaf-472f-9263-d676eab94b83/ROC_1641527484.png
Prepare Data :
	 raw_rows : 10000
	 processed_rows : 10000


In [11]:
from azureml.core import Model

for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')

diabetes_model version: 2
	 Training context : Pipeline
	 AUC : 0.8734165878899447
	 Accuracy : 0.8886666666666667


diabetes_model version: 1
	 Training context : Pipeline
	 AUC : 0.877354594755958
	 Accuracy : 0.8903333333333333




In [12]:
#publish pipeline from the run
published_pipeline=pipeline_run.publish_pipeline(
    name="diebetes-training-pipeline",description="Trains diabetes model",version=1.0)

published_pipeline

Name,Id,Status,Endpoint
diebetes-training-pipeline,534e29de-2c66-4e5d-b7e9-7a496922518f,Active,REST Endpoint


In [13]:
rest_endpoint=published_pipeline.endpoint
print(rest_endpoint)

https://eastus.api.azureml.ms/pipelines/v1.0/subscriptions/42987aff-66bf-4e3f-97b2-6489495e6f17/resourceGroups/mlrg/providers/Microsoft.MachineLearningServices/workspaces/diabetesml/PipelineRuns/PipelineSubmit/534e29de-2c66-4e5d-b7e9-7a496922518f


In [14]:
from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print("Authentication header ready")

Authentication header ready


In [16]:
import requests

experiment_name='mslearn-diabetes-pipeline'
rest_endpoint=published_pipeline.endpoint
response=requests.post(rest_endpoint,
                        headers=auth_header,
                        json={"ExperimentName": experiment_name})
run_id = response.json()["Id"]
run_id

'7e952af6-ca6d-4a67-9549-684e6d35ed99'

In [17]:
from azureml.pipeline.core import ScheduleRecurrence,Schedule

#submit the pipeline every Monday at 00:00 UTC
recurrence = ScheduleRecurrence(frequency="Week",interval=1, week_days=["Monday"], time_of_day="00:00")
weekly_schedule = Schedule.create(ws, name="weekly-diabetes-training", 
                                  description="Based on time",
                                  pipeline_id=published_pipeline.id,
                                  experiment_name='mslearn-diabetes-pipeline',
                                  recurrence=recurrence)
                
print('pipeline scheduled.')

                                     

pipeline scheduled.


In [18]:
#retrieve schedule
schedules=Schedule.list(ws)
schedules

[Pipeline(Name: weekly-diabetes-training,
 Id: f7ed9b95-652d-49ab-ac84-fddc7fc00008,
 Status: Active,
 Pipeline Id: 534e29de-2c66-4e5d-b7e9-7a496922518f,
 Pipeline Endpoint Id: None,
 Recurrence Details: Runs at 0:00 on Monday every Week),
 Pipeline(Name: weekly-diabetes-training,
 Id: bde704c6-044f-4b1d-bbd2-bc51684b245b,
 Status: Active,
 Pipeline Id: c6b2b973-ca64-49fe-9e71-5bc4dd9b4a1f,
 Pipeline Endpoint Id: None,
 Recurrence Details: Runs at 0:00 on Monday every Week)]

In [19]:
pipeline_experiment = ws.experiments.get('mslearn-diabetes-pipeline')
latest_run = list(pipeline_experiment.get_runs())[0]

latest_run.get_details()

{'runId': 'e96b61ba-fd18-4b6a-af4b-0ceeb898660c',
 'status': 'Completed',
 'startTimeUtc': '2022-01-07T03:57:57.154907Z',
 'endTimeUtc': '2022-01-07T03:57:59.605777Z',
 'services': {},
 'properties': {'azureml.runsource': 'azureml.PipelineRun',
  'runSource': 'Unavailable',
  'runType': 'Schedule',
  'azureml.parameters': '{}',
  'azureml.continue_on_step_failure': 'False',
  'azureml.pipelineComponent': 'pipelinerun',
  'azureml.pipelineid': '534e29de-2c66-4e5d-b7e9-7a496922518f'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'logs/azureml/executionlogs.txt': 'https://diabetesml9114746245.blob.core.windows.net/azureml/ExperimentRun/dcid.e96b61ba-fd18-4b6a-af4b-0ceeb898660c/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=OLv%2BPqHdJTv3xf%2FKd4EMVP7W%2Frix8JW5rex%2BS%2BVME4U%3D&skoid=f8bb5ece-a6cc-4f3b-9a61-1f29a8519959&sktid=44ce2c82-ed0c-4def-9a56-03ce54252d6e&skt=2022-01-07T00%3A40%3A50Z&ske=2022-01-08T08%3A50%3A50Z&sks=b&skv=2019-07-07&st=2022-01-07T03%3A48%3A02Z