## 1. Connect to Workspace

In [2]:
## 1. get the workspace
from azureml.core import Workspace
ws = Workspace.get(
           name="MLOps",
           subscription_id='a9ec4d60-b59f-47b4-99d3-e2a767f0544d',
           resource_group='MLOps-Group4')

## 2. Register Dataset

In [4]:
## 2. load dataset
from azureml.core import Dataset
subscription_id = 'a9ec4d60-b59f-47b4-99d3-e2a767f0544d'
resource_group = 'MLOps-Group4'
workspace_name = 'MLOps'

workspace2 = Workspace(subscription_id, resource_group, workspace_name)
dataset = Dataset.get_by_name(workspace2, name='mobile')

## 3. Register input files saved on blob store as an Azure ML Dataset
dataset.register(workspace=ws, name="my_raw_dataset")

{
  "source": [
    "('workspaceblobstore', 'UI/10-25-2021_104638_UTC/mobile.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "5ae3d39a-9bfe-4a95-bb89-75f95e4cfcfb",
    "name": "my_raw_dataset",
    "version": 1,
    "workspace": "Workspace.create(name='MLOps', subscription_id='a9ec4d60-b59f-47b4-99d3-e2a767f0544d', resource_group='MLOps-Group4')"
  }
}

## 3. Create Scripts for Pipeline Steps

In [5]:
## Create a folder for the pipeline step files
import os
experiment_folder = 'mobile_pipeline'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

mobile_pipeline


### 3.1 Pipelein 1 - Clean Dataset

In [6]:
%%writefile $experiment_folder/prep_mobile.py
# Import libraries
import os
import argparse
import pandas as pd
from azureml.core import Run

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
df = run.input_datasets['raw_data'].to_pandas_dataframe()

# Log raw row count
row_count = (len(df))
run.log('raw_rows', row_count)

# remove nulls
df = df.dropna()

# Log processed rows
row_count = (len(df))
run.log('processed_rows', row_count)

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
df.to_csv(save_path, index=False, header=True)

# End the run
run.complete()



Overwriting mobile_pipeline/prep_mobile.py


### 3.2 Pipeline 2 - Train Model

In [57]:
%%writefile $experiment_folder/train_mobile.py
# Import libraries
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix, classification_report
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
import matplotlib.pyplot as plt

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-folder", type=str, dest='training_folder', help='training data folder')
args = parser.parse_args()
training_folder = args.training_folder

# Get the experiment run context
run = Run.get_context()

# load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_folder,'data.csv')
df_mobile = pd.read_csv(file_path)

# Separate features and labels
X = df_mobile.drop("price_range",axis=1)
y = df_mobile.price_range

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train random forest tree model
print('Training a random forest model...')
model = RandomForestClassifier().fit(X_train, y_train)

# 1) calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# 2) calculate F1
f1_scores = f1_score(y_test, y_hat, average=None)
print('F1: ' + str(f1_scores))
run.log('F1_avg', np.float(f1_scores.mean()))

# 3) plot the classification report
report = classification_report(y_test, y_hat)
print(report)

precision,recall,fscore,support=score(y_test,y_hat)
print ('Precision : {}'.format(precision))
print ('Recall    : {}'.format(recall))
print ('F-score   : {}'.format(fscore))
print ('Support   : {}'.format(support))

run.log('Precision_avg', np.float(precision.mean()))
run.log('Recall_avg', np.float(recall.mean()))

run.log_list('F1', fscore) ## this will give us a line chart
run.log_list('Precision', precision)
run.log_list('Recall', recall)
run.log_list('Support', support)

# 4) plot the matrix
conf_matrix = confusion_matrix(y_test, y_hat)

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i,s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
 
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
run.log_image(name = "Confusion_Matrix", plot = fig)
plt.show() 

# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'mobile_model.pkl')
joblib.dump(value=model, filename=model_file)

# Register the model
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'mobile_rf_model',
               tags={'Training context':'Pipeline'},
               properties={'f1-scores-avg': np.float(f1_scores.mean()), 'Accuracy': np.float(acc)})


run.complete()

Overwriting mobile_pipeline/train_mobile.py


## 4. Compute Environment

In [41]:
## get the cluster
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "caixinya1"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Found existing cluster, use it.


In [42]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# Create a Python environment for the experiment
mobile_env = Environment("mobile-pipeline-env")
mobile_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies
mobile_env.docker.enabled = True # Use a docker container

# Create a set of package dependencies
diabetes_packages = CondaDependencies.create(conda_packages=['scikit-learn','ipykernel','matplotlib','pandas','pip'],
                                             pip_packages=['azureml-defaults','azureml-dataprep[pandas]','pyarrow'])

# Add the dependencies to the environment
mobile_env.python.conda_dependencies = diabetes_packages

# Register the environment 
mobile_env.register(workspace=ws)
registered_env = Environment.get(ws, 'mobile-pipeline-env')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above. 
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


Run configuration created.


## 5. Create & Run Pipeline

In [43]:

from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset
mobile_ds = ws.datasets.get("mobile")

# Create a PipelineData (temporary Data Reference) for the model folder
prepped_data_folder = PipelineData("prepped_data_folder", datastore=ws.get_default_datastore())

# Step 1, Run the data prep script
train_step = PythonScriptStep(name = "Prepare Data",
                                source_directory = experiment_folder,
                                script_name = "prep_mobile.py",
                                arguments = ['--input-data', mobile_ds.as_named_input('raw_data'),
                                             '--prepped-data', prepped_data_folder],
                                outputs=[prepped_data_folder],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

# Step 2, run the training script
register_step = PythonScriptStep(name = "Train and Register Model",
                                source_directory = experiment_folder,
                                script_name = "train_mobile.py",
                                arguments = ['--training-folder', prepped_data_folder],
                                inputs=[prepped_data_folder],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


In [58]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps = [train_step, register_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'mslearn-mobile-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Prepare Data [49f30026][0b4be8f6-eedf-489b-a989-3b8009795a70], (This step will run and generate new outputs)
Created step Train and Register Model [ddf12f93][b66e186f-99ca-4374-a39b-5b949f009ffb], (This step will run and generate new outputs)
Submitted PipelineRun dd989d4f-6b0d-484f-9e59-74118828082b
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/dd989d4f-6b0d-484f-9e59-74118828082b?wsid=/subscriptions/a9ec4d60-b59f-47b4-99d3-e2a767f0544d/resourcegroups/MLOps-Group4/workspaces/MLOps&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6
Pipeline submitted for execution.
PipelineRunId: dd989d4f-6b0d-484f-9e59-74118828082b
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/dd989d4f-6b0d-484f-9e59-74118828082b?wsid=/subscriptions/a9ec4d60-b59f-47b4-99d3-e2a767f0544d/resourcegroups/MLOps-Group4/workspaces/MLOps&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6
PipelineRun Status: NotStarted


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRun Status: Running


StepRunId: cd03347a-5cb6-4951-a9bb-868fe7401bdf
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/cd03347a-5cb6-4951-a9bb-868fe7401bdf?wsid=/subscriptions/a9ec4d60-b59f-47b4-99d3-e2a767f0544d/resourcegroups/MLOps-Group4/workspaces/MLOps&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6
StepRun( Prepare Data ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_5730cae126fde30212b486acb90db12b704f6078c588f655a2c80501e85b3405_d.txt
2021-10-31T21:37:36Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/mlops/azureml/cd03347a-5cb6-4951-a9bb-868fe7401bdf/mounts/workspaceblobstore
2021-10-31T21:37:36Z The vmsize standard_ds3_v2 is not a GPU VM, skipping get GPU count by running nvidia-smi command.
2021-10-31T21:37:36Z Starting output-watcher...
2021-10-31T21:37:36Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
2021-10-31T21:37:36Z Executing 'Copy ACR Details file' on 10.0.0.5
2021-10-31T21

'Finished'

In [22]:
for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name in metrics:
        print('\t',metric_name, ":", metrics[metric_name])

Train and Register Model :
	 Accuracy : 0.8683333333333333
	 F1 : 0.8636145547111012
Prepare Data :
	 raw_rows : 3000
	 processed_rows : 2000
