## Connect to your workspace

In [2]:
import os
import azureml.core
from azureml.core import Workspace, Experiment, Datastore
from azureml.widgets import RunDetails

from azureml.core import Dataset

from azureml.pipeline.core import Pipeline, PipelineData, PipelineRun, StepRun, PortDataReference
from azureml.pipeline.steps import PythonScriptStep

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

from azureml.core.model import Model

import warnings
warnings.filterwarnings('ignore')

print('SDK Version:', azureml.core.VERSION)

SDK Version: 1.35.0


In [3]:
ws = Workspace.create(name='MLOP_14',
               subscription_id='6d705e60-c254-4179-9abe-b484e8ac71ef',
               resource_group='MLOP',
               create_resource_group=False,
               location='eastus'
               )

Deploying StorageAccount with name mlop14storagecbc4ef8e3a6.
Deploying KeyVault with name mlop14keyvault0628dbb87c.
Deploying AppInsights with name mlop14insightsa90a58820c.
Deployed AppInsights with name mlop14insightsa90a58820c. Took 1.86 seconds.
Deploying Workspace with name MLOP_14.
Deployed StorageAccount with name mlop14storagecbc4ef8e3a6. Took 22.19 seconds.
Deployed KeyVault with name mlop14keyvault0628dbb87c. Took 20.98 seconds.
Deployed Workspace with name MLOP_14. Took 64.46 seconds.


## Prepare data

In [4]:
# Get the default datastore
default_ds = ws.get_default_datastore()

# Enumerate all datastores, indicating which is the default
for ds_name in ws.datastores:
    print(ds_name, "- Default =", ds_name == default_ds.name)

workspacefilestore - Default = False
workspaceblobstore - Default = True
workspaceartifactstore - Default = False
workspaceworkingdirectory - Default = False


In [5]:
if 'telecom dataset' not in ws.datasets:
    default_ds.upload_files(files=['/Users/lingyizhao/Desktop/mlop/project/clean_data.csv'], # Upload the diabetes csv files in /data
                        target_path='telecom-data/', # Put it in a folder path in the datastore
                        overwrite=True, # Replace existing files of the same name
                        show_progress=True)

    #Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'telecom-data/clean_data.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws, 
                                name='telecom dataset',
                                description='telecom data',
                                tags = {'format':'CSV'},
                                create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

Uploading an estimated of 1 files
Uploading /Users/lingyizhao/Desktop/mlop/project/clean_data.csv
Uploaded /Users/lingyizhao/Desktop/mlop/project/clean_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Dataset registered.


In [6]:
# Display dataframe
tab_data_set.to_pandas_dataframe()

Unnamed: 0,state,account_length,area_code,phone_number,intl_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,...,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churned
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.90,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,HI,50,408,365-8751,no,yes,40,235.7,127,40.07,...,126,18.96,297.5,116,13.39,9.9,5,2.67,2,False.
4996,WV,152,415,334-9736,no,no,0,184.2,90,31.31,...,73,21.83,213.6,113,9.61,14.7,2,3.97,3,True.
4997,DC,61,415,333-6861,no,no,0,140.6,89,23.90,...,128,14.69,212.4,97,9.56,13.6,4,3.67,1,False.
4998,DC,109,510,394-2206,no,no,0,188.8,67,32.10,...,92,14.59,224.4,89,10.10,8.5,6,2.30,0,False.


In [7]:
#tab_data_set.to_pandas_dataframe().isnull().sum()

In [8]:
# df = tab_data_set.to_pandas_dataframe().dropna()
# # df1 = df[df.isna().any(axis=1)]
# # df1
# df.isnull().sum()

## Create scripts for pipeline steps

In [9]:
import os
# Create a folder for the pipeline step files
experiment_folder = 'telecom_pipeline'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

telecom_pipeline


### Data Preprocessing

In [10]:
%%writefile $experiment_folder/prep_telecom.py
# Import libraries
import os
import argparse
import pandas as pd
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
telecom = run.input_datasets['raw_data'].to_pandas_dataframe()

# Log raw row count
row_count = (len(telecom))
run.log('raw_rows', row_count)

# data cleaning
telecom = telecom.dropna()

telecom['voice_mail_plan'] = telecom['voice_mail_plan'].map(lambda x: x.strip())
telecom['intl_plan'] = telecom['intl_plan'].map(lambda x: x.strip())
telecom['churned'] = telecom['churned'].astype('str') 
telecom['churned'] = telecom['churned'].map(lambda x: x.strip())
telecom = telecom.replace(['True.', 'False.'], ['True','False']) 

# telecom = telecom[telecom.account_length != 99999]
# telecom = telecom[telecom.account_length != -1]
# telecom = telecom[telecom.area_code != 99999]
# telecom  = telecom[telecom.area_code != -20]
# telecom = telecom[telecom.area_code != 1]
# telecom = telecom[telecom.total_day_charge < 2000]
# telecom = telecom[telecom.churned != 'nan']
# telecom = telecom.replace(['T', 'F', 'TRUE', 'FALSE', 'True.', 'False.'], 
#                      ['True','False', 'True','False','True','False'])

# Log processed rows
row_count = (len(telecom))
run.log('processed_rows', row_count)

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
telecom.to_csv(save_path, index=False, header=True)

# End the run
run.complete()

Overwriting telecom_pipeline/prep_telecom.py


In [11]:
%%writefile $experiment_folder/train_telecom.py
# Import libraries
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-folder", type=str, dest='training_folder', help='training data folder')
args = parser.parse_args()
training_folder = args.training_folder

# Get the experiment run context
run = Run.get_context()

# load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_folder,'data.csv')
telecom = pd.read_csv(file_path)

# Separate features and labels
y = np.where(telecom['churned'] == 'True',1,0)
## Drop some useless columns
to_drop = ['state','area_code','phone_number','churned']
churn_feat_space = telecom.drop(to_drop, axis=1)
## converted yes and no
yes_no_cols = ["intl_plan","voice_mail_plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
X = churn_feat_space


# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Scale the data, using standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Train the random forest model
print('Training a random forest model...')
model = RandomForestClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
plt.show()


# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import classification_report
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score

# # calculate accuracy, precision and recall
# def cal_evaluation(classifier, cm):
#     tn = cm[0][0]
#     fp = cm[0][1]
#     fn = cm[1][0]
#     tp = cm[1][1]
#     accuracy  = (tp + tn) / (tp + fp + fn + tn + 0.0)
#     precision = tp / (tp + fp + 0.0)
#     recall = tp / (tp + fn + 0.0)
#     print (classifier)
#     print ("Accuracy is: %0.3f" % accuracy)
#     print ("precision is: %0.3f" % precision)
#     print ("recall is: %0.3f" % recall)

# # print out confusion matrices
# def draw_confusion_matrices(confusion_matricies):
#     class_names = ['Not','Churn']
#     for cm in confusion_matrices:
#         classifier, cm = cm[0], cm[1]
#         cal_evaluation(classifier, cm)
#         fig = plt.figure()
#         ax = fig.add_subplot(111)
#         cax = ax.matshow(cm, interpolation='nearest',cmap=plt.get_cmap('Reds'))
#         plt.title('Confusion matrix for %s' % classifier)
#         fig.colorbar(cax)
#         ax.set_xticklabels([''] + class_names)
#         ax.set_yticklabels([''] + class_names)
#         plt.xlabel('Predicted')
#         plt.ylabel('True')
#         run.log_image(name = "confusion_matrices", plot = fig)
#         plt.show()
        
# confusion_matrices = [("Random Forest", confusion_matrix(y_test,model.predict(X_test)))]

# draw_confusion_matrices(confusion_matrices)


# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'randomforest_model.pkl')
joblib.dump(value=model, filename=model_file)

# Register the model
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'randomforest_model',
               tags={'Training context':'Pipeline'},
               properties={'AUC': np.float(auc), 'Accuracy': np.float(acc)})


run.complete()

Overwriting telecom_pipeline/train_telecom.py


## Prepare a compute environment for the pipeline steps

In [12]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "mlop-test"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

InProgress....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [13]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# Create a Python environment for the experiment
telecom_env = Environment("telecom-pipeline-env")

# Create a set of package dependencies
telecom_packages = CondaDependencies.create(conda_packages=['scikit-learn','ipykernel','matplotlib','pandas','pip'],
                                             pip_packages=['azureml-defaults','azureml-dataprep[pandas]','pyarrow'])

# Add the dependencies to the environment
telecom_env.python.conda_dependencies = telecom_packages

# Register the environment 
telecom_env.register(workspace=ws)
registered_env = Environment.get(ws, 'telecom-pipeline-env')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above. 
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


## Create and run a pipeline

In [14]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset
telecom_ds = ws.datasets.get("telecom dataset")

# Create a PipelineData (temporary Data Reference) for the model folder
prepped_data_folder = PipelineData("prepped_data_folder", datastore=ws.get_default_datastore())

# Step 1, Run the data prep script
train_step = PythonScriptStep(name = "Prepare Data",
                                source_directory = experiment_folder,
                                script_name = "prep_telecom.py",
                                arguments = ['--input-data', telecom_ds.as_named_input('raw_data'),
                                             '--prepped-data', prepped_data_folder],
                                outputs=[prepped_data_folder],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

# Step 2, run the training script
register_step = PythonScriptStep(name = "Train and Register Model",
                                source_directory = experiment_folder,
                                script_name = "train_telecom.py",
                                arguments = ['--training-folder', prepped_data_folder],
                                inputs=[prepped_data_folder],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


In [15]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps = [train_step, register_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'telecom-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Prepare Data [93becb17][9f491d08-cebc-4ea7-9e3a-d0765cb3e897], (This step will run and generate new outputs)
Created step Train and Register Model [e21083f7][2204e40e-5743-4d95-958d-669f6323d211], (This step will run and generate new outputs)
Submitted PipelineRun 8530c936-18ef-4aef-8dd9-298666786f4d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8530c936-18ef-4aef-8dd9-298666786f4d?wsid=/subscriptions/6d705e60-c254-4179-9abe-b484e8ac71ef/resourcegroups/MLOP/workspaces/MLOP_14&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 8530c936-18ef-4aef-8dd9-298666786f4d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8530c936-18ef-4aef-8dd9-298666786f4d?wsid=/subscriptions/6d705e60-c254-4179-9abe-b484e8ac71ef/resourcegroups/MLOP/workspaces/MLOP_14&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6
PipelineRun Status: Running


StepRunId: afbedb46-4924-4cbe-abec-b867b68805e4
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/afbedb46-4924-4cbe-abec-b867b68805e4?wsid=/subscriptions/6d705e60-c254-4179-9abe-b484e8ac71ef/resourcegroups/MLOP/workspaces/MLOP_14&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6
StepRun( Prepare Data ) Status: Running

Streaming azureml-logs/20_image_build_log.txt
2021/11/02 22:54:38 Downloading source code...
2021/11/02 22:54:39 Finished downloading source code
2021/11/02 22:54:40 Creating Docker network: acb_default_network, driver: 'bridge'
2021/11/02 22:54:40 Successfully set up Docker network: acb_default_network
2021/11/02 22:54:40 Setting up Docker configur

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "User program failed with IndexError: index 1 is out of bounds for axis 1 with size 1",
        "messageParameters": {},
        "detailsUri": "https://aka.ms/azureml-run-troubleshooting",
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"User program failed with IndexError: index 1 is out of bounds for axis 1 with size 1\",\n        \"messageParameters\": {},\n        \"detailsUri\": \"https://aka.ms/azureml-run-troubleshooting\",\n        \"details\": []\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}