In [1]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Workspace handle used by the later cells; DefaultAzureCredential supplies the identity.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="e1f27d73-68d8-4f59-900b-77783d4c5b3b",
    resource_group_name="End2EndCPC",
    workspace_name="End2End_CPC",
)


In [2]:
import os

# Directory that receives the scripts written by the %%writefile cells below.
script_folder = 'src'
os.makedirs(name=script_folder, exist_ok=True)  # no error when it already exists
print(script_folder, 'folder created')

src folder created


In [22]:
%%writefile src/prepare-data.py
# import libraries
import argparse
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import RobustScaler

def main(args):
    """Run the preparation steps in order: load, impute, scale, write CSV.

    Args:
        args: parsed command-line namespace providing ``input_data`` (the CSV
            to read) and ``output_data`` (the path the prepared CSV is
            written to).
    """
    df = get_data(args.input_data)

    imputed_data = impute(df)

    scaled_data = scale_data(imputed_data)

    # to_csv returns None when it is given a path, so the result is not bound.
    scaled_data.to_csv(Path(args.output_data), index=False)

def get_data(path):
    """Load the raw CSV, report its size and dtypes, and drop the ``keyword`` column.

    Args:
        path: location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        DataFrame with every column of the file except ``keyword``.
    """
    raw = pd.read_csv(path)

    n_rows, n_cols = raw.shape
    print(f'Preparing {n_cols} columns and {n_rows} rows of data')
    print(raw.dtypes)

    return raw.drop(columns="keyword")

def impute(df):
    """Fill missing values: median for float64/int64 columns, mode for the rest.

    Args:
        df: DataFrame to impute. Its columns are reassigned, so the caller's
            object is modified.

    Returns:
        The same DataFrame object, with missing values filled wherever a fill
        value could be computed.
    """
    for column in df.columns:
        if df[column].dtype in ['float64', 'int64']:
            fill_value = df[column].median()
        else:
            modes = df[column].mode()
            # mode() is empty when the column has no non-missing values, so
            # there is nothing to fill with; leave the column as it is.
            if modes.empty:
                continue
            fill_value = modes.iloc[0]

        # Assign the filled column back. Calling fillna(inplace=True) on
        # df[column] acts on a temporary under pandas copy-on-write and would
        # leave df unchanged.
        df[column] = df[column].fillna(fill_value)

    return df

def scale_data(df):
    """Apply ``RobustScaler`` to every float64/int64 column of ``df``.

    All columns of those dtypes are scaled; the scaled values are written back
    into ``df``, which is also returned.
    """
    numeric_columns = list(df.select_dtypes(['float64', 'int64']).columns)
    df[numeric_columns] = RobustScaler().fit_transform(df[numeric_columns])

    return df

def parse_args():
    """Parse ``--input_data`` and ``--output_data`` from the command line.

    Returns:
        argparse.Namespace with string attributes ``input_data`` and
        ``output_data``; each is ``None`` when its option is omitted.
    """
    cli = argparse.ArgumentParser()

    # both options are plain string paths with no default
    for option in ("input_data", "output_data"):
        cli.add_argument(f"--{option}", dest=option, type=str)

    return cli.parse_args()


if __name__ == "__main__":
    # frame the script's output with blank lines and a 60-star rule
    banner = "*" * 60

    print("\n\n")
    print(banner)

    args = parse_args()
    main(args)

    print(banner)
    print("\n\n")

Overwriting src/prepare-data.py


In [23]:
%%writefile src/train-model.py
# import libraries
import mlflow
import glob
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

def main(args):
    """Load the training data, split it, fit a model and print its test RMSE.

    MLflow autologging is switched on before anything else runs.

    Args:
        args: parsed command-line namespace; ``training_data``,
            ``target_feature`` and ``algorithm`` are read here.
    """
    mlflow.autolog()

    prepared = get_data(args.training_data)
    X_train, X_test, y_train, y_test = split_data(prepared, args.target_feature)
    fitted = train_model(args.algorithm, X_train, X_test, y_train, y_test)
    eval_model(fitted, X_test, y_test)

def get_data(path):
    """Read the training CSV at ``path``, print its dimensions, and return it."""
    frame = pd.read_csv(path)

    n_rows, n_cols = frame.shape
    print(f'Modeling with {n_cols} columns and {n_rows} rows of data')

    return frame

def split_data(df,target_feature):
    """Separate the target column from the features and make an 80/20 split.

    Args:
        df: full training DataFrame.
        target_feature: name of the column to predict.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``; the split is shuffled
        with a fixed ``random_state`` of 99.
    """
    print("Splitting data...")
    features = df.drop(columns=target_feature)
    target = np.ravel(df[target_feature])

    return tuple(
        train_test_split(features, target, test_size=0.20, shuffle=True, random_state=99)
    )

def train_model(algorithm,X_train, X_test, y_train, y_test, model_output=None):
    """Fit the regressor named by ``algorithm`` and save it as an MLflow model.

    Args:
        algorithm: "gradient-boosting" or "random-forest"; any other value
            selects plain linear regression.
        X_train: training features passed to ``fit``.
        X_test: accepted for call compatibility; not used here.
        y_train: training target passed to ``fit``.
        y_test: accepted for call compatibility; not used here.
        model_output: path the MLflow model is saved to. When omitted, the
            module-level ``args.model_output`` set by the ``__main__`` block
            is used, which is what existing callers rely on.

    Returns:
        The fitted estimator.
    """
    print("Training model...")
    # A single if/elif/else chain: with two separate `if` statements,
    # "gradient-boosting" fell into the second statement's else branch and was
    # silently replaced by LinearRegression.
    if algorithm == "gradient-boosting":
        model = GradientBoostingRegressor()
    elif algorithm == "random-forest":
        model = RandomForestRegressor()
    else:
        model = LinearRegression()

    model.fit(X_train, y_train)

    if model_output is None:
        # Callers that pass no path keep the original script-level lookup.
        model_output = args.model_output
    mlflow.sklearn.save_model(model, model_output)

    return model


def eval_model(model, X_test, y_test):
    """Print the root-mean-squared error of ``model`` on the held-out split.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: held-out features.
        y_test: held-out true target values.
    """
    y_pred = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred). MSE gives the same value
    # either way, but the documented order stays correct if the metric changes.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print('RMSE:', rmse)


def parse_args():
    """Parse the training script's command-line options.

    Returns:
        argparse.Namespace with string attributes ``training_data``,
        ``algorithm`` (default 'linear-regression'), ``target_feature``
        (default 'CPC') and ``model_output``. Options without a default are
        ``None`` when omitted.
    """
    cli = argparse.ArgumentParser()

    # (option name, default) pairs; every option is a plain string
    options = (
        ("training_data", None),
        ("algorithm", 'linear-regression'),
        ("target_feature", 'CPC'),
        ("model_output", None),
    )
    for option, fallback in options:
        cli.add_argument(f"--{option}", dest=option, type=str, default=fallback)

    return cli.parse_args()

if __name__ == "__main__":
    # frame the script's output with blank lines and a 60-star rule
    banner = "*" * 60

    print("\n\n")
    print(banner)

    # keep the name `args`: train_model reads this module-level namespace
    args = parse_args()
    main(args)

    print(banner)
    print("\n\n")


Overwriting src/train-model.py


In [24]:
%%writefile prepare-data.yml
# Command component that runs src/prepare-data.py on one input file and
# produces one output file.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: prep_data
display_name: Prepare training data
version: 1
type: command
# Single file in, single file out; both are handed to the script as paths.
inputs:
  input_data: 
    type: uri_file
outputs:
  output_data:
    type: uri_file
# The script is looked up in ./src, the folder the notebook writes it to.
code: ./src
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
# The folded scalar below is the command line; its flags match the options
# declared in prepare-data.py's parse_args.
command: >-
  python prepare-data.py 
  --input_data ${{inputs.input_data}}
  --output_data ${{outputs.output_data}}

Overwriting prepare-data.yml


In [25]:
%%writefile train-model.yml
# Command component that runs src/train-model.py on a prepared data file and
# emits the fitted model as an MLflow model.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: train_model
display_name: Train a linear, gradient boosting or random forest regression model
version: 1
type: command
# algorithm and target_feature carry the same defaults as the script's own
# argparse defaults.
inputs:
  training_data: 
    type: uri_file
  algorithm:
    type: string
    default: 'linear-regression'
  target_feature:
    type: string
    default: 'CPC'
outputs:
  model_output:
    type: mlflow_model
# The script is looked up in ./src, the folder the notebook writes it to.
code: ./src
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
# The folded scalar below is the command line; its flags match the options
# declared in train-model.py's parse_args.
command: >-
  python train-model.py 
  --training_data ${{inputs.training_data}} 
  --algorithm ${{inputs.algorithm}} 
  --target_feature ${{inputs.target_feature}} 
  --model_output ${{outputs.model_output}} 

Overwriting train-model.yml


In [26]:
from azure.ai.ml import load_component
# Build component objects from the two YAML specs written by the cells above.
parent_dir = ""

prep_data = load_component(source=f"{parent_dir}./prepare-data.yml")
train_regression = load_component(source=f"{parent_dir}./train-model.yml")

In [36]:
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.dsl import pipeline

# Two-step pipeline: prep_data cleans the input file, then train_regression fits a model on the result.
# NOTE(review): the printed job definition names its first step `clean_data`, the same as the local
# variable below, so step names appear to come from these variable names; rename with care.
@pipeline()
def kw_CPC_prediction(pipeline_job_input,algorithm='linear-regression',target_feature='CPC'):
    # Step 1: run the data-preparation component on the pipeline-level input.
    clean_data = prep_data(input_data=pipeline_job_input)
    # Step 2: train on step 1's output, forwarding the algorithm and target column choices.
    train_model = train_regression(training_data=clean_data.outputs.output_data,algorithm=algorithm,target_feature=target_feature)

    # Expose both the prepared data and the trained model as pipeline outputs.
    return {
        "pipeline_job_transformed_data": clean_data.outputs.output_data,
        "pipeline_job_trained_model": train_model.outputs.model_output,
    }

# Instantiate the pipeline on `azureml:kw-dataset:1` with the random-forest learner;
# target_feature keeps its default of 'CPC'.
pipeline_job = kw_CPC_prediction(
    pipeline_job_input=Input(type=AssetTypes.URI_FILE, path="azureml:kw-dataset:1"),
    algorithm='random-forest',
)

In [37]:
print(pipeline_job)

display_name: kw_CPC_prediction
type: pipeline
inputs:
  pipeline_job_input:
    type: uri_file
    path: azureml:kw-dataset:1
  algorithm: random-forest
  target_feature: CPC
outputs:
  pipeline_job_transformed_data:
    type: uri_file
  pipeline_job_trained_model:
    type: mlflow_model
jobs:
  clean_data:
    type: command
    inputs:
      input_data:
        path: ${{parent.inputs.pipeline_job_input}}
    outputs:
      output_data: ${{parent.outputs.pipeline_job_transformed_data}}
    component:
      $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
      name: prep_data
      version: '1'
      display_name: Prepare training data
      type: command
      inputs:
        input_data:
          type: uri_file
      outputs:
        output_data:
          type: uri_file
      command: python prepare-data.py  --input_data ${{inputs.input_data}} --output_data
        ${{outputs.output_data}}
      environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-

In [38]:
from azure.ai.ml.entities import AmlCompute

# Name of the compute cluster the pipeline runs on.
cpu_compute_target = "aml-cluster"

try:
    # Reuse the cluster when the workspace already has one with this name.
    cpu_cluster = ml_client.compute.get(cpu_compute_target)
    print(
        f"There is already a cluster named {cpu_compute_target}. Reusing it."
    )

except Exception:
    # NOTE(review): every failure of get() lands here, not only a missing
    # cluster; consider narrowing to the SDK's not-found error.
    print("Creating a new cpu compute target...")

    cpu_cluster = AmlCompute(
        name=cpu_compute_target,
        type="amlcompute",
        size="STANDARD_DS11_V2",
        min_instances=0,
        max_instances=2,
        idle_time_before_scale_down=60,
        tier="Dedicated",
    )

    # begin_create_or_update returns a poller. Waiting on it makes cpu_cluster
    # the provisioned compute in both branches, and ensures the cluster exists
    # before later cells use it.
    cpu_cluster = ml_client.compute.begin_create_or_update(cpu_cluster).result()


There is already a cluster named aml-cluster. Reusing it.


In [39]:
from azure.ai.ml.entities import AmlCompute

# Send an update for "aml-cluster" carrying a maximum of two instances.
# The call is the cell's last expression, so the returned poller is displayed.
cluster_scale = AmlCompute(name="aml-cluster", max_instances=2)
ml_client.begin_create_or_update(cluster_scale)

<azure.core.polling._poller.LROPoller at 0x7f92f2e98ca0>

In [40]:

# Set both pipeline outputs to upload mode.
for job_output in (
    pipeline_job.outputs.pipeline_job_transformed_data,
    pipeline_job.outputs.pipeline_job_trained_model,
):
    job_output.mode = "upload"

# Pipeline-wide defaults for the compute target and the datastore.
job_settings = pipeline_job.settings
job_settings.default_compute = "aml-cluster"
job_settings.default_datastore = "workspaceblobstore"

In [41]:
# Submit the pipeline to the workspace under the "pipeline_kw_CPC" experiment.
# The name is rebound to the job object that comes back, which is then displayed.
pipeline_job = ml_client.jobs.create_or_update(
    job=pipeline_job,
    experiment_name="pipeline_kw_CPC",
)
pipeline_job

Experiment,Name,Type,Status,Details Page
pipeline_kw_CPC,helpful_ship_7r957w60p7,pipeline,Preparing,Link to Azure Machine Learning studio
