In [1]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Handle to the Azure ML workspace used by every later cell. Each argument is
# passed by keyword so its role is visible at the call site.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="e1f27d73-68d8-4f59-900b-77783d4c5b3b",
    resource_group_name="End2endCPC",
    workspace_name="End2End_CPC",
)


In [2]:
import os

# Folder that receives the component scripts written by the %%writefile cells.
script_folder = 'src'
os.makedirs(script_folder, exist_ok=True)  # no error when it already exists
print(f'{script_folder} folder created')

In [3]:
%%writefile src/prepare-data.py
# import libraries
import argparse
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import RobustScaler

def main(args):
    """Run the preparation steps in order and write the result as CSV.

    The file at ``args.input_data`` is loaded, cleaned, imputed and scaled;
    the resulting frame is written to ``args.output_data`` without its index.
    """
    raw = get_data(args.input_data)
    prepared = scale_data(impute(clean_data(raw)))

    destination = Path(args.output_data)
    prepared.to_csv(destination, index = False)

def get_data(path):
    """Load the CSV at ``path`` and return it without the ``keyword`` column.

    The column/row counts and the dtypes of the loaded frame are printed
    before the column is dropped.
    """
    raw = pd.read_csv(path)
    n_rows, n_cols = raw.shape

    print(f'Preparing {n_cols} columns and {n_rows} rows of data')
    print(raw.dtypes)

    return raw.drop(columns="keyword")

def clean_data(df):
    """Drop rows whose bid range is degenerate or inconsistent with the CPC.

    A row is removed when ``lower_bid`` equals ``upper_bid`` or is zero, and
    then when ``lower_bid`` exceeds ``upper_bid``, ``lower_bid`` exceeds
    ``CPC``, or ``upper_bid`` is below ``CPC``.
    """
    degenerate = df['lower_bid'].eq(df['upper_bid']) | df['lower_bid'].eq(0)
    kept = df.loc[~degenerate]

    inconsistent = (
        kept['lower_bid'].gt(kept['upper_bid'])
        | kept['lower_bid'].gt(kept['CPC'])
        | kept['upper_bid'].lt(kept['CPC'])
    )
    return kept.loc[~inconsistent]

def impute(df):
    """Fill missing values column by column and return the frame.

    Columns whose dtype is ``float64`` or ``int64`` are filled with their
    median; every other column is filled with its most frequent value. A
    non-numeric column with no observed value has no mode and is left as is.
    The frame passed in is updated and returned.
    """
    for column in df.columns:
        if df[column].dtype in ['float64', 'int64']:
            fill_value = df[column].median()
        else:
            modes = df[column].mode()
            if modes.empty:
                # Every value is missing, so there is nothing to impute from.
                continue
            fill_value = modes.iloc[0]

        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the df[column] selection: that form is
        # chained assignment, which pandas warns about and which no longer
        # updates df when Copy-on-Write is enabled.
        df[column] = df[column].fillna(fill_value)

    return df

def scale_data(df):
    """Scale every ``float64``/``int64`` column of ``df`` with a RobustScaler.

    The scaled values are written back into ``df``, which is also returned.
    """
    # NOTE(review): every numeric column is scaled, which presumably includes
    # the CPC target used for training - confirm that is intended.
    numeric_columns = list(df.select_dtypes(['float64', 'int64']).columns)
    df[numeric_columns] = RobustScaler().fit_transform(df[numeric_columns])
    return df

def parse_args():
    """Parse the ``--input_data`` and ``--output_data`` command-line options.

    Both options take a string; an option that is not supplied is ``None`` on
    the returned namespace.
    """
    parser = argparse.ArgumentParser()

    # Each option is stored under an attribute with the same name.
    for option in ("input_data", "output_data"):
        parser.add_argument(f"--{option}", dest=option, type=str)

    return parser.parse_args()


if __name__ == "__main__":
    # Banner line printed before and after the run.
    separator = "*" * 60

    print("\n\n")
    print(separator)

    args = parse_args()
    main(args)

    print(separator)
    print("\n\n")

In [4]:
%%writefile src/train-model.py
# import libraries
import mlflow
import glob
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

def main(args):
    """Train and evaluate a regression model as described by ``args``.

    Loads ``args.training_data``, splits it on ``args.target_feature``, fits
    the model selected by ``args.algorithm`` and evaluates it on the test
    split.
    """
    # Switch on MLflow autologging before any training happens.
    mlflow.autolog()

    data = get_data(args.training_data)
    splits = split_data(data, args.target_feature)
    X_train, X_test, y_train, y_test = splits

    fitted = train_model(args.algorithm, X_train, X_test, y_train, y_test)
    eval_model(fitted, X_test, y_test)

def get_data(path):
    """Load the training CSV at ``path``, print its size and return it."""
    frame = pd.read_csv(path)
    n_rows, n_cols = frame.shape

    print(f'Modeling with {n_cols} columns and {n_rows} rows of data')

    return frame

def split_data(df,target_feature):
    """Split ``df`` into train and test features and targets.

    ``target_feature`` names the column used as the (flattened) target; all
    other columns are features. 20% of the rows go to the test split, shuffled
    with a fixed random state of 99.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    print("Splitting data...")
    features = df.drop(columns=target_feature)
    target = np.ravel(df[target_feature])

    parts = train_test_split(
        features, target, test_size=0.20, shuffle=True, random_state=99
    )
    return tuple(parts)

def train_model(algorithm, X_train, X_test, y_train, y_test, model_output=None):
    """Fit the requested regressor and save it in MLflow format.

    Parameters
    ----------
    algorithm : str
        ``"gradient-boosting"`` or ``"random-forest"``; any other value
        selects linear regression.
    X_train, y_train
        Training features and target passed to ``fit``.
    X_test, y_test
        Not used here; kept so existing callers keep working.
    model_output : str, optional
        Location the model is saved to. When omitted, the module-level
        ``args.model_output`` set by the ``__main__`` guard is used.

    Returns
    -------
    The fitted estimator.
    """
    print("Training model...")
    # One if/elif/else chain: with two separate ``if`` statements the trailing
    # ``else`` replaced a gradient-boosting model with a linear regression.
    if algorithm == "gradient-boosting":
        model = GradientBoostingRegressor()
    elif algorithm == "random-forest":
        model = RandomForestRegressor()
    else:
        model = LinearRegression()

    model.fit(X_train, y_train)

    if model_output is None:
        # Fall back to the parsed command-line arguments of the script.
        model_output = args.model_output
    mlflow.sklearn.save_model(model, model_output)

    return model


def eval_model(model, X_test, y_test):
    """Score ``model`` on the test split and record the mean absolute error.

    The MAE between ``y_test`` and the model's predictions for ``X_test`` is
    logged to MLflow under the key ``MAE`` and printed.
    """
    y_pred = model.predict(X_test)
    # scikit-learn's signature is (y_true, y_pred).
    mae = mean_absolute_error(y_test, y_pred)
    # MAE is a result of the run, so it is logged as a metric, not a parameter.
    mlflow.log_metric('MAE', mae)
    print('MAE:', mae)


def parse_args():
    """Parse the command-line options of the training script.

    ``--training_data`` and ``--model_output`` have no default (``None`` when
    omitted); ``--algorithm`` defaults to ``linear-regression`` and
    ``--target_feature`` to ``CPC``. All values are strings.
    """
    parser = argparse.ArgumentParser()

    # (option name, extra keyword arguments), in registration order.
    options = [
        ("training_data", {}),
        ("algorithm", {"default": 'linear-regression'}),
        ("target_feature", {"default": 'CPC'}),
        ("model_output", {}),
    ]
    for name, extra in options:
        parser.add_argument(f"--{name}", dest=name, type=str, **extra)

    return parser.parse_args()

if __name__ == "__main__":
    # Banner line printed before and after the run.
    separator = "*" * 60

    print("\n\n")
    print(separator)

    # ``args`` stays at module level: train_model reads ``args.model_output``.
    args = parse_args()
    main(args)

    print(separator)
    print("\n\n")


In [5]:
%%writefile prepare-data.yml
# Command component that runs prepare-data.py on one input file and writes one
# output file.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: prep_data
display_name: Prepare training data
version: 1
type: command
inputs:
  # File handed to the script as --input_data.
  input_data: 
    type: uri_file
outputs:
  # Location handed to the script as --output_data.
  output_data:
    type: uri_file
# Folder holding prepare-data.py (written by an earlier notebook cell).
code: ./src
environment: azureml://registries/azureml/environments/sklearn-1.1/versions/34
# Folded scalar: the lines below are joined into one command line.
command: >-
  python prepare-data.py 
  --input_data ${{inputs.input_data}}
  --output_data ${{outputs.output_data}}

In [6]:
%%writefile train-model.yml
# Command component that runs train-model.py on one training file and produces
# an MLflow model output.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: train_model
display_name: Train a linear, gradient boosting or random forest regression model
version: 1
type: command
inputs:
  # File handed to the script as --training_data.
  training_data: 
    type: uri_file
  # Handed to the script as --algorithm; same default as the script's own.
  algorithm:
    type: string
    default: 'linear-regression'
  # Handed to the script as --target_feature; same default as the script's own.
  target_feature:
    type: string
    default: 'CPC'
outputs:
  # Location handed to the script as --model_output.
  model_output:
    type: mlflow_model
# Folder holding train-model.py (written by an earlier notebook cell).
code: ./src
environment: azureml://registries/azureml/environments/sklearn-1.1/versions/34
# Folded scalar: the lines below are joined into one command line.
command: >-
  python train-model.py 
  --training_data ${{inputs.training_data}} 
  --algorithm ${{inputs.algorithm}} 
  --target_feature ${{inputs.target_feature}} 
  --model_output ${{outputs.model_output}} 

In [7]:
from azure.ai.ml import load_component
# Prefix for the component YAML paths; empty means the working directory.
parent_dir = ""

# Load the two component definitions written by the %%writefile cells.
prep_data = load_component(source=f"{parent_dir}./prepare-data.yml")
train_regression = load_component(source=f"{parent_dir}./train-model.yml")

In [8]:
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.dsl import pipeline

# Pipeline definition: prepare the input data, then train a regression model
# on the prepared output.
# NOTE(review): the DSL may derive step names from the local variable names
# below and metadata from a docstring - confirm before renaming or adding one.
@pipeline()
def kw_CPC_prediction(pipeline_job_input,algorithm='linear-regression',target_feature='CPC'):
    # Step 1: run the prep_data component on the pipeline input.
    clean_data = prep_data(input_data=pipeline_job_input)
    # Step 2: train on the prepared data with the requested algorithm and target.
    train_model = train_regression(training_data=clean_data.outputs.output_data,algorithm=algorithm,target_feature=target_feature)

    # Expose the outputs of both steps as outputs of the pipeline itself.
    return {
        "pipeline_job_transformed_data": clean_data.outputs.output_data,
        "pipeline_job_trained_model": train_model.outputs.model_output,
    }

# Build the pipeline job on the "azureml:kw-dataset:1" data asset, selecting
# the random-forest algorithm (target_feature keeps its default).
pipeline_job = kw_CPC_prediction(
    pipeline_job_input=Input(type=AssetTypes.URI_FILE, path="azureml:kw-dataset:1"),
    algorithm='random-forest',
)

In [9]:
# Show the text form of the pipeline job built above, before it is submitted.
print(pipeline_job)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


In [None]:
from azure.ai.ml.entities import AmlCompute

# Name of the compute cluster the pipeline runs on.
cpu_compute_target = "aml-cluster2"

try:
    # Reuse the cluster when one with this name can be fetched.
    cpu_cluster = ml_client.compute.get(cpu_compute_target)
    print(
        f"There is already a cluster named {cpu_compute_target}. Reusing it."
    )

except Exception:
    # NOTE(review): any failure of the lookup (not only "not found") ends up
    # here and triggers a create - consider narrowing the exception type.
    print("Creating a new cpu compute target...")

    cpu_cluster = AmlCompute(
        name=cpu_compute_target,
        type="amlcompute",
        size="Standard_D11_v2",
        min_instances=0,
        max_instances=2,
        idle_time_before_scale_down=60,
        tier="Dedicated",
    )

    # begin_create_or_update returns a poller; wait for it so that cpu_cluster
    # holds the provisioned compute in both branches, as in the reuse branch.
    cpu_cluster = ml_client.compute.begin_create_or_update(cpu_cluster).result()


In [None]:
from azure.ai.ml.entities import AmlCompute

# Compute definition carrying only the cluster name and the maximum number of
# instances.
cluster_scale = AmlCompute(
    name="aml-cluster2",
    max_instances=2,
)
# Submit the definition; the returned value becomes the cell output and is not
# waited on here.
ml_client.begin_create_or_update(cluster_scale)

In [None]:

# Both pipeline outputs use the "upload" output mode.
pipeline_job.outputs.pipeline_job_transformed_data.mode = "upload"
pipeline_job.outputs.pipeline_job_trained_model.mode = "upload"
# Default compute and datastore for the pipeline job.
pipeline_job.settings.default_compute = "aml-cluster2"
pipeline_job.settings.default_datastore = "workspaceblobstore"

In [None]:
# Submit the pipeline under the given experiment name. pipeline_job is rebound
# to the value returned by the service call, which later cells read
# (pipeline_job.name).
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name="pipeline_kw_CPC_prediction"
)
# Display the returned job as the cell output.
pipeline_job

In [None]:
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes

# Jobs whose parent is the submitted pipeline job.
child_jobs = list(ml_client.jobs.list(parent_job_name=pipeline_job.name))
# NOTE(review): index 1 is presumably the training step; this relies on the
# order in which the child jobs are listed - confirm.
child_job_name = child_jobs[1].name

# Model entity pointing at the "model" artifact folder of that child job.
run_model = Model(
    name="mlflow-kw-CPC",
    description="Model created from run.",
    type=AssetTypes.MLFLOW_MODEL,
    path=f"azureml://jobs/{child_job_name}/outputs/artifacts/paths/model/",
)

ml_client.models.create_or_update(run_model)

In [None]:
from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment
from azure.ai.ml.constants import AssetTypes
import datetime

# Name used for the online endpoint here and when it is invoked later.
online_endpoint_name = "endpoint-CPC-prediction"

# Endpoint definition only; this cell makes no call through ml_client.
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="Online endpoint for MLflow keyword CPC prediction.",
    auth_mode="key",
)


In [None]:


# Model entity for the deployment. It points at the same job artifact path as
# run_model and relies on Model and child_job_name from the registration cell.
production_model = Model(
    path=f"azureml://jobs/{child_job_name}/outputs/artifacts/paths/model/",
    name="mlflow-kw-CPC-blue-deployment",
    description="Model created from run.",
    type=AssetTypes.MLFLOW_MODEL,
)

# Deployment definition for the endpoint named above: one instance of the
# given instance type serving production_model. Nothing is created yet.
blue_deployment = ManagedOnlineDeployment(
    name="blue-kw-cpc",
    endpoint_name=online_endpoint_name,
    description="Blue online deployment for keyword CPC prediction",
    model=production_model,
    instance_type="Standard_F4s_v2",
    instance_count=1,
)

In [None]:
# Create or update the endpoint and wait for the operation's result.
ml_client.begin_create_or_update(endpoint).result()

In [None]:
# Create or update the blue deployment and wait for the operation's result.
ml_client.online_deployments.begin_create_or_update(blue_deployment).result()

In [None]:
# Assign a traffic value of 100 (presumably percent - confirm) to the
# "blue-kw-cpc" deployment, then push the updated endpoint and wait for it.
endpoint.traffic = {"blue-kw-cpc": 100}
ml_client.begin_create_or_update(endpoint).result()

In [None]:
import json 
import os

# Request payload with a single row for the columns competition, lower_bid and
# upper_bid.
sample_deployment_data = {
  "input_data": {
    "columns": [
      "competition",
      "lower_bid",
      "upper_bid"
    ],
    "index": [1],
    "data": [
      [
      30,2.1,5
    ]
    ]
  }
}

sample_data_path = "data/sample_deployment_data.json"

# open(..., 'w') does not create missing parent folders, so make sure the
# target folder exists before writing the request file.
os.makedirs(os.path.dirname(sample_data_path), exist_ok=True)

with open(sample_data_path, 'w') as json_file:
    json.dump(sample_deployment_data, json_file)

# Send the request file to the blue deployment and show the response.
response = ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    deployment_name="blue-kw-cpc",
    request_file=sample_data_path,
)
response

In [None]:
# Fetch the endpoint by name; this rebinds `endpoint` from the local
# definition to the object returned by the service.
endpoint = ml_client.online_endpoints.get(name=online_endpoint_name)

# Traffic assignment reported for the endpoint.
print(endpoint.traffic)

# URI reported for scoring requests.
print(endpoint.scoring_uri)