In [None]:
!pip install azure-ai-ml
!pip install mltable

In [1]:
import pandas as pd
import numpy as np
import warnings
from math import sqrt
warnings.filterwarnings('ignore')
import pickle
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import mlflow
import mltable

In [2]:
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# Coordinates of the Azure ML workspace this notebook works against.
subscription_id = "4a571c1c-a483-4a43-9930-490479d70db0"
resource_group = "Learn_MLOps"
workspace = "MLOs_WS"

# Workspace handle reused by every later cell that talks to Azure ML.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

In [3]:
# Look up version 1 of the registered, pre-processed weather data asset.
registered_data_asset = ml_client.data.get(
    name="processed_weather_data_port_of_Turku",
    version='1',
)

In [4]:
# Point MLflow at the tracking URI exposed by the Azure ML workspace.
aml_workspace = ml_client.workspaces.get(ml_client.workspace_name)
uri = aml_workspace.mlflow_tracking_uri
mlflow.set_tracking_uri(uri)

In [19]:
# Build the URI of the processed CSV from the registered data asset.
# The file name is appended directly to the asset path, which assumes that
# path already ends with a separator -- TODO confirm.
processed_csv_uri = f'{registered_data_asset.path}weather_dataset_processed.csv'
path = {'file': processed_csv_uri}

# Describe the CSV as an MLTable ...
tbl = mltable.from_delimited_files(paths=[path])

# ... and materialize it as a pandas DataFrame.
df = tbl.to_pandas_dataframe()

Unnamed: 0,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Current_weather_condition,Future_weather_condition
0,2006-04-01 04:00:00+02:00,"Port of Turku, Finland",8.755555556,0.83,11.0446,259,15.8263,1016.51,1,1
1,2006-04-01 05:00:00+02:00,"Port of Turku, Finland",9.222222222,0.85,13.9587,258,14.9569,1016.66,1,1
2,2006-04-01 06:00:00+02:00,"Port of Turku, Finland",7.733333333,0.95,12.3648,259,9.982,1016.72,1,1
3,2006-04-01 07:00:00+02:00,"Port of Turku, Finland",8.772222222,0.89,14.1519,260,9.982,1016.84,1,1
4,2006-04-01 08:00:00+02:00,"Port of Turku, Finland",10.82222222,0.82,11.3183,259,9.982,1017.37,1,1


In [20]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Current_weather_condition,Future_weather_condition
0,2006-04-01 04:00:00+02:00,"Port of Turku, Finland",8.755555556,0.83,11.0446,259,15.8263,1016.51,1,1
1,2006-04-01 05:00:00+02:00,"Port of Turku, Finland",9.222222222,0.85,13.9587,258,14.9569,1016.66,1,1
2,2006-04-01 06:00:00+02:00,"Port of Turku, Finland",7.733333333,0.95,12.3648,259,9.982,1016.72,1,1
3,2006-04-01 07:00:00+02:00,"Port of Turku, Finland",8.772222222,0.89,14.1519,260,9.982,1016.84,1,1
4,2006-04-01 08:00:00+02:00,"Port of Turku, Finland",10.82222222,0.82,11.3183,259,9.982,1017.37,1,1


# Splitting pre-processed data into training and validation datasets

In [73]:
# Validation set is used later to evaluate model performance post training. 

In [21]:
# The first 77,160 rows form the training split; the rest is kept for validation.
df_training = df.head(77160)

In [22]:
# Check the (rows, columns) size of the training split.
df_training.shape

(77160, 10)

In [23]:
# Validation split = every row whose index label is not in the training split.
df_validation = df.drop(index=df_training.index)

In [24]:
# Check the (rows, columns) size of the validation split.
df_validation.shape

(19289, 10)

# Registering Training and Validation data to the datastore on the workspace. 

In [25]:
import os

# Create the staging folder for the split CSV files.
# exist_ok=True keeps the cell re-runnable (a plain `mkdir` errors when the
# folder already exists) and avoids depending on a shell command.
os.makedirs('Data', exist_ok=True)

In [26]:
# Write the training split to disk without the DataFrame index column.
df_training.to_csv(path_or_buf='Data/training_data.csv', index=False)

In [27]:
# Write the validation split to disk without the DataFrame index column.
df_validation.to_csv(path_or_buf='Data/validation_data.csv', index=False)

In [None]:
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data

# Folder to upload; it holds the training and validation CSVs written above.
# Supported path forms:
#   local:     './<path>'
#   blob:      'https://<account_name>.blob.core.windows.net/<container_name>/<path>'
#   ADLS gen2: 'abfss://<file_system>@<account_name>.dfs.core.windows.net/<path>/'
#   Datastore: 'azureml://datastores/<data_store_name>/paths/<path>'
my_path = './Data/'

# Version 2 of the weather data asset: the train/validation split.
# NOTE(review): the asset is declared as MLTABLE, but none of the visible cells
# writes an MLTable definition file into ./Data/ -- confirm whether a folder
# asset type was intended.
my_data = Data(
    name="processed_weather_data_port_of_Turku",
    version='2',
    description='processed weather data',
    type=AssetTypes.MLTABLE,
    path=my_path,
)

ml_client.data.create_or_update(my_data)

# Data ingestion step - Training dataset

In [5]:
# Datastore URI of the uploaded training split.
training_csv_uri = 'azureml://subscriptions/4a571c1c-a483-4a43-9930-490479d70db0/resourcegroups/Learn_MLops/workspaces/MLOs_WS/datastores/workspaceblobstore/paths/LocalUpload/2ee72d117bfcc8b45d53e4ba082b4d2a/Data/training_data.csv'
path = {'file': training_csv_uri}

# Describe the CSV as an MLTable ...
tbl = mltable.from_delimited_files(paths=[path])

# ... and materialize it as a pandas DataFrame.
df = tbl.to_pandas_dataframe()

In [6]:
# Preview the first rows of the training data read back from the datastore.
df.head()

Unnamed: 0,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Current_weather_condition,Future_weather_condition
0,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,True,True
1,2006-04-01 03:00:00,"Port of Turku, Finland",9.222222,0.85,13.9587,258,14.9569,1016.66,True,True
2,2006-04-01 04:00:00,"Port of Turku, Finland",7.733333,0.95,12.3648,259,9.982,1016.72,True,True
3,2006-04-01 05:00:00,"Port of Turku, Finland",8.772222,0.89,14.1519,260,9.982,1016.84,True,True
4,2006-04-01 06:00:00,"Port of Turku, Finland",10.822222,0.82,11.3183,259,9.982,1017.37,True,True


In [7]:
# Check the (rows, columns) size of the ingested training data.
df.shape

(77160, 10)

#### Feature Selection and scaling

In [8]:
# Input features: six weather measurements plus the current weather condition.
feature_columns = [
    'Temperature_C',
    'Humidity',
    'Wind_speed_kmph',
    'Wind_bearing_degrees',
    'Visibility_km',
    'Pressure_millibars',
    'Current_weather_condition',
]
X = df[feature_columns].values

# Target: the future weather condition.
y = df['Future_weather_condition'].values
y

array([ True,  True,  True, ...,  True,  True,  True])

In [15]:
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /anaconda/envs/azureml_py38/lib/python3.8/site-packages (1.2.0)


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data as a test set; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [10]:
from sklearn.preprocessing import StandardScaler

# Scaler that is fitted on the train split in the next cell.
sc = StandardScaler()

In [11]:
# Learn the scaling statistics from the train split only, then apply the same
# transformation to both splits.
# Note: X_train / X_test are overwritten, so re-running this cell on its own
# would scale already-scaled data.
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Model training and Testing Step

## 1. Support Vector Machine

In [12]:
# Runs started after this call are recorded under the SVM experiment.
mlflow.set_experiment(experiment_name="mlflow-support-vector-machine")

<Experiment: artifact_location='', creation_time=1671868697927, experiment_id='1d2c9fe3-9b67-4311-930c-a740a1d3d8f2', last_update_time=None, lifecycle_stage='active', name='mlflow-support-vector-machine', tags={}>

In [13]:
#from sklearn.svm import SVC
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [14]:
# Hyper-parameter grid for an SVC grid search (kernel type and regularisation C).
parameters = dict(
    kernel=('linear', 'rbf'),
    C=[1, 10],
)

In [15]:
# Default-configured support vector classifier (replaced by the configured
# model created inside the training run below).
svc = SVC()

In [16]:
# Train and evaluate the SVC inside an MLflow run so that parameters, metrics
# and the fitted model (through autolog) are recorded in the workspace.
with mlflow.start_run() as run:
    mlflow.sklearn.autolog()

    # Record which data asset the model was trained on.
    # Tags are used here because mlflow.log_text(text, artifact_file) writes a
    # text *artifact*: the previous calls stored the label as file content in a
    # file named after the value instead of attaching a key/value to the run.
    mlflow.set_tags({
        "dataset name": registered_data_asset.name,
        "dataset Version": '2',
    })

    svc = SVC(C=1, kernel='rbf')
    svc.fit(X_train, y_train)

    # Evaluate on the held-out test split (macro-averaged over classes).
    predicted_svc = svc.predict(X_test)
    acc = accuracy_score(y_test, predicted_svc)
    fscore = f1_score(y_test, predicted_svc, average="macro")
    precision = precision_score(y_test, predicted_svc, average="macro")
    recall = recall_score(y_test, predicted_svc, average="macro")

    mlflow.log_metric("Test_accuracy", acc)
    mlflow.log_metric("Test_Precision", precision)
    mlflow.log_metric("Test_Recall", recall)
    mlflow.log_metric("Test_F-Score", fscore)

In [17]:
# Display the estimator and its hyper-parameters.
svc

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## 2. Random Forest Classifier

In [18]:
# Runs started after this call are recorded under the random-forest experiment.
mlflow.set_experiment(experiment_name="mlflow-random-forest-classifier")

<Experiment: artifact_location='', creation_time=1671913622865, experiment_id='a629928c-9024-4a56-8e65-50dea1125044', last_update_time=None, lifecycle_stage='active', name='mlflow-random-forest-classifier', tags={}>

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
# Random forest of 100 trees, each limited to depth 10; seeded for reproducibility.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)

In [21]:
# Train and evaluate the random forest inside an MLflow run so that parameters,
# metrics and the fitted model (through autolog) are recorded in the workspace.
with mlflow.start_run() as run:
    mlflow.sklearn.autolog()

    # Record which data asset the model was trained on.
    # Tags are used here because mlflow.log_text(text, artifact_file) writes a
    # text *artifact*: the previous calls stored the label as file content in a
    # file named after the value instead of attaching a key/value to the run.
    mlflow.set_tags({
        "dataset name": registered_data_asset.name,
        "dataset Version": '2',
    })

    rf.fit(X_train, y_train)

    # Evaluate on the held-out test split (macro-averaged over classes).
    predicted_rf = rf.predict(X_test)
    acc = accuracy_score(y_test, predicted_rf)
    fscore = f1_score(y_test, predicted_rf, average="macro")
    precision = precision_score(y_test, predicted_rf, average="macro")
    recall = recall_score(y_test, predicted_rf, average="macro")

    # Same metric names as the SVC run so both models can be compared side by side.
    metric = {
        "Test_accuracy": acc,
        "Test_Precision": precision,
        "Test_Recall": recall,
        "Test_F-Score": fscore,
    }
    mlflow.log_metrics(metric)

# Model Packaging Step

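A trained model can be packaged either as a pickle file or in ONNX format. Below, the classifiers are converted to ONNX.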

In [None]:
!pip install -U scikit-learn
!pip install -U skl2onnx

In [23]:
import os

# Create the folder that receives the packaged model files.
# exist_ok=True keeps the cell re-runnable (a plain `mkdir` errors when the
# folder already exists) and avoids depending on a shell command.
os.makedirs('outputs', exist_ok=True)

In [24]:
# Convert the fitted SVC into an ONNX file.
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Declare the ONNX input with as many columns as the model was trained on.
# The width was previously hard-coded to 6, while the feature matrix built in
# the feature-selection cell has 7 columns.
n_features = X_train.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

onx = convert_sklearn(svc, initial_types=initial_type)
with open("outputs/svc.onnx", "wb") as f:
    f.write(onx.SerializeToString())

The maximum opset needed by this model is only 1.


In [32]:
# Convert the fitted random forest into an ONNX file.
# `rf` must already be fitted (run the random-forest training cell first):
# converting a freshly constructed estimator fails because it has no `classes_`.
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Declare the ONNX input with as many columns as the model was trained on.
# The width was previously hard-coded to 6, while the feature matrix built in
# the feature-selection cell has 7 columns.
n_features = X_train.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

onx = convert_sklearn(rf, initial_types=initial_type)
with open("outputs/rf.onnx", "wb") as f:
    f.write(onx.SerializeToString())

AttributeError: 'RandomForestClassifier' object has no attribute 'classes_'

# Model Registering Step

In [68]:
# Register the ONNX-packaged SVC in the Azure ML workspace (SDK v2).
# The former azureml-core call (Model.register) relied on a `Model` class, a
# `dataset` object and a Workspace object that this notebook never creates
# (`workspace` is just the workspace name string), so it could not run here.
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Model

svc_model = Model(
    path='./outputs/svc.onnx',  # this points to a local file
    type=AssetTypes.CUSTOM_MODEL,
    name="support-vector-classifier",  # this is the name the model is registered as
    description="Support vector classifier to predict weather at port of Turku",
    tags={
        'dataset': registered_data_asset.name,
        # Same dataset version that is recorded on the MLflow training run.
        'version': '2',
        'hyparameter-C': '1',
        # Computed from the held-out test predictions instead of a hard-coded figure.
        'testdata-accuracy': f"{accuracy_score(y_test, predicted_svc):.4f}",
    },
)
model = ml_client.models.create_or_update(svc_model)

print('Name:', model.name)
print('Version:', model.version)

Registering model support-vector-classifier
Name: support-vector-classifier
Version: 1


In [33]:
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Model

# Describe the ONNX file as a custom model asset and upload it to the workspace.
file_model = Model(
    name="support-vector-classifier",
    description="Support vector classifier to predict weather at port of Turku",
    type=AssetTypes.CUSTOM_MODEL,
    path="./outputs/svc.onnx",
)
ml_client.models.create_or_update(file_model)

[32mUploading svc.onnx[32m (< 1 MB): 100%|██████████| 272k/272k [00:00<00:00, 7.08MB/s]
[39m



Model({'job_name': None, 'is_anonymous': False, 'auto_increment_version': False, 'name': 'support-vector-classifier', 'description': 'Support vector classifier to predict weather at port of Turku', 'tags': {}, 'properties': {}, 'id': '/subscriptions/4a571c1c-a483-4a43-9930-490479d70db0/resourceGroups/Learn_MLOps/providers/Microsoft.MachineLearningServices/workspaces/MLOs_WS/models/support-vector-classifier/versions/1', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/eddyhakz1/code/Learn_Mlops/04_ML_Pipelines', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7fda13bef310>, 'serialize': <msrest.serialization.Serializer object at 0x7fda18102f10>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/4a571c1c-a483-4a43-9930-490479d70db0/resourceGroups/Learn_MLOps/workspaces/MLOs_WS/datastores/workspaceblobstore/paths/LocalUpload/1b3639d318948f811f52d534dc56cb04/svc.onnx', 'datastore': None, 'utc_t

In [69]:
# Register the ONNX-packaged random forest in the Azure ML workspace (SDK v2).
# The former azureml-core call (Model.register) relied on a `Model` class, a
# `dataset` object and a Workspace object that this notebook never creates
# (`workspace` is just the workspace name string), so it could not run here.
# Requires ./outputs/rf.onnx, i.e. the random-forest ONNX conversion cell must
# have completed successfully.
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Model

rf_model = Model(
    path='./outputs/rf.onnx',  # this points to a local file
    type=AssetTypes.CUSTOM_MODEL,
    name="random-forest-classifier",  # this is the name the model is registered as
    description="Random forest classifier to predict weather at port of Turku",
    tags={
        'dataset': registered_data_asset.name,
        # Same dataset version that is recorded on the MLflow training run.
        'version': '2',
        # A random forest has no `C`; record its actual hyper-parameters instead.
        'hyperparameter-max_depth': str(rf.max_depth),
        'hyperparameter-n_estimators': str(rf.n_estimators),
        # Computed from the held-out test predictions instead of a hard-coded figure.
        'testdata-accuracy': f"{accuracy_score(y_test, predicted_rf):.4f}",
    },
)
model = ml_client.models.create_or_update(rf_model)

print('Name:', model.name)
print('Version:', model.version)

Registering model random-forest-classifier
Name: random-forest-classifier
Version: 1


In [70]:
import mlflow.sklearn

In [71]:
# Log the fitted scikit-learn SVC as an MLflow model artifact.
# With no active run, log_model starts one implicitly and leaves it open, which
# makes a later `mlflow.start_run()` fail; end the run once logging is done.
# NOTE(review): the artifact path ends in ".onnx", but what is logged is the
# scikit-learn model, not the ONNX file -- consider renaming the path.
try:
    mlflow.sklearn.log_model(svc, 'outputs/svc.onnx')
finally:
    mlflow.end_run()



In [72]:
# Log the fitted scikit-learn random forest as an MLflow model artifact.
# With no active run, log_model starts one implicitly and leaves it open, which
# makes a later `mlflow.start_run()` fail; end the run once logging is done.
# NOTE(review): the artifact path ends in ".onnx", but what is logged is the
# scikit-learn model, not the ONNX file -- consider renaming the path.
try:
    mlflow.sklearn.log_model(rf, 'outputs/rf.onnx')
finally:
    mlflow.end_run()



# Save model artefacts

In [34]:
import pickle

# Persist the fitted scaler so the same scaling can be applied at inference time.
with open('./outputs/scaler.pkl', 'wb') as fh:
    fh.write(pickle.dumps(sc))

In [35]:
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Model

# Describe the pickled scaler as a custom model asset and upload it to the workspace.
file_model = Model(
    name="scaler",
    description="Scaler used for scaling incoming inference data",
    type=AssetTypes.CUSTOM_MODEL,
    path="./outputs/scaler.pkl",
)
ml_client.models.create_or_update(file_model)

[32mUploading scaler.pkl[32m (< 1 MB): 100%|██████████| 599/599 [00:00<00:00, 48.1kB/s]
[39m



Model({'job_name': None, 'is_anonymous': False, 'auto_increment_version': False, 'name': 'scaler', 'description': 'Scaler used for scaling incoming inference data', 'tags': {}, 'properties': {}, 'id': '/subscriptions/4a571c1c-a483-4a43-9930-490479d70db0/resourceGroups/Learn_MLOps/providers/Microsoft.MachineLearningServices/workspaces/MLOs_WS/models/scaler/versions/1', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/eddyhakz1/code/Learn_Mlops/04_ML_Pipelines', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7fda187eb760>, 'serialize': <msrest.serialization.Serializer object at 0x7fda18104bb0>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/4a571c1c-a483-4a43-9930-490479d70db0/resourceGroups/Learn_MLOps/workspaces/MLOs_WS/datastores/workspaceblobstore/paths/LocalUpload/942322bf58670231c40d8ce7d3440d00/scaler.pkl', 'datastore': None, 'utc_time_created': None, 'flavors': None, 'arm_type': '