In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from azureml.core.run import Run 
from azureml.core.experiment import Experiment
from azureml.core.model import Model
from azureml.core.workspace import Workspace
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.train.automl import AutoMLConfig

import pickle
import mlflow

In [2]:
from azureml.core import Workspace, Dataset

souscription_id = '6f507867-3280-4f79-8820-9939bef45222'
ressource_group = 'Learn_MLOps'
workspace_name = 'MLOps_WS'

workspace = Workspace(souscription_id, ressource_group, workspace_name)

In [4]:
uri = workspace.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(uri)

In [8]:
# Import processed data from datastore
dataset = Dataset.get_by_name(workspace, name='processed data weather finland port')
print(f'Name of dataset:{dataset.name}\nVerion: {dataset.version}')

Name of dataset:processed data weather finland port
Verion: 1


In [10]:
df = dataset.to_pandas_dataframe()
df.head()

Unnamed: 0,Column1,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_condition,new_Weather_condition
0,4,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1.0
1,5,2006-04-01 03:00:00,"Port of Turku, Finland",9.222222,0.85,13.9587,258,14.9569,1016.66,1,1.0
2,6,2006-04-01 04:00:00,"Port of Turku, Finland",7.733333,0.95,12.3648,259,9.982,1016.72,1,1.0
3,7,2006-04-01 05:00:00,"Port of Turku, Finland",8.772222,0.89,14.1519,260,9.982,1016.84,1,1.0
4,8,2006-04-01 06:00:00,"Port of Turku, Finland",10.822222,0.82,11.3183,259,9.982,1017.37,1,1.0


In [11]:
# Split dataset into train and test dataset. In order to keep temporal order, we will not use train_test_split
# We'll split data en 80% for train and data remaining for test
df.shape

(96449, 11)

* Let's evaluate size of training data \begin{equation*}
  \frac{80}{100} *96449\ \simeq 77160\ 
  \end{equation*}

In [13]:
data_train = df.iloc[:77160]
data_test = df.iloc[77160:]
print(f'Shape train data: {data_train.shape}\nShape test data: {data_test.shape}')

Shape train data: (77160, 11)
Shape test data: (19289, 11)


### Save and register the two dataset

In [15]:
#Save those 2 datasets to datastore connected to our workspace(Microsoft AML workspace)
#First Save to current folder 'Data'
data_train.to_csv('Data_train_test/train_dataset.csv', index=False)
data_test.to_csv('Data_train_test/test_dataset.csv', index=False)

In [17]:
#Upload to datastore
datastore = workspace.get_default_datastore()
datastore.upload(src_dir='Data_train_test', target_path='data')

Uploading an estimated of 2 files
Uploading Data_train_test/test_dataset.csv
Uploaded Data_train_test/test_dataset.csv, 1 files out of an estimated total of 2
Uploading Data_train_test/train_dataset.csv
Uploaded Data_train_test/train_dataset.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_8c487c1d7efb44529eda103259e4ea26

In [20]:
# Register and version
train_dataset = Dataset.Tabular.from_delimited_files(datastore.path('data/train_dataset.csv'))

In [21]:
test_dataset = Dataset.Tabular.from_delimited_files(datastore.path('data/test_dataset.csv'))

In [23]:
training_ds = train_dataset.register(workspace=workspace, name='training dataset', description='Dataset to use to trainin ML model')

In [24]:
test_ds = test_dataset.register(workspace=workspace, name='testing dataset', description='Dataset to use to test ML model')

## Data ingestion training set 

In [25]:
dataset = Dataset.get_by_name(workspace, name='training dataset')
print(f'Dataset name: {dataset.name}\nDataset version: {dataset.version}')

Dataset name: training dataset
Dataset version: 1


In [27]:
df = dataset.to_pandas_dataframe()
df.head()

Unnamed: 0,Column1,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_condition,new_Weather_condition
0,4,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1.0
1,5,2006-04-01 03:00:00,"Port of Turku, Finland",9.222222,0.85,13.9587,258,14.9569,1016.66,1,1.0
2,6,2006-04-01 04:00:00,"Port of Turku, Finland",7.733333,0.95,12.3648,259,9.982,1016.72,1,1.0
3,7,2006-04-01 05:00:00,"Port of Turku, Finland",8.772222,0.89,14.1519,260,9.982,1016.84,1,1.0
4,8,2006-04-01 06:00:00,"Port of Turku, Finland",10.822222,0.82,11.3183,259,9.982,1017.37,1,1.0


In [30]:
df.columns

Index(['Column1', 'Timestamp', 'Location', 'Temperature_C', 'Humidity',
       'Wind_speed_kmph', 'Wind_bearing_degrees', 'Visibility_km',
       'Pressure_millibars', 'Weather_condition', 'new_Weather_condition'],
      dtype='object')

In [28]:
df.shape

(77160, 11)

In [32]:
# Split data into train et validation data
train_col = ['Temperature_C', 'Humidity', 'Wind_speed_kmph', 'Wind_bearing_degrees', 'Visibility_km', 'Pressure_millibars']
X = df[train_col]
y = df['new_Weather_condition']

In [33]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

In [34]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=42)

## Training and Testing Model

* SVM

In [39]:
# Set Experiment into Azure ML
experiment = Experiment(workspace, 'Support-Vector-Machine')

In [40]:
#set experiment with mlflow
mlflow.set_experiment('mlflow-Support-Vector-Machine')

INFO: 'mlflow-Support-Vector-Machine' does not exist. Creating a new experiment


In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
params_grid = {
    'kernel': ('Linear', 'rbf'), 'C':[1, 10]
}
svm_clf = SVC()
# Initialize run in Azure Ml and mlflow
run = experiment.start_logging()
mlflow.start_run()
run.log("dataset name:", dataset.name)
run.log('dataset version:', dataset.version)
grid_svm = GridSearchCV(svm_clf, params_grid, verbose=2)

In [43]:
grid_svm.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] C=1, kernel=Linear ..............................................
[CV] ............................... C=1, kernel=Linear, total=   0.0s
[CV] C=1, kernel=Linear ..............................................
[CV] ............................... C=1, kernel=Linear, total=   0.0s
[CV] C=1, kernel=Linear ..............................................
[CV] ............................... C=1, kernel=Linear, total=   0.0s
[CV] C=1, kernel=Linear ..............................................
[CV] ............................... C=1, kernel=Linear, total=   0.0s
[CV] C=1, kernel=Linear ..............................................
[CV] ............................... C=1, kernel=Linear, total=   0.0s
[CV] C=1, kernel=rbf .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] .................................. C=1, kernel=rbf, total=  16.9s
[CV] C=1, kernel=rbf .................................................
[CV] .................................. C=1, kernel=rbf, total=  16.5s
[CV] C=1, kernel=rbf .................................................
[CV] .................................. C=1, kernel=rbf, total=  16.5s
[CV] C=1, kernel=rbf .................................................
[CV] .................................. C=1, kernel=rbf, total=  17.0s
[CV] C=1, kernel=rbf .................................................
[CV] .................................. C=1, kernel=rbf, total=  16.7s
[CV] C=10, kernel=Linear .............................................
[CV] .............................. C=10, kernel=Linear, total=   0.0s
[CV] C=10, kernel=Linear .............................................
[CV] .............................. C=10, kernel=Linear, total=   0.0s
[CV] C=10, kernel=Linear .............................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  3.0min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 10], 'kernel': ('Linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [47]:
final_model_svm = grid_svm.best_estimator_
final_model_svm

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [49]:
final_model_svm.fit(X_train, y_train)
run.log('C', grid_svm.get_params(deep=True)['estimator__C'])
run.log('kernel', grid_svm.get_params(deep=True)['estimator__kernel'] )

In [50]:
# Test model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [52]:
y_pred = final_model_svm.predict(X_validation)

In [58]:
acc = accuracy_score(y_validation, y_pred)
precision = precision_score(y_validation, y_pred)
recall = recall_score(y_validation, y_pred)
score = f1_score(y_validation, y_pred)
print(f'Accuracy: {acc}\nPrecision: {precision}\nRappel: {recall}\nF1_score: {score}')

Accuracy: 0.9558709175738724
Precision: 0.9661558573607378
Rappel: 0.9840952803669182
F1_score: 0.9750430607981823


In [60]:
import git
repo = git.Repo(search_parent_directories=True)
sha = repo.head.object.hexsha

In [61]:
run.log("Accuracy", acc)
run.log("Precision", precision)
run.log('Rappel', recall)
run.log("f1_score", score)
run.log("fGit-sha", sha)

In [62]:
run.complete()
print(f'run id: {run.id}')

run id: ed1bbf68-44f0-4f83-b2c0-c0406d02545b


In [63]:
mlflow.end_run()

In [64]:
run.get_metrics()

{'dataset name:': 'training dataset',
 'dataset version:': 1,
 'C': 1.0,
 'kernel': 'rbf',
 'Accuracy': 0.9558709175738724,
 'Precision': 0.9661558573607378,
 'Rappel': 0.9840952803669182,
 'f1_score': 0.9750430607981823,
 'fGit-sha': '57b765480a8bc85fdcfb95fa0b925cce3af28cba'}

In [68]:
import mlflow.sklearn
mlflow.sklearn.log_model(final_model_svm, 'outputs')

* Random Forest

In [69]:
# Initialise experiment in Azue ML Worskspace
experiment = Experiment(workspace, "Random-Forest")
#Initialise in mlflow
mlflow.set_experiment("mlflow-Random-Forest")

INFO: 'mlflow-Random-Forest' does not exist. Creating a new experiment


In [70]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(max_depth=10, random_state=42, n_estimators=100)

In [74]:
mlflow.end_run()

In [75]:
run = experiment.start_logging()
mlflow.start_run()

run.log("dataset name", dataset.name)
run.log("dataset version", dataset.version)

In [76]:
%%time
forest_clf.fit(X_train, y_train)

CPU times: user 6.22 s, sys: 2.17 ms, total: 6.22 s
Wall time: 6.34 s


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [77]:
run.log("max_depth", 10)
run.log("random_state", 42)
run.log("n_estimators", 500)

In [78]:
predicted_forest = forest_clf.predict(X_validation)

In [79]:
acc_forest = accuracy_score(y_validation, predicted_forest)
precision_forest = precision_score(y_validation, predicted_forest)
recall_forest = recall_score(y_validation, predicted_forest)
score_forest = f1_score(y_validation, predicted_forest)
print(f'Accuracy: {acc_forest}\nPrecision: {precision_forest}\nRappel: {recall_forest}\nF1_score: {score_forest}')

Accuracy: 0.9569077242094349
Precision: 0.9700826567186014
Rappel: 0.9810622873206095
F1_score: 0.9755415793151642


In [80]:
run.log("Accuracy_forest", acc_forest )
run.log("Precision_forest", precision_forest )
run.log("Recall_forest", recall_forest )
run.log("F1_score_forest", score_forest )

In [81]:
run.complete()
print(f'run id: {run.id}')

run id: cbe8177b-28a4-4293-8010-f7a958067b97


In [82]:
mlflow.end_run()

In [83]:
run.get_metrics()

{'dataset name': 'training dataset',
 'dataset version': 1,
 'max_depth': 10,
 'random_state': 42,
 'n_estimators': 500,
 'Accuracy_forest': 0.9569077242094349,
 'Precision_forest': 0.9700826567186014,
 'Recall_forest': 0.9810622873206095,
 'F1_score_forest': 0.9755415793151642}

## Packaging Model

In [86]:
# We'll use ONNX standard to avoid compatibilities and interoprebilities issues
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

initial_type = [('float_input', FloatTensorType([None, 6]))]
onx = convert_sklearn(final_model_svm, initial_types=initial_type)
with open("outputs/final_model_svm.onnx", "wb") as f:
    f.write(onx.SerializeToString())

The maximum opset needed by this model is only 1.


In [88]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

initial_type = [('float_type', FloatTensorType([None, 6]))]
onx = convert_sklearn(forest_clf, initial_types=initial_type)
with open("outputs/forest_clf.onnx", "wb") as f:
    f.write(onx.SerializeToString())

The maximum opset needed by this model is only 1.
The maximum opset needed by this model is only 9.


## Registering Model

In [92]:
# Register SVM model
model = Model.register(workspace = workspace,
                       model_path = 'outputs/final_model_svm.onnx',
                       tags = {'dataset': dataset.name, 'version': dataset.version, 'Hyperparameter-C':'10', 'testdata-accuracy':'0.9558'},
                       model_framework = 'pandas==0.23.4',
                       model_name = 'Support-vector-Classifier', 
                       description = 'SVM classification to predict weather')

print(f'Model Name: {model.name}\nModel version: {model.version}')

Registering model Support-vector-Classifier
Model Name: Support-vector-Classifier
Model version: 1


In [94]:
#Register Random Classifier Model
model = Model.register(workspace=workspace,
                      model_path = 'outputs/forest_clf.onnx',
                      model_name = 'Random-Forest-Classifier',
                      tags = {'dataset': dataset.name, 'version':dataset.version, 'testdata-accuracy':'0.9569'},
                      model_framework = 'pandas==0.23.4',
                      description = 'Random-Forest to forcast Weather')

print(f'Model Name: {model.name}\nModel version: {model.version}')

Registering model Random-Forest-Classifier
Model Name: Random-Forest-Classifier
Model version: 1


In [95]:
# Registering Artefacts
import pickle as pkl

with open("outputs/scaler.pkl", "wb") as f:
    pkl.dump(sc, f)

In [99]:
model = Model.register(workspace = workspace,
                      model_path = 'outputs/scaler.pkl',
                      model_name = 'Scaler',
                      tags = {'dataset': dataset.name, 'version':dataset.version},
                      description = 'Scaler for scaling inference data into test or production environnement',
                      model_framework = 'pandas==0.23.4')
                      
print(f'Model name: {model.name}\nModel version: {model.version}')

Registering model Scaler
Model name: Scaler
Model version: 1


In [100]:
mlflow.sklearn.log_model(final_model_svm, 'outputs/final_model_svm.onnx')

In [101]:
mlflow.sklearn.log_model(forest_clf, 'outputs/forest_clf.onnx')