# ESML - accelerator: Quick DEMO


In [None]:
import repackage
repackage.add("../azure-enterprise-scale-ml/esml/common/")
from esml import ESMLDataset, ESMLProject

# Create the ESML project object used by every cell below.
# It will search in ROOT for your copied SETTINGS folder '../../../settings';
# you should copy the template settings from '../settings'.
p = ESMLProject()
p.active_model = 11  # presumably a model number defined in the settings - TODO confirm
p.inference_mode = False  # presumably selects the TRAIN (not INFERENCE) context - TODO confirm
p.describe()

In [None]:
# DEMO switch: set to True to unregister every dataset before re-running the demo.
unregister_all_datasets = False
if unregister_all_datasets:
    p.unregister_all_datasets(p.ws)

from azureml.core import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

# Interactive login against the project's tenant, then fetch the workspace by
# name / subscription / resource group taken from the project object.
auth = InteractiveLoginAuthentication(tenant_id=p.tenant)
ws = Workspace.get(
    name=p.workspace_name,
    subscription_id=p.subscription_id,
    resource_group=p.resource_group,
    auth=auth,
)

# Persist the workspace details to a config file.
# NOTE(review): the relative file_name appears to be chosen so that the file
# ends up at the location read by from_config below - confirm against the
# Workspace.write_config path handling.
ws.write_config(path=".", file_name="../../ws_config.json")

# Rebind ws from the saved config file "../ws_config.json".
ws = Workspace.from_config("../ws_config.json")

# 2) ESML will Automap and Autoregister Azure ML Datasets - IN, SILVER, BRONZE, GOLD
- `Automap` and `Autoregister` Azure ML Datasets as: `IN, SILVER, BRONZE, GOLD`

In [None]:
from azureml.core import Workspace
# Authenticate through the ESML project helper; it returns the workspace and a config name.
ws, config_name = p.authenticate_workspace_and_write_config()
# Rebind ws using the project's load-from-config helper.
ws = p.get_workspace_from_config()
ws.name  # last expression of the cell: shown as the cell output

In [None]:
# Show the project's `rnd` flag; the message labels it as "R&D state (no dataset versioning)".
print("Are we in R&D state (no dataset versioning) = {}".format(p.rnd))

In [None]:
# Initialise the project against the workspace and keep the returned value as `datastore`.
# NOTE(review): presumably this is where the datasets are auto-mapped/registered - confirm.
datastore = p.init(ws)

# 3) IN->`BRONZE->SILVER`->Gold
- Create dataset from PANDAS - Save to SILVER

In [None]:
import pandas as pd 
# Look up the dataset by name and load its BRONZE stage into a pandas DataFrame.
ds = p.DatasetByName("ds01_diabetes")
df = ds.Bronze.to_pandas_dataframe()
df.head()  # last expression of the cell: shows the first rows

## 3) BRONZE-SILVER (EDIT rows & SAVE)
- Test change rows, same structure = new version (and new file added)
- Note: earlier files in the folder are not removed. They are needed for other "versions".
- Expected: For 3 files: New version, 997 rows: 2 older files=627 + 1 new file=370
- Expected (if we delete OLD files): New version, with less rows. 370 instead of 997

In [None]:
# Keep only the rows whose AGE value exceeds 0.015, then print the row counts
# before and after filtering.
df_filtered = df.loc[df["AGE"] > 0.015]
print(len(df), len(df_filtered))

## 3a) Save `SILVER` ds01_diabetes

In [None]:
# Save the filtered DataFrame as the SILVER stage of ds01_diabetes; keep the returned dataset.
aml_silver = p.save_silver(p.DatasetByName("ds01_diabetes"),df_filtered)
aml_silver.name  # last expression of the cell: shows the dataset name

### COMPARE `BRONZE vs SILVER`
- Compare and validate the feature engineering

In [None]:
# Reload both stages of ds01_diabetes and count their rows.
ds01 = p.DatasetByName("ds01_diabetes")
bronze_rows = len(ds01.Bronze.to_pandas_dataframe())
silver_rows = len(ds01.Silver.to_pandas_dataframe())

print("Bronze: {}".format(bronze_rows))  # Expected 442 rows
print("Silver: {}".format(silver_rows))  # Expected 185 rows (filtered)

# Sanity checks for the demo data: fail loudly if the row counts are not the expected ones.
assert bronze_rows == 442, "BRONZE Should have 442 rows to start with, but is {}".format(bronze_rows)
assert silver_rows == 185, "SILVER should have 185 after filtering, but is {}".format(silver_rows)

## 3b) Save  `BRONZE →  SILVER` ds02_other

In [None]:
# Load ds02_other as a DataFrame and save it as that dataset's SILVER stage.
# NOTE(review): the section heading says BRONZE -> SILVER, but the frame is read
# from `.Silver`, not `.Bronze` - confirm which stage is intended.
df_edited = p.DatasetByName("ds02_other").Silver.to_pandas_dataframe()
ds02_silver = p.save_silver(p.DatasetByName("ds02_other"),df_edited)
ds02_silver.name  # last expression of the cell: shows the dataset name

## 3c) Merge all `SILVERS -> then save GOLD`

In [None]:
# Load both SILVER datasets as DataFrames.
df_01 = ds01.Silver.to_pandas_dataframe()
df_02 = ds02_silver.to_pandas_dataframe()

# Index-aligned left join: every row of df_01 is kept, and df_02 columns are
# NULL where df_02 has no matching index.
df_gold1_join = df_01.join(df_02, how="left")

print("Diabetes shape: ", df_01.shape)
print(df_gold1_join.shape)

# Save `GOLD` v1

In [None]:
# Switch the project's rnd flag off before saving GOLD.
p.rnd=False # Allow versioning on DATASETS, to have lineage

In [None]:
# Save the joined DataFrame as GOLD; the returned value is kept as the v1 handle.
ds_gold_v1 = p.save_gold(df_gold1_join)

### 3c) Oops! "faulty" GOLD - too many features

In [None]:
# Print the shape of the project's current GOLD dataset.
print(p.Gold.to_pandas_dataframe().shape) # 19 features...I want 11

In [None]:
# Show the current value of the project's rnd flag (set to False in an earlier cell).
print("Are we in RnD phase? Or do we have 'versioning on datasets=ON'")
print("RnD phase = {}".format(p.rnd))

# Save `GOLD` v2

In [None]:
# Lets just go with features from ds01.
# The second GOLD save gets its own name so the handle to the first save
# (ds_gold_v1) is not overwritten.
ds_gold_v2 = p.save_gold(df_01)

# Get `GOLD` by version

In [None]:
# Fetch GOLD version 1 and display its shape.
gold_1 = p.get_gold_version(1)
gold_1.to_pandas_dataframe().shape # (185, 19)

In [None]:
# Fetch GOLD version 2 and display its shape.
gold_2 = p.get_gold_version(2)
gold_2.to_pandas_dataframe().shape # (185, 11)

In [None]:
# Display the shape of the project's current GOLD dataset.
p.Gold.to_pandas_dataframe().shape # Latest version (185, 11)

In [None]:
# Slice the ds01 SILVER frame further (AGE above 0.03807) and save it as a new GOLD.
df_01_filtered = df_01[df_01.AGE > 0.03807]
# This is the third GOLD save in this notebook, so it gets its own name
# rather than rebinding ds_gold_v1.
ds_gold_v3 = p.save_gold(df_01_filtered)

In [None]:
# Fetch GOLD version 3 (sliced, from latest version) under a name that matches
# the version number, so the version-2 handle `gold_2` is not shadowed.
gold_3 = p.get_gold_version(3)
gold_3.to_pandas_dataframe().shape  # (113, 11)

# TRAIN - `AutoMLFactory + ComputeFactory`

In [None]:
from baselayer_azure_ml import AutoMLFactory, ComputeFactory

In [None]:
# Point the project at the "test" environment and echo the setting.
p.dev_test_prod = "test"
print("what environment are we targeting? =  {}".format(p.dev_test_prod)) 

In [None]:
# Fetch the AutoML performance config while the project targets "test" (set in the cell above).
automl_performance_config = p.get_automl_performance_config()
automl_performance_config  # last expression of the cell: displays the config

In [None]:
# Switch the project to "dev" and fetch the performance config again, so the
# value used by the training cell below is the one obtained under "dev".
p.dev_test_prod = "dev"
automl_performance_config = p.get_automl_performance_config()
automl_performance_config  # last expression of the cell: displays the config

# Get `COMPUTE` for current `ENVIRONMENT`

In [None]:
# Ask the project for the training compute in this workspace; used as `compute_target` below.
aml_compute = p.get_training_aml_compute(ws)

# `TRAIN` model -> See other notebook `esml_howto_2_train.ipynb`

In [None]:
from azureml.train.automl import AutoMLConfig
from baselayer_azure_ml import azure_metric_regression

# Label column of the active model, used both for the split and for training.
label = p.active_model["label"]

# Split GOLD with a 0.6 train share. Auto-registers in Azure
# (M03_GOLD_TRAIN | M03_GOLD_VALIDATE | M03_GOLD_TEST).
# Alt: train, test = p.Gold.random_split(percentage=0.8, seed=23)
train_6, validate_set_2, test_set_2 = p.split_gold_3(0.6, label)

automl_config = AutoMLConfig(
    task='regression',
    primary_metric=azure_metric_regression.MAE,
    compute_target=aml_compute,
    training_data=p.GoldTrain,  # is 'train_6' pandas dataframe, but as an Azure ML Dataset
    label_column_name=label,
    # DEMO purpose: stop early once this score is reached. Passed as a float,
    # the documented type for experiment_exit_score, instead of a string.
    experiment_exit_score=0.308,
    **automl_performance_config
)

# Train either through a pipeline or as a plain run; both return the same triple.
via_pipeline = False
if via_pipeline:
    best_run, fitted_model, experiment = AutoMLFactory(p).train_pipeline(automl_config)
else:
    best_run, fitted_model, experiment = AutoMLFactory(p).train_as_run(automl_config)

# END

# ESML - accelerator

## PROJECT + DATA CONCEPTS + ENTERPRISE Datalake Design + DEV->PROD MLOps
- `1)ESML Project`: The ONLY thing you need to remember is your `Project number` (and `BRONZE, SILVER, GOLD` concept )
    - ProjectNo=4 has a list of all your datasets as ESMLDatasets. (Well, you also need to provide names for them: "mydata01", "mydata02" - but that's it)
- `2)Lakedesign & Roles`: Bronze, silver, gold + IN and date folders
    - Benefits: Physical datalake design! Connected to Azure ML Workspace, with autoregistration of `Azure ML Datasets`
    - `Role 1`: `Data ingestion team` only need to care about 1 thing - onboard data to `IN-folder`, in .CSV format
        - `Auto parquet-conversion` from `IN` folder (.CSV) to `OUT`/BRONZE/bronze.PARQUET 
    - `Role 2`: `Data scientists` only need to care about 3 things (R/W): `BRONZE, SILVER, GOLD` datasets, all in .PARQUET format
    - How? The ESML project will `Automap` and `Autoregister` Azure ML Datasets - `IN, SILVER, BRONZE, GOLD`
- `2a) R&D  VS Production phase`: "Latest data" VS versioning on Datasets and datefolders  
    - Benefits "R&D mode": Faster RnD phase to onboard and refresh data easy. Also fast "flip-switch" to production
    - How? `ESMLDataset is context self aware` - knows when it is used in TRAIN or INFERENCE pipeline
- `2b) TRAIN vs INFERENCE` versions: `Reuse (Bronze->Silver->Gold) pipeline`, for both TRAIN preprocessing, and INFERENCE
    - Benefits: Inference with different MODEL version, on data from the same day/time, (to compare scoring etc)
    - How? ESMLDataset have context self awareness, and `knows WHERE and HOW to load/save data`
- `2c) BATCH CONFIG`: Turn on/off features on ALL datasets
    - Accelerate setup: `Datadrift, Time series traits, Smart noise, etc`
    - Share refined data back to its "origin/non-projectbased structure" easy: 
        - ESMLProject.ShareBack(ds.Silver)
    - How? ESMLProject controls all ESMLDatasets, in a uniform way
## ENTERPRISE Deployment of Models & Governance - MLOps  at scale
- `3) DEV->TEST-PROD` (configs, compute, performance)
    - ESML has config for 3 environments: Easy DEPLOY model across subscriptions and Azure ML Studio workspaces
        - Save costs & time: 
            - `DEV` has cheaper compute performance for TRAIN and INFERENCE (batch, AKS)
            - `DEV` has Quick-debug ML training (fast training...VS good scoring in TEST and PROD)
        - How? ESML `AutoMLFactory` and `ComputeFactory`
         

### Q&A:
- Q: Is ESML Machine learning specific? If I only want to refine some data...for integration, or report? 
- A: You can use this for just data refinement also: `Bronze->Silver->Gold` refinement.
    - Benefits: Enterprise security, Read/write to datalake, easy to share refined data. 
    - Benefits: The tooling "glued together": Azure Data Factory + Azure Databricks (and Azure ML Studio pipelines if needed)



In [None]:
# Display which environment the project currently targets.
p.dev_test_prod

In [None]:
from baselayer_azure_ml import AutoMLFactory

# Target environment ("dev", test, prod). Does Model A score better than Model B?
target_env = p.dev_test_prod
print("Example: If new model scores better in DEV, we can promote this to TEST")

# Compare the scoring of the current model against the new one in the target environment.
promote, m1_name, r1_id, m2_name, r2_run_id = AutoMLFactory(p).compare_scoring_current_vs_new_model(target_env)

print("Promote model?  {}".format(promote))
print("New Model: {} in environment {}".format(m1_name, p.dev_test_prod))
print("Existing Model: {} in environment {}".format(m2_name, target_env))

# Can only register a model in same workspace (test->test) - need to retrain
# if going from dev->test.
if promote and p.dev_test_prod == target_env:
    AutoMLFactory(p).register_active_model(target_env)


In [None]:
# Get the inference config, model and best run for the active model, then deploy
# the model to AKS. The deploy call returns the service, its API URI and a third
# value bound to `kv_aks_api_secret` (presumably a key-vault secret reference - TODO confirm).
inference_config, model, best_run = p.get_active_model_inference_config(ws) #  AutoML support 
service,api_uri, kv_aks_api_secret= p.deploy_automl_model_to_aks(model,inference_config)

In [None]:
# Fetch the validation split as features/labels plus tags, and print the tags.
X_test, y_test, tags = p.get_gold_validate_Xy() # Get the X_test data, ESML knows the SPLIT and LABEL already (due to training)
print(tags)

# Score X_test through the deployed webservice.
# Note: this rebinds `df`, which earlier cells used for the BRONZE DataFrame.
df = p.call_webservice(p.ws, X_test,"my_caller_id_tracker_guid") # Auto-fetch key from keyvault, and calls the webservice
df.head()  # last expression of the cell: shows the first rows of the result