# Verifying the MLOps environment on GCP with Cloud AI Platform training custom container

This notebook verifies the MLOps environment provisioned on GCP:
1. Create a trainer module and submit a Cloud AI Platform training job using a custom container
2. Verify the training results using the log entries stored in Cloud SQL


## 1. Create and submit Cloud AI Platform training job


In [None]:
import os
import re
from IPython.core.display import display, HTML
from datetime import datetime
import mlflow
import pymysql

In [None]:
# Jupyter magic template to create Python file with variable substitution 
from IPython.core.magic import register_line_cell_magic
@register_line_cell_magic
def writetemplate(line, cell):
    """Write the cell body to a file after ``str.format`` substitution.

    Args:
        line: Text following the magic name; used as the path of the file to write.
        cell: Cell body. It is treated as a ``str.format`` template, so every
            ``{name}`` field is replaced by the global variable of that name and
            literal braces in the body have to be doubled.
    """
    with open(line, 'w') as template_file:
        rendered = cell.format(**globals())
        template_file.write(rendered)

In [None]:
experiment_name = "caipt-test"
mlflow.set_experiment(experiment_name)

mlflow_tracking_uri = mlflow.get_tracking_uri()
MLFLOW_EXPERIMENTS_URI = os.environ['MLFLOW_EXPERIMENTS_URI']
training_artifacts_uri = MLFLOW_EXPERIMENTS_URI+"/caip-training"
REGION=os.environ['MLOPS_REGION']
ML_IMAGE_URI=os.environ['ML_IMAGE_URI']

print(f"MLflow tracking server URI: {mlflow_tracking_uri}")
print(f"MLflow artifacts store root: {MLFLOW_EXPERIMENTS_URI}")
print(f"MLflow SQL connction name: {os.environ['MLFLOW_SQL_CONNECTION_NAME']}")
print(f"MLflow SQL connction string: {os.environ['MLFLOW_SQL_CONNECTION_STR']}")

display(HTML('<hr>You can check results of this test in MLflow and GCS folder:'))
display(HTML('<h4><a href="{}" rel="noopener noreferrer" target="_blank">Click to open MLflow UI</a></h4>'.format(os.environ['MLFLOW_TRACKING_EXTERNAL_URI'])))
display(HTML('<h4><a href="https://console.cloud.google.com/storage/browser/{}" rel="noopener noreferrer" target="_blank">Click to open GCS folder</a></h4>'.format(MLFLOW_EXPERIMENTS_URI.replace('gs://',''))))

!mkdir -p ./package/training

### 1.1. Create model trainer file
The following cells will write out python module files that will be sent as a training module to Cloud AI Platform Training.
At first, we implement a simple Scikit-learn model training routine.

In [None]:
%%writefile ./package/training/task.py

import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.linear_model import LogisticRegression
import sys
import argparse
import os

def train_model(args):
    """Fit a logistic regression on a tiny built-in data set and log it to MLflow.

    Args:
        args: Parsed training arguments; only ``args.epochs`` is read and it is
            passed to the model as ``max_iter``.

    The value returned by the model's ``score`` on the training data is logged as
    the "score" metric and the fitted model is logged under the "model" artifact
    path, both inside a nested MLflow run.
    """
    print("Regularized logistic regression model train step started...")
    # Hard-coded toy data: six samples with a single feature each.
    features = np.array([-2, -1, 0, 1, 2, 1]).reshape(-1, 1)
    labels = np.array([0, 0, 1, 1, 1, 0])
    with mlflow.start_run(nested=True):
        # args.epochs is a training job parameter
        classifier = LogisticRegression(max_iter=args.epochs)
        classifier.fit(features, labels)
        mlflow.log_metric("score", classifier.score(features, labels))
        mlflow.sklearn.log_model(classifier, "model")
    print("LogisticRegression training finished.")

def training_data(local_data):
    """Print the entry names found in the ``local_data`` directory.

    Args:
        local_data: Path of the directory to list.
    """
    listing = os.listdir(local_data)
    report = f"Check local data @: {local_data} :\n{listing}"
    print(report)
    
def upload_data(local, job_dir):
    """Print which local path would be uploaded to which GCS location.

    This is a placeholder: it only prints the message and copies nothing.

    Args:
        local: Local path named in the message.
        job_dir: Destination named in the message.
    """
    notice = f"Upload local data {local} to GCS: {job_dir}"
    print(notice)

def main():
    """Entry point of the training module.

    Parses the job arguments, points MLflow at the tracking server reachable on
    localhost, optionally lists the local data folder and runs the training step.

    Recognized arguments:
        --epochs      int, passed to the model as max_iter (default 100)
        --job-dir     str, job output location (currently not used by the code)
        --local_data  str, folder whose content is listed before training

    Any other argument is reported and ignored.
    """
    # Echo the raw arguments. No str.format() pass is applied to the result, so
    # arguments that contain '{' or '}' are printed verbatim instead of raising.
    print(f'Training arguments: {" ".join(sys.argv[1:])}')
    parser = argparse.ArgumentParser()
    # The default of 100 matches scikit-learn's own LogisticRegression max_iter
    # default, so a job submitted without --epochs does not pass max_iter=None.
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--job-dir', type=str)
    parser.add_argument('--local_data', type=str)
    args, unknown_args = parser.parse_known_args()
    if unknown_args:
        # Make it visible that these arguments have no effect on the training.
        print(f'Ignoring unrecognized arguments: {" ".join(unknown_args)}')

    # CLOUD_ML_JOB contains other CAIP Training runtime parameters as a JSON object
    # job = os.environ['CLOUD_ML_JOB']

    # MLflow locally available
    mlflow.set_tracking_uri('http://127.0.0.1:80')
    mlflow.set_experiment("caipt-test")

    # Data already downloaded from GCS to 'local_data' folder if --data_source argument provided
    # in 'ai-platform jobs submit training' command
    if args.local_data:
        training_data(args.local_data)

    print('Training main started')
    train_model(args)

    # If --job-dir is provided in the 'ai-platform jobs submit' command, any training
    # result can be uploaded to that location:
    # if args.job_dir:
    #     upload_data(args.local_data, args.job_dir)

if __name__ == '__main__':
    main()

Create an empty `__init__.py` file, which is needed for the training module.

In [None]:
%%writefile ./package/training/__init__.py



setup.py to ensure MLFlow modules are installed

In [None]:
%%writefile ./package/setup.py
from setuptools import find_packages
from setuptools import setup

# Pinned packages declared as install requirements of the trainer package.
REQUIRED_PACKAGES = [
    'mlflow==1.13.1',
    'PyMySQL==0.9.3',
]

# Package metadata; find_packages() collects the packages located next to this file.
setup(
    name='trainer',
    version='0.1',
    description='Customer training setup.',
    packages=find_packages(),
    include_package_data=True,
    install_requires=REQUIRED_PACKAGES,
)

### 1.2. Submit training job
Note: Every run of this notebook cell creates a new training job!

In [None]:
# Build a job name and an output folder from the current timestamp, so that every
# execution of this cell refers to a new job and a new folder.
submit_time = datetime.now().strftime("%Y%m%d_%H%M%S")
JOB_NAME=f"training_job_{submit_time}"
JOB_DIR=f"{training_artifacts_uri}/training_{submit_time}"
print(f"Training job name: '{JOB_NAME}' will run in {REGION} region using image from:\n {ML_IMAGE_URI}\n")

# Submit the package in ./package/training as module training.task, using the custom
# container image ML_IMAGE_URI as the master image.
# NOTE(review): task.py as written by this notebook defines --epochs, --job-dir and
# --local_data only; --mlflowuri is not consumed there (it lands in the unknown
# arguments of parse_known_args).
!gcloud ai-platform jobs submit training {JOB_NAME} \
  --region {REGION} \
  --scale-tier BASIC \
  --job-dir {JOB_DIR} \
  --package-path ./package/training/ \
  --module-name training.task \
  --master-image-uri {ML_IMAGE_URI} \
  -- \
  --mlflowuri {MLFLOW_EXPERIMENTS_URI} \
  --epochs 2

### 1.3 Wait for job done
After you submit your job, you can monitor the job status

In [None]:
# Show the description of the job named by the current value of JOB_NAME.
!gcloud ai-platform jobs describe {JOB_NAME}


Training logs

In [None]:
# Stream the logs of the job named by the current value of JOB_NAME into the cell output.
!gcloud ai-platform jobs stream-logs {JOB_NAME}

## 2.0. Cloud AI Platform Training test results
Examine the logged entries in Cloud SQL and the produced artifacts in Cloud Storage through MLflow tracking.

In [None]:
# The connection string has the form
#   mysql+pymysql://<user>:<password>@127.0.0.1:3306/mlflow
# The user name stops at the first ':' so that a password containing ':' is kept whole;
# the dots of the host address are escaped so that they only match literal dots.
sqlauth = re.search(
    r'mysql\+pymysql://(?P<user>[^:]+):(?P<psw>.*)@127\.0\.0\.1:3306/mlflow',
    os.environ['MLFLOW_SQL_CONNECTION_STR'],
    re.DOTALL,
)
if sqlauth is None:
    # Do not echo the connection string here: it contains the password.
    raise ValueError(
        "MLFLOW_SQL_CONNECTION_STR does not match "
        "'mysql+pymysql://<user>:<password>@127.0.0.1:3306/mlflow'"
    )

connection = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    database='mlflow',
    user=sqlauth.group('user'),
    password=sqlauth.group('psw'),
    # This connection is reused for read-only checks over a long time span. Without
    # autocommit every SELECT would run in the transaction opened by the first one and,
    # under InnoDB's default REPEATABLE READ isolation, would not see runs logged later.
    autocommit=True,
)
cursor = connection.cursor()

### 2.2. Retrieve experiment

In [None]:
# Look up the experiment row by name; the first column of the row is used as its id.
cursor.execute(f"SELECT * FROM experiments where name='{experiment_name}' ORDER BY experiment_id desc LIMIT 1")
experiment_rows = cursor.fetchall()
if experiment_rows:
    experiment_id = experiment_rows[0][0]
    print(f"'{experiment_name}' experiment ID: {experiment_id}")
else:
    print("Experiment not found")

### 2.3. Query runs

In [None]:
# Fetch the run of the experiment with the greatest start_time; the first column of
# the row is used as the run id.
cursor.execute(f"SELECT * FROM runs where experiment_id={experiment_id} ORDER BY start_time desc LIMIT 1")
run_rows = cursor.fetchall()
if run_rows:
    entity = run_rows[0]
    run_uuid = entity[0]
    print(f"Last run id of '{experiment_name}' experiment is: {run_uuid}\n")
    print(entity)
else:
    print("No runs found")

### 2.4. Query metrics

In [None]:
# Print every metric row stored for the run selected above.
cursor.execute(f"SELECT * FROM metrics where run_uuid = '{run_uuid}'")
metric_rows = cursor.fetchall()
if metric_rows:
    for metric_row in metric_rows:
        print(metric_row)
else:
    print("No metrics found")

### 2.5. List the artifacts in Cloud Storage

In [None]:
# List the "model" artifact folder of the selected run below the experiments root.
!gsutil ls {MLFLOW_EXPERIMENTS_URI}/{experiment_id}/{run_uuid}/artifacts/model

## 3. Submitting a workflow to Composer to run training in Cloud AI Platform training
This section tests a training job submitted from a Composer workflow by reusing the training module
created earlier in section 1.1. Therefore the training metrics and artifacts will be stored in the 
same 'caipt-test' MLflow experiment.

In [None]:
# Composer environment name and region are read from the notebook environment variables.
COMPOSER_NAME = os.environ['MLOPS_COMPOSER_NAME']
REGION = os.environ['MLOPS_REGION']

# New timestamp: the job submitted through Composer gets its own name and output folder.
submit_time = format(datetime.now(), "%Y%m%d_%H%M%S")
JOB_NAME = f"training_job_{submit_time}"
JOB_DIR = f"{training_artifacts_uri}/training_{submit_time}"
print(f"Training job name: '{JOB_NAME}' will run in {REGION} region using image from:\n {ML_IMAGE_URI}\n")

### 3.1. Importing existing training module

Upload the local training `./package` folder to Composer's GCS bucket.
See more details about [data import](https://cloud.google.com/sdk/gcloud/reference/composer/environments/storage/data/import) and [Composer's folder structure](https://cloud.google.com/composer/docs/concepts/cloud-storage)

In [None]:
# Import the local ./package folder into the data storage of the Composer environment,
# under the destination folder test-sklearn-mlflow-caipt.
!gcloud composer environments storage data import \
    --environment {COMPOSER_NAME} \
    --location {REGION} \
    --source ./package \
    --destination test-sklearn-mlflow-caipt

### 3.2. Uploading the Airflow workflow

In [None]:
%%writetemplate test-sklearn-mlflow-caipt.py

from datetime import timedelta
import airflow
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator

# NOTE: this cell is rendered with str.format by the writetemplate magic, so comments
# and code in it must not contain literal curly braces other than the placeholders.

# Arguments applied to every task of the DAG.
default_args = dict(retries=1,start_date=airflow.utils.dates.days_ago(0))

# Shell command executed by the Bash task. The JOB_NAME, REGION, JOB_DIR, ML_IMAGE_URI
# and MLFLOW_EXPERIMENTS_URI placeholders are replaced with the notebook values at the
# moment this file is written, so the job name is fixed in the generated DAG file.
# The package path points at the folder imported into the Composer data storage.
command="""gcloud ai-platform jobs submit training {JOB_NAME} \
--region {REGION} \
--scale-tier BASIC \
--job-dir {JOB_DIR} \
--package-path /home/airflow/gcs/data/test-sklearn-mlflow-caipt/package/training/ \
--module-name training.task \
--master-image-uri {ML_IMAGE_URI} \
-- \
--mlflowuri {MLFLOW_EXPERIMENTS_URI} \
--epochs 2"""
# Echo the rendered command whenever this DAG file is executed.
print (command)

# schedule_interval=None: no schedule is configured; the notebook triggers the DAG explicitly.
with airflow.DAG(
    "test_sklearn_mlflow_caipt",
    default_args=default_args,
    schedule_interval=None,
    dagrun_timeout=timedelta(minutes=15)) as dag:

    # Placeholder task that runs before the job submission.
    dummy_task = DummyOperator(task_id="dummy_task")
    
    # Runs the gcloud command that submits the training job.
    bash_task = BashOperator(
    task_id="test_sklearn_mlflow_caipt",
    bash_command=command
    )
    
    # Task order: dummy_task first, then bash_task.
    dummy_task >> bash_task

In [None]:
# Import the generated DAG file into the DAG storage of the Composer environment.
!gcloud composer environments storage dags import \
  --environment {COMPOSER_NAME}  \
  --location {REGION} \
  --source test-sklearn-mlflow-caipt.py

#### Check imported Dag

In [None]:
# List the DAG files stored for the Composer environment to confirm the import.
!gcloud composer environments storage dags list \
  --environment {COMPOSER_NAME}  --location {REGION}

### 3.3. Triggering the workflow
Please wait 30-60 seconds after the first import of the Airflow DAG before triggering the workflow.

In [None]:
# Run the Airflow "unpause" command for the test_sklearn_mlflow_caipt DAG in the Composer environment.
!gcloud composer environments run {COMPOSER_NAME} \
    --location {REGION} unpause -- test_sklearn_mlflow_caipt

In [None]:
# Run the Airflow "trigger_dag" command for the test_sklearn_mlflow_caipt DAG in the Composer environment.
!gcloud composer environments run {COMPOSER_NAME} \
    --location {REGION} trigger_dag -- test_sklearn_mlflow_caipt

## 4. Cloud AI Platform Training through Cloud Composer test results

In [None]:
# The connection was opened before the Composer workflow ran and may have been idle
# for a long time: re-establish it if it was dropped.
connection.ping(reconnect=True)
# End any transaction left open by the earlier queries. Otherwise, under InnoDB's
# default REPEATABLE READ isolation, the SELECTs below could keep reading the old
# snapshot and miss the run logged by the Composer-triggered job.
connection.commit()
cursor = connection.cursor()

### 4.1 Retrieve experiment

In [None]:
# Look up the experiment row by name; the first column of the row is used as its id.
experiment_name = "caipt-test"
cursor.execute("SELECT * FROM experiments where name='{}' ORDER BY experiment_id desc LIMIT 1".format(experiment_name))
experiment_rows = cursor.fetchall()
if experiment_rows:
    experiment_id = experiment_rows[0][0]
    print(f"'{experiment_name}' experiment ID: {experiment_id}")
else:
    print("Experiment not found")

### 4.2 Query runs

In [None]:
# Fetch the run of the experiment with the greatest start_time; the first column of
# the row is used as the run id.
cursor.execute("SELECT * FROM runs where experiment_id={} ORDER BY start_time desc LIMIT 1".format(experiment_id))
run_rows = cursor.fetchall()
if run_rows:
    entity = run_rows[0]
    run_uuid = entity[0]
    print(f"Last run id of '{experiment_name}' experiment is: {run_uuid}\n")
    print(entity)
else:
    print("No runs found")

### 4.3 Query metrics

In [None]:
# Print every metric row stored for the run selected above.
cursor.execute("SELECT * FROM metrics where run_uuid = '{}'".format(run_uuid))
metric_rows = cursor.fetchall()
if metric_rows:
    for metric_row in metric_rows:
        print(metric_row)
else:
    print("No metrics found")

### 4.5. List the artifacts in Cloud Storage

In [None]:
# List the "model" artifact folder of the selected run below the experiments root.
!gsutil ls {MLFLOW_EXPERIMENTS_URI}/{experiment_id}/{run_uuid}/artifacts/model