# Verifying the MLOps environment on GCP with Cloud AI Platform Training using a custom container

This notebook verifies the MLOps environment provisioned on GCP:
1. Create a trainer module and submit a Cloud AI Platform training job using a custom container.
2. Test the environment using the training result log entries in Cloud SQL.


## 1. Create and submit a Cloud AI Platform training job


In [None]:
import os
import re
from IPython.core.display import display, HTML
from datetime import datetime
import mlflow
import pymysql

In [None]:
experiment_name = "caipt-test"
mlflow.set_experiment(experiment_name)

mlflow_tracking_uri = mlflow.get_tracking_uri()
mlflow_artifact_uri = os.environ['MLFLOW_EXPERIMENTS_URI']
training_artifacts_uri = mlflow_artifact_uri+"/caip-training"
REGION=os.environ['MLOPS_REGION']
TRAINER_IMAGE_URI=os.environ['TRAINER_IMAGE_URI']

print("MLflow tracking server URI: {}".format(mlflow_tracking_uri))
print("MLflow articfacts store root: {}".format(mlflow_artifact_uri))
print("MLflow SQL connction name: {}".format(os.environ['MLFLOW_SQL_CONNECTION_NAME']))
print("MLflow SQL connction string: {}".format(os.environ['MLFLOW_SQL_CONNECTION_STR']))

display(HTML('<hr>You can check results of this test in MLflow and GCS folder:'))
display(HTML('<h4><a href="{}" rel="noopener noreferrer" target="_blank">Click to open MLflow UI</a></h4>'.format(os.environ['MLFLOW_TRACKING_EXTERNAL_URI'])))
display(HTML('<h4><a href="https://console.cloud.google.com/storage/browser/{}" rel="noopener noreferrer" target="_blank">Click to open GCS folder</a></h4>'.format(mlflow_artifact_uri.replace('gs://',''))))

!mkdir -p ./package/training

### 1.0. Create model trainer file
The following cells write out the Python module files that are sent as a training module to Cloud AI Platform Training.
First, we implement a simple scikit-learn model training routine.

In [None]:
%%writefile ./package/training/task.py

import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.linear_model import LogisticRegression
import sys, stat
import argparse
import os

def train_model(args):
    """Fit a logistic regression on a tiny hard-coded dataset and log it to MLflow.

    Inside an MLflow run, the training accuracy is logged as the "score"
    metric and the fitted estimator is logged under the "model" artifact path.

    Args:
        args: parsed job arguments; ``args.epochs`` is used as the
            estimator's ``max_iter``.
    """
    print("Regularized logistic regression model train step started...")
    with mlflow.start_run(nested=True):
        # Six one-feature samples and their binary labels.
        features = np.array([[-2], [-1], [0], [1], [2], [1]])
        labels = np.array([0, 0, 1, 1, 1, 0])
        # The job's epochs parameter caps the number of solver iterations.
        classifier = LogisticRegression(max_iter=args.epochs).fit(features, labels)
        mlflow.log_metric("score", classifier.score(features, labels))
        mlflow.sklearn.log_model(classifier, "model")
    print("LogisticRegression training finished.")

def training_data(local_data):
    """Print the directory listing of the ``local_data`` folder.

    Args:
        local_data: path of the local folder to list.
    """
    report = "Check local data @: {} :\n{}"
    print(report.format(local_data, os.listdir(local_data)))
    
def upload_data(local, job_dir):
    """Announce an upload of ``local`` to ``job_dir``.

    This is a placeholder: it only prints the message, nothing is copied.

    Args:
        local: local data path named in the message.
        job_dir: destination named in the message.
    """
    announcement = "Upload local data {} to GCS: {}".format(local, job_dir)
    print(announcement)

def main():
    """Entry point of the training module.

    Parses the job arguments, points MLflow at the tracking server on
    127.0.0.1:80, optionally lists the local data folder and runs the
    training step.

    Recognized arguments:
        --epochs: maximum number of solver iterations; defaults to 100,
            which is scikit-learn's own default for ``max_iter``.
        --job-dir: job directory passed by the submit command (currently unused).
        --local_data: local folder whose content is listed before training.

    Any other argument is reported and ignored.
    """
    print('Training arguments: {}'.format(' '.join(sys.argv[1:])))
    parser = argparse.ArgumentParser()
    # Without a default, an omitted --epochs would hand max_iter=None to
    # LogisticRegression, which does not accept it.
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--job-dir', type=str)
    parser.add_argument('--local_data', type=str)
    args, unknown_args = parser.parse_known_args()
    if unknown_args:
        # Make silently dropped arguments visible in the job log.
        print('Ignoring unrecognized arguments: {}'.format(' '.join(unknown_args)))

    # CLOUD_ML_JOB contains other CAIP Training runtime parameters as a JSON object
    # job = os.environ['CLOUD_ML_JOB']

    # MLflow locally available
    mlflow.set_tracking_uri('http://127.0.0.1:80')
    mlflow.set_experiment("caipt-test")

    # Data already downloaded from GCS to 'local_data' folder if --data_source argument provided
    # in 'ai-platform jobs submit training' command
    if args.local_data:
        training_data(args.local_data)

    print('Training main started')
    train_model(args)

    # If --job-dir is provided in the 'ai-platform jobs submit' command, any training
    # result can be uploaded to it:
    # if args.job_dir:
    #     upload_data(args.local_data, args.job_dir)

# Run the training only when the module is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

Create the `__init__.py` file, which is needed for the training module.

In [1]:
%%writefile ./package/training/__init__.py



UsageError: %%writefile is a cell magic, but the cell body is empty.


In [None]:
%%writefile ./package/setup.py
from setuptools import find_packages
from setuptools import setup

# Pinned dependencies installed together with the trainer package.
REQUIRED_PACKAGES = [
    'mlflow==1.8.0',
    'PyMySQL==0.9.3',
]

# Package metadata; every package found next to this file is included.
setup(
    name='trainer',
    version='0.1',
    description='Customer training setup.',
    packages=find_packages(),
    include_package_data=True,
    install_requires=REQUIRED_PACKAGES,
)

### 1.2. Submit training job
Note: Every run of this notebook cell creates a new training job!

In [None]:
# A second-resolution timestamp gives each submission its own job name and output folder.
submit_time = datetime.now().strftime("%Y%m%d_%H%M%S")
JOB_NAME = "training_job_" + submit_time
JOB_DIR = training_artifacts_uri + "/training_" + submit_time

job_summary = "Training job name: '{}' will run in {} region using image from:\n {}\n"
print(job_summary.format(JOB_NAME, REGION, TRAINER_IMAGE_URI))

# Submit the training job. Everything after the bare `--` is forwarded to the training
# module as its own command-line arguments. NOTE(review): task.py's parser declares no
# --mlflowuri option, so that argument ends up among the unknown args — confirm it is needed.
!gcloud ai-platform jobs submit training {JOB_NAME} \
  --region {REGION} \
  --scale-tier BASIC \
  --job-dir {JOB_DIR} \
  --package-path ./package/training/ \
  --module-name training.task \
  --master-image-uri {TRAINER_IMAGE_URI} \
  -- \
  --mlflowuri {mlflow_artifact_uri} \
  --epochs 2

### 1.3. Wait for the job to finish
After you submit your job, you can monitor its status.

In [None]:
# Show the description of the job submitted above; JOB_NAME is set in the submit cell.
!gcloud ai-platform jobs describe {JOB_NAME}


Training logs

In [None]:
# Stream the logs of the submitted job into the cell output.
!gcloud ai-platform jobs stream-logs {JOB_NAME}

## 2.0. Test results
Examine the logged entries in Cloud SQL and the artifacts produced in Cloud Storage through MLflow tracking.

### 2.1. Create a SQL connection to test training results

In [None]:
# Extract the credentials from the mysql+pymysql:// connection string.
# The user name may not contain ':' or '@', so a password containing ':' is
# no longer split in the wrong place; the dots of the host are matched literally.
sqlauth = re.search(
    r'mysql\+pymysql://(?P<user>[^:@]*):(?P<psw>.*)@127\.0\.0\.1:3306/mlflow',
    os.environ['MLFLOW_SQL_CONNECTION_STR'],
    re.DOTALL,
)
if sqlauth is None:
    # Fail with a clear message instead of an AttributeError on None.
    # The connection string itself is not echoed because it contains the password.
    raise ValueError(
        "MLFLOW_SQL_CONNECTION_STR does not match "
        "'mysql+pymysql://<user>:<password>@127.0.0.1:3306/mlflow'"
    )

# Connect to the MLflow database on 127.0.0.1:3306 with the extracted credentials.
connection = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    database='mlflow',
    user=sqlauth.group('user'),
    passwd=sqlauth.group('psw')
)
cursor = connection.cursor()

#### Retrieve experiment

In [None]:
# Look up the newest experiment row that carries the configured experiment name.
experiment_query = "SELECT * FROM experiments where name='{}' ORDER BY experiment_id desc LIMIT 1".format(experiment_name)
cursor.execute(experiment_query)
if cursor.rowcount != 0:
    # The first column of the returned row is used as the experiment id.
    experiment_id = cursor.fetchone()[0]
    print("'{}' experiment ID: {}".format(experiment_name, experiment_id))
else:
    print("Experiment not found")

#### Query runs

In [None]:
# Fetch the most recently started run of the experiment found above.
runs_query = "SELECT * FROM runs where experiment_id={} ORDER BY start_time desc LIMIT 1".format(experiment_id)
cursor.execute(runs_query)
if cursor.rowcount != 0:
    entity = cursor.fetchone()
    # The first column of the run row is used as the run id.
    run_uuid = entity[0]
    print("Last run id of '{}' experiment is: {}\n".format(experiment_name, run_uuid))
    print(entity)
else:
    print("No runs found")

#### Query metrics

In [None]:
# Print every metric row logged for the last run.
metrics_query = "SELECT * FROM metrics where run_uuid = '{}'".format(run_uuid)
cursor.execute(metrics_query)
if cursor.rowcount != 0:
    for metric_row in cursor.fetchall():
        print(metric_row)
else:
    print("No metrics found")

### 2.2. List the artifacts in Cloud Storage

In [None]:
# List the model artifacts of the last run; experiment_id and run_uuid come from the SQL query cells.
!gsutil ls {mlflow_artifact_uri}/{experiment_id}/{run_uuid}/artifacts/model