# Verifying the MLOps environment on GCP with Cloud AI Platform training

This notebook verifies the MLOps environment provisioned on GCP in two steps:
1. Submit a Cloud AI Platform training job built from a local Python training package.
2. Verify the run by inspecting the entries it logged to Cloud SQL and the artifacts it wrote to Cloud Storage.


## 1. Create and submit Cloud AI Platform training job


In [None]:
import os
import re
from datetime import date
from datetime import datetime
from urllib.parse import unquote, urlsplit

import mlflow
import mlflow.sklearn
import numpy as np
import pymysql
from IPython.core.display import display, HTML
from sklearn.linear_model import LogisticRegression

In [None]:
experiment_name = "caip-training-test"
mlflow.set_experiment(experiment_name)

mlflow_tracking_uri = mlflow.get_tracking_uri()
mlflow_artifact_uri = os.environ['MLFLOW_EXPERIMENTS_URI']
training_artifacts_uri = mlflow_artifact_uri+"/caip-training"
REGION=os.environ['MLOPS_REGION']

print("MLflow tracking server URI: {}".format(mlflow_tracking_uri))
print("MLflow articfacts store root: {}".format(mlflow_artifact_uri))
print("MLflow SQL connction name: {}".format(os.environ['MLFLOW_SQL_CONNECTION_NAME']))
print("MLflow SQL connction string: {}".format(os.environ['MLFLOW_SQL_CONNECTION_STR']))

display(HTML('<hr>You can check results of this test in MLflow and GCS folder:'))
display(HTML('<h4><a href="{}" rel="noopener noreferrer" target="_blank">Click to open MLflow UI</a></h4>'.format(os.environ['MLFLOW_TRACKING_EXTERNAL_URI'])))
display(HTML('<h4><a href="https://console.cloud.google.com/storage/browser/{}" rel="noopener noreferrer" target="_blank">Click to open GCS folder</a></h4>'.format(mlflow_artifact_uri.replace('gs://',''))))

!mkdir ./scikit_learn

### 1.1. Create model trainer file
We implement a simple Scikit-learn model training routine that logs its score and model to MLflow.

In [None]:
%%writefile ./scikit_learn/test-caip-sklearn-mlflow.py

import mlflow
import mlflow.sklearn
import numpy as np
from datetime import timedelta
from sklearn.linear_model import LogisticRegression

def train_model(**kwargs):
    """Fit a tiny logistic regression and record it in MLflow.

    Trains on a fixed six-sample, single-feature dataset, then logs the
    training accuracy as the ``score`` metric and the fitted estimator as
    the ``model`` artifact inside a run of the ``caip-training-test``
    experiment.

    Args:
        **kwargs: Accepted so command-line options can be forwarded;
            none of them are read by this function.
    """
    print("Regularized logistic regression model train step started...")
    print("MLflow tracking uri: {}".format(mlflow.get_tracking_uri()))
    mlflow.set_experiment("caip-training-test")
    with mlflow.start_run(nested=True):
        # One feature per sample, hence the reshape to a column vector.
        features = np.array([-2, -1, 0, 1, 2, 1]).reshape(-1, 1)
        labels = np.array([0, 0, 1, 1, 1, 0])
        classifier = LogisticRegression(max_iter=2)
        classifier.fit(features, labels)
        # Accuracy on the training data itself.
        training_accuracy = classifier.score(features, labels)
        mlflow.log_metric("score", training_accuracy)
        mlflow.sklearn.log_model(classifier, "model")
    print("Train lr model step finished.")

def parse_cli_args(argv):
    """Convert command-line tokens into a keyword-argument dict.

    Supported forms:
      * ``--key=value`` or ``key=value`` (split on the first ``=``)
      * ``--key value`` (two separate tokens, the form in which the job
        directory option arrives)
      * ``--flag`` with no value, stored as ``True``

    Leading dashes are stripped and inner dashes become underscores, so
    ``--job-dir`` is returned under the key ``job_dir``.

    Args:
        argv: Iterable of argument strings, without the program name.

    Returns:
        dict mapping option names to their string value (or ``True``).
    """
    tokens = list(argv)
    parsed = {}
    position = 0
    while position < len(tokens):
        token = tokens[position]
        position += 1
        if '=' in token:
            key, value = token.split('=', 1)
        elif (token.startswith('--') and position < len(tokens)
              and not tokens[position].startswith('--')):
            # Space-separated option: the next token is its value.
            key, value = token, tokens[position]
            position += 1
        else:
            key, value = token, True
        key = key.lstrip('-').replace('-', '_')
        if key:
            parsed[key] = value
    return parsed


if __name__ == '__main__':
    # Imported here so the script works even though its header does not import sys.
    import sys

    kwargs = parse_cli_args(sys.argv[1:])
    train_model(**kwargs)

Create an empty `__init__.py` file, which is needed so that the directory is treated as a training package.

In [None]:
%%writefile ./scikit_learn/__init__.py



In [None]:
# !gsutil cp test-caip-sklearn-mlflow.py ${training_artifacts_uri}

### 1.2. Submit training job
Note: Every run of this notebook cell creates a new training job!

In [None]:
submit_time = datetime.now().strftime("%Y%m%d_%H%M%S")
JOB_NAME="training_job_{}".format(submit_time)
JOB_DIR="{}/training_{}".format(training_artifacts_uri,submit_time)
print("Training job name: {}".format(JOB_NAME))

!gcloud ai-platform jobs submit training {JOB_NAME} \
  --region {REGION} \
  --job-dir {JOB_DIR} \
  --package-path ./scikit_learn/ \
  --module-name test-caip-sklearn-mlflow \
  --runtime-version=2.1 \
  --python-version=3.7 \
  --scale-tier=BASIC \
  -- \
  --epochs=2
# "Command '['python3', '-m', 'test-caip-sklearn-mlflow', '--model-dir=gs://mlops29-artifacts/experiments/caip-training/training_20200823_152039', '--epochs=2', '--job-dir', 'gs://mlops29-artifacts/experiments/caip-training/training_20200823_152039']' returned non-zero exit status 1." 

#  --master-image-uri $IMAGE_URI \

### 1.3. Wait for the job to finish
After you submit your job, you can monitor its status and stream its logs with the cells below.

In [None]:
# Show the current details of the job submitted above (JOB_NAME is set in the submit cell).
!gcloud ai-platform jobs describe {JOB_NAME}


In [None]:
# Training logs

In [None]:
# Stream the log output of the job submitted above (JOB_NAME is set in the submit cell).
!gcloud ai-platform jobs stream-logs {JOB_NAME}

## 2. Test results
Examine the entries logged in Cloud SQL and the artifacts produced in Cloud Storage through MLflow tracking.

### 2.1. Create a SQL connection to test training results

In [None]:
# MLFLOW_SQL_CONNECTION_STR has the form
#   mysql+pymysql://<user>:<password>@<host>:<port>/<database>
# Splitting it as a URL avoids the pitfalls of a hand-written regex (unescaped
# dots, greedy groups, a None match object when the format differs).
sql_url = urlsplit(os.environ['MLFLOW_SQL_CONNECTION_STR'])
if sql_url.username is None or sql_url.password is None:
    raise ValueError(
        "MLFLOW_SQL_CONNECTION_STR must look like "
        "mysql+pymysql://<user>:<password>@<host>:<port>/<database>"
    )

connection = pymysql.connect(
    # Fall back to the values this notebook previously hard-coded.
    host=sql_url.hostname or '127.0.0.1',
    port=sql_url.port or 3306,
    database=sql_url.path.lstrip('/') or 'mlflow',
    # Credentials inside a URL are percent-encoded; decode them before use.
    user=unquote(sql_url.username),
    passwd=unquote(sql_url.password)
)

#### List tables

In [None]:
# Open the cursor that the following query cells reuse.
cursor = connection.cursor()
cursor.execute("SHOW TABLES")
# Print the first column of every returned row.
for entry in cursor.fetchall():
    print(entry[0])

#### Retrieve experiment

In [None]:
# Look the experiment up by name, letting the driver quote the value.
cursor.execute("SELECT * FROM experiments WHERE name=%s", (experiment_name,))
experiment_rows = cursor.fetchall()
for entry in experiment_rows:
    print(entry)

# Fail loudly instead of silently reusing a stale loop variable when nothing matches.
if not experiment_rows:
    raise LookupError("No experiment named '{}' found in the experiments table".format(experiment_name))

# The first column of the (last) matching row is used as the experiment id.
experiment_id = experiment_rows[-1][0]

#### Query runs

In [None]:
# Fetch every run recorded for the experiment, letting the driver quote the value.
cursor.execute("SELECT * FROM runs WHERE experiment_id=%s", (experiment_id,))
run_rows = cursor.fetchall()
for entry in run_rows:
    print(entry)

# Fail loudly instead of silently reusing a stale loop variable when there are no runs.
if not run_rows:
    raise LookupError("No runs found for experiment id {}".format(experiment_id))

# The first column of the last returned row is used as the run id.
# NOTE(review): the query has no ORDER BY, so with several runs "last" is not
# guaranteed to be the most recent one - consider ordering by the run start time.
run_uuid = run_rows[-1][0]

#### Query metrics

In [None]:
# Show the metrics logged by the selected run, letting the driver quote the value.
cursor.execute("SELECT * FROM metrics WHERE run_uuid = %s", (run_uuid,))
for entry in cursor.fetchall():
    print(entry)

### 2.2. List the artifacts in Cloud Storage

In [None]:
# List the model artifacts of the selected run; experiment_id and run_uuid come from the SQL query cells above.
!gsutil ls {mlflow_artifact_uri}/{experiment_id}/{run_uuid}/artifacts/model