# MLOps Coloring Book

This notebook demonstrates how to use AutoMLOps, first with its own component/pipeline helpers and then with the Kubeflow Pipelines (KFP) spec.
### TODO:
- Add comments within each cell marking where our (AutoMLOps) code ends and the user's code starts.

- Allow multiple AutoMLOps pipelines within the same directory.


## 1. Without Using KFP Spec

## Imports

In [4]:
from AutoMLOps import AutoMLOps

In [6]:
%%define_imports
import json
import pandas as pd
from google.cloud import aiplatform
from google.cloud import aiplatform_v1
from google.cloud import bigquery
from google.cloud import storage
import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from joblib import dump
import pickle
import os

## Data Loading

In [7]:
%%define_component
AutoMLOps.makeComponent(
    name="create_dataset",
    description="Loads data from BQ and writes a dataframe as a csv to GCS.", # optional
    params=[
        # Each entry declares one component parameter. The names declared here
        # (bq_table, data_path, project_id) are the variables the rest of this
        # cell reads.
        {"name": "bq_table", "type": str}, # descriptions are optional
        {"name": "data_path", "type": str, "description": "GS location where the training data is written."},
        {"name": "project_id", "type": str, "description": "Project_id."}
    ]
)

# BigQuery client bound to the project given by the project_id parameter.
# NOTE(review): project_id is never assigned in this cell; presumably the
# %%define_component magic turns the declared params into variables -- confirm.
bq_client = bigquery.Client(project=project_id)

def get_query(bq_input_table: str) -> str:
    """Build the SQL statement that selects every column of a BigQuery table.

    Args:
        bq_input_table: Fully qualified name of the table to read
            (e.g. <project>.<dataset>.<table>).

    Returns:
        The query text, with the table name wrapped in backticks.
    """
    # The surrounding whitespace is kept exactly as the original query had it.
    template = "\n    SELECT *\n    FROM `{table}`\n    "
    return template.format(table=bq_input_table)

def load_bq_data(query: str, client: bigquery.Client) -> pd.DataFrame:
    """Run a query through a BigQuery client and return the result as a DataFrame.

    Args:
        query: SQL text to execute.
        client: BigQuery client used to submit the query.

    Returns:
        pd.DataFrame: Whatever the query job's to_dataframe() produces.
    """
    query_job = client.query(query)
    return query_job.to_dataframe()

# Read the whole table (the query is SELECT *) and persist it as CSV at data_path.
dataframe = load_bq_data(get_query(bq_table), bq_client)
# index=False: the positional row index carries no information. Writing it adds
# an unnamed leading column that a plain pd.read_csv() loads back as
# "Unnamed: 0", which would then be treated as a model feature downstream.
dataframe.to_csv(data_path, index=False)

## Model Training

In [8]:
%%define_component
AutoMLOps.makeComponent(
    name="train_model",
    description="Trains a decision tree on the training data.",
    params=[
        # model_directory: destination handed to save_model() later in this cell.
        # data_path: CSV location read with pd.read_csv() later in this cell.
        # NOTE(review): the data_path description below reads as a truncated
        # sentence ("...where the training data."); it looks like "is stored"
        # was dropped -- confirm and fix the string.
        {"name": "model_directory", "type": str, "description": "GS location of saved model."},
        {"name": "data_path", "type": str, "description": "GS location where the training data."}
    ]
)

def save_model(model, model_directory):
    """Pickles a model locally and uploads it to a GCS directory as model.pkl.

    Args:
        model: Any picklable object (here, the fitted estimator).
        model_directory: Destination directory as a gs://<bucket>[/<prefix>] URI.

    Raises:
        ValueError: If model_directory is not a gs:// URI with a bucket name.
    """
    # GCS object names always use '/', whatever the host OS, so the blob name is
    # built with posixpath rather than os.path.
    import posixpath

    scheme = 'gs://'
    if not model_directory.startswith(scheme):
        raise ValueError(
            f'model_directory must be a gs:// URI, got: {model_directory!r}')
    # Everything up to the first '/' after the scheme is the bucket; the rest
    # (possibly empty) is the object prefix.
    bucket_name, _, prefix = model_directory[len(scheme):].partition('/')
    if not bucket_name:
        raise ValueError(
            f'model_directory has no bucket name: {model_directory!r}')

    # The model is staged as a file in the current working directory and then
    # uploaded from there.
    filename = 'model.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(posixpath.join(prefix, filename))
    blob.upload_from_filename(filename)

# Load the training CSV; "Class" is the label column and every remaining
# column is used as a feature.
df = pd.read_csv(data_path)
labels = df.pop("Class").tolist()
data = df.values.tolist()

# Fixed seeds make the train/test split and the fitted tree reproducible, so
# the reported accuracy is comparable between runs.
x_train, x_test, y_train, y_test = train_test_split(data, labels, random_state=42)
skmodel = DecisionTreeClassifier(random_state=42)
skmodel.fit(x_train, y_train)
score = skmodel.score(x_test, y_test)
print('accuracy is:', score)

# save_model() appends the 'model.pkl' file name itself, so it is given the
# directory rather than a full object path.
save_model(skmodel, model_directory)

## Uploading & Deploying the Model

In [9]:
%%define_component
AutoMLOps.makeComponent(
    name="deploy_model",
    description="Uploads the saved model to Vertex AI and deploys it to an endpoint.",
    params=[
        {"name": "model_directory", "type": str, "description": "GS location of saved model."},
        {"name": "project_id", "type": str, "description": "Project_id."},
        {"name": "region", "type": str, "description": "Region."}
    ]
)

# Point the Vertex AI SDK at the target project and region.
aiplatform.init(project=project_id, location=region)
# Register the artifacts under model_directory as a Vertex AI Model served by
# the scikit-learn prediction container image named below.
deployed_model = aiplatform.Model.upload(
    display_name="beans-model-pipeline",
    artifact_uri = model_directory,
    serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest"
)
# NOTE(review): deploy() is called without an endpoint argument; presumably the
# SDK creates a new endpoint on every run -- confirm this is intended.
endpoint = deployed_model.deploy(machine_type="n1-standard-4")

## Define and Run the Pipeline

In [10]:
AutoMLOps.makePipeline(
    name="training-pipeline",
    description="description", # optional
    # Pipeline-level parameters; their values are supplied by the
    # pipeline_params dict defined in the next cell.
    params=[
        {"name": "bq_table", "type": str}, # descriptions are optional
        {"name": "model_directory", "type": str, "description": "Description."},
        {"name": "data_path", "type": str, "description": "Description."},
        {"name": "project_id", "type": str, "description": "Description."},
        {"name": "region", "type": str, "description": "Description."}
    ],
    # One entry per component defined above, listed as create_dataset ->
    # train_model -> deploy_model. Each param_mapping tuple pairs a component
    # parameter with the pipeline parameter that feeds it.
    # NOTE(review): presumably the steps execute in list order -- confirm
    # against the AutoMLOps implementation.
    pipeline=[{
        "component_name": "create_dataset", "param_mapping": [
            ("bq_table", "bq_table"), # (component_param, pipeline_param)
            ("data_path", "data_path"),
            ("project_id", "project_id")
        ]
    },
    {
        "component_name": "train_model", "param_mapping": [
            ("model_directory", "model_directory"),
            ("data_path", "data_path")
        ]
    },
    {
        "component_name": "deploy_model", "param_mapping": [
            ("model_directory", "model_directory"),
            ("project_id", "project_id"),
            ("region", "region")
        ]
    }]
)

In [11]:
# A per-run timestamp gives every run its own model directory. strftime is used
# instead of str(datetime) so the GCS URI contains no spaces or colons;
# microseconds (%f) are kept so two runs in the same second do not collide.
run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S-%f")

# Runtime values for the pipeline parameters declared in makePipeline().
pipeline_params = {
    "bq_table": "sandbox-srastatter.mlops_boxer_test.dry-beans",
    "model_directory": f"gs://mlops-boxer-test/trained_models/{run_timestamp}",
    "data_path": "gs://mlops-boxer-test/data",
    "project_id": "sandbox-srastatter",
    "region": "us-central1"
}

In [12]:
# Hand the parameter values to AutoMLOps for the (non-KFP) component and
# pipeline definitions above.
# NOTE(review): the exact effect of run_local=True is not visible in this
# notebook -- confirm against the AutoMLOps documentation.
AutoMLOps.generate(
    project_id='sandbox-srastatter',
    pipeline_params=pipeline_params,
    use_kfp_spec=False,
    run_local=True,
)

Creating Artifact Registry vertex-mlops-af (if it doesn't exist)...
Creating GS Bucket sandbox-srastatter-bucket (if it doesn't exist)...
Creating Service Account vertex-pipelines@sandbox-srastatter.iam.gserviceaccount.com (if it doesn't exist)...
Creating Cloud Source Repo AutoMLOps-repo (if it doesn't exist)...
Creating Cloud Build Trigger on main branch (if it doesn't exist)...
Updating Service Account privileges (TODO)...
Enabling (TODO) APIs...


## 2. Using KFP Spec

In [None]:
import json
import pandas as pd
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import pipeline, component, Artifact, Dataset, Input, Metrics, Model, Output, InputPath, OutputPath
from google.cloud import aiplatform
from google.cloud import aiplatform_v1
import datetime

import AutoMLOps

In [2]:
@component(
    packages_to_install=[
        "google-cloud-bigquery",
        "pandas",
        "pyarrow",
        "db_dtypes"
    ],
    base_image="python:3.9",
    output_component_file=f"{AutoMLOps.OUTPUT_DIR}/create_dataset.yaml"
)
def create_dataset(
    bq_table: str,
    output_data_path: OutputPath("Dataset"),
    project: str
):
    """Loads a BigQuery table and writes it as a CSV to output_data_path.

    Args:
        bq_table: Full name of the table to read (<project>.<dataset>.<table>).
        output_data_path: Path the CSV is written to.
        project: Project the BigQuery client is created for.
    """
    # Imports live inside the function body, next to the code that uses them.
    from google.cloud import bigquery
    import pandas as pd
    bq_client = bigquery.Client(project=project)


    def get_query(bq_input_table: str) -> str:
        """Generates BQ Query to read data.

        Args:
        bq_input_table: The full name of the bq input table to be read into
        the dataframe (e.g. <project>.<dataset>.<table>)
        Returns: A BQ query string.
        """
        return f"""
        SELECT *
        FROM `{bq_input_table}`
        """

    def load_bq_data(query: str, client: bigquery.Client) -> pd.DataFrame:
        """Loads data from bq into a Pandas Dataframe for EDA.
        Args:
        query: BQ Query to generate data.
        client: BQ Client used to execute query.
        Returns:
        pd.DataFrame: A dataframe with the requested data.
        """
        df = client.query(query).to_dataframe()
        return df

    dataframe = load_bq_data(get_query(bq_table), bq_client)
    # index=False: the positional row index carries no information. Writing it
    # adds an unnamed leading column that a plain pd.read_csv() loads back as
    # "Unnamed: 0", which would then be treated as a model feature downstream.
    dataframe.to_csv(output_data_path, index=False)

In [3]:
@component(
    packages_to_install=[
        "scikit-learn",
        "pandas",
        "joblib",
        "tensorflow"
    ],
    base_image="python:3.9",
    output_component_file=f"{AutoMLOps.OUTPUT_DIR}/train_model.yaml",
)
def train_model(
    output_model_directory: str,
    dataset: Input[Dataset],
    metrics: Output[Metrics],
    model: Output[Model]
):
    """Trains a decision tree on the dataset and pickles it to output_model_directory.

    Args:
        output_model_directory: Directory URI that receives model.pkl.
        dataset: Input CSV artifact; it must contain a "Class" label column.
        metrics: Output artifact that receives accuracy, framework and dataset_size.
        model: Output artifact whose path is set to output_model_directory.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    import pandas as pd
    import tensorflow as tf
    import pickle
    import posixpath

    def save_model(model, uri):
        """Saves a model to uri."""
        # pickle emits bytes, so the file is opened in binary mode.
        with tf.io.gfile.GFile(uri, 'wb') as f:
            pickle.dump(model, f)

    # "Class" is the label column; every remaining column is used as a feature.
    df = pd.read_csv(dataset.path)
    labels = df.pop("Class").tolist()
    data = df.values.tolist()
    # Fixed seeds make the split and the fitted tree reproducible, so the
    # logged accuracy is comparable between runs.
    x_train, x_test, y_train, y_test = train_test_split(data, labels, random_state=42)
    skmodel = DecisionTreeClassifier(random_state=42)
    skmodel.fit(x_train, y_train)
    score = skmodel.score(x_test, y_test)
    print('accuracy is:', score)
    metrics.log_metric("accuracy", (score * 100.0))
    metrics.log_metric("framework", "Scikit Learn")
    metrics.log_metric("dataset_size", len(df))

    # The destination is a URI, so it is joined with '/' via posixpath instead
    # of the host OS separator.
    output_uri = posixpath.join(output_model_directory, 'model.pkl')
    save_model(skmodel, output_uri)
    model.path = output_model_directory

In [4]:
@component(
    packages_to_install=["google-cloud-aiplatform"],
    base_image="python:3.9",
    output_component_file=f"{AutoMLOps.OUTPUT_DIR}/deploy_model.yaml",
)
def deploy_model(
    model: Input[Model],
    project: str,
    region: str,
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model]
):
    # Uploads the trained model artifact to Vertex AI, deploys it, and records
    # the resulting resource names on the two output artifacts.
    from google.cloud import aiplatform

    aiplatform.init(project=project, location=region)

    # Register the artifact at model.uri as a Vertex AI Model.
    uploaded_model = aiplatform.Model.upload(
        artifact_uri=model.uri,
        display_name="beans-model-pipeline",
        serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest",
    )
    serving_endpoint = uploaded_model.deploy(machine_type="n1-standard-4")

    # Expose the created resources to downstream consumers of this component.
    vertex_model.uri = uploaded_model.resource_name
    vertex_endpoint.uri = serving_endpoint.resource_name

In [5]:
%%define_kfp_pipeline

@dsl.pipeline(name='training-pipeline')
def pipeline(
    bq_table: str,
    output_model_directory: str,
    project: str,
    region: str,
):
    # Step 1: export the BigQuery table to a dataset artifact.
    load_step = create_dataset(project=project, bq_table=bq_table)

    # Step 2: train on that dataset and write the model to the given directory.
    train_step = train_model(
        dataset=load_step.output,
        output_model_directory=output_model_directory,
    )

    # Step 3: upload and deploy the trained model. The returned task is not
    # needed by any later step, so it is not bound to a name.
    deploy_model(
        model=train_step.outputs["model"],
        project=project,
        region=region,
    )


In [6]:
# A per-run timestamp gives every run its own model directory. strftime is used
# instead of str(datetime) so the GCS URI contains no spaces or colons;
# microseconds (%f) are kept so two runs in the same second do not collide.
run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S-%f")

# Runtime values for the arguments of the KFP pipeline function.
pipeline_params = {
    "bq_table": "sandbox-srastatter.mlops_boxer_test.dry-beans",
    "output_model_directory": f"gs://mlops-boxer-test/trained_models/{run_timestamp}",
    "project": "sandbox-srastatter",
    "region": "us-central1"
}

In [7]:
# Hand the KFP pipeline and its parameter values to AutoMLOps; the cron
# expression '0 */3 * * *' fires at minute 0 of every third hour.
# NOTE(review): the exact effect of run_local=False is not visible in this
# notebook -- confirm against the AutoMLOps documentation.
AutoMLOps.go(
    project_id='sandbox-srastatter',
    pipeline_params=pipeline_params,
    use_kfp_spec=True,
    run_local=False,
    schedule='0 */3 * * *',
)

Creating Artifact Registry vertex-mlops-af (if it doesn't exist)...
Creating GS Bucket sandbox-srastatter-bucket (if it doesn't exist)...
Creating Service Account vertex-pipelines@sandbox-srastatter.iam.gserviceaccount.com (if it doesn't exist)...
Creating Cloud Source Repo AutoMLOps-repo (if it doesn't exist)...
Creating Cloud Build Trigger on main branch (if it doesn't exist)...
Updating Service Account privileges (TODO)...
Enabling (TODO) APIs...
[main e7eaa52] "Run AutoMLOps"
 10 files changed, 21 insertions(+), 13 deletions(-)


remote: It seems you're using Apple Git (git/2.37.1 (Apple Git-137.1),gzip(gfe),gzip(gfe)). Apple Git is not frequently updated and often has known vulnerabilities. Please follow the instructions at go/old-git-client#gmac to use a more current version of Git.        
To https://source.developers.google.com/p/sandbox-srastatter/r/AutoMLOps-repo
   e4f2f3c..e7eaa52  main -> main


Pushing code to main branch, triggering cloudbuild...
Waiting for cloudbuild job to complete.....................Submitting PipelineJob...


Note: Unnecessary use of -X or --request, POST is already inferred.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

== Info:   Trying 216.239.38.53:443...
== Info: Connected to run-pipeline-wxma5l6a2a-uc.a.run.app (216.239.38.53) port 443 (#0)
== Info: ALPN: offers http/1.1
== Info:  CAfile: /etc/ssl/cert.pem
== Info:  CApath: none
== Info: (304) (OUT), TLS handshake, Client hello (1):
=> Send SSL data, 338 bytes (0x152)
0000: ...N..k...*...Vx.Re.tJ....K.3.#./....o U....c.U.9_mMvM:......2.K
0040: ~.O.9.|.b.............0.,.(.$.......k.9...........=.5...../.+.'.
0080: #.......g.3...E...<./...A.......................+............3.&
00c0: .$... |.d..,............^^.Rj.x.....~q...).'..$run-pipeline-wxma
0100: 5l6a2a-uc.a.run.app.............................................
0140: ..........http/1.1
== Info: (304) (IN), TLS handshake, Server hello (2):
<= Recv SSL data, 122 bytes (0x7a)
0000: ...v........w.+....}.z.<.......eq...Cv U....c.U.9_mMvM:......2.K
0040: ~.O.9.|......3.$... ...h..w"........e"..........FrIG.+....
== Info: (304) (IN), TLS handshake, Unknown (8):
<= Recv SSL data, 21 bytes (0x15)
000

100   225    0     0  100   225      0     43  0:00:05  0:00:05 --:--:--     0

== Info: Mark bundle as not supporting multiuse
<= Recv header, 17 bytes (0x11)
0000: HTTP/1.1 200 OK
<= Recv header, 32 bytes (0x20)
0000: content-type: application/json
<= Recv header, 61 bytes (0x3d)
0000: X-Cloud-Trace-Context: 707848bc5e956ab42eadbc334302fee9;o=1
<= Recv header, 41 bytes (0x29)
0000: X-Google-AppEngine-Module: run-pipeline
<= Recv header, 95 bytes (0x5f)
0000: X-Google-AppEngine-Version: run-pipeline:run-pipeline-00012-huh-
0040: pi9j9lmh41.449026851114377536
<= Recv header, 58 bytes (0x3a)
0000: x-google-appengine-gse-user-execution-time-usec: 4844713
<= Recv header, 57 bytes (0x39)
0000: x-google-appengine-gse-user-readiness-time-usec: 882986
<= Recv header, 33 bytes (0x21)
0000: X-Google-Cache-Control: private
<= Recv header, 49 bytes (0x31)
0000: X-Google-AppEngine-AppId: uc~sandbox-srastatter
<= Recv header, 41 bytes (0x29)
0000: X-Google-AppEngine-RuntimeId: container
<= Recv header, 59 bytes (0x3b)
0000: X-Google-Serverless-Tenant-Project: uc~sandbox-srasta

100   489  100   264  100   225     44     37  0:00:06  0:00:05  0:00:01    55
