In [0]:
import os
# Lab-specific settings: edit these three values for your own environment.
PROJECT = 'qwiklabs-gcp-8923d4964bfbd247' # REPLACE WITH YOUR PROJECT ID
BUCKET = 'qwiklabs-gcp-8923d4964bfbd247-bucket' # REPLACE WITH A BUCKET NAME (PUT YOUR PROJECT ID AND WE CREATE THE BUCKET ITSELF NEXT)
REGION = 'us-central1' # REPLACE WITH YOUR REGION e.g. us-central1

# do not change these
# Export the settings so the bash cells can read $PROJECT, $BUCKET and $REGION.
os.environ.update({
    'PROJECT': PROJECT,
    'BUCKET': BUCKET,
    'REGION': REGION,
})

In [0]:
%%bash
# Point the gcloud CLI at the project and region exported by the configuration cell.
# The expansions are quoted so an empty or unusual value is passed as one argument.
gcloud config set project "$PROJECT"
gcloud config set compute/region "$REGION"

Updated property [core/project].
Updated property [compute/region].


In [0]:
# NOTE(review): every line of this cell is commented out, so nothing here runs.
# It looks like a manual one-off submission of the training job that the Airflow DAG
# later in this notebook submits through MLEngineTrainingOperator -- confirm before deleting.
# If it is ever re-enabled, note that it differs from the rest of the notebook:
#   * it pins --runtime-version=1.8, while the DAG's training task uses 1.13;
#   * --package-path points at cloud_composer_automated_ml_pipeline_taxifare/trainer,
#     while the packaging cell below works in cloud_composer_automated_ml_pipeline_taxifare_module.
# %%bash
# OUTDIR=gs://${BUCKET}/trained_model
# JOBNAME=cloud_composer_automated_ml_pipeline_taxifare_$(date -u +%y%m%d_%H%M%S)
# echo $OUTDIR $REGION $JOBNAME
# # Clear the Cloud Storage Bucket used for the training job
# gsutil -m rm -rf $OUTDIR
# gcloud ml-engine jobs submit training $JOBNAME \
#    --region=$REGION \
#    --module-name=trainer.task \
#    --package-path=${PWD}/cloud_composer_automated_ml_pipeline_taxifare/trainer \
#    --job-dir=$OUTDIR \
#    --staging-bucket=gs://$BUCKET \
#    --scale-tier=BASIC \
#    --runtime-version=1.8

In [0]:
%%bash
# Build the trainer source distribution (dist/*.tar.gz) that the next cell uploads to GCS.
# `%%bash` (cell magic) matches the other cells in this notebook; standard IPython only
# registers `bash` as a cell magic.
set -e  # stop at the first failure so setup.py never runs from the wrong directory
cd cloud_composer_automated_ml_pipeline_taxifare_module
touch README.md  # presumably setup.py expects README.md to exist -- confirm
python setup.py sdist

In [0]:
!gsutil cp cloud_composer_automated_ml_pipeline_taxifare_module/dist/cloud_composer_automated_ml_pipeline_taxifare-0.1.tar.gz gs://qwiklabs-gcp-8923d4964bfbd247-bucket/code/

***
# Part Two: Set up a scheduled workflow with Cloud Composer
In this section you will complete a partially written training.py DAG file and copy it to the DAGS folder in your Composer instance.

## Copy your Airflow bucket name
1. Navigate to your Cloud Composer [instance](https://console.cloud.google.com/composer/environments?project=)<br/><br/>
2. Select __DAGs Folder__<br/><br/>
3. You will be taken to the Google Cloud Storage bucket that Cloud Composer has created automatically for your Airflow instance<br/><br/>
4. __Copy the bucket name__ into the variable below (example: us-central1-composer-08f6edeb-bucket)

In [0]:
# REPLACE WITH AIRFLOW BUCKET NAME
AIRFLOW_BUCKET = 'us-central1-cloud-composer-automated-ml-pipeline-taxifare-191f74a9-bucket'

# Export it so the bash cell that uploads the DAG can read ${AIRFLOW_BUCKET}.
os.environ.update(AIRFLOW_BUCKET = AIRFLOW_BUCKET)

## Complete the training.py DAG file
Apache Airflow orchestrates tasks out to other services through a [DAG (Directed Acyclic Graph)](https://airflow.apache.org/concepts.html) file which specifies what services to call, what to do, and when to run these tasks. DAG files are written in python and are loaded automatically into Airflow once present in the Airflow/dags/ folder in your Cloud Composer bucket. 

Your task is to complete the partially written DAG file below, which will enable the automatic retraining and redeployment of our taxi fare prediction model.

__Complete the #TODOs__ in the Airflow DAG file below and execute the code block to save the file

In [0]:
%%writefile airflow/dags/training.py
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""DAG definition for anadarko composer toy model training."""

import airflow
from airflow import DAG

# Reference for all available airflow operators: 
# https://github.com/apache/incubator-airflow/tree/master/airflow/contrib/operators
from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.hooks.base_hook import BaseHook

from airflow.contrib.operators.mlengine_operator import MLEngineTrainingOperator, MLEngineModelOperator, MLEngineVersionOperator
from airflow.models import TaskInstance

import datetime
import logging

def _get_project_id():
    """Get project ID from default GCP connection.

    Reads the extras of the "google_cloud_default" Airflow connection and
    returns the value stored under "extra__google_cloud_platform__project".

    Returns:
        The project ID configured on the connection.

    Raises:
        ValueError: if the project key is absent from the connection extras.
    """

    extras = BaseHook.get_connection("google_cloud_default").extra_dejson
    key = "extra__google_cloud_platform__project"
    if key not in extras:
        # Raise a real exception type: raising a bare string is itself a
        # TypeError and would hide this configuration hint.
        raise ValueError("Must configure project_id in google_cloud_default "
                         "connection from Airflow Console")
    return extras[key]

# Resolved when this file is loaded, from the default GCP connection.
PROJECT_ID = _get_project_id()

# BigQuery source table read by the extraction query. You can change these
# to conform to your data.

# TODO: Specify your BigQuery dataset name and table name
DATASET, TABLE_NAME = "yellow", "trips"

# TODO: Confirm bucket name and region
# GCS bucket (as a gs:// URI) and region, can also be changed.
REGION = "us-central1"
BUCKET = "".join(("gs://", PROJECT_ID, "-bucket"))

# Location of the packaged trainer code under <bucket>/code, and the
# <bucket>/jobs folder.
PACKAGE_URI = "{0}/code/cloud_composer_automated_ml_pipeline_taxifare-0.1.tar.gz".format(BUCKET)
JOB_DIR = "{0}/jobs".format(BUCKET)

# Arguments handed to the DAG as default_args.
default_args = dict(
    owner = "airflow",
    depends_on_past = False,
    start_date = airflow.utils.dates.days_ago(2),
    email = ["airflow@example.com"],
    email_on_failure = True,
    email_on_retry = False,
    retries = 5,
    retry_delay = datetime.timedelta(minutes = 5),
)

# Default schedule interval using cronjob syntax - can be customized here
# or in the Airflow console.

# TODO: Specify a schedule interval in CRON syntax to run once a day at 2100 hours (9pm)
# Reference: https://airflow.apache.org/scheduler.html
schedule_interval = "00 21 * * *"

# TODO: Title your DAG (the first argument is the DAG id)
dag = DAG(
    "cloud_composer_automated_ml_pipeline_taxifare",
    schedule_interval = schedule_interval,
    default_args = default_args,
)

# Reuse this module's docstring as the DAG's markdown documentation.
dag.doc_md = __doc__


#
#
# Task Definition
#
#

# BigQuery data query
# Selects the model columns from the source table; the WHERE clause ends with a
# RAND() filter, and the train/eval queries below append one more AND condition to it.
bql = """
#standardsql
SELECT
    (tolls_amount + fare_amount) AS fare_amount,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) * 1.0 AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) * 1.0 AS hourofday,
    pickup_longitude AS pickuplon,
    pickup_latitude AS pickuplat,
    dropoff_longitude AS dropofflon,
    dropoff_latitude AS dropofflat,
    passenger_count*1.0 AS passengers,
    CONCAT(CAST(pickup_datetime AS STRING), CAST(pickup_longitude AS STRING), CAST(pickup_latitude AS STRING), CAST(dropoff_latitude AS STRING), CAST(dropoff_longitude AS STRING)) AS key
FROM
    `{0}.{1}.{2}`
WHERE
    trip_distance > 0
    AND fare_amount >= 2.5
    AND pickup_longitude > -78
    AND pickup_longitude < -70
    AND dropoff_longitude > -78
    AND dropoff_longitude < -70
    AND pickup_latitude > 37
    AND pickup_latitude < 45
    AND dropoff_latitude > 37
    AND dropoff_latitude < 45
    AND passenger_count > 0
    AND RAND() < 0.0001
""".format(PROJECT_ID, DATASET, TABLE_NAME)

# Hash of the pickup timestamp reduced modulo 5: values 0-3 go to the training
# query and value 4 to the evaluation query.
_PICKUP_HASH_MOD_5 = "MOD(ABS(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING))), 5)"

bql_train = bql + " AND " + _PICKUP_HASH_MOD_5 + " < 4"
bql_eval = bql + " AND " + _PICKUP_HASH_MOD_5 + " = 4"

# TODO: Complete the BigQueryOperator task to truncate the table if it already exists before writing
# Reference: https://airflow.apache.org/integration.html#bigqueryoperator

# Runs the training query and writes its result to the train_data table.
bq_train_data_op = BigQueryOperator(
    dag = dag,
    task_id = "bq_train_data_task",
    bql = bql_train,
    use_legacy_sql = False,
    destination_dataset_table = "{0}.cloud_composer_automated_ml_pipeline_taxifare_train_data".format(DATASET),
    write_disposition = "WRITE_TRUNCATE", # specify to truncate on writes
)

# Same as above for the evaluation query and the eval_data table.
bq_eval_data_op = BigQueryOperator(
    dag = dag,
    task_id = "bq_eval_data_task",
    bql = bql_eval,
    use_legacy_sql = False,
    destination_dataset_table = "{0}.cloud_composer_automated_ml_pipeline_taxifare_eval_data".format(DATASET),
    write_disposition = "WRITE_TRUNCATE", # specify to truncate on writes
)

# Row-count query template; the table is addressed in the bracketed
# [project:dataset.table] form.
sql = """
SELECT
    COUNT(*)
FROM
    [{0}:{1}.{2}]
"""
sql_check_train, sql_check_eval = (
    sql.format(PROJECT_ID, DATASET, checked_table)
    for checked_table in (
        "cloud_composer_automated_ml_pipeline_taxifare_train_data",
        "cloud_composer_automated_ml_pipeline_taxifare_eval_data",
    )
)

# Check to make sure that the data tables are not empty
bq_check_train_data_op = BigQueryCheckOperator(
    dag = dag,
    task_id = "bq_check_train_data_task",
    sql = sql_check_train,
)

bq_check_eval_data_op = BigQueryCheckOperator(
    dag = dag,
    task_id = "bq_check_eval_data_task",
    sql = sql_check_eval,
)

# BigQuery training data export to GCS
# First remove everything under <bucket>/data/.
bash_remove_old_data_op = BashOperator(
    dag = dag,
    task_id = "bash_remove_old_data_task",
    bash_command = "gsutil -m rm -rf " + BUCKET + "/data/*",
)

# TODO: Fill in the missing operator name for task #2 which
# takes a BigQuery dataset and table as input and exports it to GCS as a CSV
train_files = "{0}/data/cloud_composer_automated_ml_pipeline_taxifare/train-*.csv".format(BUCKET)

# Export the train_data table as header-less CSV files matching train_files.
bq_export_gcs_train_csv_op = BigQueryToCloudStorageOperator(
    dag = dag,
    task_id = "bq_export_gcs_train_csv_task",
    source_project_dataset_table = "{0}.cloud_composer_automated_ml_pipeline_taxifare_train_data".format(DATASET),
    destination_cloud_storage_uris = [train_files],
    print_header = False,
    export_format = "CSV",
)

eval_files = "{0}/data/cloud_composer_automated_ml_pipeline_taxifare/eval-*.csv".format(BUCKET)

# Export the eval_data table as header-less CSV files matching eval_files.
bq_export_gcs_eval_csv_op = BigQueryToCloudStorageOperator(
    dag = dag,
    task_id = "bq_export_gcs_eval_csv_task",
    source_project_dataset_table = "{0}.cloud_composer_automated_ml_pipeline_taxifare_eval_data".format(DATASET),
    destination_cloud_storage_uris = [eval_files],
    print_header = False,
    export_format = "CSV",
)


# # ML Engine training job
# job_id embeds datetime.now(), so its value is fixed each time this file is
# loaded rather than when the task starts.
job_id = "cloud_composer_automated_ml_pipeline_taxifare_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
# NOTE(review): output_dir is not passed to the trainer here, yet the saved-model
# copy task further down reads from it -- presumably the trainer writes there by
# default; confirm against trainer.task.
output_dir = "{0}/trained_model".format(BUCKET)
job_dir = "{0}/jobs/{1}".format(BUCKET, job_id)
training_args = ["--job-dir", job_dir]

# # TODO: Fill in the missing operator name for task #3 which will start a new training job to Cloud ML Engine
# # Reference: https://airflow.apache.org/integration.html#cloud-ml-engine
# # https://cloud.google.com/ml-engine/docs/tensorflow/machine-types
ml_engine_training_op = MLEngineTrainingOperator(
    dag = dag,
    task_id = "ml_engine_training_task",
    project_id = PROJECT_ID,
    region = REGION,
    job_id = job_id,
    package_uris = [PACKAGE_URI],
    training_python_module = "trainer.task",
    training_args = training_args,
    scale_tier = "BASIC",
    runtime_version = "1.13",
    python_version = "3.5",
)

# Name and version under which the model is deployed, and the GCS folder the
# deployed version is served from.
MODEL_NAME = "cloud_composer_automated_ml_pipeline_taxifare"
MODEL_VERSION = "v1"
MODEL_LOCATION = "{0}/saved_model".format(BUCKET)

# Empty the serving folder before the new export is copied in.
bash_remove_old_saved_model_op = BashOperator(
    dag = dag,
    task_id = "bash_remove_old_saved_model_task",
    bash_command = "gsutil -m rm -rf " + MODEL_LOCATION + "/*",
)

# Sync the last entry listed under <output_dir>/export/exporter/ into the serving folder.
bash_copy_new_saved_model_op = BashOperator(
    dag = dag,
    task_id = "bash_copy_new_saved_model_task",
    bash_command = "gsutil -m rsync -d -r `gsutil ls {src}/export/exporter/ | tail -1` {dst}".format(
        src = output_dir, dst = MODEL_LOCATION),
)

# Create model on ML-Engine
# List the models whose name matches MODEL_NAME. xcom_push = True publishes the
# command output to XCom so the branch task can decide whether the model still
# has to be created.
bash_ml_engine_models_list_op = BashOperator(
    task_id = "bash_ml_engine_models_list_task",
    xcom_push = True,
    # Single-quoted Python string so the double quotes around the filter
    # expression reach the shell; nesting them inside a double-quoted literal
    # is a syntax error that stops the whole DAG file from loading.
    bash_command = 'gcloud ml-engine models list --filter="name:{0}"'.format(MODEL_NAME),
    dag = dag
)

def check_if_model_already_exists(**kwargs):
    """Choose the branch to follow depending on whether the model is already listed.

    Pulls the XCom value pushed by "bash_ml_engine_models_list_task" from the
    task instance passed as kwargs["ti"].

    Returns:
        "ml_engine_create_model_task" when the listing is missing, empty or
        "Listed 0 items."; "dont_create_model_dummy_branch_task" otherwise.
    """
    ml_engine_models_list = kwargs["ti"].xcom_pull(task_ids = "bash_ml_engine_models_list_task")
    logging.info("check_if_model_already_exists: ml_engine_models_list = \n{}".format(ml_engine_models_list))
    # xcom_pull returns None when nothing was pushed; treat that like an empty
    # listing instead of failing on len(None). Surrounding whitespace is ignored.
    listing = (ml_engine_models_list or "").strip()
    if not listing or listing == "Listed 0 items.":
        return "ml_engine_create_model_task"
    return "dont_create_model_dummy_branch_task"

# Branch on the models listing: follow either the create-model task or the
# "don't create" placeholder.
check_if_model_already_exists_op = BranchPythonOperator(
    dag = dag,
    task_id = "check_if_model_already_exists_task",
    provide_context = True,
    python_callable = check_if_model_already_exists,
)

ml_engine_create_model_op = MLEngineModelOperator(
    dag = dag,
    task_id = "ml_engine_create_model_task",
    project_id = PROJECT_ID,
    operation = "create",
    model = {"name": MODEL_NAME},
)

# The two *_dummy_task placeholders use trigger_rule "all_done", so each one
# fires once its upstream task has finished in any state.
create_model_dummy_op = DummyOperator(
    dag = dag,
    task_id = "create_model_dummy_task",
    trigger_rule = "all_done",
)

dont_create_model_dummy_branch_op = DummyOperator(
    dag = dag,
    task_id = "dont_create_model_dummy_branch_task",
)

dont_create_model_dummy_op = DummyOperator(
    dag = dag,
    task_id = "dont_create_model_dummy_task",
    trigger_rule = "all_done",
)

# Create version of model on ML-Engine
# List the versions of MODEL_NAME whose name matches MODEL_VERSION. xcom_push = True
# publishes the command output to XCom for the version branch task.
bash_ml_engine_versions_list_op = BashOperator(
    task_id = "bash_ml_engine_versions_list_task",
    xcom_push = True,
    # Single-quoted Python string so the double quotes around the filter
    # expression reach the shell; nesting them inside a double-quoted literal
    # is a syntax error that stops the whole DAG file from loading.
    bash_command = 'gcloud ml-engine versions list --model {0} --filter="name:{1}"'.format(MODEL_NAME, MODEL_VERSION),
    dag = dag
)

def check_if_model_version_already_exists(**kwargs):
    """Choose which version-creation task to run based on the versions listing.

    Pulls the XCom value pushed by "bash_ml_engine_versions_list_task" from the
    task instance passed as kwargs["ti"].

    Returns:
        "ml_engine_create_version_task" when the listing is missing, empty or
        "Listed 0 items."; "ml_engine_create_other_version_task" otherwise.
    """
    ml_engine_versions_list = kwargs["ti"].xcom_pull(task_ids = "bash_ml_engine_versions_list_task")
    logging.info("check_if_model_version_already_exists: ml_engine_versions_list = \n{}".format(ml_engine_versions_list))
    # xcom_pull returns None when nothing was pushed; treat that like an empty
    # listing instead of failing on len(None). Surrounding whitespace is ignored.
    listing = (ml_engine_versions_list or "").strip()
    if not listing or listing == "Listed 0 items.":
        return "ml_engine_create_version_task"
    return "ml_engine_create_other_version_task"

# Branch on the versions listing: create MODEL_VERSION, or the timestamped
# alternative when the listing is not empty.
check_if_model_version_already_exists_op = BranchPythonOperator(
    dag = dag,
    task_id = "check_if_model_version_already_exists_task",
    provide_context = True,
    python_callable = check_if_model_version_already_exists,
)

# Fallback version name: "v_" followed by the first 12 characters of the
# YYYYmmddHHMMSS timestamp (i.e. down to the minute), taken when this file is loaded.
OTHER_VERSION_NAME = "v_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")[:12]

# Create MODEL_VERSION from the saved model in MODEL_LOCATION.
ml_engine_create_version_op = MLEngineVersionOperator(
    dag = dag,
    task_id = "ml_engine_create_version_task",
    project_id = PROJECT_ID,
    model_name = MODEL_NAME,
    version_name = MODEL_VERSION,
    operation = "create",
    version = {"name": MODEL_VERSION, "deploymentUri": MODEL_LOCATION},
)

# Create the timestamped version from the same MODEL_LOCATION.
ml_engine_create_other_version_op = MLEngineVersionOperator(
    dag = dag,
    task_id = "ml_engine_create_other_version_task",
    project_id = PROJECT_ID,
    model_name = MODEL_NAME,
    version_name = OTHER_VERSION_NAME,
    operation = "create",
    version = {"name": OTHER_VERSION_NAME, "deploymentUri": MODEL_LOCATION},
)

# Make MODEL_VERSION the default version of the model.
ml_engine_set_default_version_op = MLEngineVersionOperator(
    dag = dag,
    task_id = "ml_engine_set_default_version_task",
    project_id = PROJECT_ID,
    model_name = MODEL_NAME,
    version_name = MODEL_VERSION,
    operation = "set_default",
    version = {"name": MODEL_VERSION},
)

# Make the timestamped version the default version of the model.
ml_engine_set_default_other_version_op = MLEngineVersionOperator(
    dag = dag,
    task_id = "ml_engine_set_default_other_version_task",
    project_id = PROJECT_ID,
    model_name = MODEL_NAME,
    version_name = OTHER_VERSION_NAME,
    operation = "set_default",
    version = {"name": OTHER_VERSION_NAME},
)

# TODO: Be sure to set_upstream dependencies for all tasks
# Each entry is (task, upstream task or list of upstream tasks); the loop
# below registers them with set_upstream.
task_dependencies = [
    # Data extraction and validation
    (bq_check_train_data_op, bq_train_data_op),
    (bq_check_eval_data_op, bq_eval_data_op),
    (bash_remove_old_data_op, [bq_check_train_data_op, bq_check_eval_data_op]),

    # CSV export
    (bq_export_gcs_train_csv_op, [bq_train_data_op, bash_remove_old_data_op]),
    (bq_export_gcs_eval_csv_op, [bq_eval_data_op, bash_remove_old_data_op]),

    # Training
    (ml_engine_training_op, [bq_export_gcs_train_csv_op, bq_export_gcs_eval_csv_op]),

    # Saved model refresh
    (bash_remove_old_saved_model_op, ml_engine_training_op),
    (bash_copy_new_saved_model_op, bash_remove_old_saved_model_op),

    # Model creation branch
    (bash_ml_engine_models_list_op, ml_engine_training_op),
    (check_if_model_already_exists_op, bash_ml_engine_models_list_op),
    (ml_engine_create_model_op, check_if_model_already_exists_op),
    (create_model_dummy_op, ml_engine_create_model_op),
    (dont_create_model_dummy_branch_op, check_if_model_already_exists_op),
    (dont_create_model_dummy_op, dont_create_model_dummy_branch_op),

    # Version creation branch
    (bash_ml_engine_versions_list_op, [dont_create_model_dummy_op, create_model_dummy_op]),
    (check_if_model_version_already_exists_op, bash_ml_engine_versions_list_op),
    (ml_engine_create_version_op, [bash_copy_new_saved_model_op, check_if_model_version_already_exists_op]),
    (ml_engine_create_other_version_op, [bash_copy_new_saved_model_op, check_if_model_version_already_exists_op]),

    # Default version
    (ml_engine_set_default_version_op, ml_engine_create_version_op),
    (ml_engine_set_default_other_version_op, ml_engine_create_other_version_op),
]

for downstream_task, upstream_tasks in task_dependencies:
    downstream_task.set_upstream(upstream_tasks)

Overwriting airflow/dags/training.py


In [0]:
import datetime

# Preview the timestamped version name built the same way as OTHER_VERSION_NAME
# in the DAG file, and show how long it is.
timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
string = "v_" + timestamp[:12]
print(string)
print(len(string))

v_201901160411
14


In [0]:
# !gsutil -m cp -r gs://qwiklabs-gcp-8923d4964bfbd247-bucket/models-v1 .

### Copy local Airflow DAG file and plugins into the DAGs folder

In [0]:
%%bash
# Copy the DAG file written above into the dags/ folder of the Composer bucket.
# `%%bash` (cell magic) matches the other cells in this notebook; standard IPython only
# registers `bash` as a cell magic.
gsutil cp airflow/dags/training.py "gs://${AIRFLOW_BUCKET}/dags" # overwrite if it exists

2. Navigate to your Cloud Composer [instance](https://console.cloud.google.com/composer/environments?project=)<br/><br/>

3. Trigger a __manual run__ of your DAG for testing<br/><br/>

4. Ensure your DAG runs successfully (all nodes outlined in dark green and 'success' tag shows)

![Successful Airflow DAG run](./img/airflow_successful_run.jpg "Successful Airflow DAG run")
