# Lab: Chicago taxifare tip prediction on Google Cloud Vertex Pipelines using the TFX SDK

## Learning objectives

## Setup

### Define constants

In [24]:
GOOGLE_CLOUD_PROJECT_ID = !(gcloud config get-value core/project)
GOOGLE_CLOUD_PROJECT_ID = GOOGLE_CLOUD_PROJECT_ID[0]

In [25]:
# Google Cloud region used by this lab's regional resources.
GOOGLE_CLOUD_REGION = "us-central1"

In [19]:
# BigQuery dataset, table and location used for the lab's training data.
BQ_DATASET_NAME = "chicago_taxi"
BQ_TABLE_NAME = "chicago_taxi_tips_raw"
BQ_LOCATION = "US"

### Create Google Cloud Storage bucket for storing Vertex Pipeline artifacts

In [22]:
# Bucket URI is derived from the project ID resolved in the constants section.
# (The previous `PROJECT_ID` name is never defined in this notebook and only
# worked through leftover kernel state.)
GCS_BUCKET_NAME = f"gs://{GOOGLE_CLOUD_PROJECT_ID}-taxifare-tip-prediction"

In [23]:
!gsutil mb -l $REGION $GCS_BUCKET_NAME

Creating gs://dougkelly-vertex-demos-taxifare-tip-prediction/...


### Import libraries

In [31]:
import tensorflow as tf
import tfx
import kfp

from google.cloud import bigquery
from google.cloud import aiplatform as vertex_ai

In [29]:
# Report the version of each library this lab depends on, one per line.
for label, module in (
    ("tensorflow", tf),
    ("tfx", tfx),
    ("kfp", kfp),
    ("Google Cloud Vertex AI Python SDK", vertex_ai),
):
    print(f"{label}: {module.__version__}")

tensorflow: 2.6.2
tfx: 1.4.0
kfp: 1.8.1
Google Cloud Vertex AI Python SDK: 1.7.1


## Create BigQuery dataset

In [26]:
!bq --location=$BQ_LOCATION mk -d \
$GOOGLE_CLOUD_PROJECT_ID:$BQ_DATASET_NAME

BigQuery error in mk operation: Dataset 'dougkelly-vertex-demos:chicago_taxi'
already exists.


## Exploratory Data Analysis in BigQuery

## Create BigQuery table for ML classification task

In [32]:
# Row cap (substituted for @LIMIT) and trip year (substituted for @YEAR) used
# when building the training table.
SAMPLE_SIZE = 50_000
YEAR = 2020

In [33]:
# SQL template that (re)creates the training table from the public Chicago
# taxi trips table.
#
# The @PROJECT_ID, @DATASET, @TABLE, @YEAR and @LIMIT tokens are plain-text
# placeholders filled in with str.replace before the script is run; they are
# not BigQuery query parameters.
#
# The `taxitrips` CTE keeps trips from @YEAR that have non-null pickup/dropoff
# coordinates and positive trip_miles, trip_seconds and fare. The outer SELECT
# derives calendar features from trip_start_timestamp, 0.1-grid-snapped
# pickup/dropoff points as text, their concatenation (loc_cross), and:
#   * tip_bin    - 1 when tips/fare >= 0.2, else 0 (the classification label)
#   * data_split - 'TEST' when ABS(MOD(FARM_FINGERPRINT(timestamp), 10)) is 9,
#                  otherwise 'UNASSIGNED'
#
# NOTE(review): the `euclidean` column is computed with ST_Distance on
# GEOGRAPHY points, which per BigQuery docs is a geodesic distance - confirm
# the column name is intentional.
# NOTE(review): LIMIT is applied without ORDER BY, so which rows are kept is
# not specified by the query - confirm this is acceptable for reproducibility.
sql_script = '''
CREATE OR REPLACE TABLE `@PROJECT_ID.@DATASET.@TABLE` 
AS (
    WITH
      taxitrips AS (
      SELECT
        trip_start_timestamp,
        trip_seconds,
        trip_miles,
        payment_type,
        pickup_longitude,
        pickup_latitude,
        dropoff_longitude,
        dropoff_latitude,
        tips,
        fare
      FROM
        `bigquery-public-data.chicago_taxi_trips.taxi_trips`
      WHERE 1=1 
      AND pickup_longitude IS NOT NULL
      AND pickup_latitude IS NOT NULL
      AND dropoff_longitude IS NOT NULL
      AND dropoff_latitude IS NOT NULL
      AND trip_miles > 0
      AND trip_seconds > 0
      AND fare > 0
      AND EXTRACT(YEAR FROM trip_start_timestamp) = @YEAR
    )

    SELECT
      trip_start_timestamp,
      EXTRACT(MONTH from trip_start_timestamp) as trip_month,
      EXTRACT(DAY from trip_start_timestamp) as trip_day,
      EXTRACT(DAYOFWEEK from trip_start_timestamp) as trip_day_of_week,
      EXTRACT(HOUR from trip_start_timestamp) as trip_hour,
      trip_seconds,
      trip_miles,
      payment_type,
      ST_AsText(
          ST_SnapToGrid(ST_GeogPoint(pickup_longitude, pickup_latitude), 0.1)
      ) AS pickup_grid,
      ST_AsText(
          ST_SnapToGrid(ST_GeogPoint(dropoff_longitude, dropoff_latitude), 0.1)
      ) AS dropoff_grid,
      ST_Distance(
          ST_GeogPoint(pickup_longitude, pickup_latitude), 
          ST_GeogPoint(dropoff_longitude, dropoff_latitude)
      ) AS euclidean,
      CONCAT(
          ST_AsText(ST_SnapToGrid(ST_GeogPoint(pickup_longitude,
              pickup_latitude), 0.1)), 
          ST_AsText(ST_SnapToGrid(ST_GeogPoint(dropoff_longitude,
              dropoff_latitude), 0.1))
      ) AS loc_cross,
      IF((tips/fare >= 0.2), 1, 0) AS tip_bin,
      IF(ABS(MOD(FARM_FINGERPRINT(STRING(trip_start_timestamp)), 10)) < 9, 'UNASSIGNED', 'TEST') AS data_split
    FROM
      taxitrips
    LIMIT @LIMIT
)
'''

In [34]:
# Fill in the @-placeholders of the SQL template with this notebook's
# constants. The project placeholder uses GOOGLE_CLOUD_PROJECT_ID; the
# previous `PROJECT_ID` name is never defined in this notebook.
sql_script = (
    sql_script
    .replace('@PROJECT_ID', GOOGLE_CLOUD_PROJECT_ID)
    .replace('@DATASET', BQ_DATASET_NAME)
    .replace('@TABLE', BQ_TABLE_NAME)
    .replace('@YEAR', str(YEAR))
    .replace('@LIMIT', str(SAMPLE_SIZE))
)

In [35]:
# Submit the table-creation script to BigQuery and block until it finishes.
bq_client = bigquery.Client(
    project=GOOGLE_CLOUD_PROJECT_ID,
    location=BQ_LOCATION,
)
job = bq_client.query(sql_script)
_ = job.result()  # result rows are not needed; this only waits for completion

In [36]:
%%bigquery

-- Sanity check: number of rows assigned to each data_split value.
SELECT
  data_split,
  COUNT(*)
FROM
  `chicago_taxi.chicago_taxi_tips_raw`
GROUP BY
  data_split

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 1127.50query/s]                        
Downloading: 100%|██████████| 2/2 [00:01<00:00,  1.72rows/s]


Unnamed: 0,data_split,f0_
0,UNASSIGNED,45507
1,TEST,4493


## Create a TFX pipeline

In [38]:
import os

PIPELINE_NAME = "tfx-taxifare-tips"

# The %%writefile cells below write into a directory named after the pipeline.
# %%writefile does not create missing directories, so make sure it exists.
os.makedirs(PIPELINE_NAME, exist_ok=True)

### Write model code

In [39]:
%%writefile {PIPELINE_NAME}/model.py
TODO

Writing tfx-taxifare-tips/model.py


### Write pipeline definition with the TFX SDK

In [None]:
%%writefile {MODEL_DIR}/pipeline.py
from tfx.v1.components import 

def create_pipeline():
    return pipeline.Pipeline

### Compile and run your pipeline on Vertex Pipelines

In [None]:
def compile_and_run_pipeline(
    pipeline,
    pipeline_definition,
    pipeline_name,
    pipeline_root,
    project_id,
    region,
):
    """Compile a TFX pipeline and submit it to Vertex Pipelines.

    This cell was previously a method body pasted from a class (it referenced
    an undefined `self` and had inconsistent indentation); the `self.*`
    attributes are now explicit parameters.

    Args:
        pipeline: The TFX pipeline object to compile.
        pipeline_definition: Path of the pipeline definition file the runner
            writes and the Vertex AI job reads as its template.
        pipeline_name: Display name for the Vertex AI pipeline job.
        pipeline_root: Pipeline root passed to the Vertex AI pipeline job.
        project_id: Google Cloud project in which to run the job.
        region: Google Cloud region in which to run the job.

    Returns:
        The submitted `vertex_ai.PipelineJob`. The job is started with
        `sync=False`, so this function does not wait for it to finish.
    """
    # KubeflowV2DagRunner and its config are exported by the TFX v1 public
    # API; a bare `import tfx` does not expose `tfx.orchestration.experimental`
    # with these classes, hence the local import.
    from tfx import v1 as tfx_v1

    runner = tfx_v1.orchestration.experimental.KubeflowV2DagRunner(
        config=tfx_v1.orchestration.experimental.KubeflowV2DagRunnerConfig(),
        output_filename=pipeline_definition,
    )
    _ = runner.run(pipeline)

    vertex_ai.init(project=project_id, location=region)
    pipeline_job = vertex_ai.PipelineJob(
        display_name=pipeline_name,
        template_path=pipeline_definition,
        pipeline_root=pipeline_root,
    )
    pipeline_job.run(sync=False)
    return pipeline_job