# Dataflow

## Setup IAM and networking for Dataflow jobs

### Create a Cloud Storage bucket

In [None]:
# Confirm which account and project gcloud is currently using.
gcloud auth list
gcloud config list project

# Capture the active project and account for the commands that follow.
PROJECT=$(gcloud config list --format 'value(core.project)')
USER_EMAIL=$(gcloud config list account --format 'value(core.account)')
REGION=us-central1

# Create a bucket named after the project (-b on sets the bucket-level access flag).
gsutil mb -p "$PROJECT" -b on "gs://$PROJECT"

### Create a virtual environment

In [None]:
# Create and activate a virtual environment
sudo apt-get install -y python3-venv
python3 -m venv df-env
source df-env/bin/activate

python3 -m pip install -q -U pip setuptools wheel
# Quote the requirement: unquoted, the shell may treat [gcp] as a glob pattern
# (bash can rewrite it if a matching file exists; zsh aborts with "no matches found").
python3 -m pip install -q -U 'apache-beam[gcp]'

# Enable the Dataflow API for the active project.
gcloud services enable dataflow.googleapis.com

### Grant `Dataflow` role

In [None]:
# List the roles currently bound to the signed-in user.
# The filter must be double-quoted so the shell expands $USER_EMAIL; inside
# single quotes gcloud receives the literal text "$USER_EMAIL" and matches nothing.
gcloud projects get-iam-policy "$PROJECT" \
    --format='table(bindings.role)' \
    --flatten='bindings[].members' \
    --filter="bindings.members:$USER_EMAIL"

# Grant the Dataflow admin role to the signed-in user.
gcloud projects add-iam-policy-binding "$PROJECT" \
    --member="user:$USER_EMAIL" \
    --role=roles/dataflow.admin

# Grant the dataflow.worker role to the Compute Engine service account
PROJECT_ID=$(gcloud config get-value project)
# Look the project up by its exact ID. A free-text `projects list --filter`
# can match several projects and return more than one number.
export PROJECT_NUMBER=$(gcloud projects describe "$PROJECT_ID" \
    --format="value(projectNumber)")
export serviceAccount="${PROJECT_NUMBER}-compute@developer.gserviceaccount.com"
gcloud projects add-iam-policy-binding "$PROJECT_ID" \
    --member="serviceAccount:${serviceAccount}" \
    --role="roles/dataflow.worker"

# Run the wordcount example on Dataflow to verify the setup.
python3 -m apache_beam.examples.wordcount \
    --input=gs://dataflow-samples/shakespeare/kinglear.txt \
    --output="gs://$PROJECT/results/outputs" \
    --runner=DataflowRunner \
    --project="$PROJECT" \
    --temp_location="gs://$PROJECT/tmp/" \
    --region="$REGION"

### Launch in private IPs

In [None]:
# Allow the signed-in user to change network settings.
gcloud projects add-iam-policy-binding "$PROJECT" \
    --member="user:$USER_EMAIL" \
    --role=roles/compute.networkAdmin

# Turn on Private Google Access for the default subnet in the region.
gcloud compute networks subnets update default \
    --region="$REGION" \
    --enable-private-ip-google-access

# Re-run wordcount with workers that have no public IP addresses.
python3 -m apache_beam.examples.wordcount \
    --input=gs://dataflow-samples/shakespeare/kinglear.txt \
    --output="gs://$PROJECT/results/outputs" \
    --runner=DataflowRunner \
    --project="$PROJECT" \
    --temp_location="gs://$PROJECT/tmp/" \
    --region="$REGION" \
    --no_use_public_ips \
    --network=default

## Extract-Transform-Load

### Pipeline

In [None]:
import argparse
import time
import logging
import json
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.runners import DataflowRunner, DirectRunner

# ### main

def run():
    """Build and launch a pipeline that loads JSON events into BigQuery.

    Reads ``gs://<project>/events.json`` line by line, parses each line as
    JSON and writes the rows to ``<project>:logs.logs`` using the
    CREATE_IF_NEEDED and WRITE_TRUNCATE dispositions. All five command-line
    flags are required.
    """
    # Command line arguments (all required)
    arg_parser = argparse.ArgumentParser(description='Load from Json into BigQuery')
    required_flags = (
        ('--project', 'Specify Google Cloud project'),
        ('--region', 'Specify Google Cloud region'),
        ('--stagingLocation', 'Specify Cloud Storage bucket for staging'),
        ('--tempLocation', 'Specify Cloud Storage bucket for temp'),
        ('--runner', 'Specify Apache Beam Runner'),
    )
    for flag, description in required_flags:
        arg_parser.add_argument(flag, required=True, help=description)

    opts = arg_parser.parse_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions()
    cloud_options = options.view_as(GoogleCloudOptions)
    cloud_options.project = opts.project
    cloud_options.region = opts.region
    cloud_options.staging_location = opts.stagingLocation
    cloud_options.temp_location = opts.tempLocation
    # The nanosecond clock value is appended to the job name prefix.
    cloud_options.job_name = '{0}{1}'.format('my-pipeline-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    # Static input and output, both derived from the project ID
    source_uri = 'gs://{0}/events.json'.format(opts.project)
    sink_table = '{0}:logs.logs'.format(opts.project)

    # Table schema for BigQuery: (column name, column type) in column order
    columns = (
        ("ip", "STRING"),
        ("user_id", "STRING"),
        ("lat", "FLOAT"),
        ("lng", "FLOAT"),
        ("timestamp", "STRING"),
        ("http_request", "STRING"),
        ("http_response", "INTEGER"),
        ("num_bytes", "INTEGER"),
        ("user_agent", "STRING"),
    )
    table_schema = {
        "fields": [{"name": name, "type": kind} for name, kind in columns]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)

    # Steps:
    # 1) Read something
    # 2) Transform something
    # 3) Write something
    parsed_rows = (
        p
        | 'ReadFromGCS' >> beam.io.ReadFromText(source_uri)
        | 'ParseJson' >> beam.Map(lambda line: json.loads(line))
    )
    parsed_rows | 'WriteToBQ' >> beam.io.WriteToBigQuery(
        sink_table,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
    )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()

if __name__ == '__main__':
    run()

In [None]:
# Resolve the active project once and reuse it for every flag below.
export PROJECT_ID=$(gcloud config get-value project)

# Submit my_pipeline.py to Dataflow.
python3 my_pipeline.py \
    --project="${PROJECT_ID}" \
    --region=us-central1 \
    --stagingLocation="gs://${PROJECT_ID}/staging/" \
    --tempLocation="gs://${PROJECT_ID}/temp/" \
    --runner=DataflowRunner

In [None]:
# Wrap the table's field list in {"BigQuery Schema": ...} to form a full JSON object:
# the first sed expression prefixes line 1, the second appends "}" to the last line.
bq show --schema --format=prettyjson logs.logs \
    | sed -e '1s/^/{"BigQuery Schema":/' -e '$s/$/}/' > schema.json
cat schema.json

# Upload the schema file to the project bucket.
export PROJECT_ID=$(gcloud config get-value project)
gsutil cp schema.json "gs://${PROJECT_ID}/"

## Windows, Watermarks, Triggers

Three main concepts:
1. Group data in windows.
2. Use watermarks to decide when a window is ready to produce results.
3. Use triggers to control when, and how many times, a window emits output.

### Windows

- Windowing divides data into time-based, finite chunks.
- Required when doing aggregations over unbounded data using Beam primitives (GroupByKey, Combiners).

Four types of windows:
1. Fixed
2. Sliding
3. Session
4. Single global

### Watermarks

A watermark is the system’s heuristic-based notion of when all data up to a certain point in event time can be expected to have arrived in the pipeline. Once the watermark progresses past the end of a window, any further element that arrives with a timestamp in that window is considered late data. By default late data is dropped; if the window sets an allowed lateness, late elements that arrive within that period can still be processed (see the trigger examples below).

**Lag time** = the difference in time from when data was expected to arrive and when it actually arrived

Lag is problematic when windowing using event time (as opposed to processing time) because it introduces uncertainty.

**Data freshness** = the amount of time between real time and the output watermark.

**System latency** = the current maximum duration that an item of data has been processing or awaiting processing.

### Triggers

1. Event time: AfterWatermark
2. Processing time: AfterProcessingTime
3. Composite
4. Data-driven: AfterCount

Window accumulation modes:
1. Accumulate
2. Discard

In [None]:
pcoll | WindowInto(
    # Sliding window of 60 seconds, every 5 seconds
    SlidingWindows(60, 5),
    # Relative to the watermark, trigger:
    trigger=AfterWatermark(
        # an early (speculative) firing based on a 30-second processing-time delay
        early=AfterProcessingTime(delay=30),
        # and for every late record (< allowedLateness)
        late=AfterCount(1)
    ),  # this comma was missing, which made the whole call a syntax error
    # the pane should have all the records
    accumulation_mode=AccumulationMode.ACCUMULATING
)

In [None]:
pcoll | WindowInto(
    # Fixed window of 60 seconds
    FixedWindows(60),
    # Accept late records for up to 2 days
    allowed_lateness=Duration(seconds=2 * 24 * 60 * 60),
    # Each firing carries only the records that arrived since the previous firing
    accumulation_mode=AccumulationMode.DISCARDING,
    # Composite trigger: fire repeatedly whenever either 100 elements have
    # accumulated or 60 seconds of processing time have passed (watermark ignored)
    trigger=Repeatedly(AfterAny(AfterCount(100), AfterProcessingTime(60))),
)

### Aggregates site traffic by user.

In [None]:
# batch_user_traffic_pipeline.py
import argparse
import time
import logging
import json
import typing
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.transforms.combiners import CountCombineFn
from apache_beam.runners import DataflowRunner, DirectRunner

# ### functions and classes

class CommonLog (typing.NamedTuple):
    """One web-server log event, as built by ``parse_json`` from a JSON line.

    The field names must match the keys of the incoming JSON objects, because
    ``parse_json`` constructs instances with ``CommonLog(**row)``.
    """
    ip: str
    user_id: str
    lat: float
    lng: float
    timestamp: str
    http_request: str
    http_response: int
    num_bytes: int
    user_agent: str

class PerUserAggregation(typing.NamedTuple):
    """Per-user traffic summary produced by the ``PerUserAggregations`` step.

    The field names match the output names passed to ``aggregate_field`` in
    ``run`` and the column names of the BigQuery table schema.
    """
    user_id: str
    page_views: int
    total_bytes: int
    max_bytes: int
    min_bytes: int

# Register both row types with Beam's RowCoder.
beam.coders.registry.register_coder(CommonLog, beam.coders.RowCoder)
beam.coders.registry.register_coder(PerUserAggregation, beam.coders.RowCoder)

def parse_json(element):
    """Turn one JSON text line into a ``CommonLog`` row.

    The decoded object's keys are passed as keyword arguments, so they must
    match the ``CommonLog`` field names exactly.
    """
    decoded = json.loads(element)
    return CommonLog(**decoded)

def to_dict(element):
    """Return the named-tuple row as a field-name -> value mapping."""
    as_mapping = element._asdict()
    return as_mapping

# ### main

def run():
    """Build and launch the batch pipeline that aggregates site traffic per user.

    Reads JSON events from ``--input_path``, groups them by ``user_id`` and
    computes the page-view count plus the sum, maximum and minimum of
    ``num_bytes``, then writes one row per user to ``--table_name`` using the
    CREATE_IF_NEEDED and WRITE_TRUNCATE dispositions. All flags are required.
    """
    # Command line arguments (all required)
    parser = argparse.ArgumentParser(description='Load from Json into BigQuery')
    required_flags = (
        ('--project', 'Specify Google Cloud project'),
        ('--region', 'Specify Google Cloud region'),
        ('--staging_location', 'Specify Cloud Storage bucket for staging'),
        ('--temp_location', 'Specify Cloud Storage bucket for temp'),
        ('--runner', 'Specify Apache Beam Runner'),
        ('--input_path', 'Path to events.json'),
        ('--table_name', 'BigQuery table name'),
    )
    for flag, description in required_flags:
        parser.add_argument(flag, required=True, help=description)

    opts = parser.parse_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(save_main_session=True)
    cloud_options = options.view_as(GoogleCloudOptions)
    cloud_options.project = opts.project
    cloud_options.region = opts.region
    cloud_options.staging_location = opts.staging_location
    cloud_options.temp_location = opts.temp_location
    cloud_options.job_name = '{0}{1}'.format('batch-user-traffic-pipeline-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.input_path
    table_name = opts.table_name

    # Table schema for BigQuery: user_id is a string, every aggregate is an integer
    columns = (
        ("user_id", "STRING"),
        ("page_views", "INTEGER"),
        ("total_bytes", "INTEGER"),
        ("max_bytes", "INTEGER"),
        ("min_bytes", "INTEGER"),
    )
    table_schema = {
        "fields": [{"name": name, "type": kind} for name, kind in columns]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)

    events = (
        p
        | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
        | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog)
    )

    per_user = (
        events
        | 'PerUserAggregations' >> beam.GroupBy('user_id')
            .aggregate_field('user_id', CountCombineFn(), 'page_views')
            .aggregate_field('num_bytes', sum, 'total_bytes')
            .aggregate_field('num_bytes', max, 'max_bytes')
            .aggregate_field('num_bytes', min, 'min_bytes')
            .with_output_types(PerUserAggregation)
    )

    (
        per_user
        | 'ToDict' >> beam.Map(to_dict)
        | 'WriteToBQ' >> beam.io.WriteToBigQuery(
            table_name,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        )
    )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()

if __name__ == '__main__':
    run()

In [None]:
# Environment for the batch user-traffic job.
export PROJECT_ID=$(gcloud config get-value project)
export REGION='us-central1'
export BUCKET=gs://${PROJECT_ID}
export PIPELINE_FOLDER=${BUCKET}
export RUNNER=DataflowRunner
export INPUT_PATH=${PIPELINE_FOLDER}/events.json
export TABLE_NAME=${PROJECT_ID}:logs.user_traffic

# BASE_DIR is not set anywhere in this notebook. A bare `cd $BASE_DIR` with the
# variable unset silently changes to $HOME, so fall back to the current directory.
cd "${BASE_DIR:-.}"

# Submit the pipeline.
python3 batch_user_traffic_pipeline.py \
    --project="${PROJECT_ID}" \
    --region="${REGION}" \
    --staging_location="${PIPELINE_FOLDER}/staging" \
    --temp_location="${PIPELINE_FOLDER}/temp" \
    --runner="${RUNNER}" \
    --input_path="${INPUT_PATH}" \
    --table_name="${TABLE_NAME}"

### Aggregates site traffic by minute.

In [None]:
# batch_minute_traffic_pipeline.py
import argparse
import time
import logging
import json
import typing
from datetime import datetime
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.transforms.combiners import CountCombineFn
from apache_beam.runners import DataflowRunner, DirectRunner

# ### functions and classes

class CommonLog(typing.NamedTuple):
    """One web-server log event, as built by ``parse_json`` from a JSON line.

    The field names must match the keys of the incoming JSON objects, because
    ``parse_json`` constructs instances with ``CommonLog(**row)``.
    """
    ip: str
    user_id: str
    lat: float
    lng: float
    timestamp: str
    http_request: str
    http_response: int
    num_bytes: int
    user_agent: str

# Register the row type with Beam's RowCoder.
beam.coders.registry.register_coder(CommonLog, beam.coders.RowCoder)

def parse_json(element):
    """Turn one JSON text line into a ``CommonLog`` row.

    The decoded object's keys are passed as keyword arguments, so they must
    match the ``CommonLog`` field names exactly.
    """
    decoded = json.loads(element)
    return CommonLog(**decoded)

def add_timestamp(element):
    """Attach the event's own time to the element as its Beam timestamp.

    :param element: a row with a ``timestamp`` string; the last 8 characters
        are cut off before parsing, leaving ``YYYY-MM-DDTHH:MM:SS``
    :return: ``beam.window.TimestampedValue`` wrapping ``element`` with the
        parsed time in seconds since the Unix epoch
    :raises ValueError: if the remaining text does not match the format
    """
    from datetime import timezone

    event_time = datetime.strptime(element.timestamp[:-8], "%Y-%m-%dT%H:%M:%S")
    # Treat the parsed time as UTC. A naive datetime's .timestamp() uses the
    # machine's local time zone, which shifts every event on a non-UTC host.
    # NOTE(review): assumes the source timestamps are UTC ("...Z") — confirm.
    ts = event_time.replace(tzinfo=timezone.utc).timestamp()
    return beam.window.TimestampedValue(element, ts)

class GetTimestampFn(beam.DoFn):
    """Pair each incoming element with the start time of its window."""

    def process(self, element, window=beam.DoFn.WindowParam):
        """Yield ``{'page_views': element, 'timestamp': <window start, UTC>}``."""
        start_utc = window.start.to_utc_datetime()
        yield {
            'page_views': element,
            'timestamp': start_utc.strftime("%Y-%m-%dT%H:%M:%S"),
        }

# ### main

def run():
    """Build and launch the batch pipeline that counts page views per minute.

    Reads JSON events from ``--input_path``, stamps each with its event time,
    groups them into fixed 60-second windows, counts the events in every
    window and writes ``page_views`` / ``timestamp`` rows to ``--table_name``
    using the CREATE_IF_NEEDED and WRITE_TRUNCATE dispositions.
    """
    # Command line arguments (all required)
    parser = argparse.ArgumentParser(description='Load from Json into BigQuery')
    required_flags = (
        ('--project', 'Specify Google Cloud project'),
        ('--region', 'Specify Google Cloud region'),
        ('--staging_location', 'Specify Cloud Storage bucket for staging'),
        ('--temp_location', 'Specify Cloud Storage bucket for temp'),
        ('--runner', 'Specify Apache Beam Runner'),
        ('--input_path', 'Path to events.json'),
        ('--table_name', 'BigQuery table name'),
    )
    for flag, description in required_flags:
        parser.add_argument(flag, required=True, help=description)

    opts = parser.parse_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(save_main_session=True)
    cloud_options = options.view_as(GoogleCloudOptions)
    cloud_options.project = opts.project
    cloud_options.region = opts.region
    cloud_options.staging_location = opts.staging_location
    cloud_options.temp_location = opts.temp_location
    cloud_options.job_name = '{0}{1}'.format('batch-minute-traffic-pipeline-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.input_path
    table_name = opts.table_name

    # Table schema for BigQuery
    table_schema = {
        "fields": [
            {"name": "page_views", "type": "INTEGER"},
            {"name": "timestamp", "type": "STRING"},
        ]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)

    timestamped_events = (
        p
        | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
        | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog)
        | 'AddEventTimestamp' >> beam.Map(add_timestamp)
    )

    counts_per_minute = (
        timestamped_events
        | "WindowByMinute" >> beam.WindowInto(beam.window.FixedWindows(60))
        | "CountPerMinute" >> beam.CombineGlobally(CountCombineFn()).without_defaults()
        | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
    )

    counts_per_minute | 'WriteToBQ' >> beam.io.WriteToBigQuery(
        table_name,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
    )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()

if __name__ == '__main__':
    run()

In [None]:
# Environment for the batch minute-traffic job.
export PROJECT_ID=$(gcloud config get-value project)
export REGION='us-central1'
export BUCKET=gs://${PROJECT_ID}
export PIPELINE_FOLDER=${BUCKET}
export RUNNER=DataflowRunner
export INPUT_PATH=${PIPELINE_FOLDER}/events.json
export TABLE_NAME=${PROJECT_ID}:logs.minute_traffic

# BASE_DIR is not set anywhere in this notebook. A bare `cd $BASE_DIR` with the
# variable unset silently changes to $HOME, so fall back to the current directory.
cd "${BASE_DIR:-.}"

# Submit the pipeline.
python3 batch_minute_traffic_pipeline.py \
    --project="${PROJECT_ID}" \
    --region="${REGION}" \
    --staging_location="${PIPELINE_FOLDER}/staging" \
    --temp_location="${PIPELINE_FOLDER}/temp" \
    --runner="${RUNNER}" \
    --input_path="${INPUT_PATH}" \
    --table_name="${TABLE_NAME}"

### Streaming analytics

In [None]:
# streaming_event_generator.py
# This program reads a file representing web server logs in common log format and streams them into a PubSub topic
# with lag characteristics as determined by command-line arguments

import argparse
from google.cloud import pubsub_v1
import time
from datetime import datetime, timezone
import random
from anytree.importer import DictImporter
import json
from multiprocessing import Process

# Command-line interface. Only the Pub/Sub project and topic are mandatory.
parser = argparse.ArgumentParser(__file__, description="event_generator")
parser.add_argument(
    "--taxonomy", "-x",
    dest="taxonomy_fp",
    default="taxonomy.json",
    help="A .json file representing a taxonomy of web resources",
)
parser.add_argument(
    "--users_fp", "-u",
    dest="users_fp",
    default="users.csv",
    help="A .csv file of users",
)
parser.add_argument(
    "--off_to_on", "-off",
    dest="off_to_on_prob",
    type=float,
    default=0.25,
    help="A float representing the probability that a user who is offline will come online",
)
parser.add_argument(
    "--on_to_off", "-on",
    dest="on_to_off_prob",
    type=float,
    default=0.1,
    help="A float representing the probability that a user who is online will go offline",
)
parser.add_argument(
    "--max_lag_millis", '-l',
    dest="max_lag_millis",
    type=int,
    default=250,
    help="An integer representing the maximum amount of lag in millisecond",
)
parser.add_argument(
    "--project_id", "-p",
    dest="project_id",
    type=str,
    required=True,
    help="A GCP Project ID",
)
parser.add_argument(
    "--topic_name", "-t",
    dest="topic_name",
    type=str,
    required=True,
    help="The name of the topic where the messages to be published",
)

args = parser.parse_args()

# Settings taken from the command line.
taxonomy_fp = args.taxonomy_fp
users_fp = args.users_fp
online_to_offline_probability = args.on_to_off_prob
offline_to_online_probability = args.off_to_on_prob
max_lag_millis = args.max_lag_millis
project_id = args.project_id
topic_name = args.topic_name

# Fixed simulation settings.
avg_secs_between_events = 5   # each user sleeps uniform(0, 2 * this) between events
min_file_size_bytes = 100     # lower bound (inclusive) of the generated num_bytes
max_file_size_bytes = 500     # upper bound (exclusive) of the generated num_bytes
verbs = ["GET"]
responses = [200]

# Keys of every generated event, in the order generate_event supplies the values.
log_fields = [
    "ip", "user_id", "lat", "lng", "timestamp",
    "http_request", "http_response", "num_bytes", "user_agent",
]

def extract_resources(taxonomy_filepath):
    """
    Reads a .json file representing a taxonomy and returns
    a data structure representing its hierarchical relationship
    :param taxonomy_filepath: a string representing a path to a .json file
    :return: Node representing root of taxonomic tree
    :raises OSError: if the file cannot be opened
    :raises json.JSONDecodeError: if the file does not contain valid JSON
    """
    # The with-statement already closes the file on every path. The former
    # try/finally called fp.close() even when open() itself had failed, which
    # raised UnboundLocalError and hid the real error (e.g. FileNotFoundError).
    with open(taxonomy_filepath, 'r') as fp:
        json_data = json.load(fp)

    return DictImporter().import_(json_data)


def read_users(users_fp):
    """
    Reads a .csv from @users_fp representing users into a list of dictionaries,
    each elt of which represents a user
    :param users_fp: path to a .csv file whose first line holds the field
        names and where each following line represents a user
    :return: a list of dictionaries mapping field name to the string value
    """
    import csv

    # Use the csv module instead of splitting on ",": quoted values that
    # contain commas stay in one field, and blank lines no longer turn into
    # bogus one-key users.
    with open(users_fp, 'r', newline='') as fp:
        reader = csv.reader(fp)
        fields = next(reader, [])
        return [dict(zip(fields, row)) for row in reader if row]

def sleep_then_publish_burst(burst, publisher, topic_path):
    """
    Waits a random time of at most max_lag_millis, then publishes the burst
    :param burst: a list of dictionaries, each representing an event
    :param publisher: a PubSub publisher
    :param topic_path: a topic path for PubSub
    :return:
    """
    max_lag_secs = max_lag_millis/1000
    time.sleep(random.uniform(0, max_lag_secs))
    publish_burst(burst, publisher, topic_path)

def publish_burst(burst, publisher, topic_path):
    """
    Publishes each event as a UTF-8 encoded JSON message; the event's
    'timestamp' value is also sent as the message's ``timestamp`` attribute
    :param burst: a list of dictionaries, each representing an event
    :param publisher: a PubSub publisher
    :param topic_path: a topic path for PubSub
    :return:
    """
    for event in burst:
        payload = json.dumps(event).encode('utf-8')
        publisher.publish(topic_path, data=payload, timestamp=event['timestamp'])

def create_user_process(user, root):
    """
    Code for continuously-running process representing a user publishing
    events to pubsub. Never returns: the loop below runs until the process
    is terminated.
    :param user: a dictionary representing characteristics of the user; the
        keys 'page', 'is_online' and 'offline_events' are (re)set here
    :param root: an instance of AnyNode representing the home page of a website
    :return:
    """
    # The publisher client is created inside the function, i.e. per call.
    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path(project_id, topic_name)

    # Every user starts online, on the root page, with nothing buffered.
    user['page'] = root
    user['is_online'] = True
    user['offline_events'] = []

    while True:
        # Wait a random time averaging avg_secs_between_events seconds.
        time_between_events = random.uniform(0, avg_secs_between_events * 2)
        time.sleep(time_between_events)
        # One draw per iteration decides whether the online/offline state flips.
        prob = random.random()
        event = generate_event(user)
        if user['is_online']:
            if prob < online_to_offline_probability:
                # Going offline: hold this event back instead of publishing it.
                user['is_online'] = False
                user['offline_events'] = [event]
            else:
                sleep_then_publish_burst([event], publisher, topic_path)
        else:
            # Offline: keep buffering events.
            user['offline_events'].append(event)
            if prob < offline_to_online_probability:
                # Coming back online: publish everything buffered as one burst.
                user['is_online'] = True
                sleep_then_publish_burst(user['offline_events'], publisher, topic_path)
                user['offline_events'] = []

def generate_event(user):
    """
    Moves the user to their next page and returns a dictionary representing
    the resulting event, keyed by the names in log_fields
    :param user: a dictionary representing the user; 'page' is updated in place
    :return: a dictionary representing the event
    """
    user['page'] = get_next_page(user)
    uri = str(user['page'].name)
    # Event time is the current UTC time with microseconds and a trailing "Z".
    timestamp_str = datetime.now(tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
    num_bytes = random.choice(range(min_file_size_bytes, max_file_size_bytes))
    request_line = "\"{} {} HTTP/1.0\"".format(random.choice(verbs), uri)
    status_code = random.choice(responses)

    # Values are listed in the same order as log_fields.
    values = [
        user['ip'],
        user['id'],
        float(user['lat']),
        float(user['lng']),
        timestamp_str,
        request_line,
        status_code,
        num_bytes,
        user['user_agent'],
    ]
    return dict(zip(log_fields, values))

def get_next_page(user):
    """
    Consults the user's representation of the web site taxonomy to determine the next page that they visit
    :param user: a dictionary whose 'page' entry is the node the user is on
    :return: a node chosen uniformly at random from the current page, its
        children (if it is not a leaf) and its parent (if it has one)
    """
    current_page = user['page']
    possible_next_pages = [current_page]
    if not current_page.is_leaf:
        possible_next_pages += list(current_page.children)
    # Identity check: "!= None" would defer to the node's own comparison methods.
    if current_page.parent is not None:
        possible_next_pages += [current_page.parent]
    return random.choice(possible_next_pages)


if __name__ == '__main__':
    users = read_users(users_fp)
    root = extract_resources(taxonomy_fp)

    # One worker process per simulated user.
    processes = [
        Process(target=create_user_process, args=(user, root))
        for user in users
    ]
    for process in processes:
        process.start()

    # Keep the parent process alive indefinitely.
    while True:
        time.sleep(1)

In [None]:
#!/usr/bin/env bash
# generate_streaming_events.sh
# Usage: generate_streaming_events.sh [true]
#   Pass "true" to generate events with lag; any other value (or none) disables lag.
# The shebang is now well-formed and on the first line, where the kernel looks for it.

echo "Installing packages"
# Install modules
sh ./install_packages.sh

echo "Generating synthetic users"
# Generate 10 fake web site users
python3 user_generator.py --n=10

echo "Generating synthetic events"
use_lag=$1

if [ "$use_lag" = true ] ; then
    echo "Using lag"
    python3 streaming_event_generator.py --project_id="$(gcloud config get-value project)" -t=my_topic
else
    echo "Not using lag"
    # -off=1. / -on=0. keep every user online; -l=0 removes the publish delay.
    python3 streaming_event_generator.py --project_id="$(gcloud config get-value project)" -t=my_topic -off=1. -on=0. -l=0
fi

In [None]:
# streaming_minute_traffic_pipeline.py
import argparse
import time
import logging
import json
import typing
from datetime import datetime
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.transforms.combiners import CountCombineFn
from apache_beam.runners import DataflowRunner, DirectRunner

# ### functions and classes

class CommonLog(typing.NamedTuple):
    """One web-server log event, as built by ``parse_json`` from a Pub/Sub message.

    The field names must match the keys of the incoming JSON objects, because
    ``parse_json`` constructs instances with ``CommonLog(**row)``.
    """
    ip: str
    user_id: str
    lat: float
    lng: float
    timestamp: str
    http_request: str
    http_response: int
    num_bytes: int
    user_agent: str

# Register the row type with Beam's RowCoder.
beam.coders.registry.register_coder(CommonLog, beam.coders.RowCoder)

def parse_json(element):
    """Turn one UTF-8 encoded JSON payload (bytes) into a ``CommonLog`` row.

    The decoded object's keys are passed as keyword arguments, so they must
    match the ``CommonLog`` field names exactly.
    """
    text = element.decode('utf-8')
    decoded = json.loads(text)
    return CommonLog(**decoded)

def add_processing_timestamp(element):
    """Convert a row to a dict and record when it was processed.

    :param element: a named-tuple row with a ``timestamp`` field
    :return: the row as a dict in which ``timestamp`` is renamed to
        ``event_timestamp`` and ``processing_timestamp`` holds the current
        UTC time formatted as ``YYYY-MM-DD HH:MM:SS``
    """
    from datetime import timezone

    row = element._asdict()
    row['event_timestamp'] = row.pop('timestamp')
    # Stamp in UTC rather than local time: the string carries no zone, so a
    # local-time value would be misread when set against the UTC event time.
    row['processing_timestamp'] = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    return row

class GetTimestampFn(beam.DoFn):
    """Pair each incoming element with the start time of its window."""

    def process(self, element, window=beam.DoFn.WindowParam):
        """Yield ``{'page_views': element, 'timestamp': <window start, UTC>}``."""
        start_utc = window.start.to_utc_datetime()
        yield {
            'page_views': element,
            'timestamp': start_utc.strftime("%Y-%m-%dT%H:%M:%S"),
        }

# ### main

def run():
    """Build and run the streaming pipeline from Pub/Sub to BigQuery.

    Messages read from ``--input_topic`` are parsed into ``CommonLog`` rows
    and feed two branches:

    * raw branch: each row gets a processing timestamp and is appended to
      ``--raw_table_name``;
    * aggregate branch: rows are grouped into fixed windows of
      ``--window_duration`` seconds, counted per window and appended to
      ``--agg_table_name``.

    The call blocks in ``wait_until_finish()`` after submitting the job.
    """
    # Command line arguments
    parser = argparse.ArgumentParser(description='Load from Json from Pub/Sub into BigQuery')
    parser.add_argument('--project',required=True, help='Specify Google Cloud project')
    parser.add_argument('--region', required=True, help='Specify Google Cloud region')
    parser.add_argument('--staging_location', required=True, help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location', required=True, help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner', required=True, help='Specify Apache Beam Runner')
    parser.add_argument('--input_topic', required=True, help='Input Pub/Sub Topic')
    parser.add_argument('--agg_table_name', required=True, help='BigQuery table name for aggregate results')
    parser.add_argument('--raw_table_name', required=True, help='BigQuery table name for raw inputs')
    # Parsed as an integer number of seconds so it can be used as the window size.
    parser.add_argument('--window_duration', required=True, type=int, help='Window duration in seconds')

    opts = parser.parse_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(save_main_session=True, streaming=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format('streaming-minute-traffic-pipeline-',time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_topic = opts.input_topic
    raw_table_name = opts.raw_table_name
    agg_table_name = opts.agg_table_name
    window_duration = opts.window_duration

    # Table schema for the per-window counts
    agg_table_schema = {
        "fields": [
            {"name": "page_views", "type": "INTEGER"},
            {"name": "timestamp", "type": "STRING"},
        ]
    }

    # Table schema for the raw events (timestamp is split into event/processing)
    raw_table_schema = {
        "fields": [
            {"name": "ip", "type": "STRING"},
            {"name": "user_id", "type": "STRING"},
            {"name": "user_agent", "type": "STRING"},
            {"name": "lat", "type": "FLOAT", "mode": "NULLABLE"},
            {"name": "lng", "type": "FLOAT", "mode": "NULLABLE"},
            {"name": "event_timestamp", "type": "STRING"},
            {"name": "processing_timestamp", "type": "STRING"},
            {"name": "http_request", "type": "STRING"},
            {"name": "http_response", "type": "INTEGER"},
            {"name": "num_bytes", "type": "INTEGER"},
        ]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)

    parsed_msgs = (p | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
                     | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog))

    # Raw branch: keep every event, annotated with its processing time.
    (parsed_msgs
        | "AddProcessingTimestamp" >> beam.Map(add_processing_timestamp)
        | 'WriteRawToBQ' >> beam.io.WriteToBigQuery(
            raw_table_name,
            schema=raw_table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
            )
        )

    # Aggregate branch: the window size now comes from --window_duration.
    # Previously the flag was required but ignored and the size was fixed at 60.
    # The step labels are unchanged so existing job graphs keep their names.
    (parsed_msgs
        | "WindowByMinute" >> beam.WindowInto(beam.window.FixedWindows(window_duration))
        | "CountPerMinute" >> beam.CombineGlobally(CountCombineFn()).without_defaults()
        | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
        | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
            agg_table_name,
            schema=agg_table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
            )
    )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run().wait_until_finish()

if __name__ == '__main__':
    run()

In [None]:
# Environment for the streaming minute-traffic job.
export PROJECT_ID=$(gcloud config get-value project)
export REGION='us-central1'
export BUCKET="gs://${PROJECT_ID}"
export PIPELINE_FOLDER="${BUCKET}"
export RUNNER=DataflowRunner
export PUBSUB_TOPIC="projects/${PROJECT_ID}/topics/my_topic"
export WINDOW_DURATION=60
export AGGREGATE_TABLE_NAME="${PROJECT_ID}:logs.windowed_traffic"
export RAW_TABLE_NAME="${PROJECT_ID}:logs.raw"

# Submit the pipeline.
python3 streaming_minute_traffic_pipeline.py \
    --project="${PROJECT_ID}" \
    --region="${REGION}" \
    --staging_location="${PIPELINE_FOLDER}/staging" \
    --temp_location="${PIPELINE_FOLDER}/temp" \
    --runner="${RUNNER}" \
    --input_topic="${PUBSUB_TOPIC}" \
    --window_duration="${WINDOW_DURATION}" \
    --agg_table_name="${AGGREGATE_TABLE_NAME}" \
    --raw_table_name="${RAW_TABLE_NAME}"

In [None]:
# The gap between event time and processing time.
# Both offsets are in milliseconds, measured from the earliest event in `logs.raw`.
WITH min_millis AS (
  SELECT
    MIN(UNIX_MILLIS(TIMESTAMP(event_timestamp))) AS min_event_millis
  FROM
    `logs.raw`
)
SELECT
  UNIX_MILLIS(TIMESTAMP(event_timestamp)) - min_millis.min_event_millis AS event_millis,
  UNIX_MILLIS(TIMESTAMP(processing_timestamp)) - min_millis.min_event_millis AS processing_millis,
  user_id,
  -- unique label per row so that every point is shown
  CAST(UNIX_MILLIS(TIMESTAMP(event_timestamp)) - min_millis.min_event_millis AS STRING) AS label
FROM
  `logs.raw`
CROSS JOIN
  min_millis
WHERE
  event_timestamp IS NOT NULL
ORDER BY
  event_millis

## Sources and Sinks

### Text IO & File IO

In [None]:
# Text IO reading: the file names are themselves elements of a PCollection
pcoll = (pipeline
    | 'Create' >> Create([file_name])
    | 'ReadAll' >> ReadAllFromText()
)

# Text IO reading: the file name is known when the pipeline is built
pcoll = pipeline | 'Read' >> ReadFromText(file_name)

# File IO reading with filenames
with beam.Pipeline() as p:
    readable_files = (p
        | fileio.MatchFiles('hdfs://path/to/*.txt') # Match file pattern
        | fileio.ReadMatches()
        | beam.Reshuffle()
    )
    file_and_contents = (readable_files
        | beam.Map(lambda x: (x.metadata.path, x.read_utf8())) # Access file metadata
    )

# File IO processing files as they arrive
with beam.Pipeline() as p:
    readable_files = (p
        | beam.io.ReadFromPubSub(...) # Parse PubSub message and yield filename
    )
    files_and_contents = (readable_files
        | ReadAllFromText() # Use the parsed filename to read
    )

# Text IO writing (`known_args` is the parsed command-line namespace)
transformed_data | "write" >> WriteToText(known_args.output, coder=JsonCoder())

# File IO writing with dynamic destinations
pcoll | beam.io.fileio.WriteToFiles(
    path='/path',
    destination=lambda record:
        'avro' if record['type']=='A' else 'csv', # Dynamic destination
    sink=lambda dest:
        AvroSink() if dest=='avro' else CsvSink(), # Write dynamic sink
    file_naming=beam.io.fileio.destination_prefix_naming()
)

### BigQuery IO with BigQuery Storage API

In [None]:
# BigQuery IO reading with query
pcoll = (p
    | 'QueryTableStdSQL' >> beam.io.ReadFromBigQuery(
        query='SELECT max_temperature '\
            'FROM `project.dataset.table`',
        use_standard_sql=True
    ) # Source using query
    | beam.Map(lambda elem: elem['max_temperature']) # Map results: keep only the max_temperature value of each row
)

# BigQuery IO writing with dynamic destinations
def table_fn(element, fictional_characters):
    """Choose the destination table for one quote.

    Returns 'dataset.fictional_quotes' when `element` is contained in
    `fictional_characters`, otherwise 'dataset.real_quotes'.
    """
    is_fictional = element in fictional_characters
    return 'dataset.fictional_quotes' if is_fictional else 'dataset.real_quotes'
    
# `table_fn` is called per element with the materialised side input as its
# second argument and returns the table that element is written to.
quotes | "WriteWithDynamicDestination" >> beam.io.WriteToBigQuery(
    table_fn,
    schema=table_schema, # Schema destination
    # Must be a tuple: without the trailing comma the parentheses are only
    # grouping and the bare view object would be passed instead.
    table_side_inputs=(fictional_characters_view,)
)

### PubSub IO 

In [None]:
# PubSub IO reading
class GroupWindowsIntoBatches(beam.PTransform):
    """Composite transform that assigns its input to fixed windows.

    Args:
        window_size: window length passed straight to `window.FixedWindows`.
    """

    def __init__(self, window_size):
        super().__init__()
        self.window_size = window_size

    def expand(self, pcoll):
        # `expand` is the hook a PTransform implements to describe its sub-graph.
        return (pcoll
            | beam.WindowInto(window.FixedWindows(self.window_size))
        )

# The parentheses let the `|` chain span several lines.
(pipeline
    | "Read PubSub Message" >> beam.io.ReadFromPubSub(topic=input_topic)
    | "Window into" >> GroupWindowsIntoBatches(window_size)
)

### Kafka IO

In [None]:
# Kafka IO reading
# The parentheses let the `|` chain span several lines.
(pipeline
    | ReadFromKafka(
        consumer_config={'bootstrap.servers': bootstrap_servers},
        topics=[topic]  # the keyword is `topics` and takes a list of topic names
    )
)

### Avro IO

In [None]:
# Avro IO reading multiple files: the trailing `*` makes the path a glob
with beam.Pipeline() as p:
    records = (p
        | "Read" >> beam.io.ReadFromAvro('/avrofiles*')
    )

### Splittable DoFn

In [None]:
# Splittable DoFn custom source
class FileToWordsRestrictionProvider(beam.transforms.core.RestrictionProvider):
    """Describes the work for one file as a byte-offset range."""

    def initial_restriction(self, file_name): # Initial restriction
        # The whole file: offsets [0, size of the file in bytes).
        return OffsetRange(0, os.stat(file_name).st_size)

    # Tracking subset of restriction completed
    def create_tracker(self, restriction):
        # The tracker has to be told which range it is tracking.
        return beam.io.restriction_trackers.OffsetRestrictionTracker(restriction)

    def restriction_size(self, file_name, restriction):
        # Amount of work in `restriction`, used by the runner when sizing splits.
        return restriction.size()

class FileToWordsFn(beam.DoFn):
    """Splittable DoFn that emits the words of the file named by each element."""

    def process(self, file_name,
            tracker=beam.DoFn.RestrictionParam(FileToWordsRestrictionProvider())):
        with open(file_name) as file_handle:
            # NOTE(review): illustrative body; assumes offsets from tell() in text
            # mode are comparable to the byte range above. Confirm before reuse.
            file_handle.seek(tracker.current_restriction().start)
            # Only emit output for positions that were successfully claimed.
            while tracker.try_claim(file_handle.tell()):
                yield from file_handle.readline().split()

## Schemas

Express structured data in Beam pipelines.

### Branching pipeline

![Branch_pipeline](./img/branch_pipeline.png)

In [None]:
# my_pipeline.py
import argparse
import time
import logging
import json
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.runners import DataflowRunner, DirectRunner

# ### functions and classes

def parse_json(element):
    """Decode one JSON document (e.g. a line of events.json) into a Python object."""
    decoded = json.loads(element)
    return decoded

def drop_fields(element):
    """Return a copy of `element` without its 'user_agent' entry.

    A new dict is built instead of popping in place, because functions applied
    with `beam.Map` should not modify their input element. Records that have no
    'user_agent' key are passed through unchanged rather than raising KeyError.

    Args:
        element: mapping of field name to value for one log record.

    Returns:
        A new dict with every entry of `element` except 'user_agent'.
    """
    return {key: value for key, value in element.items() if key != 'user_agent'}

# ### main

def run():
    """Build and submit the branching batch pipeline.

    The lines read from --inputPath feed two branches: one writes them
    unchanged to --outputPath, the other parses each line as JSON, drops
    `user_agent`, keeps rows with num_bytes < 120 and writes them to the
    BigQuery table --tableName.

    --stagingLocation and --tempLocation are accepted (optional) because the
    launch command passes them; they are copied onto the Google Cloud options
    when given. Any other unrecognised flag is forwarded to PipelineOptions.
    """
    # Command line arguments
    parser = argparse.ArgumentParser(description='Load from Json into BigQuery')
    parser.add_argument('--project',required=True, help='Specify Google Cloud project')
    parser.add_argument('--region', required=True, help='Specify Google Cloud region')
    parser.add_argument('--stagingLocation', help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--tempLocation', help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner', required=True, help='Specify Apache Beam Runner')
    parser.add_argument('--inputPath', required=True, help='Path to events.json')
    parser.add_argument('--outputPath', required=True, help='Path to coldline storage bucket')
    parser.add_argument('--tableName', required=True, help='BigQuery table name')

    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    # Only override when supplied, so --staging_location / --temp_location
    # passed in Beam's own spelling (parsed from pipeline_opts) still win.
    if opts.stagingLocation:
        options.view_as(GoogleCloudOptions).staging_location = opts.stagingLocation
    if opts.tempLocation:
        options.view_as(GoogleCloudOptions).temp_location = opts.tempLocation
    # time_ns() suffix keeps job names unique across launches.
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format('my-pipeline-',time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.inputPath
    output_path = opts.outputPath
    table_name = opts.tableName

    # Table schema for BigQuery (no user_agent column: that field is dropped below)
    table_schema = {
        "fields": [
            {
                "name": "ip",
                "type": "STRING"
            },
            {
                "name": "user_id",
                "type": "STRING"
            },
            {
                "name": "lat",
                "type": "FLOAT",
                "mode": "NULLABLE"
            },
            {
                "name": "lng",
                "type": "FLOAT",
                "mode": "NULLABLE"
            },
            {
                "name": "timestamp",
                "type": "STRING"
            },
            {
                "name": "http_request",
                "type": "STRING"
            },
            {
                "name": "http_response",
                "type": "INTEGER"
            },
            {
                "name": "num_bytes",
                "type": "INTEGER"
            }
        ]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)

    # Steps:
    # 1) Read something
    # 2) Transform something
    # 3) Write something

    # Read in lines to an initial PCollection that can then be branched off of
    lines = p | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)

    # Write to Google Cloud Storage
    lines | 'WriteRawToGCS' >> beam.io.WriteToText(output_path)

    # Read elements from Json, filter out individual elements, and write to BigQuery
    (lines
        | 'ParseJson' >> beam.Map(parse_json)
        | 'DropFields' >> beam.Map(drop_fields)
        | 'FilterFn' >> beam.Filter(lambda row: row['num_bytes'] < 120)
        | 'WriteToBQ' >> beam.io.WriteToBigQuery(
            table_name,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE
            )
    )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    # The result of run() is not waited on here.
    p.run()

if __name__ == '__main__':
    run()

In [None]:
# Set up environment variables
export PROJECT_ID=$(gcloud config get-value project)
export REGION='us-central1'
export BUCKET=gs://${PROJECT_ID}
export COLDLINE_BUCKET=${BUCKET}-coldline
export PIPELINE_FOLDER=${BUCKET}
export RUNNER=DataflowRunner
export INPUT_PATH=${PIPELINE_FOLDER}/events.json
# Same value as before, but derived from COLDLINE_BUCKET so the two cannot drift.
export OUTPUT_PATH=${COLDLINE_BUCKET}/pipeline_output
export TABLE_NAME=${PROJECT_ID}:logs.logs_filtered
# Quoted, and fails loudly when BASE_DIR is unset: a bare `cd` would silently
# switch to $HOME and the script below would not be found.
cd "${BASE_DIR:?set BASE_DIR to the directory that contains my_pipeline.py}"
python3 my_pipeline.py \
--project=${PROJECT_ID} \
--region=${REGION} \
--stagingLocation=${PIPELINE_FOLDER}/staging \
--tempLocation=${PIPELINE_FOLDER}/temp \
--runner=${RUNNER} \
--inputPath=${INPUT_PATH} \
--outputPath=${OUTPUT_PATH} \
--tableName=${TABLE_NAME}

## State & Timers

State and timers are two powerful `DoFn` features for implementing stateful transformations.

### State API

Stateful `ParDo` introduces a persistent mutable state which is partitioned by key and window.

In [None]:
class StatefulBufferingFn(beam.DoFn):
    """Buffers elements in state and emits them in batches of MAX_BUFFER_SIZE.

    Two state cells are used: a bag holding the buffered elements and a
    combining (summing) counter of how many elements the bag holds.
    """
    MAX_BUFFER_SIZE = 500
    BUFFER_STATE = BagStateSpec('buffer', EventCoder())
    COUNT_STATE = CombiningValueStateSpec('count',
        VarIntCoder(), combiners.SumCombineFn())

    def process(self, element, buffer_state=beam.DoFn.StateParam(BUFFER_STATE),
            count_state=beam.DoFn.StateParam(COUNT_STATE)):
        buffer_state.add(element)
        # Increment count and add element to buffer
        count_state.add(1)
        count = count_state.read()
        # When buffer size limit is reached, a request is sent to the external service.
        # Class attributes are not in scope inside a method, hence `self.`.
        if count >= self.MAX_BUFFER_SIZE:
            for event in buffer_state.read():
                yield event
            count_state.clear()
            buffer_state.clear()

### Timer API

In [None]:
class StatefulBufferingFn(beam.DoFn):
    """Buffers elements in state, emitting on a size limit or on window expiry.

    Same buffering as the state-only version, plus a watermark timer that
    flushes whatever is still buffered once it fires.
    `ALLOWED_LATENESS` is read from the enclosing module and must be defined there.
    """
    MAX_BUFFER_SIZE = 500
    BUFFER_STATE = BagStateSpec('buffer', EventCoder())
    # No trailing comma here: it would turn the spec into a one-element tuple.
    COUNT_STATE = CombiningValueStateSpec('count',
        VarIntCoder(), combiners.SumCombineFn())
    EXPIRY_TIMER = TimerSpec('expiry', TimeDomain.WATERMARK)

    def process(self, element, w=beam.DoFn.WindowParam,
            buffer_state=beam.DoFn.StateParam(BUFFER_STATE),
            count_state=beam.DoFn.StateParam(COUNT_STATE),
            expiry_timer=beam.DoFn.TimerParam(EXPIRY_TIMER)):
        # (Re)arm the timer for the end of the window plus the allowed lateness.
        expiry_timer.set(w.end + ALLOWED_LATENESS)
        buffer_state.add(element)
        # Increment count and add element to buffer
        count_state.add(1)
        count = count_state.read()
        # When buffer size limit is reached, a request is sent to the external service.
        # Class attributes are not in scope inside a method, hence `self.`.
        if count >= self.MAX_BUFFER_SIZE:
            for event in buffer_state.read():
                yield event
            count_state.clear()
            buffer_state.clear()

    # Added an event time timer so that when the window expires,
    # any events remaining in the buffer are processed.
    @on_timer(EXPIRY_TIMER)
    def expiry(self,
            buffer_state=beam.DoFn.StateParam(BUFFER_STATE),
            count_state=beam.DoFn.StateParam(COUNT_STATE)):
        events = buffer_state.read()
        for event in events:
            yield event
        count_state.clear()
        buffer_state.clear()

## Beam SQL & Beam DataFrames

### Beam SQL

In [None]:
# Apply a SqlTransform using the ZetaSQL dialect.
# `query` is the SQL text; the resulting transform is applied to a PCollection with `|`.
SqlTransform(query, dialect='zetasql')

In [None]:
# Dataflow SQL CLI
# Replace the quoted placeholder with the SQL text to run.
# NOTE(review): a real invocation presumably also needs job/region/output flags; confirm against the gcloud reference.
gcloud dataflow sql query """SQL statements"""

In [None]:
# TUMBLE (fixed windows)
# The interval literal is written identically in TUMBLE_START and TUMBLE so
# that both clearly refer to the same 10-second window.
SELECT
  productId,
  COUNT(transactionId) AS num_purchases,
  TUMBLE_START("INTERVAL 10 SECOND") AS period_start
FROM
  pubsub.topic.`instant-insights`.`retaildemo-online-purchase-json` AS pr
GROUP BY
  productId,
  TUMBLE(pr.event_timestamp, "INTERVAL 10 SECOND")

In [None]:
# HOP (sliding windows)
# First literal: how often a window starts; second literal: how long it lasts.
# Both are written identically in HOP_START, HOP_END and HOP.
SELECT
  productId,
  COUNT(transactionId) AS num_purchases,
  HOP_START("INTERVAL 10 SECOND", "INTERVAL 30 SECOND") AS period_start,
  HOP_END("INTERVAL 10 SECOND", "INTERVAL 30 SECOND") AS period_end
FROM
  pubsub.topic.`instant-insights`.`retaildemo-online-purchase-json` AS pr
GROUP BY
  productId,
  HOP(pr.event_timestamp, "INTERVAL 10 SECOND", "INTERVAL 30 SECOND")

In [None]:
# SESSION (session windows)
# The gap literal is written identically in SESSION_START, SESSION_END and SESSION.
SELECT
  userId,
  COUNT(transactionId) AS num_purchases,
  SESSION_START("INTERVAL 10 MINUTE") AS interval_start,
  SESSION_END("INTERVAL 10 MINUTE") AS interval_end
FROM
  pubsub.topic.`instant-insights`.`retaildemo-online-purchase-json` AS pr
GROUP BY
  userId,
  SESSION(pr.event_timestamp, "INTERVAL 10 MINUTE")

In [None]:
# batch_user_traffic_SQL_pipeline.py
import argparse
import time
import logging
import json
import typing
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.transforms.sql import SqlTransform
from apache_beam.runners import DataflowRunner, DirectRunner

# ### functions and classes

class CommonLog (typing.NamedTuple):
    """Typed record for one log event; fields mirror the keys that parse_json unpacks."""
    ip: str
    user_id: str
    lat: float
    lng: float
    timestamp: str  # kept as text; written to a STRING column in the raw table
    http_request: str
    http_response: int
    num_bytes: int
    user_agent: str

# Encode CommonLog elements with RowCoder (schema-aware coder).
beam.coders.registry.register_coder(CommonLog, beam.coders.RowCoder)

def parse_json(element):
    """Turn one JSON-encoded log line into a CommonLog record.

    The decoded object's keys are used as CommonLog field names.
    """
    fields = json.loads(element)
    return CommonLog(**fields)

# ### main

def run():
    """Build and submit the batch user-traffic pipeline.

    Reads JSON lines from --input_path and parses them into CommonLog rows.
    One branch writes every row to --raw_table_name; the other aggregates per
    user_id with a ZetaSQL query and writes the result to --agg_table_name.
    Flags not declared here are forwarded to PipelineOptions.
    """
    # Command line arguments
    parser = argparse.ArgumentParser(description='Load from Json into BigQuery')
    parser.add_argument('--project',required=True, help='Specify Google Cloud project')
    parser.add_argument('--region', required=True, help='Specify Google Cloud region')
    parser.add_argument('--staging_location', required=True, help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location', required=True, help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner', required=True, help='Specify Apache Beam Runner')
    parser.add_argument('--input_path', required=True, help='Path to events.json')
    parser.add_argument('--raw_table_name', required=True, help='BigQuery table for raw data')
    parser.add_argument('--agg_table_name', required=True, help='BigQuery table for aggregated data')

    # Unknown flags (e.g. --experiments) end up in pipeline_opts.
    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts, save_main_session=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    # time_ns() suffix keeps job names unique across launches.
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format('batch-user-traffic-pipeline-sql-'
                                                                   ,time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.input_path
    agg_table_name = opts.agg_table_name
    raw_table_name = opts.raw_table_name

    # Table schema for BigQuery (raw rows: one column per CommonLog field)
    raw_table_schema = {
            "fields": [
                {
                    "name": "ip",
                    "type": "STRING"
                },
                {
                    "name": "user_id",
                    "type": "STRING"
                },
                {
                    "name": "lat",
                    "type": "FLOAT"
                },
                {
                    "name": "lng",
                    "type": "FLOAT"
                },
                {
                    "name": "timestamp",
                    "type": "STRING"
                },
                {
                    "name": "http_request",
                    "type": "STRING"
                },
                {
                    "name": "http_response",
                    "type": "INTEGER"
                },
                {
                    "name": "num_bytes",
                    "type": "INTEGER"
                },
                {
                    "name": "user_agent",
                    "type": "STRING"
                }
            ]
        }


    # Table schema for BigQuery (aggregates: one column per alias in `query` below)
    agg_table_schema = {
        "fields": [

            {
                "name": "user_id",
                "type": "STRING"
            },
            {
                "name": "page_views",
                "type": "INTEGER"
            },
            {
                "name": "total_bytes",
                "type": "INTEGER"
            },
            {
                "name": "max_bytes",
                "type": "INTEGER"
            },
            {
                "name": "min_bytes",
                "type": "INTEGER"
            },
        ]
    }

    # PCOLLECTION refers to the input of the SqlTransform this query is given to.
    query = """
        SELECT user_id,
        COUNT(*) AS page_views, SUM(num_bytes) as total_bytes,
        MAX(num_bytes) AS max_bytes, MIN(num_bytes) as min_bytes
        FROM PCOLLECTION
        GROUP BY user_id
        """

    # Create the pipeline
    p = beam.Pipeline(options=options)

    # Shared input for both branches below.
    logs = (p | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
              | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog))

    # Branch 1: raw rows, converted to dicts before the BigQuery write.
    (logs | 'RawToDict' >> beam.Map(lambda row : row._asdict())
          | 'WriteRawToBQ' >> beam.io.WriteToBigQuery(
           raw_table_name,
           schema=raw_table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE
      ))

    # Branch 2: per-user aggregates computed by the SQL query.
    (logs | 'PerUserAggregations' >> SqlTransform(query, dialect='zetasql')
          | 'AggToDict' >> beam.Map(lambda row : row._asdict())
          | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
            agg_table_name,
            schema=agg_table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE
            )
    )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    # The result of run() is not waited on here.
    p.run()

if __name__ == '__main__':
    run()

In [None]:
# Environment for the batch user-traffic SQL job. All paths hang off the
# project-named bucket; both BigQuery tables live in the `logs` dataset.
export PROJECT_ID="$(gcloud config get-value project)"
export REGION='us-central1'
export BUCKET="gs://${PROJECT_ID}"
export PIPELINE_FOLDER="${BUCKET}"
export RUNNER=DataflowRunner
export INPUT_PATH="${PIPELINE_FOLDER}/events.json"
export TABLE_NAME="${PROJECT_ID}:logs.user_traffic"
export AGGREGATE_TABLE_NAME="${PROJECT_ID}:logs.user_traffic"
export RAW_TABLE_NAME="${PROJECT_ID}:logs.raw"

# Launch the pipeline; --experiments is not declared by the script and is
# forwarded to Beam's own option parser.
python3 batch_user_traffic_SQL_pipeline.py \
    --project="${PROJECT_ID}" \
    --region="${REGION}" \
    --staging_location="${PIPELINE_FOLDER}/staging" \
    --temp_location="${PIPELINE_FOLDER}/temp" \
    --runner="${RUNNER}" \
    --experiments=use_runner_v2 \
    --input_path="${INPUT_PATH}" \
    --agg_table_name="${AGGREGATE_TABLE_NAME}" \
    --raw_table_name="${RAW_TABLE_NAME}"

In [None]:
# batch_minute_traffic_SQL_pipeline.py
import argparse
import time
import logging
import json
import typing
from datetime import datetime
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.transforms.sql import SqlTransform
from apache_beam.runners import DataflowRunner, DirectRunner

# ### functions and classes

class CommonLog(typing.NamedTuple):
    """Typed record for one log event, with the event time stored under `ts`."""
    ip: str
    user_id: str
    lat: float
    lng: float
    ts: str  # parse_json copies the JSON `timestamp` value into this field
    http_request: str
    http_response: int
    num_bytes: int
    user_agent: str

# Encode CommonLog elements with RowCoder (schema-aware coder).
beam.coders.registry.register_coder(CommonLog, beam.coders.RowCoder)

def parse_json(element):
    """Decode one JSON log line into a CommonLog, renaming `timestamp` to `ts`.

    Raises KeyError when the decoded object has no 'timestamp' key.
    """
    fields = json.loads(element)
    # Move the value under its new key in a single step.
    fields['ts'] = fields.pop('timestamp')
    return CommonLog(**fields)

def format_timestamp(element):
    """Return a copy of `element` whose `ts` reads 'YYYY-MM-DD HH:MM:SS'.

    The last 8 characters of the incoming `ts` are cut off before parsing it
    as '%Y-%m-%dT%H:%M:%S'; every other field is carried over unchanged.
    """
    parsed = datetime.strptime(element.ts[:-8], "%Y-%m-%dT%H:%M:%S")
    reformatted = parsed.strftime("%Y-%m-%d %H:%M:%S")
    return CommonLog(**{**element._asdict(), 'ts': reformatted})

def to_dict(row):
    """Project `row` onto the two output columns, as a plain dict."""
    return {column: getattr(row, column) for column in ('page_views', 'start_time')}

# ### main

def run():
    """Build and submit the batch per-minute traffic pipeline.

    Reads JSON lines from --inputPath, parses them into CommonLog rows,
    rewrites `ts` into 'YYYY-MM-DD HH:MM:SS', counts rows per one-minute
    tumbling window with a ZetaSQL query and writes (page_views, start_time)
    to the BigQuery table --tableName.
    Flags not declared here are forwarded to PipelineOptions.
    """
    # Command line arguments
    parser = argparse.ArgumentParser(description='Load from Json into BigQuery')
    parser.add_argument('--project',required=True, help='Specify Google Cloud project')
    parser.add_argument('--region', required=True, help='Specify Google Cloud region')
    parser.add_argument('--stagingLocation', required=True, help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--tempLocation', required=True, help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner', required=True, help='Specify Apache Beam Runner')
    parser.add_argument('--inputPath', required=True, help='Path to events.json')
    parser.add_argument('--tableName', required=True, help='BigQuery table name')

    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts, save_main_session=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(GoogleCloudOptions).staging_location = opts.stagingLocation
    options.view_as(GoogleCloudOptions).temp_location = opts.tempLocation
    # Trailing hyphen separates the prefix from the time_ns() suffix, matching
    # the job names of the other pipelines in this notebook.
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format('batch-minute-traffic-pipeline-sql-'
                                                                   ,time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.inputPath
    table_name = opts.tableName

    # Table schema for BigQuery (one column per alias in `query` below)
    table_schema = {
        "fields": [
            {
                "name": "page_views",
                "type": "INTEGER"
            },
            {
                "name": "start_time",
                "type": "STRING"
            },

        ]
    }

    # PCOLLECTION refers to the input of the SqlTransform this query is given to.
    query = '''
        SELECT
            COUNT(*) AS page_views,
            STRING(window_start) AS start_time
        FROM
            TUMBLE(
                (SELECT TIMESTAMP(ts) AS ts FROM PCOLLECTION),
                DESCRIPTOR(ts),
                'INTERVAL 1 MINUTE')
        GROUP BY window_start
    '''

    # Create the pipeline
    p = beam.Pipeline(options=options)

    (p | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
       | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog)
       | 'FormatTimestamp' >> beam.Map(format_timestamp).with_output_types(CommonLog)
       | "CountPerMinute" >> SqlTransform(query, dialect='zetasql')
       | "ConvertToDict" >> beam.Map(to_dict)
       | 'WriteToBQ' >> beam.io.WriteToBigQuery(
            table_name,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE
            )
    )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    # The result of run() is not waited on here.
    p.run()

if __name__ == '__main__':
    run()

In [None]:
# Environment for the batch per-minute traffic SQL job.
export PROJECT_ID="$(gcloud config get-value project)"
export REGION='us-central1'
export BUCKET="gs://${PROJECT_ID}"
export PIPELINE_FOLDER="${BUCKET}"
export RUNNER=DataflowRunner
export INPUT_PATH="${PIPELINE_FOLDER}/events.json"
export TABLE_NAME="${PROJECT_ID}:logs.minute_traffic"

# Launch the pipeline; --experiments is not declared by the script and is
# forwarded to Beam's own option parser.
python3 batch_minute_traffic_SQL_pipeline.py \
    --project="${PROJECT_ID}" \
    --region="${REGION}" \
    --stagingLocation="${PIPELINE_FOLDER}/staging" \
    --tempLocation="${PIPELINE_FOLDER}/temp" \
    --runner="${RUNNER}" \
    --inputPath="${INPUT_PATH}" \
    --tableName="${TABLE_NAME}" \
    --experiments=use_runner_v2

In [None]:
# streaming_minute_traffic_SQL_pipeline.py
import argparse
import time
import logging
import json
import typing
from datetime import datetime
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.transforms.sql import SqlTransform
from apache_beam.runners import DataflowRunner, DirectRunner

# ### functions and classes

class CommonLog(typing.NamedTuple):
    """Typed record for one streamed log event."""
    ip: str
    user_id: str
    lat: float
    lng: float
    timestamp: str  # value carried in the message payload
    event_timestamp: str  # filled in by GetEventTimestampFn from the element's Beam timestamp
    http_request: str
    http_response: int
    num_bytes: int
    user_agent: str

# Encode CommonLog elements with RowCoder (schema-aware coder).
beam.coders.registry.register_coder(CommonLog, beam.coders.RowCoder)

def parse_json(element):
    """Decode a UTF-8 encoded JSON payload (bytes) into the object it represents."""
    text = element.decode('utf-8')
    return json.loads(text)

class GetEventTimestampFn(beam.DoFn):
    """Attaches the element's Beam timestamp to the row and emits a CommonLog."""

    def process(self, row, timestamp=beam.DoFn.TimestampParam):
        """Yield a CommonLog built from `row` plus an `event_timestamp` field.

        Args:
            row: dict of CommonLog fields (without `event_timestamp`).
            timestamp: the element timestamp supplied by Beam.
        """
        event_ts = timestamp.to_utc_datetime().strftime("%Y-%m-%dT%H:%M:%S")
        # Build a new mapping instead of writing into `row`: a DoFn should not
        # modify the element it was handed.
        yield CommonLog(**{**row, 'event_timestamp': event_ts})

class ParseAndGetEventTimestamp(beam.PTransform):
    """Composite step: decode each message, then stamp it with its event time."""

    def expand(self, pcoll):
        parsed = pcoll | 'ParseJson' >> beam.Map(parse_json)
        return parsed | 'GetEventTimestamp' >> beam.ParDo(GetEventTimestampFn())

def to_dict(row):
    """Project `row` onto the two output columns, as a plain dict."""
    return {column: getattr(row, column) for column in ('page_views', 'start_time')}

# ### main

def run():
    """Build and run the streaming per-minute traffic pipeline.

    Reads messages from the Pub/Sub topic --input_topic, parses them and
    attaches their event timestamp, counts rows per one-minute tumbling window
    with a ZetaSQL query and appends (page_views, start_time) to the BigQuery
    table --table_name. Blocks until the pipeline finishes.
    Flags not declared here are forwarded to PipelineOptions.
    """
    # Command line arguments
    parser = argparse.ArgumentParser(description='Load from Json from Pub/Sub into BigQuery')
    parser.add_argument('--project',required=True, help='Specify Google Cloud project')
    parser.add_argument('--region', required=True, help='Specify Google Cloud region')
    parser.add_argument('--staging_location', required=True, help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location', required=True, help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner', required=True, help='Specify Apache Beam Runner')
    parser.add_argument('--input_topic', required=True, help='Input Pub/Sub Topic')
    parser.add_argument('--table_name', required=True, help='BigQuery table name for aggregate results')


    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options (streaming mode is switched on here)
    options = PipelineOptions(pipeline_opts, save_main_session=True, streaming=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    # time_ns() suffix keeps job names unique across launches.
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format('streaming-minute-traffic-sql-pipeline-',time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_topic = opts.input_topic
    table_name = opts.table_name

    # Table schema for BigQuery (one column per alias in `query` below)
    table_schema = {
        "fields": [
            {
                "name": "page_views",
                "type": "INTEGER"
            },
            {
                "name": "start_time",
                "type": "STRING"
            },

        ]
    }

    # PCOLLECTION refers to the input of the SqlTransform this query is given to.
    # Windows are computed from event_timestamp, the field GetEventTimestampFn adds.
    query = '''
        SELECT
            COUNT(*) AS page_views,
            STRING(window_start) AS start_time
        FROM
            TUMBLE(
                (SELECT TIMESTAMP(event_timestamp) AS ts FROM PCOLLECTION),
                DESCRIPTOR(ts),
                'INTERVAL 1 MINUTE')
        GROUP BY window_start
    '''

    # Create the pipeline
    p = beam.Pipeline(options=options)

    (p | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
       | 'ParseAndGetEventTimestamp' >> ParseAndGetEventTimestamp().with_output_types(CommonLog)
       | "CountPerMinute" >> SqlTransform(query, dialect='zetasql')
       | "ConvertToDict" >> beam.Map(to_dict)
       | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
            table_name,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
            )
    )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    # Unlike the batch scripts, this call waits for the pipeline result.
    p.run().wait_until_finish()

if __name__ == '__main__':
    run()

In [None]:
# Environment for the streaming per-minute traffic SQL job.
export PROJECT_ID="$(gcloud config get-value project)"
export REGION='us-central1'
export BUCKET="gs://${PROJECT_ID}"
export PIPELINE_FOLDER="${BUCKET}"
export RUNNER=DataflowRunner
export PUBSUB_TOPIC="projects/${PROJECT_ID}/topics/my_topic"
export TABLE_NAME="${PROJECT_ID}:logs.minute_traffic"

# Launch the pipeline; --experiments is not declared by the script and is
# forwarded to Beam's own option parser.
python3 streaming_minute_traffic_SQL_pipeline.py \
    --project="${PROJECT_ID}" \
    --region="${REGION}" \
    --staging_location="${PIPELINE_FOLDER}/staging" \
    --temp_location="${PIPELINE_FOLDER}/temp" \
    --runner="${RUNNER}" \
    --input_topic="${PUBSUB_TOPIC}" \
    --table_name="${TABLE_NAME}" \
    --experiments=use_runner_v2

### Beam DataFrames

- A more Pythonic expressive API compatible with Pandas DataFrames.

In [None]:
# Using DataframeTransform
def my_function(df):
    """Add C = A + 2*B, sum per value of C, and keep the groups whose summed A is negative.

    Args:
        df: DataFrame with numeric columns `A` and `B`; column `C` is added to it.

    Returns:
        The per-`C` sums, restricted to rows where `A < 0`.
    """
    df['C'] = df.A + 2*df.B
    # `query` filters rows by an expression; `filter` would select labels instead.
    result = df.groupby('C').sum().query('A < 0')
    return result

# `input` stands for an existing PCollection here (note that the name shadows the builtin).
output = input | DataframeTransform(my_function)

In [None]:
# Dataframe/PCollection conversion
# `pc` stands for an existing PCollection; it is not defined in this snippet.
with beam.Pipeline() as p:
    df = to_dataframe(pc)  # PCollection -> DataFrame
    pc = to_pcollection(df)  # DataFrame -> PCollection

In [None]:
# Count words
words = (pipeline
    | "Split" >> beam.FlatMap(lambda line: re.findall(r'[\w]+', line))
        .with_output_types(str)
    # Map to Row objects to generate a schema suitable for conversion to a dataframe.
    | "ToRows" >> beam.Map(lambda word: beam.Row(word=word))
)

# Every row contributes a count of 1; summing per word gives the word count.
df = to_dataframe(words)
df['count'] = 1
counted = df.groupby('word').sum()
counted.to_csv(known_args.output)

# Deferred DataFrames can also be converted back to schema PCollections
counted_pc = to_pcollection(counted, include_indexes=True)

# Print out every word that occurred > 50 times
_ = (counted_pc
    | beam.Filter(lambda row: row.count > 50)
    | beam.Map(lambda row: f'{row.word}: {row.count}')
    | beam.Map(print)
)

## Beam Notebooks

Import the `interactive_runner` and `interactive_beam` modules in the notebook.

In [None]:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

In [None]:
# Set the recording duration to 10 min (given as the string '10m')
ib.options.recording_duration = '10m'

# Set the recording size limit to 1 GB (1e9, i.e. 10**9)
ib.options.recording_size_limit = 1e9

In [None]:
# Read messages from the Pub/Sub topic.
words = p | "read" >> beam.io.ReadFromPubSub(topic=topic)

# Assign the messages to fixed windows of size 10.
windowed_words = (words | "window" >> beam.WindowInto(beam.window.FixedWindows(10)))

# Count occurrences of each distinct element per window. The name matches the
# one the following cell passes to ib.show / ib.collect.
windowed_word_counts = (windowed_words | "count" >> beam.combiners.Count.PerElement())

In [None]:
# Materializes the resulting PCollection in a table
# (include_window_info=True adds the window information to the output)
ib.show(windowed_word_counts, include_window_info=True)

# Load the output in a Pandas DataFrame
ib.collect(windowed_word_counts, include_window_info=True)

# Visualize the data in the Notebook
ib.show(windowed_word_counts, include_window_info=True, visualize_data=True)

In [None]:
# Import the production Dataflow runner and the options module used below
from apache_beam.options import pipeline_options
from apache_beam.runners import DataflowRunner

# Set up Apache Beam pipeline options
options = pipeline_options.PipelineOptions()

# Run the pipeline `p` that was built interactively on Dataflow
runner = DataflowRunner()
runner.run_pipeline(p, options=options)

In [None]:
# Each multi-line `|` chain is wrapped in parentheses so it parses, and the two
# reads carry explicit labels because a pipeline cannot apply two transforms
# with the same label.
storeSales = (p
    | "ReadStoreSales" >> beam.io.ReadFromText("purchases-store")
    | "ParseStoreSales" >> beam.Map(lambda s: ...)
)

onlineSales = (p
    | "ReadOnlineSales" >> beam.io.ReadFromText("purchase-online")
    | "ParseOnlineSales" >> beam.Map(lambda s: ...)
)

# Merge both sources, count per key, then keep the 10 largest counts.
topSales = ((storeSales, onlineSales)
    | beam.Flatten()
    | beam.combiners.Count.PerKey()
    | beam.combiners.Top.Of(10, key=lambda x: x[1])
)

# The argument is the destination table, not the PCollection being written.
# NOTE(review): 'project:dataset.top_sales' is a placeholder; substitute the real table.
topSales | beam.io.WriteToBigQuery('project:dataset.top_sales')

## Best Practices

In [None]:
# Schemas
class Purchase(typing.NamedTuple):
    """Schema for one purchase, including nested and repeated fields."""
    user_id: str # The id of the user who made the purchase.
    item_id: int # The identifier of the item that was purchased.
    shipping_address: ShippingAddress # The shipping address, a nested type.
    cost_cents: int # The cost of the item
    transactions: typing.Sequence[Transaction] # A sequence of nested Transaction records (presumably those belonging to this purchase; confirm).

In [None]:
# DoFn for micro batching
class MyDoFn(beam.DoFn):
    """Skeleton DoFn listing the lifecycle hooks available for micro batching.

    Every hook is intentionally empty; fill in the ones the batching logic needs.
    """

    def setup(self):
        """Hook run when the DoFn instance is initialised (e.g. open clients)."""

    def start_bundle(self):
        """Hook run before a bundle of elements is processed (e.g. reset a buffer)."""

    def process(self, element):
        """Hook run for each element (e.g. append the element to the buffer)."""

    def finish_bundle(self):
        """Hook run after a bundle has been processed (e.g. flush the buffer)."""

    def teardown(self):
        """Hook run when the DoFn instance is discarded (e.g. close clients)."""

### Dealing with late data


**Allowed lateness**

`Allowed lateness` controls how long a window should retain its state; once the watermark reaches the end of the allowed lateness period, all state is dropped. A clean and concise way of doing this is by defining a horizon on the allowed lateness within the system, i.e. placing a bound on how late any given record may be (relative to the watermark) for the system to bother processing it; any data that arrives after this horizon is simply dropped.

**Triggers**

`Triggers` determine at what point during processing time results will be materialized. Triggers fire panes when the trigger’s conditions are met. Set the trigger(s) for a PCollection by setting the `trigger` keyword argument of `WindowInto` PTransform:
- `AfterWatermark` for firing when the watermark passes a timestamp determined from either the end of the window or the arrival of the first element in a pane.
- `AfterProcessingTime` for firing after some amount of processing time has elapsed (typically since the first element in a pane).
- `AfterCount` for firing when the number of elements in the window reaches a certain count.

### Dealing with malformed data

Produce a branching pipeline so that a single transform emits multiple outputs while processing the input `PCollection` only once. Use the `TaggedOutput` class to tag each output of a `DoFn` that produces multiple (possibly heterogeneous) outputs.

In [None]:
# streaming_minute_traffic_pipeline.py
import argparse
import time
import logging
import json
import typing
from datetime import datetime
import apache_beam as beam
from apache_beam.io import fileio
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.transforms.trigger import AfterWatermark, AfterCount, AfterProcessingTime
from apache_beam.transforms.trigger import AccumulationMode
from apache_beam.transforms.combiners import CountCombineFn
from apache_beam.runners import DataflowRunner, DirectRunner

# ### functions and classes

class CommonLog(typing.NamedTuple):
    """Typed record built from one parsed JSON message.

    ConvertToCommonLogFn constructs it with ``CommonLog(**row)``, so the JSON
    keys must match these field names exactly.
    """
    ip: str
    user_id: str
    lat: float
    lng: float
    timestamp: str
    http_request: str
    http_response: int
    num_bytes: int
    user_agent: str

beam.coders.registry.register_coder(CommonLog, beam.coders.RowCoder)

class ConvertToCommonLogFn(beam.DoFn):
    """Parse raw message bytes into CommonLog rows, routing failures to a dead-letter output.

    Outputs:
        'parsed_row':   a CommonLog built from the decoded JSON object.
        'unparsed_row': the message text of anything that could not be decoded,
                        parsed as JSON, or mapped onto CommonLog's fields.
    """

    def process(self, element):
        # Only the parsing lives inside the try block. Yielding inside a bare
        # `except:` would also intercept GeneratorExit when the generator is
        # closed early, so the yields are kept outside the guarded region.
        try:
            record = CommonLog(**json.loads(element.decode('utf-8')))
        except Exception:
            # The payload may be the very thing that failed to decode, so decode
            # leniently here instead of raising a second time in the handler.
            yield beam.pvalue.TaggedOutput(
                'unparsed_row', element.decode('utf-8', errors='replace'))
        else:
            yield beam.pvalue.TaggedOutput('parsed_row', record)


class GetTimestampFn(beam.DoFn):
    """Turn a per-window count into a row dict stamped with the window start."""

    def process(self, element, window=beam.DoFn.WindowParam):
        # The window start is rendered as a UTC string without timezone suffix.
        start_utc = window.start.to_utc_datetime()
        yield {
            'page_views': element,
            'timestamp': start_utc.strftime("%Y-%m-%dT%H:%M:%S"),
        }

# ### main

def run():
    """Parse the command line, then build and run the streaming minute-traffic pipeline.

    The pipeline reads JSON messages from Pub/Sub and splits them into parsed
    and unparsed rows. Parsed rows are counted per fixed window and appended to
    BigQuery; unparsed rows are written to ``<dead_letter_bucket>/deadletter/``.
    Flags not declared below are forwarded to Beam as pipeline options.
    The call blocks until the job finishes.
    """
    # Function-scope import: the file's import block only brings in the
    # non-repeating trigger classes.
    from apache_beam.transforms.trigger import Repeatedly

    # Configure logging before building so construction-time messages are visible.
    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    # Command line arguments
    parser = argparse.ArgumentParser(description='Load from Json from Pub/Sub into BigQuery')

    # Google Cloud options
    parser.add_argument('--project',required=True, help='Specify Google Cloud project')
    parser.add_argument('--region', required=True, help='Specify Google Cloud region')
    parser.add_argument('--staging_location', required=True, help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location', required=True, help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner', required=True, help='Specify Apache Beam Runner')

    # Pipeline-specific options. The two numeric options are parsed as integers
    # so a malformed value is rejected by argparse instead of failing later.
    parser.add_argument('--window_duration', required=True, type=int, help='Window duration in seconds')
    parser.add_argument('--table_name', required=True, help='Output BQ table')
    parser.add_argument('--input_topic', required=True, help='Input Pub/Sub topic')
    parser.add_argument('--allowed_lateness', required=True, type=int, help='Allowed lateness')
    parser.add_argument('--dead_letter_bucket', required=True, help='GCS Bucket for unparsable Pub/Sub messages')

    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts, save_main_session=True, streaming=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    # A nanosecond timestamp keeps the job name unique between launches.
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format('streaming-minute-traffic-pipeline-',time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_topic = opts.input_topic
    table_name = opts.table_name
    window_duration = opts.window_duration
    allowed_lateness = opts.allowed_lateness

    # Tolerate a trailing slash on the bucket so the path never contains '//'.
    output_path = opts.dead_letter_bucket.rstrip('/') + '/deadletter/'

    # Table schema for BigQuery
    table_schema = {
        "fields": [
            {
                "name": "page_views",
                "type": "INTEGER"
            },
            {
                "name": "timestamp",
                "type": "STRING"
            },

        ]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)

    rows = (p | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
              | 'ParseJson' >> beam.ParDo(ConvertToCommonLogFn())
                    .with_outputs('parsed_row', 'unparsed_row')
                    .with_output_types(CommonLog))

    (rows.unparsed_row
        # The trigger is wrapped in Repeatedly: a bare AfterProcessingTime
        # finishes after its first firing, and unparsed rows arriving later in
        # the same window would be silently dropped.
        | 'FireEvery120s' >> beam.WindowInto(beam.window.FixedWindows(120),
            trigger=Repeatedly(AfterProcessingTime(120)),
            accumulation_mode=AccumulationMode.DISCARDING)
        # Dead-letter storage
        | 'WriteUnparsedToGCS' >> fileio.WriteToFiles(output_path,
            shards=1,
            max_writers_per_bundle=0)
    )

    (rows.parsed_row
        # One on-time pane at the watermark, then a new accumulated pane for
        # every late element that arrives within the allowed lateness.
        | "WindowByMinute" >> beam.WindowInto(beam.window.FixedWindows(window_duration),
            trigger=AfterWatermark(late=AfterCount(1)),
            allowed_lateness=allowed_lateness,
            accumulation_mode=AccumulationMode.ACCUMULATING)
        | "CountPerMinute" >> beam.CombineGlobally(CountCombineFn()).without_defaults()
        | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
        | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
            table_name,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
            )
    )

    p.run().wait_until_finish()

if __name__ == '__main__':
    run()

In [None]:
# Job configuration: project, region and the bucket used for staging/temp files
export PROJECT_ID=$(gcloud config get-value project)
export REGION='us-central1'
export BUCKET=gs://${PROJECT_ID}
export PIPELINE_FOLDER=${BUCKET}
export RUNNER=DataflowRunner
# Pipeline parameters: input topic, window duration and allowed lateness,
# output BigQuery table and the bucket that receives unparsable messages
export PUBSUB_TOPIC=projects/${PROJECT_ID}/topics/my_topic
export WINDOW_DURATION=60
export ALLOWED_LATENESS=1
export OUTPUT_TABLE_NAME=${PROJECT_ID}:logs.minute_traffic
export DEADLETTER_BUCKET=${BUCKET}
# NOTE(review): BASE_DIR is not set in this cell — presumably exported earlier; confirm.
cd $BASE_DIR
# Launch the pipeline. --allow_unsafe_triggers is not declared by the script's
# argparse parser, so it is forwarded to Beam with the other pipeline options.
python3 streaming_minute_traffic_pipeline.py \
--project=${PROJECT_ID} \
--region=${REGION} \
--staging_location=${PIPELINE_FOLDER}/staging \
--temp_location=${PIPELINE_FOLDER}/temp \
--runner=${RUNNER} \
--input_topic=${PUBSUB_TOPIC} \
--window_duration=${WINDOW_DURATION} \
--allowed_lateness=${ALLOWED_LATENESS} \
--table_name=${OUTPUT_TABLE_NAME} \
--dead_letter_bucket=${DEADLETTER_BUCKET} \
--allow_unsafe_triggers

In [None]:
# Inspect the dead-letter output: list the files under gs://<project>/deadletter
# and print their contents
export PROJECT_ID=$(gcloud config get-value project)
export REGION='us-central1'
export BUCKET=gs://${PROJECT_ID}/deadletter
gsutil ls $BUCKET
gsutil cat $BUCKET/*

## Troubleshooting & Debug

### Adding exception handlers

In [None]:
class FilterMessagesFn(beam.DoFn):
    """Route each message to a 'good_message' or 'bad_message' tagged output.

    A message is bad when its decoded text contains the substring 'bad', or
    when it cannot be decoded at all. The original (undecoded) element is what
    gets emitted on either output.
    """
    BAD_MESSAGE_TAG = 'bad_message'
    GOOD_MESSAGE_TAG = 'good_message'

    def process(self, element, window=beam.DoFn.WindowParam):
        try:
            data = element.decode()
        # handle any exceptions in the processing
        except Exception as exp:
            # getLogger must be *called* to obtain a logger; the attribute
            # lookup `logging.getLogger.warning` raises AttributeError.
            logging.getLogger().warning(exp)
            yield beam.pvalue.TaggedOutput(self.BAD_MESSAGE_TAG, element)
            return

        # tag the element accordingly
        tag = self.BAD_MESSAGE_TAG if 'bad' in data else self.GOOD_MESSAGE_TAG
        # Qualified as beam.pvalue so the class only needs the `beam` name
        # that its base class already requires.
        yield beam.pvalue.TaggedOutput(tag, element)

In [None]:
# my_pipeline.py
import argparse
import logging
import argparse, logging, os
import apache_beam as beam
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions

class ReadGBK(beam.DoFn):
    """Log and re-emit every value of a grouped (key, values) element."""

    def process(self, e):
        # The key is only needed for unpacking; the values are what get emitted.
        _key, grouped_values = e
        for value in grouped_values:
            logging.info(f"the element is {value}")
            yield value
            
def run(argv=None):
    """Build and submit the BigQuery-to-text pipeline.

    Args:
        argv: Command-line arguments to parse. ``--output`` is consumed here;
            every other flag is handed to Beam as a pipeline option.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--output', dest='output', help='Output file to write results to.')
    cli_args, beam_args = arg_parser.parse_known_args(argv)

    # Two slices of the public bitcoin table: 1,000,000 rows with version 1 and
    # 1,000 rows with version 2, so one key carries almost all of the data.
    read_query = """(
                 SELECT
                   version,
                   block_hash,
                   block_number
                 FROM
                   `bigquery-public-data.crypto_bitcoin.transactions`
                 WHERE
                   version = 1
                 LIMIT
                   1000000 )
               UNION ALL (
                 SELECT
                   version,
                   block_hash,
                   block_number
                 FROM
                   `bigquery-public-data.crypto_bitcoin.transactions`
                 WHERE
                   version = 2
                 LIMIT
                   1000 ) ;"""

    pipeline = beam.Pipeline(options=PipelineOptions(beam_args))

    rows = pipeline | 'Read from BigQuery' >> beam.io.ReadFromBigQuery(
        query=read_query, use_standard_sql=True)
    # Key every row by its version, then group all rows sharing a key.
    keyed_rows = rows | "Add Hotkey" >> beam.Map(lambda elem: (elem["version"], elem))
    grouped_rows = keyed_rows | "Groupby" >> beam.GroupByKey()
    flattened = grouped_rows | 'Print' >>  beam.ParDo(ReadGBK())
    flattened | 'Sink' >>  WriteToText(cli_args.output)

    # Submit the pipeline; the call does not wait for the job to finish.
    pipeline.run()

if __name__ == '__main__':
    # setLevel() returns None, so its result is not bound to a name.
    logging.getLogger().setLevel(logging.INFO)
    run()

In [None]:
# Create a storage bucket
export PROJECT_ID=$(gcloud config get-value project)
gsutil mb -l US gs://$PROJECT_ID

# Launch the pipeline.
# The Beam Python SDK spells the temp-location option --temp_location;
# --tempLocation is the Java spelling.
python3 my_pipeline.py \
  --project=${PROJECT_ID} \
  --region=us-central1 \
  --output=gs://$PROJECT_ID/results/prefix \
  --temp_location=gs://$PROJECT_ID/temp/ \
  --max_num_workers=5 \
  --runner=DataflowRunner

## Performance

### Graph Optimization

In [None]:
# Reshuffle after ParDo
_ = pcoll | beam.Reshuffle()

# Side input
_ = pcoll | beam.FlatMap(cross_join, rights=beam.pvalue.AsIter(side_input))

### Disaster Recovery

In [None]:
# Make a snapshot of a subscription
gcloud pubsub snapshots create my-snapshot \
--subscription=my-sub

# Stop and drain Dataflow job
gcloud dataflow jobs drain [job-id]

# Seek subscription to the snapshot
gcloud pubsub subscriptions seek my-sub --snapshot=my-snapshot

# Resubmit the pipeline
gcloud dataflow jobs run my-job-name \
--gcs_location=my_gcs_bucket

## CI/CD Testing

This section introduces the frameworks and features available to streamline the CI/CD workflow for Dataflow pipelines.

### Unit Testing

Performing unit tests for DoFns and PTransforms.
`TestPipeline` is a special class included in the Beam SDK specifically for testing transforms and pipeline logic. Use the `assert_that` method to check that the output PCollection matches the expected output, and `equal_to` to verify that the output PCollection has the correct elements.

In [None]:
# Python PAssert
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to

# Python TestPipeline
from apache_beam.testing.test_pipeline import TestPipeline

# TestPipeline must be instantiated; leaving the `with` block runs the pipeline
# and evaluates the assertion.
with TestPipeline() as p:
    INPUTS = [fake_input_1, fake_input_2]
    # Append the transforms to be tested after Create, e.g. `| MyTransform()`.
    test_output = p | beam.Create(INPUTS)
    # Check whether a PCollection contains some elements in any order.
    # assert_that is called with the PCollection; it is not piped like a transform.
    assert_that(test_output, equal_to(EXPECTED_OUTPUTS))

### In-flight Actions

Update

In [None]:
# Replace a running job in place: --update together with the prior job's name.
# --transform_name_mapping takes a single '=' followed by a JSON object that
# maps old transform names to new ones.
python -m apache_beam.examples.wordcount \
--project $PROJECT \
--staging_location=gs://$BUCKET/tmp/ \
--input=gs://dataflow-samples/shakespeare/kinglear.txt \
--output=gs://$BUCKET/results/outputs \
--runner=DataflowRunner \
--update \
--job_name=[prior job name] \
--transform_name_mapping='{"oldTransform1":"newTransform1", ...}' \
--region=$REGION

### Grant the `dataflow.worker` role to the Compute Engine default service account

In [None]:
# Resolve the active project and its numeric project number.
# The trailing backslash keeps --format inside the same gcloud command;
# without it the shell runs "--format=..." as a separate command.
PROJECT_ID=$(gcloud config get-value project)
export PROJECT_NUMBER=$(gcloud projects list --filter="$PROJECT_ID" \
    --format="value(PROJECT_NUMBER)")
# The Compute Engine default service account is named after the project number
export serviceAccount="${PROJECT_NUMBER}-compute@developer.gserviceaccount.com"
gcloud projects add-iam-policy-binding $PROJECT_ID \
--member="serviceAccount:${serviceAccount}" \
--role="roles/dataflow.worker"

# Create the project bucket and upload the test output
export PROJECT_ID=$(gcloud config get-value project)
gsutil mb -l US gs://$PROJECT_ID
gsutil cp testing.out gs://$PROJECT_ID/8a_Batch_Testing_Pipeline/

### Performing unit tests for DoFns and PTransforms for a batch pipeline

- Create a `TestPipeline`.
- Create some test input data and use the `Create` transform to create a `PCollection` of your input data.
- Apply your transform to the input `PCollection` and save the resulting `PCollection`.
- Use the `assert_that` method from the `testing.util` module and its other methods to verify that the output `PCollection` contains the elements that you expect.

In [None]:
# weather_statistics_pipeline.py
import json
import typing
import logging
import apache_beam as beam

class WeatherRecord(typing.NamedTuple):
    """One weather observation, with fields in the same order as the CSV columns.

    ConvertCsvToWeatherRecord fills it from a comma-separated line and
    ConvertTempUnits rewrites `low_temp` / `high_temp` via ``x * 1.8 + 32.0``.
    """
    loc_id: str
    lat: float
    lng: float
    date: str
    low_temp: float
    high_temp: float
    precip: float

beam.coders.registry.register_coder(WeatherRecord, beam.coders.RowCoder)

class ConvertCsvToWeatherRecord(beam.DoFn):
    """Parse one comma-separated line into a WeatherRecord."""

    def process(self, line):
        column_names = 'loc_id,lat,lng,date,low_temp,high_temp,precip'.split(',')
        numeric_columns = ('lat', 'lng', 'low_temp', 'high_temp', 'precip')

        # Pair each column name with the text found at the same position.
        record = dict(zip(column_names, line.split(',')))
        # Numeric columns arrive as text and are converted to float.
        record.update({name: float(record[name]) for name in numeric_columns})

        yield WeatherRecord(**record)

class ConvertTempUnits(beam.DoFn):
    """Re-emit a record with both temperature fields mapped through x * 1.8 + 32.0."""

    def process(self, row):
        temperature_fields = ('low_temp', 'high_temp')
        converted = {
            name: (value * 1.8 + 32.0 if name in temperature_fields else value)
            for name, value in row._asdict().items()
        }
        yield WeatherRecord(**converted)

class ConvertToJson(beam.DoFn):
    """Serialise a row (anything exposing ``_asdict()``) to a JSON string."""

    def process(self, row):
        yield json.dumps(row._asdict())

class ComputeStatistics(beam.PTransform):
    """Aggregate weather records per `loc_id` and emit each result as JSON.

    Per location it keeps the minimum `low_temp` (`record_low`), the maximum
    `high_temp` (`record_high`) and the sum of `precip` (`total_precip`).
    """

    def expand(self, pcoll):
        per_location_stats = (
            beam.GroupBy('loc_id')
            .aggregate_field('low_temp', min, 'record_low')
            .aggregate_field('high_temp', max, 'record_high')
            .aggregate_field('precip', sum, 'total_precip')
        )

        aggregated = pcoll | 'ComputeStatistics' >> per_location_stats
        return aggregated | 'ToJson' >> beam.ParDo(ConvertToJson())

class WeatherStats(beam.PTransform):
    """End-to-end transform: CSV lines in, per-location JSON statistics out."""

    def expand(self, pcoll):
        records = pcoll | "ParseCSV" >> beam.ParDo(ConvertCsvToWeatherRecord())
        converted = records | "ConvertToF" >> beam.ParDo(ConvertTempUnits())
        return converted | "ComputeStats" >> ComputeStatistics()

def run():
    """Read ./weather_data.csv, compute the statistics and write ./weather_stats*.json."""
    pipeline = beam.Pipeline()

    csv_lines = pipeline | 'ReadCSV' >> beam.io.ReadFromText('./weather_data.csv')
    stats_json = csv_lines | 'ComputeStatistics' >> WeatherStats()
    stats_json | 'WriteJson' >> beam.io.WriteToText('./weather_stats', '.json')

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    pipeline.run()

if __name__ == '__main__':
    run()

In [None]:
# weather_statistics_pipeline_test.py
import logging
import json
import unittest
import sys

from weather_statistics_pipeline import *
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import BeamAssertException
from apache_beam.testing.util import assert_that, equal_to

def main(out = sys.stderr, verbosity = 2):
    """Run every unittest test case defined in this module.

    Args:
        out: Text stream that receives the test report.
        verbosity: Verbosity level handed to ``unittest.TextTestRunner``.
    """
    this_module = sys.modules[__name__]
    suite = unittest.TestLoader().loadTestsFromModule(this_module)

    runner = unittest.TextTestRunner(out, verbosity = verbosity)
    runner.run(suite)


class ConvertToWeatherRecordTest(unittest.TestCase):
    """Checks that one CSV line is parsed into the matching WeatherRecord."""

    def test_convert_to_csv(self):
        csv_lines = ['x,0.0,0.0,2/2/2021,1.0,2.0,0.1']
        expected_records = [WeatherRecord('x', 0.0, 0.0, '2/2/2021', 1.0, 2.0, 0.1)]

        # Leaving the `with` block runs the pipeline and evaluates the assertion.
        with TestPipeline() as p:
            parsed = (p | beam.Create(csv_lines)
                        | beam.ParDo(ConvertCsvToWeatherRecord()))

            assert_that(parsed, equal_to(expected_records))

class ConvertTempUnitsTest(unittest.TestCase):
    """Checks that only the two temperature fields are converted."""

    def test_convert_temp_units(self):
        source_records = [
            WeatherRecord('x', 0.0, 0.0, '2/2/2021', 1.0, 2.0, 0.1),
            WeatherRecord('y', 0.0, 0.0, '2/2/2021', -3.0, -1.0, 0.3),
        ]
        # Same records with low/high temperature mapped through x * 1.8 + 32.0.
        expected_records = [
            WeatherRecord('x', 0.0, 0.0, '2/2/2021', 33.8, 35.6, 0.1),
            WeatherRecord('y', 0.0, 0.0, '2/2/2021', 26.6, 30.2, 0.3),
        ]

        with TestPipeline() as p:
            converted = (p | beam.Create(source_records)
                           | beam.ParDo(ConvertTempUnits()))

            assert_that(converted, equal_to(expected_records))

class ComputeStatsTest(unittest.TestCase):
    """Checks the per-location aggregates produced by ComputeStatistics."""

    def test_compute_statistics(self):
        source_records = [
            WeatherRecord('x', 0.0, 0.0, '2/2/2021', 33.8, 35.6, 0.1),
            WeatherRecord('x', 0.0, 0.0, '2/3/2021', 41.6, 65.3, 0.2),
            WeatherRecord('x', 0.0, 0.0, '2/4/2021', 45.3, 52.6, 0.2),
            WeatherRecord('y', 0.0, 0.0, '2/2/2021', 12.8, 23.6, 0.1),
            WeatherRecord('y', 0.0, 0.0, '2/3/2021', 26.6, 30.2, 0.3),
        ]
        # One JSON string per location: lowest low, highest high, summed precip.
        expected_stats = [
            json.dumps({'loc_id': 'x', 'record_low': 33.8, 'record_high': 65.3, 'total_precip': 0.5 }),
            json.dumps({'loc_id': 'y', 'record_low': 12.8, 'record_high': 30.2, 'total_precip': 0.4 }),
        ]

        with TestPipeline() as p:
            stats = p | beam.Create(source_records) | ComputeStatistics()

            assert_that(stats, equal_to(expected_stats))

class WeatherStatsTransformTest(unittest.TestCase):
    """Checks the composite WeatherStats transform from CSV text to JSON statistics."""

    def test_weather_stats_transform(self):
        csv_lines = [
            "x,31.4,-39.2,2/2/21,4.0,7.5,0.1",
            "x,31.4,-39.2,2/2/21,3.5,6.0,0.3",
            "y,33.4,-49.2,2/2/21,12.5,17.5,0.5",
        ]
        # Temperatures are converted before aggregation, so the expected
        # values are the converted ones.
        expected_stats = [
            json.dumps({'loc_id': 'x', 'record_low': 38.3, 'record_high': 45.5, 'total_precip': 0.4 }),
            json.dumps({'loc_id': 'y', 'record_low': 54.5, 'record_high': 63.5, 'total_precip': 0.5 }),
        ]

        with TestPipeline() as p:
            stats = p | beam.Create(csv_lines) | WeatherStats()

            assert_that(stats, equal_to(expected_stats))
            
if __name__ == '__main__':
    with open('testing.out', 'w') as f:
        main(f)

### Perform unit testing for a streaming pipeline

- Create a `TestPipeline`.
- Use the `TestStream` class to generate streaming data. This includes generating a series of events, advancing the watermark, and advancing the processing time.
- Use the `assert_that` method from the `testing.util` module and its other methods to verify that the output `PCollection` contains the elements that you expect.

In [None]:
# taxi_streaming_pipeline.py
import json
import typing
import logging
import apache_beam as beam
from apache_beam.transforms.trigger import AccumulationMode, AfterCount, AfterWatermark
from apache_beam.transforms.combiners import CountCombineFn
import argparse

class TaxiRide(typing.NamedTuple):
    """Typed record for one taxi-ride message.

    JsonToTaxiRide builds it with ``TaxiRide(**row)``, so the JSON keys must
    match these field names exactly. TaxiCountTransform keeps only records
    whose `ride_status` equals 'pickup'.
    """
    ride_id: str
    point_idx: int
    latitude: float
    longitude: float
    timestamp: str
    meter_reading: float
    meter_increment: float
    ride_status: str
    passenger_count: int

beam.coders.registry.register_coder(TaxiRide, beam.coders.RowCoder)

class JsonToTaxiRide(beam.DoFn):
    """Parse one JSON message into a TaxiRide record."""

    def process(self, line):
        # The JSON object's keys become the TaxiRide keyword arguments.
        yield TaxiRide(**json.loads(line))

class ConvertCountToDict(beam.DoFn):
    """Turn a per-window count into a row dict stamped with the window start."""

    def process(self, element, window=beam.DoFn.WindowParam):
        start_utc = window.start.to_utc_datetime()
        yield {
            "taxi_rides": element,
            "timestamp": start_utc.strftime("%Y-%m-%dT%H:%M:%S"),
        }


class TaxiCountTransform(beam.PTransform):
    """Count pickup events per 60-second fixed window.

    Each window fires when the watermark passes its end and fires again for
    every late element arriving within 60 seconds of allowed lateness; panes
    accumulate, so a late firing carries the updated total.
    """

    def expand(self, pcoll):
        rides = pcoll | "ParseJson" >> beam.ParDo(JsonToTaxiRide())
        pickups = rides | "FilterForPickups" >> beam.Filter(lambda x : x.ride_status == 'pickup')

        windowed_pickups = pickups | "WindowByMinute" >> beam.WindowInto(
            beam.window.FixedWindows(60),
            trigger=AfterWatermark(late=AfterCount(1)),
            allowed_lateness=60,
            accumulation_mode=AccumulationMode.ACCUMULATING)

        # without_defaults(): emit nothing for windows that contain no pickups.
        return windowed_pickups | "CountPerMinute" >> beam.CombineGlobally(CountCombineFn()).without_defaults()

def run():
    """Parse the command line, then build and run the taxi pickup-count pipeline.

    Reads ride messages from the public taxirides Pub/Sub topic, counts pickups
    per minute and appends ``{taxi_rides, timestamp}`` rows to the BigQuery
    table given by ``--table_name``. Flags not declared here are forwarded to
    Beam as pipeline options. The call blocks until the job finishes.
    """
    # Function-scope import: this module's import block does not bring in
    # PipelineOptions.
    from apache_beam.options.pipeline_options import PipelineOptions

    parser = argparse.ArgumentParser(description='Load from Json from Pub/Sub into BigQuery')

    parser.add_argument('--table_name', required=True, help='Output BQ table')

    # parse_known_args leaves runner/project/etc. flags for Beam instead of
    # rejecting them as unknown arguments.
    opts, pipeline_args = parser.parse_known_args()

    # argparse returns a Namespace: values are attributes, not dictionary items.
    table_name = opts.table_name

    table_schema = {
        "fields": [
            {
                "name": "taxi_rides",
                "type": "INTEGER"
            },
            {
                "name": "timestamp",
                "type": "STRING"
            },

        ]
    }

    # Reading from Pub/Sub is an unbounded source, so streaming mode is enabled.
    options = PipelineOptions(pipeline_args, streaming=True)
    p = beam.Pipeline(options=options)

    (p | "ReadFromPubSub" >> beam.io.ReadFromPubSub(topic="projects/pubsub-public-data/topics/taxirides-realtime")
       | "TaxiPickupCount" >> TaxiCountTransform()
       | "ConvertToDict" >> beam.ParDo(ConvertCountToDict())
       | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
                table_name,
                schema=table_schema,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
                )
    )

    # Building the graph does not execute it; the pipeline must be run explicitly.
    p.run().wait_until_finish()

if __name__ == '__main__':

    run()

In [None]:
# taxi_streaming_pipeline_test.py
import logging
import json
import unittest
import sys

import apache_beam as beam

from taxi_streaming_pipeline import *
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import BeamAssertException
from apache_beam.testing.util import assert_that, equal_to_per_window
from apache_beam.testing.test_stream import TestStream
from apache_beam.transforms.window import TimestampedValue, IntervalWindow
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions

def main(out = sys.stderr, verbosity = 2):
    """Run every unittest test case defined in this module.

    Args:
        out: Text stream that receives the test report.
        verbosity: Verbosity level handed to ``unittest.TextTestRunner``.
    """
    this_module = sys.modules[__name__]
    suite = unittest.TestLoader().loadTestsFromModule(this_module)

    runner = unittest.TextTestRunner(out, verbosity = verbosity)
    runner.run(suite)


class TaxiWindowingTest(unittest.TestCase):
    """Checks that pickups are counted per 60-second window and other statuses are ignored."""

    def test_windowing_behavior(self):

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True

        with TestPipeline(options=options) as p:

            base_json_pickup = "{\"ride_id\":\"x\",\"point_idx\":1,\"latitude\":0.0,\"longitude\":0.0," \
                         "\"timestamp\":\"00:00:00\",\"meter_reading\":1.0,\"meter_increment\":0.1," \
                         "\"ride_status\":\"pickup\",\"passenger_count\":1}"

            # Same message with a non-pickup status, so the pickup filter is
            # actually exercised by this test.
            base_json_enroute = "{\"ride_id\":\"x\",\"point_idx\":1,\"latitude\":0.0,\"longitude\":0.0," \
                         "\"timestamp\":\"00:00:00\",\"meter_reading\":1.0,\"meter_increment\":0.1," \
                         "\"ride_status\":\"enroute\",\"passenger_count\":1}"

            # Two pickups and one enroute event at t=0, one pickup at t=60,
            # then one pickup at t=120 after the watermark reached 60.
            test_stream = TestStream().advance_watermark_to(0).add_elements([
                TimestampedValue(base_json_pickup, 0),
                TimestampedValue(base_json_pickup, 0),
                TimestampedValue(base_json_enroute, 0),
                TimestampedValue(base_json_pickup, 60)
            ]).advance_watermark_to(60).advance_processing_time(60).add_elements([
                TimestampedValue(base_json_pickup, 120)
            ]).advance_watermark_to_infinity()

            taxi_counts = (p | test_stream
                             | TaxiCountTransform()
                          )

            # The enroute event is filtered out, leaving 2 pickups in [0, 60).
            EXPECTED_WINDOW_COUNTS = {IntervalWindow(0,60): [2],
                                      IntervalWindow(60,120): [1],
                                      IntervalWindow(120,180): [1]}

            assert_that(taxi_counts, equal_to_per_window(EXPECTED_WINDOW_COUNTS),
                        reify_windows=True)

class TaxiLateDataTest(unittest.TestCase):
    """Checks the on-time and late firings of the first window."""

    def test_late_data_behavior(self):

        streaming_options = PipelineOptions()
        streaming_options.view_as(StandardOptions).streaming = True

        pickup_json = ('{"ride_id":"x","point_idx":1,"latitude":0.0,"longitude":0.0,'
                       '"timestamp":"00:00:00","meter_reading":1.0,"meter_increment":0.1,'
                       '"ride_status":"pickup","passenger_count":1}')

        with TestPipeline(options=streaming_options) as p:

            # Two on-time pickups at t=0, then one more t=0 pickup once the
            # watermark is at 60, and a last one once the watermark is at 300.
            events = TestStream().advance_watermark_to(0).add_elements([
                TimestampedValue(pickup_json, 0),
                TimestampedValue(pickup_json, 0),
            ]).advance_watermark_to(60).advance_processing_time(60).add_elements([
                TimestampedValue(pickup_json, 0)
            ]).advance_watermark_to(300).advance_processing_time(240).add_elements([
                TimestampedValue(pickup_json, 0)
            ])

            expected_per_window = {IntervalWindow(0,60): [2,3]}  #On Time and Late Result

            late_counts = p | events | TaxiCountTransform()

            assert_that(late_counts, equal_to_per_window(expected_per_window),
                        reify_windows=True)

if __name__ == '__main__':
    with open('testing.out', 'w') as f:
        main(f)

### The CI/CD pipeline

![CI CD pipeline](./img/CI_CD_pipeline.png)

In [None]:
# Initializing Cloud Composer environment
# (3 n1-standard-1 nodes, disk size 20, Python 3)
# NOTE(review): COMPOSER_ENV_NAME, COMPOSER_REGION and COMPOSER_ZONE_ID are not
# set in this cell — presumably exported beforehand; confirm.
gcloud composer environments create $COMPOSER_ENV_NAME \
--location $COMPOSER_REGION \
--zone $COMPOSER_ZONE_ID \
--machine-type n1-standard-1 \
--node-count 3 \
--disk-size 20 \
--python-version 3

# Cloud Composer environment variable
# (the environment's DAG folder prefix in Cloud Storage, read from config.dagGcsPrefix)
export COMPOSER_DAG_BUCKET=$(gcloud composer environments \
    describe $COMPOSER_ENV_NAME \
    --location $COMPOSER_REGION \
    --format="get(config.dagGcsPrefix)")

# Service account
# (the account the environment's nodes run as, read from config.nodeConfig.serviceAccount)
export COMPOSER_SERVICE_ACCOUNT=$(gcloud composer environments \
    describe $COMPOSER_ENV_NAME \
    --location $COMPOSER_REGION \
    --format="get(config.nodeConfig.serviceAccount)")

In [None]:
# Cloud Source Repositories
# Create the repository, copy the sample source code into a local folder of
# the same name and work from there
gcloud source repos create $SOURCE_CODE_REPO
cp -r ~/ci-cd-for-data-processing-workflow/source-code ~/$SOURCE_CODE_REPO
cd ~/$SOURCE_CODE_REPO
# Authenticate git against Cloud Source Repositories through gcloud, and use
# the active gcloud account as both commit e-mail and commit name
git config --global credential.'https://source.developers.google.com'.helper gcloud.sh
git config --global user.email $(gcloud config list --format 'value(core.account)')
git config --global user.name $(gcloud config list --format 'value(core.account)')
# Initialise the local repository, add the remote and push the first commit
git init
git remote add google \
    https://source.developers.google.com/p/$GCP_PROJECT_ID/r/$SOURCE_CODE_REPO
git add .
git commit -m 'initial commit'
git push google master

In [None]:
# Cloud Build pipeline
# Submit build_deploy_test.yaml; every substitution below is filled from the
# shell variable of the matching name
cd ~/ci-cd-for-data-processing-workflow/source-code/build-pipeline
gcloud builds submit --config=build_deploy_test.yaml --substitutions=\
REPO_NAME=$SOURCE_CODE_REPO,\
_DATAFLOW_JAR_BUCKET=$DATAFLOW_JAR_BUCKET_TEST,\
_COMPOSER_INPUT_BUCKET=$INPUT_BUCKET_TEST,\
_COMPOSER_REF_BUCKET=$REF_BUCKET_TEST,\
_COMPOSER_DAG_BUCKET=$COMPOSER_DAG_BUCKET,\
_COMPOSER_ENV_NAME=$COMPOSER_ENV_NAME,\
_COMPOSER_REGION=$COMPOSER_REGION,\
_COMPOSER_DAG_NAME_TEST=$COMPOSER_DAG_NAME_TEST

# Get the URL to Cloud Composer web interface
# (read from the environment's config.airflowUri field)
gcloud composer environments describe $COMPOSER_ENV_NAME \
--location $COMPOSER_REGION \
--format="get(config.airflowUri)"

In [None]:
# Cloud Composer variable for the JAR filename
# Reads the variable dataflow_jar_file_test from the Composer environment;
# stderr is merged into stdout and only lines matching '.jar' are kept
export DATAFLOW_JAR_FILE_LATEST=$(gcloud composer environments run $COMPOSER_ENV_NAME \
--location $COMPOSER_REGION variables -- \
--get dataflow_jar_file_test 2>&1 | grep -i '.jar')

# Build pipeline configuration file
# Submit deploy_prod.yaml, passing the JAR name captured above together with
# the test and production bucket names
cd ~/ci-cd-for-data-processing-workflow/source-code/build-pipeline
gcloud builds submit --config=deploy_prod.yaml --substitutions=\
REPO_NAME=$SOURCE_CODE_REPO,\
_DATAFLOW_JAR_BUCKET_TEST=$DATAFLOW_JAR_BUCKET_TEST,\
_DATAFLOW_JAR_FILE_LATEST=$DATAFLOW_JAR_FILE_LATEST,\
_DATAFLOW_JAR_BUCKET_PROD=$DATAFLOW_JAR_BUCKET_PROD,\
_COMPOSER_INPUT_BUCKET=$INPUT_BUCKET_PROD,\
_COMPOSER_ENV_NAME=$COMPOSER_ENV_NAME,\
_COMPOSER_REGION=$COMPOSER_REGION,\
_COMPOSER_DAG_BUCKET=$COMPOSER_DAG_BUCKET,\
_COMPOSER_DAG_NAME_PROD=$COMPOSER_DAG_NAME_PROD

# Get the URL for Cloud Composer UI
# (read from the environment's config.airflowUri field)
gcloud composer environments describe $COMPOSER_ENV_NAME \
--location $COMPOSER_REGION \
--format="get(config.airflowUri)"

## Flex Templates

### Create a custom Dataflow Flex Template container image

In [None]:
# my_pipeline.py
import argparse
import time
import logging
import json
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.runners import DataflowRunner, DirectRunner

# ### functions and classes

def parse_json(element):
    """Decode one JSON document.

    Args:
        element: JSON text accepted by ``json.loads``.

    Returns:
        The decoded Python value.

    Raises:
        json.JSONDecodeError: If `element` is not valid JSON.
    """
    decoded = json.loads(element)
    return decoded

def drop_fields(element):
    """Return a copy of `element` without its 'user_agent' entry.

    The input dictionary is left untouched: Beam transforms must not mutate
    their input elements. A record that has no 'user_agent' key is returned
    as an unchanged copy instead of raising KeyError.

    Args:
        element: Mapping of field names to values.

    Returns:
        A new dict with every entry of `element` except 'user_agent'.
    """
    return {name: value for name, value in element.items() if name != 'user_agent'}

# ### main

def run():
    """Parse the command line, assemble the branching pipeline and start it.

    Text lines are read from ``--inputPath``. One branch writes them unchanged
    to ``--outputPath``; the other parses each line as JSON, passes it through
    ``drop_fields``, keeps rows whose ``num_bytes`` is below 120 and writes them
    to the BigQuery table named by ``--tableName`` (created if needed,
    truncated on write). The pipeline is started with ``run()`` and the result
    is not waited on.
    """
    # Command line arguments: every flag below is mandatory.
    parser = argparse.ArgumentParser(description='Load from Json into BigQuery')
    required_flags = (
        ('--project', 'Specify Google Cloud project'),
        ('--region', 'Specify Google Cloud region'),
        ('--runner', 'Specify Apache Beam Runner'),
        ('--inputPath', 'Path to events.json'),
        ('--outputPath', 'Path to coldline storage bucket'),
        ('--tableName', 'BigQuery table name'),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)

    # Anything argparse does not recognise is forwarded to Beam as-is.
    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts)
    cloud_options = options.view_as(GoogleCloudOptions)
    cloud_options.project = opts.project
    cloud_options.region = opts.region
    # Nanosecond timestamp suffix gives each launch a distinct job name.
    cloud_options.job_name = '{0}{1}'.format('my-pipeline-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    # Table schema for BigQuery
    table_schema = {
        "fields": [
            {"name": "ip", "type": "STRING"},
            {"name": "user_id", "type": "STRING"},
            {"name": "lat", "type": "FLOAT", "mode": "NULLABLE"},
            {"name": "lng", "type": "FLOAT", "mode": "NULLABLE"},
            {"name": "timestamp", "type": "STRING"},
            {"name": "http_request", "type": "STRING"},
            {"name": "http_response", "type": "INTEGER"},
            {"name": "num_bytes", "type": "INTEGER"},
        ]
    }

    # Create the pipeline.
    # Steps:
    # 1) Read something
    # 2) Transform something
    # 3) Write something
    pipeline = beam.Pipeline(options=options)

    # Initial PCollection of raw lines; both branches below start from it.
    raw_lines = pipeline | 'ReadFromGCS' >> beam.io.ReadFromText(opts.inputPath)

    # Branch 1: write the raw lines to Google Cloud Storage.
    raw_lines | 'WriteRawToGCS' >> beam.io.WriteToText(opts.outputPath)

    # Branch 2: parse JSON, drop fields, filter rows, write to BigQuery.
    parsed = raw_lines | 'ParseJson' >> beam.Map(parse_json)
    trimmed = parsed | 'DropFields' >> beam.Map(drop_fields)
    small_rows = trimmed | 'FilterFn' >> beam.Filter(lambda row: row['num_bytes'] < 120)
    small_rows | 'WriteToBQ' >> beam.io.WriteToBigQuery(
        opts.tableName,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
    )

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    pipeline.run()

# Entry point: start the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    run()

In [None]:
# Dockerfile
# Launcher image for the my_pipeline.py Flex Template, built on Google's
# Python 3 template-launcher base image.
FROM gcr.io/dataflow-templates-base/python3-template-launcher-base
ARG WORKDIR=/dataflow/template
RUN mkdir -p ${WORKDIR}
WORKDIR ${WORKDIR}
RUN apt-get update && apt-get install -y libffi-dev && rm -rf /var/lib/apt/lists/*
# Install dependencies before copying the pipeline source so that editing
# my_pipeline.py does not invalidate the cached dependency layer.
# --no-cache-dir keeps pip's download cache out of the image; the requirement
# is quoted so the shell never treats the [gcp] extra as a glob pattern.
RUN python3 -m pip install --no-cache-dir "apache-beam[gcp]==2.25.0"
COPY my_pipeline.py .
# Tells the template launcher which Python file holds the pipeline.
ENV FLEX_TEMPLATE_PYTHON_PY_FILE="${WORKDIR}/my_pipeline.py"

First, enable Kaniko cache use by default. Kaniko caches container build artifacts, so using this option speeds up subsequent builds. We will also use `pip3 freeze` to record the packages and their versions being used in our environment.

Then use Cloud Build to build the container image.

In [None]:
# Turn on Kaniko layer caching for Cloud Build
gcloud config set builds/use_kaniko True

# Build the image from the current directory and tag it in Container Registry
export TEMPLATE_IMAGE="gcr.io/${PROJECT_ID}/dataflow/my_pipeline:latest"
gcloud builds submit --tag="${TEMPLATE_IMAGE}" .

### Create and stage the flex template

Create a template spec file in a Cloud Storage bucket containing all of the information necessary to run the job, such as the SDK information and metadata.

In [None]:
# metadata.json
{
  "name": "My Branching Pipeline",
  "description": "A branching pipeline that writes raw to GCS Coldline, and filtered data to BQ",
  "parameters": [
    {
      "name": "inputPath",
      "label": "Input file path.",
      "helpText": "Path to events.json file.",
      "regexes": [
        ".*\\.json"
      ]
    },
    {
      "name": "outputPath",
      "label": "Output file location",
      "helpText": "GCS Coldline Bucket location for raw data",
      "regexes": [
        "gs:\\/\\/[a-zA-Z0-9\\-\\_\\/]+"
      ]
    },
    {
      "name": "tableName",
      "label": "BigQuery output table",
      "helpText": "BigQuery table spec to write to, in the form 'project:dataset.table'.",
      "regexes": [
        "[^:]+:[^.]+[.].+"
      ]
    }
  ]
}

In [None]:
# Cloud Storage path passed to the build command as the template file location
export TEMPLATE_PATH="gs://${PROJECT_ID}/templates/mytemplate.json"

# Build the Flex Template from the container image and the parameter metadata
gcloud dataflow flex-template build "${TEMPLATE_PATH}" \
  --image="${TEMPLATE_IMAGE}" \
  --sdk-language=PYTHON \
  --metadata-file=metadata.json

### Execute the flex template

One of the benefits of using Dataflow templates is the ability to execute them from a wider variety of contexts, other than a development environment.

In [None]:
# gcloud
# Settings for one launch of the staged template
export PROJECT_ID="$(gcloud config get-value project)"
export REGION=us-central1
export JOB_NAME="mytemplate-$(date +%Y%m%d-%H%M%S)"
export TEMPLATE_LOC="gs://${PROJECT_ID}/templates/mytemplate.json"
export INPUT_PATH="gs://${PROJECT_ID}/events.json"
export OUTPUT_PATH="gs://${PROJECT_ID}-coldline/template_output/"
export BQ_TABLE="${PROJECT_ID}:logs.logs_filtered"

# The three pipeline parameters travel as one comma-separated --parameters value
gcloud dataflow flex-template run "${JOB_NAME}" \
  --region="${REGION}" \
  --template-file-gcs-location="${TEMPLATE_LOC}" \
  --parameters="inputPath=${INPUT_PATH},outputPath=${OUTPUT_PATH},tableName=${BQ_TABLE}"

In [None]:
# gcloud
# Launch the template with a timestamped job name; each pipeline parameter is
# supplied through its own --parameters flag.
gcloud dataflow flex-template run "job-name-$(date +%Y%m%d-%H%M%S)" \
  --region="${REGION}" \
  --template-file-gcs-location="${TEMPLATE_PATH}" \
  --parameters=inputSubscription="${SUBSCRIPTION}" \
  --parameters=outputTable="${PROJECT}:${DATASET}.${TABLE}"

# REST API
# The JSON body is single-quoted, so nothing inside it is expanded by the
# shell. Each dynamic value is spliced in by closing the single quote, adding a
# double-quoted expansion, and reopening the single quote ('"$VAR"'). This
# makes the date suffix of jobName actually expand and keeps values containing
# spaces or glob characters in one piece.
curl -X POST "https://dataflow.googleapis.com/v1b3/projects/$PROJECT/locations/${REGION}/flexTemplates:launch" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $(gcloud auth print-access-token)" \
-d '{
  "launch_parameter": {
    "jobName": "job-name-'"$(date +%Y%m%d-%H%M%S)"'",
    "parameters": {
      "inputSubscription": "'"$SUBSCRIPTION"'",
      "outputTable": "'"$PROJECT:$DATASET.$TABLE"'"
    },
    "containerSpecGcsPath": "'"$TEMPLATE_PATH"'"
  }
}'

# Cloud Scheduler
# Creates an HTTP job that POSTs a flexTemplates:launch request every 30 minutes.
# The message body must be valid JSON: the comma after the jobName entry is
# required. Shell values are spliced in as '"$VAR"' so they stay intact even if
# they contain spaces or glob characters.
# NOTE(review): email@project.iam.gserviceaccount.com looks like a placeholder;
# replace it with the service account the scheduler should authenticate as.
# NOTE(review): jobName is fixed, so every scheduled launch reuses the same
# name. Confirm this is intended, since a launch may be rejected while an
# earlier job with that name is still active.
gcloud scheduler jobs create http scheduler-job \
--schedule="*/30 * * * *" \
--uri="https://dataflow.googleapis.com/v1b3/projects/$PROJECT/locations/${REGION}/flexTemplates:launch" \
--http-method=POST \
--headers Content-Type=application/json \
--oauth-service-account-email=email@project.iam.gserviceaccount.com \
--message-body='{
    "launch_parameter": {
      "jobName":"job-name",
      "parameters": {
        "inputSubscription": "'"$SUBSCRIPTION"'",
        "outputTable": "'"$PROJECT:$DATASET.$TABLE"'"
      },
      "containerSpecGcsPath": "'"$TEMPLATE_PATH"'"
    }
}'