# Babyweight Data Preprocessing with tf.Transform

## 1. Preparation

### Install required packages

In [1]:
!sudo -i pip install apache-beam==2.46.0

[0m

You can ignore the dependency resolver errors. Confirm the final message starting with "Successfully installed ..."

In [2]:
!sudo -i pip install tensorflow-transform==1.6.0

Collecting tensorflow-transform==1.6.0
  Downloading tensorflow_transform-1.6.0-py3-none-any.whl (427 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.5/427.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,<2.8,>=1.15.5
  Downloading tensorflow-2.7.4-cp37-cp37m-manylinux2010_x86_64.whl (495.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m495.5/495.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tensorflow-metadata<1.7.0,>=1.6.0
  Downloading tensorflow_metadata-1.6.0-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.8/48.8 kB[0m [31m283.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pyarrow<6,>=1
  Downloading pyarrow-5.0.0-cp37-cp37m-manylinux2014_x86_64.whl (23.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m12.8 MB/s[0m eta [36

You can ignore the dependency resolver errors. Confirm the final message starting with "Successfully installed ..."

**Now you have to restart kernel from the menu bar: "Kernel" -> "Restart Kernel".**

After restarting the kernel, you can resume the code execution from the next cell.

### Confirm the installed packages

In [1]:
!pip list | grep -E '(tensorflow|beam)'

apache-beam                            2.46.0
tensorflow                             2.7.4
tensorflow-cloud                       0.1.16
tensorflow-datasets                    4.8.2
tensorflow-estimator                   2.7.0
tensorflow-hub                         0.13.0
tensorflow-io                          0.25.0
tensorflow-io-gcs-filesystem           0.25.0
tensorflow-metadata                    1.6.0
tensorflow-probability                 0.19.0
tensorflow-serving-api                 2.7.4
tensorflow-transform                   1.6.0


### Create setup.py to install packages to Dataflow containers

This is used to install additional packages to Dataflow worker containers.

In [2]:
%%writefile setup.py
import setuptools

setuptools.setup(
    install_requires=['tensorflow-transform==1.6.0'],
    packages=setuptools.find_packages(),
)

Writing setup.py


### Set global flags

In [3]:
# Google Cloud project ID; replace the placeholder with your own project.
PROJECT = 'your-project'
# Cloud Storage bucket name (without the gs:// prefix) that receives all outputs.
BUCKET = 'your-project-babyweight'
# Region used for the gcloud compute/region setting, the bucket location and the pipeline's 'region' option.
REGION = 'us-central1'
# Folder inside the bucket under which every output of this notebook is written.
ROOT_DIR = 'babyweight_tft'

RUN_LOCAL = False # if True, the DirectRunner is used, else DataflowRunner
DATA_SIZE = 10000 # number of records to be retrieved from BigQuery

In [4]:
import os
# Export the settings as environment variables so that the %%bash cells can
# reference them as $PROJECT, $BUCKET, $REGION and $ROOT_DIR.
os.environ['PROJECT'] = PROJECT
os.environ['BUCKET'] = BUCKET
os.environ['REGION'] = REGION
os.environ['ROOT_DIR'] = ROOT_DIR
# Environment values must be strings, hence the explicit conversion of the boolean.
os.environ['RUN_LOCAL'] = str(RUN_LOCAL)

In [5]:
%%bash
gcloud config set project $PROJECT
gcloud config set compute/region $REGION
if ! gcloud storage ls | grep -q gs://${BUCKET}/; then
    gcloud storage buckets create --location ${REGION} gs://${BUCKET}
fi

Updated property [core/project].
Updated property [compute/region].
Creating gs://etsuji-tft1-babyweight/...


### Import required packages and modules

In [6]:
import argparse
import os

import apache_beam as beam
import tensorflow.compat.v2 as tf
import tensorflow_transform as tft

import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tfx_bsl.coders import example_coder

### Define raw input data and their metadata

In [7]:
# Raw input columns declared as scalar string features in create_raw_metadata().
CATEGORICAL_FEATURE_NAMES = ['is_male', 'mother_race']
# Raw input columns declared as scalar float32 features in create_raw_metadata().
NUMERIC_FEATURE_NAMES = ['mother_age', 'plurality', 'gestation_weeks']
# Target column, declared as a scalar float32 feature in create_raw_metadata().
TARGET_FEATURE_NAME = 'weight_pounds'

def create_raw_metadata():
    """Build the tf.Transform metadata that describes the raw input records.

    Every name in CATEGORICAL_FEATURE_NAMES is declared as a scalar string
    feature; every name in NUMERIC_FEATURE_NAMES and the target column
    TARGET_FEATURE_NAME are declared as scalar float32 features.

    Returns:
        A DatasetMetadata object wrapping the schema derived from that
        feature spec.
    """
    feature_spec = {}

    for column in CATEGORICAL_FEATURE_NAMES:
        feature_spec[column] = tf.io.FixedLenFeature([], tf.string)

    for column in NUMERIC_FEATURE_NAMES:
        feature_spec[column] = tf.io.FixedLenFeature([], tf.float32)

    feature_spec[TARGET_FEATURE_NAME] = tf.io.FixedLenFeature([], tf.float32)

    schema = schema_utils.schema_from_feature_spec(feature_spec)
    return dataset_metadata.DatasetMetadata(schema)

The metadata contains feature schema in the protobuf format as below.

In [8]:
create_raw_metadata().schema

feature {
  name: "gestation_weeks"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "is_male"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "mother_age"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "mother_race"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "plurality"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "weight_pounds"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}

### Define source query and source cleanup function

In [9]:
def get_source_query(step, data_size):
    """Return the BigQuery Standard SQL query for one split of the natality data.

    Rows are assigned to a split by hashing their column values with
    FARM_FINGERPRINT: hash buckets 0-69 form the training split and buckets
    70-99 form the evaluation split, so a given row always lands in the same
    split.

    Args:
        step: 'train' selects the training split; any other value selects
            the evaluation split.
        data_size: total number of records requested. The training query is
            limited to int(data_size * 0.7) rows and the evaluation query to
            int(data_size * 0.3) rows.

    Returns:
        The query string.
    """
    train_size = data_size * 0.7
    eval_size = data_size * 0.3

    query = """
    SELECT
      ROUND(weight_pounds,1) AS weight_pounds,
      is_male,
      mother_age,
      mother_race,
      plurality,
      gestation_weeks,
      FARM_FINGERPRINT( 
        CONCAT(
          COALESCE(CAST(weight_pounds AS STRING), 'NA'),
          COALESCE(CAST(is_male AS STRING),'NA'),
          COALESCE(CAST(mother_age AS STRING),'NA'),
          COALESCE(CAST(mother_race AS STRING),'NA'),
          COALESCE(CAST(plurality AS STRING), 'NA'),
          COALESCE(CAST(gestation_weeks AS STRING),'NA')
          )
        ) AS key
        FROM
          publicdata.samples.natality
        WHERE year > 2000
        AND weight_pounds > 0
        AND mother_age > 0
        AND plurality > 0
        AND gestation_weeks > 0
        AND month > 0
    """

    # FARM_FINGERPRINT returns a signed INT64 and BigQuery's MOD keeps the sign
    # of the dividend, so MOD(key, 100) ranges over -99..99. Without ABS every
    # negative hash would satisfy "< 70" and the 70/30 bucket split would be
    # heavily skewed towards the training split.
    if step == 'train':
        source_query = 'SELECT * FROM ({}) WHERE ABS(MOD(key, 100)) < 70 LIMIT {}'.format(query, int(train_size))
    else:
        source_query = 'SELECT * FROM ({}) WHERE ABS(MOD(key, 100)) >= 70 LIMIT {}'.format(query, int(eval_size))

    return source_query

In [10]:
def prep_bq_row(bq_row):
    """Clean one BigQuery row so it can be used as raw tf.Transform input.

    Args:
        bq_row: dict mapping column names to values for a single row.

    Returns:
        A new dict (the input is not modified) in which
        * boolean values are replaced by their string form ('True'/'False'),
        * 'mother_race' holds a human-readable label for known numeric race
          codes, or 'Unknown' when the column is absent or the code is not in
          the table below,
        * every other value is copied unchanged.
    """
    # Translate the opaque numeric race code into human-readable data. The
    # table is kept local to the function so the function stays self-contained
    # when Beam serializes it for the workers.
    races = {
        1: 'White', 2: 'Black', 3: 'American Indian', 4: 'Chinese',
        5: 'Japanese', 6: 'Hawaiian', 7: 'Filipino',
        18: 'Asian Indian', 28: 'Korean', 39: 'Samoan', 48: 'Vietnamese',
    }

    # The raw schema declares is_male as a string feature, so booleans are
    # converted to strings; all other values pass through untouched.
    result = {
        name: str(value) if isinstance(value, bool) else value
        for name, value in bq_row.items()
    }

    result['mother_race'] = races.get(bq_row.get('mother_race'), 'Unknown')

    return result

The output from the cleanup function `prep_bq_row` is used as raw input data.

## 2. Define data transformation pipeline

First, we define component functions in the pipeline.

### Read and clean from source

In [11]:
def read_from_bq(pipeline, step, data_size):
    """Read one split from BigQuery and pair it with the raw metadata.

    Args:
        pipeline: the Beam pipeline the read is attached to.
        step: split name; used to build the query (via get_source_query) and
            to prefix the Beam step labels.
        data_size: total number of records requested, forwarded to
            get_source_query.

    Returns:
        A (rows, metadata) tuple: the PCollection of rows cleaned by
        prep_bq_row and the metadata produced by create_raw_metadata.
    """
    read_label = '{} - Read Data from BigQuery'.format(step)
    cleanup_label = '{} - Clean up Data'.format(step)

    bq_source = beam.io.BigQuerySource(
        query=get_source_query(step, data_size), use_standard_sql=True)

    cleaned_rows = (
        pipeline
        | read_label >> beam.io.Read(bq_source)
        | cleanup_label >> beam.Map(prep_bq_row)
    )

    # Associate the metadata with the raw input data; tf.Transform uses the
    # metadata to interpret the rows during the transformation.
    return cleaned_rows, create_raw_metadata()

### tf.Transform preprocess_fn

This function defines the data transformation against the raw input data. It will be used by the data transformation pipeline, and is also embedded in the exported model.

In [12]:
def preprocess_fn(input_features):
    """tf.Transform preprocessing function applied to the raw features.

    Args:
        input_features: dict of raw features keyed by column name
            ('weight_pounds', 'mother_age', 'gestation_weeks', 'plurality',
            'mother_race' and 'is_male' are read).

    Returns:
        A dict of transformed features: the untouched target, scaled,
        bucketized and log-transformed numeric features, and vocabulary
        indices for the categorical features.
    """
    # Imported locally on purpose: Apache Beam fails to serialize these
    # packages when they are only available from the global context.
    import tensorflow.compat.v2 as tf
    import tensorflow_transform as tft

    mother_age = input_features['mother_age']
    gestation_weeks = input_features['gestation_weeks']

    transformed = {
        # Target feature, passed through unchanged.
        'weight_pounds': input_features['weight_pounds'],
        # Normalisation to z-scores.
        'mother_age_normalized': tft.scale_to_z_score(mother_age),
        # Scaling into the [0, 1] range.
        'gestation_weeks_scaled': tft.scale_to_0_1(gestation_weeks),
        # Bucketisation based on quantiles.
        'mother_age_bucketized': tft.bucketize(mother_age, num_buckets=5),
        # Example of a new feature computed with a custom formula.
        'mother_age_log': tf.math.log(mother_age),
    }

    # Example of a flag/indicator: whether more than one baby was born.
    multiple_birth_flag = tf.as_string(input_features['plurality'] > tf.constant(1.0))

    # Convert the categorical features to indices into a computed vocabulary.
    vocabulary_inputs = (
        ('mother_race_index', input_features['mother_race'], 'mother_race'),
        ('is_male_index', input_features['is_male'], 'is_male'),
        ('is_multiple_index', multiple_birth_flag, 'is_multiple'),
    )
    for output_name, values, vocab_name in vocabulary_inputs:
        transformed[output_name] = tft.compute_and_apply_vocabulary(values, vocab_filename=vocab_name)

    return transformed

### Analyze and transform

This is applied to the training dataset.

In [13]:
def analyze_and_transform(raw_dataset, step):
    """Analyze the raw dataset and transform it with preprocess_fn.

    Args:
        raw_dataset: (data, metadata) tuple as returned by read_from_bq.
        step: split name, used to prefix the Beam step label.

    Returns:
        A (transformed_dataset, transform_fn) tuple; the transformed data is
        requested as record batches (output_record_batches=True).
    """
    label = '{} - Analyze & Transform'.format(step)
    analyzer = tft_beam.AnalyzeAndTransformDataset(
        preprocess_fn, output_record_batches=True)

    transformed_dataset, transform_fn = raw_dataset | label >> analyzer

    return transformed_dataset, transform_fn

### Transform

This is applied to the evaluation dataset.

In [14]:
def transform(raw_dataset, transform_fn, step):
    """Apply an already-computed transform_fn to a raw dataset.

    Args:
        raw_dataset: (data, metadata) tuple as returned by read_from_bq.
        transform_fn: the transform function produced by analyze_and_transform.
        step: split name, used to prefix the Beam step label.

    Returns:
        The transformed dataset, requested as record batches
        (output_record_batches=True).
    """
    label = '{} - Transform'.format(step)
    transformer = tft_beam.TransformDataset(output_record_batches=True)

    return (raw_dataset, transform_fn) | label >> transformer

### Write tfrecords

In [15]:
def write_tfrecords(transformed_dataset, location, step):
    """Encode a transformed dataset and write it as TFRecord files.

    Args:
        transformed_dataset: (data, metadata) tuple; only the data part is
            used. Each data element is unpacked into two values, of which the
            first is passed to RecordBatchToExamples.
        location: output directory; files are named '<location>/<step>*.tfrecords'.
        step: split name, used as the file name prefix and to prefix the Beam
            step labels.
    """
    # Imported locally so the encoder travels with the lambda below.
    from tfx_bsl.coders import example_coder

    record_batches, unused_metadata = transformed_dataset
    output_prefix = os.path.join(location, '{}'.format(step))

    encode = beam.FlatMapTuple(
        lambda batch, _: example_coder.RecordBatchToExamples(batch))
    write = beam.io.WriteToTFRecord(
        file_path_prefix=output_prefix,
        file_name_suffix='.tfrecords')

    (
        record_batches
        | '{} - Encode Transformed Data'.format(step) >> encode
        | '{} - Write Transformed Data'.format(step) >> write
    )

### Write text records

In [16]:
def write_text(dataset, location, step):
    """Write the data part of a dataset as text files.

    Args:
        dataset: (data, metadata) tuple; only the data part is written.
        location: output directory; files are named '<location>/<step>*.txt'.
        step: name used as the file name prefix and to prefix the Beam step
            label.
    """
    records, unused_metadata = dataset
    output_prefix = os.path.join(location, '{}'.format(step))

    text_sink = beam.io.WriteToText(
        file_path_prefix=output_prefix,
        file_name_suffix='.txt')

    records | '{} - WriteData'.format(step) >> text_sink

### Write transformation artefacts

In [17]:
def write_transform_artefacts(transform_fn, location):
    """Write the transform function produced by the analysis to `location`.

    Args:
        transform_fn: the transform function returned by analyze_and_transform.
        location: directory handed to WriteTransformFn.
    """
    artefact_writer = transform_fn_io.WriteTransformFn(location)

    transform_fn | 'Write Transform Artefacts' >> artefact_writer

Now we can construct a pipeline by combining components.

### Construct data transformation pipeline

In [18]:
def run_transformation_pipeline(args):
    """Build and run the complete preprocessing pipeline.

    The training split is analyzed and transformed, the evaluation split is
    transformed with the resulting transform_fn, both are written as
    TFRecords, and the transform artefacts are written out.

    Args:
        args: dict of settings. The whole dict is forwarded to Beam's
            PipelineOptions as keyword arguments; in addition the keys
            'runner', 'data_size', 'transformed_data_location',
            'transform_artefact_location', 'temporary_dir' and 'debug' are
            read directly and must be present.
    """
    options = beam.pipeline.PipelineOptions(flags=[], **args)

    runner = args['runner']
    data_size = args['data_size']
    transformed_data_location = args['transformed_data_location']
    transform_artefact_location = args['transform_artefact_location']
    temporary_dir = args['temporary_dir']
    debug = args['debug']

    print("Sample data size: {}".format(data_size))
    print("Sink transformed data files location: {}".format(transformed_data_location))
    print("Sink transform artefact location: {}".format(transform_artefact_location))
    print("Temporary directory: {}".format(temporary_dir))
    print("Runner: {}".format(runner))
    print("Debug enabled: {}".format(debug))

    with beam.Pipeline(runner, options=options) as pipeline, tft_beam.Context(temporary_dir):

        # Training split: read from BigQuery, analyze + transform, then
        # write the transformed records to the sink as TFRecords.
        raw_train_dataset = read_from_bq(pipeline, 'train', data_size)
        transformed_train_dataset, transform_fn = analyze_and_transform(raw_train_dataset, 'train')
        write_tfrecords(transformed_train_dataset, transformed_data_location, 'train')

        # Evaluation split: read from BigQuery and transform it with the
        # transform_fn computed on the training split, then write TFRecords.
        raw_eval_dataset = read_from_bq(pipeline, 'eval', data_size)
        transformed_eval_dataset = transform(raw_eval_dataset, transform_fn, 'eval')
        write_tfrecords(transformed_eval_dataset, transformed_data_location, 'eval')

        # Persist the transformation artefacts.
        write_transform_artefacts(transform_fn, transform_artefact_location)

        # (Optional) for debugging, write the transformed train data as text.
        # The explicit comparison with True is kept so that only values equal
        # to True enable the debug output.
        if debug == True:
            write_text(transformed_train_dataset, transformed_data_location, 'debug')

## 3. Execute transformation pipeline

### Set pipeline parameters

In [19]:
from datetime import datetime

# All outputs of the pipeline live under gs://<BUCKET>/<ROOT_DIR>.
OUTPUT_DIR = "gs://{}/{}".format(BUCKET, ROOT_DIR)
# Destination handed to write_transform_artefacts via 'transform_artefact_location'.
TRANSFORM_ARTEFACTS_DIR = os.path.join(OUTPUT_DIR, 'transform')
# Destination of the transformed train/eval TFRecord files.
TRANSFORMED_DATA_DIR = os.path.join(OUTPUT_DIR, 'transformed')
# Working directory given to tft_beam.Context; resolves to the same path as 'temp_location' below.
TEMP_DIR = os.path.join(OUTPUT_DIR, 'tmp')

# RUN_LOCAL (set in the global flags cell) selects the Beam runner.
if RUN_LOCAL:
    runner = 'DirectRunner'
else:
    runner = 'DataflowRunner'

# A UTC timestamp suffix gives every launch a distinct job name.
job_name = 'preprocess-babweight-data-tft-{}'.format(datetime.utcnow().strftime('%y%m%d-%H%M%S'))

# The whole dict is passed to beam's PipelineOptions as keyword arguments, and
# run_transformation_pipeline additionally reads the keys 'runner' .. 'debug' directly.
args = {
    'job_name': job_name,
    'runner': runner,
    'data_size': DATA_SIZE,
    'transformed_data_location':  TRANSFORMED_DATA_DIR,
    'transform_artefact_location':  TRANSFORM_ARTEFACTS_DIR,
    'temporary_dir': TEMP_DIR,
    'debug': False,
    
    # Beam / Dataflow pipeline options.
    'project': PROJECT,
    'region': REGION,
    'staging_location': os.path.join(OUTPUT_DIR, 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'worker_machine_type': 'n1-standard-1',
    'max_num_workers': 3,
    'setup_file': './setup.py', # requirements_file doesn't work as tft provides only wheel pkg.
}

### Run pipeline

In [20]:
# Remove the outputs of a previous run. Each directory is cleaned up on its
# own so that a missing directory (for example on the very first run) does not
# prevent the remaining stale directories from being deleted.
removed_any = False
for stale_dir in (TRANSFORMED_DATA_DIR, TRANSFORM_ARTEFACTS_DIR, TEMP_DIR):
    try:
        tf.io.gfile.rmtree(stale_dir)
        removed_any = True
    except tf.errors.NotFoundError:
        # Nothing to clean up for this directory.
        pass
    except tf.errors.OpError as error:
        # Cleanup stays best-effort, but the failure is reported instead of
        # being silently swallowed.
        print('could not delete {}: {}'.format(stale_dir, error))

if removed_any:
    print('previous transformation files deleted!')

print('Launching {} job {} ... hang on'.format(runner, job_name))
print('')
run_transformation_pipeline(args)
print('Done!')

Launching DataflowRunner job preprocess-babweight-data-tft-230419-024433 ... hang on

Sample data size: 10000
Sink transformed data files location: gs://etsuji-tft1-babyweight/babyweight_tft/transformed
Sink transform artefact location: gs://etsuji-tft1-babyweight/babyweight_tft/transform
Temporary directory: gs://etsuji-tft1-babyweight/babyweight_tft/tmp
Runner: DataflowRunner
Debug enabled: False


  


  temp_location = pcoll.pipeline.options.view_as(






Instructions for updating:
Use ref() instead.


2023-04-19 02:44:39.859821: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-04-19 02:44:39.859881: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-04-19 02:44:39.859912: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (tensorflow-2-8-20230419-113852): /proc/driver/nvidia/version does not exist
2023-04-19 02:44:39.860393: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler fla










Done!


### Explore the produced artefacts

In [21]:
%%bash

echo 'transformed data:' 
gcloud storage ls gs://${BUCKET}/${ROOT_DIR}/transformed 
echo ''

echo 'transformed metadata:'  
gcloud storage ls gs://${BUCKET}/${ROOT_DIR}/transform/transformed_metadata 
echo ''

echo 'transform artefact:'   
gcloud storage ls gs://${BUCKET}/${ROOT_DIR}/transform/transform_fn 
echo ''

echo 'transform assets:'
gcloud storage ls gs://${BUCKET}/${ROOT_DIR}/transform/transform_fn/assets 

transformed data:
gs://etsuji-tft1-babyweight/babyweight_tft/transformed/eval-00000-of-00001.tfrecords
gs://etsuji-tft1-babyweight/babyweight_tft/transformed/train-00000-of-00001.tfrecords

transformed metadata:
gs://etsuji-tft1-babyweight/babyweight_tft/transform/transformed_metadata/
gs://etsuji-tft1-babyweight/babyweight_tft/transform/transformed_metadata/asset_map
gs://etsuji-tft1-babyweight/babyweight_tft/transform/transformed_metadata/schema.pbtxt

transform artefact:
gs://etsuji-tft1-babyweight/babyweight_tft/transform/transform_fn/
gs://etsuji-tft1-babyweight/babyweight_tft/transform/transform_fn/saved_model.pb
gs://etsuji-tft1-babyweight/babyweight_tft/transform/transform_fn/assets/
gs://etsuji-tft1-babyweight/babyweight_tft/transform/transform_fn/variables/

transform assets:
gs://etsuji-tft1-babyweight/babyweight_tft/transform/transform_fn/assets/
gs://etsuji-tft1-babyweight/babyweight_tft/transform/transform_fn/assets/is_male
gs://etsuji-tft1-babyweight/babyweight_tft/trans