## Install Requirements

In [None]:
!git clone https://github.com/Building-ML-Pipelines/building-machine-learning-pipelines

Cloning into 'building-machine-learning-pipelines'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (62/62), done.[K
remote: Total 740 (delta 31), reused 36 (delta 13), pack-reused 663[K
Receiving objects: 100% (740/740), 29.41 MiB | 16.89 MiB/s, done.
Resolving deltas: 100% (388/388), done.


In [None]:
%cd drive/My\ Drive/Building\ ML\ Pipelines/
!pip install -r requirements.txt
%cd ..
%cd ..

/content/drive/My Drive/Building ML Pipelines
Collecting tfx>=0.24.0
[?25l  Downloading https://files.pythonhosted.org/packages/fe/df/440d01bf102225718c844f5126159d71d16bdbe5deb8d451b95f16060767/tfx-0.24.0-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 7.1MB/s 
[?25hCollecting tensorboard_plugin_fairness_indicators>=0.24.0
  Downloading https://files.pythonhosted.org/packages/b2/cd/809d0c0df2fba0690b3cd7acb105e076576e5234235e42b3d36082f1108c/tensorboard_plugin_fairness_indicators-0.24.0-py3-none-any.whl
Collecting tensorflow_privacy>=0.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/41/ae/7db0dcf76a746314a174578a7b99ff098b40b908c4c693a955a2bbc0127b/tensorflow_privacy-0.5.1-py3-none-any.whl (149kB)
[K     |████████████████████████████████| 153kB 40.4MB/s 
Collecting witwidget
[?25l  Downloading https://files.pythonhosted.org/packages/a9/12/d61b3104cde5181e3340dbd7620885d4127f62e0ef4f738786e226683127/witwidget-1.7.0-py3-none-any.whl (2.3MB)


/content/drive/My Drive
/content/drive


In [None]:
#!/usr/bin/env python3

"""
Downloads the csv data
"""

import logging
import os
import shutil

import pandas as pd
import urllib3

# Initial dataset source
DATASET_URL = "http://bit.ly/building-ml-pipelines-dataset"

# Initial local dataset location
LOCAL_FILE_NAME = "data/consumer_complaints_with_narrative.csv"


def download_dataset(url=DATASET_URL):
    """download_dataset downloads the remote dataset to a local path

    Keyword Arguments:
        url {string} --
            complete url path to the csv data source (default: {DATASET_URL})
        local_path {string} --
            initial local file location (default: {LOCAL_FILE_NAME})
    Returns:
        None
    """
    # disable insecure https warning
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    c = urllib3.PoolManager()
    with c.request("GET", url, preload_content=False) as res, open(
        LOCAL_FILE_NAME, "wb"
    ) as out_file:
        shutil.copyfileobj(res, out_file)
    logging.info("Download completed.")


def create_folder():
    """Creates a data folder if it doesn't exist.

    Returns:
        None
    """
    directory = "data/"
    if not os.path.exists(directory):
        os.makedirs(directory)
        logging.info("Data folder created.")
    else:
        logging.info("Data folder already existed.")


def check_execution_path():
    """Check if the function and therefore all subsequent functions
        are executed from the root of the project

    Returns:
        boolean -- returns False if execution path isn't the root,
            otherwise True
    """
    file_name = "LICENSE"
    if not os.path.exists(file_name):
        logging.error(
            "Don't execute the script from a sub-directory. "
            "Switch to the root of the project folder"
        )
        return False
    return True


if __name__ == "__main__":

    logging.basicConfig(level=logging.INFO)
    logging.info("Started download script")

    if check_execution_path():
        create_folder()
        download_dataset()

    logging.info("Finished download script")

INFO:root:Started download script
INFO:root:Data folder created.
INFO:urllib3.poolmanager:Redirecting http://bit.ly/building-ml-pipelines-dataset -> https://drive.google.com/uc?export=download&id=1VHjb8L8n2d6eLz_lA-F-bk6Z0UecHpEF
INFO:urllib3.poolmanager:Redirecting https://drive.google.com/uc?export=download&id=1VHjb8L8n2d6eLz_lA-F-bk6Z0UecHpEF -> https://doc-0o-8s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/1hvf6qfiadeelnemkl9d0qngqasdtpvg/1601798625000/06616860426990197454/*/1VHjb8L8n2d6eLz_lA-F-bk6Z0UecHpEF?e=download
INFO:root:Download completed.
INFO:root:Finished download script


# Chapter 3: Data Ingestion

In [None]:
import tensorflow as tf

base_dir = "drive/My Drive/Building ML Pipelines/"
chap_dir = base_dir + "Chapter 3/"
data_dir = base_dir + "Data/"
out_dir = chap_dir + "Outputs/"
csv_data_dir = base_dir + "CSV Data/"
csv_dir = csv_data_dir + "consumer_complaints_with_narrative.csv"

## Concepts for Data Ingestion

#### TFRecord

See https://www.tensorflow.org/tutorials/load_data/tfrecord for details.

In [None]:
# Write binary to tfrecord
with tf.io.TFRecordWriter(out_dir + "test.tfrecord") as w:
    w.write(b"First record")
    w.write(b"Second record")

for record in tf.data.TFRecordDataset(out_dir + "test.tfrecord"):
    print(record)

tf.Tensor(b'First record', shape=(), dtype=string)
tf.Tensor(b'Second record', shape=(), dtype=string)


### Ingesting Local Data Files

#### Converting comma-seperated (CSV) data to tf.Example

In [None]:
import os

from tfx.components import CsvExampleGen
from tfx.utils.dsl_utils import external_input
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

In [None]:
examples = external_input(csv_data_dir)
# Instantiate the pipeline component
example_gen = CsvExampleGen(input=examples)

# Create context object
context = InteractiveContext(pipeline_root=out_dir)

# Execute component interactively
# Output highlights the storage locations of the training and the evaluation datasets
context.run(example_gen)

Instructions for updating:
external_input is deprecated, directly pass the uri to ExampleGen.






0,1
.execution_id,1
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7f2d6f8cdb00.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6f8cde80.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1) at 0x7f2d6f8e23c8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0.exec_properties['input_base']drive/My Drive/Building ML Pipelines/CSV Data/['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059['_beam_pipeline_args'][]"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6f8cde80.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1) at 0x7f2d6f8e23c8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6f8cde80.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1) at 0x7f2d6f8e23c8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']drive/My Drive/Building ML Pipelines/CSV Data/['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059['_beam_pipeline_args'][]"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6f8cde80.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1) at 0x7f2d6f8e23c8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1) at 0x7f2d6f8e23c8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1) at 0x7f2d6f8e23c8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],drive/My Drive/Building ML Pipelines/CSV Data/
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['custom_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059"
['_beam_pipeline_args'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6f8cde80.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1) at 0x7f2d6f8e23c8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1) at 0x7f2d6f8e23c8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1) at 0x7f2d6f8e23c8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0


### Folder Structure

#### Importing existing TFRecord Files

In [None]:
import os
from tfx.components import ImportExampleGen
from tfx.utils.dsl_utils import external_input

In [None]:
examples = external_input(out_dir + "CsvExampleGen/examples/1/train") # out_dir only allowed to contain one file
# Instantiate the pipeline component
example_gen = ImportExampleGen(input=examples)

# Create context object
context = InteractiveContext(pipeline_root=out_dir)

# Execute component interactively
# Output highlights the storage locations of the training and the evaluation datasets
context.run(example_gen)



0,1
.execution_id,2
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } ImportExampleGen at 0x7f2d6de4e208.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6db33080.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2) at 0x7f2dbf118940.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0.exec_properties['input_base']drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1/train['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:19764017,xor_checksum:1601992763,sum_checksum:1601992763['_beam_pipeline_args'][]"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6db33080.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2) at 0x7f2dbf118940.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6db33080.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2) at 0x7f2dbf118940.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1/train['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:19764017,xor_checksum:1601992763,sum_checksum:1601992763['_beam_pipeline_args'][]"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6db33080.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2) at 0x7f2dbf118940.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2) at 0x7f2dbf118940.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2) at 0x7f2dbf118940.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/1/train
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['custom_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:19764017,xor_checksum:1601992763,sum_checksum:1601992763"
['_beam_pipeline_args'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6db33080.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2) at 0x7f2dbf118940.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2) at 0x7f2dbf118940.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2) at 0x7f2dbf118940.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/2
.span,0
.split_names,"[""train"", ""eval""]"
.version,0


#### Converting Parquet-serialized data to tf.Example

Use generic file loader component FileBasedExampleGen, to override executor_class




In [None]:
from tfx.components import FileBasedExampleGen
from tfx.components.example_gen.custom_executors import parquet_executor
from tfx.utils.dsl_utils import external_input

In [None]:
examples = external_input(parquet_dir_path)
example_gen = FileBasedExampleGen(
    input=examples,
    executor_class=parquet_executor.Executor) # override the executor
)

#### Converting Avro-serialized data to tf.Example

Overriding executor_class can be expanded to almost any other file **type**

In [None]:
from tfx.components import FileBasedExampleGen
from tfx.components.example_gen.custom_executors import avro_executor
from tfx.utils.dsl_utils import external_input

In [None]:
examples = external_input(avro_dir_path)
example_gen = FileBasedExampleGen(
    input=examples,
    executor_class=avro_executor.Executor) # override the executor
)

#### Converting your custom data to TFRecord data structures

Convert structured data into TFRecord data structures

###### **Hint:** tf.Example is a simple but highly flexible data structure, which is a key-value mapping: {"string": "value}

In [None]:
import csv

import tensorflow as tf
from tqdm import tqdm

In [None]:
def _bytes_feature(value):
    return tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[value.encode()])
    )


def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def clean_rows(row):
    if not row["zip_code"]:
        row["zip_code"] = "99999"
    return row

def convert_zipcode_to_int(zipcode):
    if isinstance(zipcode, str) and "XX" in zipcode:
        zipcode = zipcode.replace("XX", "00")
    int_zipcode = int(zipcode)
    return int_zipcode

In [None]:
# original_data_file = "../../data/consumer_complaints_with_narrative.csv"
tfrecords_filename = out_dir + "TF Record Writer/consumer_complaints.tfrecord"

# Create TFRecordWrite object that saves to the path specified in tfrecord_filename
tf_record_writer = tf.io.TFRecordWriter(tfrecords_filename)

with open(csv_dir) as csv_file:
    reader = csv.DictReader(csv_file, delimiter=",", quotechar='"')

    for row in tqdm(reader):
        # Replaces missing data
        row = clean_rows(row)

        # tf.train.Example for every data record
        example = tf.train.Example(
            features=tf.train.Features(
                feature={
                    "product": _bytes_feature(row["product"]),
                    "sub_product": _bytes_feature(row["sub_product"]),
                    "issue": _bytes_feature(row["issue"]),
                    "sub_issue": _bytes_feature(row["sub_issue"]),
                    "state": _bytes_feature(row["state"]),
                    "zip_code": _int64_feature(convert_zipcode_to_int(row["zip_code"])),
                    "company": _bytes_feature(row["company"]),
                    "company_response": _bytes_feature(row["company_response"]),
                    "timely_response": _bytes_feature(row["timely_response"]),
                    "consumer_disputed": _bytes_feature(
                        row["consumer_disputed"]
                    ),
                }
            )
        )
        # Serialize the data structure
        tf_record_writer.write(example.SerializeToString())
    tf_record_writer.close()


66799it [00:07, 8416.20it/s]


In [None]:
examples = external_input(out_dir + "TF Record Writer") # out_dir only allowed to contain one file
# Instantiate the pipeline component
example_gen = ImportExampleGen(input=examples)

# Create context object
context = InteractiveContext(pipeline_root=out_dir)

# Execute component interactively
# Output highlights the storage locations of the training and the evaluation datasets
context.run(example_gen)



0,1
.execution_id,5
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } ImportExampleGen at 0x7f2d6cb76828.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6de3df98.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5) at 0x7f2dd608f160.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0.exec_properties['input_base']drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/TF Record Writer['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:2,total_bytes:23053985,xor_checksum:59,sum_checksum:3203986491['_beam_pipeline_args'][]"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6de3df98.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5) at 0x7f2dd608f160.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6de3df98.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5) at 0x7f2dd608f160.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/TF Record Writer['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:2,total_bytes:23053985,xor_checksum:59,sum_checksum:3203986491['_beam_pipeline_args'][]"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6de3df98.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5) at 0x7f2dd608f160.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5) at 0x7f2dd608f160.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5) at 0x7f2dd608f160.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/TF Record Writer
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['custom_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:2,total_bytes:23053985,xor_checksum:59,sum_checksum:3203986491"
['_beam_pipeline_args'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6de3df98.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5) at 0x7f2dd608f160.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5) at 0x7f2dd608f160.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5) at 0x7f2dd608f160.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/ImportExampleGen/examples/5
.span,0
.split_names,"[""train"", ""eval""]"
.version,0


In [None]:
# Read Google Cloud or AWS Simple Storage directly via external_input function
examples = external_input("gs://example_compliance_data/")
example_gen = CsvExampleGen(input=examples)

# Create context object
context = InteractiveContext(pipeline_root=out_dir)

# Execute component interactively
# Output highlights the storage locations of the training and the evaluation datasets
context.run(example_gen)

### Ingesting Remote Data Files

### Ingesting Data Directly from Databases

#### Google Cloud BigQuery



In [None]:
import os

In [None]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = data_dir + "Chatbot Test Project-5354f68ec738.json"

In [None]:
def implicit():
    from google.cloud import storage

    # If you don't specify credentials when constructing the client, the
    # client library will look for credentials in the environment.
    storage_client = storage.Client()

    # Make an authenticated API request
    buckets = list(storage_client.list_buckets())
    print(buckets)

implicit()

[<Bucket: chatbot-test-project-256610.appspot.com>, <Bucket: staging.chatbot-test-project-256610.appspot.com>]


In [29]:
# Query my own BigQuery table
from tfx.components import BigQueryExampleGen

In [30]:
query = """
    SELECT * FROM `<project_id>.<database>.<table_name>`
    """

example_gen = BigQueryExampleGen(query=query)

### Changes to the BigQueryExmapleGen Component

#### Presto Databases

Not working with current tfx version...

#### PrestoExampleGen Requires Separate Installation

In [None]:
# !git clone git@github.com:tensorflow/tfx.git && cd tfx/
# !git checkout v0.22.0

# %cd examples/custom_components/presto_example_gen
# !pip install -e .

## Data Preparation

### Splitting Datasets

#### Splitting one dataset into subsets



In [31]:
from tfx.components import CsvExampleGen
from tfx.proto import example_gen_pb2
from tfx.utils.dsl_utils import external_input

In [32]:
# Define preferred splits
# specify the ratio (with hash_buckets)
output = example_gen_pb2.Output(
    split_config=example_gen_pb2.SplitConfig(
        splits=[
                example_gen_pb2.SplitConfig.Split(name="train", hash_buckets=6),
                example_gen_pb2.SplitConfig.Split(name="eval", hash_buckets=2),
                example_gen_pb2.SplitConfig.Split(name="test", hash_buckets=2)
                ]
    )
)

examples = external_input(csv_data_dir)

# Add output_config argument
example_gen = CsvExampleGen(input=examples, output_config=output)

# Create context object
context = InteractiveContext(pipeline_root=out_dir)

# Execute component interactively
# Output highlights the storage locations of the training and the evaluation datasets
context.run(example_gen)



0,1
.execution_id,6
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7f2d6afd00f0.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6eefcc18.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6) at 0x7f2d6eefcda0.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6.span0.split_names[""train"", ""eval"", ""test""].version0.exec_properties['input_base']drive/My Drive/Building ML Pipelines/CSV Data/['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059['_beam_pipeline_args'][]"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6eefcc18.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6) at 0x7f2d6eefcda0.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6eefcc18.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6) at 0x7f2d6eefcda0.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6.span0.split_names[""train"", ""eval"", ""test""].version0"
.exec_properties,"['input_base']drive/My Drive/Building ML Pipelines/CSV Data/['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059['_beam_pipeline_args'][]"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6eefcc18.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6) at 0x7f2d6eefcda0.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6) at 0x7f2d6eefcda0.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6) at 0x7f2d6eefcda0.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0

0,1
['input_base'],drive/My Drive/Building ML Pipelines/CSV Data/
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }"
['output_data_format'],6
['custom_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059"
['_beam_pipeline_args'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6eefcc18.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6) at 0x7f2d6eefcda0.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6) at 0x7f2d6eefcda0.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6) at 0x7f2d6eefcda0.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0


In [33]:
for artifact in example_gen.outputs["examples"].get():
    print(artifact)

print(example_gen.exec_properties["output_config"])

Artifact(artifact: id: 6
type_id: 5
uri: "drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/6"
properties {
  key: "split_names"
  value {
    string_value: "[\"train\", \"eval\", \"test\"]"
  }
}
custom_properties {
  key: "input_fingerprint"
  value {
    string_value: "split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059"
  }
}
custom_properties {
  key: "payload_format"
  value {
    string_value: "FORMAT_TF_EXAMPLE"
  }
}
custom_properties {
  key: "span"
  value {
    string_value: "0"
  }
}
custom_properties {
  key: "state"
  value {
    string_value: "published"
  }
}
, artifact_type: id: 5
name: "Examples"
properties {
  key: "span"
  value: INT
}
properties {
  key: "split_names"
  value: STRING
}
properties {
  key: "version"
  value: INT
}
)
{
  "split_config": {
    "splits": [
      {
        "hash_buckets": 6,
        "name": "train"
      },
      {
        "hash_buckets": 2,
        "name": "ev

### Default Splits

#### Preserving existing splits

In [34]:
# Define preferred splits
# specify the ratio (with hash_buckets)
input = example_gen_pb2.Input(
        splits=[
                example_gen_pb2.Input.Split(name="train", pattern="train/*"),
                example_gen_pb2.Input.Split(name="eval", pattern="eval/*"),
                example_gen_pb2.Input.Split(name="test", pattern="test/*")
                ]
    
)

examples = external_input(csv_data_dir)

# Add output_config argument
example_gen = CsvExampleGen(input=examples, output_config=output)

# Create context object
context = InteractiveContext(pipeline_root=out_dir)

# Execute component interactively
# Output highlights the storage locations of the training and the evaluation datasets
context.run(example_gen)



0,1
.execution_id,7
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7f2d6eefca20.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6e4a0be0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7) at 0x7f2d6e46a9e8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7.span0.split_names[""train"", ""eval"", ""test""].version0.exec_properties['input_base']drive/My Drive/Building ML Pipelines/CSV Data/['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059['_beam_pipeline_args'][]"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6e4a0be0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7) at 0x7f2d6e46a9e8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6e4a0be0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7) at 0x7f2d6e46a9e8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7.span0.split_names[""train"", ""eval"", ""test""].version0"
.exec_properties,"['input_base']drive/My Drive/Building ML Pipelines/CSV Data/['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059['_beam_pipeline_args'][]"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6e4a0be0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7) at 0x7f2d6e46a9e8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7) at 0x7f2d6e46a9e8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7) at 0x7f2d6e46a9e8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0

0,1
['input_base'],drive/My Drive/Building ML Pipelines/CSV Data/
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }"
['output_data_format'],6
['custom_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059"
['_beam_pipeline_args'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6e4a0be0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7) at 0x7f2d6e46a9e8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7) at 0x7f2d6e46a9e8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7) at 0x7f2d6e46a9e8.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/7
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0


### Spanning Datasets

In [35]:
# Define preferred splits
input = example_gen_pb2.Input(
        splits=[
                # if not splitted data
                # example_gen_pb2.Input.Split(pattern="export-{SPAN}/*")

                # if already splitted data
                example_gen_pb2.Input.Split(name="train", pattern="export-{SPAN}/train/*"),
                example_gen_pb2.Input.Split(name="eval", pattern="export-{SPAN}/eval/*")
                ]
    
)

examples = external_input(csv_data_dir)

# Add output_config argument
example_gen = CsvExampleGen(input=examples, output_config=output)

# Create context object
context = InteractiveContext(pipeline_root=out_dir)

# Execute component interactively
# Output highlights the storage locations of the training and the evaluation datasets
context.run(example_gen)



0,1
.execution_id,8
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7f2d6cd91f60.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6f418ac8.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8) at 0x7f2d6f418f28.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0.exec_properties['input_base']drive/My Drive/Building ML Pipelines/CSV Data/['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059['_beam_pipeline_args'][]"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6f418ac8.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8) at 0x7f2d6f418f28.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6f418ac8.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8) at 0x7f2d6f418f28.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"
.exec_properties,"['input_base']drive/My Drive/Building ML Pipelines/CSV Data/['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }['output_data_format']6['custom_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059['_beam_pipeline_args'][]"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6f418ac8.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8) at 0x7f2d6f418f28.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8) at 0x7f2d6f418f28.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8) at 0x7f2d6f418f28.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0

0,1
['input_base'],drive/My Drive/Building ML Pipelines/CSV Data/
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }"
['output_data_format'],6
['custom_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1601799059,sum_checksum:1601799059"
['_beam_pipeline_args'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2d6f418ac8.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8) at 0x7f2d6f418f28.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8) at 0x7f2d6f418f28.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8) at 0x7f2d6f418f28.type<class 'tfx.types.standard_artifacts.Examples'>.uridrive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,drive/My Drive/Building ML Pipelines/Chapter 3/Outputs/CsvExampleGen/examples/8
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0


### Versioning Datasets

Currently not possible in TFX
Use external tools like:

 - **Data Version Control (DVC)**
 - **Pachyderm**

## Ingestion Strategies

### Structured Data

### Text Data for Natural Language Problems

### Image Data for Computer Vision Problems

Just an example code

In [36]:
import tensorflow as tf

In [None]:
img_dir = base_dir + "Img Data/"
filenames = os.listdir(img_dir)

def generate_label_from_path(image_path):
    """
    ...
    """
    return label


def _bytes_feature(value):
    return tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[value.encode()])
    )


def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


tfrecord_filename = data_dir + "image_dataset.tfrecord"

with tf.io.TFRecordWriter(tfrecord_filename) as writer:
    for img_path in filenames:
        image_path = os.path.join(base_dir, img_path)
        try:
            raw_file = tf.io.read_file(image_path)
        except FileNotFoundError:
            print("File {} could not be found".format(image_path))
            continue

    example = tf.train.Example(features=tf.train.Features(features={
        "image_raw": _bytes_feature(raw_file.numpy()),
        "label": _int64_feature(generate_label_from_path(image_path))
    }))
    writer.write(example.SerializeToString())

# References and Additional Resources

 - <a href="https://dvc.org/">Data Version Control</a>
 - <a href="https://www.pachyderm.com/">Pachyderm</a>