# Preprocess data with TensorFlow Transform 

credits: [TFX Tutorials Transform](https://www.tensorflow.org/tfx/transform/get_started)

In [2]:
!pip install tensorflow-transform

Collecting tensorflow-transform
  Downloading tensorflow_transform-1.0.0-py3-none-any.whl (402 kB)
[K     |████████████████████████████████| 402 kB 3.1 MB/s eta 0:00:01
Collecting tfx-bsl<1.1.0,>=1.0.0
  Downloading tfx_bsl-1.0.0-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 10.7 MB/s eta 0:00:01
[?25hCollecting pyarrow<3,>=1
  Downloading pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl (17.7 MB)
[K     |████████████████████████████████| 17.7 MB 1.5 MB/s eta 0:00:01
Collecting apache-beam[gcp]<3,>=2.29
  Downloading apache_beam-2.29.0-cp36-cp36m-manylinux2010_x86_64.whl (9.7 MB)
[K     |████████████████████████████████| 9.7 MB 1.2 MB/s eta 0:00:01
[?25hCollecting tensorflow-metadata<1.1.0,>=1.0.0
  Downloading tensorflow_metadata-1.0.0-py3-none-any.whl (48 kB)
[K     |████████████████████████████████| 48 kB 1.9 MB/s eta 0:00:01
[?25hCollecting pydot<2,>=1.2
  Downloading pydot-1.4.2-py2.py3-none-any.whl (21

[K     |████████████████████████████████| 169 kB 673 kB/s eta 0:00:01
[?25hCollecting google-cloud-core<2,>=0.28.1; extra == "gcp"
  Downloading google_cloud_core-1.6.0-py2.py3-none-any.whl (28 kB)
Collecting google-cloud-language<2,>=1.3.0; extra == "gcp"
  Downloading google_cloud_language-1.3.0-py2.py3-none-any.whl (83 kB)
[K     |████████████████████████████████| 83 kB 372 kB/s eta 0:00:01
Collecting google-cloud-bigtable<2,>=0.31.1; extra == "gcp"
  Downloading google_cloud_bigtable-1.7.0-py2.py3-none-any.whl (267 kB)
[K     |████████████████████████████████| 267 kB 425 kB/s eta 0:00:01
[?25hCollecting google-cloud-pubsub<2,>=0.39.0; extra == "gcp"
  Downloading google_cloud_pubsub-1.7.0-py2.py3-none-any.whl (144 kB)
[K     |████████████████████████████████| 144 kB 771 kB/s eta 0:00:01
[?25hCollecting grpcio-gcp<1,>=0.2.2; extra == "gcp"
  Downloading grpcio_gcp-0.2.2-py2.py3-none-any.whl (9.4 kB)
Collecting google-cloud-spanner<2,>=1.13.0; extra == "gcp"
  Downloading goog

  Building wheel for avro-python3 (setup.py) ... [?25ldone
[?25h  Created wheel for avro-python3: filename=avro_python3-1.9.2.1-py3-none-any.whl size=43512 sha256=3fd6450a6cbefaae2e75bffd952a9a930d21ed084d74dc2843af333b95f21a50
  Stored in directory: /root/.cache/pip/wheels/4e/08/0c/727bff8f20fedbdeb8a2c5214e460b214d41c10dc879cf6dac
  Building wheel for future (setup.py) ... [?25ldone
[?25h  Created wheel for future: filename=future-0.18.2-py3-none-any.whl size=491059 sha256=8b476f2020a5c80e226fe9eb28d1f2a36cc3117ff72d8157c60be73b47954fb2
  Stored in directory: /root/.cache/pip/wheels/6e/9c/ed/4499c9865ac1002697793e0ae05ba6be33553d098f3347fb94
  Building wheel for google-apitools (setup.py) ... [?25ldone
[?25h  Created wheel for google-apitools: filename=google_apitools-0.5.31-py3-none-any.whl size=131041 sha256=1d56ee6bfe9aa61b85dbbbd487a628510baa6443c8f90dad89a33d873a5bbf89
  Stored in directory: /root/.cache/pip/wheels/19/90/12/279bd3b09f3c9a4a0a5416b196eeec18ebc0e11d90ac34215

### Preprocessing function 

In [4]:
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam

In [6]:
def preprocessing_fn(inputs):
    """Preprocessing function transforms each of three inputs in different ways
    
    Note:
        Input `inputs` must be a dictionary of `Tensor` or `SparseTensor`
    
    Args:
        inputs: Dictionary of `Tensor` or `SparseTensor` of the raw data
    
    Returns:
        A dictionary of `Tensor` or `SparseTensor` containing the transformed values
    """
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = x_centered * y_normalized
    
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }

In [11]:
raw_data = [
    {'x': 1, 'y': 1, 's': 'hello'},
    {'x': 2, 'y': 2, 's': 'world'},
    {'x': 3, 'y': 3, 's': 'hello'}
]

In [12]:
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

raw_data_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        'x': tf.io.FixedLenFeature([], tf.int64),
        'y': tf.io.FixedLenFeature([], tf.int64),
        's': tf.io.FixedLenFeature([], tf.string),
    })
)

In [17]:
import tensorflow_transform.beam as tft_beam

transformed_dataset, transform_fn = (
(raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
preprocessing_fn))
# transformed_data, transformed_metadata = transformed_dataset





ValueError: A tf.Transform function that required a temp dir was called but no temp dir was set.  To set a temp dir use the impl.Context context manager.

In [10]:
transformed_data

NameError: name 'transformed_data' is not defined