# TensorFlow Transform

### Install TensorFlow Transform

In [None]:
!pip install -U tensorflow_transform

Collecting tensorflow_transform
  Downloading tensorflow_transform-1.14.0-py3-none-any.whl (447 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 447.8/447.8 kB 4.9 MB/s eta 0:00:00
Collecting apache-beam[gcp]<3,>=2.47 (from tensorflow_transform)
  Downloading apache_beam-2.50.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.7/14.7 MB 10.5 MB/s eta 0:00:00
Collecting pyarrow<11,>=10 (from tensorflow_transform)
  Downloading pyarrow-10.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 35.9/35.9 MB 10.1 MB/s eta 0:00:00
Collecting tfx-bsl<1.15.0,>=1.14.0 (from tensorflow_transform)
  Downloading tfx_bsl-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ [output truncated]

### Import packages

In [1]:
import os
import tempfile

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam

from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

### Define a preprocessing function

In [2]:
def preprocessing_fn(inputs):
  """Build the transformed features from the raw input features.

  Args:
    inputs: Mapping indexed by the keys 'x', 'y' and 's'. The values are used
      with arithmetic operators and passed to `tft` functions (presumably
      tensors -- confirm against the caller).

  Returns:
    A dict with four entries:
      'x_centered': `x` minus `tft.mean(x)`.
      'y_normalized': result of `tft.scale_to_0_1(y)`.
      'x_centered_times_y_normalized': product of the two entries above.
      's_integerized': result of `tft.compute_and_apply_vocabulary(s)`.
  """
  raw_x, raw_y, raw_s = inputs['x'], inputs['y'], inputs['s']

  # The three tft calls are kept in the same order as before.
  centered = raw_x - tft.mean(raw_x)
  scaled = tft.scale_to_0_1(raw_y)
  vocab_ids = tft.compute_and_apply_vocabulary(raw_s)

  outputs = {'x_centered': centered, 'y_normalized': scaled}
  outputs['x_centered_times_y_normalized'] = centered * scaled
  outputs['s_integerized'] = vocab_ids
  return outputs


### Define raw_data

In [3]:
# Three toy records: in each one 'x' and 'y' carry the same integer (1, 2, 3)
# and 's' is the string paired with it.
raw_data = [
    dict(x=number, y=number, s=word)
    for number, word in enumerate(('hello', 'world', 'hello'), start=1)
]

# Apache Beam Implementation

### Create raw data metadata

In [11]:
# Feature spec for the raw records: 'y' and 'x' are float32 features and 's'
# is a string feature, each declared with shape [].
raw_feature_spec = dict(
    y=tf.io.FixedLenFeature([], tf.float32),
    x=tf.io.FixedLenFeature([], tf.float32),
    s=tf.io.FixedLenFeature([], tf.string),
)

# Convert the feature spec to a schema and wrap it in DatasetMetadata.
raw_schema = schema_utils.schema_from_feature_spec(raw_feature_spec)
raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

### Create an Apache Beam pipeline

In [12]:
# Scratch directory handed to the tft_beam context; it is created with
# mkdtemp() and is not removed by this cell.
analysis_temp_dir = tempfile.mkdtemp()

with tft_beam.Context(temp_dir=analysis_temp_dir):
  # Pipe the (data, metadata) pair through AnalyzeAndTransformDataset built
  # from preprocessing_fn; the result unpacks into the transformed dataset
  # and the transform function.
  analyze_and_transform = tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
  transformed_dataset, transform_fn = (
      (raw_data, raw_data_metadata) | analyze_and_transform)



In [6]:
# transformed_dataset unpacks into exactly two parts: the data and its metadata.
(transformed_data, transformed_metadata) = transformed_dataset

# Bare last expression so the notebook displays the transformed data.
transformed_data