<a href="https://colab.research.google.com/github/Neo-glitch/t.f-2.0-practice/blob/master/Dataset_Preprocessing_with_TFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow-transform

Collecting tensorflow-transform
  Downloading tensorflow_transform-1.7.0-py3-none-any.whl (433 kB)
[?25l[K     |▊                               | 10 kB 16.4 MB/s eta 0:00:01[K     |█▌                              | 20 kB 20.0 MB/s eta 0:00:01[K     |██▎                             | 30 kB 15.4 MB/s eta 0:00:01[K     |███                             | 40 kB 11.2 MB/s eta 0:00:01[K     |███▉                            | 51 kB 5.4 MB/s eta 0:00:01[K     |████▌                           | 61 kB 6.4 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 7.2 MB/s eta 0:00:01[K     |██████                          | 81 kB 6.2 MB/s eta 0:00:01[K     |██████▉                         | 92 kB 6.8 MB/s eta 0:00:01[K     |███████▋                        | 102 kB 7.4 MB/s eta 0:00:01[K     |████████▎                       | 112 kB 7.4 MB/s eta 0:00:01[K     |█████████                       | 122 kB 7.4 MB/s eta 0:00:01[K     |█████████▉                      | 133 

In [None]:
import tempfile  # to create temp file for logging
import pandas as pd
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as tft_beam   # to provide context for apache beam(for faster preprocesing)

# to get metadata and schema info about dataset
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

#### Dataset Preprocessing

In [None]:
# Read the pollution measurements from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer="pollution_small.csv")

# Preview the first five rows (five is the default for head()).
dataset.head()

Unnamed: 0,Date,pm10,no2,so2,soot
0,1/1/2009,98.67,14.1,44.38,34.81
1,1/2/2009,52.33,14.1,29.75,33.06
2,1/3/2009,74.67,20.5,36.25,39.25
3,1/4/2009,72.0,17.3,46.44,34.38
4,1/5/2009,81.0,25.64,56.56,45.59


In [None]:
# Drop the "Date" column; every remaining column is used as a feature.
features = dataset.drop(columns=["Date"])

# Preview the first three rows of the feature frame.
features.head(n=3)

Unnamed: 0,pm10,no2,so2,soot
0,98.67,14.1,44.38,34.81
1,52.33,14.1,29.75,33.06
2,74.67,20.5,36.25,39.25


In [None]:
# Convert the dataset from a DataFrame to a list of Python dicts (the in-memory
# format passed to tft below): each DataFrame row becomes one dict whose keys
# are the column names and whose values are that row's cell values.

# orient="records" produces that list directly, in row order, and -- unlike
# going through orient="index" -- does not require the index to be unique.
dict_features = features.to_dict(orient="records")

# Show the first two rows as a sanity check.
dict_features[:2]

[{'no2': 14.1, 'pm10': 98.67, 'so2': 44.38, 'soot': 34.81},
 {'no2': 14.1, 'pm10': 52.33, 'so2': 29.75, 'soot': 33.06}]

In [None]:
# Dataset metadata, declared by hand: every feature listed below is described
# as a single float32 number per example (shape [] means "one scalar value").

data_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        {
            column: tf.io.FixedLenFeature([], tf.float32)
            for column in ("no2", "pm10", "so2", "soot")
        }
    )
)

# Display the resulting schema.
data_metadata

{'_schema': feature {
  name: "no2"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "pm10"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "so2"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "soot"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
}

In [None]:
# Preprocessing function(takes input dict and does some preprocessing)
def preprocessing_fn(inputs):
  """
  Preprocess the four pollutant features.

  inputs: dict keyed by feature name; must contain "no2", "pm10", "so2"
    and "soot".

  Returns a dict with the same four keys:
    "no2"  -> value minus tft.mean of the feature
    "pm10" -> tft.scale_to_0_1 of the feature
    "so2"  -> value minus tft.mean of the feature
    "soot" -> tft.scale_by_min_max of the feature
  """
  # Pull every raw feature out first so a missing key fails before any
  # transformation is attempted.
  raw_no2, raw_pm10, raw_so2, raw_soot = (
      inputs[key] for key in ("no2", "pm10", "so2", "soot")
  )

  # The transformations themselves are illustrative; pick whatever the use
  # case needs.
  return {
      "no2": raw_no2 - tft.mean(raw_no2),
      "pm10": tft.scale_to_0_1(raw_pm10),
      "so2": raw_so2 - tft.mean(raw_so2),
      "soot": tft.scale_by_min_max(raw_soot),
  }




In [None]:
def data_transform(max_rows=None):
  """
  Run the analyze-and-transform pass over `dict_features` using
  `preprocessing_fn` and print every raw row next to its transformed version.

  max_rows: optional cap on how many raw/transformed pairs are printed.
    The default (None) prints every row.

  Returns None; the function only prints. If this were a stage of a larger
  pipeline it would return the transformed data instead.
  """
  from itertools import islice

  # tft needs a scratch directory for its intermediate files. Using
  # TemporaryDirectory (instead of a bare mkdtemp) removes it again when the
  # block exits, so repeated calls do not pile up directories under /tmp.
  with tempfile.TemporaryDirectory() as temp_dir:
    with tft_beam.Context(temp_dir=temp_dir):
      transformed_dataset, _transform_fn = (
          (dict_features, data_metadata)
          | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
      )

      # The transformed dataset is a (data, metadata) pair; only the data is
      # needed for printing.
      transformed_data, _transformed_metadata = transformed_dataset

      # Pair each raw row with the transformed row at the same position.
      row_pairs = zip(dict_features, transformed_data)
      for raw_row, transformed_row in islice(row_pairs, max_rows):
        print("Raw: ", raw_row)
        print("Transformed", transformed_row)

    



In [None]:
# Run the transformation: prints each raw row followed by its transformed version.
data_transform()







Instructions for updating:
Use ref() instead.


Instructions for updating:
Use ref() instead.






INFO:tensorflow:Assets written to: /tmp/tmpuw8o8ma2/tftransform_tmp/dab4dd71cbd54f52905ec0df9a4b994c/assets


INFO:tensorflow:Assets written to: /tmp/tmpuw8o8ma2/tftransform_tmp/dab4dd71cbd54f52905ec0df9a4b994c/assets


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:Assets written to: /tmp/tmpuw8o8ma2/tftransform_tmp/a10b8bc767fe485bae80fa92c1f3b4cb/assets


INFO:tensorflow:Assets written to: /tmp/tmpuw8o8ma2/tftransform_tmp/a10b8bc767fe485bae80fa92c1f3b4cb/assets


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


Raw:  {'pm10': 98.67, 'no2': 14.1, 'so2': 44.38, 'soot': 34.81}
Transformed {'no2': -18.577982, 'pm10': 0.34071696, 'so2': 28.855408, 'soot': 0.2834235}
Raw:  {'pm10': 52.33, 'no2': 14.1, 'so2': 29.75, 'soot': 33.06}
Transformed {'no2': -18.577982, 'pm10': 0.16963857, 'so2': 14.225408, 'soot': 0.26620758}
Raw:  {'pm10': 74.67, 'no2': 20.5, 'so2': 36.25, 'soot': 39.25}
Transformed {'no2': -12.177982, 'pm10': 0.25211358, 'so2': 20.725407, 'soot': 0.3271028}
Raw:  {'pm10': 72.0, 'no2': 17.3, 'so2': 46.44, 'soot': 34.38}
Transformed {'no2': -15.377983, 'pm10': 0.24225645, 'so2': 30.915405, 'soot': 0.2791933}
Raw:  {'pm10': 81.0, 'no2': 25.64, 'so2': 56.56, 'soot': 45.59}
Transformed {'no2': -7.037983, 'pm10': 0.2754827, 'so2': 41.035408, 'soot': 0.38947368}
Raw:  {'pm10': 147.0, 'no2': 25.2, 'so2': 42.38, 'soot': 69.33}
Transformed {'no2': -7.4779816, 'pm10': 0.51914203, 'so2': 26.855408, 'soot': 0.6230202}
Raw:  {'pm10': 185.0, 'no2': 21.91, 'so2': 71.69, 'soot': 84.06}
Transformed {'no2'