<a href="https://colab.research.google.com/github/Neo-glitch/t.f-2.0-practice/blob/master/Tf_Data_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow-data-validation

Collecting tensorflow-data-validation
  Downloading tensorflow_data_validation-1.7.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 15.0 MB/s 
[?25hCollecting pyfarmhash<0.4,>=0.2
  Downloading pyfarmhash-0.3.2.tar.gz (99 kB)
[K     |████████████████████████████████| 99 kB 8.6 MB/s 
Collecting tfx-bsl<1.8,>=1.7.0
  Downloading tfx_bsl-1.7.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (19.2 MB)
[K     |████████████████████████████████| 19.2 MB 159 kB/s 
Collecting apache-beam[gcp]<3,>=2.36
  Downloading apache_beam-2.38.0-cp37-cp37m-manylinux2010_x86_64.whl (10.2 MB)
[K     |████████████████████████████████| 10.2 MB 43.5 MB/s 
[?25hCollecting pyarrow<6,>=1
  Downloading pyarrow-5.0.0-cp37-cp37m-manylinux2014_x86_64.whl (23.6 MB)
[K     |████████████████████████████████| 23.6 MB 1.2 MB/s 
Collecting joblib<0.15,>=0.12
  Downloading joblib-0.14.1-py2.py3-none-any.whl (294 kB)
[K     |███████████████

In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_data_validation as tfdv


# for printing in python 2
# from __future__ import print_function

#### Dataset analysis

In [None]:
# Read the pollution measurements CSV (path is relative to the working directory).
dataset = pd.read_csv(filepath_or_buffer="pollution_small.csv")

# Cell output: (number of rows, number of columns).
dataset.shape

(2188, 5)

In [None]:
# Training split: the first 1600 rows, selected by position.
training_data = dataset.iloc[:1600]

# Cell output: summary statistics of the numeric columns in the training split.
training_data.describe()

Unnamed: 0,pm10,no2,so2,soot
count,1600.0,1600.0,1600.0,1600.0
mean,49.656494,30.980519,16.229981,21.551956
std,35.211906,12.400788,10.621896,12.127354
min,6.38,9.74,4.01,6.0
25%,28.345,22.5675,9.7775,14.4
50%,38.835,28.715,13.275,18.63
75%,58.05,36.37,19.2825,24.0725
max,277.25,138.01,123.13,107.65


In [None]:
# Test split: every row from position 1600 onwards.
test_set = dataset.iloc[1600:]

# Cell output: summary statistics of the numeric columns in the test split.
test_set.describe()

Unnamed: 0,pm10,no2,so2,soot
count,588.0,588.0,588.0,588.0
mean,44.648248,37.296922,13.60517,18.44131
std,28.992087,10.94005,5.098944,6.596459
min,11.9,15.07,4.99,8.0
25%,28.3375,29.2175,10.1225,14.41
50%,35.555,35.815,12.345,17.09
75%,50.8125,43.8725,15.855,20.9625
max,273.77,106.03,38.03,87.21


### Data Analysis and Validation with TFDV

In [None]:
# Generate statistics from the training split only. The schema inferred from
# these statistics is later used to validate the test split, so the test rows
# must not contribute to it (previously the full `dataset` was passed here).
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=training_data)

In [None]:
# Infer the schema from the training statistics.
schema = tfdv.infer_schema(statistics=train_stats)

In [None]:
# Render the inferred schema as a table (one row per feature).
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Date',BYTES,required,,-
'pm10',FLOAT,required,,-
'no2',FLOAT,required,,-
'so2',FLOAT,required,,-
'soot',FLOAT,required,,-


In [None]:
# Statistics of the test split, to be validated against the schema.
test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

### Compare Test Stats and Schema

In [None]:
# Check the test-split statistics against the schema.
anomalies = tfdv.validate_statistics(test_stats, schema)

# Show whatever anomalies the validation produced.
tfdv.display_anomalies(anomalies)

In [None]:
# Free-form notes kept as a bare string literal: a few examples of anomalies.
# Because the string is the last expression in the cell, its repr is echoed
# back as the cell output; it has no other effect.
"""
anomalies e.g
- having a value lower than 10 and having a value higher than 10 as input
- Cast error when what is expected and what is passed
- Integer value smaller than 0
"""

'\nanomalies e.g\n- having a value lower than 10 and having a value higher than 10 as input\n- Cast error when what is expected and what is passed\n- Integer value smaller than 0\n'

In [None]:
# Build a variant of the test split that lacks the target column ("soot"), to
# provoke an anomaly. `drop` returns a new frame, so `test_set` is left intact.
test_set_copy = test_set.drop(columns=["soot"])

In [None]:
# Statistics for the frame that no longer has the "soot" column.
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set_copy)

# Validate against the schema, which still lists "soot" as a feature.
anomalies_new = tfdv.validate_statistics(test_set_copy_stats, schema)

# Show the resulting anomalies.
tfdv.display_anomalies(anomalies_new)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'soot',Column dropped,Column is completely missing


In [None]:
# Declare the environments the schema applies to by default:
#   TRAINING - data used for training
#   SERVING  - data seen at prediction time, where the target column is absent
for env_name in ("TRAINING", "SERVING"):
    # Only add names that are missing, so re-running this cell does not
    # append duplicate entries to the schema.
    if env_name not in schema.default_environment:
        schema.default_environment.append(env_name)

In [None]:
# Mark the target column ("soot") as not present in the SERVING environment,
# while leaving it in place for TRAINING.
soot_feature = tfdv.get_feature(schema=schema, feature_path="soot")

# Guard the append so re-running this cell does not add "SERVING" twice.
if "SERVING" not in soot_feature.not_in_environment:
    soot_feature.not_in_environment.append("SERVING")

In [None]:
# Re-validate the statistics of the frame without "soot", this time against the
# schema as it applies in the SERVING environment.
serving_env_anomalies = tfdv.validate_statistics(
    statistics=test_set_copy_stats,
    schema=schema,
    environment="SERVING",
)

# Show whatever anomalies remain.
tfdv.display_anomalies(serving_env_anomalies)

In [None]:
# Persist the schema in text format so it can be reused later, e.g. by an
# end-to-end pipeline that does further data validation.
tfdv.write_schema_text(schema, "pollution_schema.pbtxt")