## Stage 1: Install all dependencies and set up the environment

In [1]:
#!apt-get install python-dev python-snappy

In [3]:
!pip install tensorflow-data-validation

Collecting tensorflow-data-validation
  Downloading tensorflow_data_validation-1.4.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.4 MB)
[?25l[K     |▎                               | 10 kB 26.1 MB/s eta 0:00:01[K     |▌                               | 20 kB 28.8 MB/s eta 0:00:01[K     |▊                               | 30 kB 12.4 MB/s eta 0:00:01[K     |█                               | 40 kB 9.5 MB/s eta 0:00:01[K     |█▏                              | 51 kB 5.4 MB/s eta 0:00:01[K     |█▍                              | 61 kB 6.0 MB/s eta 0:00:01[K     |█▋                              | 71 kB 5.7 MB/s eta 0:00:01[K     |█▉                              | 81 kB 6.4 MB/s eta 0:00:01[K     |██▏                             | 92 kB 4.8 MB/s eta 0:00:01[K     |██▍                             | 102 kB 5.2 MB/s eta 0:00:01[K     |██▋                             | 112 kB 5.2 MB/s eta 0:00:01[K     |██▉                             | 122 kB 5.2 MB/s eta 0:0

## Stage 2: Import project dependencies

In [3]:
#!pip install google-api-core

In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_data_validation as tfdv

from __future__ import print_function

## Stage 3: Simple dataset analysis

In [2]:
# Load the pollution sample from a CSV file in the current working directory.
dataset = pd.read_csv("pollution_small.csv")

In [3]:
# Inspect the dataset dimensions as (rows, columns) before splitting it.
dataset.shape

(2188, 5)

In [4]:
# Training split: the first 1600 rows, selected by position.
training_data = dataset.iloc[:1600]

In [5]:
# Summary statistics for the numeric columns of the training split.
training_data.describe()

Unnamed: 0,pm10,no2,so2,soot
count,1600.0,1600.0,1600.0,1600.0
mean,49.656494,30.980519,16.229981,21.551956
std,35.211906,12.400788,10.621896,12.127354
min,6.38,9.74,4.01,6.0
25%,28.345,22.5675,9.7775,14.4
50%,38.835,28.715,13.275,18.63
75%,58.05,36.37,19.2825,24.0725
max,277.25,138.01,123.13,107.65


In [6]:
# Test split: every row from position 1600 onwards.
test_set = dataset.iloc[1600:]

In [7]:
# Summary statistics for the numeric columns of the test split.
test_set.describe()

Unnamed: 0,pm10,no2,so2,soot
count,588.0,588.0,588.0,588.0
mean,44.648248,37.296922,13.60517,18.44131
std,28.992087,10.94005,5.098944,6.596459
min,11.9,15.07,4.99,8.0
25%,28.3375,29.2175,10.1225,14.41
50%,35.555,35.815,12.345,17.09
75%,50.8125,43.8725,15.855,20.9625
max,273.77,106.03,38.03,87.21


## Stage 3 (continued): Data analysis and validation with TFDV

### Generate training data statistics

In [8]:
# Compute statistics from the training split only. The schema is inferred from
# these statistics, so rows belonging to the held-out test set must not
# contribute to them.
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=training_data)

### Inferring the schema

In [9]:
# Infer a schema from the statistics computed for train_stats.
schema = tfdv.infer_schema(statistics=train_stats)

In [10]:
# Render the inferred schema as a table (type, presence, valency and domain per feature).
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Date',BYTES,required,,-
'pm10',FLOAT,required,,-
'no2',FLOAT,required,,-
'so2',FLOAT,required,,-
'soot',FLOAT,required,,-


### Calculate test set statistics

In [11]:
# Compute statistics for the test split so they can be validated against the schema.
test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

## Stage 4: Compare test statistics with the Schema

### Checking for anomalies in new data

In [12]:
# Compare the test-split statistics with the inferred schema and collect any deviations.
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

### Displaying all detected anomalies

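Examples of the kinds of anomalies TFDV can report (generic illustrations — the features in this dataset are FLOAT/BYTES, so these specific cases are not expected here):

- Integer larger than 10
- STRING type when expected INT type
- FLOAT type when expected INT type
- Integer smaller than 0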

In [13]:
# Show the result of validating the unmodified test split against the schema.
tfdv.display_anomalies(anomalies)

### New data WITH anomalies

In [14]:
# Work on a copy so that test_set itself stays intact when a column is removed.
test_set_copy = test_set.copy()

In [15]:
# Build the anomalous test set by removing the 'soot' column. drop() returns a
# new DataFrame and leaves test_set untouched; deriving the result from test_set
# (instead of mutating test_set_copy in place) keeps this cell idempotent, so it
# can be re-run without raising a KeyError for an already-removed column.
test_set_copy = test_set.drop(columns="soot")

### Statistics based on data with anomalies

In [16]:
# Statistics for the modified test set, which no longer contains the 'soot' column.
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set_copy)

In [17]:
# Validate the modified test set against the same schema.
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats, schema=schema)

In [18]:
# Show the anomalies reported for the modified test set.
tfdv.display_anomalies(anomalies_new)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'soot',Column dropped,Column is completely missing


## Stage 5: Prepare the schema for Serving

In [19]:
# Declare the environments every feature belongs to by default. The membership
# check keeps the cell idempotent: re-running it does not append duplicate
# environment names to the schema.
for environment_name in ("TRAINING", "SERVING"):
    if environment_name not in schema.default_environment:
        schema.default_environment.append(environment_name)

### Removing a target column from the Serving schema

In [20]:
# Mark the target column 'soot' as not expected in the SERVING environment.
# The membership check prevents a duplicate entry when the cell is re-run.
soot_feature = tfdv.get_feature(schema, "soot")
if "SERVING" not in soot_feature.not_in_environment:
    soot_feature.not_in_environment.append("SERVING")

### Checking for anomalies between the SERVING environment and new test set

In [21]:
# Validate the test set without 'soot' against the schema, this time evaluated
# for the SERVING environment.
serving_env_anomalies = tfdv.validate_statistics(
    statistics=test_set_copy_stats,
    schema=schema,
    environment="SERVING",
)

In [22]:
# Show the anomalies reported for the SERVING environment.
tfdv.display_anomalies(serving_env_anomalies)

## Freezing the schema:

In [23]:
# Persist the schema, including the environment settings made above, as a text
# file in the working directory.
tfdv.write_schema_text(
    schema=schema,
    output_path="pollution_schema.pbtxt",
)

In [24]:
# Visualize the statistics stored in train_stats.
tfdv.visualize_statistics(train_stats)