In [1]:
import tensorflow as tf

import pandas, warnings

# Install a blanket "ignore" filter covering every warning category
# (presumably to keep the cell outputs readable).
warnings.simplefilter('ignore')

In [2]:
# Read the comma-separated adult dataset; skipinitialspace discards the blanks
# that follow each delimiter.
data = pandas.read_csv('data/adult.data', skipinitialspace=True)

# Preview: first five rows, first seven columns.
data.head(5).iloc[:, :7]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty


In [3]:
# Preview: first five rows, columns from position 7 onwards.
data.head(5).iloc[:, 7:]

Unnamed: 0,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,Husband,White,Male,0,0,13,United-States,<=50K
2,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,Husband,Black,Male,0,0,40,United-States,<=50K
4,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Number of missing values in each column.
null = data.isna().sum()

# Total number of missing cells in the whole frame.
null.sum()

0

In [5]:
from sklearn.model_selection import train_test_split

# Reserve 20% of the rows for the test set. shuffle=False keeps the original
# row order, so the split is the same on every run.
train, test = train_test_split(
    data,
    test_size=0.2,
    shuffle=False,
)

In [6]:
from util import add_extra_rows

# Derive the "sample" frame from the test split.
# NOTE(review): add_extra_rows comes from the local util module, which is not
# visible here; presumably it appends extra (anomalous) rows. Confirm.
sample = add_extra_rows(test)

# Row count of the resulting frame.
len(sample.index)

6517

Generate Train, Test and Sample Statistics

In [7]:
from tensorflow_data_validation import generate_statistics_from_dataframe

from tensorflow_data_validation import visualize_statistics

# Compute descriptive statistics for the training split and render them.
train_stats = generate_statistics_from_dataframe(dataframe=train)

visualize_statistics(lhs_statistics=train_stats)

In [8]:
# Same statistics for the test split and for the sample frame, in that order.
test_stats, sample_stats = (
    generate_statistics_from_dataframe(frame) for frame in (test, sample)
)

Infer the Schema from the Train Statistics

In [9]:
from tensorflow_data_validation import infer_schema, display_schema

# Infer a schema from the training statistics and show it.
schemas = infer_schema(train_stats)

display_schema(schema=schemas)

In [10]:
# Compare test statistics (left-hand side) with train statistics (right-hand side).
statlist = dict(
    rhs_statistics=train_stats, rhs_name='Train Stats',
    lhs_statistics=test_stats, lhs_name='Test Stats',
)

visualize_statistics(**statlist)

In [11]:
# Compare test statistics (left-hand side) with the raw sample statistics (right-hand side).
statlist = {
    'rhs_statistics': sample_stats, 'rhs_name': 'Sample Stats',
    'lhs_statistics': test_stats, 'lhs_name': 'Test Stats',
}

visualize_statistics(**statlist)

In [12]:
# Keep only rows whose age is strictly greater than 16 and strictly less than
# 91, then discard any row that still contains a missing value.
sample = (
    sample
    .loc[lambda frame: frame['age'].gt(16) & frame['age'].lt(91)]
    .dropna()
)

In [13]:
# Recompute the statistics from the current contents of `sample`.
sample_stats = generate_statistics_from_dataframe(dataframe=sample)

In [14]:
# Compare test statistics (left-hand side) with the current sample statistics (right-hand side).
statlist = dict(
    rhs_statistics=sample_stats, rhs_name='Sample Stats',
    lhs_statistics=test_stats, lhs_name='Test Stats',
)

visualize_statistics(**statlist)

In [15]:
from tensorflow_data_validation import validate_statistics, display_anomalies

# Check the test statistics against the schema.
anomalies = validate_statistics(statistics=test_stats, schema=schemas)

# Number of entries in the anomaly map.
len(anomalies.anomaly_info)

0

In [16]:
# Check the sample statistics against the schema and render the findings.
anomalies = validate_statistics(statistics=sample_stats, schema=schemas)

display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'race',Unexpected string values,Examples contain values missing from the schema: Asian (<1%).
'occupation',Unexpected string values,Examples contain values missing from the schema: Gamer (<1%).
'native-country',Unexpected string values,Examples contain values missing from the schema: Mongolia (<1%).


In [17]:
from tensorflow_data_validation import get_feature, get_domain

# Relax the 'native-country' check by setting the required share of values
# inside the schema domain to 0.9.
country_feature = get_feature(schemas, 'native-country')

country_constraints = country_feature.distribution_constraints
country_constraints.min_domain_mass = 0.9

In [18]:
# Validate the sample statistics against the schema as it currently stands.
anomalies = validate_statistics(statistics=sample_stats, schema=schemas)

# Number of entries in the anomaly map.
len(anomalies.anomaly_info)

2

In [19]:
# Relax the 'occupation' check by setting the required share of values inside
# the schema domain to 0.9.
occupation_feature = get_feature(schemas, 'occupation')

occupation_constraints = occupation_feature.distribution_constraints
occupation_constraints.min_domain_mass = 0.9

In [20]:
# Validate the sample statistics against the schema as it currently stands.
anomalies = validate_statistics(statistics=sample_stats, schema=schemas)

# Number of entries in the anomaly map.
len(anomalies.anomaly_info)

1

In [21]:
# Fetch the domain attached to the 'race' feature in the schema.
races = get_domain(schemas, 'race')

# Values currently accepted by that domain.
list(races.value)

['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White']

In [22]:
# Accept 'Asian' as a valid value of the 'race' domain.
# The membership guard keeps the cell idempotent: without it, re-running the
# cell would append a duplicate entry to the schema domain each time.
if 'Asian' not in races.value:
    races.value.append('Asian')

races.value

['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White', 'Asian']

In [23]:
# Validate the sample statistics against the schema as it currently stands.
anomalies = validate_statistics(statistics=sample_stats, schema=schemas)

# Number of entries in the anomaly map.
len(anomalies.anomaly_info)

0

In [24]:
from tensorflow_data_validation import set_domain

from tensorflow_metadata.proto.v0 import schema_pb2

# Integer domain for 'age' with bounds min=17 and max=90.
limit = schema_pb2.IntDomain(
    name='age',
    min=17,
    max=90,
)

# Attach the domain to the 'age' feature of the schema.
set_domain(schemas, 'age', limit)

In [25]:
from tensorflow_data_validation.utils import slicing_util

# Slicing function keyed on 'age' for the listed values.
slices = slicing_util.get_feature_value_slicer(
    features={'age': [50, 60, 70, 80]},
)

In [26]:
from tensorflow_data_validation import StatsOptions

# Statistics options: use the schema (including for type inference) and apply
# the age slicer.
stats_options = StatsOptions(
    schema=schemas,
    slice_functions=[slices],
    infer_type_from_schema=True,
)

In [None]:
from tensorflow_data_validation import generate_statistics_from_csv

# Compute statistics straight from the CSV file using the options above.
raw_stats = generate_statistics_from_csv(
    data_location='data/adult.data',
    stats_options=stats_options,
)

In [28]:
# Names of every dataset contained in the statistics list.
[dataset.name for dataset in raw_stats.datasets]

['All Examples', 'age_50', 'age_70', 'age_60', 'age_80']

In [29]:
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList

# Pick the slice by name instead of by position. The order of
# raw_stats.datasets is not sorted (for example 'age_70' can come before
# 'age_60'), so a hard-coded index may select a different slice on another run.
slice_name = 'age_50'
slice_matches = [entry for entry in raw_stats.datasets if entry.name == slice_name]
if not slice_matches:
    raise ValueError('no sliced statistics named {!r} in raw_stats'.format(slice_name))

# Wrap the single selected slice in its own statistics list.
middle_stats_list = DatasetFeatureStatisticsList()

middle_stats_list.datasets.extend(slice_matches[:1])

middle_stats_list_name = slice_matches[0].name

middle_stats_list_name

'age_50'

In [30]:
# Render the statistics of the single selected slice.
visualize_statistics(lhs_statistics=middle_stats_list)