In [1]:
import pandas

# Relative path of the comma-separated data file
tableurl = 'data/adult.data'

# read_csv uses ',' as its default separator; skipinitialspace drops the
# spaces that follow each delimiter so values are read without padding
table = pandas.read_csv(tableurl, skipinitialspace=True)

# (rows, columns) of the loaded frame
table.shape

(32561, 15)

In [2]:
# First five rows, first seven columns
table.head(5).iloc[:, :7]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty


In [3]:
# First five rows, remaining columns (from the eighth onwards)
table.head(5).iloc[:, 7:]

Unnamed: 0,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,Husband,White,Male,0,0,13,United-States,<=50K
2,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,Husband,Black,Male,0,0,40,United-States,<=50K
4,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Per-column count of missing cells
null = table.isna().sum()

# Grand total of missing cells across all columns
# NOTE(review): this only counts NaN/None; placeholder strings (e.g. '?')
# would not be counted as missing — confirm the file contains none
null.sum()

0

In [5]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for the test split
test_size = 0.2

# shuffle=False splits the rows in their existing order, so the split is
# identical on every run
train, test = train_test_split(
    table,
    shuffle=False,
    test_size=test_size,
)

In [6]:
# Total number of missing cells in the training split
train.isna().sum().sum()

0

In [7]:
# Total number of missing cells in the test split
test.isna().sum().sum()

0

Train and test row counts: `26048` & `6513`

#### `Train & Test`

In [8]:
from tensorflow_data_validation import generate_statistics_from_dataframe

from tensorflow_data_validation import visualize_statistics

from tensorflow_data_validation import infer_schema, validate_statistics

# Compute statistics for the training split, then render them
trainStats = generate_statistics_from_dataframe(dataframe=train)

visualize_statistics(lhs_statistics=trainStats)

In [9]:
# Compute statistics for the test split, then render them
testStats = generate_statistics_from_dataframe(dataframe=test)

visualize_statistics(lhs_statistics=testStats)

In [10]:
# Keyword arguments for a side-by-side view: train on the left, test on the right
statlist = {
    'lhs_statistics': trainStats,
    'lhs_name': 'Train',
    'rhs_statistics': testStats,
    'rhs_name': 'Test',
}

visualize_statistics(**statlist)

In [11]:
from tensorflow_data_validation import display_schema

# Derive a schema from the training statistics and show it
schemas = infer_schema(trainStats)

display_schema(schema=schemas)

In [12]:
# Check the test statistics against the schema inferred from the training split
anomalies = validate_statistics(statistics=testStats, schema=schemas)

# anomaly_info is a mapping; its length is the number of reported entries
TotalAnomalies = len(anomalies.anomaly_info)

TotalAnomalies

0

#### `Train & Other Test`

In [13]:
from util import add_extra_rows

# add_extra_rows comes from the local `util` module
sample = add_extra_rows(test)

# The result must hold exactly four rows more than the test split
assert len(sample) == len(test) + 4, 'Fail !'

In [14]:
# Total number of missing cells in the extended sample
sample.isna().sum().sum()

1

In [15]:
# Statistics for the extended sample
evalStats = generate_statistics_from_dataframe(dataframe=sample)

# Swap the right-hand side of the comparison; the left-hand entries stay as they are
statlist.update(rhs_statistics=evalStats, rhs_name='Sample')

visualize_statistics(**statlist)

In [16]:
# Keep rows whose age is strictly between 16 and 91, then drop rows with any missing value
in_range = sample['age'].gt(16) & sample['age'].lt(91)

sample = sample.loc[in_range].dropna()

# Number of rows left after cleaning
sample.shape[0]

6514

In [17]:
# Recompute statistics on the current contents of `sample`
evalStats = generate_statistics_from_dataframe(dataframe=sample)

# Point the right-hand side of the comparison at the fresh statistics
statlist.update(rhs_statistics=evalStats, rhs_name='Sample')

visualize_statistics(**statlist)

In [18]:
# Check the sample statistics against the schema
anomalies = validate_statistics(statistics=evalStats, schema=schemas)

# One entry per reported anomaly in the anomaly_info mapping
TotalAnomalies = len(anomalies.anomaly_info)

TotalAnomalies

3

In [19]:
from tensorflow_data_validation import display_anomalies

# Render the anomalies found by the latest validation
display_anomalies(anomalies=anomalies)

In [20]:
from tensorflow_data_validation import get_domain

# Domain (allowed values) recorded in the schema for 'race'
race_column = get_domain(schema=schemas, feature_path='race')

# 'Asian' must not be an accepted value at this point
assert 'Asian' not in race_column.value, 'Fail !'

In [21]:
# Domain (allowed values) recorded in the schema for 'occupation'
occ_column = get_domain(schema=schemas, feature_path='occupation')

# 'Gamer' must not be an accepted value
assert 'Gamer' not in occ_column.value, 'Fail !'

In [22]:
# Domain (allowed values) recorded in the schema for 'native-country'
countries_column = get_domain(schema=schemas, feature_path='native-country')

# 'Mongolia' must not be an accepted value
assert 'Mongolia' not in countries_column.value, 'Fail !'

#### `Resolve Anomalies`

In [23]:
from tensorflow_data_validation import get_feature

# Schema entry for the 'occupation' feature
occ_col_feature = get_feature(schema=schemas, feature_path='occupation')

# Show the type of its distribution-constraints message
type(occ_col_feature.distribution_constraints)

tensorflow_metadata.proto.v0.schema_pb2.DistributionConstraints

In [24]:
# Current min_domain_mass for 'occupation'
occ_constraints = occ_col_feature.distribution_constraints

occ_constraints.min_domain_mass

1.0

In [25]:
# Lower min_domain_mass for 'occupation' to 0.9 and echo the stored value
occ_constraints = occ_col_feature.distribution_constraints

occ_constraints.min_domain_mass = 0.9

occ_constraints.min_domain_mass

0.9

In [26]:
# Schema entry for the 'native-country' feature
countries_col_feat = get_feature(schema=schemas, feature_path='native-country')

# Current min_domain_mass for that feature
countries_col_feat.distribution_constraints.min_domain_mass

1.0

In [27]:
# Lower min_domain_mass for 'native-country' to 0.9 and echo the stored value
country_constraints = countries_col_feat.distribution_constraints

country_constraints.min_domain_mass = 0.9

country_constraints.min_domain_mass

0.9

In [28]:
# Validate again now that the schema has been edited
anomalies = validate_statistics(statistics=evalStats, schema=schemas)

# One entry per reported anomaly in the anomaly_info mapping
TotalAnomalies = len(anomalies.anomaly_info)

TotalAnomalies

1

In [29]:
# Accept 'Asian' as a valid value for 'race'.
# The membership guard makes this cell safe to re-run: an unconditional append
# would add a duplicate entry to the domain on every execution.
if 'Asian' not in race_column.value:
    race_column.value.append('Asian')

assert 'Asian' in race_column.value, 'Fail !'

In [30]:
# Validate once more after extending the 'race' domain
anomalies = validate_statistics(statistics=evalStats, schema=schemas)

# One entry per reported anomaly in the anomaly_info mapping
TotalAnomalies = len(anomalies.anomaly_info)

TotalAnomalies

0

In [31]:
from tensorflow_metadata.proto.v0 import schema_pb2

from tensorflow_data_validation import set_domain

# Restrict 'age' to integers from 17 to 90
limit = schema_pb2.IntDomain(max=90, min=17, name='age')

set_domain(schema=schemas, feature_path='age', domain=limit)

# Show the schema with the new domain attached
display_schema(schema=schemas)

In [32]:
from tensorflow_data_validation.utils import slicing_util

# Slice on the 'sex' feature.
# NOTE(review): a value of None appears to request one slice per distinct value
# (the slice names printed later are sex_Male and sex_Female) — confirm against the TFDV docs
slice_features = {'sex': None}

sex_slicer = slicing_util.get_feature_value_slicer(features=slice_features)

#### `Male & Female`

In [33]:
from tensorflow_data_validation import StatsOptions

# Statistics options: reuse the curated schema and apply the 'sex' slicer
slice_options = StatsOptions(
    schema=schemas,
    slice_functions=[sex_slicer],
    infer_type_from_schema=True,
)

In [34]:
# Write the training split to disk for the CSV-based statistics step.
# index=False keeps the DataFrame's row index out of the file; by default it is
# written as an extra unnamed first column, which would be picked up as an
# additional column when the file is read back.
train.to_csv('slice_train_sample.csv', index=False)

In [None]:
from tensorflow_data_validation import generate_statistics_from_csv

# CSV file holding the exported training split
url = 'slice_train_sample.csv'

# Statistics computed with the slicing options
train_slice_stats = generate_statistics_from_csv(
    data_location=url,
    stats_options=slice_options,
)

In [36]:
# Names of every dataset (slice) present in the sliced statistics
[dataset.name for dataset in train_slice_stats.datasets]

['All Examples', 'sex_Male', 'sex_Female']

In [37]:
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList

# Container that will hold only the male slice's statistics
male_stats_list = DatasetFeatureStatisticsList()

# Pick the slice by name instead of by position: nothing in this notebook fixes
# the order of `train_slice_stats.datasets`, so a hard-coded index could
# silently select the wrong slice.
male_slices = [ds for ds in train_slice_stats.datasets if ds.name == 'sex_Male']

assert len(male_slices) == 1, "Expected exactly one 'sex_Male' slice"

male_stats_list.datasets.extend(male_slices)

male_stats_name = male_slices[0].name

male_stats_name

'sex_Male'

In [38]:
# Container that will hold only the female slice's statistics
female_stats_list = DatasetFeatureStatisticsList()

# Pick the slice by name instead of by position: nothing in this notebook fixes
# the order of `train_slice_stats.datasets`, so a hard-coded index could
# silently select the wrong slice.
female_slices = [ds for ds in train_slice_stats.datasets if ds.name == 'sex_Female']

assert len(female_slices) == 1, "Expected exactly one 'sex_Female' slice"

female_stats_list.datasets.extend(female_slices)

female_stats_name = female_slices[0].name

female_stats_name

'sex_Female'

In [39]:
# Keyword arguments for a side-by-side view: male slice on the left, female on the right
statlist = {
    'lhs_statistics': male_stats_list,
    'lhs_name': male_stats_name,
    'rhs_statistics': female_stats_list,
    'rhs_name': female_stats_name,
}

visualize_statistics(**statlist)