In [None]:
import os, tempfile, urllib, zipfile, pandas

import tensorflow_data_validation as tfdv

In [None]:
# Read data/diabetic_data.csv into a DataFrame; cells holding '?' are parsed
# as missing values.
url = 'data/diabetic_data.csv'
data = pandas.read_csv(url, na_values='?')

# Show (rows, columns) as a sanity check on the load.
data.shape

(101766, 50)

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the rows that goes into the training split.
train_split = 0.7

# No shuffling is applied, so the split follows the original row order;
# everything not placed in `train` is held out in `test`.
train, test = train_test_split(data, shuffle=False, train_size=train_split)

# Number of training rows, displayed as the cell output.
total_train = len(train)
total_train

71236

In [None]:
# Split the held-out rows in two: one part stays in `test`, the other becomes
# `eval` (again without shuffling).
# NOTE(review): `eval` shadows the Python builtin of the same name; the name is
# kept because other cells refer to it.
# NOTE(review): `test` is reassigned from itself, so re-running this cell
# without re-running the previous split halves it again.
test_split = 0.5
test, eval = train_test_split(test, shuffle=False, test_size=test_split)

# Number of test rows, displayed as the cell output.
total_test = len(test)
total_test

15265

In [None]:
# The test and eval splits must contain the same number of rows.
assert len(test) == len(eval), 'Fail !'

In [None]:
from tensorflow_data_validation import StatsOptions

# Columns left out of the statistics.
remopal = ['encounter_id', 'patient_nbr']

# Every remaining column, in the DataFrame's own column order.
usecols = [column for column in data.columns if column not in remopal]

# Restrict statistics generation to the allow-listed columns.
stats_options = StatsOptions(feature_allowlist=usecols)

In [None]:
from tensorflow_data_validation import generate_statistics_from_dataframe

# Compute statistics for the train, test and eval splits (in that order),
# all with the same StatsOptions so the feature set is identical.
trainStats, testStats, evalStats = (
    generate_statistics_from_dataframe(split, stats_options=stats_options)
    for split in (train, test, eval)
)

In [None]:
# Count the features recorded for the first dataset entry of the training
# statistics and display the result.
train_features = trainStats.datasets[0].features
totalFeatures = len(train_features)
totalFeatures

48

In [None]:
from tensorflow_data_validation import visualize_statistics

# Render the statistics of the training split on their own.
visualize_statistics(lhs_statistics=trainStats)

In [None]:
from tensorflow_data_validation import infer_schema, display_schema

# Infer a schema from the training statistics only; the other splits are
# validated against it later.
schemas = infer_schema(statistics=trainStats)

# Render the inferred schema in the notebook.
display_schema(schema=schemas)

In [None]:
# The inferred schema must describe exactly as many features as the training
# statistics contain.
assert totalFeatures == len(schemas.feature), 'Fail !'

In [None]:
# Keyword arguments for a side-by-side comparison of the eval statistics
# (left-hand side) and the training statistics (right-hand side).
# The display label for the left-hand side belongs under 'lhs_name'; storing it
# under 'lhs_statistics' would replace the eval statistics with the string
# 'Eval' and leave the label unset.
statlist = {
    'lhs_statistics': evalStats,
    'lhs_name': 'Eval',
    'rhs_statistics': trainStats,
    'rhs_name': 'Train',
}

visualize_statistics(**statlist)

In [None]:
# Number of distinct non-missing values of this column in the training split.
train["glimepiride-pioglitazone"].nunique(dropna=True)

1

In [None]:
# Number of distinct non-missing values of the same column in the test split,
# for comparison with the training split.
test["glimepiride-pioglitazone"].nunique(dropna=True)

2

In [None]:
from tensorflow_data_validation import validate_statistics

# Check the test statistics against the schema inferred from the training
# split.
anomalies = validate_statistics(testStats, schemas)

# Number of entries in the anomaly map, displayed as the cell output.
TotalAnomalies = len(anomalies.anomaly_info)
TotalAnomalies

2

In [None]:
from tensorflow_data_validation import display_anomalies

# Render the anomalies found in the test split.
display_anomalies(anomalies=anomalies)

In [None]:
from tensorflow_data_validation import get_domain

# Look up the domain of the 'medical_specialty' feature in the inferred schema.
medical_specialty_cols = get_domain(schemas, 'medical_specialty')

# Precondition for the next cell: 'Neurophysiology' is not yet among the
# domain's values.
assert 'Neurophysiology' not in medical_specialty_cols.value, 'Fail !'

In [None]:
# Accept 'Neurophysiology' as a valid 'medical_specialty' value in the schema.
# The membership guard makes the cell safe to re-run: a second execution no
# longer appends a duplicate entry to the domain.
if 'Neurophysiology' not in medical_specialty_cols.value:
    medical_specialty_cols.value.append('Neurophysiology')

In [None]:
# Look up the domain of the 'glimepiride-pioglitazone' feature in the inferred
# schema.
glipio_cols = get_domain(schemas, 'glimepiride-pioglitazone')

# Precondition for the next cell: 'Steady' is not yet among the domain's
# values.
assert 'Steady' not in glipio_cols.value, 'Fail !'

In [None]:
# Accept 'Steady' as a valid 'glimepiride-pioglitazone' value in the schema.
# The membership guard makes the cell safe to re-run: a second execution no
# longer appends a duplicate entry to the domain.
if 'Steady' not in glipio_cols.value:
    glipio_cols.value.append('Steady')

In [None]:
# Validate the test statistics again, now that the schema domains have been
# extended in the preceding cells.
anomalies = validate_statistics(testStats, schemas)

# Number of entries left in the anomaly map, displayed as the cell output.
TotalAnomalies = len(anomalies.anomaly_info)
TotalAnomalies

0