<a href="https://colab.research.google.com/github/KevinTheRainmaker/MLOps/blob/main/%08MLOps_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 데이터 검증

In [1]:
!pip install -q tensorflow-data-validation

[K     |████████████████████████████████| 1.4 MB 5.5 MB/s 
[K     |████████████████████████████████| 48 kB 4.1 MB/s 
[K     |████████████████████████████████| 294 kB 52.5 MB/s 
[K     |████████████████████████████████| 19.1 MB 1.2 MB/s 
[K     |████████████████████████████████| 9.9 MB 23.5 MB/s 
[K     |████████████████████████████████| 247 kB 73.2 MB/s 
[K     |████████████████████████████████| 2.3 MB 15.6 MB/s 
[K     |████████████████████████████████| 45 kB 2.8 MB/s 
[K     |████████████████████████████████| 62 kB 689 kB/s 
[K     |████████████████████████████████| 151 kB 20.7 MB/s 
[K     |████████████████████████████████| 183 kB 45.9 MB/s 
[K     |████████████████████████████████| 110 kB 56.4 MB/s 
[K     |████████████████████████████████| 435 kB 50.0 MB/s 
[K     |████████████████████████████████| 144 kB 48.3 MB/s 
[K     |████████████████████████████████| 255 kB 56.7 MB/s 
[K     |████████████████████████████████| 267 kB 47.5 MB/s 
[K     |██████████████████████

In [None]:
import tensorflow_data_validation as tfdv

# Load from CSV: compute dataset statistics directly from the CSV file.
stats = tfdv.generate_statistics_from_csv(
    data_location='/data/consumer_complaints.csv',
    delimiter=',')

# Load from TFRecord: the TFRecord reader must be given a TFRecord file, not
# the CSV. This rebinds `stats`, so the later cells use the statistics
# produced by whichever loader ran last.
stats = tfdv.generate_statistics_from_tfrecord(
    data_location='/data/consumer_complaints.tfrecord')

In [None]:
# Infer a schema from the `stats` statistics object.
schema = tfdv.infer_schema(stats)

In [None]:
# Display the schema in the notebook.
tfdv.display_schema(schema)

In [None]:
# Compute statistics for the training and validation TFRecord files.
train_stats = tfdv.generate_statistics_from_tfrecord(data_location='train_tfrecord_filename')
val_stats = tfdv.generate_statistics_from_tfrecord(data_location='val_tfrecord_filename')

# Show both side by side: validation on the left-hand side, training on the right.
tfdv.visualize_statistics(
    lhs_statistics=val_stats,
    rhs_statistics=train_stats,
    lhs_name='VAL_DATASET',
    rhs_name='TRAIN_DATASET',
)

In [None]:
# Validate the validation-set statistics against the schema.
anomalies = tfdv.validate_statistics(statistics=val_stats, schema=schema)

In [None]:
# Display the anomalies found by the validation step.
tfdv.display_anomalies(anomalies)

In [None]:
# Reference only (nothing in this cell executes): an anomaly entry in protobuf
# text format. It reports the "company" feature as "Column dropped" with
# severity ERROR because it was present in fewer examples than expected.
# NOTE(review): presumably a sample of what `anomalies` contains -- confirm.
# anomaly_info {
#     key: "company"
#     value {
#         description: "The feature was present in fewer examples than expected."
#         severity: ERROR
#         short_description: "Column dropped"
#         reason {
#             type: FEATURE_TYPE_LOW_FRACTION_PRESENT
#             short_description: "Column dropped"
#             description: "The feature was present in fewer examples than expected."
#         }
#         path {
#             step: "company"
#         }
#     }
# }

In [None]:
# Load a schema from its text file; this rebinds `schema`.
# NOTE(review): 'schema_location' looks like a placeholder path -- confirm.
schema = tfdv.load_schema_text('schema_location')

In [None]:
# Look up one feature in the schema and set its minimum presence fraction to 0.9.
# NOTE(review): 'feature_name' looks like a placeholder for a real feature -- confirm.
feature = tfdv.get_feature(schema, 'feature_name')
feature.presence.min_fraction = 0.9

In [None]:
# Remove Alaska (AK) from the list of US states in the 'state' domain.
state_domain = tfdv.get_domain(schema, 'state')
# Guard the removal so the cell can be re-run on an already-edited schema:
# calling remove() on a value that is no longer present raises ValueError.
if 'AK' in state_domain.value:
    state_domain.value.remove('AK')

In [None]:
# Write the (edited) schema back out as a text file.
tfdv.write_schema_text(schema, 'schema_location')

In [None]:
# Re-run validation of the validation-set statistics against the current schema
# and display whatever anomalies remain.
updated_anomalies = tfdv.validate_statistics(
    statistics=val_stats,
    schema=schema,
)
tfdv.display_anomalies(updated_anomalies)

In [None]:
# Serving statistics must exist before the skew comparison; without this the
# cell fails with NameError on a fresh kernel. The path is a placeholder in the
# same style as the train/validation file names used earlier in the notebook.
serving_stats = tfdv.generate_statistics_from_tfrecord(
    data_location='serving_tfrecord_filename'
)

# Set the threshold: L-infinity norm limit for skew on the 'company' feature.
tfdv.get_feature(schema, 'company').skew_comparator.infinity_norm.threshold = 0.01

# Compare training statistics against serving statistics.
skew_anomalies = tfdv.validate_statistics(statistics=train_stats,
                                          schema=schema,
                                          serving_statistics=serving_stats)

In [None]:
# Both daily statistics objects must exist before the drift comparison; without
# them the cell fails with NameError on a fresh kernel. The paths are
# placeholders in the same style as the other file names in this notebook.
train_stats_today = tfdv.generate_statistics_from_tfrecord(
    data_location='train_today_tfrecord_filename'
)
train_stats_yesterday = tfdv.generate_statistics_from_tfrecord(
    data_location='train_yesterday_tfrecord_filename'
)

# Set the threshold: L-infinity norm limit for drift on the 'company' feature.
tfdv.get_feature(schema, 'company').drift_comparator.infinity_norm.threshold = 0.01

# Compare today's training statistics against yesterday's.
drift_anomalies = tfdv.validate_statistics(statistics=train_stats_today,
                                           schema=schema,
                                           previous_statistics=train_stats_yesterday)

In [None]:
from tensorflow_data_validation.utils import slicing_util

# Slice on the 'state' feature with value CA. Feature values have to be
# supplied as a list of bytes values.
slice_fn1 = slicing_util.get_feature_value_slicer(features={'state': [b'CA']})

# Generate statistics from the CSV with the slicing function applied.
slice_options = tfdv.StatsOptions(slice_functions=[slice_fn1])
slice_stats = tfdv.generate_statistics_from_csv(
    data_location='./data/consumer-complaints.csv',
    stats_options=slice_options,
)

In [None]:
from tensorflow_metadata.proto.v0 import statistics_pb2

def display_slice_keys(stats):
  """Print the names of all datasets (slice keys) contained in `stats`.

  Args:
    stats: An object with a `datasets` sequence whose items expose `name`.
  """
  # Read from the `stats` argument rather than a notebook-level global, so the
  # function reports on whatever statistics object the caller passes in.
  print([dataset.name for dataset in stats.datasets])

def get_sliced_stats(stats, slice_key):
  """Return the statistics of a single slice, wrapped in its own list proto.

  Args:
    stats: Statistics list whose `datasets` entries each carry a `name`.
    slice_key: Name of the dataset (slice) to extract.

  Returns:
    A new `DatasetFeatureStatisticsList` holding a copy of the matching
    dataset, or None (after printing 'Invalid Slice Key') when no dataset
    is named `slice_key`.
  """
  for sliced_stats in stats.datasets:
    if sliced_stats.name == slice_key:
      result = statistics_pb2.DatasetFeatureStatisticsList()
      result.datasets.add().CopyFrom(sliced_stats)
      return result
  # Reached only after every dataset has been checked, so the message is
  # printed exactly once and also when `stats.datasets` is empty.
  print('Invalid Slice Key')
  return None

def compare_slices(stats, slice_key1, slice_key2):
  """Visualize two slices of `stats` side by side.

  Args:
    stats: Sliced statistics list containing both slices.
    slice_key1: Name of the slice shown on the left-hand side.
    slice_key2: Name of the slice shown on the right-hand side.
  """
  lhs_stats = get_sliced_stats(stats, slice_key1)
  rhs_stats = get_sliced_stats(stats, slice_key2)
  # Two statistics protos are compared with visualize_statistics.
  # validate_statistics expects a schema as its second argument, and its
  # result was being discarded, so the comparison showed nothing.
  tfdv.visualize_statistics(lhs_stats, rhs_stats)

# Visualization: show the statistics of the 'state_CA' slice.
tfdv.visualize_statistics(get_sliced_stats(slice_stats, 'state_CA'))

In [None]:
# Compare the 'state_CA' slice with the 'All Examples' dataset.
compare_slices(slice_stats, 'state_CA', 'All Examples')