In [1]:
import os
import tensorflow as tf
import tensorflow_model_analysis as tfma
import tensorflow_data_validation as tfdv
from absl import logging as absl_logging
from apache_beam import logging as beam_logging

tf.get_logger().setLevel('ERROR')
absl_logging.set_verbosity('ERROR')
beam_logging.getLogger().setLevel('ERROR')

print('TF version: {}'.format(tf.__version__))
print('TFMA version: {}'.format(tfma.__version__))
print('TFDV version: {}'.format(tfdv.__version__))

2023-09-20 20:48:28.183379: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-20 20:48:28.218949: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-20 20:48:28.475631: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-20 20:48:28.478366: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TF version: 2.13.0
TFMA version: 0.44.0
TFDV version: 1.13.0


In [2]:
# imports for helper function
import csv
from tensorflow.core.example import example_pb2
from tensorflow_metadata.proto.v0 import schema_pb2

# Load data test

In [3]:
import os

In [4]:
CSV_DIR = '../data/data_test/'

In [5]:
# Path to the full test set
TEST_DATA_PATH = os.path.join(CSV_DIR, 'data_test.csv')

# Preview the first few rows
!head {TEST_DATA_PATH}

age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0
34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,0
29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,0
63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,1
24,Private,369667,S

# Load schema

In [6]:
SCHEMA_FILE = "../pipelines_local/census-pipelines/SchemaGen/schema/4/schema.pbtxt"

In [7]:
# Load the schema as a protocol buffer
SCHEMA = tfdv.load_schema_text(SCHEMA_FILE)

# Display the schema
tfdv.display_schema(SCHEMA)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'age',INT,required,,-
'capital-gain',INT,required,,-
'capital-loss',INT,required,,-
'education',STRING,required,,'education'
'education-num',INT,required,,-
'fnlwgt',INT,required,,-
'hours-per-week',INT,required,,-
'label',INT,required,,-
'marital-status',STRING,required,,'marital-status'
'native-country',STRING,required,,'native-country'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'education',"'10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college'"
'marital-status',"'Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'"
'native-country',"'?', 'Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece', 'Guatemala', 'Haiti', 'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia'"
'occupation',"'?', 'Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'"
'race',"'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'"
'relationship',"'Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried', 'Wife'"
'sex',"'Female', 'Male'"
'workclass',"'?', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay'"


## Use the Schema to Create TFRecords

In [8]:
def csv_to_tfrecord(schema, csv_file, tfrecord_file):
  ''' Converts a csv file into a tfrecord
  Args:
    schema (schema_pb2)  Schema protobuf from TFDV
    csv_file (string) - file to convert to tfrecord-
    tfrecord_file (string) - filename of tfrecord to create

  Returns:
    filename of tfrecord
  '''

  # Open CSV file for reading. Each row is mapped as a dictionary.
  reader = csv.DictReader(open(csv_file, 'r'))
  
  # Initialize TF examples list
  examples = []

  # For each row in CSV, create a TF Example based on
  # the Schema and append to the list
  for line in reader:

    # Intialize example
    example = example_pb2.Example()

    # Loop through features in the schema
    for feature in schema.feature:

      # Get current feature name
      key = feature.name

      # Populate values based on data type of current feature
      if feature.type == schema_pb2.FLOAT:
        example.features.feature[key].float_list.value[:] = (
            [float(line[key])] if len(line[key]) > 0 else [])
      elif feature.type == schema_pb2.INT:
        example.features.feature[key].int64_list.value[:] = (
            [int(line[key])] if len(line[key]) > 0 else [])
      elif feature.type == schema_pb2.BYTES:
        example.features.feature[key].bytes_list.value[:] = (
            [line[key].encode('utf8')] if len(line[key]) > 0 else [])
        
    # Append to the list
    examples.append(example)

  # Write examples to tfrecord file
  with tf.io.TFRecordWriter(tfrecord_file) as writer:
    for example in examples:
      writer.write(example.SerializeToString())
  
  return tfrecord_file

In [9]:
path = "../data/TFRecord"

In [10]:
file_created=csv_to_tfrecord(SCHEMA,TEST_DATA_PATH,os.path.join(path,"record_for_test.tfrecord"))

In [11]:
print(f'files created: {file_created}')

files created: ../data/TFRecord/record_for_test.tfrecord


In [12]:
path_record = file_created

## Load model trained

In [13]:
import tensorflow as tf

In [14]:
model_path = "../serving-dir/1694455820/"

In [15]:
loaded_model = tf.keras.models.load_model(model_path)
loaded_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 fnlwgt (InputLayer)         [(None, 1)]                  0         []                            
                                                                                                  
 education-num (InputLayer)  [(None, 1)]                  0         []                            
                                                                                                  
 capital-gain (InputLayer)   [(None, 1)]                  0         []                            
                                                                                                  
 capital-loss (InputLayer)   [(None, 1)]                  0         []                            
                                                                                            

In [16]:
# Transformation layer can be accessed in two ways. These are equivalent.
loaded_model.get_layer('transform_features_layer_1') is loaded_model.tft_layer

True

In [17]:
inference_fn = loaded_model.signatures['serving_default']
inference_fn

<ConcreteFunction signature_wrapper_serve_tf_examples_fn(*, examples) at 0x7F09DF2BCDF0>

In [18]:
inference_fn.structured_input_signature

((), {'examples': TensorSpec(shape=(None,), dtype=tf.string, name='examples')})

# Test batch predict 

In [19]:
from tensorflow_transform.tf_metadata import schema_utils

# Load one tfrecord
tfrecord_file = tf.data.TFRecordDataset(path_record)

# Parse schema object as a feature spec
feature_spec = schema_utils.schema_as_feature_spec(SCHEMA).feature_spec

# Create a batch from the dataset
for records in tfrecord_file.batch(1).take(1):

  # Parse the batch to get a dictionary of raw features
  parsed_examples = tf.io.parse_example(records, feature_spec)

  # Print the results
  print("\nRAW FEATURES:")
  for key, value in parsed_examples.items():
    print(f'{key}: {value.numpy()}')
  
  # Pop the label since the model does not expect a label input
  parsed_examples.pop('label')

  # Transform the rest of the raw features using the transform layer
  transformed_examples = loaded_model.tft_layer(parsed_examples)

  # Print the input to the model
  print("\nTRANSFORMED FEATURES:")
  for key, value in transformed_examples.items():
    print(f'{key}: {value.numpy()}')


RAW FEATURES:
age: [[25]]
capital-gain: [[0]]
capital-loss: [[0]]
education: [[b'11th']]
education-num: [[7]]
fnlwgt: [[226802]]
hours-per-week: [[40]]
label: [[0]]
marital-status: [[b'Never-married']]
native-country: [[b'United-States']]
occupation: [[b'Machine-op-inspct']]
race: [[b'Black']]
relationship: [[b'Own-child']]
sex: [[b'Male']]
workclass: [[b'Private']]

TRANSFORMED FEATURES:
fnlwgt: [0.14569008]
marital-status: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
workclass: [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
education-num: [0.4]
capital-gain: [0.]
age: [[1. 0. 0. 0.]]
sex: [[1. 0. 0. 0. 0. 0. 0.]]
capital-loss: [0.]
hours-per-week: [0.39795917]
relationship: [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
race: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
education: [[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
occupation: [[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
native-country: [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [20]:
from tensorflow_transform.tf_metadata import schema_utils


# Load one tfrecord
tfrecord_file = tf.data.TFRecordDataset(path_record)

# Parse schema object as a feature spec
feature_spec = schema_utils.schema_as_feature_spec(SCHEMA).feature_spec
# Create a batch from the dataset
for records in tfrecord_file.batch(30).take(1):

  # Get the label values from the raw input
  parsed_examples = tf.io.parse_example(records, feature_spec)
  print("type : " , type(parsed_examples), "\nparsed_examples : ",parsed_examples)
  y_true = parsed_examples.pop('label')
  print(f'labels:\n {y_true.numpy()}\n')
  
  # Transform the raw features and pass to the model to get predictions
  transformed_examples = loaded_model.tft_layer(parsed_examples)
  
  y_pred = loaded_model(transformed_examples)
  print(f'predictions:\n {y_pred.numpy()}\n')
  
  # Measure the binary accuracy
  metric = tf.keras.metrics.BinaryAccuracy(threshold=0.3)
  metric.update_state(y_true, y_pred)
  print(f'binary accuracy: {metric.result().numpy()}\n')

type :  <class 'dict'> 
parsed_examples :  {'age': <tf.Tensor: shape=(30, 1), dtype=int64, numpy=
array([[25],
       [38],
       [28],
       [44],
       [18],
       [34],
       [29],
       [63],
       [24],
       [55],
       [65],
       [36],
       [26],
       [58],
       [48],
       [43],
       [20],
       [43],
       [37],
       [40],
       [34],
       [34],
       [72],
       [25],
       [25],
       [45],
       [22],
       [23],
       [54],
       [32]])>, 'capital-gain': <tf.Tensor: shape=(30, 1), dtype=int64, numpy=
array([[   0],
       [   0],
       [   0],
       [7688],
       [   0],
       [   0],
       [   0],
       [3103],
       [   0],
       [   0],
       [6418],
       [   0],
       [   0],
       [   0],
       [3103],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [   0],
       [7298],
       [   0],
       [   0],
       [   0],
       [   

## Test predict online fc

In [21]:
# Prepare an example and run inference.
features = {

  'age': tf.train.Feature(int64_list=tf.train.Int64List(value=[43])),
  'capital-gain': tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),
  'capital-loss': tf.train.Feature(int64_list=tf.train.Int64List(value=[40])),
  'education': tf.train.Feature(bytes_list=tf.train.BytesList(value=['1st-4th'.encode('utf-8')])),

    'education-num': tf.train.Feature(int64_list=tf.train.Int64List(value=[13])),
    'fnlwgt': tf.train.Feature(int64_list=tf.train.Int64List(value=[77516])),
    'hours-per-week': tf.train.Feature(int64_list=tf.train.Int64List(value=[40])),

    'marital-status': tf.train.Feature(bytes_list=tf.train.BytesList(value=[ 'Unmarried'.encode('utf-8')])),
    'native-country': tf.train.Feature(bytes_list=tf.train.BytesList(value=['Haiti'.encode('utf-8')])),
    'occupation': tf.train.Feature(bytes_list=tf.train.BytesList(value=['Sale'.encode('utf-8')])),

  'race': tf.train.Feature(bytes_list=tf.train.BytesList(value=['Black'.encode('utf-8')])),
  'relationship': tf.train.Feature(bytes_list=tf.train.BytesList(value=['Not-in-family'.encode('utf-8')])),
    'sex': tf.train.Feature(bytes_list=tf.train.BytesList(value=['Male'.encode('utf-8')])),
    'workclass': tf.train.Feature(bytes_list=tf.train.BytesList(value=['Never-worked'.encode('utf-8')]))

}
example_proto = tf.train.Example(features=tf.train.Features(feature=features))
examples = example_proto.SerializeToString()
examples

b'\n\xd6\x02\n!\n\x0crelationship\x12\x11\n\x0f\n\rNot-in-family\n\x15\n\x0ccapital-loss\x12\x05\x1a\x03\n\x01(\n\x15\n\x0ccapital-gain\x12\x05\x1a\x03\n\x01\x00\n\x18\n\teducation\x12\x0b\n\t\n\x071st-4th\n\x1d\n\tworkclass\x12\x10\n\x0e\n\x0cNever-worked\n\x1f\n\x0emarital-status\x12\r\n\x0b\n\tUnmarried\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01+\n\x16\n\noccupation\x12\x08\n\x06\n\x04Sale\n\x11\n\x04race\x12\t\n\x07\n\x05Black\n\x17\n\x0ehours-per-week\x12\x05\x1a\x03\n\x01(\n\x11\n\x06fnlwgt\x12\x07\x1a\x05\n\x03\xcc\xdd\x04\n\x0f\n\x03sex\x12\x08\n\x06\n\x04Male\n\x1b\n\x0enative-country\x12\t\n\x07\n\x05Haiti\n\x16\n\reducation-num\x12\x05\x1a\x03\n\x01\r'

In [22]:
result = inference_fn(examples=tf.constant([examples]))
result

{'outputs': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.1801035]], dtype=float32)>}

In [23]:
# Prepare an example and run inference.
features = {
  'age': tf.train.Feature(int64_list=tf.train.Int64List(value=[31])),
  'capital-gain': tf.train.Feature(int64_list=tf.train.Int64List(value=[5178])),
  'capital-loss': tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),
  'education': tf.train.Feature(bytes_list=tf.train.BytesList(value=['Masters'.encode('utf-8')])),

    'education-num': tf.train.Feature(int64_list=tf.train.Int64List(value=[14])),
    'fnlwgt': tf.train.Feature(int64_list=tf.train.Int64List(value=[159449])),
    'hours-per-week': tf.train.Feature(int64_list=tf.train.Int64List(value=[45])),

    'marital-status': tf.train.Feature(bytes_list=tf.train.BytesList(value=[ 'Married-civ-spouse'.encode('utf-8')])),
    'native-country': tf.train.Feature(bytes_list=tf.train.BytesList(value=['United-States'.encode('utf-8')])),
    'occupation': tf.train.Feature(bytes_list=tf.train.BytesList(value=['Exec-managerial'.encode('utf-8')])),

  'race': tf.train.Feature(bytes_list=tf.train.BytesList(value=['White'.encode('utf-8')])),
  'relationship': tf.train.Feature(bytes_list=tf.train.BytesList(value=['husband'.encode('utf-8')])),
    'sex': tf.train.Feature(bytes_list=tf.train.BytesList(value=['Male'.encode('utf-8')])),
    'workclass': tf.train.Feature(bytes_list=tf.train.BytesList(value=['Private'.encode('utf-8')]))

}
example_proto = tf.train.Example(features=tf.train.Features(feature=features))
examples = example_proto.SerializeToString()
result = inference_fn(examples=tf.constant([examples]))
result

{'outputs': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.9205638]], dtype=float32)>}

In [24]:
# model_export_path="../serving_model_local/census-pipelines/1694480529"
# !saved_model_cli show --dir {model_export_path} --all

In [25]:
import tensorflow as tf

example_to_test = {
    'age': tf.constant([[25]], dtype=tf.int64),
    'capital-gain': tf.constant([[0]], dtype=tf.int64),
    'capital-loss': tf.constant([[0]], dtype=tf.int64),
    'education': tf.constant([[b'11th']], dtype=tf.string),
    'education-num': tf.constant([[7]], dtype=tf.int64),
    'fnlwgt': tf.constant([[226802]], dtype=tf.int64),
    'hours-per-week': tf.constant([[40]], dtype=tf.int64),
    'marital-status': tf.constant([[b'Never-married']], dtype=tf.string),
    'native-country': tf.constant([[b'United-States']], dtype=tf.string),
    'occupation': tf.constant([[b'Machine-op-inspct']], dtype=tf.string),
    'race': tf.constant([[b'Black']], dtype=tf.string),
    'relationship': tf.constant([[b'Own-child']], dtype=tf.string),
    'sex': tf.constant([[b'Male']], dtype=tf.string),
    'workclass': tf.constant([[b'Private']], dtype=tf.string),
}

transformed_examples_test = loaded_model.tft_layer(example_to_test)
  
y_pred = loaded_model(transformed_examples_test)
y_pred

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.00202559]], dtype=float32)>

In [47]:
y_pred.numpy()

array([[0.00202559]], dtype=float32)