In [None]:
# Copyright 2020 Google LLC. All Rights Reserved.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
# With the exception of matplotlib all other requirements are the same as 
!pip install matplotlib

In [None]:
! pip freeze

# Timeseries sample notebook

This notebook can be used to explore the various stages of data engineering used within the time series library. It has various sections, which correspond to different parts of the ML part of the solution.

The Java part of the library is not explored yet; it will be made available through Apache Beam xlang transforms at a later date.

The notebook makes use of the libraries made available via the timeseries python samples. This notebook should be run from within a virtual env that has those samples installed.

## [Training](#training)

* [Explore ImportGen output](#example)
* [Explore StatisticsGen output](#statistics)
* [Explore Transform Output](#transform)
* [Running Training](#training)

In [None]:
import os
import pprint
import tempfile
import urllib

import absl
import tensorflow as tf
import tensorflow_model_analysis as tfma
# Stop TensorFlow's logger from passing its records up to ancestor loggers
# (presumably to avoid duplicated log lines in the notebook output).
tf.get_logger().propagate = False
# Shared pretty-printer, used by later cells to display decoded tf.train.Example protos.
pp = pprint.PrettyPrinter()

import pandas as pd 
from datetime import datetime

import matplotlib

# Import the tfx components we will make use of.
import tfx
from tfx.components.example_gen.import_example_gen.component import ImportExampleGen
from tfx.components import Evaluator
from tfx.components import ExampleValidator
from tfx.components import Pusher
from tfx.components import ResolverNode
from tfx.components import SchemaGen
from tfx.components import StatisticsGen
from tfx.components import Trainer
from tfx.components import Transform

from tfx.components.base import executor_spec
from tfx.components.trainer.executor import GenericExecutor

from tfx.dsl.experimental import latest_blessed_model_resolver
from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.proto import pusher_pb2
from tfx.proto import trainer_pb2
from tfx.proto.evaluator_pb2 import SingleSlicingSpec
from tfx.utils.dsl_utils import external_input
from tfx.types import Channel
from tfx.types.standard_artifacts import Model
from tfx.types.standard_artifacts import ModelBlessing

# Load the TFX notebook extension that provides the `%%skip_for_export` cell magic used further down.
%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip

# Report the TensorFlow and TFX versions this notebook is running against.
print('TF version', tf.__version__, '\nTFX version', tfx.__version__)

# <a id = "training"> Training </a>

## <a id = "example"> Import Examples </a>

In [None]:
# The data root is the directory where the Java pipeline lands the TF.Examples used for training.
BOOTSTRAP_DATA_ROOT = os.path.expanduser(
    '~/demo/timeseries/data/simple-data-bootstrap/'
)
print(
    f'The path {BOOTSTRAP_DATA_ROOT} should contain the generated TF.Example files'
)

In [None]:
# Interactive TFX context; every component below is executed through context.run().
context = InteractiveContext(pipeline_name='SimpleData')

In [None]:
# ImportExampleGen reads the TF.Examples created by the streaming Java pipeline
# from the bootstrap data directory.
examples = external_input(BOOTSTRAP_DATA_ROOT)
example_gen = ImportExampleGen(input=examples)
context.run(example_gen)

### Exploring the output from ImportExampleGen

In [None]:
# Inspect the first 'examples' artifact produced by ImportExampleGen:
# print the names of its splits and the URI where it is stored.
artifact = example_gen.outputs['examples'].get()[0]
print(artifact.split_names, artifact.uri)

In [None]:
# Directory holding the 'train' split of the examples artifact written by ImportExampleGen.
train_uri = os.path.join(example_gen.outputs['examples'].get()[0].uri, 'train')

# Every entry in that directory is a compressed TFRecord file.
tfrecord_filenames = [
    os.path.join(train_uri, name) for name in os.listdir(train_uri)
]

# Read all the files through a single GZIP-aware `TFRecordDataset`.
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Decode and pretty-print the first 3 records.
# Each record carries both metadata information and features;
# a feature holds as many values as there are timesteps.
for tfrecord in dataset.take(3):
    serialized_example = tfrecord.numpy()
    example = tf.train.Example.FromString(serialized_example)
    pp.pprint(example)

### Let's explore the feature LAST in a graph

In [None]:
def convert_time_series_data_to_raw_values(dataset: tf.data.Dataset, num_records: int, num_timesteps: int):
    """Yield one flat dict per time series example found in `dataset`.

    Args:
        dataset: dataset whose elements are serialized `tf.train.Example` protos.
        num_records: maximum number of records to read from `dataset`.
        num_timesteps: number of timesteps per feature; the value at index
            `num_timesteps - 1` is the one extracted.

    Yields:
        A dict with 'span_start_timestamp' and 'span_end_timestamp' (datetimes built
        from the METADATA_SPAN_START_TS / METADATA_SPAN_END_TS values divided by 1000),
        plus one entry for every feature whose name ends in '-LAST' or '-FIRST'.
    """
    wanted_suffixes = ('-LAST', '-FIRST')
    value_index = num_timesteps - 1

    for record in dataset.take(num_records):
        parsed = tf.train.Example.FromString(record.numpy())
        feature_map = parsed.features.feature

        # Time bounds of the span; the stored values are divided by 1000 before conversion.
        span_start = feature_map['METADATA_SPAN_START_TS'].int64_list.value[0] / 1000
        span_end = feature_map['METADATA_SPAN_END_TS'].int64_list.value[0] / 1000
        row = {
            'span_start_timestamp': datetime.fromtimestamp(span_start),
            'span_end_timestamp': datetime.fromtimestamp(span_end),
        }

        for feature_name in feature_map:
            if feature_name.endswith(wanted_suffixes):
                row[feature_name] = feature_map[feature_name].float_list.value[value_index]
        yield row

# Get the URI of the output artifact representing the training examples, which is a directory
train_uri = os.path.join(example_gen.outputs['examples'].get()[0].uri, 'train')

# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# `output` is a generator of row dicts (one per record), so the plain DataFrame
# constructor is the right entry point rather than `DataFrame.from_dict`.
output = convert_time_series_data_to_raw_values(dataset, num_records=28800, num_timesteps=5)
df = pd.DataFrame(output)

# 'span_start_timestamp' is deliberately kept as a regular column: the filter below
# selects on it by name. (A bare `df.set_index(...)` call returns a new frame and
# leaves `df` unchanged, so the previously discarded call here had no effect.)
in_window = (df['span_start_timestamp'] > '2000-1-1 01:00:00') & (df['span_start_timestamp'] <= '2000-1-1 01:05:00')

# Plot the FIRST and LAST values against the span end time for the selected window.
df[in_window].plot('span_end_timestamp', y=['value-LAST', 'value-FIRST'], figsize=(18, 9))

## <a id = "statistics"> StatisticsGen </a>

In [None]:
# Compute statistics over the examples produced by ImportExampleGen.
statistics_gen = StatisticsGen(
    examples=example_gen.outputs['examples'],
)
context.run(statistics_gen)

In [None]:
# Display the statistics artifact produced by StatisticsGen.
context.show(statistics_gen.outputs['statistics'])

In [None]:
# Infer a schema from the computed statistics.
# infer_feature_shape stays False: if it is True, the look back is explicitly
# provided as 4 instead of None.
schema_gen = SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    infer_feature_shape=False,
)
context.run(schema_gen)

In [None]:
# Display the schema artifact inferred by SchemaGen.
context.show(schema_gen.outputs['schema'])

## <a id = "transform"> TF Transform </a>

Note that we are using the following preprocessing function:

preprocessing_fn = 'timeseries.encoder_decoder.encoder_decoder_preprocessing.preprocessing_fn'

This is the same preprocessing function that our pipeline uses, which avoids duplicating the code.

In [None]:
# Run TF Transform over the imported examples using the inferred schema and the
# preprocessing_fn referenced by its module path.
transform = Transform(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    preprocessing_fn='timeseries.encoder_decoder.encoder_decoder_preprocessing.preprocessing_fn',
)
# Caching is switched off for this run.
context.run(transform, enable_cache=False)

In [None]:
# Display the outputs exposed by the Transform component.
transform.outputs

In [None]:
# List the contents of the transform graph artifact written by the Transform component.
# NOTE(review): despite its name, `train_uri` points at the transform graph here,
# not at a 'train' split; later cells reassign it before using it.
train_uri = transform.outputs['transform_graph'].get()[0].uri
os.listdir(train_uri)

In [None]:
%%skip_for_export

# Directory holding the 'train' split of the transformed examples artifact.
train_uri = os.path.join(transform.outputs['transformed_examples'].get()[0].uri, 'train')

# Every entry in that directory is a compressed TFRecord file.
tfrecord_filenames = [
    os.path.join(train_uri, name) for name in os.listdir(train_uri)
]

# Read all the files through a single GZIP-aware `TFRecordDataset`.
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Decode and pretty-print the first 10 records.
for tfrecord in dataset.take(10):
    serialized_example = tfrecord.numpy()
    example = tf.train.Example.FromString(serialized_example)
    pp.pprint(example)

In [None]:
# Dimensions of each transformed feature: the feature spec below declares every
# feature with shape [timesteps, number_features].
timesteps = 5
number_features = 2

def _gzip_reader_fn(filenames):
    """Build a `TFRecordDataset` over `filenames`, reading them as GZIP-compressed files."""
    gzip_dataset = tf.data.TFRecordDataset(filenames, compression_type="GZIP")
    return gzip_dataset

def create_training_data(features):
    """Return only the 'Float32' entry of `features` (a single feature, for debugging)."""
    float32_values = features['Float32']
    return float32_values

# Get the URI of the output artifact representing the transformed examples, which is a directory
train_uri = os.path.join(transform.outputs['transformed_examples'].get()[0].uri, 'train')

# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]

# 'Float32' and 'LABEL' share the same spec: a fixed-length float32 feature of
# shape [timesteps, number_features] with no default value.
transformed_feature_spec = {
    feature_name: tf.io.FixedLenFeature(shape=[timesteps, number_features],
                                        dtype=tf.float32, default_value=None)
    for feature_name in ('Float32', 'LABEL')
}

# Parse the files into batches of one example each, in file order (no shuffling).
dataset = tf.data.experimental.make_batched_features_dataset(
    file_pattern=tfrecord_filenames,
    batch_size=1,
    shuffle=False,
    features=transformed_feature_spec,
    reader=_gzip_reader_fn)

# Keep only the 'Float32' feature of each batch.
dataset = dataset.map(create_training_data)

# Each element here is an already-parsed batch of feature values, not a serialized
# record, so it is printed directly after conversion to numpy.
for feature_batch in dataset.take(5):
    print(feature_batch.numpy())

## <a id = "training"> Training </a>

In [None]:
# Train with the user-provided run_fn (referenced by module path), executed
# through the GenericExecutor on the outputs of Transform and SchemaGen.
trainer_args = dict(
    run_fn='timeseries.encoder_decoder.encoder_decoder_run_fn.run_fn',
    transformed_examples=transform.outputs['transformed_examples'],
    schema=schema_gen.outputs['schema'],
    transform_graph=transform.outputs['transform_graph'],
    train_args=trainer_pb2.TrainArgs(num_steps=280),
    eval_args=trainer_pb2.EvalArgs(num_steps=140),
    custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
    # Settings passed via custom_config (presumably consumed by run_fn).
    custom_config=dict(
        epochs=30,
        train_batches=1000,
        eval_batches=1000,
        timesteps=5,
        number_features=2,
        outer_units=16,
        inner_units=4,
    ),
)
trainer = Trainer(**trainer_args)
# Caching is switched off for this run.
context.run(trainer, enable_cache=False)