# 3.6 Construct TFRecordDataset

In this notebook we will load the previously encoded 'Global Dollar Volume w/ Fractional Differencing' dataset and prepare it for modeling. To do so, we will transform it into X and y matrices and then write them to a TFRecordDataset stored in a Google Cloud Storage bucket. Doing so will optimize read speed for TPU and neural network training on the cloud. 

In [1]:
import pandas as pd
import numpy as np

In [3]:
from google.cloud import storage
import os
import tensorflow as tf

# Path to the JSON key file that the Google Cloud client libraries authenticate with.
GCP_KEY_FILE = "../data/keys/tpu-training-289520-f7727af0669b.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GCP_KEY_FILE

# Google Cloud Storage client; created after the credentials variable is set.
storage_client = storage.Client()

def _floatlist_feature(value):
    """Returns a float_list Feature from an iterable of float / double values."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

def _float_feature(value):
    """Returns a one-element float_list Feature from a single float / double."""
    single_float = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=single_float)

def _bytes_feature(value):
    """Wraps a single string / bytes value in a bytes_list Feature.

    If `value` is an EagerTensor it is first converted with `.numpy()`.
    """
    eager_tensor_cls = type(tf.constant(0))
    if isinstance(value, eager_tensor_cls):
        # BytesList cannot unpack the tensor itself, so hand it the numpy value.
        value = value.numpy()
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def _string_bytes_feature(value):
    """Returns a bytes_list Feature from a str, bytes, or EagerTensor value.

    Args:
        value: a Python `str` (encoded to UTF-8 before storage), a `bytes`
            object (stored as-is), or an EagerTensor (converted with
            `.numpy()` first).

    Returns:
        A tf.train.Feature whose bytes_list holds the single value.
    """
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
    # Compare against `str` itself: `type(str)` is `type`, which no string
    # instance matches, so text would reach BytesList un-encoded.
    if isinstance(value, str):
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    """Wraps a single bool / enum / int / uint in an int64_list Feature."""
    single_int = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=single_int)


def serialize_example(dataset):
    """
    Builds a serialized tf.train.Example from a Dataset description.

    `dataset` maps each feature name to a spec of the form:
    {'feature_name' :
        {'data': value(s) for that feature,
        'type': one of 'floats', 'float', 'bytes', 'string', 'int'}
        }

    Each spec's 'type' selects the helper that converts its 'data' into a
    tf.train.Feature; an unknown type raises KeyError.

    Returns the Example message serialized to a byte string, ready to be
    written to a file.
    """

    # Dispatch table: declared type name -> Feature builder.
    builders = {
        'floats' : _floatlist_feature,
        'float' : _float_feature,
        'bytes' : _bytes_feature,
        'string' : _string_bytes_feature,
        'int' : _int64_feature
    }

    features_by_name = {}
    for name in dataset.keys():
        spec = dataset[name]
        build = builders[spec['type']]
        features_by_name[name] = build(spec['data'])

    example = tf.train.Example(features=tf.train.Features(feature=features_by_name))
    return example.SerializeToString()

def write_tfrecord(dataset, filename, bucket_path='gs://fin-aml/data/'):
    """
    Writes a Dataset to a TFRecord file, one tf.train.Example per record.

    `dataset` has the structure expected by `serialize_example`:
    {'feature_name' :
        {'data': array-like whose first axis indexes records,
        'type': 'name of corresponding tf message type'}
        }

    Record i of the output holds, for every feature, row i of its 'data'.

    Args:
        dataset: mapping of feature name -> {'data', 'type'} as above. 'data'
            may be anything `np.asarray` accepts (DataFrame, ndarray, list).
        filename: file name appended to `bucket_path`.
        bucket_path: destination prefix; the output path is
            `bucket_path + filename`.

    Raises:
        ValueError: if `dataset` is empty or its features do not all have the
            same number of records.
    """
    if not dataset:
        raise ValueError("dataset must contain at least one feature")

    destination = bucket_path + filename

    # Materialize every feature as an ndarray so rows can be taken by position
    # regardless of whether the caller passed a DataFrame, ndarray or list.
    arrays = {key: np.asarray(dataset[key]['data']) for key in dataset.keys()}

    # dict views are not subscriptable, so take the first feature via iteration.
    num_records = next(iter(arrays.values())).shape[0]

    record_counts = {key: values.shape[0] for key, values in arrays.items()}
    if any(count != num_records for count in record_counts.values()):
        raise ValueError(
            "all features must have the same number of records, got %r" % (record_counts,)
        )

    with tf.io.TFRecordWriter(destination) as writer:
        for i in range(num_records):
            # Build a single-record dataset so each Example carries only row i.
            record = {}
            for key, values in arrays.items():
                feature_type = dataset[key]['type']
                value = values[i]
                if feature_type == 'floats':
                    # FloatList wants a flat sequence of floats.
                    value = np.ravel(value).tolist()
                elif isinstance(value, np.generic):
                    # Unwrap numpy scalars into plain Python scalars.
                    value = value.item()
                record[key] = {'data': value, 'type': feature_type}
            writer.write(serialize_example(record))

In [33]:
# Load a 5-row sample of the interval data, indexed by its first column.
intervals_raw = pd.read_csv('../data/sp500/interval_data_100milvol.csv', index_col=0, nrows=5)

# Keep only the '*_close' columns. '*_value_close' names also contain 'close',
# so they are masked out explicitly.
is_value_close = intervals_raw.columns.str.contains('value_close')
is_close = intervals_raw.columns.str.contains('close')
close_prices = intervals_raw.loc[:, is_close & ~is_value_close]

# Target: relative change of each close versus the previous interval.
y = close_prices.diff(1) / close_prices.shift(1)
y

Unnamed: 0_level_0,AAL_close,AAPL_close,AAP_close,ABBV_close,ABC_close,ABMD_close,ABT_close,ACN_close,ADBE_close,ADI_close,...,XLNX_close,XOM_close,XRAY_close,XRX_close,XYL_close,YUM_close,ZBH_close,ZBRA_close,ZION_close,ZTS_close
cum_value_interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(0.0, 100000000.0]",,,,,,,,,,,...,,,,,,,,,,
"(100000000.0, 200000000.0]",0.001505,0.000628,0.0,0.002518,,,,,,,...,0.00473,0.001386,,0.006667,,,,,,
"(200000000.0, 300000000.0]",-0.001503,0.001103,-0.001672,0.0,,,,,,0.0,...,-0.001884,-0.001615,,-0.001656,,,,,,
"(300000000.0, 400000000.0]",0.003762,0.00175,0.001898,-0.001465,,,,,-0.001565,0.0,...,0.00302,-0.000462,,0.000553,,,0.0,,,0.0
"(400000000.0, 500000000.0]",0.004498,0.0,-0.000347,0.0,,,,,0.0,0.0,...,-0.0095,-0.000693,,0.0,,,0.0,,,-0.010709


In [34]:
# Inspect the target's column labels (the '<TICKER>_close' columns kept above).
y.columns

Index(['AAL_close', 'AAPL_close', 'AAP_close', 'ABBV_close', 'ABC_close',
       'ABMD_close', 'ABT_close', 'ACN_close', 'ADBE_close', 'ADI_close',
       ...
       'XLNX_close', 'XOM_close', 'XRAY_close', 'XRX_close', 'XYL_close',
       'YUM_close', 'ZBH_close', 'ZBRA_close', 'ZION_close', 'ZTS_close'],
      dtype='object', length=505)

In [39]:
# Load a 5-row sample of the feature file, indexed by its second column.
features_raw = pd.read_csv('../data/adf_testing/fracdiff_d20.csv', index_col=1, nrows=5)

# 'Unnamed: 0' is presumably a leftover row counter from an earlier export - TODO confirm.
X = features_raw.drop(columns='Unnamed: 0')

## drop cols, etc. 
X

Unnamed: 0_level_0,AAL_close,AAL_high,AAL_low,AAL_open,AAL_value_close,AAL_volume,AAPL_close,AAPL_high,AAPL_low,AAPL_open,...,ZION_low,ZION_open,ZION_value_close,ZION_volume,ZTS_close,ZTS_high,ZTS_low,ZTS_open,ZTS_value_close,ZTS_volume
cum_value_interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(4000000000.0, 4100000000.0]",13.2099,13.2877,13.18,13.285,6687618.0,506258.0,,,,,...,34.175,34.175,3417.5,100.0,,,,,1657255.72,10481.0
"(4100000000.0, 4200000000.0]",13.2099,13.2877,13.18,13.285,0.0,0.0,,,,,...,34.175,34.175,0.0,0.0,,,,,0.0,0.0
"(4200000000.0, 4300000000.0]",13.2099,13.2877,13.18,13.285,0.0,0.0,,,,,...,34.175,34.175,0.0,0.0,,,,,0.0,0.0
"(4300000000.0, 4400000000.0]",13.2099,13.2877,13.18,13.285,0.0,0.0,,,,,...,34.175,34.175,0.0,0.0,,,,,0.0,0.0
"(4400000000.0, 4500000000.0]",13.2099,13.2877,13.18,13.285,0.0,0.0,,,,,...,34.175,34.175,0.0,0.0,,,,,0.0,0.0


#### To do:

<b>Before writing, ensure that the index columns of both files match 100%.</b>

In [None]:
# Define X and y.. 
dataset = {'X':{'data':X, 'type':'floats'},
           'y':{'data':y, 'type':'floats'}