# Import

In [1]:
import tensorflow as tf
import argparse
# Needed for PCA
from sklearn import decomposition

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

  from ._conv import register_converters as _register_converters


In [2]:
orderbook_cols = ['{}_{}_{}'.format(s,t,l) for l in range(1,6) for s in ['ask','bid'] for t in ['price', 'vol'] ]
orderbook_ori = pd.read_csv('INTC_2012-06-21_34200000_57600000_orderbook_5.csv', \
                            header = None, names = orderbook_cols)
orderbook1 = orderbook_ori.copy()
orderbook1['mid_price'] = (orderbook1.iloc[:,0] + orderbook1.iloc[:,2]) / 2
orderbook1['mid_price_mov'] = np.sign(orderbook1['mid_price'].shift(-1)-orderbook1['mid_price'])
orderbook2 = orderbook1.dropna()

scaler = StandardScaler()
x_all_array = scaler.fit_transform(orderbook2.iloc[:,:len(orderbook_cols)])
orderbook = orderbook2.copy()
orderbook.iloc[:,:len(orderbook_cols)] = x_all_array

train_weight = 0.8
cv_weight = 0.1
split1 = int(orderbook.shape[0] * train_weight)
split2 = int(orderbook.shape[0] * cv_weight)
df_train = orderbook[:split1]
df_cv = orderbook[split1:split1+split2]
df_test = orderbook[split1+split2:]
x_train = df_train.iloc[:,:len(orderbook_cols)]
x_train_array = np.array(x_train)
#y_train = df_train.iloc[:,-1]
x_cv = df_cv.iloc[:,:len(orderbook_cols)]
x_cv_array = np.array(x_cv)
#y_cv = df_cv.iloc[:,-1]
x_test = df_test.iloc[:,:len(orderbook_cols)]
x_test_array = np.array(x_test)
#y_test = df_test.iloc[:,-1]
x_all = orderbook.iloc[:,:len(orderbook_cols)]
x_all_array = np.array(x_all)
#y_all = orderbook.iloc[:,-1]

# Definition of the Architecture

In [3]:
input_size = 20
#re-constructed size
output_size = 20

# 3 hidden layers for encoder
n_encoder_h_1 = 16
n_encoder_h_2 = 8
n_encoder_h_3 = 4

# 3 hidden layers for decoder
n_decoder_h_1 = 4
n_decoder_h_2 = 8
n_decoder_h_3 = 16

# Parameters
learning_rate = 0.01
training_epochs = 20 #200
batch_size = 200
display_step = 1

# Batch Normalization 

In [4]:
def layer_batch_normalization(x, n_out, phase_train):
    """
    Defines the network layers
    input:
        - x: input vector of the layer
        - n_out: integer, depth of input maps - number of sample in the batch 
        - phase_train: boolean tf.Varialbe, true indicates training phase
    output:
        - batch-normalized maps   
    """

    beta_init = tf.constant_initializer(value=0.0, dtype=tf.float32)
    beta = tf.get_variable("beta", [n_out], initializer=beta_init)
    
    gamma_init = tf.constant_initializer(value=1.0, dtype=tf.float32)
    gamma = tf.get_variable("gamma", [n_out], initializer=gamma_init)

    #tf.nn.moment: https://www.tensorflow.org/api_docs/python/tf/nn/moments
    #calculate mean and variance of x
    batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')

    #tf.train.ExponentialMovingAverage:
    #https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
    #Maintains moving averages of variables by employing an exponential decay.
    ema = tf.train.ExponentialMovingAverage(decay=0.9)
    ema_apply_op = ema.apply([batch_mean, batch_var])
    ema_mean, ema_var = ema.average(batch_mean), ema.average(batch_var)
    
    def mean_var_with_update():
        with tf.control_dependencies([ema_apply_op]):
            return tf.identity(batch_mean), tf.identity(batch_var)
       
    #tf.cond: https://www.tensorflow.org/api_docs/python/tf/cond
    #Return true_fn() if the predicate pred is true else false_fn()
    mean, var = tf.cond(phase_train, mean_var_with_update, lambda: (ema_mean, ema_var))

    reshaped_x = tf.reshape(x, [-1, 1, 1, n_out])
    normed = tf.nn.batch_norm_with_global_normalization(reshaped_x, mean, var, beta, gamma, 1e-3, True)
    #normed = tf.nn.batch_normalization(reshaped_x, mean, var, beta, gamma, 1e-3, True)
    
    return tf.reshape(normed, [-1, n_out])


# Definition of the Layer 

In [5]:
def layer(x, weight_shape, bias_shape, phase_train):
    
    """
    Defines the network layers
    input:
        - x: input vector of the layer
        - weight_shape: shape the the weight maxtrix
        - bias_shape: shape of the bias vector
        - phase_train: boolean tf.Varialbe, true indicates training phase
    output:
        - output vector of the layer after the matrix multiplication and non linear transformation
    """
    
    #initialize weights
    weight_init = tf.random_normal_initializer(stddev=(1.0/weight_shape[0])**0.5)
    W = tf.get_variable("W", weight_shape, initializer=weight_init)
    
    bias_init = tf.constant_initializer(value=0)
    b = tf.get_variable("b", bias_shape, initializer=bias_init)

    logits = tf.matmul(x, W) + b
    
    #apply the non-linear function after the batch normalization
    return tf.nn.sigmoid(layer_batch_normalization(logits, weight_shape[1], phase_train))
    # Using sigmoid to avoid sharp transitions in neurons

# Definition of the Encoder Part

In [6]:
def encoder(x, n_code, phase_train):
    """
    Defines the network encoder part
    input:
        - x: input vector of the encoder
        - n_code: number of neurons in the code layer (output of the encoder - input of the decoder) 
        - phase_train: boolean tf.Varialbe, true indicates training phase
    output:
        - output vector: reduced dimension
    """
    
    with tf.variable_scope("encoder"):
        
        with tf.variable_scope("h_1"):
            h_1 = layer(x, [input_size, n_encoder_h_1], [n_encoder_h_1], phase_train)

        with tf.variable_scope("h_2"):
            h_2 = layer(h_1, [n_encoder_h_1, n_encoder_h_2], [n_encoder_h_2], phase_train)

        with tf.variable_scope("h_3"):
            h_3 = layer(h_2, [n_encoder_h_2, n_encoder_h_3], [n_encoder_h_3], phase_train)

        with tf.variable_scope("code"):
            output = layer(h_3, [n_encoder_h_3, n_code], [n_code], phase_train)

    return output

# Definition of the Decoder Part

In [7]:
def decoder(x, n_code, phase_train):
    """
    Defines the network encoder part
    input:
        - x: input vector of the decoder - reduced dimension vector
        - n_code: number of neurons in the code layer (output of the encoder - input of the decoder)
        - phase_train: boolean tf.Varialbe, true indicates training phase
    output:
        - output vector: reconstructed dimension of the initial vector
    """
    
    with tf.variable_scope("decoder"):
        
        with tf.variable_scope("h_1"):
            h_1 = layer(x, [n_code, n_decoder_h_1], [n_decoder_h_1], phase_train)

        with tf.variable_scope("h_2"):
            h_2 = layer(h_1, [n_decoder_h_1, n_decoder_h_2], [n_decoder_h_2], phase_train)

        with tf.variable_scope("h_3"):
            h_3 = layer(h_2, [n_decoder_h_2, n_decoder_h_3], [n_decoder_h_3], phase_train)

        with tf.variable_scope("output"):
            output = layer(h_3, [n_decoder_h_3, output_size], [output_size], phase_train)

    return output

# Definition of the Loss

In [8]:
# loss is L2 measure
def loss(output, x):
    """
    Compute the loss of the auto-encoder
    
    intput:
        - output: the output of the decoder
        - x: true value of the sample batch - this is the input of the encoder
        
        the two have the same shape (batch_size * num_of_classes)
    output:
        - loss: loss of the corresponding batch (scalar tensor)
    
    """
    with tf.variable_scope("training"):
        
        l2_measure = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(output, x)), 1))
        train_loss = tf.reduce_mean(l2_measure)
        train_summary_op = tf.summary.scalar("train_cost", train_loss)
        return train_loss, train_summary_op

# Training Function

In [9]:
# Using Adam as optimizer for training 	
def training(cost, global_step):
    """
    defines the necessary elements to train the network
    
    intput:
        - cost: the cost is the loss of the corresponding batch
        - global_step: number of batch seen so far, it is incremented by one 
        each time the .minimize() function is called
    """
    
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam')
    train_op = optimizer.minimize(cost, global_step=global_step)
    return train_op

# Evaluation Function

In [10]:
def evaluate(output, x):
    """
    evaluates the accuracy on the validation set 
    input:
        - output: prediction vector of the network for the validation set
        - x: true value for the validation set
    output:
        - val_loss: loss of the autoencoder
        - val_summary_op: summary of the loss
    """
    
    with tf.variable_scope("validation"):
        
        l2_norm = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(output, x, name="val_diff")), 1))
        
        val_loss = tf.reduce_mean(l2_norm)
        
        val_summary_op = tf.summary.scalar("val_cost", val_loss)
        
        return val_loss, val_summary_op

# Main Function

In [11]:
if __name__ == '__main__':

    #parser = argparse.ArgumentParser(description='Autoencoder')
    #parser.add_argument('n_code', nargs=1, type=str)
    #args = parser.parse_args(['--help'])
    #n_code = args.n_code[0]
    
    #if a jupyter file, please comment the 4 above and use the one bellow
    n_code = '2'
    
    #feel free to change with your own
    model_path = '/Users/meihuaren/personal/DL_logs/ae/'

    with tf.Graph().as_default():

        with tf.variable_scope("autoencoder_model"):


            #the input variables are first define as placeholder 
            # a placeholder is a variable/data which will be assigned later 
            x = tf.placeholder("float", [None, 20]) # 20 original features
            
            phase_train = tf.placeholder(tf.bool)

            #define the encoder 
            code = encoder(x, int(n_code), phase_train)

            #define the decoder
            output = decoder(code, int(n_code), phase_train)

            #compute the loss 
            cost, train_summary_op = loss(output, x)

            #initialize the value of the global_step variable 
            # recall: it is incremented by one each time the .minimise() is called
            global_step = tf.Variable(0, name='global_step', trainable=False)

            train_op = training(cost, global_step)

            #evaluate the accuracy of the network (done on a validation set)
            eval_op, val_summary_op = evaluate(output, x)

            summary_op = tf.summary.merge_all()

            #save and restore variables to and from checkpoints.
            saver = tf.train.Saver()

            #defines a session
            sess = tf.Session()

            # summary writer
            #https://www.tensorflow.org/api_docs/python/tf/summary/FileWriter
            train_writer = tf.summary.FileWriter(model_path, graph=sess.graph)
            val_writer   = tf.summary.FileWriter(model_path, graph=sess.graph)

            #initialization of the variables
            init_op = tf.global_variables_initializer()

            sess.run(init_op)

            # Training cycle
            for epoch in range(training_epochs):

                avg_cost = 0.
                total_batch = int(x_train_array.shape[0]/batch_size)
                
                #train_writer = tf.summary.FileWriter(model_path+str(epoch)+'/model.ckpt', graph=sess.graph)
                #val_writer   = tf.summary.FileWriter(model_path+str(epoch)+'/model.ckpt', graph=sess.graph)
                
                # Loop over all batches
                for i in range(total_batch):
                    
                    minibatch_x = x_train_array[i*batch_size:(i+1)*batch_size]
                    
                    # Fit training using batch data
                    #the training is done using the training dataset
                    _, new_cost, train_summary = sess.run([train_op, cost, train_summary_op], feed_dict={x: minibatch_x, phase_train: True})
                    
                    train_writer.add_summary(train_summary, sess.run(global_step))
                    
                    # Compute average loss
                    avg_cost += new_cost/total_batch
                
                # Display logs per epoch step
                if epoch % display_step == 0:
                    
                    print("Epoch:", '%04d' % (epoch+1), "cost =", "{:.9f}".format(avg_cost))

                    train_writer.add_summary(train_summary, sess.run(global_step))

                    validation_loss, val_summary = sess.run([eval_op, val_summary_op], feed_dict={x: x_cv_array, phase_train: False})
                    
                    val_writer.add_summary(val_summary, sess.run(global_step))
                    
                    print("Validation Loss:", validation_loss)

                    save_path = saver.save(sess, model_path)
                    print("Model saved in file: %s" % save_path)


            print("Optimization Done")

            test_loss = sess.run(eval_op, feed_dict={x: x_test_array, phase_train: False})

            print("Test Loss:", test_loss)

Epoch: 0001 cost = 3.751135787
Validation Loss: 5.238889
Model saved in file: /Users/meihuaren/personal/DL_logs/ae/
Epoch: 0002 cost = 3.508099950
Validation Loss: 5.0794716
Model saved in file: /Users/meihuaren/personal/DL_logs/ae/
Epoch: 0003 cost = 3.395779326
Validation Loss: 4.89628
Model saved in file: /Users/meihuaren/personal/DL_logs/ae/
Epoch: 0004 cost = 3.333917028
Validation Loss: 4.719273
Model saved in file: /Users/meihuaren/personal/DL_logs/ae/
Epoch: 0005 cost = 3.278065537
Validation Loss: 4.689906
Model saved in file: /Users/meihuaren/personal/DL_logs/ae/
Epoch: 0006 cost = 3.257912736
Validation Loss: 4.7118177
Model saved in file: /Users/meihuaren/personal/DL_logs/ae/
Epoch: 0007 cost = 3.247385927
Validation Loss: 4.638049
Model saved in file: /Users/meihuaren/personal/DL_logs/ae/
Epoch: 0008 cost = 3.202488335
Validation Loss: 4.7283945
Model saved in file: /Users/meihuaren/personal/DL_logs/ae/
Epoch: 0009 cost = 3.180002682
Validation Loss: 5.3466163
Model saved 

In [12]:
if __name__ == '__main__':

    #feel free to change with your own
    args_savepath = '/Users/meihuaren/personal/DL_logs/ae/'
    new_features_resultpath = '/Users/meihuaren/personal/OR_2018fall/Courses/E4720 Deep Learning/project_coding/Team E_code/'
    n_code = 2
    
    #=====================================
    # PCA
    print ('Performing PCA')
    pca = decomposition.PCA(n_components=2) # grid search for the parameter
    pca.fit(x_train_array) # use train data for feature selection in order to avoid look ahead bias
    print('PCA Codes')
    pca_codes = pca.transform(x_all_array)
    print(pca_codes)
    pca_codes_df = pd.DataFrame(pca_codes)
    ob_new_pca = pd.concat([pca_codes_df,orderbook.iloc[:,-1]],axis = 1)
    filename = new_features_resultpath + 'ob_new_pca.csv'
    ob_new_pca.to_csv(filename)
    
    '''
    print('Re-Constructing')
    # transform data into its original space
    pca_reconstructed = pca.inverse_transform(pca_codes[:20])
    #print(pca_reconstructed)
    '''

    #=====================================
    # AutoEncoder

    with tf.Graph().as_default():

        with tf.variable_scope("autoencoder_model"):

            x = tf.placeholder("float", [None, 20]) # 20 original features
            
            phase_train = tf.placeholder(tf.bool)

            code = encoder(x, n_code, phase_train)

            output = decoder(code, n_code, phase_train)

            cost, train_summary_op = loss(output, x)

            global_step = tf.Variable(0, name='global_step', trainable=False)

            train_op = training(cost, global_step)

            eval_op, val_summary_op = evaluate(output, x)

            #saver = tf.train.Saver()
            #sess = tf.Session()
            print('\n')
            print('Starting Autoencoder', args_savepath ) #args.savepath[0]
            print('\n')
            
            sess = tf.Session()
            saver = tf.train.Saver()
            save_path = saver.restore(sess, args_savepath ) #args.savepath[0]
            print("Model restored from file: %s" % save_path)

            print('Running Autoencoder & Autoencoder Codes')
            print('\n')
            
            ae_codes = sess.run(code, feed_dict={x: x_all_array, phase_train: True})
            print(ae_codes)
            ae_codes_df = pd.DataFrame(ae_codes)
            ob_new_ae = pd.concat([ae_codes_df,orderbook.iloc[:,-1]],axis = 1)
            filename = new_features_resultpath + 'ob_new_ae.csv'
            ob_new_ae.to_csv(filename)
            
            #ae_codes, ae_reconstruction = sess.run([code, output], feed_dict={x: mnist.test.images*np.random.randint(2, size=(784)), phase_train: True})
            

Performing PCA
PCA Codes
[[  6.83314187  -2.69809518]
 [  6.83204746  -2.69698052]
 [  6.81168241  -2.70884073]
 ...
 [-15.09461315  15.5592874 ]
 [-15.07970768  15.53025868]
 [-15.08080209  15.53137334]]


Starting Autoencoder /Users/meihuaren/personal/DL_logs/ae/


INFO:tensorflow:Restoring parameters from /Users/meihuaren/personal/DL_logs/ae/
Model restored from file: None
Running Autoencoder & Autoencoder Codes


[[0.06358596 0.3159923 ]
 [0.06355276 0.31604666]
 [0.06350567 0.316114  ]
 ...
 [0.39423656 0.15631337]
 [0.39152122 0.1568615 ]
 [0.39070725 0.15702535]]
