
### Data Formatting

Run this first cell to generate the data necessary for training the model

In [6]:
import numpy as np
import pandas as pd

#
# Change these to switch around parameters
#

set_size = 5 # Number of consecutive days in a row
data_column = 1 # Index of column to use (S&P - 1)


# Load data from current directory
data = pd.read_csv('./data_stocks.csv')

data = data.values

# Pull specified column of data
snp = data[:,np.arange(data_column, data_column + 1)]

data_set = []

# Loop over full set
for idx in range(len(snp)):

    row = []

    # Check if our range goes over the list length
    if idx + set_size > len(snp):
        break

    # Build a row starting at current
    # index to index + set_size
    for v in range(set_size):
        is_last = v == (set_size - 1)
        
        current_index = idx + v
        current_value = snp[current_index][0]

        # Produce last day predictor if it's the last
        # iteration
#         if is_last:
#             last_value = row[-1]
#             diff = current_value - last_value
            
#             # Reults in either [-1,0,1]
#             predictor = 0 if diff == 0 else (diff/abs(diff))
#             row.append(predictor)
            
#         # Otherwise append the value
#         else:
#             row.append(current_value)

        row.append(current_value)


    row = np.array(row)
    data_set.append(row)


data_set = np.asarray(data_set)

fmt = {'float_kind':'{:0.2f}'.format}
np.set_printoptions(suppress=True, formatter=fmt)

# Prints top 10 records for quick confirmation
print(data_set[0:10])

np.savetxt("output.csv", data_set, delimiter=",", fmt="%10.2f")
print("\n\nSaved data to 'output.csv'")

[[42.33 42.36 42.31 42.37 42.54]
 [42.36 42.31 42.37 42.54 42.54]
 [42.31 42.37 42.54 42.54 42.47]
 [42.37 42.54 42.54 42.47 42.47]
 [42.54 42.54 42.47 42.47 42.39]
 [42.54 42.47 42.47 42.39 42.33]
 [42.47 42.47 42.39 42.33 42.40]
 [42.47 42.39 42.33 42.40 42.29]
 [42.39 42.33 42.40 42.29 42.29]
 [42.33 42.40 42.29 42.29 42.39]]


Saved data to 'output.csv'



### Network

In [7]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib notebook

#
# Change these to switch around parameters
#

number_of_days = 4
percent_training = 0.8
data = pd.read_csv('./output.csv')



data = data.values
np.random.shuffle(data) # Interesting to comment this out


scaler = MinMaxScaler(feature_range=(-1, 1))

# Split data into inputs and outputs
# without scaling the output

# inputs = data[:, np.arange(0,number_of_days - 1)]
# outputs = data[:, number_of_days]
# scaled_input = scaler.fit_transform(inputs)




# Split data into inputs and outputs
# also scaling the output

scaled_data = scaler.fit_transform(data)
scaled_input = scaled_data[:, np.arange(0,number_of_days - 1)]
outputs = scaled_data[:, number_of_days]



# Split Data into train and test
cutoff = int(np.floor(percent_training * len(scaled_input)))

train_inputs = scaled_input[:cutoff]
train_outputs = outputs[:cutoff]

test_inputs = scaled_input[cutoff + 1:]
test_outputs = outputs[cutoff + 1:]

# Setup placeholders for input and output
X = tf.placeholder(tf.float32, [None, number_of_days - 1])
Y = tf.placeholder(tf.float32, [None])
            
# Build basic network with sigmoid activation
# and using the pre-built dense layer
network = tf.layers.dense(X, units=1, activation=tf.nn.sigmoid)
network = tf.layers.dense(network, units=1, activation=tf.nn.sigmoid)

cost = tf.reduce_mean(tf.squared_difference(network, Y))

optimizer = tf.train.RMSPropOptimizer(0.01).minimize(cost)

# Optionally use Gradient Descent Optimizer instead
# optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(cost)
init = tf.global_variables_initializer()

batch_size = 4096

training_error = []
testing_error = []
batch_array = []

with tf.Session() as sess:
    sess.run(init)
   
    for epoch in range(30):
        shuffled_indices = np.random.permutation(np.arange(len(train_inputs)))
        train_inputs = train_inputs[shuffled_indices]
        train_outputs = train_outputs[shuffled_indices]
        
        r = len(train_outputs) // batch_size
        if epoch == 0:
            print(r)
        
        # To keep track of every epoch, uncomment below (takes quite a bit longer)
        # If you do this, make sure to uncomment the graph at the bottom
        
#         train_error = sess.run(cost, feed_dict={X: train_inputs, Y: train_outputs})
#         test_error = sess.run(cost, feed_dict={X: test_inputs, Y: test_outputs})
#         training_error.append(train_error)
#         testing_error.append(test_error)
#         batch_array.append(epoch)
        
#         print("Epoch {}".format(epoch))
#         print(train_error)
#         print(test_error)
#         print("\n")
        
        for i in range(r):
            start = i * batch_size
            batch_x = train_inputs[start:start + batch_size]
            batch_y = train_outputs[start:start + batch_size]
            
            sess.run([optimizer, cost], feed_dict={X: batch_x, Y: batch_y})
            
            
    train_error = sess.run(cost, feed_dict={X: train_inputs, Y: train_outputs})
    test_error = sess.run(cost, feed_dict={X: test_inputs, Y: test_outputs})  
    
    print("Train Error: {}, Test Error: {}".format(train_error, test_error))
             
   
# To graph progress, uncomment below (make sure data is being recorded above!)
# plt.figure()
# plt.title('Error Rate Over Time')
# plt.ylabel('Error')
# plt.xlabel('Batch')
# plt.plot(batch_array, training_error, label='Training')
# plt.plot(batch_array, testing_error, label='Testing')
# plt.legend()
# print("\nTraining Error")
# print(training_error)
# print("\nTestng Error")
# print(testing_error)
        

8
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Train Error: 0.12543058395385742, Test Error: 0.14533810317516327
