In [1]:
import collections as col
import multiprocessing
import os
import threading
from random import choice
from time import sleep
from time import time

import numpy as np
import pandas as pd
import scipy.signal
import tensorflow as tf

from env import stock

In [2]:
class A3C:
    """Actor-critic network (policy head + value head) built in its own variable scope.

    Every instance creates an input placeholder, a stack of ReLU dense layers,
    a softmax policy head and a linear value head. Instances whose scope is not
    ``'global'`` additionally build the loss, compute gradients with respect to
    their own variables and create an op that applies those gradients to the
    variables living in the ``'global'`` scope.
    """

    def __init__(self, opt, scope, inputs=12, outputs=3, dropout=0.5, depth=2):
        """Build the graph for one network.

        Args:
            opt: optimizer exposing ``apply_gradients`` (only used when
                ``scope != 'global'``).
            scope: variable-scope name; ``'global'`` marks the shared network.
            inputs: width of the observation vector.
            outputs: number of discrete actions (size of the softmax).
            dropout: accepted for interface compatibility but currently unused;
                no dropout layer is built.
            depth: number of hidden dense layers. Layer ``i`` has
                ``2 ** (depth + 2 - i)`` units. At least one hidden layer is
                always created.
        """
        # 1e-50 is not representable in float32 (it underflows to 0), so it gave no
        # protection against log(0). 1e-10 is representable and still negligible.
        eps = 1e-10

        with tf.variable_scope(scope):
            self.inp = tf.placeholder(dtype=tf.float32, shape=[None, inputs])

            hidden = tf.layers.dense(self.inp, 2**(depth+2), activation=tf.nn.relu)
            for i in range(1, depth):
                # Chain each layer onto the previous one. The original code built
                # these layers but never used their output, so the heads were
                # attached to the first layer only.
                # NOTE: this changes the head input width, so checkpoints saved by
                # the previous graph are not shape-compatible.
                hidden = tf.layers.dense(hidden, 2**(depth+2-i), activation=tf.nn.relu)

            self.policy = tf.layers.dense(hidden, outputs, activation=tf.nn.softmax)
            self.value = tf.layers.dense(hidden, 1, activation=None)

            if scope != 'global':
                self.advantages = tf.placeholder(dtype=tf.float32, shape=[None])
                self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
                self.actions_onehot = tf.one_hot(self.actions, outputs, dtype=tf.float32)
                self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)

                # Probability the policy assigned to the action actually taken.
                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

                self.value_loss = 0.5 * tf.reduce_sum(
                    tf.square(self.target_v - tf.reshape(self.value, [-1])))
                self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy + eps))
                self.policy_loss = -tf.reduce_sum(
                    tf.log(self.responsible_outputs + eps) * self.advantages)
                # The entropy term is subtracted so that higher entropy lowers the loss.
                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, 40.0)

                # Gradients come from the local copy but are applied to the shared
                # variables; this relies on both scopes creating variables in the
                # same order.
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = opt.apply_gradients(zip(grads, global_vars))

In [3]:
class Worker:
    """A single A3C worker: a local network copy plus its own ``stock`` environment.

    ``play`` repeatedly syncs the local network from the ``'global'`` scope, rolls
    out one episode against the environment, and pushes the resulting gradients
    to the global network.
    """

    def __init__(self, name, opt, path, global_eps, input_size=12, output_size=3):
        """Create the local network, the sync ops and the environment.

        Args:
            name: worker index; the variable scope becomes ``"w_<name>"``.
            opt: optimizer handed to the local ``A3C`` network.
            path: base directory; checkpoints are written under ``<path>/model``.
            global_eps: variable holding the global episode counter.
            input_size: width of the observation vector.
            output_size: number of discrete actions.
        """
        self.name = "w_" + str(name)
        self.number = name
        self.path = path
        self.opt = opt
        self.glob_eps = global_eps
        self.increment = self.glob_eps.assign_add(1)
        self.rewards = []
        self.lengths = []
        self.mean_vals = []
        # output_size used to be accepted but ignored; forward it to the network.
        self.a3c = A3C(opt, self.name, inputs=input_size, outputs=output_size)
        self.update_local_ops = self.update_target_graph('global', self.name)
        self.stock = stock()
        self.stock.get_first_ohlc()

    def play(self, gamma, sess, coord, saver):
        """Run episodes until ``coord`` requests a stop, training after each one.

        Args:
            gamma: discount factor for returns and advantages.
            sess: session used to run the graph ops.
            coord: coordinator polled via ``should_stop()``.
            saver: saver used by worker ``w_0`` to checkpoint every 10 episodes.
        """
        num_eps = 0
        episode_rewards = []
        while not coord.should_stop():
            sess.run(self.update_local_ops)
            inputs = self.stock.next_data()

            # Keep each column of the rollout in its own homogeneous list. Packing
            # mixed rows into one np.array produces a ragged object array, which
            # recent NumPy versions reject.
            ep_inputs, ep_actions, ep_rewards, ep_values = [], [], [], []
            episode_reward = 0

            # An empty observation ends the episode
            # (presumably end of the trading day -- confirm against env.stock).
            while len(inputs) != 0:
                a_dist, v = sess.run([self.a3c.policy, self.a3c.value],
                                     feed_dict={self.a3c.inp: [inputs]})
                probs = a_dist[0]
                with np.errstate(invalid='raise'):
                    try:
                        # Sample the action index directly. Sampling a probability
                        # value and mapping it back with argmax always picked the
                        # first action when two probabilities were equal.
                        a = int(np.random.choice(len(probs), p=probs))
                    except (ValueError, FloatingPointError):
                        # Invalid distribution (e.g. NaNs from a diverged network):
                        # abandon the rest of this episode, as before.
                        break
                r = self.stock.make_choice(a)
                ep_inputs.append(inputs)
                ep_actions.append(a)
                ep_rewards.append(r)
                ep_values.append(v[0, 0])
                episode_reward += r
                inputs = self.stock.next_data()

            self.stock.get_new_day()
            self.stock.get_first_ohlc()
            episode_rewards.append(episode_reward)

            if len(ep_actions) != 0:
                rewards = np.asarray(ep_rewards, dtype=np.float64)
                values = np.asarray(ep_values, dtype=np.float64)
                actions = np.asarray(ep_actions, dtype=np.int32)

                self.rewards.append(episode_reward)
                self.lengths.append(len(ep_actions))
                self.mean_vals.append(np.mean(values))

                # Bootstrap value of 0 after the last step of the episode.
                rewards_plus = np.append(rewards, 0.0)
                discounted_rewards = self.discount(rewards_plus, gamma)[:-1]
                value_plus = np.append(values, 0.0)
                # One-step TD errors, then discounted along the episode.
                advantages = rewards + gamma * value_plus[1:] - value_plus[:-1]
                advantages = self.discount(advantages, gamma)

                feed_dict = {self.a3c.inp: np.vstack(ep_inputs),
                             self.a3c.target_v: discounted_rewards,
                             self.a3c.actions: actions,
                             self.a3c.advantages: advantages}

                v_l, p_l, e_l, g_n, v_n, _ = sess.run([self.a3c.value_loss,
                                                       self.a3c.policy_loss,
                                                       self.a3c.entropy,
                                                       self.a3c.grad_norms,
                                                       self.a3c.var_norms,
                                                       self.a3c.apply_grads],
                                                      feed_dict=feed_dict)

                num_eps += 1
                # Only the first worker bumps the global counter, logs and checkpoints.
                if self.name == 'w_0':
                    sess.run(self.increment)
                    print(col.Counter(ep_actions))
                    if num_eps % 10 == 0:
                        saver.save(sess, self.path+'/model/ac.cptk')
                        print(np.mean(episode_rewards[-5:]))

    def discount(self, x, gamma):
        """Return the discounted cumulative sum of ``x``: ``y[t] = x[t] + gamma * y[t+1]``."""
        return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

    def update_target_graph(self, from_scope, to_scope):
        """Build ops that copy every trainable variable of ``from_scope`` into ``to_scope``.

        Variables are paired positionally, so both scopes must have created their
        variables in the same order.

        Returns:
            list of assign ops, one per variable pair.
        """
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
        return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]


In [None]:
# Training configuration.
gamma = .99          # discount factor for returns and advantages
s_size = 14          # width of the observation vector fed to the networks
a_size = 3           # number of discrete actions
load_model = False   # restore the latest checkpoint instead of initialising fresh
model_path = 'graph'
# Worker.play writes checkpoints to <model_path>/model/ac.cptk, so restore from
# (and pre-create) that same directory.
checkpoint_dir = os.path.join(model_path, 'model')
os.makedirs(checkpoint_dir, exist_ok=True)

with tf.device("/cpu:0"):
    global_episodes = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
    trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
    # Shared network; workers push gradients into it and pull weights from it.
    master_network = A3C(trainer, 'global', inputs=s_size, outputs=a_size)
    num_workers = multiprocessing.cpu_count()  # one worker per available CPU thread
    workers = [
        Worker(i, trainer, model_path, global_episodes,
               input_size=s_size, output_size=a_size)
        for i in range(num_workers)
    ]
    saver = tf.train.Saver(max_to_keep=5)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
        if ckpt is None or not ckpt.model_checkpoint_path:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError('No checkpoint found in ' + checkpoint_dir)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    worker_threads = []
    for worker in workers:
        # Bind the method and its arguments directly. A lambda closing over the
        # loop variable `worker` is late-bound and can end up running a different
        # worker than intended.
        t = threading.Thread(target=worker.play, args=(gamma, sess, coord, saver))
        t.start()
        sleep(0.5)  # stagger worker start-up
        worker_threads.append(t)
    coord.join(worker_threads)

This was an A3C algorithm designed to try to buy and sell on the foreign-exchange market. Using an external state class, it would attempt to decide whether to buy, sell, or do nothing. It was trained using the USD-GBP forex trading pair. It ultimately failed to produce an effective model. I believe the reason for this is twofold:

1) The predictions from my RNN linear regression weren't accurate enough (more on that in the RNN repo)
2) There needed to be more data to draw a conclusion from

I believe that, had I done more research into current stock forecasting, this could have been more effective. 

That being said, the best I could get the model to do was to never buy or sell anything. This is the strategy I would suggest for anyone trying to earn money through the stock market as naively as I was.