In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import tensorflow as tf
from tradingenv import StockTradingEnv
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

import collections
import itertools
from lib import plotting

In [3]:
# Load the AAPL price history and build the trading environment.
df = pd.read_csv('./data/AAPL.csv')
# sort_values returns a new frame; the result has to be assigned back,
# otherwise the rows are never actually ordered by date.
df = df.sort_values('Date')
# Dates are compared as strings, which works because the column holds
# ISO 'YYYY-MM-DD' values. reset_index() keeps the old row labels as an
# extra 'index' column.
df_recent = df[df['Date'] >= '2008-01-01'].reset_index()
print(df_recent)
# env = DummyVecEnv([lambda:StockTradingEnv(df)])
env = StockTradingEnv(df_recent)
observation = env.reset()

      index  Unnamed: 0        Date    Open      High       Low   Close  \
0      2514        2514  2008-01-02  199.27  200.2600  192.5500  194.84   
1      2515        2515  2008-01-03  195.41  197.3900  192.6900  194.93   
2      2516        2516  2008-01-04  191.45  193.0000  178.8900  180.05   
3      2517        2517  2008-01-07  181.25  183.6000  170.2300  177.64   
4      2518        2518  2008-01-08  180.14  182.4600  170.8000  171.25   
...     ...         ...         ...     ...       ...       ...     ...   
2736   5250        5250  2018-11-12  199.00  199.8500  193.7900  194.17   
2737   5251        5251  2018-11-13  191.63  197.1800  191.4501  192.23   
2738   5252        5252  2018-11-14  193.90  194.4800  185.9300  186.80   
2739   5253        5253  2018-11-15  188.39  191.9700  186.9000  191.41   
2740   5254        5254  2018-11-16  190.50  194.9695  189.4600  193.53   

          Volume  
0     38542100.0  
1     30073800.0  
2     51994000.0  
3     74006900.0  
4   

In [None]:
class PolicyEstimator():
    """
    Policy function approximator.

    Builds a TF1 graph that turns an integer state vector into a probability
    distribution over actions and exposes a policy-gradient training op.

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.
        scope: Variable scope under which every graph node is created.
        num_actions: Number of discrete actions (width of the output layer).
            When None (default) it is read from the notebook-level
            `env.action_space.n`, which is what this class always did before.
        state_size: Length of the integer state vector fed to `state`.
        one_hot_depth: Depth of the one-hot encoding applied to each entry
            of the state vector.
    """

    def __init__(self, learning_rate=0.01, scope="policy_estimator",
                 num_actions=None, state_size=36, one_hot_depth=5):
        if num_actions is None:
            # Backward-compatible fallback: rely on the global environment
            # only when the caller did not say how many actions there are.
            num_actions = env.action_space.n

        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.int32, [state_size], "state")
            self.action = tf.placeholder(dtype=tf.int32, name="action")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # One-hot encode every state entry, then apply a single linear
            # layer (no activation, zero-initialised weights).
            state_one_hot = tf.one_hot(self.state, one_hot_depth)
            self.output_layer = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(state_one_hot, 0),
                num_outputs=num_actions,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)

            # Softmax the layer output, then sum over axis 0 and divide by the
            # grand total so the final vector sums to one.
            self.action_probs = tf.squeeze(tf.nn.softmax(self.output_layer))
            self.action_probs = tf.reduce_sum(self.action_probs, 0)/tf.reduce_sum(self.action_probs)

            self.picked_action_prob = tf.gather(self.action_probs, self.action)

            # Policy-gradient loss: -log pi(a|s) weighted by the target.
            self.loss = -tf.log(self.picked_action_prob) * self.target

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.contrib.framework.get_global_step())

    def predict(self, state, sess=None):
        """
        Evaluate the action probabilities for `state`.

        Args:
            state: Value fed to the `state` placeholder.
            sess: Session to run in; falls back to the default session.

        Returns:
            Whatever the session returns for `action_probs`.
        """
        sess = sess or tf.get_default_session()

        return sess.run(self.action_probs, { self.state: state })

    def update(self, state, target, action, sess=None):
        """
        Run one optimisation step of the policy.

        Args:
            state: Value fed to the `state` placeholder.
            target: Weight applied to the log-probability of `action`.
            action: Index of the action that was taken.
            sess: Session to run in; falls back to the default session.

        Returns:
            The loss value computed in the same session call as the train op.
        """
        sess = sess or tf.get_default_session()
        feed_dict = { self.state: state, self.target: target, self.action: action  }
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

class ValueEstimator():
    """
    Value function approximator.

    Builds a TF1 graph that maps an integer state vector to a value estimate
    and exposes a squared-error training op.

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.
        scope: Variable scope under which every graph node is created.
        state_size: Length of the integer state vector fed to `state`.
        one_hot_depth: Depth of the one-hot encoding applied to each entry
            of the state vector.
    """

    def __init__(self, learning_rate=0.1, scope="value_estimator",
                 state_size=36, one_hot_depth=5):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.int32, [state_size], "state")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # One-hot encode every state entry, then apply a single linear
            # layer (no activation, zero-initialised weights).
            state_one_hot = tf.one_hot(self.state, one_hot_depth)
            self.output_layer = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(state_one_hot, 0),
                num_outputs=1,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)

            # NOTE(review): with a vector-valued state the squeezed output
            # presumably has one entry per state component rather than being
            # a single scalar - confirm this is intended.
            self.value_estimate = tf.squeeze(self.output_layer)
            self.loss = tf.squared_difference(self.value_estimate, self.target)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.contrib.framework.get_global_step())

    def predict(self, state, sess=None):
        """
        Evaluate the value estimate for `state`.

        Args:
            state: Value fed to the `state` placeholder.
            sess: Session to run in; falls back to the default session.

        Returns:
            Whatever the session returns for `value_estimate`.
        """
        sess = sess or tf.get_default_session()
        return sess.run(self.value_estimate, { self.state: state })

    def update(self, state, target, sess=None):
        """
        Run one optimisation step towards `target`.

        Args:
            state: Value fed to the `state` placeholder.
            target: Regression target for the value estimate.
            sess: Session to run in; falls back to the default session.

        Returns:
            The loss value computed in the same session call as the train op.
        """
        sess = sess or tf.get_default_session()
        feed_dict = { self.state: state, self.target: target }
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

def reinforce(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    """
    REINFORCE (Monte Carlo Policy Gradient) Algorithm. Optimizes the policy
    function approximator using policy gradient.

    Each episode is rolled out to completion by sampling actions from the
    policy; afterwards the discounted return of every timestep is used to
    update the value estimator (baseline) and, minus the baseline, the policy.

    Args:
        env: Environment exposing `reset()` and `step(action)`, where `step`
            returns `(next_state, reward, done, info)`.
        estimator_policy: Policy function to be optimized; needs
            `predict(state)` and `update(state, target, action)`.
        estimator_value: Value function approximator, used as a baseline;
            needs `predict(state)` and `update(state, target)`.
        num_episodes: Number of episodes to run for.
        discount_factor: Time-discount factor.

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    for i_episode in range(num_episodes):
        # Reset the environment and start a fresh trajectory
        state = env.reset()

        episode = []

        # Roll the episode out until the environment reports `done`
        for t in itertools.count():

            # Sample an action index from the current policy distribution
            action_probs = estimator_policy.predict(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            # Keep track of the transition
            episode.append(Transition(
              state=state, action=action, reward=reward, next_state=next_state, done=done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Progress line. The reward shown belongs to the previous episode
            # (index i_episode - 1); during the first episode that index wraps
            # to the last, still-zero, slot.
            print("\rStep {} @ Episode {}/{} ({})".format(
                    t, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]), end="")

            if done:
                break

            state = next_state

        # Discounted return of every timestep, computed in one backward pass
        # (G_t = r_t + gamma * G_{t+1}) instead of re-summing the tail of the
        # episode for each timestep.
        returns = [0.0] * len(episode)
        running_return = 0.0
        for idx in range(len(episode) - 1, -1, -1):
            running_return = episode[idx].reward + discount_factor * running_return
            returns[idx] = running_return

        # Go through the episode and make policy updates
        for transition, total_return in zip(episode, returns):
            # Baseline is evaluated before the value estimator is updated
            baseline_value = estimator_value.predict(transition.state)
            advantage = total_return - baseline_value
            # Update our value estimator
            estimator_value.update(transition.state, total_return)
            # Update our policy estimator
            estimator_policy.update(transition.state, advantage, transition.action)

    return stats

In [None]:
# 1. REINFORCE
tf.reset_default_graph()

# Created before the estimators; their optimizers ask for the global step via
# tf.contrib.framework.get_global_step() and presumably pick this variable up.
global_step = tf.Variable(0, name="global_step", trainable=False)
policy_estimator = PolicyEstimator()
value_estimator = ValueEstimator()

with tf.Session() as sess:
    # tf.initialize_all_variables() is deprecated; this is its replacement.
    sess.run(tf.global_variables_initializer())
    # Note, due to randomness in the policy the number of episodes you need to learn a good
    # policy may vary. ~2000-5000 seemed to work well for me.
    stats = reinforce(env, policy_estimator, value_estimator, 2000, discount_factor=1.0)

In [None]:
# Plot the statistics returned by reinforce() using the helper from lib.plotting;
# smoothing_window=25 is passed straight through to that helper.
plotting.plot_episode_stats(stats, smoothing_window=25)

In [None]:
# Summary of the per-episode total rewards collected by reinforce().
# The three printed values are scalars, so no numpy print options are needed.
episode_rewards = stats.episode_rewards
print(len(episode_rewards))
print(max(episode_rewards))
print(min(episode_rewards))

# The sorted-reward curve and the histogram have incompatible axes (rank vs.
# reward value on x), so each gets its own panel instead of sharing one.
fig, (ax_sorted, ax_hist) = plt.subplots(1, 2, figsize=(12, 4))

ax_sorted.plot(sorted(episode_rewards))
ax_sorted.set(title="Episode rewards (sorted)", xlabel="Rank", ylabel="Total reward")

ax_hist.hist(episode_rewards)
ax_hist.set(title="Episode reward distribution", xlabel="Total reward", ylabel="Episodes")

plt.show()

In [4]:
# 2. DQN
# Build a stable-baselines DQN agent with an MLP policy on the same `env`
# object used above, train it for 25000 timesteps, then save it to disk.
model = DQN(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
# NOTE(review): the checkpoint name says "cartpole" although the agent is
# trained on the stock trading env. A later cell loads this exact name, so
# rename both together if it is ever changed.
model.save("deepq_cartpole")








Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where










  prev_cost + additional_cost) / (self.shares_held + shares_bought)


--------------------------------------
| % time spent exploring  | 80       |
| episodes                | 100      |
| mean 100 episode reward | 3.52e+03 |
| steps                   | 495      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 60       |
| episodes                | 200      |
| mean 100 episode reward | 3.62e+03 |
| steps                   | 995      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 41       |
| episodes                | 300      |
| mean 100 episode reward | 3.58e+03 |
| steps                   | 1495     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 21       |
| episodes                | 400      |
| mean 100 episode reward | 3.54e+03 |
| steps                   | 1995     |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 3700     |
| mean 100 episode reward | 3.64e+03 |
| steps                   | 18495    |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 3800     |
| mean 100 episode reward | 3.48e+03 |
| steps                   | 18995    |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 3900     |
| mean 100 episode reward | 3.57e+03 |
| steps                   | 19495    |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 4000     |
| mean 100 episode reward | 3.68e+03 |
| steps                   | 19995    |
--------------------------------------
--------------------------------------
| % time spent exploring 

In [None]:
# Reload the saved DQN agent and replay a single episode with rendering.
model = DQN.load("deepq_cartpole")

obs = env.reset()
# Stop once the environment reports the episode is over; an unconditional
# `while True` would never finish and would keep stepping a finished episode.
dones = False
while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()