In [1]:
import datetime as dt
import yfinance as yf

In [2]:
import numpy as np
import pandas as pd
import gym
from gym import spaces

class StockTradingEnv(gym.Env):
    """
    A single-asset stock trading environment for OpenAI gym.

    Each step the agent observes the opening price of the current row of
    ``df`` and chooses one of three discrete actions:

    * ``0`` - buy one share (only if the cash balance exceeds the price)
    * ``1`` - sell one share (only if at least one share is held)
    * ``2`` - hold

    Trades execute at a price drawn uniformly between the row's ``Open`` and
    ``Close``. Rows are addressed by position, so ``df`` may carry any index.

    The module-level names ``MAX_STEPS`` and ``INITIAL_ACCOUNT_BALANCE`` are
    read at call time by ``step``, ``reset`` and ``render``; they must be
    defined before the environment is used.
    """
    # - human: render to the current display or terminal and return nothing.
    # Usually for human consumption.
    metadata = {'render.modes': ['human']}

    # Discrete action codes: Buy(0), Sell(1), Hold(2).
    BUY = 0
    SELL = 1
    HOLD = 2

    def __init__(self, df):
        """
        Parameters
        ----------
        df : pandas.DataFrame
            Price data with at least ``Open`` and ``Close`` columns, one row
            per time step.
        """
        super().__init__()

        self.df = df

        # Actions of the format Buy(0), Sell(1), and Hold(2)
        self.action_space = spaces.Discrete(3)

        # The observation is the stock's opening price. float32 is used
        # because float16 carries only about three significant decimal
        # digits, which would round price bounds in the thousands.
        self.observation_space = spaces.Box(
            low=df['Open'].min(), high=df['Open'].max(), shape=(1,), dtype=np.float32)

    def _next_observation(self):
        """Return the opening price at ``current_step`` as a shape-(1,) array."""
        # Positional lookup: current_step counts rows, it is not an index label.
        price = self.df['Open'].iloc[self.current_step]

        # Wrap in an array so the value matches the declared observation shape.
        return np.array([price], dtype=np.float32)

    def _take_action(self, action):
        """Apply ``action`` at a random price within the current row and
        refresh ``net_worth``."""
        # Set the current price to a random price within the time step
        current_price = np.random.uniform(
            self.df['Open'].iloc[self.current_step],
            self.df['Close'].iloc[self.current_step])

        if action == self.BUY:
            if self.balance > current_price:
                # Buy 1 share of stock
                shares_bought = 1
                self.balance -= shares_bought * current_price
                self.shares_held += shares_bought

        elif action == self.SELL:
            if self.shares_held > 0:
                # Sell 1 share of stock
                shares_sold = 1
                self.balance += shares_sold * current_price
                self.shares_held -= shares_sold

        # Any other action (HOLD) leaves the position untouched.
        self.net_worth = self.balance + self.shares_held * current_price

    def step(self, action):
        """
        Execute one time step within the environment.

        Returns
        -------
        tuple
            ``(observation, reward, done, info)``. When the data is exhausted
            the observation is a zero placeholder, the reward is 0 and
            ``done`` is True.
        """
        self._take_action(action)

        self.current_step += 1

        if self.current_step >= self.df.shape[0]:
            # Ran past the last row: there is no price left to observe.
            return np.zeros(1, dtype=np.float32), 0, True, {}

        delay_modifier = (self.current_step - self.start) / MAX_STEPS

        # NOTE(review): the reward is based on the cash balance, not on
        # net_worth, so buying shares lowers the reward - confirm intended.
        reward = self.balance * delay_modifier
        done = bool(self.net_worth <= 0)

        obs = self._next_observation()

        return obs, reward, done, {}

    def reset(self):
        """Reset the account and jump to a random starting row; return the
        first observation."""
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0

        # Start at a random row among the first 100 (positions 0-99), clamped
        # so that frames shorter than 100 rows never yield an out-of-range start.
        start_upper_bound = max(1, min(100, self.df.shape[0]))
        self.current_step = np.random.randint(0, start_upper_bound)
        self.start = self.current_step

        return self._next_observation()

    def render(self, mode = 'human', show = False):
        """Print the account state to stdout, but only when ``show`` is True."""
        if show:
            profit = self.net_worth - INITIAL_ACCOUNT_BALANCE
            print(f'Step: {self.current_step - self.start}')
            print(f'Balance: {self.balance}')
            print(
                f'Shares held: {self.shares_held}')
            print(
                f'Net worth: {self.net_worth}')
            print(f'Profit: {profit}')
            print('-----------------------------------------------------------------')

In [3]:
# Divisor used by the environment to scale the per-step reward.
MAX_STEPS = 20_000
# Cash the account holds at the start of every episode.
INITIAL_ACCOUNT_BALANCE = 10_000

In [4]:
# Download daily price bars. NOTE: despite the variable name used below,
# '^GSPC' is the S&P 500 index ticker, not Apple.
raw_prices = yf.download(
    '^GSPC',
    start='2016-01-02',
    end='2018-11-16',
    progress=False,
)

# Rebuild the frame from the raw values so the downloaded index is replaced by
# a default 0..n-1 integer index.
aapl_df = pd.DataFrame(raw_prices.to_numpy(), columns=raw_prices.columns.to_numpy())
aapl_df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,2038.199951,2038.199951,1989.680054,2012.660034,2012.660034,4.304880e+09
1,2013.780029,2021.939941,2004.170044,2016.709961,2016.709961,3.706620e+09
2,2011.709961,2011.709961,1979.050049,1990.260010,1990.260010,4.336660e+09
3,1985.319946,1985.319946,1938.829956,1943.089966,1943.089966,5.076590e+09
4,1945.969971,1960.400024,1918.459961,1922.030029,1922.030029,4.664940e+09
...,...,...,...,...,...,...
720,2794.100098,2794.100098,2764.239990,2781.010010,2781.010010,4.019090e+09
721,2773.929932,2775.989990,2722.000000,2726.219971,2726.219971,3.670930e+09
722,2730.050049,2754.600098,2714.979980,2722.179932,2722.179932,4.091440e+09
723,2737.899902,2746.800049,2685.750000,2701.580078,2701.580078,4.402370e+09


In [5]:
# Work on a copy so the downloaded frame is left untouched.
df = aapl_df.copy()
env = StockTradingEnv(df=df)

# Sanity check: (rows, columns) of the data the environment will step through.
env.df.shape

(725, 6)

In [6]:
# --- Q-learning hyperparameters ---
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 25000
SHOW_EVERY = 1000

# --- State discretisation ---
# The observed price range is split into 199 equal-width windows.
DISCRETE_OS_SIZE = [199]
observation_range = env.observation_space.high - env.observation_space.low
discrete_os_win_size = observation_range / DISCRETE_OS_SIZE
# One extra bin so that the maximum price, which maps to index 199, is valid.
DISCRETE_OS_SIZE[0] += 1

# --- Exploration settings ---
epsilon = 1  # not a constant, going to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
decay_span = END_EPSILON_DECAYING - START_EPSILON_DECAYING
epsilon_decay_value = epsilon / decay_span

# --- Q-table: one row per price bin, one column per action ---
q_table = np.random.uniform(
    low=-2,
    high=0,
    size=(*DISCRETE_OS_SIZE, env.action_space.n),
)


def get_discrete_state(state):
    """
    Map a continuous observation to its Q-table bin index.

    Parameters
    ----------
    state : float or array-like
        Observation returned by the environment (a price).

    Returns
    -------
    tuple of int
        Bin index tuple, usable directly as an index into ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # The ``np.int`` alias was removed in NumPy 1.24; the builtin ``int`` is
    # what it aliased, so the resulting dtype is unchanged.
    return tuple(discrete_state.astype(int))


for episode in range(EPISODES):
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Every SHOW_EVERY-th episode is announced and passed to env.render().
    render = episode % SHOW_EVERY == 0
    if render:
        print('This is episode', episode)

    while not done:
        # Epsilon-greedy action selection.
        if np.random.random() <= epsilon:
            # Explore: pick a random action
            action = np.random.randint(0, env.action_space.n)
        else:
            # Exploit: pick the best known action from the Q table
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        new_discrete_state = get_discrete_state(new_state)

        if render:
            # Called with the default show=False.
            env.render()

        # The Q table is only updated while the episode is still running.
        if not done:
            state_action = discrete_state + (action,)

            # Best Q value reachable from the new state
            max_future_q = np.max(q_table[new_discrete_state])

            # Q value of the state/action pair that was just taken
            current_q = q_table[state_action]

            # Blend the old estimate with the bootstrapped target
            td_target = reward + DISCOUNT * max_future_q
            q_table[state_action] = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * td_target

        discrete_state = new_discrete_state

    # Decay epsilon once per episode while inside the decaying range
    if START_EPSILON_DECAYING <= episode <= END_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value


env.close()

This is episode 0
This is episode 1000
This is episode 2000
This is episode 3000
This is episode 4000
This is episode 5000
This is episode 6000
This is episode 7000
This is episode 8000
This is episode 9000
This is episode 10000
This is episode 11000
This is episode 12000
This is episode 13000
This is episode 14000
This is episode 15000
This is episode 16000
This is episode 17000
This is episode 18000
This is episode 19000
This is episode 20000
This is episode 21000
This is episode 22000
This is episode 23000
This is episode 24000


In [7]:
# Highest-valued action for every price bin of the Q table.
q_table.argmax(axis=1)

array([2, 2, 2, 1, 1, 0, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 0,
       1, 1, 1, 1, 2, 0, 1, 2, 1, 1, 1, 1, 0, 2, 2, 2, 1, 2, 2, 2, 2, 1,
       2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2,
       2, 1, 1, 0, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2,
       2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 0, 2, 2, 1, 2, 1, 2, 1, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1,
       2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1,
       1, 2])

In [12]:
# Roll out one episode, always taking the highest-valued action in the Q table.
done = False
discrete_state = get_discrete_state(env.reset())
while not done:
    action = q_table[discrete_state].argmax()
    new_state, reward, done, _ = env.step(action)
    env.render()
    discrete_state = get_discrete_state(new_state)

# Profit of this single episode relative to the starting balance.
env.net_worth - INITIAL_ACCOUNT_BALANCE

8.160995138694489

In [9]:
# Baseline: roll out one episode with actions sampled from the action space.
env.reset()
done = False
while not done:
    action = env.action_space.sample()
    new_state, reward, done, _ = env.step(action)
    env.render()

# Profit of this single episode relative to the starting balance.
env.net_worth - INITIAL_ACCOUNT_BALANCE

2263.823690294692