In [102]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [2]:
# Integer codes for the three actions; Environment.run compares the agent's
# chosen action index against these.
BUY = 0
SELL = 1
HOLD = 2

# Number of consecutive prices in one window; used as n_steps for
# split_sequence and as the default state_size of Trader and Environment.
WINDOW_SIZE = 20

In [3]:
def split_sequence(sequence, n_steps):
    """Slice `sequence` into every full, overlapping window of `n_steps` items.

    Windows start at consecutive positions (stride 1); a start position is
    dropped as soon as its window would run past the end of the sequence.

    Args:
        sequence: Sliceable, sized collection of observations.
        n_steps: Number of items per window.

    Returns:
        np.ndarray built from the list of windows, in start-position order.
    """
    total = len(sequence)
    windows = [
        sequence[start:start + n_steps]
        for start in range(total)
        if start + n_steps <= total
    ]
    return np.array(windows)

In [4]:
# Fetch the 'MSFT' column of the data frame stored by the latest successful
# TraderFlow run, drop missing values and cache the series on disk.
from metaflow import Flow

run = Flow('TraderFlow').latest_successful_run
df = run.data.df['MSFT'].dropna()
# NOTE(review): the cache file is called 'tsla.pkl' although it holds the
# 'MSFT' column; the following cell reads the same filename, so rename both
# together if this is cleaned up.
df.to_pickle('tsla.pkl')

In [5]:
# Reload the cached series and cut it into overlapping WINDOW_SIZE-long
# windows; Environment later feeds these rows to the agent one at a time.
X = split_sequence(pd.read_pickle('tsla.pkl'), WINDOW_SIZE)
X

array([[ 37.56,  37.5 ,  36.23, ...,  31.97,  31.77,  31.6 ],
       [ 37.5 ,  36.23,  36.61, ...,  31.77,  31.6 ,  31.48],
       [ 36.23,  36.61,  35.38, ...,  31.6 ,  31.48,  33.11],
       ...,
       [157.58, 160.09, 162.09, ..., 170.23, 174.38, 180.12],
       [160.09, 162.09, 161.34, ..., 174.38, 180.12, 179.9 ],
       [162.09, 161.34, 163.28, ..., 180.12, 179.9 , 183.63]])

In [10]:
# Scratch check: np.amax returns the largest element of an array
# (Trader.replay uses it to pick the best predicted Q-value).
np.amax(np.array([7,8,9]))

9

In [11]:
# https://towardsdatascience.com/reinforcement-learning-tutorial-part-3-basic-deep-q-learning-186164c3bf4
# https://medium.com/@gtnjuvin/my-journey-into-deep-q-learning-with-keras-and-gym-3e779cc12762
# https://quantdare.com/deep-reinforcement-trading/
# https://github.com/edwardhdlu/q-trader

class Trader:
    """Epsilon-greedy deep Q-learning agent backed by a small dense network.

    The agent maps a state (one row of `state_size` values, shaped
    (1, state_size)) to one Q-value per action and returns an action index in
    range(action_size). Transitions are kept in a bounded replay memory and
    learned from in `replay`.
    """

    def __init__(self,
                 discount_rate=0.95,
                 exploration_rate=1.,
                 exploration_decay=.995,
                 state_size=WINDOW_SIZE,
                 action_size=3,
                 learning_rate=0.001):
        """Store the hyper-parameters and build the Q-network.

        Args:
            discount_rate: Weight of the best next-state Q-value in the target.
            exploration_rate: Initial probability of taking a random action.
            exploration_decay: Factor the exploration rate is multiplied by
                after every successful `replay` call.
            state_size: Number of inputs of the network.
            action_size: Number of outputs of the network (one per action).
            learning_rate: Learning rate handed to the Adam optimizer.
        """
        self.discount_rate     = discount_rate
        self.exploration_rate  = exploration_rate
        self.exploration_decay = exploration_decay
        self.state_size        = state_size
        self.action_size       = action_size
        # Bounded buffer: once 1000 transitions are stored the oldest are dropped.
        self.memory            = deque(maxlen=1000)
        self.learning_rate     = learning_rate

        self.model = self.build_model()

    def build_model(self):
        """Return a compiled 64-32-8 ReLU network with a linear Q-value head."""
        model = Sequential()

        model.add(Dense(64, input_dim=self.state_size, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(8, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        model.compile(loss='mse', optimizer=self._make_optimizer())

        return model

    def _make_optimizer(self):
        """Create the Adam optimizer across Keras versions.

        Newer Keras releases name the argument `learning_rate` (and reject
        `lr`); older ones only accept `lr` and raise TypeError otherwise.
        """
        try:
            return Adam(learning_rate=self.learning_rate)
        except TypeError:
            return Adam(lr=self.learning_rate)

    def get_next_action(self, state):
        """Pick an action: random with probability `exploration_rate`, else greedy."""
        if random.random() > self.exploration_rate:
            return self.model_action(state)
        else:
            return self.random_action()

    def model_action(self, state):
        """Return the index of the highest predicted Q-value for `state`."""
        # verbose=0 keeps per-call progress bars out of the notebook output.
        return np.argmax(self.model.predict(state, verbose=0)[0])

    def random_action(self):
        """Return a uniformly random action index."""
        return random.randrange(self.action_size)

    def remember(self, state, next_state, action, reward):
        """Append one transition to the replay memory."""
        self.memory.append((state, next_state, action, reward))

    def replay(self, sample_batch_size):
        """Train on a random batch of remembered transitions.

        Does nothing (and does not decay exploration) while fewer than
        `sample_batch_size` transitions are stored. Otherwise each sampled
        transition is fitted individually towards
        reward + discount_rate * max Q(next_state), and the exploration rate
        is decayed once at the end.

        Args:
            sample_batch_size: Number of transitions to sample.
        """
        if len(self.memory) < sample_batch_size:
            return

        sample_batch = random.sample(self.memory, sample_batch_size)

        for (state, next_state, action, reward) in sample_batch:
            # A transition without a state cannot be trained on.
            if len(state) == 0:
                continue

            # An empty next_state marks the end of the data: there is no
            # future value to bootstrap from, so the target is just the
            # reward. Checking before predicting avoids calling the model on
            # an empty batch and keeps the final transition in training
            # instead of discarding it.
            if len(next_state) == 0:
                future_value = 0.0
            else:
                next_q = self.model.predict(next_state, verbose=0)
                future_value = np.amax(next_q[0]) if len(next_q) else 0.0

            target = reward + (self.discount_rate * future_value)

            # Only the taken action's Q-value is moved; the other outputs
            # keep their current predictions as targets.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)

        self.update_rate()

    def update_rate(self):
        """Decay the exploration rate towards zero to reduce random actions."""
        self.exploration_rate *= self.exploration_decay

In [130]:
class Environment:
    """Trading simulation that walks a Trader through price windows.

    Each row of `data` is one state; the last value of the row is used as the
    current stock price. Bought prices are queued and sold oldest-first.
    """

    # Factor applied to the realised profit when rewarding a SELL.
    SELL_REWARD_SCALE = 100
    # Amount subtracted from the reward when one action is repeated too often.
    STREAK_PENALTY = 200
    # The penalty applies once a BUY or SELL streak exceeds this length ...
    MAX_CONSECUTIVE_TRADES = 10
    # ... or a HOLD streak exceeds this length.
    MAX_CONSECUTIVE_HOLDS = 20

    def __init__(self,
                 data,
                 state_size=WINDOW_SIZE,
                 episodes=3,
                 sample_batch_size=64):
        """Store the run configuration and create the agent.

        Args:
            data: Sequence of states, indexable by slice and by [row][col].
            state_size: Input size passed to the Trader.
            episodes: Number of passes over `data` performed by `run`.
            sample_batch_size: Batch size passed to Trader.replay.
        """
        self.logs              = []
        self.profits           = 0
        self.data              = data
        self.stocks            = []
        # Defined up front so buy_stocks/sell_stocks never hit a missing
        # attribute; run() overwrites it on every step.
        self.stock_price       = None
        self.episodes          = episodes
        self.state_size        = state_size
        self.sample_batch_size = sample_batch_size

        self.trader = Trader(state_size=self.state_size)

    def reset(self):
        """Clear the accumulated profit and the inventory of bought prices."""
        self.profits = 0
        self.stocks  = []

    def buy_stocks(self):
        """Queue the current price as a bought share; buying earns no reward."""
        self.stocks.append(self.stock_price)
        return 0

    def sell_stocks(self):
        """Sell the oldest bought share at the current price.

        Returns:
            The realised profit (current price minus bought price), or 0 when
            the inventory is empty.
        """
        # If there is no inventory
        if len(self.stocks) == 0:
            return 0

        bought_price = self.stocks.pop(0)
        profit = self.stock_price - bought_price
        self.profits += profit

        return profit

    def run(self):
        """Run all episodes, logging per-step profit and actions.

        For every episode the agent acts once per row of `data`; each
        transition is stored in the agent's memory. After the episode the
        result is printed, the agent replays one batch, the environment is
        reset and the episode log is appended to `self.logs`.

        Raises:
            ValueError: If `data` contains no states.
        """
        n_states = len(self.data)
        if n_states == 0:
            # Without this check the first step fails with an opaque
            # IndexError when reading the price from an empty state.
            raise ValueError('data must contain at least one state')

        for episode in range(self.episodes):

            state      = self.data[0:1]
            reward     = 0
            cons_buys  = 0
            cons_sells = 0
            cons_holds = 0
            logs       = {'profit': [], 'actions': []}

            # index always points at the row that becomes the next state.
            for index in range(1, n_states + 1):
                action = self.trader.get_next_action(state)
                self.stock_price = state[0][-1]

                if action == BUY:
                    reward = self.buy_stocks()
                    cons_buys += 1
                    cons_sells = 0
                    cons_holds = 0
                elif action == SELL:
                    reward = self.sell_stocks() * self.SELL_REWARD_SCALE
                    cons_sells += 1
                    cons_buys = 0
                    cons_holds = 0
                elif action == HOLD:
                    reward = 0
                    cons_holds += 1
                    cons_buys = 0
                    cons_sells = 0

                # Discourage the agent from repeating one action indefinitely.
                if ((cons_buys > self.MAX_CONSECUTIVE_TRADES)
                        or (cons_sells > self.MAX_CONSECUTIVE_TRADES)
                        or (cons_holds > self.MAX_CONSECUTIVE_HOLDS)):
                    reward -= self.STREAK_PENALTY

                # On the last step this slice is empty, marking the end of
                # the data.
                next_state = self.data[index:index+1]

                self.trader.remember(state, next_state, action, reward)
                state = next_state

                logs['profit'].append(self.profits)
                logs['actions'].append(action)

            print(f'Episode: {episode} Profit: {int(self.profits)} Exploration: {self.trader.exploration_rate}')

            self.trader.replay(self.sample_batch_size)
            self.reset()

            self.logs.append(logs)
            
# Train on every price window in X: 200 passes over the data, with one replay
# batch of 32 remembered transitions after each pass.
env = Environment(
    data=X,
    episodes=200,
    sample_batch_size=32)

env.run()

Episode: 0 Profit: 2471 Exploration: 1.0
Episode: 1 Profit: 12926 Exploration: 0.995
Episode: 2 Profit: 3492 Exploration: 0.990025
Episode: 3 Profit: 6159 Exploration: 0.985074875
Episode: 4 Profit: 6206 Exploration: 0.9801495006250001
Episode: 5 Profit: 1803 Exploration: 0.9752487531218751
Episode: 6 Profit: 1823 Exploration: 0.9703725093562657
Episode: 7 Profit: 647 Exploration: 0.9655206468094844
Episode: 8 Profit: 1222 Exploration: 0.960693043575437
Episode: 9 Profit: 2030 Exploration: 0.9558895783575597
Episode: 10 Profit: 551 Exploration: 0.9511101304657719
Episode: 11 Profit: 398 Exploration: 0.946354579813443
Episode: 12 Profit: 463 Exploration: 0.9416228069143757
Episode: 13 Profit: 420 Exploration: 0.9369146928798039
Episode: 14 Profit: 974 Exploration: 0.9322301194154049
Episode: 15 Profit: 572 Exploration: 0.9275689688183278
Episode: 16 Profit: 513 Exploration: 0.9229311239742362
Episode: 17 Profit: 503 Exploration: 0.918316468354365
Episode: 18 Profit: 787 Exploration: 0.9

Episode: 148 Profit: 51 Exploration: 0.47622912292284103
Episode: 149 Profit: 38 Exploration: 0.4738479773082268
Episode: 150 Profit: 55 Exploration: 0.47147873742168567
Episode: 151 Profit: 6 Exploration: 0.46912134373457726
Episode: 152 Profit: 100 Exploration: 0.46677573701590436
Episode: 153 Profit: 32 Exploration: 0.46444185833082485
Episode: 154 Profit: -4 Exploration: 0.46211964903917074
Episode: 155 Profit: 49 Exploration: 0.4598090507939749
Episode: 156 Profit: 6 Exploration: 0.457510005540005
Episode: 157 Profit: 40 Exploration: 0.45522245551230495
Episode: 158 Profit: 14 Exploration: 0.4529463432347434
Episode: 159 Profit: 58 Exploration: 0.4506816115185697
Episode: 160 Profit: -32 Exploration: 0.4484282034609769
Episode: 161 Profit: 71 Exploration: 0.446186062443672
Episode: 162 Profit: -34 Exploration: 0.4439551321314536
Episode: 163 Profit: 16 Exploration: 0.4417353564707963
Episode: 164 Profit: 27 Exploration: 0.43952667968844233
Episode: 165 Profit: 15 Exploration: 0.43

In [134]:
def show_profits(episode=0):
    """Visualise one logged training episode from the global `env`.

    Draws a line chart of running profit and stock price with markers at the
    steps where the agent bought or sold, followed by a pie chart of how often
    each action was taken.

    Args:
        episode: Index into `env.logs` of the episode to display.
    """
    episode_log = env.logs[episode]
    action_codes = np.array(episode_log['actions'])

    # reset_index() exposes the step number as an 'index' column for the x-axis.
    frame = pd.DataFrame({
        'action': action_codes,
        'profit': np.array(episode_log['profit']),
        'stock': df.values[WINDOW_SIZE-1:],
        'bought': action_codes == BUY,
        'sold': action_codes == SELL,
        'held': action_codes == HOLD}).reset_index()
    frame['action_cat'] = frame['action'].replace({0: 'Buy', 1: 'Sell', 2: 'Hold'})

    timeline = go.Figure()

    # Continuous series first: running profit, then the stock price.
    for column, label in (('profit', 'Profits'), ('stock', 'Stock')):
        timeline.add_trace(go.Scatter(x=frame['index'],
                                      y=frame[column],
                                      mode='lines',
                                      name=label))

    # Trade markers are placed on the profit curve at the steps they occurred.
    for flag, label in (('bought', 'Bought'), ('sold', 'Sold')):
        trades = frame[frame[flag]]
        timeline.add_trace(go.Scatter(x=trades['index'],
                                      y=trades['profit'],
                                      mode='markers',
                                      name=label))

    timeline.show()

    action_share = frame['action_cat'].value_counts()
    pie = go.Figure(data=[go.Pie(labels=action_share.index,
                                 values=action_share.values)])
    pie.show()
    
# Plot the episode stored at position 196 of env.logs (near the end of the 200-episode run).
show_profits(196)