# Cryptocurrency Trader Agent
Deep Q-Learning Implementation

In [1]:
from environment.env import Coin
from environment.portfolio import Portfolio
from environment.simulator import Simulator
from environment.simulator import Action

### Neural Networks

In [2]:
# Q Value Function Approximator
# Neural Network Implementation

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras import backend as K
from keras.models import load_model

class QValue_NN:
    
    # init neural network
    def __init__(self, state_size, action_size, units):
        # define input and target shapes
        self._state_size = state_size
        self._action_size = action_size

        self._units = units
        
        # init model
        self._model = self._build_model()
    
    
    # define loss function
    def _huber_loss(self, target, prediction):
        # sqrt(1+error^2)-1
        error = prediction - target
        return K.mean(K.sqrt(1+K.square(error))-1, axis=-1)

    
    # neural net for Deep-Q Learning Model
    def _build_model(self):
        model = Sequential()
        model.add(Dense(self._units, input_dim=self._state_size, activation='relu'))
        model.add(Dense(self._units, activation='relu'))
        model.add(Dense(self._action_size, activation='linear'))
        model.compile(loss=self._huber_loss, optimizer='adam')
        return model

    
    # online training
    def train(self, state, qvalues):
        state_reshape = np.reshape(state, [1, len(state)])
        self._model.fit(state_reshape, qvalues, epochs=1, verbose=0)
    
    
    # get q-values based on state
    def predict(self, state):
        state_reshape = np.reshape(state, [1, len(state)])
        return self._model.predict(state_reshape)
    
    def set_weights(self, model_weights):
        self._model.set_weights(model_weights)
        
    def get_weights(self):
        return self._model.get_weights()
    
    def save(self, path):
        self._model.save_weights(path)
        
    def load(self, path):
        self._model.load_weights(path)
    

Using TensorFlow backend.


### Deep Q Learning Implementation

In [3]:
# Cryptocurrency Trader Q-Learning Implementation

import random
import numpy as np
from collections import deque

class Crypto_Trader:

    def __init__(self, gamma = 0.95, epsilon = 1.0, epsilon_min = 0.01, epsilon_decay = 0.99, num_episodes = 1000,
                num_neutron = 24, num_coins_per_order = 100, init_capital = 1000, coin_name = 'ethereum'):

        self.memory = deque(maxlen=2000)
        self.batch_size = 32
        
        # Reward Discount Rate
        self.gamma = gamma

        # Esiplon (exploration factor)
        self.epsilon = epsilon
        
        self.epsilon_min = epsilon_min
        
        # Reduce exploration overtime
        self.epsilon_decay = epsilon_decay

        # number of episodes for training
        self.num_episodes = num_episodes
        
        # init simulator
        self.simulator = Simulator(num_coins_per_order, init_capital, Coin(coin_name))
        
        # init NN model
        self.model = QValue_NN(self.simulator.get_state_size(), self.simulator.get_action_size(), num_neutron)
        self.target_model = QValue_NN(self.simulator.get_state_size(), self.simulator.get_action_size(), num_neutron)

        
        
    def act(self, state):
        # Choose action by e-greedy
        if np.random.rand() <= self.epsilon:
            #print 'random'
            return self.simulator.get_ran_action()
        
        # Get Q values, choose action by Q values
        act_values = self.model.predict(state)
        return Action(np.argmax(act_values[0]))
    
    
    
    def remember(self, state, action, reward, next_state, isDone):
        self.memory.append((state, action, reward, next_state, isDone))
        
        
    def update_target_model(self):
        # copy weights from model to target_model
        self.target_model.set_weights(self.model.get_weights())
        
        
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, self.batch_size)
        
        for state, action, reward, next_state, isDone in minibatch:
            target = self.model.predict(state)
            if isDone:
                target[0][action.value] = reward
            else:
                a = self.model.predict(next_state)[0]
                t = self.target_model.predict(next_state)[0]
                target[0][action.value] = reward + self.gamma * t[np.argmax(a)]
                ## -0.60 + gamma * -0.50
            self.model.train(state, target)
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    
    
        
    def train(self):
        for i in range(self.num_episodes):
            
            self.simulator.reset()
            state = self.simulator.get_current_state()
            
            while (True):
                
                action = self.act(state)
                #print action
                
                # step to the next state and reward based on action
                next_state, reward, isDone = self.simulator.act_and_step(action)
                #print next_state
                #print reward
                
                self.remember(state, action, reward, next_state, isDone)
                state = next_state
                
                if isDone:
                    self.update_target_model()
                    print("episode: {}/{}, reward: {}, epsilon: {:.2}"
                          .format(i+1, self.num_episodes, reward, self.epsilon))
                    #print self.simulator.get_current_holdings()
                    break
                    
            if len(self.memory) > self.batch_size:
                self.replay(self.batch_size)
                
        self.model.save('model/model.h5')
        
    
    def test(self):
        self.simulator.reset()
        self.model.load('model/model.h5')
        state = self.simulator.get_current_state()
        while (True):
                
            action = self.act(state)
            print action
                
            # step to the next state and reward based on action
            next_state, reward, isDone = self.simulator.act_and_step(action)
                
            self.remember(state, action, reward, next_state, isDone)
            state = next_state
                
            if isDone:
                print("Test run: reward: {}, holdings: {}"
                          .format(reward, self.simulator.get_current_holdings()))
                break
        

In [4]:
trader = Crypto_Trader(num_episodes = 800, epsilon_min = 0.10)

In [5]:
trader.train()

episode: 1/800, reward: 563.010080419, epsilon: 1.0
episode: 2/800, reward: 3342.97976977, epsilon: 0.99
episode: 3/800, reward: 13685.5652989, epsilon: 0.98
episode: 4/800, reward: 1938.14992887, epsilon: 0.97
episode: 5/800, reward: 252.426696115, epsilon: 0.96
episode: 6/800, reward: 1242.8684154, epsilon: 0.95
episode: 7/800, reward: 5365.06705917, epsilon: 0.94
episode: 8/800, reward: 772.739580114, epsilon: 0.93
episode: 9/800, reward: 9810.74553924, epsilon: 0.92
episode: 10/800, reward: 908.919937203, epsilon: 0.91
episode: 11/800, reward: 2411.02064561, epsilon: 0.9
episode: 12/800, reward: 329.862383947, epsilon: 0.9
episode: 13/800, reward: 90.3070156573, epsilon: 0.89
episode: 14/800, reward: 1428.80660447, epsilon: 0.88
episode: 15/800, reward: 1160.3292679, epsilon: 0.87
episode: 16/800, reward: -35.1285739805, epsilon: 0.86
episode: 17/800, reward: 2470.49169484, epsilon: 0.85
episode: 18/800, reward: 7464.16699361, epsilon: 0.84
episode: 19/800, reward: 2621.41405201, e

episode: 153/800, reward: 22.9494505805, epsilon: 0.22
episode: 154/800, reward: 139.843820779, epsilon: 0.21
episode: 155/800, reward: 45.0886893912, epsilon: 0.21
episode: 156/800, reward: 86.7481890643, epsilon: 0.21
episode: 157/800, reward: 1093.4396558, epsilon: 0.21
episode: 158/800, reward: 211.208515052, epsilon: 0.21
episode: 159/800, reward: 5219.86423341, epsilon: 0.2
episode: 160/800, reward: 360.629308696, epsilon: 0.2
episode: 161/800, reward: 681.482966889, epsilon: 0.2
episode: 162/800, reward: 1246.54366712, epsilon: 0.2
episode: 163/800, reward: 1487.39485134, epsilon: 0.2
episode: 164/800, reward: 3818.63739213, epsilon: 0.19
episode: 165/800, reward: 338.159935349, epsilon: 0.19
episode: 166/800, reward: 2793.0273675, epsilon: 0.19
episode: 167/800, reward: 1108.34145994, epsilon: 0.19
episode: 168/800, reward: 381.58373246, epsilon: 0.19
episode: 169/800, reward: 764.071132725, epsilon: 0.18
episode: 170/800, reward: 1141.60109568, epsilon: 0.18
episode: 171/800, 

episode: 302/800, reward: 205.743477869, epsilon: 0.099
episode: 303/800, reward: 4983.44616587, epsilon: 0.099
episode: 304/800, reward: 489.375489205, epsilon: 0.099
episode: 305/800, reward: 15438.5195541, epsilon: 0.099
episode: 306/800, reward: 704.396885369, epsilon: 0.099
episode: 307/800, reward: 799.691680053, epsilon: 0.099
episode: 308/800, reward: 1581.54028018, epsilon: 0.099
episode: 309/800, reward: 17058.3951216, epsilon: 0.099
episode: 310/800, reward: 1295.1550628, epsilon: 0.099
episode: 311/800, reward: 19052.7866505, epsilon: 0.099
episode: 312/800, reward: 127.958867232, epsilon: 0.099
episode: 313/800, reward: 289.651447715, epsilon: 0.099
episode: 314/800, reward: 181.424364027, epsilon: 0.099
episode: 315/800, reward: 327.136605391, epsilon: 0.099
episode: 316/800, reward: 620.182966961, epsilon: 0.099
episode: 317/800, reward: 2704.83962424, epsilon: 0.099
episode: 318/800, reward: 7569.88915487, epsilon: 0.099
episode: 319/800, reward: 20557.1588602, epsilon:

episode: 449/800, reward: 554.752225531, epsilon: 0.099
episode: 450/800, reward: 3493.32662514, epsilon: 0.099
episode: 451/800, reward: 1583.01585789, epsilon: 0.099
episode: 452/800, reward: 8824.2724078, epsilon: 0.099
episode: 453/800, reward: 7546.17761161, epsilon: 0.099
episode: 454/800, reward: 455.084884608, epsilon: 0.099
episode: 455/800, reward: 20673.7435424, epsilon: 0.099
episode: 456/800, reward: 7072.94021515, epsilon: 0.099
episode: 457/800, reward: 55.7884062891, epsilon: 0.099
episode: 458/800, reward: 4787.35589112, epsilon: 0.099
episode: 459/800, reward: 11317.8054618, epsilon: 0.099
episode: 460/800, reward: 348.735561848, epsilon: 0.099
episode: 461/800, reward: 79.132689865, epsilon: 0.099
episode: 462/800, reward: 141.352587852, epsilon: 0.099
episode: 463/800, reward: 553.134618076, epsilon: 0.099
episode: 464/800, reward: 4379.00447724, epsilon: 0.099
episode: 465/800, reward: 3286.56399264, epsilon: 0.099
episode: 466/800, reward: 4952.84360408, epsilon: 

episode: 596/800, reward: 6080.34313354, epsilon: 0.099
episode: 597/800, reward: 6180.24110402, epsilon: 0.099
episode: 598/800, reward: 7497.68584745, epsilon: 0.099
episode: 599/800, reward: 1904.51688066, epsilon: 0.099
episode: 600/800, reward: 191.10248639, epsilon: 0.099
episode: 601/800, reward: 339.373311668, epsilon: 0.099
episode: 602/800, reward: 2054.62883874, epsilon: 0.099
episode: 603/800, reward: 196.654182555, epsilon: 0.099
episode: 604/800, reward: 217.94173611, epsilon: 0.099
episode: 605/800, reward: 894.063210776, epsilon: 0.099
episode: 606/800, reward: 7404.04107849, epsilon: 0.099
episode: 607/800, reward: 755.874526778, epsilon: 0.099
episode: 608/800, reward: 423.40188777, epsilon: 0.099
episode: 609/800, reward: 8727.28245242, epsilon: 0.099
episode: 610/800, reward: 640.912758521, epsilon: 0.099
episode: 611/800, reward: 10079.7742228, epsilon: 0.099
episode: 612/800, reward: 19981.2148813, epsilon: 0.099
episode: 613/800, reward: 1066.62632625, epsilon: 0

episode: 743/800, reward: 1344.26502387, epsilon: 0.099
episode: 744/800, reward: 1522.16733127, epsilon: 0.099
episode: 745/800, reward: 1114.89163127, epsilon: 0.099
episode: 746/800, reward: 1500.57818031, epsilon: 0.099
episode: 747/800, reward: 1691.11329082, epsilon: 0.099
episode: 748/800, reward: 1626.71691709, epsilon: 0.099
episode: 749/800, reward: 19078.1883644, epsilon: 0.099
episode: 750/800, reward: 18749.3490091, epsilon: 0.099
episode: 751/800, reward: 20901.4547759, epsilon: 0.099
episode: 752/800, reward: 23627.3043575, epsilon: 0.099
episode: 753/800, reward: 20388.5136054, epsilon: 0.099
episode: 754/800, reward: 19840.4693758, epsilon: 0.099
episode: 755/800, reward: 1400.80037524, epsilon: 0.099
episode: 756/800, reward: 16023.6139054, epsilon: 0.099
episode: 757/800, reward: 416.454416673, epsilon: 0.099
episode: 758/800, reward: 20076.4218349, epsilon: 0.099
episode: 759/800, reward: 761.257316735, epsilon: 0.099
episode: 760/800, reward: 353.678267613, epsilon

IOError: Unable to create file (Unable to open file: name = 'model/model.h5', errno = 2, error message = 'no such file or directory', flags = 13, o_flags = 602)

We can see that the reward become quite stable after 600 episodes 

In [6]:
trader.test()

IOError: Unable to open file (Unable to open file: name = 'model/model.h5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

Seem the model leans to keep buying

## Benchmarking Crypto_Trader

In [11]:
print "epsilon is now %.4f" % trader.epsilon

epsilon is now 0.0000


In [21]:
def run_crypto_trader(trader, num_coins_per_order, init_capital, coin_name):
    simulator = Simulator(num_coins_per_order, init_capital, Coin(coin_name))
    state = simulator.get_current_state()

    i = 0
    while True:
        action = trader.act(state)
                
        next_state, reward, isDone = simulator.act_and_step(action)
        print("time: {}, action: {}, reward: {}, state: {}, next_state: {}"
              .format(i+1, action, reward, str(state), str(next_state)))
        state = next_state
        i += 1
        
        if isDone:
            break
    
    return simulator

In [22]:
simulator_crpyto = run_crypto_trader(trader, 100, 1000, "ethereum")
print simulator_crpyto.get_current_holdings()

time: 1, action: Action.BUY, reward: 0.0, state: [1000.0, 0.0, 0.0, 0, 0, 0], next_state: [1000.0, 0.0, 0.0, 0, 0, 0]
time: 2, action: Action.BUY, reward: -0.4, state: [1000.0, 0.0, 0.0, 0, 0, 0], next_state: [996.0, -0.002, 0.002, -15.874507866387544, 0, 0]
time: 3, action: Action.BUY, reward: -42.07728, state: [996.0, -0.002, 0.002, -15.874507866387544, 0, 0], next_state: [579.22720000000004, -0.14081552878179385, 0.19632159516620867, -11.386303261562263, 0, 0]
time: 4, action: Action.BUY, reward: -41.84169, state: [579.22720000000004, -0.14081552878179385, 0.19632159516620867, -11.386303261562263, 0, 0], next_state: [581.58309999999994, -0.10459481761146303, 0.18122485011583575, -9.1620506453460848, 0, 0]
time: 5, action: Action.BUY, reward: -42.07777, state: [581.58309999999994, -0.10459481761146303, 0.18122485011583575, -9.1620506453460848, 0, 0], next_state: [579.2222999999999, -0.084487707115848826, 0.16700640888761442, -8.0308341467670203, 0, 0]
time: 6, action: Action.BUY, rew