In [None]:
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import itertools as it

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

#import argparse
import re
import os
import pickle
import csv



In [None]:
def load_data_file(filename):
    """Read an Excel workbook and return its contents as a NumPy array.

    Parameters
    ----------
    filename
        Path of the workbook; handed unchanged to ``pandas.read_excel``.

    Returns
    -------
    numpy.ndarray
        The ``.values`` of the DataFrame produced by ``pandas.read_excel``.
    """
    return pd.read_excel(filename).values

In [None]:
# Load the price history into a NumPy array.
# Other workbooks that have been used here: GBPAUD_Long_Small.xlsx, GBPAUD_.xlsx, GBPAUD_New_Small
ds = load_data_file('GBPAUD_3M.xlsx')

In [None]:
# Show the first row of the loaded array
ds[0]

In [None]:
# Show the container type returned by load_data_file
type(ds)

In [None]:
# Show the dimensions of the loaded array
ds.shape

In [None]:
class ReplayBuffer:
    """Fixed-capacity circular store of (state, action, reward, next_state, done) rows.

    Inspired by the Lazy Programmer implementation. Once ``max_size`` rows have
    been written, new rows overwrite the oldest ones.
    """

    def __init__(self, state_dim, action_dim, size):
        # action_dim is accepted but not used: a single uint8 value is stored per action.
        self.max_size = size
        self.pointer = 0  # slot the next transition will be written to
        self.size = 0     # number of slots that currently hold data
        self.cur_state_buffer = np.zeros((size, state_dim), dtype=np.float32)
        self.next_state_buffer = np.zeros((size, state_dim), dtype=np.float32)
        self.actions_buffer = np.zeros(size, dtype=np.uint8)
        self.rewards_buffer = np.zeros(size, dtype=np.float32)
        self.done_buffer = np.zeros(size, dtype=np.uint8)

    def save_experience(self, cur_state, action, reward, next_state, done):
        """Write one transition at the current pointer and advance it (wrapping around)."""
        slot = self.pointer
        self.cur_state_buffer[slot] = cur_state
        self.next_state_buffer[slot] = next_state
        self.actions_buffer[slot] = action
        self.rewards_buffer[slot] = reward
        self.done_buffer[slot] = done
        self.pointer = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def random_batch(self, batch_size=500):
        """Return ``batch_size`` stored transitions drawn uniformly with replacement.

        The result is a dict with keys ``s``, ``s2``, ``a``, ``r`` and ``d``
        (current states, next states, actions, rewards, done flags).
        """
        picks = np.random.randint(0, self.size, size=batch_size)
        return {
            's': self.cur_state_buffer[picks],
            's2': self.next_state_buffer[picks],
            'a': self.actions_buffer[picks],
            'r': self.rewards_buffer[picks],
            'd': self.done_buffer[picks],
        }


In [None]:
def create_scaler(env):
    """Fit a ``StandardScaler`` on the states visited by a random policy.

    The environment is reset first so the rollout always starts at the first
    row, whatever state ``env`` was left in by earlier use. Random actions
    drawn from ``env.action_space`` are then played for at most ``env.n_step``
    steps, stopping as soon as the environment reports ``done``.

    Parameters
    ----------
    env
        Environment exposing ``reset()``, ``n_step``, ``action_space`` and a
        ``step(action)`` whose first three return values are the next state,
        the reward and the done flag.

    Returns
    -------
    StandardScaler
        Scaler fitted on the collected states.

    Notes
    -----
    ``env`` is left at the end of the rollout; reset it before using it again.
    """
    # Start from a clean episode: stepping an already-advanced env would
    # sample only the tail of the data, or run past the last row.
    env.reset()

    states = []
    for _ in range(env.n_step):
        action = np.random.choice(env.action_space)
        state, _reward, done, *_ = env.step(action)
        states.append(state)
        if done:
            break

    scl = StandardScaler()
    scl.fit(states)
    return scl

In [None]:
def create_directory(namedirectory):
    """Create ``namedirectory`` (including missing parents) if it is not there yet.

    Calling this for a directory that already exists is a no-op.

    Parameters
    ----------
    namedirectory
        Path of the directory to create.

    Raises
    ------
    FileExistsError
        If the path exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(namedirectory, exist_ok=True)

In [None]:
class DQN_Agent(object):
    """Epsilon-greedy deep Q-learning agent backed by a dense Keras network.

    Parameters
    ----------
    state_size
        Number of features in one state vector (network input width).
    action_size
        Number of discrete actions (network output width).
    """

    def __init__(self, state_size, action_size):
        self.action_size = action_size
        self.state_size = state_size
        self.gamma = 0.95  # discount rate
        self.epsilon = 1  # exploration rate
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.99
        self.learning_rate = 0.001
        self.model = self.DNN()
        self.memory = ReplayBuffer(state_size, action_size, size=20000)

    def DNN(self):
        """Build and compile the Q-network: ten ReLU layers and a linear output per action."""
        model = Sequential()
        # Explicit Input layer instead of the deprecated ``input_dim`` argument
        model.add(Input(shape=(self.state_size,)))
        for units in (64, 64, 64, 32, 32, 32, 24, 24, 24, 24):
            model.add(Dense(units, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # ``lr`` is the deprecated spelling; ``learning_rate`` is the supported argument
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        # summary() prints by itself and returns None, so it is not wrapped in print()
        model.summary()
        return model

    def update_replay_memory(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.save_experience(state, action, reward, next_state, done)

    def act(self, state):
        """Pick an action for ``state`` (a batch holding one state) epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            # random selection (exploration)
            return np.random.choice(self.action_size)
        # verbose=0 keeps Keras from printing a progress bar on every step
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])  # action with the highest predicted value

    def replay(self, batch_size=500):
        """Train the network on one random batch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        After a training step ``epsilon`` is multiplied by ``epsilon_decay``
        while it is still above ``epsilon_min``.
        """
        # check whether there is enough data
        if self.memory.size < batch_size:
            return

        minibatch = self.memory.random_batch(batch_size)
        states = minibatch['s']
        actions = minibatch['a']
        rewards = minibatch['r']
        next_states = minibatch['s2']
        done = minibatch['d']

        # Target: r + gamma * max_a' Q(s', a'); the bootstrap term is dropped on terminal steps
        next_q = self.model.predict(next_states, verbose=0)
        target = rewards + np.amax(next_q, axis=1) * self.gamma * (1 - done)

        # Start from the current predictions so that only the entry of the
        # action actually taken contributes to the loss
        target_f = self.model.predict(states, verbose=0)
        target_f[np.arange(batch_size), actions] = target

        self.model.train_on_batch(states, target_f)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load_model(self, filename):
        """Load network weights from ``filename``."""
        self.model.load_weights(filename)

    def save_model(self, filename):
        """Save network weights to ``filename``."""
        self.model.save_weights(filename)

In [None]:
class TradingEnv:
    """Step-by-step trading environment over a 2-D array of market data.

    Every row of ``dataset`` is one state. Column 0 is used as an exposure
    slot and is overwritten in place by ``step``; column 1 of the *next* row
    is read as the price delta that is multiplied by the exposure to form the
    reward.

    Actions: 0 = go long, 1 = hold, 2 = go short.

    Parameters
    ----------
    dataset
        2-D array; its shape gives ``n_step`` and ``state_dim``.
    initial_investment
        Starting portfolio value, restored on every ``reset``.
    cost_scaler
        Factor applied to ``trx_cost`` in the reward. When left at ``None``
        the value of a module/notebook-level ``cost_scaler`` variable is used
        if one exists, otherwise 1.0.
    """

    def __init__(self, dataset, initial_investment=100000, cost_scaler=None):
        self.n_step, self.state_dim = dataset.shape
        self.dataset = dataset
        self.initial_investment = initial_investment
        self.cost_scaler = cost_scaler
        self.has_expo = 0  # signed number of open positions: >0 long, <0 short, 0 flat
        self.comm = -8.87  # commission is fixed at 8.87 EUR per traded lot
        self.EURAUD = 1.65  # fixed at 1.65
        self.value = initial_investment
        self.tradecount_long = 0
        self.transactioncost_long = 0.0
        self.tradecount_short = 0
        self.transactioncost_short = 0.0
        self.action_space = np.arange(3)
        self.trx_cost = -30.0
        self.action_list = list(map(list, it.product([0, 1, 2], repeat=1)))
        self.session_cost = 0.0

        self.reset()

    def _cost_scale(self):
        """Return the factor applied to the transaction cost in the reward."""
        if self.cost_scaler is not None:
            return self.cost_scaler
        # Backward compatibility: the notebook used to steer this through a
        # global variable; fall back to 1.0 instead of raising NameError.
        return globals().get('cost_scaler', 1.0)

    def reset(self):
        """Start a new episode and return the first state (row 0 of the dataset).

        Exposure, portfolio value and the trade statistics are all restored to
        their initial values.
        """
        self.cur_step = 0
        self.has_expo = 0
        self.value = self.initial_investment
        self.tradecount_long = 0
        self.transactioncost_long = 0.0
        self.tradecount_short = 0
        self.transactioncost_short = 0.0
        return self.dataset[self.cur_step]

    def step(self, action):
        """Apply ``action`` and advance one row.

        Returns
        -------
        tuple
            ``(next_state, reward, done, info, transactioncost_long,
            transactioncost_short, has_expo)`` where ``info`` is
            ``{'cur_val': <portfolio value>}`` and ``done`` becomes true once
            the last row has been reached.

        Raises
        ------
        ValueError
            If ``action`` is not 0, 1 or 2.
        IndexError
            If the episode is already finished.
        """
        if action not in (0, 1, 2):
            raise ValueError(
                f"invalid action {action!r}; expected 0 (long), 1 (hold) or 2 (short)")
        if self.cur_step >= self.n_step - 1:
            raise IndexError("episode is finished; call reset() before stepping again")

        cur_state = self.dataset[self.cur_step]
        cur_state[0] = self.has_expo  # carry the exposure over from the last step
        next_state = self.dataset[self.cur_step + 1]
        scale = self._cost_scale()

        # Simple risk management policy: at most 5 longs or 5 shorts at any
        # time; a request beyond that is turned into a hold.
        if self.has_expo >= 5 and action == 0:
            action = 1
        elif self.has_expo <= -5 and action == 2:
            action = 1

        if action == 0:  # go long
            if self.has_expo >= 0:
                # add one long; reward is the price delta plus the (scaled) trx cost
                self.has_expo += 1
                reward = next_state[1] * self.has_expo + self.trx_cost * scale
                self.transactioncost_long += self.trx_cost  # stats (unscaled cost)
                self.tradecount_long += 1  # stats
            else:
                # an existing short is only closed; no new exposure is opened
                self.has_expo = 0
                reward = 0
        elif action == 2:  # go short
            if self.has_expo <= 0:
                # add one short; reward is the price delta plus the (scaled) trx cost
                self.has_expo += -1
                reward = next_state[1] * self.has_expo + self.trx_cost * scale
                self.transactioncost_short += self.trx_cost  # stats (unscaled cost)
                self.tradecount_short += 1  # stats
            else:
                # an existing long is only closed; no new exposure is opened
                self.has_expo = 0
                reward = 0
        elif self.has_expo == 0:
            # hold cash: no reward
            reward = 0
        else:
            # hold the existing exposure: the price delta is the reward
            reward = next_state[1] * self.has_expo

        self.value += reward
        self.cur_step += 1
        # next_state is the row at the new cur_step; record the exposure in it
        next_state[0] = self.has_expo
        done = self.cur_step == self.n_step - 1
        info = {'cur_val': self.value}
        return (next_state, reward, done, info,
                self.transactioncost_long, self.transactioncost_short, self.has_expo)




In [None]:
def run_episode(agent, env, is_train, trace_path=None):
    """Play one full episode of ``env`` with ``agent`` and return the outcome.

    Relies on the notebook globals ``scaler`` (fitted state scaler),
    ``initial_investment`` and ``batch_size``.

    Parameters
    ----------
    agent
        Agent providing ``act``, ``update_replay_memory`` and ``replay``.
    env
        Environment providing ``reset`` and ``step``.
    is_train
        The string ``'train'`` enables replay-memory updates and training;
        any other value only plays the episode.
    trace_path
        Optional CSV path. When given, one row per step
        ``(action, reward, transactioncost_long, transactioncost_short, exposure)``
        is written to it after the episode.

    Returns
    -------
    tuple
        ``(final value, transactioncost_long, tradecount_long,
        transactioncost_short, tradecount_short)``.
    """
    # note: after transforming, states are already 1xD
    state = scaler.transform([env.reset()])

    # every episode starts from a clean portfolio and clean statistics
    env.initial_investment = initial_investment
    env.value = initial_investment
    env.tradecount_long = 0
    env.transactioncost_long = 0.0
    env.tradecount_short = 0
    env.transactioncost_short = 0.0

    training = is_train == 'train'
    # the per-step trace is only collected when it is going to be written out
    trace = [] if trace_path is not None else None

    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, info, tc_l, tc_s, expo = env.step(action)
        next_state = scaler.transform([next_state])
        if trace is not None:
            trace.append([action, reward, tc_l, tc_s, expo])
        if training:
            agent.update_replay_memory(state, action, reward, next_state, done)
            agent.replay(batch_size)
        state = next_state

    if trace is not None:
        with open(trace_path, 'w', newline='') as f:
            csv.writer(f).writerows(trace)

    return (info['cur_val'], env.transactioncost_long, env.tradecount_long,
            env.transactioncost_short, env.tradecount_short)

# training

In [None]:
# Training configuration and objects shared by the following cells.
# run_episode reads batch_size and initial_investment (and scaler, created below) as globals.
# NOTE(review): with 100 training rows an episode yields 99 transitions, so 5 episodes
# fill the replay buffer with at most 495 entries. That stays below batch_size = 500,
# so DQN_Agent.replay returns early every time and the network is never fitted.
# Confirm whether the 100-row slice is a leftover debug setting.
num_episodes = 5
batch_size = 500
initial_investment = 100000
cost_scaler =1.0
n_timesteps, _ = ds.shape
train_data = ds[:100]  # first 100 rows only
env = TradingEnv(train_data, initial_investment)
state_size = env.state_dim
action_size = len(env.action_space)
agent = DQN_Agent(state_size, action_size)
# create_scaler plays random actions on env to collect the states it is fitted on
scaler = create_scaler(env)
portfolio_value = []

In [None]:
# Show the dimensions of the training slice
train_data.shape

In [None]:
    # start training
    traintest = 'train'
    cost_scaler = 1.0  # to adjust the trx cost, min 0, max 1.0
    initial_epsilon = 1.0
    weights_file = 'dqn.h5'

    # Continue from earlier weights only when a checkpoint exists; a first run
    # has none, and loading unconditionally would fail. Loading once is enough
    # because every episode saves the weights of this same agent.
    if os.path.exists(weights_file):
        agent.load_model(weights_file)

    for e in range(num_episodes):
        if cost_scaler >= 1.0:
            cost_scaler = 1.0
        # each episode starts with a slightly lower exploration rate
        agent.epsilon = initial_epsilon
        agent.epsilon_min = 0.05
        agent.epsilon_decay = 0.999
        t0 = datetime.now()
        val, transactioncost_long, tradecount_long, transactioncost_short, tradecount_short = run_episode(agent, env, traintest)
        dt = datetime.now() - t0
        print(f"train: e: {e}/{num_episodes}, cost:{cost_scaler}, init_eps:{initial_epsilon}, val: {val:.2f}, cost_long: {transactioncost_long:.2f} , cnt_long: {tradecount_long}, cost_short: {transactioncost_short:.2f} , cnt_short: {tradecount_short}, dur: {dt}")
        agent.save_model(weights_file)
        initial_epsilon *= 0.95
        #cost_scaler +=0.05 #only needed when stepping up the trx cost

    # save the model and the scaler after training, but not in test mode
    if traintest == 'train':
        agent.save_model(weights_file)
        with open('scaler.pkl', 'wb') as f:
            pickle.dump(scaler, f)

# test

In [None]:
# Hold-out data for evaluation: every row from index 10000 onwards.
# NOTE(review): this slice is empty when ds has 10000 rows or fewer — confirm the workbook size.
test_data=ds[10000:]

In [None]:
# Evaluate the trained agent with one pass over the hold-out data
initial_investment = 100000
cost_scaler = 1.0

# reuse the scaler that was fitted on the training data
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

env_test = TradingEnv(test_data, initial_investment)
state_size, action_size = env_test.state_dim, len(env_test.action_space)
agent_test = DQN_Agent(state_size, action_size)
agent_test.load_model('dqn.h5')

# epsilon = 0 makes the agent deterministic for testing
# (set agent_test.epsilon = 1.0 instead to evaluate a purely random strategy)
agent_test.epsilon = agent_test.epsilon_min = agent_test.epsilon_decay = 0.0

t0_test = datetime.now()
(val_test,
 transactioncost_long_test,
 tradecount_long_test,
 transactioncost_short_test,
 tradecount_short_test) = run_episode(agent_test, env_test, 'test')
dt_test = datetime.now() - t0_test
print(f"test: val: {val_test:.2f}, costLong: {transactioncost_long_test:.2f} , cntLong: {tradecount_long_test}, costShort: {transactioncost_short_test:.2f} , cntShort: {tradecount_short_test} ,dur: {dt_test}")

