## Reinforcement Learning on StockMarket

This project aim to make trade decisions using RL Agent. 

Weekly and daily candle data is used to train the agent to avoid noise and outliers. Most popular indicators and price levels such as RSI, ATR, VWAP, 52-week High/Low and etc is used as observations along with position data. 

The actions for the agents are Long, Short, and Do Nothing. The action for the observation is the argmax of the NN outputs.

Reward Fuction: Intial rewards for the agent is -500. 300 points for the closing the position with profit and No reward for making opening position and -100 will be for holding the position or not taking position, for closing the position with the loss will be -300 points. 

#### Data Mining and Feature Engineering

In [1]:
import pandas as pd 
import numpy as np 
import pandas_ta as ta 
import datetime as dt 

In [2]:
data = pd.read_csv('../DataSets/BankNifty_weekly.csv', index_col=0, parse_dates=True, dayfirst=True)

In [3]:
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-03 09:15:00,1054.8101,1126.990,1054.8101,1126.990,0
2000-01-10 09:15:00,1088.9700,1088.970,1023.4100,1023.410,0
2000-01-17 09:15:00,1035.9900,1058.910,1021.6900,1058.910,0
2000-01-24 09:15:00,1049.8199,1153.350,1049.8199,1153.350,0
2000-01-31 09:15:00,1148.8900,1186.970,1148.8900,1172.210,0
...,...,...,...,...,...
2024-01-01 09:15:00,48203.4490,48450.000,47481.3520,48159.000,920084100
2024-01-08 09:15:00,48096.6480,48154.500,47010.8010,47709.801,881744948
2024-01-15 09:15:00,47891.0510,48305.398,45430.6990,46058.200,1620015250
2024-01-23 09:15:00,46495.4500,46580.301,44429.0000,44866.148,1158126249


In [4]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-03 09:15:00,1054.8101,1126.99,1054.8101,1126.99,0
2000-01-10 09:15:00,1088.97,1088.97,1023.41,1023.41,0
2000-01-17 09:15:00,1035.99,1058.91,1021.69,1058.91,0
2000-01-24 09:15:00,1049.8199,1153.35,1049.8199,1153.35,0
2000-01-31 09:15:00,1148.89,1186.97,1148.89,1172.21,0


In [5]:
data.isna().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [6]:
data.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,1257.0,1257.0,1257.0,1257.0,1257.0
mean,14651.563968,14961.602598,14326.324328,14670.46651,396847400.0
std,12630.785125,12830.882515,12420.868418,12640.700118,597507200.0
min,749.83002,797.96997,743.70001,743.70001,0.0
25%,4264.5498,4370.5,3990.8999,4274.8501,0.0
50%,10592.65,10830.4,10340.4,10598.25,106932000.0
75%,23636.949,24074.15,23144.301,23670.4,568891800.0
max,48203.449,48636.449,47481.352,48292.25,3805277000.0


In [7]:
data.dtypes

Open      float64
High      float64
Low       float64
Close     float64
Volume      int64
dtype: object

In [8]:
data = data.round(2)

In [9]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-03 09:15:00,1054.81,1126.99,1054.81,1126.99,0
2000-01-10 09:15:00,1088.97,1088.97,1023.41,1023.41,0
2000-01-17 09:15:00,1035.99,1058.91,1021.69,1058.91,0
2000-01-24 09:15:00,1049.82,1153.35,1049.82,1153.35,0
2000-01-31 09:15:00,1148.89,1186.97,1148.89,1172.21,0


In [12]:
def addObservations(data):
    data['200 EMA'] = ta.ema(data.Close, length=200)
    data['RSI'] = ta.rsi(data.Close)
    data['VWAP'] = ta.vwap(data.High, data.Low, data.Close, data.Volume).values
    data['ATR'] = ta.atr(data.High, data.Low, data.Close)
    data['Prev_Open'] = data.Open.shift()
    data['Prev_High'] = data.High.shift()
    data['Prev_Low'] = data.Low.shift()
    data['Prev_Close'] = data.Close.shift()
    data['52w-High'] = data.index.to_series().apply(lambda x: data[x - data.index < dt.timedelta(days=365)]['High'].max())
    data['52w-Low'] = data.index.to_series().apply(lambda x: data[x - data.index < dt.timedelta(days=365)]['High'].min())
    data['Month'] = data.index.month
    data['NextWeekClose'] = data.Close.shift(-1)


In [None]:

addObservations(data)

In [11]:
data = data.drop(data[data['Volume'] == 0].index)

In [12]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,200 EMA,RSI,VWAP,ATR,Prev_Open,Prev_High,Prev_Low,Prev_Close,52w-High,52w-Low,Month,NextWeekClose
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2007-03-26 09:15:00,5522.05,5593.35,5164.05,5308.5,35045000,3810.158431,46.323059,5355.3,391.640668,4900.15,5594.75,4871.45,5518.85,48636.45,3646.65,3,5129.2
2007-04-02 09:15:00,5267.7,5267.7,4946.55,5129.2,46890000,3823.283223,43.281784,5114.483333,389.519906,5522.05,5593.35,5164.05,5308.5,48636.45,3646.65,4,5362.1
2007-04-09 09:15:00,5129.05,5404.1,5129.05,5362.1,41289000,3838.594833,48.052635,5298.416667,381.343484,5267.7,5267.7,4946.55,5129.2,48636.45,3646.65,4,5597.55
2007-04-16 09:15:00,5400.3,5626.85,5382.2,5597.55,51627000,3856.096874,52.410726,5535.533333,373.015378,5129.05,5404.1,5129.05,5362.1,48636.45,3646.65,4,5752.9
2007-04-23 09:15:00,5622.0,5975.0,5511.7,5752.9,72080000,3874.970537,55.087998,5746.533333,379.464279,5400.3,5626.85,5382.2,5597.55,48636.45,3646.65,4,5656.55


In [13]:
data.dropna(axis=0, inplace=True)

In [14]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,200 EMA,RSI,VWAP,ATR,Prev_Open,Prev_High,Prev_Low,Prev_Close,52w-High,52w-Low,Month,NextWeekClose
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2007-03-26 09:15:00,5522.05,5593.35,5164.05,5308.5,35045000,3810.158431,46.323059,5355.3,391.640668,4900.15,5594.75,4871.45,5518.85,48636.45,3646.65,3,5129.2
2007-04-02 09:15:00,5267.7,5267.7,4946.55,5129.2,46890000,3823.283223,43.281784,5114.483333,389.519906,5522.05,5593.35,5164.05,5308.5,48636.45,3646.65,4,5362.1
2007-04-09 09:15:00,5129.05,5404.1,5129.05,5362.1,41289000,3838.594833,48.052635,5298.416667,381.343484,5267.7,5267.7,4946.55,5129.2,48636.45,3646.65,4,5597.55
2007-04-16 09:15:00,5400.3,5626.85,5382.2,5597.55,51627000,3856.096874,52.410726,5535.533333,373.015378,5129.05,5404.1,5129.05,5362.1,48636.45,3646.65,4,5752.9
2007-04-23 09:15:00,5622.0,5975.0,5511.7,5752.9,72080000,3874.970537,55.087998,5746.533333,379.464279,5400.3,5626.85,5382.2,5597.55,48636.45,3646.65,4,5656.55


#### Pre-Processing 

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 

In [16]:
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

In [17]:
X.shape, y.shape

((879, 16), (879,))

In [18]:
X[0]

array([5.52205000e+03, 5.59335000e+03, 5.16405000e+03, 5.30850000e+03,
       3.50450000e+07, 3.81015843e+03, 4.63230594e+01, 5.35530000e+03,
       3.91640668e+02, 4.90015000e+03, 5.59475000e+03, 4.87145000e+03,
       5.51885000e+03, 4.86364500e+04, 3.64665000e+03, 3.00000000e+00])

In [19]:
y[0]

5129.2

In [20]:
scale = StandardScaler()

In [21]:

X = scale.fit(X).transform(X.astype(float))

In [22]:
y = scale.fit(y.reshape(-1, 1)).transform(y.reshape(-1, 1).astype(float))

In [23]:
X.shape, y.shape

((879, 16), (879, 1))

In [24]:
X[0]

array([-1.23924752, -1.2530702 , -1.24835141, -1.25855943, -0.82687604,
       -1.30544743, -0.80662189, -1.25368357, -1.08749248, -1.29152558,
       -1.25144644, -1.27230986, -1.23874701,  0.        , -1.16394082,
       -1.03594742])

In [25]:
y[0]

array([-1.27561738])

In [26]:
# import math
# TrainSize = math.floor(len(X) * 0.80)
# X_train, y_train, X_test, y_test = X[:TrainSize], y[:TrainSize], X[TrainSize:], y[TrainSize:]

In [63]:
# X_train.shape, y_train.shape, X_test.shape, y_test.shape

#### Building Model and Env

In [27]:
import tensorflow as tf 
from keras.models import Sequential, load_model
from keras.layers import (Dense, Conv2D, MaxPooling2D, Flatten, Dropout, Activation)
from keras.optimizers import RMSprop 
from keras.callbacks import TensorBoard 

from tqdm import tqdm
from collections import deque 
import time, random, os

In [65]:
ACTION = [0, 1, 2] # [BUY, SELL, DO_NOTHING]

In [66]:
# REWARD FUNCTION: SMALL PENALTY FOR DOING NOTHING(MAKING SURE THE AGENT MAKE ACTION TO OUT PERFORM "BUY AND HOLD")
#                  EXPONENTIAL PENALTY FOR DRAWDOWN MAKING SURE THE AGENT WON'T HOLD THE STOCK ON THE DOWNSIDE
#                  EXPONENTILA REWARD ON REALISED PROFIT.

In [67]:
# Own Tensorboard class
class ModifiedTensorBoard(TensorBoard):

    # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.step = 1
        # self.writer =  tf.summary.text() # tf.compat.v1.summary.FileWriter(self.log_dir) # tf.summary.FileWriter(self.log_dir)
        self._log_write_dir = self.log_dir

    # Overriding this method to stop creating default log writer
    def set_model(self, model):
        pass

    # Overrided, saves logs with our step number
    # (otherwise every .fit() will start writing from 0th step)
    def on_epoch_end(self, epoch, logs=None):
        self.update_stats(**logs)

    # Overrided
    # We train for one batch only, no need to save anything at epoch end
    def on_batch_end(self, batch, logs=None):
        pass

    # Overrided, so won't close writer
    def on_train_end(self, _):
        pass

    # Custom method for saving own metrics
    # Creates writer, writes custom metrics and closes writer
    def update_stats(self, **stats):
        # self._write_logs(stats, self.step)
        # self._write_keras_model_train_graph() 
        # self.up
        pass


In [37]:
X[0].shape

(16,)

In [38]:
MarketEnv_DefVal = {   "IDLE_PENALTY" : -20,
    "LOSS_PENALTY" : -500,
    "PROFIT_REWARD" : 300,
    "OBSERVATIONS_SPACE_VALUES" : X.shape,
    "ACTION_SPACE_SIZE" : 3, 
    'STATE_ACTION' : 3,
    'POSITION' : 0, # 1 LONG POSITION, -1 SHORT POSITION, 0 NO POSITION
    'PREVIOUS_ENTRY' : np.nan   ,
    "episode_step" : 0}

class MarketEnv:
    IDLE_PENALTY = -20
    LOSS_PENALTY = -500
    PROFIT_REWARD = 300
    OBSERVATIONS_SPACE_VALUES = X.shape
    ACTION_SPACE_SIZE = 3
    STATE_ACTION = 3
    POSITION = 0 # 1 LONG POSITION, -1 SHORT POSITION, 0 NO POSITION
    PREVIOUS_ENTRY = np.nan   

    def reset(self):
        '''
        Reset the Environment and return initial observation.
        '''

        for key, value in MarketEnv_DefVal.items(): self.__setattr__(key, value)

        observation = np.append(X[0], 0)
        return observation 

    def step(self, action):
        '''
        Action taken on the new step
        Returns new observations, reward and done
        '''
        done = False
        if self.episode_step == len(data) - 1:
            done = True
        Obs_Yhat = data.iloc[self.episode_step]
        reward = self.PROFIT_REWARD if (action == 0 and self.POSITION == -1 and Obs_Yhat['Close'] < self.PREVIOUS_ENTRY) or (action == 1 and self.POSITION == 1 and Obs_Yhat['Close'] > self.PREVIOUS_ENTRY) else\
              self.LOSS_PENALTY if (action == 0 and self.POSITION == -1 and Obs_Yhat['Close'] > self.PREVIOUS_ENTRY) or (action == 1 and self.POSITION == 1 and Obs_Yhat['Close'] < self.PREVIOUS_ENTRY) else self.IDLE_PENALTY

        if action in [0, 1]:
            self.PREVIOUS_ENTRY = Obs_Yhat['Close'] if (action == 0 and self.POSITION == -1) or (action == 1 and self.POSITION == 1) else Obs_Yhat['Close'] 
            self.POSITION = 1 if action == 0 else -1 if action == 1 else 0 

        self.episode_step += 1 
        new_observation = np.append(X[self.episode_step] , self.POSITION) if self.episode_step < len(data) else np.append(X[0], 0)

        return new_observation, reward, done

env = MarketEnv()

# For stats
ep_rewards = [-200]

# For more repetitive results
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

# Memory fraction, used mostly when trai8ning multiple agents
#gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=MEMORY_FRACTION)
#backend.set_session(tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)))

# Create models folder 
if not os.path.isdir('models'):
    os.makedirs('models')

In [39]:
LOAD_MODEL = None # DIR of the Model

DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 2_000 # 50_000  # How many last steps to keep for model training
MIN_REPLAY_MEMORY_SIZE = 128 # 1_000  # Minimum number of steps in a memory to start training
MINIBATCH_SIZE = 64 # 64  # How many steps (samples) to use for training
UPDATE_TARGET_EVERY = 5  # Terminal states (end of episodes)
MODEL_NAME = 'DQN_ON_FM'
MIN_REWARD = -200  # For model save
MEMORY_FRACTION = 0.20

# Environment settings
EPISODES = 2_000 #20_000

# Exploration settings
epsilon = 1  # not a constant, going to be decayed
EPSILON_DECAY = 0.99 # 0.99975
MIN_EPSILON = 0.001

AGGREGATE_STATS_EVERY = 50

log_dir = f'logs/{MODEL_NAME}-{int(time.time())}'

class DQNAgent:

    def __init__(self) -> None:
        # MAIN MODEL: GET TRAINED EVERY STEP
        self.model = self.create_model()

        # TARGET MODEL: THIS IS WHAT WE PREDICT AGAINST EVERY STEP
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # self.tensorboard = ModifiedTensorBoard(log_dir=f'logs/{MODEL_NAME}-{int(time.time())}')
        self.tensorboard = TensorBoard(log_dir, histogram_freq=1)

        self.target_update_counter = 0

    def create_model(self):
            
        if LOAD_MODEL is not None:
            print('Loading the model')
            model = load_model(LOAD_MODEL)
            print('Model Loaded Successfully')
        else:
            model = Sequential() 

            model.add(Dense(64, activation='relu', input_shape = (17,)#env.OBSERVATIONS_SPACE_VALUES
                            ))
            model.add(Dropout(0.2))
            model.add(Dense(64, activation='relu'))
            model.add(Dropout(0.2))
            model.add(Dense(env.STATE_ACTION, activation='linear'))

            model.compile(optimizer=RMSprop(), loss='mse', metrics=['accuracy'])

        return model

    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    def get_qs(self, state):
        return self.model.predict(np.array(state), verbose=0)

    def train(self, terminal_state, step):
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return
        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        current_states = np.array([transition[0] for transition in minibatch]) 
        current_qs_list = self.model.predict(current_states, verbose=0)

        new_current_states = np.array([transition[3] for transition in minibatch])
        future_qs_list = self.target_model.predict(new_current_states, verbose=0)

        X = []
        y = []

        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q 
            else:
                new_q = reward 

            current_qs = current_qs_list[index] 
            current_qs[action] = new_q 

            X.append(current_state)
            y.append(current_qs)

        self.model.fit(np.array(X), np.array(y), batch_size=MINIBATCH_SIZE, verbose=0, shuffle=False, callbacks=[self.tensorboard] 
                       # if terminal_state else None
                       )

        # UPDATE TO DETERMINE IF WE WANT TO UPDATE TARGET_MODEL YET
        if  terminal_state:
            self.target_update_counter += 1 

        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter 

#### Training Agent

In [None]:
agent = DQNAgent()

summary_writer = tf.summary.create_file_writer(log_dir)

for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
    # Update tensorboard step every episode
    agent.tensorboard.step = episode

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get initial state
    current_state = env.reset()

    # Reset flag and start iterating until episode ends
    done = False
    while not done:

        # This part stays mostly the same, the change is to query a model for Q values
        if np.random.random() > epsilon:
            # Get action from Q table
            action = np.argmax(agent.get_qs(current_state.reshape(1, -1)))
        else:
            # Get random action
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)

        new_state, reward, done = env.step(action)

        # Transform new continous state to new discrete state and count reward
        episode_reward += reward

        # Every step we update replay memory and train main network
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)
        current_state = new_state
        step += 1

    # Append episode reward to a list and log stats (every given number of episodes)
    ep_rewards.append(episode_reward)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
        # agent.tensorboard(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

        # Save model, but only when min reward is greater or equal a set value
        if average_reward >= MIN_REWARD:
            agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    with summary_writer.as_default(step=episode):
      tf.summary.scalar('Min_Reward', min_reward)
      tf.summary.scalar('Average_Reward', average_reward)
      tf.summary.scalar('Max_Reward', max_reward)

    # Decay epsilon
    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(MIN_EPSILON, epsilon) 

#### Evalute Agent

In [None]:
agent.model.summary()

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs/

In [6]:
from keras.models import load_model

In [None]:
from backtesting import Strategy, Backtest 

In [8]:
agentModelBT = load_model('models/DQN_ON_FM__-200.00max_-20410.00avg_-40620.00min__1707012773.model') 

In [9]:
import yfinance as yf 

dataBT =yf.download('^NSEI', period='5y', interval='1wk')

[*********************100%%**********************]  1 of 1 completed


In [16]:
import pandas_ta as ta 
import datetime as dt 
addObservations(dataBT) 

In [17]:
dataBT.drop(columns=['Adj Close'], inplace=True)

In [18]:
dataBT.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,200 EMA,RSI,VWAP,ATR,Prev_Open,Prev_High,Prev_Low,Prev_Close,52w-High,52w-Low,Month,NextWeekClose
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2019-02-04,10876.75,11118.099609,10814.150391,10943.599609,1501200,,,10958.616536,,,,,,22126.800781,8678.299805,2,10724.400391
2019-02-11,10930.900391,10930.900391,10620.400391,10724.400391,2049200,,,10758.567057,,10876.75,11118.099609,10814.150391,10943.599609,22126.800781,8678.299805,2,10791.650391
2019-02-18,10738.650391,10808.849609,10585.650391,10791.650391,1602500,,,10728.716797,,10930.900391,10930.900391,10620.400391,10724.400391,22126.800781,8678.299805,2,10863.5
2019-02-25,10813.25,10939.700195,10729.299805,10863.5,2134700,,,10844.166667,,10738.650391,10808.849609,10585.650391,10791.650391,22126.800781,8678.299805,2,11035.400391
2019-03-04,10864.849609,11089.049805,10817.0,11035.400391,1393600,,,10980.483398,,10813.25,10939.700195,10729.299805,10863.5,22126.800781,8678.299805,3,11426.849609


In [19]:
dataBT.dropna(inplace=True, axis=0)

In [20]:
dataBT.shape

(61, 17)

In [21]:
X_test = dataBT.iloc[:, :-1].values

In [24]:
from sklearn.preprocessing import StandardScaler 
import numpy as np
scale = StandardScaler()
X_test = scale.fit(X_test).fit_transform(X_test.astype(np.float32))

In [25]:
X_test.shape

(61, 16)

In [None]:
import datetime
import pandas_ta as ta
import pandas as pd
from tqdm import tqdm

from backtesting import Backtest
from backtesting import Strategy
from backtesting.lib import crossover
from backtesting.test import GOOG 

_tqdm = tqdm(len(X_test), ascii=True, unit='Candles')

class RLAgenBT(Strategy):

    # Do as much initial computation as possible
    def init(self):
        self.count = 0
        self.POSITION = 0

    # Step through bars one by one
    # Note that multiple buys are a thing here
    def next(self): 

        if self.count>=X_test.shape[0]: print(self.count, end='\r') ; return

        action = np.argmax(agentModelBT.predict(np.append(X_test[self.count], self.POSITION).reshape(1, -1), verbose =0))

        if action == 0 and self.POSITION in [-1, 0]:
            if self.POSITION == -1:

                self.position.close()

            self.buy()
            self.POSITION = 1
        elif action == 1 and self.POSITION in [1, 0]:
            if self.POSITION == 1:

                self.position.close()
            self.sell()
            self.POSITION = -1


        self.count += 1

        _tqdm.update()

bt = Backtest(dataBT, RLAgenBT, cash=10_000_000, commission=.002)
stats = bt.run()

In [None]:
stats

In [None]:
bt.trades

In [None]:
bt.plot()