# Stock trading with reinforcement learning
This notebook demonstrates the use of reinforcement learning for developing a stock trading strategy. It leverages the FinRL library and Stable Baselines3 for training and evaluating a PPO agent.

The notebook is structured as follows:

1. **Data Acquisition and Preprocessing**: Fetches historical stock data using yfinance, calculates technical indicators, and prepares the data for the trading environment.
2. **Trading Environment**: Defines a custom gym environment (StockTradingEnv) that simulates stock trading with discrete actions (buy, hold, sell).
3. **Model Training**: Trains a PPO agent on the prepared data within the trading environment.
4. **Trading and Backtesting**: Evaluates the trained agent's performance by simulating trades on a separate dataset and visualizes the results with a trading chart.

# Installation and imports

In [None]:
# install all necessary packages
!pip install git+https://github.com/AI4Finance-LLC/FinRL-Library.git

In [None]:
# import all necessary packages
import pandas as pd
import yfinance as yf
from stockstats import StockDataFrame as sdf
from datetime import datetime, timedelta
import gym
import matplotlib.pyplot as plt
import numpy as np
from finrl.drl_agents.stablebaselines3.models import DRLAgent
from stable_baselines3.common.vec_env import DummyVecEnv

# Defined functions

In [None]:
def fetch_data(period, interval, tic):
    """Download price bars for one ticker and normalise them for this notebook.

    Args:
        period: Look-back window passed straight to ``yf.download`` (e.g. "60d").
        interval: Bar size passed straight to ``yf.download`` (e.g. "5m").
        tic: Ticker symbol; also stored in the ``tic`` column of the result.

    Returns:
        A DataFrame sorted by ``date`` with a fresh 0..n-1 index and the columns
        ``date, open, high, low, close, volume, tic``. ``date`` is a string in
        "%Y-%m-%d %H:%M:%S" format and ``close`` holds the adjusted close.
    """
    # download and save the data in a pandas dataframe
    data_df = yf.download(tic, period=period, interval=interval)
    data_df = data_df.reset_index()
    data_df["tic"] = tic
    # convert to standardized names; the assignment is positional, so it relies
    # on the download returning exactly: timestamp, O, H, L, C, adj. close, volume
    data_df.columns = [
        "date",
        "open",
        "high",
        "low",
        "close",
        "adjcp",
        "volume",
        "tic"
    ]
    # trade on the adjusted close, then discard the helper column
    data_df["close"] = data_df["adjcp"]
    # keyword form: the positional `axis` argument of drop() was removed in pandas 2.0
    data_df = data_df.drop(columns=["adjcp"])
    # convert date to standard string format (sortable as plain text)
    data_df["date"] = data_df["date"].dt.strftime("%Y-%m-%d %H:%M:%S")
    # drop missing data
    data_df = data_df.dropna().reset_index(drop=True)
    print("shape", data_df.shape)

    return data_df.sort_values(by=["date"]).reset_index(drop=True)


In [None]:
def add_technical_indicators(data, tech_indicator_list):
    """Append one column per technical indicator, computed with stockstats.

    Args:
        data: DataFrame with a ``date`` column plus the price columns that the
            requested indicators need. It is not modified.
        tech_indicator_list: Indicator names understood by stockstats; each
            name becomes a new column.

    Returns:
        A copy of ``data`` sorted by ``date`` with the indicator columns added.
    """
    # calculate technical indicators using stockstats package
    df = data.copy().sort_values(by=["date"])
    stock = sdf.retype(df.copy())

    for indicator in tech_indicator_list:
        # Build a plain two-column frame from raw values so the merge below does
        # not depend on whatever index `stock` carries. (DataFrame.append, used
        # here previously, was removed in pandas 2.0.)
        indicator_df = pd.DataFrame(
            {
                "date": df["date"].to_list(),
                indicator: stock[indicator].to_numpy(),
            }
        )
        df = df.merge(indicator_df, on=["date"], how="left")

    return df.sort_values(by=["date"])

In [None]:
def add_changes(data, len):
    """Return a copy of ``data`` with relative price-change columns added.

    For every lag ``i`` from 1 to ``len`` a column ``change_i`` is created that
    holds ``close.pct_change(i) + 1``, i.e. the ratio of the current close to
    the close ``i`` rows earlier. The first ``i`` rows of ``change_i`` are NaN.
    The input frame is left untouched.
    """
    close_prices = data.close
    ratio_columns = {
        f"change_{lag}": close_prices.pct_change(lag) + 1
        for lag in range(1, len + 1)
    }
    return data.copy().assign(**ratio_columns)

In [None]:
def generate_trade_chart(data, ticks=24*12, name="chart_0.png", mult=12, stock="ETH-USD", scalar=1000): # default ticks = 24*12 bars (one day of 5-minute bars)
    """Plot the agent's account value against buy-and-hold and save the figure.

    Args:
        data: DataFrame with the columns ``date``, ``buy&hold``,
            ``account_value`` and ``actions``. It is copied, not modified.
        ticks: Right end of the x-range used for the grid lines, the x tick
            positions and the x limits.
        name: Path the figure is saved to.
        mult: Spacing between x tick labels (every ``mult``-th date is labelled).
        stock: Label used in the title.
        scalar: Spacing of the y grid; the y limits are rounded to multiples of it.

    Side effects: prints the upper and lower y limits, writes the image to
    ``name`` and shows the figure.

    NOTE(review): the x positions (0..ticks) look like they assume ``date``
    holds strings that matplotlib places at integer positions 0..n-1, and the
    slicing below assumes the "%Y-%m-%d %H:%M:%S" format — confirm.
    """

    df = data.copy()

    # define the upper limit, lower limit, interval of Y axis and colors
    # upper limit: largest plotted value floored to a multiple of `scalar`, plus one step
    y_UL = int(max(df["buy&hold"].max(), df["account_value"].max())//scalar*scalar+scalar)
    print(y_UL)
    # lower limit: smallest plotted value floored to a multiple of `scalar`
    y_LL = int(min(df["account_value"].min(), df["buy&hold"].min())//scalar*scalar)
    print(y_LL)
    y_interval = scalar
    mycolors = ['#f59e3b','#99f8ff']

    # draw plot and annotate
    fig, ax = plt.subplots(1,1, figsize=(16,9))

    # one line per series, coloured in the order of `mycolors`
    columns = ["buy&hold", "account_value"]
    for i, column in enumerate(columns):
        plt.plot(df.date.values, df[column].values,
                 lw=3, color=mycolors[i], label=str(column))

    # draw horizontal lines
    for y in range(y_LL, y_UL, y_interval):
        plt.hlines(y, xmin=0, xmax=ticks,
                   colors='#fff99f', alpha=0.8, linestyles="--", lw=0.7)

    # draw vertical lines (buys and sells)
    # positive action values are drawn as buys, negative ones as sells
    buy_times = data.loc[data["actions"] > 0]["date"]
    sell_times = data.loc[data["actions"] < 0]["date"]
    for xc in buy_times:
        plt.axvline(x=xc, color="#2bad4a")

    for xc in sell_times:
        plt.axvline(x=xc, color="#f74343")

    # decorations
    plt.tick_params(axis="both", which="both", bottom=False, top=False,
                    labelbottom=True, left=False, right=False, labelleft=True)

    # lighten borders
    plt.gca().spines["bottom"].set_edgecolor('#fff99f')
    plt.gca().spines["left"].set_edgecolor('#fff99f')

    # title
    # every `mult`-th date; characters [-8:-3] are the label, and the first
    # date without its last 9 characters is used as the day in the title
    x_axis = df.date.values[::mult]
    x_hour = [w[-8:-3] for w in x_axis]
    day = x_axis[0][:-9]
    plt.title(f"{stock} : {day}", fontsize=24, color='#fff99f')

    # tick annotations
    plt.yticks(range(y_LL, y_UL, y_interval),
               [str(y) for y in range(y_LL, y_UL, y_interval)], fontsize=18, color='#fff99f')
    # NOTE(review): positions come from `ticks`, labels from len(data); they are
    # built independently and presumably must have the same length — confirm
    # for inputs whose length is far from `ticks`.
    plt.xticks(range(0, ticks, mult), x_hour, rotation=90,
               horizontalalignment='left', fontsize=18, color='#fff99f')
    plt.ylim(y_LL, y_UL)
    plt.xlim(-1, ticks+1)

    # legend
    leg = plt.legend(fontsize=16, facecolor='#001133')
    for text in leg.get_texts():
        text.set_color('#fff99f')

    # background
    fig.patch.set_facecolor('#001133')
    ax.set_facecolor('#001133')

    # save with the dark figure background, then display
    plt.savefig(name, facecolor=fig.get_facecolor(), edgecolor='none')
    plt.show()

# Data download and preprocessing

In [None]:
# Pull the ETH-USD history: a "60d" period at a "5m" bar interval.
data_df = fetch_data(
    period="60d",
    interval="5m",
    tic="ETH-USD",
)

[*********************100%***********************]  1 of 1 completed
shape (17036, 7)


In [None]:
# Preview the first rows of the normalised download (date, OHLC, volume, tic).
data_df.head()

Unnamed: 0,date,open,high,low,close,volume,tic
0,2021-10-18 00:00:00,3847.72998,3858.29541,3847.72998,3854.672852,0,ETH-USD
1,2021-10-18 00:05:00,3852.407471,3852.647705,3849.469482,3852.647705,0,ETH-USD
2,2021-10-18 00:10:00,3850.506592,3850.506592,3847.629395,3847.629395,0,ETH-USD
3,2021-10-18 00:15:00,3849.553955,3852.768311,3847.973877,3847.973877,460800,ETH-USD
4,2021-10-18 00:20:00,3848.338867,3848.840332,3846.90625,3846.90625,0,ETH-USD


In [None]:
# Feature option A: technical indicators computed by stockstats.
# Run EITHER this cell OR the percentage-change cell below. When both are run
# in order, the later cell overwrites `df` and `state_features`.
tech_indicator_list = ["macd", "boll", "cr", "kdjk", "cci", "dma", "trix", "vr"]
df = add_technical_indicators(data_df, tech_indicator_list)
state_features = tech_indicator_list

In [None]:
# Feature option B: relative price changes over the last `changes_size` bars.
# Overwrites `df` and `state_features` from the indicator cell above.
# NOTE(review): the data is fetched at a 5-minute interval (12 bars per hour),
# so `lookback_hours*6` yields 12 bars = 1 hour, not 2 — confirm whether the
# multiplier or the variable name reflects the intent.
lookback_hours = 2
changes_size = lookback_hours*6
df = add_changes(data_df, changes_size)
state_features = [f"change_{i+1}" for i in range(changes_size)]

In [None]:
# Keep only `date`, `close` and the state features; everything else is unused.
df = df.drop(columns=["open", "high", "low", "volume", "tic"])

In [None]:
# Preview the feature frame. With the percentage-change features, the first
# rows of `change_i` are NaN because pct_change(i) needs i earlier bars.
df.head()

Unnamed: 0,date,close,change_1,change_2,change_3,change_4,change_5,change_6,change_7,change_8,change_9,change_10,change_11,change_12
0,2021-10-21 00:00:00,4154.615723,,,,,,,,,,,,
1,2021-10-21 00:05:00,4146.311035,0.998001,,,,,,,,,,,
2,2021-10-21 00:10:00,4144.290039,0.999513,0.997515,,,,,,,,,,
3,2021-10-21 00:15:00,4156.082031,1.002845,1.002357,1.000353,,,,,,,,,
4,2021-10-21 00:20:00,4187.894531,1.007654,1.010522,1.010029,1.00801,,,,,,,,


In [None]:
# Anchor every split boundary on the timestamp of the first available bar.
start_day = datetime.strptime(df['date'].iloc[0], "%Y-%m-%d %H:%M:%S")

day = timedelta(days=1)

# Layout (in days from the first bar):
#   init  : day 1 (the full first day)
#   train : days 2..58
#   trade : day 59 (day 60 is cut off at the current hour)
# Boundaries are turned back into strings so they can be compared directly
# with the string `date` column.
start_init = start_day.strftime("%Y-%m-%d %H:%M:%S")
start_train = (start_day + 1*day).strftime("%Y-%m-%d %H:%M:%S")
start_trade = (start_day + 58*day).strftime("%Y-%m-%d %H:%M:%S")
end_trade = (start_day + 58*day + day).strftime("%Y-%m-%d %H:%M:%S")

print(start_init, start_train, start_trade, end_trade, sep="\n")

2021-10-21 00:00:00
2021-10-22 00:00:00
2021-12-18 00:00:00
2021-12-19 00:00:00


In [None]:
# Half-open windows [start, end): a bar that sits exactly on a boundary belongs
# to the later split only, so train and trade never share a row.
# The `date` strings are in "%Y-%m-%d %H:%M:%S" format, which orders correctly as text.
init = df.loc[(df['date'] >= start_init) & (df['date'] < start_train)].reset_index(drop=True)
train = df.loc[(df['date'] >= start_train) & (df['date'] < start_trade)].reset_index(drop=True)
trade = df.loc[(df['date'] >= start_trade) & (df['date'] < end_trade)].reset_index(drop=True)

In [None]:
# Preview the training split; it should start at `start_train`.
train.head()

Unnamed: 0,date,close,change_1,change_2,change_3,change_4,change_5,change_6,change_7,change_8,change_9,change_10,change_11,change_12
0,2021-10-22 00:00:00,4084.179688,1.003624,1.003918,1.003096,1.002749,1.004391,1.004577,1.005763,1.002015,0.999164,1.003224,1.003705,1.003009
1,2021-10-22 00:05:00,4098.96875,1.003621,1.007258,1.007553,1.006728,1.00638,1.008028,1.008215,1.009405,1.005643,1.002782,1.006857,1.00734
2,2021-10-22 00:10:00,4070.324463,0.993012,0.996608,1.00022,1.000512,0.999693,0.999348,1.000984,1.001169,1.002351,0.998615,0.995775,0.999821
3,2021-10-22 00:15:00,4069.398682,0.999773,0.992786,0.996381,0.999992,1.000285,0.999466,0.99912,1.000756,1.000941,1.002123,0.998388,0.995548
4,2021-10-22 00:20:00,4086.309082,1.004156,1.003927,0.996911,1.000521,1.004148,1.004441,1.003619,1.003272,1.004914,1.005101,1.006287,1.002537


# Trading environment (env2)


In [None]:
class StockTradingEnv(gym.Env):
    """A single-asset stock trading environment for OpenAI gym.

    The observation is a flat float vector laid out as
    ``[cash balance, close price, shares held, *state_features]``.

    Args:
        df: Price/feature frame indexed 0..n-1 (rows are read with
            ``df.loc[moment]``) with the columns ``date``, ``close`` and every
            name in ``state_features``.
        initial_amount: Starting cash balance.
        trade_cost_pct: Fee charged per trade, as a fraction of traded value.
        state_features: Column names appended to the observation.
        discrete: ``True`` for the three actions 0/1/2 (sell/hold/buy, mapped
            to -1/0/+1); ``False`` for one continuous action in [-1, 1] giving
            the fraction of the position (sell) or of the cash (buy) to trade.
        own_reward: Enable the look-ahead reward shaping in ``step``.
        moment: Row index the environment starts from when constructed.
        initial: ``True`` to start from ``initial_amount`` with no shares;
            ``False`` to continue from ``previous_state``.
        previous_state: State vector to continue from when ``initial`` is
            ``False`` (index 0 = cash, index 2 = shares held).
        model_name: When non-empty, per-episode plots and CSVs are written to
            ``results/`` at the end of every episode.
        make_plots: Stored on the instance; not read anywhere in this class.
        iteration: Starting value of the counter used in result file names.
    """

    def __init__(self,
                df,
                initial_amount,
                trade_cost_pct,
                state_features,
                discrete,
                own_reward = False,
                moment = 0,
                initial = True,
                previous_state = None,
                model_name = '',
                make_plots = True,
                iteration = 0):
        self.moment = moment
        self.df = df
        self.initial_amount = initial_amount
        self.trade_cost_pct = trade_cost_pct
        self.state_features = state_features
        self.own_reward = own_reward
        self.discrete = discrete
        if (self.discrete):
            self.action_space = gym.spaces.Discrete(3)
        else:
            self.action_space = gym.spaces.Box(low = -1, high = 1,shape = (1,))
        # 3 = cash balance + close price + shares held
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape = (len(state_features)+3,))
        self.data = self.df.loc[self.moment,:]
        self.terminal = False
        self.make_plots = make_plots
        self.initial = initial
        # a fresh list per instance instead of one shared mutable default
        self.previous_state = [] if previous_state is None else previous_state
        self.model_name = model_name
        self.iteration = iteration

        self.log = ''
        # initialize state
        self.state = self._initiate_state()

        # initialize reward
        self.reward = 0
        self.cost = 0
        self.trades = 0
        self.episode = 0

        # memorize balance change
        self.asset_memory = [self.initial_amount]
        self.rewards_memory = []
        self.actions_memory = []
        self.date_memory=[self._get_date()]
        # self.reset()
        self._seed()


    def _sell(self, actions):
        """Sell the fraction ``abs(actions)`` of the held shares; return shares sold."""
        if self.state[2] > 0: # sell only if owned shares > 0
            sell_num_shares = self.state[2]*abs(actions)
            cost = sell_num_shares * self.state[1] * self.trade_cost_pct
            self.state[2] -= sell_num_shares
            self.state[0] += (sell_num_shares * self.state[1] - cost)

            self.cost += cost
            self.trades += 1
        else:
            sell_num_shares = 0

        return sell_num_shares


    def _buy(self, actions):
        """Spend the fraction ``actions`` of the cash on shares; return shares bought.

        NOTE(review): the fee is charged on top of the amount spent, so with
        ``actions == 1`` the balance ends at minus the fee. Left unchanged to
        keep results comparable — confirm whether an overdraft is intended.
        """
        if self.state[0] > 0: # buy only if account value > 0
            buy_num_shares = (self.state[0] / self.state[1])*actions
            cost = buy_num_shares * self.state[1] * self.trade_cost_pct
            self.state[2] += buy_num_shares
            self.state[0] -= (buy_num_shares * self.state[1] + cost)

            self.cost += cost
            self.trades += 1
        else:
            buy_num_shares = 0

        return buy_num_shares


    def _save_episode_results(self):
        """Write the episode's asset curve, actions and rewards into ``results/``."""
        import os

        # create the output folder on demand so a missing directory cannot
        # crash the run at the very end of an episode
        os.makedirs('results', exist_ok=True)

        plt.plot(self.asset_memory,'r')
        plt.savefig(f'results/asset_{self.model_name}_{self.episode}.png')
        plt.close()

        # built from dicts so an episode without any step still yields named columns
        df_total_value = pd.DataFrame({'account_value': self.asset_memory,
                                       'date': self.date_memory})
        df_rewards = pd.DataFrame({'account_rewards': self.rewards_memory,
                                   'date': self.date_memory[:-1]})
        df_actions = self.save_action_memory()
        df_actions.to_csv('results/2actions_{}_{}.csv'.format(self.model_name, self.iteration))
        df_total_value.to_csv('results/2account_value_{}_{}.csv'.format(self.model_name, self.iteration))
        df_rewards.to_csv('results/2account_rewards_{}_{}.csv'.format(self.model_name, self.iteration))
        plt.plot(self.asset_memory,'r')
        plt.savefig('results/2account_value_{}_{}.png'.format(self.model_name, self.iteration))
        plt.close()


    def step(self, actions):
        """Apply one action and advance one row.

        Returns:
            ``(state, reward, terminal, info)``. On the terminal call no trade
            is executed: the episode summary is printed (and saved when
            ``model_name`` is set) and the previous state and reward are returned.
        """
        self.terminal = self.moment >= len(self.df.index.unique())-1
        if self.terminal:
            end_total_asset = self.state[0] + self.state[1] * self.state[2] # last timestep (balance + price of share * num of shares)
            print(f"episode: {self.episode}, iteration: {self.iteration}")
            print(f"begin_total_asset: {self.asset_memory[0]}")
            print(f"end_total_asset: {end_total_asset}")
            print(f"total_cost: {self.cost}")
            print(f"total_trades: {self.trades}")
            print("===")

            if (self.model_name!=''): # results of training for analyzing
                self._save_episode_results()

            return self.state, self.reward, self.terminal, {}

        begin_owning = self.state[2]
        begin_price = self.state[1]
        begin_total_asset = self.state[0] + self.state[1] * self.state[2]

        if self.discrete:
            action = actions-1 # discrete space is 0,1,2, make it -1,0,1
        else:
            action = actions[0] # reduce shape

        action_val = 0 # final decision: signed number of shares traded
        if (action < 0): # sell
            action_val = -self._sell(action)

        elif (action > 0): # buy
            action_val = self._buy(action)

        self.actions_memory.append(action_val)
        self.moment += 1
        self.data = self.df.loc[self.moment,:]
        self.state = self._update_state()

        end_total_asset = self.state[0] + self.state[1] * self.state[2] # balance after state transition
        self.asset_memory.append(end_total_asset)
        self.date_memory.append(self._get_date())
        self.reward = end_total_asset - begin_total_asset

        if self.own_reward:
            # check if could take action better in future: compare the price
            # acted on with the mean close over the following rows
            tmp_df = self.df.loc[self.moment:self.moment+12*12,:] # 12 hours span
            diff_mean = tmp_df["close"].mean() - begin_price
            tax = action * begin_price * self.trade_cost_pct # trade cost

            if (action == 0 and begin_owning == 0): # not buy
                self.reward -= diff_mean
            if (action == 0 and begin_owning > 0): # not sell
                self.reward += diff_mean
            if (action < 0): # sell
                self.reward -= (diff_mean + 10 * tax)
            if (action > 0): # buy
                self.reward += (diff_mean - 10 * tax)

        # the memory keeps the unscaled reward; the agent receives the scaled one
        self.rewards_memory.append(self.reward)
        self.reward = self.reward * 1e-4 # scaling factor good for training

        return self.state, self.reward, self.terminal, {}


    def reset(self):
        """Rewind to the first row, clear the episode memories and return the state."""
        # Rewind BEFORE building the state; otherwise the first observation of
        # a new episode would carry the price and features of the last row.
        self.moment = 0
        self.data = self.df.loc[self.moment,:]
        self.state = self._initiate_state()
        if self.initial:
            self.asset_memory = [self.initial_amount]
        else:
            previous_total_asset = self.previous_state[0] + \
                self.state[1] * self._previous_shares()
            self.asset_memory = [previous_total_asset]

        self.cost = 0
        self.trades = 0
        self.terminal = False
        self.iteration += 1
        self.rewards_memory = []
        self.actions_memory = []
        self.date_memory = [self._get_date()]
        self.episode += 1

        return self.state


    def render(self, mode='human', close=False):
        """Return the current state; nothing is drawn."""
        return self.state


    def _previous_shares(self):
        """Shares carried over from ``previous_state`` as a single float.

        Accepts a plain number as well as a one-element list/array, so both
        spellings of a continued state work.
        """
        return float(np.sum(np.atleast_1d(self.previous_state[2])))


    def _initiate_state(self):
        """Build the state for the current row from the starting cash and shares."""
        if self.initial:
            cash, shares = self.initial_amount, 0
        else:
            cash, shares = self.previous_state[0], self._previous_shares()
        features = [self.data[feature] for feature in self.state_features]
        # np.float64 instead of the np.float alias, which was removed in NumPy 1.24
        return np.array([cash, self.data.close, shares] + features, dtype=np.float64)


    def _update_state(self):
        """Carry cash and shares over and refresh price and features from the current row."""
        features = [self.data[feature] for feature in self.state_features]
        return np.array([self.state[0], self.data.close, self.state[2]] + features,
                        dtype=np.float64)


    def _get_date(self):
        """Return the ``date`` value of the current row."""
        return self.data.date


    def save_asset_memory(self):
        """Return the account value per visited date as a DataFrame."""
        date_list = self.date_memory
        asset_list = self.asset_memory
        df_account_value = pd.DataFrame({'date':date_list,'account_value':asset_list})
        return df_account_value


    def save_action_memory(self):
        """Return the signed number of shares traded per date as a DataFrame."""
        # the last date has no action yet, hence [:-1]
        date_list = self.date_memory[:-1]
        action_list = self.actions_memory
        df_actions = pd.DataFrame({'date':date_list,'actions':action_list})
        return df_actions


    def _seed(self, seed=None):
        """Create the environment's random generator and return ``[seed]``."""
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]


    def get_sb_env(self):
        """Wrap this instance in a ``DummyVecEnv``, reset it, and return ``(env, obs)``."""
        e = DummyVecEnv([lambda: self])
        obs = e.reset()
        return e, obs

In [None]:
# Colab only: mount Google Drive at /content/drive.
# NOTE(review): nothing visible in this notebook reads from or writes to the
# mounted drive (results go to the relative `results/` folder) — confirm
# whether this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

# Training


In [None]:
# Observation size: the state features plus cash, close price and shares held.
state_space = len(state_features)+3

# Shared settings for the training and the trading environment.
env_kwargs = dict(
    initial_amount=10000,
    trade_cost_pct=0.001,
    state_features=state_features,
    discrete=True,
    model_name='a',  # non-empty: per-episode results are saved, needs /results dir
)

# Build the training environment, wrap it for Stable Baselines3 and hand it to the agent.
e_train_gym = StockTradingEnv(df=train, **env_kwargs)
env_train, _ = e_train_gym.get_sb_env()
agent = DRLAgent(env=env_train)

In [None]:
# Create the output folder used by the environment.
# Required whenever `model_name` is non-empty (as in `env_kwargs` above):
# StockTradingEnv saves plots and CSVs under results/ at the end of each episode.
# Only optional when `model_name` is ''.
import os
if not os.path.exists("./results"):
    os.makedirs("./results")

In [None]:
# Build a PPO model with the agent's default settings and train it.
model_ppo = agent.get_model("ppo")
trained_ppo = agent.train_model(
    model=model_ppo,
    tb_log_name='ppo',
    total_timesteps=50000,
)

# Trade

In [None]:
# Replay the trained policy on the trading split, using the same settings as in training.
e_trade_gym = StockTradingEnv(df=trade, **env_kwargs)

df_account_value, df_actions = DRLAgent.DRL_prediction(
    model=trained_ppo,
    environment=e_trade_gym,
)

episode: 2, iteration: 2
begin_total_asset: 10000
end_total_asset: 10180.680104598801
total_cost: 90.03891478983357
total_trades: 9
===
hit end!


In [None]:
# create data for chart: one row per date with price, action and account value
temp_df = pd.merge(trade, df_actions, on="date")
temp_df = pd.merge(temp_df, df_account_value, on="date")

# Buy-and-hold benchmark: put the whole starting capital into the asset at the
# first close and hold it. Computed for the full column at once, with the
# capital taken from the environment settings instead of a repeated literal.
temp_df["buy&hold"] = temp_df["close"] * (
    env_kwargs["initial_amount"] / temp_df.loc[0, "close"]
)

# display only (not assigned): hide the feature and price columns
temp_df.drop(state_features+["close"], axis=1)

Unnamed: 0,date,actions,account_value,buy&hold
0,2021-12-18 00:00:00,0.000000,10000.000000,10000.000000
1,2021-12-18 00:05:00,2.590341,10000.000000,9969.269916
2,2021-12-18 00:10:00,0.000000,9958.353694,9937.720859
3,2021-12-18 00:15:00,0.000000,10002.109972,9981.342674
4,2021-12-18 00:20:00,0.000000,9970.656550,9949.985909
...,...,...,...,...
283,2021-12-18 23:35:00,0.000000,10196.530769,10248.768546
284,2021-12-18 23:40:00,0.000000,10195.653630,10247.887787
285,2021-12-18 23:45:00,0.000000,10200.458746,10252.712730
286,2021-12-18 23:50:00,0.000000,10195.172051,10247.404221


In [None]:
# Draw the trading day with a y grid every 100 units and save it as chart_1.png.
generate_trade_chart(
    temp_df,
    name="chart_1.png",
    scalar=100,
)

10400
9700
