# Intro

The idea of using reinforcement learning for cryptocurrencies was proposed by one of my colleagues, and subsequent searching led me to some Medium articles.

They are not super great (sloppy code and many logical holes), but at least they are a starting point. I will just throw down the code here and use the exploration notebook in this directory to modify the approach.

[Link for those curious](https://towardsdatascience.com/creating-bitcoin-trading-bots-that-dont-lose-money-2e7165fb0b29)

# Graph visualisation

In [6]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from tqdm import tqdm
from datetime import datetime, timedelta

from matplotlib import style

# finance module is no longer part of matplotlib
# see: https://github.com/matplotlib/mpl_finance
import mplfinance as mpf



# Apply matplotlib's dark theme to the figures created below
style.use('dark_background')

# Fraction of the price panel reserved for the volume overlay; used both to
# extend the price axis downwards and to cap the height of the volume bars
VOLUME_CHART_HEIGHT = 0.33

# Candle / volume-bar colours for rising (UP) and falling (DOWN) candles
UP_COLOUR = '#27A59A'
DOWN_COLOUR = '#EF534F'
# Text colours for the buy (UP) and sell (DOWN) trade annotations
UP_TEXT_COLOUR = '#73D3CC'
DOWN_TEXT_COLOUR = '#DC2C27'



class CryptoTradingGraph:
    """A crypto trading visualization using matplotlib made to render OpenAI gym environments.

    The figure has two stacked panels that share one time axis: the agent's
    net worth on top, and a candlestick price chart below with a volume
    overlay drawn on a twin y-axis.

    Args:
        df: Candle data. The ``open_time``, ``open``, ``high``, ``low``,
            ``close`` and ``volume`` columns are read positionally.
        title: Optional figure title.
    """

    def __init__(self, df, title=None):
        self.df = df
        # One net-worth slot per candle; filled in step by step by render()
        self.net_worths = np.zeros(len(df["open_time"]))

        # Create a figure on screen and set the title. The handle is kept so
        # close() can close this figure rather than whichever one is current.
        self.fig = plt.figure()
        self.fig.suptitle(title)

        # Create top subplot for net worth axis (rows 0-1 of a 6-row grid)
        self.net_worth_ax = plt.subplot2grid(
            (6, 1),
            (0, 0),
            rowspan=2,
            colspan=1
        )

        # Create bottom subplot for shared price/volume axis. It takes the
        # remaining 4 rows of the grid (the previous rowspan of 8 overran it).
        self.price_ax = plt.subplot2grid(
            (6, 1),
            (2, 0),
            rowspan=4,
            colspan=1,
            sharex=self.net_worth_ax
        )

        # Create a new axis for volume which shares its x-axis with price
        self.volume_ax = self.price_ax.twinx()

        # Add padding to make graph easier to view
        plt.subplots_adjust(
            left=0.11,
            bottom=0.24,
            right=0.90,
            top=0.90,
            wspace=0.2,
            hspace=0
        )

        # Show the graph without blocking the rest of the program
        plt.show(block=False)

    @staticmethod
    def _money(value):
        """Format a number with thousands separators and two decimals."""
        return f"{value:,.2f}"

    @staticmethod
    def _date_num(timestamp):
        """Convert an ``open_time`` value to a matplotlib date number."""
        # NOTE(review): assumes open_time is a POSIX timestamp in seconds, as
        # the original utcfromtimestamp() call did. If the source data is in
        # milliseconds it must be divided by 1000 first - confirm.
        return mdates.date2num(datetime.utcfromtimestamp(timestamp))

    @staticmethod
    def _bar_width(dates):
        """Return a bar width of 80% of the typical spacing between dates.

        Falls back to 1.0 when the spacing cannot be determined (fewer than
        two dates, or non-increasing dates).
        """
        if len(dates) > 1:
            spacing = float(np.median(np.diff(dates)))
            if spacing > 0:
                return 0.8 * spacing
        return 1.0

    def _render_net_worth(self, current_step, net_worth, step_range, dates):
        """Redraw the net-worth panel for the visible window.

        Args:
            current_step: Index of the latest candle in ``self.df``.
            net_worth: Net worth to print in the annotation.
            step_range: ``range`` of frame indices that are visible.
            dates: Matplotlib date numbers, one per index in ``step_range``.
        """
        # Clear the frame rendered last step
        self.net_worth_ax.clear()

        # Plot net worths against real date numbers and let matplotlib treat
        # the x-axis as dates
        self.net_worth_ax.plot(
            dates,
            self.net_worths[step_range],
            '-',
            label="Net Worth"
        )
        self.net_worth_ax.xaxis_date()

        # Show legend, which uses the label we defined for the plot above
        legend = self.net_worth_ax.legend(loc=2, ncol=2, prop={"size": 8})
        legend.get_frame().set_alpha(0.4)

        # The window ends at current_step, so the last date is the current one
        last_date = dates[-1]
        last_net_worth = self.net_worths[current_step]

        # Annotate the current net worth on the net worth graph
        self.net_worth_ax.annotate(
            self._money(net_worth),
            (last_date, last_net_worth),
            xytext=(last_date, last_net_worth),
            bbox=dict(boxstyle="round", fc='w', ec='k', lw=1),
            color="black",
            fontsize="small"
        )

        # Add space above and below min/max net worth. Steps that have not
        # been rendered yet are still zero and are ignored for the lower
        # bound; skip entirely if nothing non-zero has been recorded.
        recorded = self.net_worths[np.nonzero(self.net_worths)]
        if recorded.size:
            self.net_worth_ax.set_ylim(
                recorded.min() / 1.25,
                self.net_worths.max() * 1.25
            )

    def _render_price(self, current_step, net_worth, dates, step_range):
        """Redraw the candlestick panel and annotate the latest close.

        Args:
            current_step: Index of the latest candle in ``self.df``.
            net_worth: Unused; kept so all ``_render_*`` helpers share a shape.
            dates: Matplotlib date numbers, one per index in ``step_range``.
            step_range: ``range`` of frame indices that are visible.
        """
        # mplfinance keeps the old mpl_finance drawing functions in its
        # ``original_flavor`` module; ``mpf.plot`` has a different signature
        # (DataFrame in, new figure out) and cannot draw onto an existing axis.
        from mplfinance.original_flavor import candlestick_ochl

        self.price_ax.clear()

        # candlestick_ochl expects (time, open, close, high, low) tuples
        candlesticks = zip(
            dates,
            self.df["open"].values[step_range],
            self.df["close"].values[step_range],
            self.df["high"].values[step_range],
            self.df["low"].values[step_range]
        )

        candlestick_ochl(
            self.price_ax,
            candlesticks,
            width=self._bar_width(dates),
            colorup=UP_COLOUR,
            colordown=DOWN_COLOUR
        )

        last_date = dates[-1]
        last_close = self.df["close"].values[current_step]
        last_high = self.df["high"].values[current_step]

        # Print the current price to the price axis
        self.price_ax.annotate(
            self._money(last_close),
            (last_date, last_close),
            xytext=(last_date, last_high),
            bbox=dict(boxstyle='round', fc='w', ec='k', lw=1),
            color="black",
            fontsize="small"
        )

        # Shift price axis up to give volume chart space
        ylim = self.price_ax.get_ylim()
        self.price_ax.set_ylim(
            ylim[0] - (ylim[1] - ylim[0]) * VOLUME_CHART_HEIGHT,
            ylim[1]
        )

    def _render_volume(self, current_step, net_worth, dates, step_range):
        """Redraw the volume bars, coloured by candle direction.

        Args:
            current_step: Unused; kept for a uniform helper signature.
            net_worth: Unused; kept for a uniform helper signature.
            dates: Matplotlib date numbers, one per index in ``step_range``.
            step_range: ``range`` of frame indices that are visible.
        """
        self.volume_ax.clear()

        volume = np.array(self.df["volume"].values[step_range])

        # Candle direction: close above open is "up", close below is "down".
        # Candles with open == close fall in neither group and get no bar.
        change = (
            self.df["close"].values[step_range]
            - self.df["open"].values[step_range]
        )
        width = self._bar_width(dates)

        for mask, colour in ((change > 0, UP_COLOUR), (change < 0, DOWN_COLOUR)):
            self.volume_ax.bar(
                dates[mask],
                volume[mask],
                color=colour,
                alpha=0.4,
                width=width,
                align="center"
            )

        # Cap volume axis height below price chart and hide ticks
        self.volume_ax.set_ylim(0, max(volume) / VOLUME_CHART_HEIGHT)
        self.volume_ax.yaxis.set_ticks([])

    def _render_trades(self, current_step, trades, step_range):
        """Annotate every trade that falls inside the visible window.

        Args:
            current_step: Unused; kept for a uniform helper signature.
            trades: Iterable of dicts with ``step``, ``type`` and ``total``.
            step_range: ``range`` of frame indices that are visible.
        """
        for trade in trades:
            if trade["step"] not in step_range:
                continue

            date = self._date_num(self.df["open_time"].values[trade["step"]])

            # Buys are marked at the candle's low, everything else at its high
            if trade["type"] == 'buy':
                high_low = self.df["low"].values[trade["step"]]
                colour = UP_TEXT_COLOUR
            else:
                high_low = self.df["high"].values[trade["step"]]
                colour = DOWN_TEXT_COLOUR

            # Print the trade total on the price axis. The number is formatted
            # exactly once; re-applying a numeric format spec to the formatted
            # string raises ValueError.
            self.price_ax.annotate(
                f"${self._money(trade['total'])}",
                (date, high_low),
                xytext=(date, high_low),
                color=colour,
                fontsize=8,
                arrowprops=(dict(color=colour))
            )

    def render(self, current_step, net_worth, trades, window_size=40):
        """Record the latest net worth and redraw all panels.

        Args:
            current_step: Index of the latest candle in ``self.df``.
            net_worth: Net worth at ``current_step``.
            trades: Iterable of trade dicts (see ``_render_trades``).
            window_size: Number of candles shown before ``current_step``.
        """
        self.net_worths[current_step] = net_worth

        window_start = max(current_step - window_size, 0)
        step_range = range(window_start, current_step + 1)

        # Numeric matplotlib dates, so the line plot, candlesticks, volume
        # bars and annotations all share the same x coordinates
        dates = np.array([
            self._date_num(x) for x in self.df["open_time"].values[step_range]
        ])

        self._render_net_worth(current_step, net_worth, step_range, dates)
        self._render_price(current_step, net_worth, dates, step_range)
        self._render_volume(current_step, net_worth, dates, step_range)
        self._render_trades(current_step, trades, step_range)

        # Format the date ticks to be more easily read. A formatter keeps the
        # labels attached to the correct tick positions.
        self.price_ax.xaxis.set_major_formatter(
            mdates.DateFormatter("%Y-%m-%d %H:%M")
        )
        plt.setp(
            self.price_ax.get_xticklabels(),
            rotation=45,
            horizontalalignment="right"
        )

        # Hide duplicate net worth date labels
        plt.setp(self.net_worth_ax.get_xticklabels(), visible=False)

        # Necessary to view frames before they are unrendered
        plt.pause(0.001)

    def close(self):
        """Close the figure owned by this graph."""
        plt.close(self.fig)


# The gym environment

In [22]:
import gym

from gym import spaces
from sklearn import preprocessing



# Exclusive upper bound on the randomly drawn session length (in rows) used by
# CryptoTradingEnv._reset_session when serial=False.
# NOTE(review): "~2 months" presumably assumes 1-minute candles
# (100,000 minutes is roughly 69 days) - confirm against the data interval.
MAX_TRADING_SESSION = 100000  # ~2 months



class CryptoTradingEnv(gym.Env):
    """Gym environment that simulates trading one asset over OHLCV candles.

    Each observation stacks the last ``lookback_window_size + 1`` candles
    (open, high, low, close, volume) on top of the same number of columns of
    account history (net worth, assets bought, cost, assets sold, sales).

    Actions are ``[action_type, amount_index]``: ``action_type`` 0 buys,
    1 sells, 2 holds; ``amount_index`` 0-9 selects 1/10 through 10/10 of the
    balance (buy) or of the held assets (sell).

    Args:
        df: Candle data with ``open``, ``high``, ``low``, ``close``,
            ``volume`` and ``open_time`` columns.
        lookback_window_size: Number of past candles included in each
            observation in addition to the current one.
        commission: Proportional fee charged on every buy and sell.
        initial_balance: Cash balance at the start of each reset.
        serial: If True, traverse the frame from start to end; otherwise
            each session is a random slice of the frame.
    """

    # "human" is the only mode render() acts on; the other names are kept for
    # backward compatibility.
    metadata = {"render.modes": ["human", "live", "file", "none"]}
    # Shared by all instances; it is re-fitted on every observation, so no
    # state carries over between calls.
    scaler = preprocessing.MinMaxScaler()
    viewer = None

    def __init__(
            self,
            df: pd.DataFrame,
            lookback_window_size : int=50,
            commission : float = 0.00075,
            initial_balance : float = 10_000.0,
            serial : bool = False
    ):
        super().__init__()

        self.df = df.dropna().reset_index()
        self.lookback_window_size=lookback_window_size
        self.initial_balance = initial_balance
        self.commission = commission
        self.serial = serial
        self.trades = []

        # The agent can buy, sell, hold, at certain amounts 1/10 through 10/10
        self.action_space = spaces.MultiDiscrete([3, 10])

        # Observes the OHLCV values, net worth, and trade history.
        # NOTE(review): only the account-history rows are min-max scaled in
        # _next_observation; the OHLCV rows are passed through raw, so they
        # may fall outside the declared [0, 1] bounds - confirm intent.
        self.observation_space = spaces.Box(
            shape=(10, lookback_window_size + 1),
            low=0,
            high=1
        )

    def reset(self):
        """Reset balance, holdings, history and session.

        Returns:
            The first observation of the new session.
        """
        self.balance = self.initial_balance
        self.net_worth = self.initial_balance
        self.assets_held = 0

        self._reset_session()

        # One column per time step: net worth, assets bought, cost,
        # assets sold, sales. Pre-filled so a full window always exists.
        self.account_history = np.repeat([
            [self.net_worth],
            [0],
            [0],
            [0],
            [0]
        ],
            self.lookback_window_size + 1,
            axis=1
        )

        self.trades = []

        return self._next_observation()

    def _reset_session(self):
        """Pick the slice of the frame used for the next trading session.

        Raises:
            ValueError: If the frame has fewer than
                ``lookback_window_size + 2`` rows, which leaves no room for
                even a single step.
        """
        # I am not convinced on the "random traversal" approach here
        # but there is some supporting evidence that it works, so I will humour it

        self.current_step = 0

        # Rows left once the initial lookback window has been set aside
        available = len(self.df) - self.lookback_window_size
        if available < 2:
            raise ValueError(
                f"DataFrame has {len(self.df)} usable rows; at least "
                f"{self.lookback_window_size + 2} are required for "
                f"lookback_window_size={self.lookback_window_size}"
            )

        if self.serial:
            self.steps_left = available - 1
            self.frame_start = self.lookback_window_size
        else:
            # Random traversal. The session length is capped by the frame
            # size so the start index below always has a valid range; for
            # frames longer than MAX_TRADING_SESSION this is the same draw
            # as before.
            self.steps_left = np.random.randint(
                1,
                min(MAX_TRADING_SESSION, available)
            )
            self.frame_start = np.random.randint(
                self.lookback_window_size,
                len(self.df) - self.steps_left
            )

        self.active_df = self.df[
            self.frame_start - self.lookback_window_size : self.frame_start + self.steps_left
        ]

    def _next_observation(self):
        """Build the observation for the current step.

        Returns:
            Array of shape ``(10, lookback_window_size + 1)``: five OHLCV
            rows followed by five scaled account-history rows.
        """
        # It is really important to ONLY scale the data that the model has seen,
        # This is to prevent 'look-ahead bias'
        window = self.lookback_window_size + 1
        end = self.current_step + window

        obs = np.array([
            self.active_df[column].values[self.current_step:end]
            for column in ("open", "high", "low", "close", "volume")
        ])

        # MinMaxScaler scales each column independently, so scaling only the
        # window that is returned gives the same values as scaling the whole
        # (ever-growing) history and slicing afterwards.
        # NOTE(review): with this layout each time step is scaled across the
        # five account metrics rather than each metric across time - confirm
        # that this is the intended normalisation.
        scaled_history = self.scaler.fit_transform(
            self.account_history[:, -window:]
        )

        return np.append(obs, scaled_history, axis=0)

    def _get_current_price(self):
        """Return the close price of the candle at the current step."""
        return self.df["close"].values[self.frame_start + self.current_step]

    def step(self, action):
        """Apply ``action`` and advance one candle.

        When the session runs out of steps, all held assets are liquidated
        at the current price and a new session is started without ending
        the episode.

        Args:
            action: ``[action_type, amount_index]`` as described on the class.

        Returns:
            ``(observation, reward, done, info)`` where the reward is the
            current net worth and ``done`` is True once net worth is <= 0.
        """
        # NOTE(review): the 0.01 offset presumably guards against a zero
        # price - confirm.
        current_price = self._get_current_price() + 0.01
        self._take_action(action, current_price)
        self.steps_left -= 1
        self.current_step += 1

        if self.steps_left == 0:
            self.balance += self.assets_held * current_price
            self.assets_held = 0
            self._reset_session()

        obs = self._next_observation()
        reward = self.net_worth
        done = bool(self.net_worth <= 0)

        return obs, reward, done, {}

    def _take_action(self, action, current_price):
        """Execute a buy, sell or hold and record the result.

        Args:
            action: ``[action_type, amount_index]``; ``amount_index`` 0-9
                maps to a fraction of 1/10-10/10.
            current_price: Price at which the trade is executed.
        """
        action_type = action[0]
        # Map index 0..9 to 1/10..10/10, matching the documented action space
        amount = (action[1] + 1) / 10

        assets_bought = 0
        assets_sold = 0
        cost = 0
        sales = 0

        if action_type < 1:
            # Trigger a buy: spend ``amount`` of the balance. The commission
            # is paid out of that budget, so the balance can never go
            # negative, even on a 10/10 buy.
            cost = self.balance * amount
            assets_bought = cost / (current_price * (1 + self.commission))
            self.assets_held += assets_bought
            self.balance -= cost

        elif action_type < 2:
            # Trigger a sell: sell ``amount`` of the held assets
            assets_sold = self.assets_held * amount
            sales = assets_sold * current_price * (1 - self.commission)
            self.assets_held -= assets_sold
            self.balance += sales

        if assets_bought > 0 or assets_sold > 0:
            self.trades.append({
                "step": self.frame_start + self.current_step,
                "amount": assets_bought if assets_bought > 0 else assets_sold,
                "total": cost if assets_bought > 0 else sales,
                "type": "buy" if assets_bought > 0 else "sell"
            })

        self.net_worth = self.balance + self.assets_held * current_price
        self.account_history = np.append(
            self.account_history,
            [
                [self.net_worth],
                [assets_bought],
                [cost],
                [assets_sold],
                [sales]
            ],
            axis=1
        )

    def render(self, mode="human", **kwargs):
        """Draw the current state in a live matplotlib window.

        Args:
            mode: Only ``"human"`` draws anything; other modes are ignored.
            **kwargs: ``title`` sets the figure title when the viewer is
                first created.
        """
        if mode == "human":
            if self.viewer is None:
                self.viewer = CryptoTradingGraph(
                    self.df,
                    kwargs.get("title", None)
                )

            self.viewer.render(
                self.frame_start + self.current_step,
                self.net_worth,
                self.trades,
                window_size=self.lookback_window_size
            )

    def close(self):
        """Close the viewer window, if one was opened."""
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None


# Running it

In [8]:
import pandas as pd
import pyarrow as pa

In [34]:
# Trading pair whose data is loaded below
symbol = "BTCUSDT"

# Read the parquet data stored under the symbol=<symbol> directory
df = pd.read_parquet(f"../data_binance_crypto/symbol={symbol}/", engine="pyarrow")

# Cell output: (rows, columns)
df.shape

(349290, 13)

In [35]:
# List every column with its dtype, sorted by name. A single left-aligned
# width of 40 replaces the previous ljust() plus a second width spec, which
# padded the name twice to the same effect.
columns = df.columns
for col in sorted(columns):
    print(f"Col: {col:<40} Type: {df[col].dtype}")

Col: close                                    Type: object
Col: close_time                               Type: int64
Col: day                                      Type: category
Col: high                                     Type: object
Col: ignore_this                              Type: object
Col: low                                      Type: object
Col: num_trades                               Type: int64
Col: open                                     Type: object
Col: open_time                                Type: int64
Col: quote_asset_vol                          Type: object
Col: taker_buy_base_asset_vol                 Type: object
Col: taker_buy_quote_asset_vol                Type: object
Col: volume                                   Type: object


In [36]:
# The Medium article kept things simple and only used the OHLCV columns.
# open_time is carried along as well but is not part of `cols`.
cols = ["open", "high", "close", "low", "volume"]
df = df[[*cols, "open_time"]].copy()

In [37]:
# The price/volume columns are loaded as objects; convert them all to float
df = df.astype(dict.fromkeys(cols, float))

In [38]:
# Hold out the final 100,000 rows for testing; everything before is training
slice_point = len(df) - 100_000

train_df, test_df = df[:slice_point], df[slice_point:]

In [39]:
# Training environment: serial=False, so each session is a random slice of the frame
train_env = CryptoTradingEnv(train_df, commission=0.00075, serial=False)

# Test environment: serial=True, so the held-out frame is traversed in order
test_env = CryptoTradingEnv(test_df, commission=0.00075, serial=True)

In [40]:
from stable_baselines3 import A2C

# A2C agent with an MLP policy on the training environment; training metrics
# are logged under ./tensorboard/
model = A2C(
    "MlpPolicy",
    train_env,
    verbose=1,
    tensorboard_log="./tensorboard/"
)

# Train for 50,000 environment steps
model.learn(total_timesteps=50_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./tensorboard/A2C_8
------------------------------------
| time/                 |          |
|    fps                | 471      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.84    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 4.93e+04 |
|    value_loss         | 9.96e+08 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 521      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.76    |
|    explained_variance | 0        |
|    learning_rate      

<stable_baselines3.a2c.a2c.A2C at 0x7fa310e53790>