# Imports

In [None]:
import pandas as pd
import pyarrow as pa
import numpy as np

from tqdm import tqdm
from datetime import datetime, timedelta

In [None]:
# Raise pandas' display limits so wide/long frames are rendered in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# `display` and `HTML` belong to the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated and breaks on recent IPython.
from IPython.display import display, HTML

# Inject CSS so the notebook's `.container` element spans 85% of the page.
display(HTML("<style>.container { width:85% !important; }</style>"))

# Read Data

If you do not have the data then please run the *data-collector.ipynb* notebook and collect the specific crypto symbol you want to simulate.

In the future I will probably turn this into a Streamlit utility to make it much more user-friendly, but I am lazy and this is a prototype.

In [None]:
# Trading pair whose collected data should be loaded.
symbol = "BTCUSDT"

# The data is read from the directory belonging to this symbol's partition.
symbol_dir = f"../data_binance_crypto/symbol={symbol}/"
df = pd.read_parquet(symbol_dir, engine="pyarrow")

# Last expression of the cell: shows (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

## Quick and simple EDA

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print every column label (sorted) next to its dtype.
columns = df.columns
for col in sorted(columns):
    # A single format spec left-aligns and pads to 40 characters; the previous
    # `.ljust(40)` + `:40` padded twice and required the label to be a `str`.
    print(f"Col: {col!s:<40} Type: {df[col].dtype}")

In [None]:
# Columns that must hold floats before any arithmetic is done on them
# (presumably they are not loaded as floats -- confirm against the collector).
float_columns = [
    "open",
    "high",
    "low",
    "close",
    "volume",
    "quote_asset_vol",
    "taker_buy_base_asset_vol",
    "taker_buy_quote_asset_vol",
    "ignore_this",
]

# Cast all of the listed columns in a single assignment.
df[float_columns] = df[float_columns].astype(float)

In [None]:
# Summary statistics for the numeric columns.
df.describe()

# Feature engineering

Initially this is going to be a bunch of *traditionally useful* financial features. In the paraphrased words of our boy Ernest Chan, "throw heaps of features at your models during prototyping, let the model decide what it thinks is important, it is probably smarter than you."

^This is obviously a joke taken to extremes, but you get the point. Who are we to decide what is a "good feature"? If we want to find some form of market inefficiency, then why not try everything and use feature selection to trim down our list for us?

According to this, https://arxiv.org/abs/2005.12483, LIME is just as promising as SHAP so I am eager to explore this. If it fails we can always fall back on good old SHAP

In [None]:
# Preview the frame before any engineered features are added.
df.head()

## Sell Conditions
The "taker buy" columns come from when an order is filled based on an existing limit sell

This number will always be less than the raw traded volume. This is because the raw volume will include market price sells

We can therefore assume that when we take the ratio it will be a number between zero and one that represents the proportion of intentional sales. The complement of this number (one minus the ratio) will be the trades that come from panic selling at market or other market forces like liquidation

In [None]:
# Taker-buy base-asset volume divided by the total volume of each row.
taker_buy_volume = df["taker_buy_base_asset_vol"]
total_volume = df["volume"]
df["ratio_intentional_trades"] = taker_buy_volume.div(total_volume)

In [None]:
# 10-row moving average of the ratio; the first 9 rows are NaN because the
# window is not yet full.
ratio_window = df["ratio_intentional_trades"].rolling(10)
df["ratio_intentional_trades_ma_10"] = ratio_window.mean()

In [None]:
# Preview the frame with the ratio columns added.
df.head()

## Datetime semantics

### Year based
If assuming cyclical behaviour then we would need more cases of each cycle.

Because we have only collected ~8 months of training data we will have less than one cycle making this yearly data almost useless

In [None]:
# df["quarter"] = pd.to_datetime(df["day"]).dt.quarter

In [None]:
# df["month_of_year"] = pd.to_datetime(df["day"]).dt.month

In [None]:
# df["week_of_year"] = pd.to_datetime(df["day"]).dt.isocalendar().week

### Month based

Month based data is assuming "Fund Flow" interactions where you have large firms re-balancing on a monthly schedule

This might not be part of crypto but could prove valuable anyways

In [None]:
# Day of the month, taken from the parsed "day" column.
day_timestamps = pd.to_datetime(df["day"])
df["day_of_month"] = day_timestamps.dt.day

In [None]:
# Day of the week as an integer (pandas convention: Monday=0 ... Sunday=6).
day_timestamps = pd.to_datetime(df["day"])
df["day_of_week"] = day_timestamps.dt.dayofweek

### Day based

Day based data will show the influence each region has on the price. 

Because crypto is traded 24/7 it doesn't have a traditional market open and close. However each region around the world still needs to sleep and work so you will end up getting cyclical regional activity.

Each region has different risks and economic situations so you should get different behaviours.

You might also catch algorithmic trading based on certain times of day

In [None]:
# "open_time" is interpreted as milliseconds since the epoch (unit="ms").
open_timestamps = pd.to_datetime(df["open_time"], unit="ms")
df["hour_of_day"] = open_timestamps.dt.hour

In [None]:
# Minute within the hour, from "open_time" parsed as epoch milliseconds.
open_timestamps = pd.to_datetime(df["open_time"], unit="ms")
df["minute_of_hour"] = open_timestamps.dt.minute

In [None]:
# Preview the frame with the datetime-derived columns added.
df.head()

## Make trade values stationary

For timeseries data there is a concept called "stationarity", in short this is the act of transforming the data so that statistical properties such as mean, variance, autocorrelation, etc actually mean something for the future. 

Commonly this is done by converting changes in price to percentages and scaling them across your train data. The model can only act on information it has seen before, by making your data stationary it is possible that it can pick out recurring behaviour.

***Disclaimer***: To be frank, the choice of 10 for the rolling window size is totally arbitrary

### Price change

Here we have the raw price change and the price change as a percentage of the opening

We also want to look at a "smoother" price change in the form of a moving average. This is because typically when financial data is trending it isn't a nice line, it is a "spiky" line. By taking the moving average we lose some granularity but we can see the underlying momentum trend

In [None]:
# Move of each row (close minus open), and the same move as a fraction of the
# open (a ratio, not multiplied by 100).
candle_move = df["close"].sub(df["open"])
df["price_change"] = candle_move
df["price_change_perc"] = candle_move.div(df["open"])

In [None]:
# 10-row moving average of the raw price change (NaN until the window fills).
price_change_window = df["price_change"].rolling(10)
df["price_change_ma_10"] = price_change_window.mean()

# Smoothed change expressed relative to the current row's open.
df["price_change_perc_smooth"] = df["price_change_ma_10"].div(df["open"])

In [None]:
# Inspect the price columns next to the features derived from them.
# (The "price_change" label was missing its closing quote, a SyntaxError.)
df[[
    "open",
    "close",
    "price_change",
    "price_change_perc",
    "price_change_ma_10",
    "price_change_perc_smooth",
]].head(100)

### Volatility

Variance or "volatility" can be used to determine how stable the trading period was.

Again we are using a smooth approach to see if it helps the model.

In [None]:
# High-low range of each row, and the same range as a fraction of the open.
high_low_range = df["high"].sub(df["low"])
df["volatility"] = high_low_range
df["volatility_perc"] = high_low_range.div(df["open"])

In [None]:
# 10-row moving average of the high-low range (NaN until the window fills).
volatility_window = df["volatility"].rolling(10)
df["volatility_ma_10"] = volatility_window.mean()

# Smoothed range expressed relative to the current row's open.
df["volatility_perc_smooth"] = df["volatility_ma_10"].div(df["open"])

In [None]:
# Inspect the high/low columns next to the volatility features.
# (The "volatility" label was missing its closing quote, a SyntaxError.)
df[[
    "high",
    "low",
    "volatility",
    "volatility_perc",
    "volatility_ma_10",
    "volatility_perc_smooth",
]].head(100)

### Volume change

Volume is another indicator of stability. If the volume suddenly jumps above the norm we might expect a change in market direction. 

For this reason we are looking at the volume change since last time period and another smoothed version

In [None]:
# Volume of the previous row (NaN for the first row, which has no predecessor).
previous_volume = df["volume"].shift(1)
df["last_volume"] = previous_volume

# Change versus the previous row, absolute and as a fraction of that row.
df["volume_change"] = df["volume"].sub(previous_volume)
df["volume_change_perc"] = df["volume_change"].div(previous_volume)

In [None]:
# 10-row moving average of the volume change (NaN until the window fills).
volume_change_window = df["volume_change"].rolling(10)
df["volume_change_ma_10"] = volume_change_window.mean()

# Smoothed change expressed relative to the previous row's volume.
df["volume_change_perc_smooth"] = df["volume_change_ma_10"].div(df["last_volume"])

In [None]:
# Inspect the volume columns next to the volume-change features.
# (The "volume_change" label was missing its closing quote, a SyntaxError.)
df[[
    "volume",
    "last_volume",
    "volume_change",
    "volume_change_perc",
    "volume_change_ma_10",
    "volume_change_perc_smooth",
]].head(100)

## Graphs

I am eager to get on to the model so will leave the feature engineering here for now, we can always return and add more to it

In [None]:
import plotly.express as px

# px.scatter(
#     df,
#     x="open_time",
#     y="ratio_intentional_trades",
#     height=1024
# )

In [None]:
import plotly.express as px

# px.scatter(
#     df,
#     x="open_time",
#     y="open",
#     height=1024
# )

# Gym

In [None]:
import gym

from gym import spaces
from sklearn import preprocessing

In [None]:
class CryptoTradingEnv(gym.Env):
    
    metadata = {"render.modes": ["live", "file", "none"]}
    scaler = preprocessing.MinMaxScaler()
    viewer = None
    
    def __init__(
        self,
        df: pd.DataFrame,
        lookback_window_size: int = 50,
        commission: float = 0.00075,
        initial_balance: float = 1_000.0,
        serial: bool = False,
    ):
        """Store the environment configuration and declare its gym spaces.

        Args:
            df: Frame kept as ``self.df`` (presumably the engineered candle
                data -- confirm against the caller).
            lookback_window_size: Determines the observation width; the
                observation box has ``lookback_window_size + 1`` columns.
            commission: Stored as ``self.commission``; presumably the
                per-trade fee rate -- TODO confirm where it is applied.
            initial_balance: Stored as ``self.initial_balance``.
            serial: Stored as ``self.serial``; its effect is not determined
                by this method.
        """
        super().__init__()

        self.df = df
        # Keep the window size so it stays available after construction.
        self.lookback_window_size = lookback_window_size
        self.initial_balance = initial_balance
        self.commission = commission
        self.serial = serial

        # The agent can buy, sell, hold, at certain amounts 1/10 through 10/10
        self.action_space = spaces.MultiDiscrete([3, 10])

        # Observes the OHLCV values, net worth, and trade history.
        # `shape` and `dtype` must be passed as keywords: the original called
        # an undefined `shape(...)` and misspelt `dtype` as `dytpe`.
        self.observation_space = spaces.Box(
            low=0,
            high=1,
            shape=(10, lookback_window_size + 1),
            dtype=np.float16,
        )
    
    