In [22]:
from copy import deepcopy
from datetime import date, datetime, timedelta
from functools import partial
from importlib import reload
from pathlib import Path

import metrics
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotting
import seaborn as sns
import tensorflow as tf
from datapoints import assets
from plotly.subplots import make_subplots
from query_datasets import get_data
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import estimator_checks
from tensorflow.keras import layers
from tensorflow.keras.activations import sigmoid, tanh
from tensorflow.keras.metrics import Accuracy, Precision, Recall
from tensorflow.keras.models import Model
from tools import dataframe, training, wandb_api
from tqdm import tqdm
from wandb.keras import WandbCallback

# Toggle for Weights & Biases logging; the next cell only creates a wandb run
# (and uses `wandb.config`) when this is True, otherwise `config` is a plain dict.
log_wandb = False
# Parent of the resolved current working directory
# (presumably the repository root when the notebook runs from a sub-folder — confirm).
repo_path = Path().resolve().parent
# pio.renderers.default = "browser"


In [5]:
# Build the experiment `config`: a plain dict when logging is disabled,
# otherwise the config object of a freshly initialised wandb run.
if not log_wandb:
    config = {}

else:
    import wandb

    wandb_api.login()
    wandb_run_settings = {
        "project": "crypto_prediction",
        "group": "Adaboost LSTM",
        "job_type": "test",
    }
    run = wandb.init(**wandb_run_settings)
    config = wandb.config

In [198]:
# Tickers to download and analyse; the later cells only look at the first entry.
interesting_tickers = ["ETH"]


In [208]:
def compute_features(data, timesteps=8, lag=5):
    """Build lagged percentage-change features and look-ahead labels.

    For every column ``col`` of ``data`` and every ``i`` in ``range(timesteps)``
    a column ``"{col}_{i}"`` is appended, holding the one-step percentage
    change of ``col`` shifted down by ``i`` rows. The raw columns of ``data``
    are kept in the feature frame. Infinite values are replaced by 0 and rows
    that still contain NaN are dropped.

    Args:
        data: DataFrame of numeric columns; must contain a ``"Close"`` column.
        timesteps: number of lagged percentage-change copies per column.
        lag: number of rows the label is shifted towards the past, i.e. the
            label at row ``t`` is the one-step change of ``Close`` at ``t + lag``.

    Returns:
        ``(features, labels)``. ``features`` is a new DataFrame (``data`` is not
        modified). ``labels`` is a Series on the full index of ``data``; its NaN
        rows are NOT dropped here, so callers must align it with ``features``.
    """
    # The percentage change does not depend on the lag index: compute it once
    # and only shift it inside the comprehension.
    pct = data.pct_change()
    lagged = [pct.shift(i).add_suffix(f"_{i}") for i in range(timesteps)]
    # A single concat avoids the frame fragmentation caused by inserting
    # columns one at a time; column order stays data, *_0, *_1, ...
    features = pd.concat([data, *lagged], axis=1)

    labels = data["Close"].pct_change().shift(-lag)

    # A division by a zero previous value yields +/-inf; neutralise it before
    # dropping the warm-up rows that contain NaN.
    features = features.replace([np.inf, -np.inf], 0).dropna()
    return features, labels


def create_asset(
    ticker,
    interval,
    beginning_date,
    ending_date,
    compute_features=lambda x: x,
):
    """Download the klines of ``ticker`` and wrap them in an ``assets.TrainAsset``.

    Args:
        ticker: symbol forwarded to ``get_data.download_klines``.
        interval: kline interval, forwarded to the download and to the asset.
        beginning_date: start of the requested period.
        ending_date: end of the requested period.
        compute_features: callable applied to the cleaned frame; its result is
            unpacked as ``(features, labels)``.

    Returns:
        An ``assets.TrainAsset`` built from the cleaned frame, the features and
        the labels.

    NOTE(review): the default ``compute_features`` is the identity, which does
    not return a ``(features, labels)`` pair, so the unpacking below only works
    when a two-value callable is passed — confirm whether the default is used.
    """
    download_dir = Path().resolve().parent / "tmp"
    klines = get_data.download_klines(
        ticker,
        interval,
        beginning_date=beginning_date,
        ending_date=ending_date,
        directory=download_dir,
    )
    # Other sources are currently disabled and only klines feed the frame:
    #   get_data.download_trends(ticker, interval, ...)
    #   get_data.download_blockchain("BTC", interval, ...)
    #   get_data.download_santiment("BTC", interval, ...)
    # (each took the same beginning_date / ending_date / directory arguments).
    sources = [klines]
    as_float = pd.concat(sources, axis=1).astype("float32")
    # Infinite values are zeroed before the feature computation.
    data = as_float.replace(to_replace=[np.inf, -np.inf], value=0)

    features, labels = compute_features(data)

    asset_kwargs = dict(
        ticker=ticker,
        df=data,
        labels=labels,
        features=features,
        interval=interval,
        compute_features=compute_features,
    )
    return assets.TrainAsset(**asset_kwargs)


class DataModule:
    """Creates one ``assets.TrainAsset`` per input and slices them into folds.

    ``setup`` (called from ``__init__``) downloads every input through
    ``create_asset``, aligns ``df`` / ``labels`` / ``_features`` on their
    common index and stores the non-empty assets in ``train_datapoints``.
    ``_init_train_val_data`` then cuts those assets into consecutive
    train / validation / test windows.

    ``config`` must provide ``"interval"`` and ``"train_val_test_split"``
    (three numbers; a value > 1 is an absolute row count, a value <= 1 is a
    fraction of the first asset's length).
    """

    def __init__(
        self,
        config,
        compute_features=None,
        inputs=None,
        save_klines=True,
    ):
        self.config = config
        self.compute_features = compute_features
        self.inputs = inputs
        # Stored for callers; not read anywhere inside this class.
        self.save_klines = save_klines

        self.setup()

    def setup(self):
        """Download every input and keep the non-empty, index-aligned assets."""
        self.train_datapoints = []
        # `inputs` defaults to None: treat that as "nothing to load" instead of
        # failing on iteration.
        asset_requests = [] if self.inputs is None else self.inputs
        for asset_kwargs in asset_requests:
            dp = create_asset(
                **asset_kwargs,
                interval=self.config["interval"],
                compute_features=self.compute_features,
            )
            if dp == []:
                continue
            dp.df = dp.df.dropna()
            dp.labels = dp.labels.dropna()
            dp._features = dp._features.dropna()

            # Keep only the rows present in the frame, the labels and the features.
            common_index = dp.df.index.intersection(dp.labels.index)
            common_index = common_index.intersection(dp._features.index)

            dp.df = dp.df.loc[common_index]
            dp.labels = dp.labels.loc[common_index]
            dp._features = dp._features.loc[common_index]

            train_dp = assets.TrainAsset(
                ticker=asset_kwargs["ticker"],
                df=dp.df,
                labels=dp.labels,
                features=dp._features,
                interval=self.config["interval"],
                compute_features=self.compute_features,
            )
            if not train_dp.isempty:
                self.train_datapoints.append(train_dp)
            else:
                print(
                    f"{dp.ticker} is empty from {asset_kwargs['beginning_date']} to {asset_kwargs['ending_date']}."
                )

    def clean_datapoints(self, datapoints):
        """Hook for filtering datapoints; currently returns them unchanged."""
        return datapoints

    def concat_and_shuffle(self, features, labels):
        """Stack per-asset arrays and apply one random permutation to both.

        Args:
            features: list of array-likes, one per asset.
            labels: list of array-likes, same length as ``features``.

        Returns:
            ``(features, labels)`` as numpy arrays, shuffled with the same
            permutation so that rows stay paired.
        """
        assert len(features) == len(labels)
        _features = np.concatenate(features, axis=0)
        _labels = np.concatenate(labels, axis=0)
        assert len(_features) == len(_labels)
        p = np.random.permutation(len(_features))
        return _features[p], _labels[p]

    def nest_train_test_val_split(
        self, datapoints, offset, train_size, val_size, test_size=0
    ):
        """Cut one fold (train / val / test windows) out of every datapoint.

        Windows are positional and consecutive: train starts at ``offset``,
        validation follows train. When ``test_size`` is positive the test
        window follows validation; when it is 0 the test assets reuse the
        validation window (the behaviour this notebook relies on).

        Returns:
            ``(train, val, test_datapoints)`` where ``train`` and ``val`` are
            shuffled ``(features, labels)`` array pairs and ``test_datapoints``
            maps each ticker to an ``assets.TrainAsset`` of the test window.
        """
        train_features = []
        train_labels = []
        val_features = []
        val_labels = []
        test_datapoints = {}

        train_beginning = offset
        train_ending = train_beginning + train_size
        val_beginning = train_ending
        val_ending = val_beginning + val_size
        if test_size > 0:
            # A dedicated hold-out window right after validation.
            test_beginning = val_ending
            test_ending = test_beginning + test_size
        else:
            # No test rows requested: evaluate on the validation window.
            test_beginning = val_beginning
            test_ending = val_ending

        for dp in datapoints:
            # Use .iloc everywhere so all three windows are sliced positionally.
            train_features.append(dp._features.iloc[train_beginning:train_ending])
            train_labels.append(dp.labels.iloc[train_beginning:train_ending])
            val_features.append(dp._features.iloc[val_beginning:val_ending])
            val_labels.append(dp.labels.iloc[val_beginning:val_ending])

            test_datapoints[dp.ticker] = assets.TrainAsset(
                ticker=dp.ticker,
                df=dp.df.iloc[test_beginning:test_ending],
                labels=dp.labels.iloc[test_beginning:test_ending],
                features=dp._features.iloc[test_beginning:test_ending],
                interval=dp.interval,
                compute_features=dp.compute_features,
            )

        return (
            self.concat_and_shuffle(train_features, train_labels),
            self.concat_and_shuffle(val_features, val_labels),
            test_datapoints,
        )

    @staticmethod
    def _resolve_size(spec, n_rows):
        """Turn one split entry into a row count (> 1: absolute, else fraction)."""
        if spec > 1:
            return int(spec)
        return int(n_rows * spec)

    def _init_train_val_data(self, train_datapoints):
        """Build every fold by sliding the split windows over the datapoints.

        Sizes are resolved against the length of the FIRST datapoint's ``df``.
        Folds start at offsets ``0, stride, 2 * stride, ...`` with
        ``stride = val_size + test_size``; at least one fold is produced.

        Returns:
            ``(train_datasets, val_datasets, test_datapoints)``: three lists
            with one entry per fold.
        """
        train_datapoints = self.clean_datapoints(train_datapoints)
        n_rows = len(train_datapoints[0].df)
        split = self.config["train_val_test_split"]
        train_size = self._resolve_size(split[0], n_rows)
        val_size = self._resolve_size(split[1], n_rows)
        test_size = self._resolve_size(split[2], n_rows)
        print(f"train_size: {train_size}, val_size: {val_size}, test_size: {test_size}")
        max_offset = max(n_rows - (train_size + val_size + test_size), 1)
        stride = val_size + test_size
        # range() rejects a zero step; with nothing to validate or test on there
        # is nothing to slide over, so fall back to a single fold at offset 0.
        offsets = range(0, max_offset, stride) if stride > 0 else range(1)

        train_datasets = []
        val_datasets = []
        test_datapoints = []
        for offset in offsets:
            train_dataset, val_dataset, test_datapoint = self.nest_train_test_val_split(
                train_datapoints, offset, train_size, val_size, test_size
            )
            train_datasets.append(train_dataset)
            val_datasets.append(val_dataset)
            test_datapoints.append(test_datapoint)
        return train_datasets, val_datasets, test_datapoints


# Run configuration. `run` only exists when the wandb cell created one.
config["job_type"] = run.job_type if "run" in locals() else "test"
# Entries <= 1 are fractions of the history (see DataModule): no training rows,
# the whole history as validation, no dedicated test window.
config["train_val_test_split"] = [0, 1, 0]
config["interval"] = "1d"
config["timesteps"] = 1
config["lag"] = 1
# Number of intervals to look back from today (converted by
# dataframe.convert_to_timedelta — presumably `ago` * interval; confirm).
config["ago"] = 3400
config["batch_size"] = 64
config["learning_rate"] = 0.0003

inputs = [
    {
        "ticker": ticker,
        "beginning_date": datetime.combine(date.today(), datetime.min.time())
        - dataframe.convert_to_timedelta(config["interval"], ago=config["ago"]),
        "ending_date": datetime(
            2022, 5, 21
        ),  # datetime.combine(date.today(), datetime.min.time()),
    }
    for ticker in interesting_tickers
]

dm = DataModule(
    config,
    partial(compute_features, timesteps=config["timesteps"], lag=config["lag"]),
    inputs,
    save_klines=True,
)
train_datasets, val_datasets, test_datapoints = dm._init_train_val_data(
    dm.train_datapoints
)
# Each list holds one entry per fold; report each list's own length
# (the validation and test lines used to print the training length).
print(f"Length training dataset: {len(train_datasets)}")
print(f"Length validation dataset: {len(val_datasets)}")
print(f"Length test dataset: {len(test_datapoints)}")

print(f"Shape training sample: {train_datasets[0][0].shape}")
print(f"Shape training sample per timestep: {train_datasets[0][0].shape[1] / config['timesteps']}")

print(f"Shape validation sample: {val_datasets[0][0].shape}")
config["input_size"] = train_datasets[0][0].shape[1]
# The feature width must split evenly across the timesteps.
assert config["input_size"] % config["timesteps"] == 0, (
    "input_size must be a multiple of timesteps"
)


[*********************100%***********************]  1 of 1 completed
train_size: 0, val_size: 1653, test_size: 0
Length training dataset: 1
Length validation dataset: 1
Length test dataset: 1
Shape training sample: (0, 10)
Shape training sample per timestep: 10.0
Shape validation sample: (1653, 10)


In [209]:
# Correlation heatmap of the feature frame of the first fold / first ticker.
# The upper triangle (diagonal included) is masked so each pair shows once.
df = test_datapoints[0][interesting_tickers[0]]._features
corr = df.corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
df_mask = corr.mask(mask)

heatmap = go.Heatmap(
    z=df_mask,
    x=corr.columns,
    y=corr.columns,
    colorscale=px.colors.diverging.RdBu,
    zmin=-1,
    zmax=1,
)
fig = go.Figure(heatmap)
fig.update_layout(dict(height=1200, width=1200))
fig.show()
fig.write_image("correlation.svg")

In [210]:
import matplotlib.pyplot as plt
from scipy import stats

# Study of the next-step move after a high-volume bar.
# For each factor i, keep the rows whose volume is at least i times the rolling
# 30-row maximum volume, then summarise the next-step close return and the
# next-step high (relative to the previous close) on those rows.

ROLLING_WINDOW = 30
LOW_HIGH_QUANTILE = 0.20  # level used for "min_high"
TOP_HIGH_QUANTILE = 0.95  # level used for "max_high"
WIN_THRESHOLD = 2000e-6  # horizontal reference line drawn on the plot

n = 10
# NOTE(review): `percent_upper` was not defined anywhere in the saved notebook
# (the cell only ran thanks to leftover kernel state). The grid below is an
# assumption (n evenly spaced factors up to 1.0) — TODO confirm the original values.
percent_upper = np.linspace(0.1, 1.0, n)

# Loop-invariant series, computed once.
volume = df["Volume"]
rolling_volume_max = volume.rolling(ROLLING_WINDOW).max()
previous_close = df["Close"].shift(1)
next_return = df["Close"].pct_change().shift(-1)
next_high = ((df["High"] - previous_close) / previous_close).shift(-1)

records = []
for i in percent_upper:
    rolling_max = i * rolling_volume_max
    is_high_volume = volume >= rolling_max
    # `returns` and `high` stay module-level names: a later cell inspects `high`.
    returns = next_return[is_high_volume]
    high = next_high[is_high_volume]
    valid_returns = returns.dropna()
    valid_high = high.dropna()

    current_data = {"percent_upper_volume": i}
    # Shapiro-Wilk needs at least 3 observations; test only the non-NaN returns.
    if len(valid_returns) > 2:
        current_data["pvalue"] = stats.shapiro(valid_returns).pvalue
    else:
        current_data["pvalue"] = np.nan
    current_data["percentage_occurence"] = len(returns) / len(df)

    current_data["mean_return"] = np.mean(valid_returns)
    current_data["std_return"] = np.std(valid_returns)

    if len(valid_high) > 0:
        sorted_high = np.sort(valid_high)
        current_data["min_high"] = sorted_high[int(LOW_HIGH_QUANTILE * len(sorted_high))]
        current_data["max_high"] = sorted_high[int(TOP_HIGH_QUANTILE * len(sorted_high))]
        current_data["mean_high"] = np.mean(valid_high)
        current_data["median_high"] = np.median(valid_high)
    else:
        # No row passed the volume filter: report NaN instead of failing on
        # the quantile lookup.
        current_data["min_high"] = np.nan
        current_data["max_high"] = np.nan
        current_data["mean_high"] = np.nan
        current_data["median_high"] = np.nan

    records.append(current_data)

data = pd.DataFrame(records)

fig = make_subplots(specs=[[{"secondary_y": True}]])
# (column, legend name, drawn on the secondary axis?)
trace_specs = [
    ("median_high", "median_high", False),
    ("max_high", "max_high", True),
    # The legend is derived from the quantile actually computed above.
    ("min_high", f"min {LOW_HIGH_QUANTILE:.0%} high", False),
]
for column, legend_name, on_secondary in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=data["percent_upper_volume"],
            y=data[column],
            text=data["percentage_occurence"],
            name=legend_name,
        ),
        secondary_y=on_secondary,
    )
fig.add_trace(
    go.Scatter(
        x=data["percent_upper_volume"],
        y=[WIN_THRESHOLD] * len(data),
        name="threshold to win",
        marker_color="black",
    ),
    secondary_y=False,
)
fig.update_layout(
    title=f"Returns when previous hour volume > i * max volume of the last 30 hours on {interesting_tickers[0]}",
    xaxis_title="i",
)
fig.update_yaxes(title_text="pct change compared to previous close", secondary_y=False)
fig.update_yaxes(title_text="same, but scale to max_high", secondary_y=True)
fig.show()
fig.write_image("next hour return.svg")

In [172]:
# Show the row of the first fold's asset `df` at the index label where `high`
# (left over from the last iteration of the volume-analysis loop) is smallest.
test_datapoints[0][interesting_tickers[0]].df.loc[high.dropna().idxmin()]

Open               6.308000e-01
High               6.586000e-01
Low                6.269000e-01
Close              6.389000e-01
Volume             7.973567e+07
num_trades         8.307800e+04
taker_base_vol     4.045306e+07
taker_quote_vol    2.599627e+07
Name: 2022-01-26 19:00:00+00:00, dtype: float32

In [155]:
# Net change of a position of `amount` for a price move of `pct_change`,
# with `fees` (expressed in percent) applied twice through the squared factor
# (presumably once on entry and once on exit — confirm).
pct_change = np.linspace(0, 0.01, 1000)
fees = 0.1
amount = 100
fee_factor = (1 - fees / 100) ** 2
delta_amount = (fee_factor * (pct_change + 1) - 1) * amount

break_even_curve = go.Scatter(x=pct_change, y=delta_amount)
fig = go.Figure(break_even_curve)
fig.show()
