In [1]:
import os
import sys
from pathlib import Path

# The session is treated as a Kaggle kernel when `kaggle_secrets` has already
# been imported into the interpreter.
IS_KAGGLE = "kaggle_secrets" in sys.modules
if IS_KAGGLE:
    repo_path = Path("../input/crypto_prediction")
else:
    # The checkout location can be overridden through the environment so the
    # notebook does not have to be edited per machine; the previous hard-coded
    # location remains the default.
    repo_path = Path(
        os.environ.get("CRYPTO_PREDICTION_REPO", "/home/matias/crypto_prediction")
    )

# Append only once so that re-running this cell does not keep growing sys.path.
if str(repo_path) not in sys.path:
    sys.path.append(str(repo_path))


In [2]:
from copy import deepcopy
from datetime import date, datetime, timedelta
from functools import partial
from importlib import reload
from pathlib import Path

import metrics
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import plotting
import tensorflow as tf
from datapoints import assets
from plotly.subplots import make_subplots
from query_datasets import get_data
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import estimator_checks
from tensorflow.keras import layers
from tensorflow.keras.activations import sigmoid, tanh
from tensorflow.keras.metrics import Accuracy, Precision, Recall
from tensorflow.keras.models import Model
from tools import dataframe, training, wandb_api
from tqdm import tqdm
from wandb.keras import WandbCallback

# Switch for Weights & Biases logging; when False the notebook falls back to a
# plain dict for `config`.
log_wandb = False
# NOTE: this rebinds `repo_path` (first assigned in the path-setup cell) to the
# parent of the current working directory. The download cache directory used by
# `create_asset` is derived from this value.
repo_path = Path().resolve().parent
# pio.renderers.default = "browser"


2022-06-10 09:18:14.602022: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-10 09:18:14.602085: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
# Start from an in-memory config; it is swapped for the W&B run config only
# when logging is switched on.
config = {}
if log_wandb:
    import wandb

    wandb_api.login()
    run = wandb.init(
        project="crypto_prediction",
        group="Adaboost LSTM",
        job_type="test",
    )
    config = wandb.config


In [4]:
# Tickers to model; one download spec in `inputs` is built per entry.
interesting_tickers = ["BTC"]


In [31]:
def compute_features(data, timesteps=8, lag=5):
    """Build lagged feature columns and the look-ahead target from ``data``.

    For every column of ``data`` and every ``i`` in ``range(timesteps)`` a
    column named ``"{col}_pct_{i}"`` is produced that holds the source column
    shifted down by ``i`` rows. Despite the ``_pct_`` infix the values are the
    raw shifted values: the original code computed ``pct_change()`` first and
    immediately overwrote it with the plain shift, so only the plain shift ever
    reached the model. The column names are kept unchanged for compatibility.

    Args:
        data: DataFrame of source series; must contain a ``"Close"`` column.
        timesteps: Number of lagged copies created per source column.
        lag: Number of rows the ``"Close"`` column is shifted upwards to form
            the label (the value ``lag`` rows ahead).

    Returns:
        ``(features, labels)``. ``features`` holds only the lagged columns,
        with +/-inf replaced by 0 and rows containing NaN dropped. ``labels``
        is ``data["Close"].shift(-lag)`` on the full index of ``data``; it is
        not trimmed to the rows of ``features`` and its last ``lag`` entries
        are NaN.
    """
    # Build every lagged column first and assemble the frame in one concat;
    # inserting columns one by one fragments the frame and makes pandas emit a
    # PerformanceWarning.
    lagged = [
        data[col].shift(i).rename(f"{col}_pct_{i}")
        for col in data.columns
        for i in range(timesteps)
    ]
    if lagged:
        features = pd.concat(lagged, axis=1)
    else:
        # timesteps == 0 (or no source columns): no feature columns at all.
        features = pd.DataFrame(index=data.index)

    labels = data["Close"].shift(-lag)

    # Infinite values are zeroed before rows with missing lags are removed.
    features = features.replace([np.inf, -np.inf], 0).dropna()
    return features, labels


def create_asset(
    ticker,
    interval,
    beginning_date,
    ending_date,
    compute_features=lambda x: x,
):
    """Download all data sources for one ticker and wrap them in a TrainAsset.

    Klines, trends, blockchain and Santiment frames are downloaded for the
    requested window, concatenated column-wise, cast to float32 and have
    infinite values replaced by 0. ``compute_features`` is then called on the
    merged frame and its result is unpacked as ``(features, labels)``.

    Args:
        ticker: Symbol passed to the kline and trend downloads and stored on
            the returned asset.
        interval: Sampling interval forwarded to every download and to the
            asset.
        beginning_date: Start of the download window.
        ending_date: End of the download window.
        compute_features: Callable taking the merged frame.
            NOTE(review): the default ``lambda x: x`` returns a single object
            while the result is unpacked into two values - callers appear to
            be expected to always pass a real feature function; confirm.

    Returns:
        An ``assets.TrainAsset`` built from the merged frame, the labels and
        the features.
    """
    # Keyword arguments shared by all four download helpers.
    window = dict(
        beginning_date=beginning_date,
        ending_date=ending_date,
        directory=repo_path / "ada_lstm" / "tmp",
    )

    klines = get_data.download_klines(ticker, interval, **window)
    trends = get_data.download_trends(ticker, interval, **window)
    # NOTE(review): the blockchain and Santiment sources are always queried
    # for "BTC", independent of `ticker` - confirm this is intended.
    blockchain_infos = get_data.download_blockchain("BTC", interval, **window)
    santiment = get_data.download_santiment("BTC", interval, **window)

    merged = pd.concat([klines, blockchain_infos, trends, santiment], axis=1)
    data = merged.astype("float32").replace(
        to_replace=[np.inf, -np.inf, float("inf"), float("inf")],
        value=0,
    )

    features, labels = compute_features(data)

    return assets.TrainAsset(
        ticker=ticker,
        df=data,
        labels=labels,
        features=features,
        interval=interval,
        compute_features=compute_features,
    )


class DataModule:
    """Build ``TrainAsset`` objects from download specs and cut them into folds.

    ``setup`` (run from ``__init__``) creates one asset per entry of ``inputs``
    and aligns its frame, labels and features on their common index.
    ``_init_train_val_data`` slides a train/validation window over the length
    of the first asset and returns one dataset pair per window position.
    """

    def __init__(
        self,
        config,
        compute_features=None,
        inputs=None,
        save_klines=True,
    ):
        """Store the settings and immediately load every asset.

        Args:
            config: Mapping read for ``"interval"`` and
                ``"train_val_test_split"``.
            compute_features: Callable forwarded to ``create_asset`` and
                ``assets.TrainAsset``.
            inputs: Iterable of keyword dicts for ``create_asset``. Each must
                contain ``"ticker"``; ``"beginning_date"`` and
                ``"ending_date"`` are read for the "is empty" message.
                ``None`` means no assets are loaded.
            save_klines: Stored on the instance; not read by this class.
        """
        self.config = config
        self.compute_features = compute_features
        self.inputs = inputs
        self.save_klines = save_klines

        self.setup()

    def setup(self):
        """Create one cleaned, index-aligned asset per input spec."""
        self.train_datapoints = []
        # `inputs` defaults to None: treat that as "nothing to load" instead of
        # failing with a TypeError when iterating.
        specs = [] if self.inputs is None else self.inputs
        for spec in specs:
            dp = create_asset(
                **spec,
                interval=self.config["interval"],
                compute_features=self.compute_features,
            )
            # Kept for compatibility with a `create_asset` that may return an
            # empty list; the version in this notebook always returns an asset.
            if dp == []:
                continue
            dp.df = dp.df.dropna()
            dp.labels = dp.labels.dropna()
            dp._features = dp._features.dropna()

            # Keep only the rows present in the frame, the labels and the
            # features at the same time.
            common_index = dp.df.index.intersection(dp.labels.index)
            common_index = common_index.intersection(dp._features.index)

            dp.df = dp.df.loc[common_index]
            dp.labels = dp.labels.loc[common_index]
            dp._features = dp._features.loc[common_index]

            train_dp = assets.TrainAsset(
                ticker=spec["ticker"],
                df=dp.df,
                labels=dp.labels,
                features=dp._features,
                interval=self.config["interval"],
                compute_features=self.compute_features,
            )
            if not train_dp.isempty:
                self.train_datapoints.append(train_dp)
            else:
                print(
                    f"{dp.ticker} is empty from {spec['beginning_date']} to {spec['ending_date']}."
                )

    def clean_datapoints(self, datapoints):
        """Hook for filtering datapoints; currently returns them unchanged."""
        return datapoints

    def concat_and_shuffle(self, features, labels):
        """Concatenate per-asset arrays and shuffle rows with one permutation.

        Args:
            features: Sequence of 2-D array-likes, one per asset.
            labels: Sequence of array-likes, one per asset, aligned row-wise
                with ``features``.

        Returns:
            ``(features, labels)`` as arrays, shuffled with the same random
            permutation so that rows stay paired.
        """
        assert len(features) == len(labels)
        _features = np.concatenate(features, axis=0)
        _labels = np.concatenate(labels, axis=0)
        assert len(_features) == len(_labels)
        p = np.random.permutation(len(_features))
        return _features[p], _labels[p]

    def nest_train_test_val_split(
        self, datapoints, offset, train_size, val_size, test_size=0
    ):
        """Cut one fold starting at row ``offset`` out of every datapoint.

        Rows ``[offset, offset + train_size)`` form the training slice and the
        following ``val_size`` rows the validation slice. The returned test
        assets cover the same rows as the validation slice; ``test_size`` is
        accepted but not used for slicing.

        Returns:
            ``((train_features, train_labels), (val_features, val_labels),
            test_datapoints)`` where the first two are shuffled arrays over all
            datapoints and the last maps ticker to a ``TrainAsset``.
        """
        train_features = []
        train_labels = []
        val_features = []
        val_labels = []
        test_datapoints = {}

        train_beginning = offset
        train_ending = train_beginning + train_size
        val_beginning = train_ending
        val_ending = val_beginning + val_size
        # The test window deliberately coincides with the validation window.
        test_beginning = val_beginning
        test_ending = val_ending

        for dp in datapoints:
            # Positional slicing throughout, matching the test slices below.
            train_features.append(dp._features.iloc[train_beginning:train_ending])
            train_labels.append(dp.labels.iloc[train_beginning:train_ending])
            val_features.append(dp._features.iloc[val_beginning:val_ending])
            val_labels.append(dp.labels.iloc[val_beginning:val_ending])

            test_datapoints[dp.ticker] = assets.TrainAsset(
                ticker=dp.ticker,
                df=dp.df.iloc[test_beginning:test_ending],
                labels=dp.labels.iloc[test_beginning:test_ending],
                features=dp._features.iloc[test_beginning:test_ending],
                interval=dp.interval,
                compute_features=dp.compute_features,
            )

        return (
            self.concat_and_shuffle(train_features, train_labels),
            self.concat_and_shuffle(val_features, val_labels),
            test_datapoints,
        )

    @staticmethod
    def _resolve_split_size(value, n_rows):
        """Turn one split entry into a row count.

        Values greater than 1 are absolute row counts; anything else is a
        fraction of ``n_rows``. Both are truncated to int.
        """
        if value > 1:
            return int(value)
        return int(n_rows * value)

    def _init_train_val_data(self, train_datapoints):
        """Generate every train/validation/test fold for the given datapoints.

        Split sizes come from ``config["train_val_test_split"]`` and are
        resolved against the length of the first datapoint's frame. The window
        advances by ``val_size + test_size`` rows per fold.

        Returns:
            ``(train_datasets, val_datasets, test_datapoints)``: three lists
            with one entry per fold.

        Raises:
            ValueError: If ``val_size + test_size`` is not positive, since the
                window could not advance.
        """
        train_datapoints = self.clean_datapoints(train_datapoints)
        n_rows = len(train_datapoints[0].df)
        split = self.config["train_val_test_split"]
        train_size = self._resolve_split_size(split[0], n_rows)
        val_size = self._resolve_split_size(split[1], n_rows)
        test_size = self._resolve_split_size(split[2], n_rows)
        print(f"train_size: {train_size}, val_size: {val_size}, test_size: {test_size}")

        step = val_size + test_size
        if step <= 0:
            # Previously surfaced as "range() arg 3 must not be zero".
            raise ValueError(
                "val_size + test_size must be positive to advance the fold "
                f"window (got val_size={val_size}, test_size={test_size})"
            )

        # At least one fold is always produced, even if the data is shorter
        # than a full window.
        max_offset = max(n_rows - (train_size + val_size + test_size), 1)
        train_datasets = []
        val_datasets = []
        test_datapoints = []
        for offset in range(0, max_offset, step):
            train_dataset, val_dataset, test_datapoint = self.nest_train_test_val_split(
                train_datapoints, offset, train_size, val_size, test_size
            )
            train_datasets.append(train_dataset)
            val_datasets.append(val_dataset)
            test_datapoints.append(test_datapoint)
        return train_datasets, val_datasets, test_datapoints


# Settings shared by the data module and the models.
config["job_type"] = run.job_type if "run" in locals() else "test"
# Entries <= 1 are fractions of the asset length, entries > 1 absolute row counts.
config["train_val_test_split"] = [0.7, 1 - 0.7, 0]
config["interval"] = "1d"
config["timesteps"] = 15
config["lag"] = 1
config["ago"] = 3000
config["batch_size"] = 64
config["learning_rate"] = 0.0003

# One download spec per ticker: from `ago` intervals before today up to a
# fixed end date.
inputs = [
    {
        "ticker": ticker,
        "beginning_date": datetime.combine(date.today(), datetime.min.time())
        - dataframe.convert_to_timedelta(config["interval"], ago=config["ago"]),
        "ending_date": datetime(
            2022, 5, 21
        ),  # datetime.combine(date.today(), datetime.min.time()),
    }
    for ticker in interesting_tickers
]

dm = DataModule(
    config,
    partial(compute_features, timesteps=config["timesteps"], lag=config["lag"]),
    inputs,
    save_klines=True,
)
train_datasets, val_datasets, test_datapoints = dm._init_train_val_data(
    dm.train_datapoints
)
# Each line reports its own collection (all three used to print
# len(train_datasets)).
print(f"Length training dataset: {len(train_datasets)}")
print(f"Length validation dataset: {len(val_datasets)}")
print(f"Length test dataset: {len(test_datapoints)}")

print(f"Shape training sample: {train_datasets[0][0].shape}")
# Despite its label, this line prints the number of feature columns per timestep.
print(f"Shape training sample: {train_datasets[0][0].shape[1] / config['timesteps']}")

print(f"Shape validation sample: {val_datasets[0][0].shape}")
config["input_size"] = train_datasets[0][0].shape[1]
# The flat feature vector is later reshaped to (timesteps, -1), so its width
# has to be an exact multiple of the number of timesteps.
assert config["input_size"] % config["timesteps"] == 0, (
    "input_size must be a multiple of timesteps"
)


  features[f"{col}_pct_{i}"] = features[col].pct_change().shift(i)


train_size: 1463, val_size: 627, test_size: 0
Length training dataset: 1
Length validation dataset: 1
Length test dataset: 1
Shape training sample: (1463, 285)
Shape training sample: 19.0
Shape validation sample: (627, 285)


In [40]:
class LSTMModel(Model):
    """Stacked-LSTM regressor operating on flat feature vectors.

    The wrapped functional model reshapes each flat input of width
    ``config["input_size"]`` to ``(config["timesteps"], -1)``, runs it through
    three LSTM layers (512, 256 and 128 units, the last one with dropout 0.3)
    and a 128-unit dense layer, and emits a single linear output. The model is
    compiled in ``__init__`` with Adam, MSE loss and the project's regression
    metrics.

    Args:
        config: Mapping providing ``"learning_rate"``, ``"input_size"`` and
            ``"timesteps"``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.model = self.build_model()

        self.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=config["learning_rate"]),
            loss="mse",
            metrics=metrics.tf_regression_metrics,
        )

    def build_model(self):
        """Assemble and return the inner functional Keras model."""
        # `shape` must be a tuple: the previous `(n)` was just a parenthesised
        # int and only worked where Keras coerces a bare int.
        inputs = layers.Input(
            shape=(self.config["input_size"],),
        )
        # Flat vector -> (timesteps, features per timestep).
        outputs = layers.Reshape(
            (
                self.config["timesteps"],
                -1,
            )
        )(inputs)
        outputs = layers.LSTM(512, activation=layers.ReLU(), return_sequences=True)(
            outputs
        )
        outputs = layers.LSTM(256, activation=layers.ReLU(), return_sequences=True)(
            outputs
        )
        # The last LSTM returns only its final state.
        outputs = layers.LSTM(128, activation=layers.ReLU(), dropout=0.3)(outputs)
        outputs = layers.Flatten()(outputs)
        outputs = layers.Dense(128, activation=layers.ReLU())(outputs)
        outputs = layers.Dense(1, activation=None)(outputs)
        return Model(inputs=inputs, outputs=outputs, name="model")

    def call(self, klines):
        """Forward ``klines`` (flat feature batch) through the inner model."""
        return self.model(klines)


def _to_batched_dataset(features, labels, batch_size):
    """Wrap arrays in a tf.data pipeline: full-buffer shuffle, then batch."""
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.shuffle(len(features), reshuffle_each_iteration=True, seed=42)
    return dataset.batch(
        batch_size,
        drop_remainder=False,
        num_parallel_calls=tf.data.AUTOTUNE,
    )


estimator = LSTMModel(config)
# Only the first fold is used for the stand-alone LSTM.
train_data = _to_batched_dataset(
    train_datasets[0][0], train_datasets[0][1], config["batch_size"]
)
val_data = _to_batched_dataset(
    val_datasets[0][0], val_datasets[0][1], config["batch_size"]
)
# estimator.fit(x=train_data, validation_data=val_data, epochs=100)

In [33]:
class KerasRegressor:
    """Minimal scikit-learn style estimator wrapping ``LSTMModel``.

    Implements just enough of the estimator protocol (``get_params``,
    ``set_params``, ``fit``, ``predict``) to be used as the base learner of a
    scikit-learn meta-estimator.

    Args:
        config: Mapping passed to ``LSTMModel``; ``"batch_size"`` is also read
            in ``fit``.
        path: Stored on the instance; not read by this class.
        dm: Stored on the instance; not read by this class.
    """

    def __init__(self, config, path, dm):
        self.config = config
        self.path = path
        self.dm = dm

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict (``deep`` is ignored)."""
        return {"config": self.config, "path": self.path, "dm": self.dm}

    def set_params(self, **kwargs):
        """Update any of ``config``, ``path``, ``dm``; other keys are ignored.

        Returns:
            ``self``, following the scikit-learn convention so calls can be
            chained.
        """
        for name in ("config", "path", "dm"):
            if name in kwargs:
                setattr(self, name, kwargs[name])
        return self

    def fit(self, X, y, sample_weights=None, sample_weight=None):
        """Train a fresh ``LSTMModel`` on ``(X, y)`` for 10 epochs.

        Args:
            X: Feature array, one row per sample.
            y: Targets aligned with ``X``.
            sample_weights: Optional per-sample weights (original keyword).
            sample_weight: Alias using scikit-learn's standard keyword; takes
                precedence over ``sample_weights`` when given.

        Returns:
            ``self``.
        """
        if sample_weight is not None:
            sample_weights = sample_weight

        # A new network is built on every call, so repeated fits do not
        # continue from earlier weights.
        self.model = LSTMModel(self.config)

        if sample_weights is None:
            sample_weights = np.ones(len(X))
        train_data = (
            tf.data.Dataset.from_tensor_slices((X, y, sample_weights))
            .shuffle(len(X), reshuffle_each_iteration=True, seed=42)
            .batch(
                self.config["batch_size"],
                drop_remainder=False,
                num_parallel_calls=tf.data.AUTOTUNE,
            )
        )

        self.model.fit(
            train_data,
            epochs=10,
            verbose=False,
        )
        return self

    def predict(self, X):
        """Return predictions for ``X`` as a 1-D array of length ``len(X)``.

        ``reshape(-1)`` is used instead of ``squeeze`` so that a single-sample
        batch still yields shape ``(1,)`` rather than a 0-d array.
        """
        return np.asarray(self.model(X)).reshape(-1)

# Base learner handed to the booster. Its `path` is the W&B run directory when
# a run exists, otherwise the current directory.
base_learner = KerasRegressor(
    config,
    path=Path(run.dir) if "run" in locals() else Path("."),
    dm=dm,
)
# NOTE(review): recent scikit-learn releases rename `base_estimator` to
# `estimator`; the keyword is kept as-is to match the installed version -
# confirm before upgrading.
ada_regressor = AdaBoostRegressor(
    base_estimator=base_learner,
    n_estimators=5,
    random_state=42,
    loss="square",
    learning_rate=0.2,
)
# Boosting is fitted on the first fold only.
ada_regressor.fit(train_datasets[0][0], train_datasets[0][1])


  return super().fit(X, y, sample_weight)


AdaBoostRegressor(base_estimator=<__main__.KerasRegressor object at 0x7f19374e7610>,
                  learning_rate=0.2, loss='square', n_estimators=5,
                  random_state=42)

In [42]:
# Merge the per-fold test assets into one asset per ticker. The first
# occurrence of a ticker is deep-copied so the fold objects stay untouched;
# later folds are appended to that copy.
base_datapoints = {}
for test_datapoint in test_datapoints:
    for ticker, dp in test_datapoint.items():
        if ticker not in base_datapoints:
            base_datapoints[ticker] = deepcopy(dp)
            continue
        merged = base_datapoints[ticker]
        merged.df = pd.concat([merged.df, dp.df])
        merged.labels = pd.concat([merged.labels, dp.labels])
        merged._features = pd.concat((merged._features, dp._features))


# Predict with the fitted AdaBoost ensemble. The stand-alone `estimator` is
# never trained in this notebook (its `fit` call is commented out), so using it
# here would plot the output of randomly initialised weights.
# NOTE(review): predictions are now a 1-D array of length n_samples - confirm
# that `plotting.regression_plot` accepts this shape.
for ticker, dp in base_datapoints.items():
    # Plain arrays are passed because the ensemble was fitted on arrays.
    base_datapoints[ticker].predictions = ada_regressor.predict(
        dp._features.to_numpy()
    )




In [43]:
# Render the project's regression plot for the merged per-ticker assets,
# which carry the `predictions` attribute set above.
fig = plotting.regression_plot(base_datapoints)
fig.show()