In [11]:
from copy import deepcopy
from datetime import date, datetime
from functools import partial
from importlib import reload
from pathlib import Path

import metrics
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotting
from datapoints import assets
from plotly.subplots import make_subplots
from query_datasets import get_data
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from tools import dataframe, training, wandb_api

# Toggle Weights & Biases tracking; when False, a plain dict is used as `config`.
log_wandb = False
# Parent of the resolved current working directory; used to build the klines
# download directory in `create_asset`.
repo_path = Path().resolve().parent
# pio.renderers.default = "browser"


In [2]:
if not log_wandb:
    # Tracking disabled: a plain dict stands in for `wandb.config`.
    config = {}

else:
    # Imported lazily so the notebook runs without wandb when tracking is off.
    import wandb

    wandb_api.login()
    run = wandb.init(
        project="crypto_prediction",
        group="Adaboost LSTM",
        job_type="test",
    )
    config = wandb.config


In [38]:
# Tickers to download and evaluate; one asset is built per entry.
# Uncomment entries to include more tickers.
interesting_tickers = [
    # "BTC",
    # "ETH",
    "XRP"
]


In [39]:
def compute_features(data, timesteps=8, lag=5):
    """Build lagged percentage-change features and up/down labels.

    For every column of ``data`` and every lag ``i`` in ``0..9`` plus every
    fifth lag from 10 up to (excluding) ``timesteps``, a column named
    ``"{col}_{i}"`` holding the column's percentage change shifted by ``i``
    rows is appended to a copy of ``data``.

    Args:
        data: DataFrame of numeric columns; must contain a ``"Close"`` column.
        timesteps: Exclusive upper bound for the sparse (step 5) lags >= 10.
        lag: Number of rows ahead used to build the label.

    Returns:
        ``(features, labels)`` where

        * ``features`` has infinities replaced by 0, rows containing NaN
          dropped, and the ``"Close"`` and ``"Close_0"`` columns removed;
        * ``labels`` is a boolean Series, ``True`` where ``Close`` ``lag`` rows
          ahead is greater than the current ``Close``.  Rows whose future
          close is unavailable (e.g. the last ``lag`` rows) are omitted
          instead of being labelled ``False``.

        The two objects are not index-aligned with each other; callers are
        expected to intersect their indexes.
    """
    shifts = list(range(0, 10)) + list(range(10, timesteps, 5))

    # The percentage change does not depend on the lag: compute it once.
    returns = data.pct_change()

    # Collect all lagged columns first and concatenate once; inserting them
    # one at a time fragments the frame and triggers pandas'
    # PerformanceWarning ("DataFrame is highly fragmented").
    lagged = {
        f"{col}_{i}": returns[col].shift(i) for i in shifts for col in data.columns
    }
    features = pd.concat([data, pd.DataFrame(lagged, index=data.index)], axis=1)

    # `NaN > x` evaluates to False, so rows without a future close would be
    # silently labelled "down"; drop them from the labels instead.
    future_close = features["Close"].shift(-lag)
    labels = (future_close > features["Close"])[future_close.notna()]

    features = features.replace(to_replace=[np.inf, -np.inf], value=0).dropna()
    features = features.drop(labels=["Close", "Close_0"], axis=1)
    return features, labels


def create_asset(
    ticker,
    interval,
    beginning_date,
    ending_date,
    compute_features=lambda x: x,
):
    """Download klines for one ticker and wrap them in a ``TrainAsset``.

    Args:
        ticker: Symbol forwarded to ``get_data.download_klines``.
        interval: Kline interval forwarded to the downloader and the asset.
        beginning_date: Start of the requested period.
        ending_date: End of the requested period.
        compute_features: Callable applied to the cleaned float32 frame. Its
            result is unpacked as ``(features, labels)``.
            NOTE(review): the identity default returns a single DataFrame,
            which does not fit that unpacking; callers should always pass a
            real feature function.

    Returns:
        An ``assets.TrainAsset`` built from the cleaned data, the computed
        features and labels, and the feature function itself.
    """
    download_dir = repo_path / "baselines" / "tmp"
    raw_klines = get_data.download_klines(
        ticker,
        interval,
        beginning_date=beginning_date,
        ending_date=ending_date,
        directory=download_dir,
    )

    # Cast to float32, then neutralise +/- infinity so later steps never see it.
    data = raw_klines.astype("float32").replace(
        to_replace=[np.inf, -np.inf],
        value=0,
    )

    features, labels = compute_features(data)

    asset_kwargs = dict(
        ticker=ticker,
        df=data,
        labels=labels,
        features=features,
        interval=interval,
        compute_features=compute_features,
    )
    return assets.TrainAsset(**asset_kwargs)


class DataModule:
    """Builds per-ticker training assets and rolling train/val/test splits.

    On construction, every entry of ``inputs`` is turned into a
    ``assets.TrainAsset`` (see ``setup``).  ``_init_train_val_data`` then cuts
    the assets into consecutive windows and returns shuffled numpy arrays for
    training/validation plus per-ticker assets for testing.

    Args:
        config: Mapping read with ``config["interval"]`` and
            ``config["train_val_test_split"]`` (three entries: train, val, test).
        compute_features: Feature function forwarded to ``create_asset`` and
            ``assets.TrainAsset``.  NOTE(review): the ``None`` default is
            forwarded as-is, so a real callable is needed whenever ``inputs``
            is non-empty.
        inputs: Iterable of dicts expanded as keyword arguments of
            ``create_asset``; ``setup`` also reads their ``"ticker"``,
            ``"beginning_date"`` and ``"ending_date"`` keys.  ``None`` is
            treated as "no inputs".
        save_klines: Stored on the instance; not read anywhere in this class.
    """

    def __init__(
        self,
        config,
        compute_features=None,
        inputs=None,
        save_klines=True,
    ):
        self.config = config
        self.compute_features = compute_features
        self.inputs = inputs
        self.save_klines = save_klines

        self.setup()

    def setup(self):
        """Create one aligned ``TrainAsset`` per input and store the non-empty ones.

        Populates ``self.train_datapoints``.  Inputs whose aligned asset is
        empty are reported with a printed message and skipped.
        """
        self.train_datapoints = []
        # `inputs` defaults to None; iterate over nothing instead of crashing.
        for spec in self.inputs or []:
            dp = create_asset(
                **spec,
                interval=self.config["interval"],
                compute_features=self.compute_features,
            )
            if dp == []:
                continue

            dp.df = dp.df.dropna()
            dp.labels = dp.labels.dropna()
            dp._features = dp._features.dropna()

            # Keep only the rows present in the raw data, the labels and the
            # features so the three objects are index-aligned.
            common_index = dp.df.index.intersection(dp.labels.index)
            common_index = common_index.intersection(dp._features.index)

            dp.df = dp.df.loc[common_index]
            dp.labels = dp.labels.loc[common_index]
            dp._features = dp._features.loc[common_index]

            train_dp = assets.TrainAsset(
                ticker=spec["ticker"],
                df=dp.df,
                labels=dp.labels,
                features=dp._features,
                interval=self.config["interval"],
                compute_features=self.compute_features,
            )
            if not train_dp.isempty:
                self.train_datapoints.append(train_dp)
            else:
                print(
                    f"{dp.ticker} is empty from {spec['beginning_date']} to {spec['ending_date']}."
                )

    def clean_datapoints(self, datapoints):
        """Hook for filtering datapoints; currently returns them unchanged."""
        return datapoints

    def concat_and_shuffle(self, features, labels):
        """Concatenate per-asset chunks and shuffle rows with one shared permutation.

        Args:
            features: Sequence of array-likes, one per asset.
            labels: Sequence of array-likes, one per asset, matching ``features``.

        Returns:
            ``(features, labels)`` numpy arrays, shuffled along axis 0 with the
            same permutation so rows stay paired.  The permutation comes from
            the global numpy random state (unseeded here).
        """
        assert len(features) == len(labels)
        _features = np.concatenate(features, axis=0)
        _labels = np.concatenate(labels, axis=0)
        assert len(_features) == len(_labels)
        p = np.random.permutation(len(_features))
        return _features[p], _labels[p]

    def nest_train_test_val_split(
        self, datapoints, offset, train_size, val_size, test_size=0
    ):
        """Cut one train/val/test window out of every datapoint.

        The training window is ``[offset, offset + train_size)`` and the
        validation window is the ``val_size`` rows that follow it (positional
        row offsets).

        NOTE: the test window currently reuses the validation window;
        ``test_size`` is accepted but not used for slicing.

        Returns:
            ``(train, val, test_datapoints)`` where ``train`` and ``val`` are
            ``(features, labels)`` pairs from ``concat_and_shuffle`` and
            ``test_datapoints`` maps each ticker to a ``TrainAsset`` restricted
            to the test window.
        """
        train_window = slice(offset, offset + train_size)
        val_window = slice(train_window.stop, train_window.stop + val_size)
        # A dedicated test window would be
        # slice(val_window.stop, val_window.stop + test_size).
        test_window = val_window

        train_features = []
        train_labels = []
        val_features = []
        val_labels = []
        test_datapoints = {}
        for dp in datapoints:
            # Positional slicing throughout, consistent with the test assets below.
            train_features.append(dp._features.iloc[train_window])
            train_labels.append(dp.labels.iloc[train_window])
            val_features.append(dp._features.iloc[val_window])
            val_labels.append(dp.labels.iloc[val_window])

            test_datapoints[dp.ticker] = assets.TrainAsset(
                ticker=dp.ticker,
                df=dp.df.iloc[test_window],
                labels=dp.labels.iloc[test_window],
                features=dp._features.iloc[test_window],
                interval=dp.interval,
                compute_features=dp.compute_features,
            )

        return (
            self.concat_and_shuffle(train_features, train_labels),
            self.concat_and_shuffle(val_features, val_labels),
            test_datapoints,
        )

    @staticmethod
    def _resolve_split_size(spec, total):
        """Turn one split entry into a row count: >1 is absolute, else a fraction of ``total``."""
        if spec > 1:
            return int(spec)
        return int(total * spec)

    def _init_train_val_data(self, train_datapoints):
        """Produce the list of rolling train/val/test splits.

        Split sizes come from ``config["train_val_test_split"]``: entries
        greater than 1 are absolute row counts, other entries are fractions of
        the first datapoint's length.  Windows start at offset 0 and advance
        by ``val_size + test_size`` rows.

        Returns:
            ``(train_datasets, val_datasets, test_datapoints)``, three lists
            with one entry per window (see ``nest_train_test_val_split``).

        Raises:
            ValueError: If ``val_size + test_size`` is not positive, since the
                windows could not advance.
        """
        train_datapoints = self.clean_datapoints(train_datapoints)
        total = len(train_datapoints[0].df)
        split = self.config["train_val_test_split"]
        train_size = self._resolve_split_size(split[0], total)
        val_size = self._resolve_split_size(split[1], total)
        test_size = self._resolve_split_size(split[2], total)
        print(f"train_size: {train_size}, val_size: {val_size}, test_size: {test_size}")

        step = val_size + test_size
        if step <= 0:
            raise ValueError(
                "val_size + test_size must be positive to advance the split "
                f"window (got val_size={val_size}, test_size={test_size})."
            )

        # Always produce at least the window starting at offset 0.
        max_offset = max(total - (train_size + val_size + test_size), 1)
        train_datasets = []
        val_datasets = []
        test_datapoints = []
        for offset in range(0, max_offset, step):
            train_dataset, val_dataset, test_datapoint = self.nest_train_test_val_split(
                train_datapoints, offset, train_size, val_size, test_size
            )
            train_datasets.append(train_dataset)
            val_datasets.append(val_dataset)
            test_datapoints.append(test_datapoint)
        return train_datasets, val_datasets, test_datapoints


# Run configuration. `run` only exists when the wandb cell created it.
config["job_type"] = run.job_type if "run" in locals() else "test"
# Entries > 1 are absolute row counts, otherwise fractions (see DataModule).
config["train_val_test_split"] = [0.5, 1 - 0.5, 0]
config["interval"] = "1d"
config["timesteps"] = 200
config["lag"] = 1
config["ago"] = 3000


# One download request per ticker: from `ago` intervals before today's
# midnight up to a fixed end date.
inputs = [
    {
        "ticker": ticker,
        "beginning_date": datetime.combine(date.today(), datetime.min.time())
        - dataframe.convert_to_timedelta(config["interval"], ago=config["ago"]),
        "ending_date": datetime(
            2022, 5, 21
        ),  # datetime.combine(date.today(), datetime.min.time()),
    }
    for ticker in interesting_tickers
]

dm = DataModule(
    config,
    partial(compute_features, timesteps=config["timesteps"], lag=config["lag"]),
    inputs,
    save_klines=True,
)
train_datasets, val_datasets, test_datapoints = dm._init_train_val_data(
    dm.train_datapoints
)
# Report each collection's own length (previously all three lines printed
# len(train_datasets)).
print(f"Length training dataset: {len(train_datasets)}")
print(f"Length validation dataset: {len(val_datasets)}")
print(f"Length test dataset: {len(test_datapoints)}")

print(f"Shape training sample: {train_datasets[0][0].shape}")

print(f"Shape validation sample: {val_datasets[0][0].shape}")
# Number of feature columns seen by the estimator.
config["input_size"] = train_datasets[0][0].shape[1]


train_size: 729, val_size: 729, test_size: 0
Length training dataset: 1
Length validation dataset: 1
Length test dataset: 1
Shape training sample: (729, 243)
Shape validation sample: (729, 243)



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider using pd.concat instead.  To get a de-fragmented frame, use `newframe = frame.copy()`



In [40]:
# Fit one SVC on the first train fold only; probability=True enables
# predict_proba.
estimator = SVC(probability=True)
X_train, y_train = train_datasets[0]
estimator.fit(X_train, y_train)

# Fail loudly instead of leaving `base_datapoints` undefined (or stale from an
# earlier kernel run) when there is nothing to evaluate.
if not test_datapoints:
    raise ValueError("test_datapoints is empty; there is nothing to evaluate.")

# Merge the per-fold test assets into one asset per ticker. The first fold is
# deep-copied so the originals in `test_datapoints` are left untouched.
first_fold, later_folds = test_datapoints[0], test_datapoints[1:]
base_datapoints = {ticker: deepcopy(dp) for ticker, dp in first_fold.items()}
if later_folds:
    for ticker, merged in base_datapoints.items():
        pieces = [fold[ticker] for fold in later_folds]
        # One concat per attribute instead of one per fold.
        merged.df = pd.concat([merged.df] + [piece.df for piece in pieces])
        merged.labels = pd.concat([merged.labels] + [piece.labels for piece in pieces])
        merged._features = pd.concat(
            [merged._features] + [piece._features for piece in pieces]
        )


for ticker, dp in base_datapoints.items():
    dp.predictions = estimator.predict(dp.features)
    # Column 1 of predict_proba is the probability of the second class.
    dp.probabilities = np.squeeze(estimator.predict_proba(dp.features)[:, 1])

    # NOTE(review): predictions are passed first and labels second; confirm
    # this matches the argument order metrics.classification_metrics expects.
    _metrics = metrics.classification_metrics(dp.predictions, dp.labels)
    dp.metrics = _metrics

In [47]:
# Plot the merged per-ticker assets (which now carry predictions,
# probabilities and metrics) for the configured interval and label lag.
fig = plotting.classification_plot(
    base_datapoints,
    config["interval"],
    config["lag"],
    # threshold=0.1
)

fig.show()
# Cell output: the metrics stored on the first selected ticker's asset.
base_datapoints[interesting_tickers[0]].metrics

{'precision': 0.44554455445544555,
 'recall': 0.12162162162162163,
 'accuracy': 0.4773662551440329}

In [48]:
# Shape of the fitted SVC's support-vector matrix: (n_support_vectors, n_features).
estimator.support_vectors_.shape

(691, 243)