In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.layers import (MultiHeadAttention, Dense, InputLayer, LayerNormalization, TimeDistributed, Layer, Dropout,
                          Embedding)
from keras.models import Model, Sequential
from transformers import GPT2Tokenizer
from AttentionAutoEncoder import AttentionEncoder

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from stock_indicators import indicators
from stock_indicators.indicators.common.quote import Quote

In [13]:
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [14]:
# tokenizer.special_tokens_map

In [2]:
# inputs = InputLayer()
# positional_embedding = PositionalEncoding()(inputs)
#
# x = residual = inputs + positional_embedding
#
# x = MultiHeadAttention(num_heads=2, key_dim=2)(x, x)
# x += residual
# x = residual = LayerNormalization(epsilon=1e-6)(x)
# x = TimeDistributed(Dense(activation='relu'))(x)
# x += residual
# x = LayerNormalization(epsilon=1e-6)(x)
#
# x = MultiHeadAttention(num_heads=2, key_dim=2)(x, x)
# x += residual
# x = residual = LayerNormalization(epsilon=1e-6)(x)
# x = TimeDistributed(Dense(activation='relu'))(x)
# x += residual
# x = LayerNormalization(epsilon=1e-6)(x)
#
# outputs = Dense(activation='relu')(x)
#
# model = Model(inputs=inputs, outputs=outputs, name="Self-attention Stock Price Encoder")

# Build the project-local AttentionEncoder (imported from AttentionAutoEncoder) and compile it.
# NOTE(review): the seven positional arguments are opaque from this notebook; their meaning is
# defined by AttentionEncoder's constructor. Consider passing them by keyword.
# The first argument, 5 * 30 * 24 * 60, evaluates to 216000 — presumably a sequence length
# expressed in minutes (5 months of 30 days); TODO confirm against the class definition.
autoencoder = AttentionEncoder(5 * 30 * 24 * 60, 12, 6, 2, 16, 2, 0.1)
# Compile with the Adam optimiser and mean-absolute-error loss.
autoencoder.compile(optimizer='adam', loss='mae')

In [13]:
def vectorized_stride(array, length, stride):
    """Extract fixed-length sliding windows from ``array`` along its first axis.

    Window ``k`` covers rows ``k * stride`` through ``k * stride + length - 1``.
    Every window that fits entirely inside ``array`` is returned, including the
    one that ends exactly on the last row.

    Args:
        array: NumPy array to slice along axis 0.
        length: Number of consecutive rows in each window; must be positive.
        stride: Offset between the starts of consecutive windows; must be positive.

    Returns:
        Array of shape ``(n_windows, length, *array.shape[1:])`` produced by
        integer-array indexing. ``n_windows`` is 0 when ``array`` has fewer
        than ``length`` rows.

    Raises:
        ValueError: If ``length`` or ``stride`` is not positive.
    """
    if length <= 0 or stride <= 0:
        raise ValueError("length and stride must be positive")

    # The last valid start index is len(array) - length, so the exclusive stop
    # must be one past it. Stopping at len(array) - length would drop the
    # window that ends on the final row.
    starts = np.arange(0, len(array) - length + 1, stride)
    offsets = np.arange(length)

    # Broadcast a column of window starts against a row of in-window offsets
    # to get an (n_windows, length) matrix of row indices.
    sub_windows = starts[:, np.newaxis] + offsets[np.newaxis, :]

    return array[sub_windows]


# arr = np.array([[i] * 10 for i in range(10)])
# vectorized_stride(arr, length=4, stride=4)


def add_features(df):
    """Unfinished stub for adding indicator features to ``df``.

    Builds one ``Quote`` per row from the 'Timestamp', 'Open', 'High', 'Low',
    'Close' and 'Volume_(Currency)' columns, then returns the ``...``
    placeholder: the feature computation itself is not implemented yet.

    NOTE(review): this reads ``df['Timestamp']`` as a column — confirm the
    caller does not pass a frame where the timestamp is the index.
    """
    quote_columns = ('Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume_(Currency)')

    quotes_list = []
    for date, open_, high, low, close, volume in zip(*(df[name] for name in quote_columns)):
        quotes_list.append(Quote(date, open_, high, low, close, volume))
    ...
    return ...


def make_standardised_segments(df, segment_len, segment_amp_range, stride):
    """Cut ``df`` into strided windows and min-max scale each window on its own.

    Args:
        df: DataFrame whose rows are windowed in order.
        segment_len: Number of rows per window.
        segment_amp_range: ``(min, max)`` passed to ``MinMaxScaler`` as ``feature_range``.
        stride: Row offset between the starts of consecutive windows.

    Returns:
        List of DataFrames, one per window, carrying ``df``'s column labels.
    """
    windows = vectorized_stride(df.to_numpy(), length=segment_len, stride=stride)
    scaler = MinMaxScaler(feature_range=segment_amp_range)

    scaled_segments = []
    for window in windows:
        # fit_transform refits the scaler on each call, so every window is
        # scaled from its own values only.
        scaled = scaler.fit_transform(window)
        scaled_segments.append(pd.DataFrame(scaled, columns=df.columns))
    return scaled_segments


# def tokenize(df, amplitude_range, resolution):
#     # quantities = np.linspace(0, window_height, resolution)
#     interval = amplitude_range / (resolution - 1)
#
#     df = interval * np.round(df / interval)  # nan ok?
#     # df = df.fillna('<NULL>')
#     # df[df.isna().any(axis=1)] = '<NULL>'
#     df = df.astype(str)  # verify this works
#
#     return df

def make_curriculum(df, window_length, window_range, stride):
    """Build one set of standardised segments per starting offset.

    For each offset ``0 .. window_length - 1`` the frame is sliced from that
    row onward and segmented with ``make_standardised_segments``.

    Returns:
        List of length ``window_length``; element ``i`` is the list of
        segments obtained from ``df[i:]``.
    """
    curriculum = []
    for offset in range(window_length):
        shifted = df[offset:]
        curriculum.append(
            make_standardised_segments(
                shifted,
                segment_len=window_length,
                segment_amp_range=window_range,
                stride=stride,
            )
        )
    return curriculum


def ts_train_test_split(df, test_size, gap_size):
    """Split an ordered sequence into a leading and a trailing part with a gap between.

    Args:
        df: Sliceable, sized object (e.g. a DataFrame) in chronological order.
        test_size: Fraction of the non-gap rows assigned to the trailing part.
        gap_size: Number of rows skipped between the two parts; truncated to int.

    Returns:
        Tuple ``(train, test)`` where ``train`` is the first
        ``int((1 - test_size) * (len(df) - gap))`` rows and ``test`` is
        everything after the gap that follows them.
    """
    gap = int(gap_size)
    usable_rows = len(df) - gap
    split_at = int((1 - test_size) * usable_rows)

    head = df[:split_at]
    tail = df[split_at + gap:]
    return head, tail


def resample(df, freq):
    """Down-sample OHLC + currency-volume bars to a coarser frequency.

    Args:
        df: DataFrame with a datetime-like index and the columns 'Open',
            'High', 'Low', 'Close' and 'Volume_(Currency)'.
        freq: Pandas offset alias for the target bar size (e.g. ``'15min'``).

    Returns:
        DataFrame with one row per ``freq`` bin: first open, highest high,
        lowest low, last close and summed currency volume.
    """
    # 'Volume_(BTC)', 'Weighted_Price' and 'Missing' are intentionally not aggregated.
    ohlcv_rules = {
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume_(Currency)': 'sum',
    }
    bins = df.resample(freq)
    return bins.agg(ohlcv_rules)


array([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]],

       [[4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
        [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
        [6, 6, 6, 6, 6, 6, 6, 6, 6, 6],
        [7, 7, 7, 7, 7, 7, 7, 7, 7, 7]]])

In [62]:


# Load the Bitstamp BTC/USD CSV, using its 'Timestamp' column as the index.
raw_data = pd.read_csv('bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv', index_col='Timestamp')  # min by min
# Interpret the index values as seconds since the Unix epoch.
raw_data.index = pd.to_datetime(raw_data.index, unit='s')

# data['Missing'] = data.isna().any(axis=1).astype(int)
# Fill missing values by interpolating against the index values (here: timestamps).
data = raw_data.interpolate(method='index')
# todo: show how much is interpolated
# Keep only the OHLC and currency-volume columns.
data = data[['Open', 'High', 'Low', 'Close', 'Volume_(Currency)']]

# Rows per segment; evaluates to 900.
# NOTE(review): with 15-minute bars, 5 * 30 * 24 * 4 (= 14400) would be five 30-day months —
# confirm whether "/ 4" is intended.
window_length = int(5 * 30 * 24 / 4)
# stride = int(10 / 25 * window_length)
stride = 16  # row offset between consecutive segment starts
window_range = (-1, 1)  # target (min, max) for per-segment min-max scaling

# Chronological split into train / validation / test, with a gap of skipped rows between parts.
# NOTE(review): the split runs on the un-resampled frame, so gap_size here counts original rows,
# while window_length and stride are later applied to resampled rows — confirm this is intended.
train_data, val_data = ts_train_test_split(data, test_size=0.3, gap_size=window_length)
# NOTE(review): test_size=0.15 * 0.3 gives the test part 4.5% of the held-out rows — confirm
# this is the intended proportion.
val_data, test_data = ts_train_test_split(val_data, test_size=0.15 * 0.3, gap_size=stride)

# Aggregate each split into 15-minute bars.
freq = '15min'
train_data = resample(train_data, freq)
val_data = resample(val_data, freq)
test_data = resample(test_data, freq)
# train_data = add_features(train_data)

# curriculum = []
# for i in range(window_size):
#     curriculum.append(tokenize(quantise(normalise(segment(data, offset=i))), mask))
# Replace each split with its list of independently scaled segments.
# Note: from here on train_data / val_data / test_data are lists of DataFrames, not DataFrames.
train_data = make_standardised_segments(train_data, window_length, window_range, stride)
val_data = make_standardised_segments(val_data, window_length, window_range, stride)
test_data = make_standardised_segments(test_data, window_length, window_range, stride)

In [None]:

# Train the autoencoder to reconstruct its input: the same object is passed as inputs and targets.
# NOTE(review): if the data-preparation cell ran last, train_data is a Python list of DataFrames;
# confirm AttentionEncoder.fit accepts that, or stack the segments into one array first.
autoencoder.fit(train_data, train_data)

In [6]:
import binascii

# Peek at the beginning of a raw block file (presumably the first Bitcoin Core block file).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
with open(r"D:\Bitcoin\blocks\blk00000.dat", 'rb') as f:
    # readline() on a binary file reads up to and including the first b'\n' byte,
    # so this is an arbitrary-length prefix of the file rather than a meaningful record.
    data = f.readline()
    print(data)
    # Hex-encode the bytes for easier inspection.
    # NOTE(review): this rebinds the notebook-global `data`, which another cell uses for the
    # price DataFrame — consider a distinct name.
    data = binascii.hexlify(data)
    print(data)

b"\xf9\xbe\xb4\xd9\x1d\x01\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00;\xa3\xed\xfdz{\x12\xb2z\xc7,>gv\x8fa\x7f\xc8\x1b\xc3\x88\x8aQ2:\x9f\xb8\xaaK\x1e^J)\xab_I\xff\xff\x00\x1d\x1d\xac+|\x01\x01\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xffM\x04\xff\xff\x00\x1d\x01\x04EThe Times 03/Jan/2009 Chancellor on brink of second bailout for banks\xff\xff\xff\xff\x01\x00\xf2\x05*\x01\x00\x00\x00CA\x04g\x8a\xfd\xb0\xfeUH'\x19g\xf1\xa6q0\xb7\x10\\\xd6\xa8(\xe09\t\xa6yb\xe0\xea\x1fa\xde\xb6I\xf6\xbc?L\xef8\xc4\xf3U\x04\xe5\x1e\xc1\x12\xde\\8M\xf7\xba\x0b\x8dW\x8aLp+k\xf1\x1d_\xac\x00\x00\x00\x00\xf9\xbe\xb4\xd9\xd7\x00\x00\x00\x01\x00\x00\x00o\xe2\x8c\n"
b'f9beb4d91d0100000100000000000000000000000000000000000000000000000000000000000000000000003ba3edfd7a7b12b27ac72c3e67768f617fc81b

In [None]:
# Display the current value of `data` (whichever cell most recently bound it).
data