In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Flatten, Dense
from sklearn.ensemble import RandomForestClassifier
import statistics as stats

In [3]:
# Load the raw Bitstamp price CSV into memory. The path is relative to the
# notebook's working directory, so the kernel must be started from this folder.
data = pd.read_csv('../../raw_data/bitstampUSD.csv')

In [4]:
# Preview only: the result of dropna() is displayed but NOT assigned, so `data`
# still contains the rows with missing values. This is relied upon downstream:
# preprocessing_data() slices `data` by absolute row position and forward-fills.
data.dropna()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.000000,4.390000
478,1325346600,4.39,4.39,4.39,4.39,48.000000,210.720000,4.390000
547,1325350740,4.50,4.57,4.50,4.57,37.862297,171.380338,4.526411
548,1325350800,4.58,4.58,4.58,4.58,9.000000,41.220000,4.580000
1224,1325391360,4.58,4.58,4.58,4.58,1.502000,6.879160,4.580000
...,...,...,...,...,...,...,...,...
4727772,1609372560,28801.47,28829.42,28785.64,28829.42,0.965221,27804.572129,28806.429798
4727773,1609372620,28829.42,28863.90,28829.42,28857.06,2.368831,68332.350629,28846.441863
4727774,1609372680,28850.49,28900.52,28850.49,28882.82,2.466590,71232.784464,28879.056266
4727775,1609372740,28910.54,28911.52,28867.60,28881.30,7.332773,211870.912660,28893.695831


In [5]:
def preprocessing_data(data, shift_size, h=1, start=2798176, stop=4727776):
    """Build lagged open-price features and a binary up/down target.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame with at least a ``Timestamp`` column (Unix seconds) and an
        ``Open`` column.
    shift_size : int
        Number of lag columns to create: ``t-0`` (the current open) through
        ``t-(shift_size - 1)``.
    h : int, default 1
        Forecast horizon in rows. The target compares ``Open`` at row ``t + h``
        with ``Open`` at row ``t``.
    start, stop : int or None
        Positional row bounds of the slice of ``data`` to use. The defaults
        reproduce the window that was previously hard-coded; pass ``None`` to
        leave a bound open.

    Returns
    -------
    X : pandas.DataFrame
        Lag features ``t-0 .. t-(shift_size - 1)``, indexed by timestamp.
    y : pandas.Series
        ``1.0`` where the open price rises over the next ``h`` rows, else ``0.0``.
    data_shifted : pandas.DataFrame
        The full intermediate frame (price, difference, target and lags) after
        rows containing NaN were dropped.
    """
    data_pp = data.iloc[start:stop].copy()
    data_pp['Timestamp'] = pd.to_datetime(data_pp['Timestamp'], unit='s', origin='unix')
    # `.ffill()` replaces the deprecated `fillna(method='ffill')`.
    data_pp = data_pp[['Open', 'Timestamp']].set_index('Timestamp').ffill()

    # diff(h) is Open[t] - Open[t-h]; shifting it back by h rows turns it into
    # the *future* change Open[t+h] - Open[t], which is the prediction target.
    data_pp['diff_Open'] = data_pp['Open'].diff(h)
    data_pp[f"t+{h}"] = data_pp['diff_Open'].shift(-h)

    for i in range(shift_size):
        data_pp[f't-{i}'] = data_pp['Open'].shift(i)

    # Drops the leading rows without a full lag history and the trailing rows
    # without a known future value.
    data_shifted = data_pp.dropna()
    X = data_shifted.drop(columns=['Open', 'diff_Open', f"t+{h}"])
    # Strictly positive change -> 1.0; flat or negative change -> 0.0.
    y = (data_shifted[f"t+{h}"] > 0).astype(float)
    return X, y, data_shifted

In [6]:
# Sanity check: build 5 lag features (t-0 .. t-4) with a 1-row horizon and
# display the returned (X, y, data_shifted) tuple.
preprocessing_data(data, 5, h=1)

(                          t-0       t-1       t-2       t-3       t-4
 Timestamp                                                            
 2017-05-01 00:04:00   1351.25   1350.11   1349.49   1352.41   1348.88
 2017-05-01 00:05:00   1351.24   1351.25   1350.11   1349.49   1352.41
 2017-05-01 00:06:00   1349.47   1351.24   1351.25   1350.11   1349.49
 2017-05-01 00:07:00   1351.24   1349.47   1351.24   1351.25   1350.11
 2017-05-01 00:08:00   1351.24   1351.24   1349.47   1351.24   1351.25
 ...                       ...       ...       ...       ...       ...
 2020-12-30 23:54:00  28800.00  28814.36  28826.49  28836.97  28816.76
 2020-12-30 23:55:00  28809.07  28800.00  28814.36  28826.49  28836.97
 2020-12-30 23:56:00  28801.47  28809.07  28800.00  28814.36  28826.49
 2020-12-30 23:57:00  28829.42  28801.47  28809.07  28800.00  28814.36
 2020-12-30 23:58:00  28850.49  28829.42  28801.47  28809.07  28800.00
 
 [1929595 rows x 5 columns],
 Timestamp
 2017-05-01 00:04:00    0.0
 2017-0

In [7]:
def input_data(data, sample_size, shift_size, train_size, h=1, w=0):
    """Cut one train/test window out of the tail of the preprocessed data.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame forwarded to ``preprocessing_data``.
    sample_size : int
        Number of consecutive rows in the window (train + test).
    shift_size : int
        Number of lag features, forwarded to ``preprocessing_data``.
    train_size : int
        Number of leading rows of the window used for training.
    h : int, default 1
        Forecast horizon, forwarded to ``preprocessing_data``.
    w : int, default 0
        Window index counted from the end of the data: window ``w`` ends
        ``w * (sample_size - train_size)`` rows before the last row.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If the requested window does not fit inside the preprocessed data.
        Previously a negative start index was passed to ``.iloc`` and silently
        produced a wrong or truncated sample.
    """
    X, y, data_shifted = preprocessing_data(data, shift_size, h)
    data_size = data_shifted.shape[0]
    test_size = sample_size - train_size

    window_stop = data_size - test_size * w
    window_start = window_stop - sample_size
    if window_start < 0 or window_stop > data_size:
        raise ValueError(
            f"window w={w} with sample_size={sample_size} spans rows "
            f"[{window_start}, {window_stop}) but only {data_size} rows are available"
        )

    sample_X = X.iloc[window_start:window_stop]
    sample_y = y.iloc[window_start:window_stop]

    X_train = sample_X.iloc[:train_size]
    y_train = sample_y.iloc[:train_size]

    # The test part starts h - 1 rows after the training part and stops
    # shift_size rows before the end of the window.
    # NOTE(review): presumably a gap to limit train/test overlap - confirm the
    # intended margins.
    test_rows = slice(train_size + h - 1, sample_size - shift_size)
    X_test = sample_X.iloc[test_rows]
    y_test = sample_y.iloc[test_rows]
    return X_train, X_test, y_train, y_test

In [8]:
# Use the most recent 30,000-row window (w=0) with 5 lag features:
# the first 20,000 rows train the model, the remainder is held out for testing.
X_train, X_test, y_train, y_test = input_data(data, 30000, 5, 20000, h=1, w=0)

In [9]:
def ridge_classifier(X_train, X_test, y_train, y_test):
    """Fit a default ``RidgeClassifier`` and return its score on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for scoring.

    Returns
    -------
    float
        The value of ``RidgeClassifier.score(X_test, y_test)``.
    """
    # score() predicts internally, so no separate predict() call is needed.
    model = RidgeClassifier().fit(X_train, y_train)
    return model.score(X_test, y_test)

In [10]:
# Score the ridge classifier on the single large window built above.
ridge_classifier(X_train, X_test, y_train, y_test)

0.5140570285142572

In [11]:
def manual_cross_val_ridge(data, n_windows=5, sample_size=1000, shift_size=5,
                           train_size=700, h=1):
    """Average the ridge-classifier score over several trailing windows.

    Window ``w`` (``0 .. n_windows - 1``) is produced by ``input_data`` and ends
    ``w * (sample_size - train_size)`` rows before the end of the data, so the
    evaluation steps backwards through time one test span at a time.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame forwarded to ``input_data``.
    n_windows : int, default 5
        Number of windows to evaluate.
    sample_size, shift_size, train_size, h
        Forwarded to ``input_data``; the defaults match the values that were
        previously hard-coded here.

    Returns
    -------
    float
        Mean of the per-window scores.
    """
    scores = [
        ridge_classifier(
            *input_data(data, sample_size, shift_size, train_size, h=h, w=w)
        )
        for w in range(n_windows)
    ]
    return stats.mean(scores)

In [12]:
# Mean ridge-classifier score over the 5 trailing windows.
manual_cross_val_ridge(data)

0.5057627118644068

In [13]:
def random_forest_classifier(X_train, X_test, y_train, y_test, random_state=None):
    """Fit a ``RandomForestClassifier`` and return its score on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for scoring.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestClassifier``. The forest is stochastic,
        so pass an integer to make the returned score reproducible; ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    float
        The value of ``RandomForestClassifier.score(X_test, y_test)``.
    """
    forest = RandomForestClassifier(random_state=random_state)
    forest.fit(X_train, y_train)
    return forest.score(X_test, y_test)

In [14]:
def manual_cross_val_forest(data, n_windows=5, sample_size=1000, shift_size=5,
                            train_size=700, h=1):
    """Average the random-forest score over several trailing windows.

    Window ``w`` (``0 .. n_windows - 1``) is produced by ``input_data`` and ends
    ``w * (sample_size - train_size)`` rows before the end of the data, so the
    evaluation steps backwards through time one test span at a time.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame forwarded to ``input_data``.
    n_windows : int, default 5
        Number of windows to evaluate.
    sample_size, shift_size, train_size, h
        Forwarded to ``input_data``; the defaults match the values that were
        previously hard-coded here.

    Returns
    -------
    float
        Mean of the per-window scores. The forest is not seeded here, so the
        value varies slightly between runs.
    """
    scores = [
        random_forest_classifier(
            *input_data(data, sample_size, shift_size, train_size, h=h, w=w)
        )
        for w in range(n_windows)
    ]
    return stats.mean(scores)

In [15]:
# Mean random-forest score over the 5 trailing windows (unseeded, so the exact
# value changes from run to run).
manual_cross_val_forest(data)

0.4976271186440678

In [16]:
# def initialize_model():
#     model = RandomForestClassifier()

In [17]:
# def compile_model(model):
#     model.compile(loss='binary_crossentropy', 
#               optimizer='rmsprop',
#               metrics=['accuracy'])
#     return model

In [18]:
# model = initialize_model()
# model = compile_model(model)

In [19]:
# es = EarlyStopping(patience=10, restore_best_weights=True)
# history = model.fit(X_train[:1000], y_train[:1000],
#                     validation_split=0.3,
#                     epochs=200,
#                     batch_size=32,
#                     callbacks=[es], 
#                     verbose=1)

In [20]:
# model.evaluate(X_test, y_test, verbose=2)