In [1]:
!pip install fastai -Uq

In [2]:
# Directory holding the feature-engineered BTC datasets. Kept in one place so
# that relocating the data means editing a single line.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/datasets/bitcoin'

# `data_path` is loaded below as the 4h dataset, `data_path_5m` as the 5m one.
data_path = f'{DATA_DIR}/btc_feature_engineering.csv'
data_path_5m = f'{DATA_DIR}/btc_5m_feature_engineering.csv'

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from fastai.tabular.all import *
from fastai import *

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, classification_report

In [4]:
def get_df(path):
    """Load a CSV file into a DataFrame without incomplete rows.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame holding only the rows that have no missing value in any
        column, re-indexed from 0 (the pre-drop index is discarded).
    """
    return pd.read_csv(path).dropna().reset_index(drop=True)

In [5]:
# Load both engineered datasets; get_df drops every row with a missing value.
df_4h, df_5m = (get_df(csv_file) for csv_file in (data_path, data_path_5m))

In [6]:
def create_targets(df):
    """Append log-return targets and their up/down labels to a copy of ``df``.

    Despite the "daily"/"weekly" names, the horizons are counted in rows:
    1 row back and 7 rows back respectively.

    Added columns:
        daily_returns:  log(close[t] / close[t-1])
        daily_change:   1 where daily_returns > 0, else 0
        weekly_returns: log(close[t] / close[t-7])
        weekly_change:  1 where weekly_returns > 0, else 0

    Args:
        df: DataFrame with a ``close`` column. It is not modified.

    Returns:
        The augmented copy with every row containing a NaN removed (this
        includes the leading rows whose look-back window is incomplete). The
        surviving rows keep their original index labels.
    """
    # NOTE(review): each target is measured against *earlier* rows, so it
    # describes the move into row t rather than the move after it -- confirm
    # that the features of row t do not already reveal it.
    out = df.copy()
    close = out['close']

    for horizon, lag in (('daily', 1), ('weekly', 7)):
        log_return = np.log(close / close.shift(lag))
        out[f'{horizon}_returns'] = log_return
        out[f'{horizon}_change'] = (log_return > 0).astype(int)

    return out.dropna()

def create_splits(df, pct_split=0.85, normalize=True):
    """Split a frame produced by ``create_targets`` into train/test parts.

    The split is positional (no shuffling): the first ``pct_split`` share of
    the rows is the training part and the remainder is the test part.

    Args:
        df: DataFrame containing the four target columns, an ``open_time``
            column and the feature columns. It is not modified.
        pct_split: Fraction of rows assigned to the training part.
        normalize: When True, the features are rescaled with a
            ``MinMaxScaler`` fitted on the training rows only and then applied
            to both parts.

    Returns:
        ``(train, test, train_targets, test_targets)`` -- four DataFrames.
        Each feature frame carries the same index as its matching target
        frame, whether or not ``normalize`` is set.
    """
    target_columns = ['daily_returns', 'daily_change', 'weekly_returns', 'weekly_change']

    # First separate the targets (and the open_time column) from the features.
    targets = df[target_columns]
    features = df.drop(columns=target_columns + ['open_time'])

    split = int(len(features) * pct_split)
    train, test = features.iloc[:split], features.iloc[split:]
    train_targets, test_targets = targets.iloc[:split], targets.iloc[split:]

    if normalize:
        scaler = MinMaxScaler()
        # Fit on the training rows only so no test-set statistics leak in.
        # The scaler returns bare arrays; rebuild the frames with the original
        # columns AND index so the features still line up with their targets
        # (and with what normalize=False returns).
        train = pd.DataFrame(scaler.fit_transform(train),
                             columns=features.columns, index=train.index)
        test = pd.DataFrame(scaler.transform(test),
                            columns=features.columns, index=test.index)

    print(f'train shape: {train.shape}')
    print(f'test shape: {test.shape}')

    return train, test, train_targets, test_targets

In [7]:
# 4h dataset: 85% of the rows (in file order) go to training.
print('4h dataset')
btc_4h = create_targets(df_4h)
(X_train_4h, X_test_4h,
 y_train_4h, y_test_4h) = create_splits(btc_4h, pct_split=0.85)

# 5m dataset: it has far more rows, so only the last 2% is held out.
print('5m dataset')
btc_5m = create_targets(df_5m)
(X_train_5m, X_test_5m,
 y_train_5m, y_test_5m) = create_splits(btc_5m, pct_split=0.98)

4h dataset
train shape: (6219, 56)
test shape: (1098, 56)
5m dataset
train shape: (352997, 56)
test shape: (7205, 56)


In [8]:
def create_model(classifier=True, hl=1, hu=128, dropout=False, rate=0.3,
                 regularize=False, reg=None,
                 optimizer=None, input_dim=None):
    """Build and compile a fully connected Keras network.

    The network is an input Dense layer followed by ``hl`` further Dense
    layers (``hl + 1`` hidden layers in total), all ``hu`` units wide with
    ReLU activation, and a single-unit output layer.

    Args:
        classifier: True -> sigmoid output, binary cross-entropy loss and an
            accuracy metric. False -> linear output and mean squared
            logarithmic error loss.
        hl: Number of hidden layers added after the input layer.
        hu: Units per hidden layer.
        dropout: Whether to add a Dropout layer after every hidden layer.
        rate: Dropout rate; only used when ``dropout`` is True.
        regularize: Whether ``reg`` is applied; when False ``reg`` is ignored.
        reg: Regularizer passed as ``activity_regularizer`` of the hidden
            layers.
        optimizer: Optimizer name or instance handed to ``compile``. ``None``
            (the default) selects ``'adam'``.
        input_dim: Number of input features.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    if optimizer is None:
        # Handing None through to compile() does not give a trainable model,
        # so the function's own default needs a concrete fallback.
        optimizer = 'adam'
    if not regularize:
        reg = None

    model = tf.keras.Sequential()

    # Input layer: the only one that is told the feature count.
    model.add(tf.keras.layers.Dense(hu, input_dim=input_dim,
                                    activity_regularizer=reg,
                                    activation='relu'))
    if dropout:
        model.add(tf.keras.layers.Dropout(rate))

    # `hl` additional hidden layers of the same width.
    for _ in range(hl):
        model.add(tf.keras.layers.Dense(hu, activation='relu',
                                        activity_regularizer=reg))
        if dropout:
            model.add(tf.keras.layers.Dropout(rate))

    if classifier:
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer=optimizer,
                      metrics=['accuracy'])
    else:
        model.add(tf.keras.layers.Dense(1))
        # NOTE(review): MSLE works on log(1 + y) and is meant for non-negative
        # targets -- confirm it fits before regressing on signed values such
        # as returns.
        model.compile(loss='mean_squared_logarithmic_error', optimizer=optimizer)

    return model

# Check for imbalanced data 

In [9]:
# Up/down label counts of both training sets, 4h first and then 5m.
for _train_targets in (y_train_4h, y_train_5m):
    print(_train_targets.daily_change.value_counts())

1    3207
0    3012
Name: daily_change, dtype: int64
0    176501
1    176496
Name: daily_change, dtype: int64


In [10]:
# L2 penalty; create_model applies it to the activations of the hidden layers.
regularizer = l2(0.001)

model = create_model(classifier=True, hl=4, hu=256, dropout=True, rate=0.3,
                     regularize=True, reg=regularizer, optimizer='adam',
                     # Size the input from the frame the model is actually
                     # trained on (5m) instead of the unrelated 4h frame.
                     input_dim=X_train_5m.shape[1])

# shuffle=False keeps the rows in their original order, and validation_split
# holds out the last 15% of the training rows. The History object is kept so
# the training curves can be inspected afterwards.
history = model.fit(X_train_5m, y_train_5m.daily_change.values,
                    batch_size=512, epochs=10, verbose=1,
                    validation_split=0.15, shuffle=False)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4260d88fd0>

In [11]:
# Model output for every held-out 5m row. The classifier ends in a single
# sigmoid unit, so each value lies between 0 and 1.
preds = model.predict(X_test_5m)


In [12]:
# One row per test sample: the true label next to the raw model output.
predictions = pd.DataFrame(
    {
        'y_true': y_test_5m['daily_change'].to_numpy(),
        'predictions': np.squeeze(preds),
    }
)

In [13]:
# Label a sample 1 only when the model output exceeds 0.9; everything else,
# including outputs between 0.5 and 0.9, is labelled 0.
predictions['preds'] = (predictions['predictions'] > 0.9).astype(int)
predictions

Unnamed: 0,y_true,predictions,preds
0,1,0.644313,0
1,0,0.638624,0
2,0,0.623412,0
3,1,0.999888,1
4,1,0.997390,1
...,...,...,...
7200,1,0.999012,1
7201,0,0.001102,0
7202,1,0.782647,0
7203,1,0.790830,0


In [14]:
# Up/down label counts in the held-out 5m set.
y_test_5m['daily_change'].value_counts()

1    3634
0    3571
Name: daily_change, dtype: int64

In [15]:
# Rows are the true classes, columns the predicted ones.
confusion_matrix(y_true=predictions['y_true'], y_pred=predictions['preds'])

array([[3252,  319],
       [1973, 1661]])

In [16]:
# Per-class precision / recall / F1 for the thresholded predictions.
print(classification_report(y_true=predictions['y_true'],
                            y_pred=predictions['preds']))

              precision    recall  f1-score   support

           0       0.62      0.91      0.74      3571
           1       0.84      0.46      0.59      3634

    accuracy                           0.68      7205
   macro avg       0.73      0.68      0.67      7205
weighted avg       0.73      0.68      0.66      7205

