## Imports ##

In [195]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, mean_squared_error
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import xgboost as xgb

## Global Variables ##

In [196]:
# # Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

features = [
    # Fundamental price data
    "open",
    "high",
    "low",
    "close",
    # Auxiliary data
    "turnover",
    "color",
    # Volume-related
    "volume",
    "avg_vol_last_100",
    "obv",
    # Momentum and trend indicators
    "RSI_5",
    "RSI_10",
    "RSI_14",
    "MACD_12_26_9",
    "MACDh_12_26_9",
    "MACDs_12_26_9",
    "MACD_6_13_5_6_13_5",
    "MACDh_6_13_5_6_13_5",
    "MACDs_6_13_5_6_13_5",
    # Moving averages
    "SMA_20",
    "SMA_5",
    "SMA_10",
    "EMA_2",
    "EMA_5",
    "EMA_10",
    # Bollinger Bands
    "BBP_10_2.0_10",
    "BBL_15_2.0_15",
    "BBM_15_2.0_15",
    "BBU_15_2.0_15",
    "BBB_15_2.0_15",
    "BBP_15_2.0_15",
    "BBL_20_2.0_20",
    "BBM_20_2.0_20",
    "BBU_20_2.0_20",
    "BBB_20_2.0_20",
    "BBP_20_2.0_20",
    "bollinger_bandwidth",
    "BBL_5_2.0_5",
    "BBM_5_2.0_5",
    "BBU_5_2.0_5",
    "BBB_5_2.0_5",
    "BBP_5_2.0_5",
    "BBL_10_2.0_10",
    "BBM_10_2.0_10",
    "BBU_10_2.0_10",
    "BBB_10_2.0_10",
    "BBP_10_2.0_10",
    # Stochastic Oscillator
    "STOCHd_14_3_3",
    "STOCHk_14_3_3_7_3_3",
    "STOCHd_14_3_3_7_3_3",
    "STOCHk_14_3_3_10_3_3",
    "STOCHd_14_3_3_10_3_3",
    # Volatility
    "ATR_14",
    "ATR_10",
    "ATR_5",
    # Other momentum oscillators
    "ROC_14",
    "ROC_10",
    "ROC_5",
    # Other versatile indicators
    "CCI_14",
    "CCI_10",
    "CCI_5",
    # Money Flow Index and Chaikin Money Flow
    "cmf",
    "mfi",
    # Relative Vigor Index (RVI)
    "RVI_15",
    "RVI_10",
    "RVI_5",
    # Pivot Points
    "PP",
    "R1",
    "S1",
    "R2",
    "S2",
    "R3",
    "S3",
    # Parabolic SAR (PSAR)
    "PSARl_0.01_0.1",
    "PSARs_0.01_0.1",
    "PSARaf_0.01_0.1",
    "PSARr_0.01_0.1",
    # Triple Exponential Average (TRIX)
    "TRIX_18_9",
    "TRIXs_18_9",
    "TRIX_12_6",
    "TRIXs_12_6",
    "TRIX_10_5",
    "TRIXs_10_5",
    # Ichimoku Cloud (ISA, ISB, ITS, IKS, ICS)
    "ISA_5",
    "ISB_15",
    "ITS_5",
    "IKS_15",
    "ICS_15",
]
drop_features = [
    "MACDs_12_26_9",
    "EMA_5",
    "RVI_10",
    "ISA_5",
    "SMA_20",
    "S2",
    "BBM_10_2.0_10",
    "PSARl_0.01_0.1",
    "IKS_15",
    "BBU_15_2.0_15",
    "S1",
    "PSARr_0.01_0.1",
    "BBL_10_2.0_10",
    "ATR_10",
    "SMA_10",
    "STOCHd_14_3_3_10_3_3",
    "ICS_15",
    "BBL_20_2.0_20",
    "bollinger_bandwidth",
    "BBL_5_2.0_5",
    "PP",
    "TRIX_12_6",
    "volume",
    "BBU_20_2.0_20",
    "S3",
    "R3",
    "BBU_5_2.0_5",
    "BBL_15_2.0_15",
    "low",
    "R1",
    "BBU_10_2.0_10",
    "close",
    "BBM_15_2.0_15",
    "R2",
    "BBM_20_2.0_20",
    "high",
    "ISB_15",
    "BBM_5_2.0_5",
    "EMA_2",
    "SMA_5",
    "open",
    "PSARs_0.01_0.1",
    "ITS_5",
    "EMA_10",
    "PSARaf_0.01_0.1",
    "ATR_14",
    "MACD_6_13_5_6_13_5",
    "cmf",
    "CCI_14",
    "STOCHk_14_3_3_10_3_3",
    "TRIX_18_9",
    "BBB_10_2.0_10",
    "RSI_10",
    "MACDh_6_13_5_6_13_5",
    "TRIXs_10_5",
]

# List of features to create lags for
lag_features = [
    "turnover",
    "CCI_5",
    "BBP_5_2.0_5",
    "color",
    "BBP_10_2.0_10",
    "RVI_5",
    "CCI_10",
    "BBB_5_2.0_5",
]

## Functions ##

In [197]:
def create_model(input_shape):
    model = Sequential()

    model.add(LSTM(50, activation="relu", input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer=Adam(), loss="binary_crossentropy", metrics=["accuracy"])

    return model


def create_lag_features(df, lag_features, max_lag):
    for feature in lag_features:
        for lag in range(1, max_lag + 1):
            df[f"{feature}_lag{lag}"] = df[feature].shift(lag)
    return df


def create_rolling_features(df, rolling_features, window_size):
    for feature in rolling_features:
        df[f"{feature}_rolling_mean{window_size}"] = (
            df[feature].rolling(window_size).mean()
        )
        df[f"{feature}_rolling_std{window_size}"] = (
            df[feature].rolling(window_size).std()
        )
    return df

## Preprocessing ##

In [198]:
# Load the data
df = pd.read_csv(
    "../../../data/kc/btc/heiken_ashi/with_trade_indicators/raw/kc_btc_15min_ha_ti.csv"
)

# Convert color to 0 for 'red' and 1 for 'green'
df["color"] = df["color"].map({"red": 0, "green": 1})

# Add 'color_change' column: 1 if color changes from the previous row, 0 otherwise
df["color_change"] = df["color"].diff().abs()

# Fill the first row's 'color_change' with 0
df["color_change"].fillna(0, inplace=True)

# Drop 'time'
df = df.drop(["time"], axis=1)

# Drop the features
df = df.drop(columns=drop_features)

# Fill NaNs in specific columns with 0
fill_cols = ["PSARl_0.01_0.1", "PSARs_0.01_0.1"]
for col in fill_cols:
    if col in df.columns:
        df[col] = df[col].fillna(0)

# Identify the first non-null row
first_valid_index = df.dropna().index[0]

# Drop the rows before this index in both features and target data
df = df.loc[first_valid_index:]
target = df["color_change"].loc[first_valid_index:]

# Use ffill to fill any remaining missing values
df.ffill(inplace=True)

# Separate the target column before scaling
X = df.drop("color_change", axis=1)
y = df["color_change"]

# Initialize the Scaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit and transform the data to the scaler object
X_scaled = scaler.fit_transform(X)

# Reset the index of target
y = y.reset_index(drop=True)

# print(df.head(20))

## Feature Engineering ##

In [199]:
# Creating lag features
# df = create_lag_features(df, lag_features, max_lag=3)

# # Fill NaNs
# df.fillna(method="ffill", inplace=True)

## Train Test Split ##

In [200]:
target = target.reset_index(drop=True)

data = X_scaled  # this is your scaled data
target = y  # this is your target

# number of splits
tscv = TimeSeriesSplit(n_splits=3)

X_train_list = []
X_test_list = []
y_train_list = []
y_test_list = []

# for each split train a new model
for train_index, test_index in tscv.split(data):
    # separating the data into train and test splits
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # LSTM requires the input to be in the shape [samples, time steps, features]
    # here we are using 1 time step and 'n' features
    X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

    # append to lists
    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

## Test Model ##

In [201]:
# Define a function to create LSTM model
def create_model(input_shape):
    model = Sequential()
    model.add(
        LSTM(50, activation="relu", input_shape=input_shape, return_sequences=True)
    )
    model.add(Dropout(0.2))
    model.add(LSTM(50, activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model


# Initialize a list to store models
models = []

for i, (X_train, y_train) in enumerate(zip(X_train_list, y_train_list)):
    # Initialize the model with the input shape
    model = create_model((X_train.shape[1], X_train.shape[2]))

    # We add an EarlyStopping callback from Keras to stop the training if the validation loss doesn't decrease for 5 consecutive epochs.
    early_stopping = EarlyStopping(monitor="val_loss", patience=5)

    # Fit the model
    model.fit(
        X_train,
        y_train,
        epochs=50,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0,
    )

    # Append the trained model to the list
    models.append(model)

for i, model in enumerate(models):
    # Predict the probabilities on the test set
    y_pred_probs = model.predict(X_test_list[i])

    # Convert probabilities into class labels
    y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_probs]

    # Calculate the accuracy
    acc = accuracy_score(y_test_list[i], y_pred)

    # Print the accuracy
    print(f"Model {i+1} accuracy: {acc:.2f}")

## XGBoost ##

In [None]:
# Initialize a list to store models
models = []


# Store feature names from DataFrame
feature_names = X.columns.tolist()

for i, (X_train, y_train) in enumerate(zip(X_train_list, y_train_list)):
    # Reshape data back into 2D because XGBoost does not accept 3D data like LSTM does
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[2])

    # Also reshape X_test for eval_set
    X_test = X_test_list[i].reshape(X_test_list[i].shape[0], X_test_list[i].shape[2])

    # Initialize the model
    model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

    # Fit the model
    model.fit(
        X_train,
        y_train,
        early_stopping_rounds=5,
        eval_set=[(X_test, y_test_list[i])],
        verbose=False,
    )

    # Append the trained model to the list
    models.append(model)

    # Print feature names
    print(f"Model {i+1} features: {feature_names}")

for i, model in enumerate(models):
    # Reshape data back into 2D because XGBoost does not accept 3D data like LSTM does
    X_test = X_test_list[i].reshape(X_test_list[i].shape[0], X_test_list[i].shape[2])

    # Predict the classes on the test set
    y_pred = model.predict(X_test)

    # Calculate the accuracy score
    accuracy = accuracy_score(y_test_list[i], y_pred)

    # Print the accuracy
    print(f"Model {i+1} Accuracy: {accuracy}")

Model 1 features: ['turnover', 'color', 'avg_vol_last_100', 'RSI_5', 'RSI_14', 'ATR_5', 'ROC_14', 'ROC_10', 'ROC_5', 'CCI_10', 'CCI_5', 'obv', 'mfi', 'RVI_15', 'RVI_5', 'TRIXs_18_9', 'TRIXs_12_6', 'TRIX_10_5', 'BBB_5_2.0_5', 'BBP_5_2.0_5', 'BBP_10_2.0_10', 'BBB_15_2.0_15', 'BBP_15_2.0_15', 'BBB_20_2.0_20', 'BBP_20_2.0_20', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_6_13_5_6_13_5', 'STOCHk_14_3_3', 'STOCHd_14_3_3', 'STOCHk_14_3_3_7_3_3', 'STOCHd_14_3_3_7_3_3']
Model 2 features: ['turnover', 'color', 'avg_vol_last_100', 'RSI_5', 'RSI_14', 'ATR_5', 'ROC_14', 'ROC_10', 'ROC_5', 'CCI_10', 'CCI_5', 'obv', 'mfi', 'RVI_15', 'RVI_5', 'TRIXs_18_9', 'TRIXs_12_6', 'TRIX_10_5', 'BBB_5_2.0_5', 'BBP_5_2.0_5', 'BBP_10_2.0_10', 'BBB_15_2.0_15', 'BBP_15_2.0_15', 'BBB_20_2.0_20', 'BBP_20_2.0_20', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_6_13_5_6_13_5', 'STOCHk_14_3_3', 'STOCHd_14_3_3', 'STOCHk_14_3_3_7_3_3', 'STOCHd_14_3_3_7_3_3']




Model 3 features: ['turnover', 'color', 'avg_vol_last_100', 'RSI_5', 'RSI_14', 'ATR_5', 'ROC_14', 'ROC_10', 'ROC_5', 'CCI_10', 'CCI_5', 'obv', 'mfi', 'RVI_15', 'RVI_5', 'TRIXs_18_9', 'TRIXs_12_6', 'TRIX_10_5', 'BBB_5_2.0_5', 'BBP_5_2.0_5', 'BBP_10_2.0_10', 'BBB_15_2.0_15', 'BBP_15_2.0_15', 'BBB_20_2.0_20', 'BBP_20_2.0_20', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_6_13_5_6_13_5', 'STOCHk_14_3_3', 'STOCHd_14_3_3', 'STOCHk_14_3_3_7_3_3', 'STOCHd_14_3_3_7_3_3']
Model 1 Accuracy: 0.746885899352267
Model 2 Accuracy: 0.7648231190832088
Model 3 Accuracy: 0.7742899850523169


## CNN ##

In [None]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Conv1D, MaxPooling1D
from keras.optimizers import Adam

# Initialize a list to store models
models = []

for i, (X_train, y_train) in enumerate(zip(X_train_list, y_train_list)):
    # Reshape input data for 1D CNN (num_samples, num_features, num_channels)
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[2], 1)
    X_test = X_test_list[i].reshape(X_test_list[i].shape[0], X_test_list[i].shape[2], 1)

    # Initialize the 1D CNN model
    model = Sequential(
        [
            Conv1D(
                32, kernel_size=3, activation="relu", input_shape=(X_train.shape[1], 1)
            ),
            MaxPooling1D(pool_size=2),
            Dropout(0.25),
            Flatten(),
            Dense(64, activation="relu"),
            Dropout(0.5),
            Dense(1, activation="sigmoid"),
        ]
    )

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    # Fit the model
    model.fit(
        X_train,
        y_train,
        epochs=50,
        batch_size=128,
        validation_data=(X_test, y_test_list[i]),
        verbose=0,
    )

    # Append the trained model to the list
    models.append(model)

for i, model in enumerate(models):
    # Reshape input data for 1D CNN (num_samples, num_features, num_channels)
    X_test = X_test_list[i].reshape(X_test_list[i].shape[0], X_test_list[i].shape[2], 1)

    # Predict the probability of the chosen class on the test set
    y_pred_proba = model.predict(X_test)

    # Threshold prediction probabilities for binary classification
    threshold = 0.5
    y_pred = np.where(y_pred_proba > threshold, 1, 0)

    # Calculate the accuracy score
    accuracy = accuracy_score(y_test_list[i], y_pred)

    # Print the accuracy
    print(f"Model {i+1} Accuracy: {accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5