## Imports ##

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import xgboost as xgb

## Global Variables ##

In [10]:
# # Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

features = [
    # Fundamental price data
    "time",
    "open",
    "high",
    "low",
    "close",
    # Auxiliary data
    "turnover",
    "color",
    # Volume-related
    "volume",
    "avg_vol_last_100",
    "obv",
    # Momentum and trend indicators
    "RSI_5",
    "RSI_10",
    "RSI_14",
    "MACD_12_26_9",
    "MACDh_12_26_9",
    "MACDs_12_26_9",
    "MACD_6_13_5_6_13_5",
    "MACDh_6_13_5_6_13_5",
    "MACDs_6_13_5_6_13_5",
    # Moving averages
    "SMA_20",
    "SMA_5",
    "SMA_10",
    "EMA_2",
    "EMA_5",
    "EMA_10",
    # Bollinger Bands
    "BBL_15_2.0_15",
    "BBM_15_2.0_15",
    "BBU_15_2.0_15",
    "BBB_15_2.0_15",
    "BBP_15_2.0_15",
    "BBL_20_2.0_20",
    "BBM_20_2.0_20",
    "BBU_20_2.0_20",
    "BBB_20_2.0_20",
    "BBP_20_2.0_20",
    "bollinger_bandwidth",
    "BBL_5_2.0_5",
    "BBM_5_2.0_5",
    "BBU_5_2.0_5",
    "BBB_5_2.0_5",
    "BBP_5_2.0_5",
    "BBL_10_2.0_10",
    "BBM_10_2.0_10",
    "BBU_10_2.0_10",
    "BBB_10_2.0_10",
    "BBP_10_2.0_10",
    # Stochastic Oscillator
    "STOCHd_14_3_3",
    "STOCHk_14_3_3_7_3_3",
    "STOCHd_14_3_3_7_3_3",
    "STOCHk_14_3_3_10_3_3",
    "STOCHd_14_3_3_10_3_3",
    # Volatility
    "ATR_14",
    "ATR_10",
    "ATR_5",
    # Other momentum oscillators
    "ROC_14",
    "ROC_10",
    "ROC_5",
    # Other versatile indicators
    "CCI_14",
    "CCI_10",
    "CCI_5",
    # Money Flow Index and Chaikin Money Flow
    "cmf",
    "mfi",
    # Relative Vigor Index (RVI)
    "RVI_15",
    "RVI_10",
    "RVI_5",
    # Pivot Points
    "PP",
    "R1",
    "S1",
    "R2",
    "S2",
    "R3",
    "S3",
    # Parabolic SAR (PSAR)
    "PSARl_0.01_0.1",
    "PSARs_0.01_0.1",
    "PSARaf_0.01_0.1",
    "PSARr_0.01_0.1",
    # Triple Exponential Average (TRIX)
    "TRIX_18_9",
    "TRIXs_18_9",
    "TRIX_12_6",
    "TRIXs_12_6",
    "TRIX_10_5",
    "TRIXs_10_5",
    # Ichimoku Cloud (ISA, ISB, ITS, IKS, ICS)
    "ISA_5",
    "ISB_15",
    "ITS_5",
    "IKS_15",
    "ICS_15",
]

drop_features = [
    "bollinger_bandwidth",
    "BBB_10_2.0_10",
    "BBM_10_2.0_10",
    "BBM_15_2.0_15",
    "BBM_20_2.0_20",
    "BBM_5_2.0_5",
    "BBL_10_2.0_10",
    "BBL_20_2.0_20",
    "BBL_5_2.0_5",
    "BBU_10_2.0_10",
    "BBU_15_2.0_15",
    "BBU_20_2.0_20",
    "BBU_5_2.0_5",
    "MACD_6_13_5_6_13_5",
    "MACDh_6_13_5_6_13_5",
    "MACDs_12_26_9",
    "RSI_10",
    "RVI_10",
    # Ichimoku Cloud (ISA, ISB, ITS, IKS, ICS)
    "ISA_5",
    "ISB_15",
    "ITS_5",
    "IKS_15",
    # Triple Exponential Average (TRIX)
    "TRIX_18_9",
    "TRIX_12_6",
    "TRIXs_12_6",
    "TRIX_10_5",
    "TRIXs_10_5",
    # Parabolic SAR (PSAR)
    "PSARl_0.01_0.1",
    "PSARaf_0.01_0.1",
    "PSARr_0.01_0.1",
    # Pivot Points
    "PP",
    "R1",
    "S1",
    "R2",
    "S2",
    "R3",
    "S3",
    # DON'T USE THESE # Money Flow Index and Chaikin Money Flow #
    "cmf",
    "mfi",
    # Volatility
    "ATR_14",
    "ATR_10",
    # Stochastic Oscillator
    "STOCHd_14_3_3",
    "STOCHk_14_3_3_7_3_3",
    "STOCHd_14_3_3_7_3_3",
    # Moving averages
    "SMA_20",
    "SMA_10",
    "EMA_2",
    "EMA_10",
]

## Functions ##

In [11]:
def create_model(input_shape):
    model = Sequential()

    model.add(LSTM(50, activation="tanh", input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer=Adam(), loss="binary_crossentropy", metrics=["accuracy"])

    return model


def create_lag_features(df, lag_features, max_lag):
    for feature in lag_features:
        for lag in range(1, max_lag + 1):
            new_feature = f"{feature}_lag{lag}"
            df[new_feature] = df[feature].shift(lag)
            # print(new_feature)
    return df


def create_rolling_features(df, rolling_features, window_size):
    for feature in rolling_features:
        df[f"{feature}_rolling_mean{window_size}"] = (
            df[feature].rolling(window_size).mean()
        )
        df[f"{feature}_rolling_std{window_size}"] = (
            df[feature].rolling(window_size).std()
        )
    return df

## Preprocessing ##

In [17]:
# Load the data
df = pd.read_csv(
    "../../../data/kc/btc/heiken_ashi/with_trade_indicators/raw/kc_btc_15min_ha_ti.csv"
)

# Convert color to 0 for 'red' and 1 for 'green'
df["color"] = df["color"].map({"red": 0, "green": 1})

# Add 'color_change' column: 1 if color changes from the previous row, 0 otherwise
df["color_change"] = df["color"].diff().abs()

# Fill the first row's 'color_change' with 0
df["color_change"].fillna(0, inplace=True)

# Ensure the target column is also included
if "color_change" not in features:
    features.append("color_change")

# Subset the DataFrame
df = df[features]

# Drop 'time'
df = df.drop(["time"], axis=1)

# # Drop the features
df = df.drop(columns=drop_features)

# Fill NaNs in specific columns with 0
fill_cols = ["PSARl_0.01_0.1", "PSARs_0.01_0.1"]
for col in fill_cols:
    if col in df.columns:
        df[col] = df[col].fillna(0)

# Identify the first non-null row
first_valid_index = df.dropna().index[0]

# Drop the rows before this index in both features and target data
df = df.loc[first_valid_index:]

# Use ffill to fill any remaining missing values
df.ffill(inplace=True)

# Normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))

# Separate the target column before scaling
X = df.drop("color_change", axis=1)
y = df["color_change"]

# Reset the index of target
y = y.reset_index(drop=True)

# Fit and transform the data to the scaler object
X_scaled = scaler.fit_transform(X)

# print(df.head(20))

            open          high           low      close      turnover  color  \
33  16722.141050  16722.141050  16714.000000  16714.900  1.051636e+05      1   
34  16718.520525  16718.520525  16688.200000  16702.725  1.606743e+06      0   
35  16710.622762  16710.622762  16690.900000  16695.600  2.463748e+05      1   
36  16703.111381  16706.700000  16693.200000  16700.400  8.219268e+05      1   
37  16701.755691  16704.600000  16689.000000  16696.700  6.529317e+05      0   
38  16699.227845  16699.400000  16669.600000  16689.350  1.324177e+06      1   
39  16694.288923  16699.400000  16694.288923  16697.850  3.666776e+05      0   
40  16696.069461  16708.300000  16693.800000  16701.850  7.789163e+05      1   
41  16698.959731  16721.700000  16698.959731  16714.050  9.536910e+05      1   
42  16706.504865  16748.800000  16706.504865  16731.525  8.991609e+05      1   
43  16719.014933  16741.100000  16719.014933  16735.600  1.197490e+06      0   
44  16727.307466  16736.200000  16720.60

## Train Test Split ##

In [13]:
# Convert X_scaled and y to numpy arrays
data = np.array(X_scaled)  # this is your scaled data
target = y.to_numpy()  # this is your target


# number of splits
tscv = TimeSeriesSplit(n_splits=3)

X_train_list = []
X_test_list = []
y_train_list = []
y_test_list = []

# for each split train a new model
for train_index, test_index in tscv.split(data):
    # separating the data into train and test splits
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # LSTM requires the input to be in the shape [samples, time steps, features]
    # here we are using 1 time step and 'n' features
    X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

    # append to lists
    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

## Test Model ##

In [14]:
# Define a function to create LSTM model
def create_model(input_shape):
    model = Sequential()
    model.add(
        LSTM(50, activation="tanh", input_shape=input_shape, return_sequences=True)
    )
    model.add(Dropout(0.2))
    model.add(LSTM(50, activation="tanh"))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model


# Initialize a list to store models
models = []

for i, (X_train, y_train) in enumerate(zip(X_train_list, y_train_list)):
    # Initialize the model with the input shape
    model = create_model((X_train.shape[1], X_train.shape[2]))

    # We add an EarlyStopping callback from Keras to stop the training if the validation loss doesn't decrease for 5 consecutive epochs.
    early_stopping = EarlyStopping(monitor="val_loss", patience=5)

    # Fit the model
    model.fit(
        X_train,
        y_train,
        epochs=50,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0,
    )

    # Append the trained model to the list
    models.append(model)

for i, model in enumerate(models):
    # Predict the probabilities on the test set
    y_pred_probs = model.predict(X_test_list[i])

    # Convert probabilities into class labels
    y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_probs]

    # Calculate the accuracy
    acc = accuracy_score(y_test_list[i], y_pred)

    # Print the accuracy
    print(f"Model {i+1} accuracy: {acc:.2f}")

Model 1 accuracy: 0.76
Model 2 accuracy: 0.76
Model 3 accuracy: 0.77


## XGBoost ##

In [15]:
# Define a list to keep all the models
models = []

for i in range(len(X_train_list)):
    # Get the data for the current fold
    X_train = X_train_list[i]
    y_train = y_train_list[i]
    X_test = X_test_list[i]
    y_test = y_test_list[i]

    # XGBoost requires 2D array-like input, so reshape your data if needed
    if len(X_train.shape) == 3:
        X_train = X_train.reshape(X_train.shape[0], X_train.shape[2])
    if len(X_test.shape) == 3:
        X_test = X_test.reshape(X_test.shape[0], X_test.shape[2])

    print("Training on fold ", i + 1, "\n")
    print("X_train shape: ", X_train.shape)
    print("y_train shape: ", y_train.shape, "\n")

    # Check features
    print("Training on following features:")
    for feature_name in df.columns:
        print(feature_name)
    print("\n")

    # Define the model
    model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

    # Train the model
    model.fit(X_train, y_train)

    # Append the model to the models list
    models.append(model)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy on fold ", i + 1, ": ", accuracy, "\n\n")

    # Print feature importances
importances = model.feature_importances_
feature_list = df.drop("color_change", axis=1).columns.tolist()
feature_importances = dict(zip(feature_list, importances))

for feature, importance in feature_importances.items():
    print(f"Feature: {feature}, Importance: {importance}")

Training on fold  1 

X_train shape:  (2010, 38)
y_train shape:  (2010,) 

Training on following features:
open
high
low
close
turnover
color
volume
avg_vol_last_100
obv
RSI_5
RSI_14
MACD_12_26_9
MACDh_12_26_9
MACDs_6_13_5_6_13_5
SMA_5
EMA_5
BBL_15_2.0_15
BBB_15_2.0_15
BBP_15_2.0_15
BBB_20_2.0_20
BBP_20_2.0_20
BBB_5_2.0_5
BBP_5_2.0_5
BBP_10_2.0_10
STOCHk_14_3_3_10_3_3
STOCHd_14_3_3_10_3_3
ATR_5
ROC_14
ROC_10
ROC_5
CCI_14
CCI_10
CCI_5
RVI_15
RVI_5
PSARs_0.01_0.1
TRIXs_18_9
ICS_15
color_change


Accuracy on fold  1 :  0.737419033383159 


Training on fold  2 

X_train shape:  (4017, 38)
y_train shape:  (4017,) 

Training on following features:
open
high
low
close
turnover
color
volume
avg_vol_last_100
obv
RSI_5
RSI_14
MACD_12_26_9
MACDh_12_26_9
MACDs_6_13_5_6_13_5
SMA_5
EMA_5
BBL_15_2.0_15
BBB_15_2.0_15
BBP_15_2.0_15
BBB_20_2.0_20
BBP_20_2.0_20
BBB_5_2.0_5
BBP_5_2.0_5
BBP_10_2.0_10
STOCHk_14_3_3_10_3_3
STOCHd_14_3_3_10_3_3
ATR_5
ROC_14
ROC_10
ROC_5
CCI_14
CCI_10
CCI_5
RVI_15
RVI_5
PSARs_

## CNN ##

In [16]:
# Initialize a list to store models
models = []

# Store feature names from DataFrame
feature_names = X.columns.tolist()

for i, (X_train, y_train) in enumerate(zip(X_train_list, y_train_list)):
    # Reshape input data for 1D CNN (num_samples, num_features, num_channels)
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[2], 1)
    X_test = X_test_list[i].reshape(X_test_list[i].shape[0], X_test_list[i].shape[2], 1)

    # Initialize the 1D CNN model
    model = Sequential(
        [
            Conv1D(
                32, kernel_size=3, activation="relu", input_shape=(X_train.shape[1], 1)
            ),
            MaxPooling1D(pool_size=2),
            Dropout(0.25),
            Flatten(),
            Dense(64, activation="relu"),
            Dropout(0.5),
            Dense(1, activation="sigmoid"),
        ]
    )

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    # Fit the model
    model.fit(
        X_train,
        y_train,
        epochs=50,
        batch_size=128,
        validation_data=(X_test, y_test_list[i]),
        verbose=0,
    )

    # Append the trained model to the list
    models.append(model)

    # Print feature names
    print(f"Model {i+1} features: {feature_names}")

for i, model in enumerate(models):
    # Reshape input data for 1D CNN (num_samples, num_features, num_channels)
    X_test = X_test_list[i].reshape(X_test_list[i].shape[0], X_test_list[i].shape[2], 1)

    # Predict the probability of the chosen class on the test set
    y_pred_proba = model.predict(X_test)

    # Threshold prediction probabilities for binary classification
    threshold = 0.5
    y_pred = np.where(y_pred_proba > threshold, 1, 0)

    # Calculate the accuracy score
    accuracy = accuracy_score(y_test_list[i], y_pred)

    # Print the accuracy
    print(f"Model {i+1} Accuracy: {accuracy}")

Model 1 features: ['open', 'high', 'low', 'close', 'turnover', 'color', 'volume', 'avg_vol_last_100', 'obv', 'RSI_5', 'RSI_14', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_6_13_5_6_13_5', 'SMA_5', 'EMA_5', 'BBL_15_2.0_15', 'BBB_15_2.0_15', 'BBP_15_2.0_15', 'BBB_20_2.0_20', 'BBP_20_2.0_20', 'BBB_5_2.0_5', 'BBP_5_2.0_5', 'BBP_10_2.0_10', 'STOCHk_14_3_3_10_3_3', 'STOCHd_14_3_3_10_3_3', 'ATR_5', 'ROC_14', 'ROC_10', 'ROC_5', 'CCI_14', 'CCI_10', 'CCI_5', 'RVI_15', 'RVI_5', 'PSARs_0.01_0.1', 'TRIXs_18_9', 'ICS_15']
Model 2 features: ['open', 'high', 'low', 'close', 'turnover', 'color', 'volume', 'avg_vol_last_100', 'obv', 'RSI_5', 'RSI_14', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_6_13_5_6_13_5', 'SMA_5', 'EMA_5', 'BBL_15_2.0_15', 'BBB_15_2.0_15', 'BBP_15_2.0_15', 'BBB_20_2.0_20', 'BBP_20_2.0_20', 'BBB_5_2.0_5', 'BBP_5_2.0_5', 'BBP_10_2.0_10', 'STOCHk_14_3_3_10_3_3', 'STOCHd_14_3_3_10_3_3', 'ATR_5', 'ROC_14', 'ROC_10', 'ROC_5', 'CCI_14', 'CCI_10', 'CCI_5', 'RVI_15', 'RVI_5', 'PSARs_0.01_0.1', 'TRI