## Imports ##

In [128]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
import xgboost as xgb

## Functions ##

In [129]:
def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i : (i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)


def fibonacci(n):
    fib_series = [0, 1]
    while fib_series[-1] < n:
        fib_series.append(fib_series[-1] + fib_series[-2])
    return fib_series[1:-1]  # ignore first and last


def process_all_files(fib_nums, filepath_template):
    for f in fib_nums:
        filepath = filepath_template.format(f)
        X_scaled, y_scaled = preprocess_data(filepath, included_features)
        # You can save or use X_scaled and y_scaled here


def preprocess_data(filepath, included_features):
    # Load the data
    df = pd.read_csv(filepath)

    # Convert color to 0 for 'red' and 1 for 'green'
    df["color"] = df["color"].map({"red": 0, "green": 1})

    # Add 'color_change' column: 1 if color changes from the previous row, 0 otherwise
    df["color_change"] = df["color"].diff().abs()

    # Fill the first row's 'color_change' with 0
    df["color_change"].fillna(0, inplace=True)

    # Fill NaNs in specific columns with 0
    fill_cols = ["PSARs_0.01_0.1"]
    for col in fill_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Identify the first non-null row
    first_valid_index = df.dropna().index[0]

    # Drop the rows before this index
    df = df.loc[first_valid_index:]

    # Use ffill to fill any remaining missing values
    df.ffill(inplace=True)

    # Keep only the included features
    df = df[included_features]

    # Separating the features and target
    X = df.drop(["color", "color_change"], axis=1)
    y = df[["color", "color_change"]]

    # Normalize the data
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_scaled = scaler.fit_transform(X)
    y_scaled = scaler.fit_transform(y)

    return X_scaled, y_scaled

## Global Variables ##

In [130]:
# # Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

max_lookback = 60  # or any other number based on your understanding of the data
fib_nums = fibonacci(max_lookback)

included_features = [
    "color",
    "color_change",
    "PSARs_0.01_0.1",
    "CCI_5",
    "BBP_5_2.0_5",
    "obv",
    "turnover",
    "volume",
    "avg_vol_last_100",
    "RSI_5",
    "open",
    "high",
    "low",
    "close",
]
# Drop unnecessary features
drop_features = [
    "time",
    # "low",
    "bollinger_bandwidth",
    "BBB_10_2.0_10",
    "BBB_20_2.0_20",
    "BBL_10_2.0_10",
    "BBL_20_2.0_20",
    "BBL_5_2.0_5",
    "BBM_10_2.0_10",
    "BBM_15_2.0_15",
    "BBM_20_2.0_20",
    "BBM_5_2.0_5",
    "BBP_20_2.0_20",
    "BBU_10_2.0_10",
    "BBU_15_2.0_15",
    "BBU_20_2.0_20",
    "BBU_5_2.0_5",
    "MACD_6_13_5_6_13_5",
    "MACDh_6_13_5_6_13_5",
    "MACDs_12_26_9",
    "RSI_10",
    "RVI_10",
    # "ICS_15",
    # Ichimoku Cloud (ISA, ISB, ITS, IKS, ICS)
    "ISA_5",
    "ISB_15",
    "ITS_5",
    "IKS_15",
    # Triple Exponential Average (TRIX)
    "TRIX_18_9",
    "TRIX_12_6",
    "TRIXs_12_6",
    "TRIX_10_5",
    "TRIXs_10_5",
    # "TRIXs_18_9",
    # Parabolic SAR (PSAR)
    "PSARl_0.01_0.1",
    "PSARaf_0.01_0.1",
    "PSARr_0.01_0.1",
    # Pivot Points
    "PP",
    "R1",
    "S1",
    "R2",
    "S2",
    "R3",
    "S3",
    # DON'T USE THESE # Money Flow Index and Chaikin Money Flow #
    "cmf",
    "mfi",
    # Volatility
    "ATR_14",
    "ATR_10",
    # Stochastic Oscillator
    "STOCHd_14_3_3",
    "STOCHk_14_3_3",
    "STOCHk_14_3_3_7_3_3",
    "STOCHd_14_3_3_7_3_3",
    # Moving averages
    "SMA_20",
    "SMA_10",
    "EMA_2",
    "EMA_10",
]

## Preprocessing ##

In [131]:
# Load the data
df = pd.read_csv(
    "../../../data/kc/btc/heiken_ashi/with_trade_indicators/raw/kc_btc_12min_ha_ti.csv"
)

# Convert color to 0 for 'red' and 1 for 'green'
df["color"] = df["color"].map({"red": 0, "green": 1})

# Add 'color_change' column: 1 if color changes from the previous row, 0 otherwise
df["color_change"] = df["color"].diff().abs()

# Fill the first row's 'color_change' with 0
df["color_change"].fillna(0, inplace=True)


df.drop(drop_features, axis=1, inplace=True)

# Fill NaNs in specific columns with 0
fill_cols = ["PSARs_0.01_0.1"]
for col in fill_cols:
    if col in df.columns:
        df[col] = df[col].fillna(0)

# Identify the first non-null row
first_valid_index = df.dropna().index[0]

# Drop the rows before this index
df = df.loc[first_valid_index:]

# Use ffill to fill any remaining missing values
df.ffill(inplace=True)


# Separating the features and target
X = df.drop(["color", "color_change"], axis=1)
y = df["color_change"]  # Here, only use "color_change" as the target.

y = y.values.reshape(-1, 1)


# Normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)
y_scaled = scaler.fit_transform(y)

# print(df.head(20))

## Alter > Candle Lengths ##

In [132]:
filepath_template = (
    "../../../data/kc/btc/heiken_ashi/with_trade_indicators/raw/kc_btc_{}min_ha_ti.csv"
)
process_all_files(fib_nums, filepath_template)

## Test Train Split ##

In [133]:
# Number of splits
n_splits = 5

tscv = TimeSeriesSplit(n_splits=n_splits)

# Initialize lists to store train and test sets
X_train_list = []
X_test_list = []
y_train_list = []
y_test_list = []

for train_index, test_index in tscv.split(X_scaled):
    # Split the data
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y_scaled[train_index], y_scaled[test_index]

    # LSTM requires the input to be in the shape [samples, time steps, features]
    # here we are using 1 time step and 'n' features
    X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

    # Append to lists
    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

## Test Model ##

In [134]:
# Define a list to keep all the models
models = []

for i in range(len(X_train_list)):
    # Get the data for the current fold
    X_train = X_train_list[i]
    y_train = y_train_list[i]
    X_test = X_test_list[i]
    y_test = y_test_list[i]

    # XGBoost requires 2D array-like input, so reshape your data if needed
    if len(X_train.shape) == 3:
        X_train = X_train.reshape(X_train.shape[0], X_train.shape[2])
    if len(X_test.shape) == 3:
        X_test = X_test.reshape(X_test.shape[0], X_test.shape[2])

    print("Training on fold ", i + 1, "\n")
    print("X_train shape: ", X_train.shape)
    print("y_train shape: ", y_train.shape, "\n")

    # # Check features
    # print("Training on following features:")
    # for feature_name in df.columns:
    #     print(feature_name)
    # print("\n")

    # Define the model
    model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

    # Train the model
    model.fit(X_train, y_train)

    # Append the model to the models list
    models.append(model)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy on fold ", i + 1, ": ", accuracy, "\n\n")

Training on fold  1 

X_train shape:  (1341, 35)
y_train shape:  (1341, 1) 

Accuracy on fold  1 :  0.6128550074738416 


Training on fold  2 

X_train shape:  (2679, 35)
y_train shape:  (2679, 1) 

Accuracy on fold  2 :  0.6195814648729447 


Training on fold  3 

X_train shape:  (4017, 35)
y_train shape:  (4017, 1) 

Accuracy on fold  3 :  0.609118086696562 


Training on fold  4 

X_train shape:  (5355, 35)
y_train shape:  (5355, 1) 

Accuracy on fold  4 :  0.5881913303437967 


Training on fold  5 

X_train shape:  (6693, 35)
y_train shape:  (6693, 1) 

Accuracy on fold  5 :  0.6412556053811659 




## Base Model ##

In [135]:
# Define a function to create LSTM model
def create_model(input_shape):
    model = Sequential()
    model.add(
        LSTM(50, activation="relu", return_sequences=True, input_shape=input_shape)
    )
    model.add(Dropout(0.2))
    model.add(LSTM(50, activation="relu", return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model


# Initialize a list to store models
models = []

# For each split train a new model
for i, (X_train, y_train) in enumerate(zip(X_train_list, y_train_list)):
    # Initialize the model with the input shape
    model = create_model((X_train.shape[1], X_train.shape[2]))

    # We add an EarlyStopping callback from Keras to stop the training if the validation loss doesn't decrease for 5 consecutive epochs.
    early_stopping = EarlyStopping(monitor="val_loss", patience=5)

    # Fit the model
    model.fit(
        X_train,
        y_train,
        epochs=50,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0,
    )

    # Append the trained model to the list
    models.append(model)

for i, model in enumerate(models):
    # Predict the probabilities on the test set
    y_pred_probs = model.predict(X_test_list[i])

    # Convert probabilities into class labels
    y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_probs]

    # Calculate the accuracy
    acc = accuracy_score(y_test_list[i], y_pred)

    # Print the accuracy
    print(f"Model {i+1} accuracy: {acc:.2f}")

Model 1 accuracy: 0.59
Model 2 accuracy: 0.63
Model 3 accuracy: 0.61
Model 4 accuracy: 0.61
Model 5 accuracy: 0.62


## Fib Model ##

In [136]:
time_steps = fib_nums[-1]  # This is the maximum number of time steps per sequence

X_train_lstm, y_train_lstm = create_dataset(
    pd.DataFrame(X_train), pd.DataFrame(y_train), time_steps
)
X_test_lstm, y_test_lstm = create_dataset(
    pd.DataFrame(X_test), pd.DataFrame(y_test), time_steps
)

# Define model parameters
n_steps = fib_nums[-1]  # This is the maximum number of time steps per sequence
n_features = X_train.shape[1]  # This is the number of features
n_units = 50  # Number of units in the LSTM layer
dropout_rate = 0.2  # Dropout rate for regularization

# Initialize the model
model = Sequential()

# Add LSTM layer
model.add(LSTM(n_units, activation="tanh", input_shape=(n_steps, n_features)))

# Add dropout for regularization
model.add(Dropout(dropout_rate))

# Add output layer
model.add(Dense(1))

# Compile the model
model.compile(optimizer="adam", loss="mse")


# Reshape the data to fit the LSTM layer input shape
X_train_lstm = X_train.reshape((X_train.shape[0], n_steps, n_features))

# Train the model
model.fit(X_train_lstm, y_train, epochs=50, verbose=0)

# Reshape the test data
X_test_lstm = X_test.reshape((X_test.shape[0], n_steps, n_features))

# Make predictions
y_pred = model.predict(X_test_lstm)

ValueError: Must pass 2-d input. shape=(6693, 1, 35)