## Imports

In [96]:
import numpy as np
import pandas as pd
import pandas_ta as ta
from keras.layers import LSTM, Dense
from keras.models import Sequential
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler
from stocktrends import Renko

## Functions

In [97]:
def load_data(data_path):
    df = pd.read_csv(data_path)
    return df


def preprocess_data(df):
    df["color_change"] = df["color"].diff().ne(0).astype(int)
    df["color_change"].fillna(0, inplace=True)

    return df


def scale_data(df):
    scaler = StandardScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return df_scaled


def timeseries_cv_score(X, y, n_splits):
    tscv = TimeSeriesSplit(n_splits=n_splits)

    f1_scores = []
    auc_scores = []  # list to store ROC AUC scores for each split
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Define LSTM model
        model = Sequential()
        model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
        model.add(Dense(1, activation="sigmoid"))  # because of binary classification

        model.compile(
            loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
        )

        # Train the model
        model.fit(X_train, y_train, epochs=10, verbose=0)

        # Make predictions on the test set
        y_pred = model.predict(X_test).ravel()

        # Calculate F1 score of the model on the test set
        f1 = f1_score(y_test, (y_pred > 0.5).astype("int32"))
        f1_scores.append(f1)

        # Calculate ROC AUC score of the model on the test set
        auc = roc_auc_score(y_test, y_pred)
        auc_scores.append(auc)

    return np.mean(f1_scores), np.mean(auc_scores)


def add_heiken_ashi_features(df):
    # Create Heiken Ashi DataFrame
    ha_df = df.ta.ha()

    # Rename the HA columns
    ha_df.columns = [f"HA_{col}" for col in ha_df.columns]

    # Join the HA columns to the original dataframe
    df = df.join(ha_df)

    # Heiken Ashi Close to Open
    df["HA_close_open"] = df["HA_close"] - df["HA_open"]

    # Heiken Ashi High Low Range
    df["HA_high_low"] = df["HA_high"] - df["HA_low"]

    # Heiken Ashi Body Range
    df["HA_body"] = abs(df["HA_close"] - df["HA_open"])

    # Heiken Ashi Price Direction
    df["HA_direction"] = (df["HA_close"] > df["HA_open"]).astype(int)

    # Heiken Ashi Volume-weighted Price
    df["HA_vwap"] = (df["HA_close"] * df["volume"]).cumsum() / df["volume"].cumsum()

    # Lag 1 feature
    df["HA_close_lag1"] = df["HA_close"].shift(1)

    # Close Change
    df["HA_close_change"] = df["HA_close"].diff()

    # Close % Change
    df["HA_close_pct_change"] = df["HA_close"].pct_change()

    # 5-period Simple Moving Average
    df["HA_sma5"] = df["HA_close"].rolling(5).mean()

    # 5-period Exponential Moving Average
    df["HA_ema5"] = df["HA_close"].ewm(span=5).mean()

    # Additional features
    df["HA_ema10"] = df["HA_close"].ewm(span=10).mean()
    df["HA_ema15"] = df["HA_close"].ewm(span=15).mean()
    df["HA_pct_diff_ema5_15"] = (
        (df["HA_ema5"] - df["HA_ema15"]) / df["HA_ema15"]
    ) * 100
    df["HA_rsi"] = ta.rsi(df["HA_close"])
    df["HA_macd"], df["HA_macdh"], df["HA_macds"] = ta.macd(df["HA_close"])
    df["HA_cci"] = ta.cci(df["HA_high"], df["HA_low"], df["HA_close"])
    df["HA_atr"] = ta.atr(df["HA_high"], df["HA_low"], df["HA_close"])
    df["HA_ha_close_bbp50_std"] = (
        ta.stdev(df["HA_close"], 50) / df["HA_close"]
    )  # Bollinger Bands normalized width
    df["HA_mfi"] = ta.mfi(df["HA_high"], df["HA_low"], df["HA_close"], df["volume"])

    return df


def add_renko_features(df, brick_size=1):
    # Drop the time index if it exists
    if "time" in df.index.names:
        df.reset_index("time", drop=True, inplace=True)

    # Rename 'time' column to 'date' and reset index
    df_renko = df.rename(columns={"time": "date"}).reset_index(drop=True)

    # Create Renko object and configure brick size
    renko_obj = Renko(df_renko)
    renko_obj.brick_size = brick_size

    # Create Renko chart
    renko_chart = renko_obj.get_ohlc_data()

    # Transform Renko chart data into a pandas DataFrame
    renko_df = pd.DataFrame(renko_chart)

    # Join Renko data with the original DataFrame on the 'date' column
    df = df.merge(
        renko_df, how="left", left_on="time", right_on="date", suffixes=("", "_renko")
    )

    # Derived features
    df["close_open_renko"] = df["close_renko"] - df["open_renko"]
    df["high_low_renko"] = df["high_renko"] - df["low_renko"]
    df["close_renko_lag1"] = df["close_renko"].shift(1)
    df["close_change_renko"] = df["close_renko"].diff()
    # Fill NaN values using forward fill and backward fill
    df["uptrend"].ffill(inplace=True)
    df["uptrend"].bfill(inplace=True)

    df["direction_renko"] = df["uptrend"].astype(int)

    return df

## Globals

In [98]:
# Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Load the data
data_path = "../../../data/kc/btc/raw/kc_btc_15min.csv"

## Preprocessing

In [99]:
df = load_data(data_path)

# Preprocess the data
df = preprocess_data(df)

# Prepare TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)

## FEATURE PROCESSING ##

df["time"] = pd.to_datetime(df["time"])
df.set_index("time", inplace=True, drop=False)

# print(df.index.duplicated().any())

df = df.loc[~df.index.duplicated(keep="first")]

# print(df.ta.categories)

df.ta.strategy("all")

# Check your results and exclude as necessary.
df.ta.strategy(fast=10, slow=50, verbose=True)

## Heiken Ashi ##
df_with_ha = add_heiken_ashi_features(df)

## Renko ##
df_with_renko = add_renko_features(df, brick_size=1)

# Convert time
df["hour"] = df["time"].dt.hour
df["minute"] = df["time"].dt.minute
df["day"] = df["time"].dt.day
df["month"] = df["time"].dt.month
df = df.drop(["time"], axis=1)


# Sanity check. Make sure all the columns and types
print(df.columns)
# print(df.dtypes)


# Forward Fill
df.ffill(inplace=True)

# Backward Fill
df.bfill(inplace=True)


# # Re-scale the data to include the new feature
# df_scaled = scale_data(df)

# X = df.drop("color_change", axis=1)
# y = df["color_change"]

# df.tail()
# print(df.isna().sum())

131it [00:02, 53.01it/s]


[+] Strategy: All
[i] Indicator arguments: {'fast': 10, 'slow': 50, 'append': True}
[i] Excluded[12]: above, above_value, below, below_value, cross, cross_value, long_run, short_run, td_seq, tsignals, vp, xsignals
[i] Multiprocessing 131 indicators with 3 chunks and 12/12 cpus.


131it [00:02, 48.09it/s]


[i] Total indicators: 131
[i] Columns added: 30
[i] Last Run: Tuesday June 20, 2023, NYSE: 21:53:03, Local: 1:53:03 Pacific Daylight Time, Day 171/365 (47.00%)
Index(['open', 'close', 'high', 'low', 'volume', 'color', 'color_change',
       'ABER_ZG_5_15', 'ABER_SG_5_15', 'ABER_XG_5_15',
       ...
       'STC_10_10_50_0.5', 'STCmacd_10_10_50_0.5', 'STCstoch_10_10_50_0.5',
       'TSI_10_50_13', 'TSIs_10_50_13', 'UO_10_14_50', 'hour', 'minute', 'day',
       'month'],
      dtype='object', length=319)
open                           float64
close                          float64
high                           float64
low                            float64
volume                         float64
color                            int64
color_change                     int32
ABER_ZG_5_15                   float64
ABER_SG_5_15                   float64
ABER_XG_5_15                   float64
ABER_ATR_5_15                  float64
ACCBL_20                       float64
ACCBM_20                 

## Univariate Feature Selection Process

In [100]:
# Re-scale the data to include the new feature
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# feature selection
selector = SelectKBest(score_func=f_classif, k="all")
selector.fit(X_scaled, y)

# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
features_df_new = X.iloc[:, cols]

# Store the scores of each feature in a dictionary
feature_scores = {
    feature_name: score for feature_name, score in zip(X.columns, selector.scores_)
}

# Sort the dictionary by value in descending order and print the scores
for feature_name, score in sorted(
    feature_scores.items(), key=lambda item: item[1], reverse=True
):
    print(f"{feature_name}: {score}")

# Now we can apply Logistic Regression and Random Forests on the new features_df_new
# Logistic Regression
log_reg = LogisticRegression(random_state=42, max_iter=500)

# Cross-validation
cv_scores = cross_val_score(log_reg, features_df_new, y, cv=tscv, scoring="f1")

print(f"\nLogistic Regression CV F1 score: {np.mean(cv_scores)}")

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-validation
cv_scores = cross_val_score(rf, features_df_new, y, cv=tscv, scoring="f1")

print(f"Random Forest CV F1 score: {np.mean(cv_scores)}")

# Reshape input to be 3D [samples, timesteps, features]
X_array = X.values
X_reshaped = X_array.reshape((X_array.shape[0], 1, X_array.shape[1]))

# Call the function
mean_f1_score = timeseries_cv_score(X_reshaped, y.values, n_splits=5)
print(f"\nLSTM CV F1 score: {mean_f1_score}")

print("\n", features_df_new.columns)

NameError: name 'X' is not defined

## Base Model

In [101]:
# Logistic Regression
log_reg = LogisticRegression(random_state=42, max_iter=500)

# Cross-validation
cv_scores = cross_val_score(log_reg, X, y, cv=tscv, scoring="roc_auc")

print(f"Logistic Regression CV ROC AUC score: {np.mean(cv_scores)}")

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-validation
cv_scores = cross_val_score(rf, X, y, cv=tscv, scoring="roc_auc")

print(f"Random Forest CV ROC AUC score: {np.mean(cv_scores)}")

print("\n", X.columns)

NameError: name 'X' is not defined

## LSTM

In [None]:
# Reshape input to be 3D [samples, timesteps, features]
X_array = X.values
X_reshaped = X_array.reshape((X_array.shape[0], 1, X_array.shape[1]))

# Call the function
mean_auc_score = timeseries_cv_score(X_reshaped, y.values, n_splits=5)
print(f"\nLSTM CV ROC AUC score: {mean_auc_score}")


LSTM CV ROC AUC score: (0.6959629098786181, 0.4926542232731929)
