## Imports

In [363]:
import numpy as np
import pandas as pd
import pandas_ta as ta
import talib
from keras.layers import LSTM, Dense
from keras.models import Sequential
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler

## Functions

In [364]:
def load_data(data_path):
    """Read the CSV at *data_path* into a DataFrame.

    Args:
        data_path: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        The parsed ``pandas.DataFrame`` with pandas' default parsing options.
    """
    return pd.read_csv(data_path)


def preprocess_data(df):
    """Add a ``color_change`` flag marking rows whose ``color`` differs from the previous row.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a numeric ``color`` column.

    Returns:
        The same DataFrame with an integer ``color_change`` column
        (1 = value changed versus the previous row, 0 = unchanged).
    """
    # diff() yields NaN where there is no previous value (always the first row).
    # NaN != 0 evaluates to True, so the NaN must be zeroed *before* the
    # comparison; otherwise the first row is reported as a change.
    df["color_change"] = df["color"].diff().fillna(0).ne(0).astype(int)

    return df


def scale_data(df):
    """Standardise every column of *df* with a freshly fitted ``StandardScaler``.

    Args:
        df: DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame with the same columns and the same index as *df*,
        holding the scaled values.
    """
    scaler = StandardScaler()
    # Keep the original index so the scaled frame can still be aligned or
    # joined with the source data (the scaler itself returns a bare ndarray).
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df), columns=df.columns, index=df.index
    )
    return df_scaled


def add_heiken_ashi_features(df):
    """Join Heiken Ashi candles to *df* and derive indicator columns from them.

    Args:
        df: OHLCV DataFrame; a ``volume`` column is read directly, and the
            ``df.ta`` accessor (pandas_ta) is used to build the candles.

    Returns:
        A new DataFrame (result of ``df.join``) containing the original
        columns, the ``HA_*`` candle columns and the derived ``HA_*`` features.
    """
    # Create Heiken Ashi DataFrame
    ha_df = df.ta.ha()

    # Make sure every candle column carries exactly one "HA_" prefix.
    # pandas_ta already names them HA_open/HA_high/HA_low/HA_close, so
    # prefixing unconditionally would create "HA_HA_close" and break the
    # lookups below.
    ha_df.columns = [
        col if str(col).startswith("HA_") else f"HA_{col}" for col in ha_df.columns
    ]

    # Join the HA columns to the original dataframe
    df = df.join(ha_df)

    ha_open = df["HA_open"]
    ha_high = df["HA_high"]
    ha_low = df["HA_low"]
    ha_close = df["HA_close"]

    # Signed body, full range, absolute body and direction of each candle
    df["HA_close_open"] = ha_close - ha_open
    df["HA_high_low"] = ha_high - ha_low
    df["HA_body"] = (ha_close - ha_open).abs()
    df["HA_direction"] = (ha_close > ha_open).astype(int)

    # Cumulative volume-weighted HA close over the whole frame
    df["HA_vwap"] = (ha_close * df["volume"]).cumsum() / df["volume"].cumsum()

    # Lag, absolute change and percentage change of the HA close
    df["HA_close_lag1"] = ha_close.shift(1)
    df["HA_close_change"] = ha_close.diff()
    df["HA_close_pct_change"] = ha_close.pct_change()

    # Simple and exponential moving averages
    df["HA_sma5"] = ha_close.rolling(5).mean()
    df["HA_ema5"] = ha_close.ewm(span=5).mean()
    df["HA_ema10"] = ha_close.ewm(span=10).mean()
    df["HA_ema15"] = ha_close.ewm(span=15).mean()
    df["HA_pct_diff_ema5_15"] = (
        (df["HA_ema5"] - df["HA_ema15"]) / df["HA_ema15"]
    ) * 100
    df["HA_rsi"] = ta.rsi(ha_close)

    # MACD built from non-adjusted 12/26 EMAs with a 9-period signal line.
    # The EMAs are kept as locals so no temporary columns are added (and no
    # pre-existing "short_ema"/"long_ema" columns are overwritten or dropped).
    short_ema = ha_close.ewm(span=12, adjust=False).mean()
    long_ema = ha_close.ewm(span=26, adjust=False).mean()
    df["HA_macd"] = short_ema - long_ema
    df["HA_macds"] = df["HA_macd"].ewm(span=9, adjust=False).mean()
    df["HA_macdh"] = df["HA_macd"] - df["HA_macds"]

    df["HA_cci"] = ta.cci(ha_high, ha_low, ha_close)
    df["HA_atr"] = ta.atr(ha_high, ha_low, ha_close)
    # Standard deviation of the HA close (50 passed as ta.stdev's second
    # argument) expressed relative to the HA close itself.
    df["HA_ha_close_bbp50_std"] = ta.stdev(ha_close, 50) / ha_close
    df["HA_mfi"] = ta.mfi(ha_high, ha_low, ha_close, df["volume"])

    return df


def compute_renko(df, timeperiod=14, multiplier=0.25):
    """Build Renko bricks from *df* using an ATR-derived brick size.

    The brick size is ``mean(ATR(timeperiod)) * multiplier``. Rows whose ATR is
    NaN (the indicator warm-up) are ignored. For every remaining bar after the
    first, bricks are emitted while the close is at least one brick away from
    the last brick boundary. When several bricks form on the same bar only the
    last one is kept, because each brick is written to the same row.

    Args:
        df: DataFrame with ``high``, ``low``, ``close`` and ``time_unix`` columns.
        timeperiod: ATR look-back passed to ``talib.ATR``.
        multiplier: Fraction of the average ATR used as brick size.

    Returns:
        DataFrame indexed like the bars on which a brick formed, with columns
        ``open``, ``high``, ``low``, ``close`` and ``time_unix``.

    Raises:
        ValueError: If the computed brick size is not strictly positive
            (the brick loops below would otherwise never terminate).
    """
    # Calculate ATR and keep only the rows where it is defined
    atr_values = talib.ATR(df["high"], df["low"], df["close"], timeperiod)
    df = df[atr_values.notna()]
    atr_values = atr_values.dropna()

    renko_df = pd.DataFrame(
        index=df.index, columns=["open", "high", "low", "close", "time_unix"]
    )

    # Nothing left after the ATR warm-up: return an empty result instead of
    # failing on the first-row lookup.
    if df.empty:
        return renko_df.dropna()

    brick_size = atr_values.mean() * multiplier
    if not brick_size > 0:
        raise ValueError(
            f"brick size must be positive, got {brick_size!r} "
            f"(average ATR * multiplier)"
        )

    closes = df["close"].to_numpy()
    times = df["time_unix"].to_numpy()

    # Positional access: after the warm-up rows are dropped, label 0 no longer
    # exists on an integer index, and positional fallback on other index
    # types is deprecated.
    last_reset_price = closes[0]

    for idx, current_price, bar_time in zip(df.index[1:], closes[1:], times[1:]):
        # Rows that never receive a brick keep NaN prices and are removed by
        # the final dropna().
        renko_df.loc[idx, "time_unix"] = bar_time

        # Rising bricks
        while last_reset_price + brick_size <= current_price:
            renko_df.loc[idx] = [
                last_reset_price,
                last_reset_price + brick_size,
                last_reset_price,
                last_reset_price + brick_size,
                bar_time,
            ]
            last_reset_price += brick_size

        # Falling bricks
        while last_reset_price - brick_size >= current_price:
            renko_df.loc[idx] = [
                last_reset_price,
                last_reset_price - brick_size,
                last_reset_price - brick_size,
                last_reset_price,
                bar_time,
            ]
            last_reset_price -= brick_size

    return renko_df.dropna()


def add_renko_features(df, renko_df):
    """Compute indicators on Renko bricks and join them onto *df* by timestamp.

    Args:
        df: Source DataFrame with a ``time_unix`` column used as the join key.
        renko_df: Brick DataFrame with ``open``, ``high``, ``low``, ``close``
            and ``time_unix`` columns. It is not modified.

    Returns:
        A new DataFrame: *df* left-joined with the ``*_renko`` columns plus
        the derived Renko features.
    """
    # Work on a copy so the caller's frame keeps its column names; renaming in
    # place would double-suffix the columns if the function were called again.
    renko_df = renko_df.copy()
    renko_df.columns = [str(col) + "_renko" for col in renko_df.columns]

    close = renko_df["close_renko"]

    # Moving averages, RSI and Bollinger Bands on the brick closes
    renko_df["sma_renko"] = talib.SMA(close, timeperiod=5)
    renko_df["ema_renko"] = talib.EMA(close, timeperiod=5)
    renko_df["wma_renko"] = talib.WMA(close, timeperiod=5)
    renko_df["rsi_renko"] = talib.RSI(close, timeperiod=5)
    (
        renko_df["upper_band_renko"],
        renko_df["middle_band_renko"],
        renko_df["lower_band_renko"],
    ) = talib.BBANDS(close, timeperiod=5)

    # Rate of change and one-step momentum
    renko_df["roc_renko"] = talib.ROC(close, timeperiod=5)
    renko_df["momentum_renko"] = close.diff()

    # Stochastic oscillator
    renko_df["k_renko"], renko_df["d_renko"] = talib.STOCH(
        renko_df["high_renko"],
        renko_df["low_renko"],
        close,
        fastk_period=5,
        slowk_period=3,
        slowd_period=3,
    )

    # Join Renko data with the original DataFrame on the Unix timestamp
    df = df.join(renko_df.set_index("time_unix_renko"), on="time_unix")

    # Derived features (computed on the joined frame, so rows without a brick
    # hold NaN and count as "not up" in direction_renko)
    df["close_open_renko"] = df["close_renko"] - df["open_renko"]
    df["high_low_renko"] = df["high_renko"] - df["low_renko"]
    df["close_change_renko"] = df["close_renko"].diff()
    df["direction_renko"] = (df["close_change_renko"] > 0).astype(int)
    df["direction_change"] = df["direction_renko"].diff().abs()

    return df


def kagi(df):
    """Collapse *df* into direction-reversal records based on the close price.

    Each row's close is compared with the previous row's close (the first row
    is compared with itself). Whenever the move contradicts the current
    direction, the state as it stood *before* that row is recorded and the
    direction flips. The final state is always recorded last.

    Args:
        df: DataFrame with ``close`` and ``time_unix`` columns and at least
            one row.

    Returns:
        DataFrame with columns ``kagi_dir`` ("up"/"down"), ``kagi_start``
        and ``time_unix``, one row per recorded state.
    """
    first_label = df.index[0]
    state = {
        "kagi_dir": "up",
        "kagi_start": df.loc[first_label, "close"],
        "time_unix": df.loc[first_label, "time_unix"],
    }
    segments = []

    for _, bar in df.iterrows():
        close = bar["close"]
        move = close - state["kagi_start"]
        heading_up = state["kagi_dir"] == "up"

        if heading_up and move < 0:
            # Snapshot the finished up-leg, then reverse
            segments.append(dict(state))
            state["kagi_dir"] = "down"
        elif not heading_up and move > 0:
            # Snapshot the finished down-leg, then reverse
            segments.append(dict(state))
            state["kagi_dir"] = "up"

        # The reference price always follows the latest close
        state["kagi_start"] = close
        state["time_unix"] = bar["time_unix"]

    segments.append(state)
    return pd.DataFrame(segments)


def add_kagi_features(df, kagi_df):
    """Add TA-Lib indicators computed on ``kagi_start`` to *kagi_df*.

    Because Kagi records carry a single price, that price is passed as high,
    low and close to the indicators that expect all three.

    Args:
        df: Source DataFrame with a ``volume`` column. Its index is converted
            to Unix seconds via ``astype(np.int64) // 10**9`` for the volume
            lookup (presumably a nanosecond DatetimeIndex; verify against
            caller). *df* itself is left untouched.
        kagi_df: Kagi DataFrame with ``kagi_start`` and ``time_unix`` columns;
            the new columns are added to it in place.

    Returns:
        *kagi_df* with the additional ``kagi_*`` indicator columns.
    """
    price = kagi_df["kagi_start"]

    # Simple moving averages
    kagi_df["kagi_sma5"] = talib.SMA(price, timeperiod=5)
    kagi_df["kagi_sma10"] = talib.SMA(price, timeperiod=10)
    kagi_df["kagi_sma20"] = talib.SMA(price, timeperiod=20)

    # Bollinger Bands
    kagi_df["kagi_upper"], kagi_df["kagi_middle"], kagi_df["kagi_lower"] = talib.BBANDS(
        price, timeperiod=5, nbdevup=2, nbdevdn=2, matype=0
    )

    # MACD
    (
        kagi_df["kagi_macd"],
        kagi_df["kagi_macdsignal"],
        kagi_df["kagi_macdhist"],
    ) = talib.MACD(price, fastperiod=12, slowperiod=26, signalperiod=9)

    # RSI
    kagi_df["kagi_rsi"] = talib.RSI(price, timeperiod=14)

    # Stochastic oscillator (positional args: fastk_period, slowk_period,
    # slowk_matype, slowd_period, slowd_matype)
    kagi_df["kagi_slowk"], kagi_df["kagi_slowd"] = talib.STOCH(
        price,
        price,
        price,
        5,
        3,
        0,
        3,
        0,
    )

    kagi_df["kagi_atr"] = talib.ATR(price, price, price, timeperiod=14)
    kagi_df["kagi_adx"] = talib.ADX(price, price, price, timeperiod=14)

    # Re-key the volume by Unix seconds on a private copy. Assigning to
    # df.index directly would silently replace the caller's index.
    volume_by_unix = df["volume"].copy()
    volume_by_unix.index = df.index.astype(np.int64) // 10**9

    # Align the volume series with kagi_df (last known volume at or before
    # each Kagi timestamp)
    aligned_volume = volume_by_unix.reindex(kagi_df["time_unix"], method="pad")

    # Accumulation / Distribution Line
    kagi_df["kagi_ad"] = talib.AD(price, price, price, aligned_volume)

    return kagi_df


def three_line_break(df):
    """Label each bar with a three-line-break style direction.

    Row 1 is "up" when its close exceeds row 0's close, otherwise "down".
    From row 2 on, the direction flips to "up" when the previous direction is
    "down" and the close exceeds the maximum of up to three preceding closes.
    It flips to "down" when the previous direction is "up" and the close is
    below the minimum of those closes. Otherwise the previous direction is
    carried forward. Row 0 has no direction (``None``).

    Args:
        df: DataFrame with ``open``, ``close``, ``high``, ``low``, ``volume``
            and ``time_unix`` columns.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding those columns plus
        ``tlb_direction``.
    """
    df_tlb = pd.DataFrame(
        {
            col: df[col].reset_index(drop=True)
            for col in ("open", "close", "high", "low", "volume")
        }
    )
    df_tlb["tlb_direction"] = None
    df_tlb["time_unix"] = df["time_unix"].values  # positional copy

    close = df_tlb["close"]
    closes = close.to_numpy()
    n_rows = len(closes)

    # Max/min of the (up to) three closes strictly before each row.
    # min_periods=1 lets row 2 use the two closes that exist; slicing with
    # i - 3 there would produce a negative start and an empty window.
    preceding = close.shift(1).rolling(3, min_periods=1)
    prior_high = preceding.max().to_numpy()
    prior_low = preceding.min().to_numpy()

    directions = [None] * n_rows
    if n_rows > 1:
        directions[1] = "up" if closes[1] > closes[0] else "down"

    for i in range(2, n_rows):
        previous = directions[i - 1]
        if previous == "down" and closes[i] > prior_high[i]:
            directions[i] = "up"
        elif previous == "up" and closes[i] < prior_low[i]:
            directions[i] = "down"
        else:
            directions[i] = previous

    df_tlb["tlb_direction"] = pd.Series(directions, index=df_tlb.index, dtype=object)

    return df_tlb


def add_tlb_features(df):
    """Add ``tlb_*`` technical-indicator columns to *df* using TA-Lib.

    Args:
        df: DataFrame with ``open``, ``high``, ``low``, ``close`` and
            ``volume`` columns. It is modified in place.

    Returns:
        The same DataFrame with the additional ``tlb_*`` columns.
    """
    open_, high, low, close, volume = (
        df["open"],
        df["high"],
        df["low"],
        df["close"],
        df["volume"],
    )

    # Relative Strength Index
    df["tlb_RSI"] = talib.RSI(close, timeperiod=14)

    # MACD line, signal line and histogram
    df["tlb_MACD"], df["tlb_MACD_signal"], df["tlb_MACD_hist"] = talib.MACD(
        close, fastperiod=12, slowperiod=26, signalperiod=9
    )

    # Bollinger Bands
    df["tlb_upper_band"], df["tlb_middle_band"], df["tlb_lower_band"] = talib.BBANDS(
        close, timeperiod=20
    )

    # Simple and exponential moving averages
    df["tlb_SMA"] = talib.SMA(close, timeperiod=14)
    df["tlb_EMA"] = talib.EMA(close, timeperiod=14)

    # Rate of change
    df["tlb_ROC"] = talib.ROC(close, timeperiod=10)

    # Average True Range
    df["tlb_ATR"] = talib.ATR(high, low, close, timeperiod=14)

    # Momentum
    df["tlb_momentum"] = talib.MOM(close, timeperiod=10)

    # Stochastic oscillator %K and %D
    df["tlb_slowk"], df["tlb_slowd"] = talib.STOCH(
        high,
        low,
        close,
        fastk_period=5,
        slowk_period=3,
        slowk_matype=0,
        slowd_period=3,
        slowd_matype=0,
    )

    # Commodity Channel Index
    df["tlb_CCI"] = talib.CCI(high, low, close, timeperiod=14)

    # On Balance Volume
    df["tlb_OBV"] = talib.OBV(close, volume)

    # Moving averages of high, low and open prices
    df["tlb_MA_high"] = talib.MA(high, timeperiod=14)
    df["tlb_MA_low"] = talib.MA(low, timeperiod=14)
    df["tlb_MA_open"] = talib.MA(open_, timeperiod=14)

    # ATR relative to the 14-period SMA (reuses the ATR computed above)
    df["tlb_volatility"] = df["tlb_ATR"] / df["tlb_SMA"]

    # Money Flow Index
    df["tlb_MFI"] = talib.MFI(high, low, close, volume, timeperiod=14)

    # Chaikin Money Flow over 20 periods, same formula as the "cmf" feature
    # in create_features. The numerator must be fully parenthesised before
    # dividing by the bar range; without the parentheses only (high - close)
    # is divided.
    money_flow_volume = ((close - low) - (high - close)) / (high - low) * volume
    df["tlb_CMF"] = (
        money_flow_volume.rolling(window=20).sum() / volume.rolling(window=20).sum()
    )

    # Williams %R
    df["tlb_WilliamsR"] = talib.WILLR(high, low, close, timeperiod=14)

    # Ultimate Oscillator
    df["tlb_UO"] = talib.ULTOSC(
        high,
        low,
        close,
        timeperiod1=7,
        timeperiod2=14,
        timeperiod3=28,
    )

    # Accumulation/Distribution Line
    df["tlb_ADL"] = talib.AD(high, low, close, volume)

    # Average Directional Index
    df["tlb_ADX"] = talib.ADX(high, low, close, timeperiod=14)

    return df


def point_and_figure(df, box_size):
    """Keep only the bars whose close moved by at least *box_size*.

    Args:
        df: DataFrame with ``open``, ``high``, ``low`` and ``close`` columns.
            It is not modified.
        box_size: Minimum absolute close-to-close move for a bar to be kept.

    Returns:
        A filtered copy of *df* with these extra columns:

        * ``direction`` - 1.0 after a move greater than ``box_size``, 0.0
          after a move below ``-box_size``, otherwise carried forward from
          the previous bar. The first bar is seeded from its own
          close - open.
        * ``change`` - the close-to-close difference.
        * ``pnf_open``, ``pnf_close``, ``pnf_high``, ``pnf_low`` - copies of
          the OHLC columns.

        The first bar is never returned because it has no previous close.
    """
    df_pnf = df.copy()
    close_diff = df_pnf["close"].diff()

    # Column direction: only moves strictly larger than one box set it
    direction = np.select(
        [close_diff > box_size, close_diff < -box_size], [1.0, 0.0], default=np.nan
    )

    # Seed the first row so the forward fill below always has a start value
    if len(direction) > 0:
        first_body = df_pnf["close"].iloc[0] - df_pnf["open"].iloc[0]
        direction[0] = 1 if first_body >= box_size else 0

    # Forward fill (Series.ffill replaces the deprecated fillna(method="ffill"))
    df_pnf["direction"] = pd.Series(direction).ffill().to_numpy()

    # Price changes smaller than one box are blanked out
    df_pnf["change"] = close_diff.where(close_diff.abs() >= box_size)

    # Drop rows without a price change. Restricting to "change" keeps bars
    # that merely have missing values in unrelated columns.
    df_pnf = df_pnf.dropna(subset=["change"])

    # Create the new columns for PnF values
    df_pnf["pnf_open"] = df_pnf["open"]
    df_pnf["pnf_close"] = df_pnf["close"]
    df_pnf["pnf_high"] = df_pnf["high"]
    df_pnf["pnf_low"] = df_pnf["low"]

    # Drop duplicates
    df_pnf = df_pnf.drop_duplicates()

    return df_pnf


def add_pnf_features(df):
    """Add four price-spread columns computed from the ``pnf_*`` OHLC columns.

    Args:
        df: DataFrame with ``pnf_open``, ``pnf_close``, ``pnf_high`` and
            ``pnf_low`` columns. It is modified in place.

    Returns:
        The same DataFrame with ``pnf_O-C``, ``pnf_H-L``, ``pnf_H-O`` and
        ``pnf_L-C`` added (each the left column minus the right column).
    """
    # (new column, minuend column, subtrahend column)
    spreads = (
        ("pnf_O-C", "pnf_open", "pnf_close"),
        ("pnf_H-L", "pnf_high", "pnf_low"),
        ("pnf_H-O", "pnf_high", "pnf_open"),
        ("pnf_L-C", "pnf_low", "pnf_close"),
    )
    for target, minuend, subtrahend in spreads:
        df[target] = df[minuend] - df[subtrahend]

    # NOTE: TA-Lib indicators on the raw OHLC columns (MA, EMA, RSI, MACD,
    # Stochastic, Bollinger Bands, ADX, CCI, ROC, ATR) were previously
    # sketched here as commented-out code and are intentionally not computed.

    return df


def _rolling_rsi(series, window=14):
    """Return an RSI of *series* built from simple rolling means of gains and losses."""
    delta = series.diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    rs = gain.rolling(window=window).mean() / loss.rolling(window=window).mean()
    return 100 - 100 / (1 + rs)


def _true_range(df):
    """Return the per-bar true range from the ``high``, ``low`` and ``close`` columns."""
    prev_close = df["close"].shift()
    return pd.concat(
        [
            df["high"] - df["low"],
            (df["high"] - prev_close).abs(),
            (df["low"] - prev_close).abs(),
        ],
        axis=1,
    ).max(axis=1)


def create_features(df):
    """Add difference, lag, rolling, ratio, correlation and volume features.

    Shared intermediates (returns, RSI, ATR, 20-period SMA/STD) are computed
    once and reused. The resulting columns and their order are the same as
    when each section recomputed them.

    Args:
        df: DataFrame with ``open``, ``high``, ``low``, ``close`` and
            ``volume`` columns. It is modified in place.

    Returns:
        The same DataFrame with all feature columns added.
    """
    close, high, low, volume = df["close"], df["high"], df["low"], df["volume"]

    # Intermediates shared by several sections
    returns = close.pct_change()
    volume_change = volume.pct_change()
    rsi = _rolling_rsi(close)
    atr = _true_range(df).rolling(window=14).mean()
    sma_20 = close.rolling(window=20).mean()
    std_20 = close.rolling(window=20).std()

    ### DIFFERENCES ###

    df["price_diff"] = close.diff()
    df["op_cl_diff"] = df["open"] - close

    df["ma_5"] = close.rolling(window=5).mean()
    df["ma_10"] = close.rolling(window=10).mean()

    df["pct_change"] = returns
    df["rsi"] = rsi

    # Close differences over 1..34 bars
    for i in range(1, 35):
        df[f"diff_{i}"] = close.diff(i)

    # Bollinger Bands (20 periods, 2 standard deviations)
    df["upper_band"] = sma_20 + (2 * std_20)
    df["lower_band"] = sma_20 - (2 * std_20)

    # Williams %R
    highest_high = high.rolling(window=14).max()
    lowest_low = low.rolling(window=14).min()
    df["williams_r"] = (highest_high - close) / (highest_high - lowest_low) * -100

    # On Balance Volume
    df["obv"] = (np.sign(close.diff()) * volume).fillna(0).cumsum()

    # Commodity Channel Index on the typical price
    tp = (high + low + close) / 3
    tp_sma = tp.rolling(window=20).mean()
    # raw=True hands plain ndarrays to the lambda, avoiding a Series
    # allocation per window.
    mean_deviation = tp.rolling(window=20).apply(
        lambda x: np.mean(np.abs(x - x.mean())), raw=True
    )
    df["cci"] = (tp - tp_sma) / (0.015 * mean_deviation)

    df["atr"] = atr

    # Chaikin Money Flow (20 periods)
    money_flow_vol = ((close - low) - (high - close)) / (high - low) * volume
    df["cmf"] = (
        money_flow_vol.rolling(window=20).sum() / volume.rolling(window=20).sum()
    )

    # Percentage Price Oscillator
    ema_12 = close.ewm(span=12).mean()
    ema_26 = close.ewm(span=26).mean()
    df["ppo"] = (ema_12 - ema_26) / ema_26 * 100

    ### LAG ###

    for i in range(1, 11):
        df[f"lag_close_{i}"] = close.shift(i)

    for i in range(1, 6):
        df[f"lag_return_{i}"] = returns.shift(i)

    for i in range(1, 4):
        df[f"lag_high_{i}"] = high.shift(i)
        df[f"lag_low_{i}"] = low.shift(i)

    # Historical volatility of returns
    df["hist_volatility_10"] = returns.rolling(window=10).std()
    df["hist_volatility_20"] = returns.rolling(window=20).std()
    df["hist_volatility_30"] = returns.rolling(window=30).std()

    # Previous bar's RSI
    df["lag_rsi_1"] = rsi.shift(1)

    ### ROLLING ###

    # ma_5 / ma_10, the Bollinger Bands and rsi were already added above
    df["ma_20"] = sma_20

    df["ema_5"] = close.ewm(span=5).mean()
    df["ema_10"] = close.ewm(span=10).mean()
    df["ema_20"] = close.ewm(span=20).mean()

    # Rate of change
    df["roc_5"] = close.pct_change(periods=5)
    df["roc_10"] = close.pct_change(periods=10)

    # Rolling standard deviation
    df["std_5"] = close.rolling(window=5).std()
    df["std_10"] = close.rolling(window=10).std()

    # Same series as "atr"; both names are kept for existing consumers
    df["atr_14"] = atr

    # Keltner Channels around the 20-period SMA
    df["upper_keltner"] = sma_20 + 2 * df["atr_14"]
    df["lower_keltner"] = sma_20 - 2 * df["atr_14"]

    ### RATIO ###

    # 1. Price-to-moving-average ratios
    df["price_to_ma_5"] = close / df["ma_5"]
    df["price_to_ma_10"] = close / df["ma_10"]
    df["price_to_ma_20"] = close / df["ma_20"]

    # 2. Price-to-EMA ratios
    df["price_to_ema_5"] = close / df["ema_5"]
    df["price_to_ema_10"] = close / df["ema_10"]
    df["price_to_ema_20"] = close / df["ema_20"]

    # 3. EMA-to-moving-average ratios
    df["ema_to_ma_5"] = df["ema_5"] / df["ma_5"]
    df["ema_to_ma_10"] = df["ema_10"] / df["ma_10"]
    df["ema_to_ma_20"] = df["ema_20"] / df["ma_20"]

    # 4. EMA-to-EMA ratios
    df["ema5_to_ema10"] = df["ema_5"] / df["ema_10"]
    df["ema5_to_ema20"] = df["ema_5"] / df["ema_20"]
    df["ema10_to_ema20"] = df["ema_10"] / df["ema_20"]

    # 5. Moving-average-to-moving-average ratios
    df["ma5_to_ma10"] = df["ma_5"] / df["ma_10"]
    df["ma5_to_ma20"] = df["ma_5"] / df["ma_20"]
    df["ma10_to_ma20"] = df["ma_10"] / df["ma_20"]

    # 6. Price-to-Bollinger-Band ratios
    df["price_to_upper_band"] = close / df["upper_band"]
    df["price_to_lower_band"] = close / df["lower_band"]

    # 7. Price-to-Keltner-Channel ratios
    df["price_to_upper_keltner"] = close / df["upper_keltner"]
    df["price_to_lower_keltner"] = close / df["lower_keltner"]

    # 8./9. Price relative to the previous bar's high and low
    df["price_to_prev_high"] = close / high.shift(1)
    df["price_to_prev_low"] = close / low.shift(1)

    # 10. RSI-to-moving-average ratio
    df["rsi_to_ma_5"] = df["rsi"] / df["ma_5"]

    ### CORRELATION ###

    # Rolling correlation of returns with volume
    for window in (5, 10, 20, 50, 100):
        df[f"return_volume_corr_{window}"] = returns.rolling(window=window).corr(
            volume
        )

    # Rolling correlation of price with volume
    for window in (5, 10, 20, 50, 100):
        df[f"price_volume_corr_{window}"] = close.rolling(window=window).corr(volume)

    df["price_to_volume"] = close / volume

    # Volume Relative Strength Index
    df["vrsi"] = _rolling_rsi(volume)

    df["volume_change"] = volume_change

    # Moving averages and standard deviations of volume
    df["volume_ma_5"] = volume.rolling(window=5).mean()
    df["volume_ma_10"] = volume.rolling(window=10).mean()
    df["volume_ma_20"] = volume.rolling(window=20).mean()

    df["volume_std_5"] = volume.rolling(window=5).std()
    df["volume_std_10"] = volume.rolling(window=10).std()
    df["volume_std_20"] = volume.rolling(window=20).std()

    df["volume_to_ma_volume"] = volume / df["volume_ma_20"]

    ### EXTRA ###

    df["close_change"] = close.diff()
    df["high_pct"] = high.pct_change()
    df["close_change_roll5"] = df["close_change"].rolling(window=5).mean()

    # 5-bar smoothing of RSI and ATR
    df["RSI_14_roll5"] = rsi.rolling(window=5).mean()
    df["ATR_14_roll5"] = atr.rolling(window=5).mean()

    df["volume_roll5"] = volume.rolling(window=5).mean()
    df["high_pct_roll5"] = df["high_pct"].rolling(window=5).mean()
    df["volatility_5"] = close.rolling(window=5).std()

    df["price_ema5"] = close.ewm(span=5).mean()
    df["volume_ema5"] = volume.ewm(span=5).mean()
    df["price_to_ema5"] = close / df["price_ema5"]

    df["volume_change_roll5"] = volume_change.rolling(window=5).mean()
    df["avg_vol_last_100"] = volume.rolling(window=100).mean()

    df["turnover"] = volume * close

    return df


def timeseries_cv_score(X, y, n_splits):
    """Cross-validate a small LSTM classifier over time-ordered splits.

    For every ``TimeSeriesSplit`` fold a fresh network (one 50-unit LSTM layer
    followed by a single sigmoid unit) is trained for 10 epochs on the
    training part and scored on the test part.

    Args:
        X: Array that can be indexed with integer index arrays and whose
            ``shape[1]`` / ``shape[2]`` define the LSTM input shape
            (presumably ``(samples, timesteps, features)`` -- confirm
            against the caller).
        y: Binary labels aligned with ``X`` along the first axis.
        n_splits: Number of splits handed to ``TimeSeriesSplit``.

    Returns:
        A ``(mean_f1, mean_roc_auc)`` tuple averaged over all folds.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    fold_metrics = []  # one (f1, roc_auc) pair per fold

    for fit_idx, eval_idx in splitter.split(X):
        X_fit, y_fit = X[fit_idx], y[fit_idx]
        X_eval, y_eval = X[eval_idx], y[eval_idx]

        # Build an untrained network per fold so no weights carry over.
        timesteps, n_features = X_fit.shape[1], X_fit.shape[2]
        net = Sequential()
        net.add(LSTM(50, input_shape=(timesteps, n_features)))
        net.add(Dense(1, activation="sigmoid"))  # single unit: binary target
        net.compile(
            loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        net.fit(X_fit, y_fit, epochs=10, verbose=0)

        # Flatten the (n, 1) prediction matrix into a vector of probabilities.
        probabilities = net.predict(X_eval).ravel()

        # F1 needs hard labels (0.5 cut-off); ROC AUC uses the raw scores.
        hard_labels = (probabilities > 0.5).astype("int32")
        fold_metrics.append(
            (
                f1_score(y_eval, hard_labels),
                roc_auc_score(y_eval, probabilities),
            )
        )

    f1_values = [fold_f1 for fold_f1, _ in fold_metrics]
    auc_values = [fold_auc for _, fold_auc in fold_metrics]
    return np.mean(f1_values), np.mean(auc_values)

## Globals

In [365]:
# Lift pandas' display limits so frames are rendered with every row and column
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Time-ordered splitter reused by the cross-validation cells
tscv = TimeSeriesSplit(n_splits=3)

# Path of the raw CSV that the preprocessing cell loads
data_path = "../../../data/kc/btc/raw/kc_btc_15min.csv"

## Preprocessing

In [366]:
# Read the raw CSV and add the color_change column
df = preprocess_data(load_data(data_path))

# print(df.columns)

# Keep the raw epoch value in its own column, then index the frame by the
# parsed timestamp ('time' is assumed to be in seconds since the epoch).
df["time_unix"] = df["time"]
df["time"] = pd.to_datetime(df["time"], unit="s")
df = df.set_index("time")

# When a timestamp occurs more than once, keep only its first row
df = df[~df.index.duplicated(keep="first")]

# NOTE: X and y are only assigned in the commented-out section further down
# this cell; while it stays disabled, later cells that read X / y raise
# NameError.

# Point-and-figure frame; the trailing expression displays its last 10 rows
df_pnf = point_and_figure(df, 3)
df_pnf.tail(10)


# # Technical Analysis features
# df.ta.strategy("all")

# # Check your results and exclude as necessary.
# df.ta.strategy(fast=10, slow=50, verbose=True)


# # Heiken Ashi
# df = add_heiken_ashi_features(df)

# print("'time_unix' in df after Heiken Ashi features: ", "time_unix" in df.columns)
# print(df["time_unix"].head())
# print("Rows in df after Heiken Ashi features: ", len(df))


# ## RENKO
# # CONSIDER CHANGING multiplier to .5 or even 1
# renko_df = compute_renko(df)

# print("'time_unix' in renko_df: ", "time_unix" in renko_df.columns)
# print(renko_df["time_unix"].head())
# print("Rows in renko_df: ", len(renko_df))

# df = add_renko_features(df, renko_df)

# print("'time_unix' in df after Renko features: ", "time_unix" in df.columns)
# print(df["time_unix"].head())
# print("Rows in df after Renko features: ", len(df))

# ## KAGI ##

# kagi_df = kagi(df)
# print("'time_unix' in kagi_df: ", "time_unix" in kagi_df.columns)
# print(kagi_df["time_unix"].head())
# print("Rows in kagi_df: ", len(kagi_df))

# # Add Kagi features
# kagi_df = add_kagi_features(df, kagi_df)

# print("'time_unix' in kagi_df after Kagi features: ", "time_unix" in kagi_df.columns)
# print(kagi_df["time_unix"].head())
# print("Rows in kagi_df after Kagi features: ", len(kagi_df))

# df = df.join(kagi_df.set_index("time_unix"), on="time_unix")

# print("'time_unix' in df after joining with kagi_df: ", "time_unix" in df.columns)
# print(df["time_unix"].head())
# print("Rows in df after joining with kagi_df: ", len(df))

# ## Three Line Break ##

# df_tlb = three_line_break(df)
# direction_reversals = (
#     df_tlb["tlb_direction"].shift(1) != df_tlb["tlb_direction"]
# ).sum()
# print("The number of direction reversals:", direction_reversals)


# # Create the TLB dataframe
# tlb_df = three_line_break(df)
# print("'time_unix' in tlb_df: ", "time_unix" in tlb_df.columns)
# print(tlb_df["time_unix"].head())
# print("Rows in tlb_df: ", len(tlb_df))

# # Add TLB features
# tlb_df = add_tlb_features(tlb_df)

# print("'time_unix' in tlb_df after TLB features: ", "time_unix" in tlb_df.columns)
# print(tlb_df["time_unix"].head())
# print("Rows in tlb_df after TLB features: ", len(tlb_df))

# # Join TLB dataframe with original dataframe
# df = df.join(tlb_df.set_index("time_unix"), on="time_unix", rsuffix="_tlb")

# print("'time_unix' in df after joining with tlb_df: ", "time_unix" in df.columns)
# print(df["time_unix"].head())
# print("Rows in df after joining with tlb_df: ", len(df))


# print(df.columns)


# # Additional features
# df = create_features(df)

# # Now extract additional date information from the original dataframe
# df["minute"] = df.index.minute
# df["hour"] = df.index.hour
# df["day"] = df.index.day
# df["month"] = df.index.month

# # Sanity check. Make sure all the columns and types
# # print(df.columns)

# # Forward Fill
# df.ffill(inplace=True)

# # Backward Fill
# df.bfill(inplace=True)

# # # Finding problem features for StandardScaler
# # non_num_features = df.select_dtypes(exclude=["int32", "int64", "float32", "float64"])
# # print("Fix these features:\n")
# # for col, dtype in non_num_features.dtypes.items():
# #     print(f"{col}: {dtype}")


# # Select numeric columns which need to be scaled
# do_not_scale_columns = [
#     "time_unix",
#     "minute",
#     "hour",
#     "day",
#     "month",
# ]
# scaler = StandardScaler()
# for col in df.columns:
#     if col not in do_not_scale_columns:
#         df[col] = scaler.fit_transform(df[[col]])


# X = df.drop("color_change", axis=1)
# y = df["color_change"]

# df.tail()
# duplicate features
# duplicated_features = df.columns.duplicated()
# print("Duplicate Features: ", df.columns[duplicated_features])
# Total features
# print("Total features in DataFrame: ", df.shape[1])

Unnamed: 0_level_0,open,close,high,low,volume,color,color_change,time_unix,direction,change,pnf_open,pnf_close,pnf_high,pnf_low
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2023-03-27 09:00:00,27827.6,27866.9,27869.0,27778.3,32.308052,1,1,1679907600,1.0,39.3,27827.6,27866.9,27869.0,27778.3
2023-03-27 09:15:00,27866.9,27848.9,27873.4,27839.5,21.288271,0,1,1679908500,0.0,-18.0,27866.9,27848.9,27873.4,27839.5
2023-03-27 09:30:00,27848.7,27852.0,27867.8,27814.5,31.791354,1,1,1679909400,1.0,3.1,27848.7,27852.0,27867.8,27814.5
2023-03-27 09:45:00,27852.0,27882.2,27893.9,27838.1,21.251086,1,0,1679910300,1.0,30.2,27852.0,27882.2,27893.9,27838.1
2023-03-27 10:00:00,27882.1,27888.3,27902.9,27863.4,21.638241,1,0,1679911200,1.0,6.1,27882.1,27888.3,27902.9,27863.4
2023-03-27 10:15:00,27888.2,27839.8,27888.3,27839.7,25.172174,0,1,1679912100,0.0,-48.5,27888.2,27839.8,27888.3,27839.7
2023-03-27 10:30:00,27839.8,27887.9,27901.0,27828.4,23.995132,1,1,1679913000,1.0,48.1,27839.8,27887.9,27901.0,27828.4
2023-03-27 11:00:00,27889.6,27930.3,27930.9,27872.4,29.465377,1,1,1679914800,1.0,40.8,27889.6,27930.3,27930.9,27872.4
2023-03-27 11:30:00,27930.0,27889.2,27930.0,27889.1,18.444079,0,0,1679916600,0.0,-40.9,27930.0,27889.2,27930.0,27889.1
2023-03-27 11:45:00,27889.2,27903.4,27916.6,27874.6,14.993546,1,1,1679917500,1.0,14.2,27889.2,27903.4,27916.6,27874.6


## Univariate Feature Selection Process

In [367]:
# Standardize the features so the univariate scores are computed on a common
# scale. X_scaled is only used for scoring; the models below are fitted on the
# unscaled columns of X.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Score every feature against y with the ANOVA F-test. k="all" keeps all
# features, so this step ranks them rather than filtering them.
selector = SelectKBest(score_func=f_classif, k="all")
selector.fit(X_scaled, y)

# Positional indices of the retained features -> frame with only those columns
cols = selector.get_support(indices=True)
features_df_new = X.iloc[:, cols]

# Map each feature name to its F-score
feature_scores = dict(zip(X.columns, selector.scores_))

# Print the features from highest to lowest score
for feature_name, score in sorted(
    feature_scores.items(), key=lambda item: item[1], reverse=True
):
    print(f"{feature_name}: {score}")

# Evaluate Logistic Regression and Random Forest on features_df_new using the
# shared time-series splitter.
log_reg = LogisticRegression(random_state=42, max_iter=500)
cv_scores = cross_val_score(log_reg, features_df_new, y, cv=tscv, scoring="f1")
print(f"\nLogistic Regression CV F1 score: {np.mean(cv_scores)}")

rf = RandomForestClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(rf, features_df_new, y, cv=tscv, scoring="f1")
print(f"Random Forest CV F1 score: {np.mean(cv_scores)}")

# Reshape input to be 3D [samples, timesteps, features] with one timestep
X_array = X.values
X_reshaped = X_array.reshape((X_array.shape[0], 1, X_array.shape[1]))

# timeseries_cv_score returns (mean F1, mean ROC AUC); unpack the pair so each
# metric is reported under its own label instead of printing the whole tuple
# as the "F1 score".
mean_f1_score, mean_auc_score = timeseries_cv_score(
    X_reshaped, y.values, n_splits=5
)
print(f"\nLSTM CV F1 score: {mean_f1_score}")
print(f"LSTM CV ROC AUC score: {mean_auc_score}")

print("\n", features_df_new.columns)

NameError: name 'X' is not defined

## Base Model

In [None]:
# Baseline estimators, both seeded for reproducibility
log_reg = LogisticRegression(max_iter=500, random_state=42)
rf = RandomForestClassifier(random_state=42, n_estimators=100)

# Logistic Regression: ROC AUC on each time-series fold, then the fold average
cv_scores = cross_val_score(log_reg, X, y, scoring="roc_auc", cv=tscv)
print(f"Logistic Regression CV ROC AUC score: {np.mean(cv_scores)}")

# Random Forest: same protocol on the same folds
cv_scores = cross_val_score(rf, X, y, scoring="roc_auc", cv=tscv)
print(f"Random Forest CV ROC AUC score: {np.mean(cv_scores)}")

# Show which columns were fed to the models
print("\n", X.columns)

NameError: name 'X' is not defined

## LSTM

In [None]:
# Reshape input to be 3D [samples, timesteps, features] with one timestep
X_array = X.values
X_reshaped = X_array.reshape((X_array.shape[0], 1, X_array.shape[1]))

# timeseries_cv_score returns (mean F1, mean ROC AUC). Unpack the pair so the
# ROC AUC label is attached to the ROC AUC value rather than to the whole
# tuple.
mean_f1_score, mean_auc_score = timeseries_cv_score(
    X_reshaped, y.values, n_splits=5
)
print(f"\nLSTM CV F1 score: {mean_f1_score}")
print(f"LSTM CV ROC AUC score: {mean_auc_score}")


LSTM CV ROC AUC score: (0.6953945419128389, 0.4942367515727568)
