## Imports ##

In [385]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_ta as pta
import seaborn as sns
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

## Functions ##

In [386]:
def load_data(data_path):
    """Read a CSV file into a DataFrame.

    Args:
        data_path: Location of the CSV; handed unchanged to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed file, using ``read_csv`` defaults.
    """
    return pd.read_csv(data_path)


def preprocess_data(df, window=20):
    """Encode the candle colour, build the target and add engineered features.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame holding at least the columns ``color``,
            ``PSARl_0.01_0.1``, ``PSARs_0.01_0.1``, ``ICS_15``, ``close``,
            ``high``, ``volume``, ``RSI_14`` and ``ATR_14``.
        window: Look-back length used for every rolling statistic and as the
            span of every exponential moving average. Defaults to 20, the
            value that was previously hard-coded.

    Returns:
        pandas.DataFrame: ``df`` itself, with ``color`` encoded as 0/1 and the
        ``color_change`` target plus the engineered columns added.

    Note:
        The ``*_roll5`` / ``*_ema5`` / ``volatility_5`` column names are kept
        for compatibility with the rest of the notebook; the "5" does not
        reflect the window actually used (``window``, 20 by default).
    """
    # "red" -> 0, "green" -> 1; any other value is mapped to NaN by Series.map.
    df["color"] = df["color"].map({"red": 0, "green": 1})

    # Target: 1 when the colour differs from the previous row's colour.
    # The diff is undefined (NaN) for the first row, which has no predecessor;
    # it must be zeroed *before* the comparison, otherwise NaN != 0 evaluates
    # to True and the first row is wrongly flagged as a colour change.
    df["color_change"] = df["color"].diff().fillna(0).ne(0).astype(int)

    # Missing indicator values are replaced with 0. Assign the result back
    # rather than calling fillna(inplace=True) on a column selection, which is
    # chained assignment and is not guaranteed to update the parent frame.
    for column in ("PSARl_0.01_0.1", "PSARs_0.01_0.1", "ICS_15"):
        df[column] = df[column].fillna(0)

    # Row-over-row change features.
    df["close_change"] = df["close"].diff()
    df["high_pct"] = df["high"].pct_change()

    # Rolling-window statistics (NaN until `window` observations are available).
    df["close_change_roll5"] = df["close_change"].rolling(window).mean()
    df["RSI_14_roll5"] = df["RSI_14"].rolling(window).mean()
    df["ATR_14_roll5"] = df["ATR_14"].rolling(window).mean()
    df["volume_roll5"] = df["volume"].rolling(window).mean()
    df["high_pct_roll5"] = df["high_pct"].rolling(window).mean()
    df["volatility_5"] = df["close"].rolling(window).std()

    # Exponentially weighted averages and the close's relative distance from its EMA.
    df["price_ema5"] = df["close"].ewm(span=window).mean()
    df["volume_ema5"] = df["volume"].ewm(span=window).mean()
    df["price_to_ema5"] = df["close"] / df["price_ema5"] - 1
    df["volume_change_roll5"] = df["volume"].pct_change().rolling(window).mean()

    return df


def scale_data(df):
    """Standardise every column of ``df`` with a freshly fitted ``StandardScaler``.

    Args:
        df: DataFrame whose columns are all to be scaled.

    Returns:
        pandas.DataFrame: A new frame of scaled values carrying the same
        column labels and the same index as ``df``.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df)
    # Re-attach both axes' labels to the scaled values. Without `index=` the
    # result would get a fresh RangeIndex and no longer align with `df` when
    # `df` carries any other index (e.g. after filtering rows).
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

## Globals ##

In [387]:
# Lift pandas' display limits so frames are printed with every row and column.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

# Location of the input CSV, relative to the notebook's working directory.
data_path = (
    "../../../data/kc/btc/heiken_ashi/with_trade_indicators/raw/"
    "kc_btc_12min_ha_ti.csv"
)

# Columns removed from the frame after preprocessing.
# The original heading labels these "Univariate Feature Selection Features".
features_to_drop = [
    "cmf",
    "MACDs_6_13_5_6_13_5",
    "RVI_10",
    "BBL_5_2.0_5",
    "STOCHk_14_3_3",
    "STOCHk_14_3_3_7_3_3",
    "STOCHk_14_3_3_10_3_3",
    "MACDh_6_13_5_6_13_5",
    "MACDh_12_26_9",
    "BBL_10_2.0_10",
    "R3",
    "R2",
    "R1",
    "open",
    "BBL_15_2.0_15",
    "SMA_5",
    "BBM_5_2.0_5",
    "PP",
    "BBL_20_2.0_20",
    "EMA_5",
    "ITS_5",
    "SMA_10",
    "BBM_10_2.0_10",
    "EMA_10",
    "S1",
    "SMA_20",
    "EMA_2",
    "BBM_15_2.0_15",
    "S2",
    "BBM_20_2.0_20",
    "IKS_15",
    "close",
    "PSARaf_0.01_0.1",
    "S3",
    "MACD_6_13_5_6_13_5",
    "ISA_5",
    "ISB_15",
    "BBU_15_2.0_15",
    "BBU_10_2.0_10",
    "BBB_15_2.0_15",
    "obv",
    "BBU_5_2.0_5",
    "high",
    "STOCHd_14_3_3",
    "STOCHd_14_3_3_7_3_3",
    "STOCHd_14_3_3_10_3_3",
    "TRIXs_10_5",
    "MACD_12_26_9",
    "PSARs_0.01_0.1",
    "TRIXs_18_9",
    "ICS_15",
    "ROC_10",
    "BBB_20_2.0_20",
    "bollinger_bandwidth",
    "RVI_15",
    "MACDs_12_26_9",
    "PSARl_0.01_0.1",
    "TRIX_18_9",
    "TRIXs_12_6",
    "ROC_5",
    "BBU_20_2.0_20",
    "low",
    "TRIX_12_6",
    "ROC_14",
]

## Preprocessing

In [388]:
# Load the CSV, engineer the features, then discard the unwanted columns.
# errors="ignore" skips any name in features_to_drop that is not a column of
# the frame instead of raising.
df = preprocess_data(load_data(data_path)).drop(
    columns=features_to_drop, errors="ignore"
)

# Time-ordered 3-fold splitter shared by every cross-validation below.
tscv = TimeSeriesSplit(n_splits=3)

## Fill Missing Values and Scale Data ##

In [389]:
# Propagate the last known value forward, then back-fill whatever is still
# missing at the head of the frame (e.g. the warm-up rows of rolling features).
# NOTE(review): back-filling copies later values into earlier rows; confirm
# this look-ahead is acceptable for the leading rows of a time series.
df = df.ffill().bfill()

# Standardised copy of the whole frame (target column included).
# NOTE(review): df_scaled is not referenced by the later cells visible in this
# notebook — confirm whether it is still needed.
df_scaled = scale_data(df)

# print(df.isna().sum())

## Ablation ##

In [390]:
# # Define initial features for ablation process
# ablation_features = [
#     "cmf",
#     "MACDs_6_13_5_6_13_5",
#     "RVI_10",
#     "BBL_5_2.0_5",
#     "STOCHk_14_3_3",
#     "STOCHk_14_3_3_7_3_3",
#     "STOCHk_14_3_3_10_3_3",
#     "MACDh_6_13_5_6_13_5",
#     "MACDh_12_26_9",
#     "BBL_10_2.0_10",
#     "R3",
#     "R2",
#     "o_shift",
#     "R1",
#     "open",
#     "BBL_15_2.0_15",
#     "SMA_5",
#     "BBM_5_2.0_5",
#     "c_shift",
#     "PP",
#     "BBL_20_2.0_20",
#     "EMA_5",
#     "ITS_5",
#     "SMA_10",
#     "BBM_10_2.0_10",
#     "EMA_10",
#     "S1",
#     "SMA_20",
#     "EMA_2",
#     "BBM_15_2.0_15",
#     "S2",
#     "BBM_20_2.0_20",
#     "IKS_15",
#     "close",
#     "PSARaf_0.01_0.1",
#     "S3",
#     "MACD_6_13_5_6_13_5",
#     "ISA_5",
#     "ISB_15",
#     "BBU_15_2.0_15",
#     "BBU_10_2.0_10",
#     "BBB_15_2.0_15",
#     "obv",
#     "BBU_5_2.0_5",
#     "high",
#     "STOCHd_14_3_3",
#     "STOCHd_14_3_3_7_3_3",
#     "STOCHd_14_3_3_10_3_3",
#     "TRIXs_10_5",
#     "MACD_12_26_9",
#     "PSARs_0.01_0.1",
#     "TRIXs_18_9",
#     "ICS_15",
#     "ROC_10",
#     "BBB_20_2.0_20",
#     "bollinger_bandwidth",
#     "RVI_15",
#     "MACDs_12_26_9",
#     "PSARl_0.01_0.1",
#     "TRIX_18_9",
#     "volume_diff_lag",
#     "TRIXs_12_6",
#     "ROC_5",
#     "BBU_20_2.0_20",
#     "low",
#     "TRIX_12_6",
#     "ROC_14",
# ]

# # All the features in the data
# all_features = list(df.columns)
# all_features.remove("color_change")  # we remove the target variable

# log_reg_scores = {}
# rf_scores = {}

# for feature in ablation_features:
#     # Remove one feature from the all features list
#     current_features = [f for f in all_features if f != feature]

#     # Prepare your features and target
#     X = df[current_features]
#     y = df["color_change"]

#     # Logistic Regression
#     log_reg = LogisticRegression(random_state=42, max_iter=500)

#     # Cross-validation
#     cv_scores = cross_val_score(log_reg, X, y, cv=tscv, scoring="roc_auc")
#     log_reg_scores[feature] = np.mean(cv_scores)

# # Print logistic regression scores sorted by score
# print("Logistic Regression Scores")
# for feature, score in sorted(
#     log_reg_scores.items(), key=lambda item: item[1], reverse=True
# ):
#     print(f"CV ROC AUC score without {feature}: {score}")

# for feature in ablation_features:
#     # Remove one feature from the all features list
#     current_features = [f for f in all_features if f != feature]

#     # Prepare your features and target
#     X = df[current_features]
#     y = df["color_change"]

#     # Random Forest
#     rf = RandomForestClassifier(n_estimators=100, random_state=42)

#     # Cross-validation
#     cv_scores = cross_val_score(rf, X, y, cv=tscv, scoring="roc_auc")
#     rf_scores[feature] = np.mean(cv_scores)

# # Print random forest scores sorted by score
# print("\nRandom Forest Scores")
# for feature, score in sorted(rf_scores.items(), key=lambda item: item[1], reverse=True):
#     print(f"CV ROC AUC score without {feature}: {score}")

## Reverse Ablation ##

In [391]:
# # Define initial features for ablation process
# ablation_features = [
#     "cmf",
#     "MACDs_6_13_5_6_13_5",
#     "RVI_10",
#     "BBL_5_2.0_5",
#     "STOCHk_14_3_3",
#     "STOCHk_14_3_3_7_3_3",
#     "STOCHk_14_3_3_10_3_3",
#     "MACDh_6_13_5_6_13_5",
#     "MACDh_12_26_9",
#     "BBL_10_2.0_10",
#     "R3",
#     "R2",
#     "o_shift",
#     "R1",
#     "open",
#     "BBL_15_2.0_15",
#     "SMA_5",
#     "BBM_5_2.0_5",
#     "c_shift",
#     "PP",
#     "BBL_20_2.0_20",
#     "EMA_5",
#     "ITS_5",
#     "SMA_10",
#     "BBM_10_2.0_10",
#     "EMA_10",
#     "S1",
#     "SMA_20",
#     "EMA_2",
#     "BBM_15_2.0_15",
#     "S2",
#     "BBM_20_2.0_20",
#     "IKS_15",
#     "close",
#     "PSARaf_0.01_0.1",
#     "S3",
#     "MACD_6_13_5_6_13_5",
#     "ISA_5",
#     "ISB_15",
#     "BBU_15_2.0_15",
#     "BBU_10_2.0_10",
#     "BBB_15_2.0_15",
#     "obv",
#     "BBU_5_2.0_5",
#     "high",
#     "STOCHd_14_3_3",
#     "STOCHd_14_3_3_7_3_3",
#     "STOCHd_14_3_3_10_3_3",
#     "TRIXs_10_5",
#     "MACD_12_26_9",
#     "PSARs_0.01_0.1",
#     "TRIXs_18_9",
#     "ICS_15",
#     "ROC_10",
#     "BBB_20_2.0_20",
#     "bollinger_bandwidth",
#     "RVI_15",
#     "MACDs_12_26_9",
#     "PSARl_0.01_0.1",
#     "TRIX_18_9",
#     "volume_diff_lag",
#     "TRIXs_12_6",
#     "ROC_5",
#     "BBU_20_2.0_20",
#     "low",
#     "TRIX_12_6",
#     "ROC_14",
# ]

# # All the features in the data
# all_features = list(df.columns)
# all_features.remove("color_change")  # we remove the target variable

# # Base features are all features excluding the ones in the ablation list
# base_features = [
#     feature for feature in all_features if feature not in ablation_features
# ]

# log_reg_scores = {}
# rf_scores = {}

# for feature in ablation_features:
#     # Add one feature from the ablation list
#     current_features = base_features + [feature]

#     # Prepare your features and target
#     X = df[current_features]
#     y = df["color_change"]

#     # Logistic Regression
#     log_reg = LogisticRegression(random_state=42, max_iter=500)

#     # Cross-validation
#     cv_scores = cross_val_score(log_reg, X, y, cv=tscv, scoring="roc_auc")
#     log_reg_scores[feature] = np.mean(cv_scores)

# # Print logistic regression scores sorted by score
# print("Logistic Regression Scores")
# for feature, score in sorted(
#     log_reg_scores.items(), key=lambda item: item[1], reverse=True
# ):
#     print(f"CV ROC AUC score with {feature}: {score}")

# for feature in ablation_features:
#     # Add one feature from the ablation list
#     current_features = base_features + [feature]

#     # Prepare your features and target
#     X = df[current_features]
#     y = df["color_change"]

#     # Random Forest
#     rf = RandomForestClassifier(n_estimators=100, random_state=42)

#     # Cross-validation
#     cv_scores = cross_val_score(rf, X, y, cv=tscv, scoring="roc_auc")
#     rf_scores[feature] = np.mean(cv_scores)

# # Print random forest scores sorted by score
# print("\nRandom Forest Scores")
# for feature, score in sorted(rf_scores.items(), key=lambda item: item[1], reverse=True):
#     print(f"CV ROC AUC score with {feature}: {score}")

## Feature Importance ##

In [392]:
# X = df.drop("color_change", axis=1)
# y = df["color_change"]

# # Re-scale the data to include the new feature
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Fit XGBoost model and get feature importances
# xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")
# xgb.fit(X_scaled, y)

# # Store the feature importances in a pandas series, then sort it in descending order
# importances = pd.Series(xgb.feature_importances_, index=X.columns)
# importances_sorted = importances.sort_values(ascending=False)

# print("Feature importances:")
# print(importances_sorted)

# # Now we can apply Logistic Regression and Random Forests using the features
# # Logistic Regression
# log_reg = LogisticRegression(random_state=42, max_iter=500)

# # Cross-validation
# cv_scores = cross_val_score(log_reg, X_scaled, y, cv=tscv, scoring="f1")

# print(f"\nLogistic Regression CV F1 score: {np.mean(cv_scores)}")

# # Random Forest
# rf = RandomForestClassifier(n_estimators=100, random_state=42)

# # Cross-validation
# cv_scores = cross_val_score(rf, X_scaled, y, cv=tscv, scoring="f1")

# print(f"Random Forest CV F1 score: {np.mean(cv_scores)}")

## Recursive Feature Elimination ##

In [393]:
# X = df.drop("color_change", axis=1)
# y = df["color_change"]

# # Re-scale the data to include the new feature
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Recursive Feature Elimination
# # Here we use Logistic Regression as the model to evaluate the feature importance
# # You can replace it with any model you prefer
# model = LogisticRegression(random_state=42, max_iter=500)
# rfe = RFECV(model)
# rfe.fit(X_scaled, y)

# # Get the features ranking
# feature_ranking = {
#     feature_name: rank for feature_name, rank in zip(X.columns, rfe.ranking_)
# }

# # Sort and print the feature ranking
# for feature_name, rank in sorted(feature_ranking.items(), key=lambda item: item[1]):
#     print(f"{feature_name}: {rank}")

# # Transform X to include only the selected features
# X_transformed = rfe.transform(X_scaled)

# # Logistic Regression with transformed features
# log_reg = LogisticRegression(random_state=42, max_iter=500)

# # Cross-validation
# cv_scores = cross_val_score(log_reg, X_transformed, y, cv=tscv, scoring="f1")

# print(f"\nLogistic Regression CV F1 score: {np.mean(cv_scores)}")

# # Random Forest with transformed features
# rf = RandomForestClassifier(n_estimators=100, random_state=42)

# # Cross-validation
# cv_scores = cross_val_score(rf, X_transformed, y, cv=tscv, scoring="f1")

# print(f"Random Forest CV F1 score: {np.mean(cv_scores)}")

## Univariate Feature Selection Process ##

In [394]:
# Features are every column except the target.
X = df.drop("color_change", axis=1)
y = df["color_change"]

# Standardised copy of the features, used only for the univariate scoring below.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Score each feature against the target with the ANOVA F-test.
# k="all" keeps every column, so this step ranks features rather than pruning.
selector = SelectKBest(score_func=f_classif, k="all")
selector.fit(X_scaled, y)

# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
features_df_new = X.iloc[:, cols]

# Store the scores of each feature in a dictionary
feature_scores = dict(zip(X.columns, selector.scores_))

# Print the scores from highest to lowest.
for feature_name, score in sorted(
    feature_scores.items(), key=lambda item: item[1], reverse=True
):
    print(f"{feature_name}: {score}")

# Logistic Regression
# features_df_new holds the *unscaled* columns, and logistic regression is
# sensitive to feature scale. Wrapping the model in a pipeline standardises the
# inputs and refits the scaler on the training part of every CV fold, so no
# statistics from the held-out part leak into training.
log_reg = make_pipeline(
    StandardScaler(), LogisticRegression(random_state=42, max_iter=500)
)

# Cross-validation
cv_scores = cross_val_score(log_reg, features_df_new, y, cv=tscv, scoring="f1")

print(f"\nLogistic Regression CV F1 score: {np.mean(cv_scores)}")

# Random Forest (tree splits do not depend on feature scale, so no scaler is added)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-validation
cv_scores = cross_val_score(rf, features_df_new, y, cv=tscv, scoring="f1")

print(f"Random Forest CV F1 score: {np.mean(cv_scores)}")

print("\n", features_df_new.columns)

high_pct: 110.94480524940894
volume: 91.92131389069183
turnover: 89.7339138964675
BBB_5_2.0_5: 40.51976375637065
PSARr_0.01_0.1: 33.222447998121424
ATR_5: 8.09442137040173
volume_ema5: 7.443922940784361
BBP_15_2.0_15: 6.265493240326674
CCI_14: 6.182127667059803
CCI_10: 5.536539624921639
close_change: 5.264615289857336
BBP_10_2.0_10: 4.8425351451218335
volume_change_roll5: 4.467484190456277
BBP_20_2.0_20: 4.391442577137113
BBB_10_2.0_10: 3.449706673320239
volume_roll5: 3.405378790575545
ATR_10: 3.3469011580871424
mfi: 2.7913408994283717
color: 2.650632704450662
ATR_14: 2.5343458390778144
BBP_5_2.0_5: 2.5087912415826117
CCI_5: 2.4255515533899725
avg_vol_last_100: 2.179824307137989
RSI_5: 1.8759432547309074
RVI_5: 1.770072600613789
RSI_10: 1.700735361160013
RSI_14: 1.3811647003530192
time: 1.285269957412531
high_pct_roll5: 1.2662544938393032
ATR_14_roll5: 0.8521782525282505
price_ema5: 0.31805785232331435
TRIX_10_5: 0.31578147717879135
RSI_14_roll5: 0.3015932283688058
close_change_roll5: 

## Base Model ##

In [395]:
# Prepare your features and target
X = df.drop("color_change", axis=1)
y = df["color_change"]

# Logistic Regression
log_reg = LogisticRegression(random_state=42, max_iter=500)

# Cross-validation
cv_scores = cross_val_score(log_reg, X, y, cv=tscv, scoring="roc_auc")

print(f"Logistic Regression CV ROC AUC score: {np.mean(cv_scores)}")

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-validation
cv_scores = cross_val_score(rf, X, y, cv=tscv, scoring="roc_auc")

print(f"Random Forest CV ROC AUC score: {np.mean(cv_scores)}")

print("\n", X.columns)

Logistic Regression CV ROC AUC score: 0.5547196931554365
Random Forest CV ROC AUC score: 0.8702600316865063

 Index(['time', 'volume', 'turnover', 'color', 'avg_vol_last_100', 'RSI_5',
       'RSI_10', 'RSI_14', 'ATR_14', 'ATR_10', 'ATR_5', 'CCI_14', 'CCI_10',
       'CCI_5', 'mfi', 'RVI_5', 'PSARr_0.01_0.1', 'TRIX_10_5', 'BBB_5_2.0_5',
       'BBP_5_2.0_5', 'BBB_10_2.0_10', 'BBP_10_2.0_10', 'BBP_15_2.0_15',
       'BBP_20_2.0_20', 'close_change', 'high_pct', 'close_change_roll5',
       'RSI_14_roll5', 'ATR_14_roll5', 'volume_roll5', 'high_pct_roll5',
       'volatility_5', 'price_ema5', 'volume_ema5', 'price_to_ema5',
       'volume_change_roll5'],
      dtype='object')
