## Imports ##

In [1082]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam

## Functions ##

In [1083]:
def load_data(data_path):
    df = pd.read_csv(data_path)
    return df


def preprocess_data(df):
    # Convert 'color' column to binary representation
    df["color"] = df["color"].map({"red": 0, "green": 1})

    df["color_change"] = df["color"].diff().ne(0).astype(int)
    df["color_change"].fillna(0, inplace=True)

    # Fill NaNs in specified columns with 0
    df["PSARl_0.01_0.1"].fillna(0, inplace=True)
    df["PSARs_0.01_0.1"].fillna(0, inplace=True)
    df["ICS_15"].fillna(0, inplace=True)

    # duo diff features
    df["open_close"] = df["open"] - df["close"]
    df["high_low"] = df["high"] - df["low"]
    df["close_change"] = df["close"].diff()
    df["open_change"] = df["open"].diff()

    # Create difference features for best performing features
    diff_features = [
        "BBL_5_2.0_5",
        # "BBU_5_2.0_5",
        # "ATR_5",
        # "ATR_10",
        # "ATR_14",
        # "high",
        # "PSARaf_0.01_0.1",
        # "BBU_10_2.0_10",
        # "BBB_10_2.0_10",
        # "BBL_10_2.0_10",
        # "BBL_20_2.0_20",
        # "bollinger_bandwidth",
        # "PSARr_0.01_0.1",
        # "S2",
        # "S3",
        # "ISB_15",
        # "S1",
        # "PSARl_0.01_0.1",
        # "turnover",
        # "volume",
    ]

    for feature in diff_features:
        df[f"{feature}_diff"] = df[feature].diff()

    return df


def scale_data(df):
    scaler = StandardScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return df_scaled

## Globals ##

In [1084]:
# # Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Load the data
data_path = (
    "../../../data/kc/btc/heiken_ashi/with_trade_indicators/raw/kc_btc_12min_ha_ti.csv"
)

# List of features to drop
features_to_drop = [
    ## Set 3
    "ROC_5",
    "EMA_5",
    "PP",
    "RSI_5",
    "MACDh_12_26_9",
    "BBL_15_2.0_15",
    "BBM_15_2.0_15",
    "MACDs_6_13_5_6_13_5",
    "BBB_20_2.0_20",
    "STOCHd_14_3_3",
    "TRIX_12_6",
    "STOCHk_14_3_3_10_3_3",
    ## Set 2
    "BBP_5_2.0_5",
    "RVI_15",
    "obv",
    "BBU_15_2.0_15",
    "SMA_10",
    "BBM_10_2.0_10",
    "ICS_15",
    "TRIXs_10_5",
    "STOCHd_14_3_3_7_3_3",
    "PSARs_0.01_0.1",
    ## Set 1
    "BBU_20_2.0_20",
    "ROC_10",
    "STOCHk_14_3_3_10_3_3",
    "MACDs_12_26_9",
    "CCI_5",
    "BBM_20_2.0_20",
    "SMA_5",
    "RSI_14",
    ## Set 0
    "low",
    "volume",
    "close",
    "avg_vol_last_100",
]

## Preprocessing

In [1085]:
df = load_data(data_path)

# Preprocess the data
df = preprocess_data(df)

# Drop the specified features
df = df.drop(columns=features_to_drop, errors="ignore")

# Prepare TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)

## Scale Data

In [1086]:
# Forward Fill
df.ffill(inplace=True)

# Backward Fill
df.bfill(inplace=True)

# Re-scale the data to include the new feature
df_scaled = scale_data(df)

# print(df.isna().sum())

## DIFF Ablation ##

In [1087]:
# # Initial DataFrame for scaling
# df_initial = df.copy()

# # List of all possible features
# possible_features = list(df.columns)

# # We want to ignore the target column 'color_change'
# possible_features.remove("color_change")

# # A dictionary to store each feature and its associated score
# feature_scores = {}

# for feature in possible_features:
#     # Reset the DataFrame to initial state
#     df = df_initial.copy()

#     # Add the difference feature
#     df[f"{feature}_diff"] = df[feature].diff()

#     # Forward Fill
#     df.ffill(inplace=True)

#     # Backward Fill
#     df.bfill(inplace=True)

#     # Keep a copy of the target variable
#     color_change = df["color_change"]

#     # Re-scale the data to include the new feature (excluding the target variable)
#     df_scaled = scale_data(df.drop("color_change", axis=1))

#     # Add the target variable back to the DataFrame
#     df_scaled["color_change"] = color_change

#     # Prepare features and target
#     X = df_scaled.drop("color_change", axis=1)
#     y = df_scaled["color_change"]

#     # Logistic Regression
#     log_reg = LogisticRegression(random_state=42, max_iter=500)

#     # Cross-validation
#     cv_scores = cross_val_score(log_reg, X, y, cv=tscv, scoring="roc_auc")

#     # Add the score to the dictionary, using the feature name as the key
#     feature_scores[feature] = np.mean(cv_scores)

# # Now, you can print out the features and their scores, sorted by the score
# for feature, score in sorted(
#     feature_scores.items(), key=lambda item: item[1], reverse=True
# ):
#     print(f"{feature}: {score}")

## Base Model ##

In [1088]:
# Prepare your features and target
X = df.drop("color_change", axis=1)
y = df["color_change"]

# Logistic Regression
log_reg = LogisticRegression(random_state=42, max_iter=500)

# Cross-validation
cv_scores = cross_val_score(log_reg, X, y, cv=tscv, scoring="roc_auc")

print(f"Logistic Regression CV ROC AUC score: {np.mean(cv_scores)}")

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-validation
cv_scores = cross_val_score(rf, X, y, cv=tscv, scoring="roc_auc")

print(f"Random Forest CV ROC AUC score: {np.mean(cv_scores)}")

print("\n", X.columns)

Logistic Regression CV ROC AUC score: 0.5547205360485506
Random Forest CV ROC AUC score: 0.8492430642567098

 Index(['time', 'open', 'high', 'turnover', 'color', 'R1', 'S1', 'R2', 'S2',
       'R3', 'S3', 'RSI_10', 'ATR_14', 'ATR_10', 'ATR_5', 'ROC_14', 'CCI_14',
       'CCI_10', 'cmf', 'mfi', 'RVI_10', 'RVI_5', 'PSARl_0.01_0.1',
       'PSARaf_0.01_0.1', 'PSARr_0.01_0.1', 'TRIX_18_9', 'TRIXs_18_9',
       'TRIXs_12_6', 'TRIX_10_5', 'SMA_20', 'EMA_2', 'EMA_10', 'ISA_5',
       'ISB_15', 'ITS_5', 'IKS_15', 'BBL_5_2.0_5', 'BBM_5_2.0_5',
       'BBU_5_2.0_5', 'BBB_5_2.0_5', 'BBL_10_2.0_10', 'BBU_10_2.0_10',
       'BBB_10_2.0_10', 'BBP_10_2.0_10', 'BBB_15_2.0_15', 'BBP_15_2.0_15',
       'BBL_20_2.0_20', 'BBP_20_2.0_20', 'bollinger_bandwidth', 'MACD_12_26_9',
       'MACD_6_13_5_6_13_5', 'MACDh_6_13_5_6_13_5', 'STOCHk_14_3_3',
       'STOCHk_14_3_3_7_3_3', 'STOCHd_14_3_3_10_3_3', 'open_close', 'high_low',
       'close_change', 'open_change', 'BBL_5_2.0_5_diff', 'BBB_5_2.0_5_diff'],
     

## Feature Engineering ##

## Feature Importance ##

## Correlation ##

In [1089]:
# # Compute pairwise correlation of columns
# corr = df.corr()

# # Set a lower threshold for correlations you want to display
# threshold = -0.6

# # Extract feature pairs with correlations below the threshold
# filtered_correlations = corr.stack().reset_index()
# filtered_correlations.columns = ["Feature 1", "Feature 2", "Correlation"]
# filtered_correlations = filtered_correlations[
#     (filtered_correlations["Feature 1"] != filtered_correlations["Feature 2"])
#     & (filtered_correlations["Correlation"] <= threshold)
# ]

# # Sort the filtered correlations by value and remove duplicates
# filtered_correlations = filtered_correlations.sort_values(
#     by="Correlation", ascending=True
# ).drop_duplicates()
# print(filtered_correlations)

## Utils ##

In [1090]:
# ## Read ablation file ##

# # Load the DataFrame
# df = pd.read_csv("model_accuracy_results.csv")

# # Sort the DataFrame by 'accuracy' from lowest to highest
# df_sorted = df.sort_values(by="accuracy", ascending=False)

# # # Sample 50% of the DataFrame
# # df_sampled = df_sorted.sample(frac=0.5)

# # Display the sampled DataFrame
# print(df_sorted)

## Ablation ##