## Imports ##

In [2317]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import LSTM, Dense

## Functions ##

In [2318]:
# Define the model
def create_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, activation="relu", input_shape=input_shape))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

## Globals ##

In [2319]:
# # Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# List of features to drop
features_to_drop = [
    ## Set 3
    "ROC_5",
    "BBM_5_2.0_5",
    "EMA_5",
    "PP",
    "RSI_5",
    "MACDh_12_26_9",
    "BBL_15_2.0_15",
    "BBM_15_2.0_15",
    "MACDs_6_13_5_6_13_5",
    "BBB_20_2.0_20",
    "STOCHd_14_3_3",
    "TRIX_12_6",
    "PSARs_0.01_0.1",
    "STOCHk_14_3_3_10_3_3",
    "avg_vol_last_100",
    ## Set 2
    "BBP_5_2.0_5",
    "IKS_15",
    "RVI_15",
    "close",
    "obv",
    "BBU_15_2.0_15",
    "SMA_10",
    "turnover",
    "BBM_10_2.0_10",
    "ICS_15",
    "TRIXs_10_5",
    "STOCHd_14_3_3_7_3_3",
    "PSARl_0.01_0.1",
    ## Set 1
    "BBU_20_2.0_20",
    "ROC_10",
    "low",
    "volume",
    "STOCHk_14_3_3_10_3_3",
    "BBL_5_2.0_5",
    "RVI_5",
    "MACDs_12_26_9",
    "CCI_5",
    "BBM_20_2.0_20",
    "SMA_5",
    "RSI_14",
]

## Preprocessing ##

In [2320]:
# Load the data
data_path = (
    "../../../data/kc/btc/heiken_ashi/with_trade_indicators/raw/kc_btc_12min_ha_ti.csv"
)
df = pd.read_csv(data_path)

# Convert 'color' column to binary representation
df["color"] = df["color"].map({"red": 0, "green": 1})

df["color_change"] = df["color"].diff().abs()
df["color_change"].fillna(0, inplace=True)

# Fill NaNs in specified columns with 0
df["PSARl_0.01_0.1"].fillna(0, inplace=True)
df["PSARs_0.01_0.1"].fillna(0, inplace=True)
df["ICS_15"].fillna(0, inplace=True)

# Forward Fill
df.ffill(inplace=True)

# Backward Fill
df.bfill(inplace=True)

# Split the data into features and target
X = df.drop("color_change", axis=1)
y = df["color_change"]

# Create a scaler object
scaler = StandardScaler()

# Fit the scaler to your data and transform it
X_scaled = scaler.fit_transform(X)

# Prepare TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)


# Verify the changes
# print(df.head(10))

## Base Model ##

In [2321]:
# Perform cross-validation
accuracies = []

for train_index, test_index in tscv.split(X_scaled):
    # Drop the features from your dataset
    X_train = np.delete(
        X_scaled[train_index], df.columns.get_indexer(features_to_drop), axis=1
    )
    X_test = np.delete(
        X_scaled[test_index], df.columns.get_indexer(features_to_drop), axis=1
    )
    y_train, y_test = y[train_index], y[test_index]

    # Define and train the model
    base_model = LogisticRegression(max_iter=500)
    base_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = base_model.predict(X_test)

    # Compute accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Print the mean accuracy over all folds
print(f"Base model cross-validated accuracy: {np.mean(accuracies)}")

Base model cross-validated accuracy: 0.5888888888888889


## Ablation ##

In [2322]:
# # Create a dataframe to hold results
# results = pd.DataFrame(columns=["feature", "accuracy"])

# # Feature ablation
# for column in X.columns:
#     print(f"Running model without {column}")
#     X_temp = X.drop(column, axis=1)
#     X_temp = X_temp.values.reshape((X_temp.shape[0], X_temp.shape[1], 1))
#     accuracies = []  # list to hold accuracies for each split

#     # Perform TimeSeriesSplit
#     for train_index, test_index in tscv.split(X_temp):
#         X_train, X_test = X_temp[train_index], X_temp[test_index]
#         y_train, y_test = y[train_index], y[test_index]

#         # Create and fit the model
#         model = create_model((X_train.shape[1], 1))
#         model.fit(X_train, y_train, epochs=10, verbose=0)

#         # Make predictions on the test set
#         y_pred = model.predict(X_test)
#         y_pred = (y_pred > 0.5).astype(int).flatten()
#         accuracy = accuracy_score(y_test, y_pred)
#         accuracies.append(accuracy)

#     # Calculate average accuracy for this feature
#     avg_accuracy = sum(accuracies) / len(accuracies)
#     print(f"Average accuracy without {column}: {avg_accuracy}")
#     results = results.append(
#         {"feature": column, "accuracy": avg_accuracy}, ignore_index=True
#     )

#     # Save results to CSV
#     results.to_csv(f"ablation_{column}.csv", index=False)

## Utils ##

In [2323]:
## Read abalation file ##

# Load the DataFrame
df = pd.read_csv("model_accuracy_results.csv")

# Sort the DataFrame by 'accuracy' from lowest to highest
df_sorted = df.sort_values(by="accuracy", ascending=False)

# # Sample 50% of the DataFrame
# df_sampled = df_sorted.sample(frac=0.5)

# Display the sampled DataFrame
print(df_sorted)

                 feature   accuracy
64                RVI_10  76.371568
8                    obv  76.309228
84                ISB_15  76.309228
76        PSARr_0.01_0.1  76.309228
4               turnover  76.246881
30         BBL_20_2.0_20  76.246881
42         BBM_10_2.0_10  76.184541
87                ICS_15  76.184541
82            TRIXs_10_5  76.122195
36           BBL_5_2.0_5  76.122195
10                RSI_10  76.059848
46         STOCHk_14_3_3  76.059848
21                 EMA_2  76.059848
34         BBP_20_2.0_20  76.059848
80            TRIXs_12_6  75.997508
60                 CCI_5  75.997508
59                CCI_10  75.997508
85                 ITS_5  75.997508
78            TRIXs_18_9  75.935161
72                    S3  75.935161
53                ATR_10  75.935161
49   STOCHd_14_3_3_7_3_3  75.935161
48   STOCHk_14_3_3_7_3_3  75.935161
54                 ATR_5  75.872821
0                   open  75.872821
14         MACDs_12_26_9  75.810474
32         BBU_20_2.0_20  75