## Imports ##

In [215]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import LSTM, Dense

## Functions ##

In [216]:
def load_data(data_path):
    df = pd.read_csv(data_path)
    return df


def preprocess_data(df):
    # Convert 'color' column to binary representation
    df["color"] = df["color"].map({"red": 0, "green": 1})

    df["color_change"] = df["color"].diff().ne(0).astype(int)
    df["color_change"].fillna(0, inplace=True)

    # Fill NaNs in specified columns with 0
    df["PSARl_0.01_0.1"].fillna(0, inplace=True)
    df["PSARs_0.01_0.1"].fillna(0, inplace=True)
    df["ICS_15"].fillna(0, inplace=True)

    # Forward Fill
    df.ffill(inplace=True)

    # Backward Fill
    df.bfill(inplace=True)

    return df


def scale_data(df):
    scaler = StandardScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return df_scaled


def create_features(df):
    df["BBP_diff"] = df["BBP_10_2.0_10"] - df["BBP_5_2.0_5"]
    df["CCI_diff"] = df["CCI_10"] - df["CCI_5"]
    df["RVI_diff"] = df["RVI_10"] - df["RVI_5"]
    df["RSI_diff"] = df["RSI_14"] - df["RSI_5"]

    return df


def prepare_for_lstm(df, target_column, time_steps):
    # Check if target_column is in df
    if target_column not in df.columns:
        print(f"Target column '{target_column}' not found in DataFrame.")
        return None, None

    # Drop the target column from X
    X = df.drop(target_column, axis=1).values

    # Create y from the target column
    y = df[target_column].values

    # Reshape X to (samples, time_steps, features)
    X = np.array([X[i - time_steps : i] for i in range(time_steps, len(X))])

    # Reshape y to (samples, )
    y = y[time_steps:]

    return X, y


def feature_importance(X, y):
    # Use RandomForestClassifier to evaluate feature importance
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(X, y)

    importances = model.feature_importances_
    return importances


# Define the model
def create_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, activation="relu", input_shape=input_shape))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

## Globals ##

In [217]:
# # Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Load the data
data_path = (
    "../../../data/kc/btc/heiken_ashi/with_trade_indicators/raw/kc_btc_12min_ha_ti.csv"
)

# List of features to drop
features_to_drop = [
    ## Set 3
    "ROC_5",
    "EMA_5",
    "PP",
    "RSI_5",
    "MACDh_12_26_9",
    "BBL_15_2.0_15",
    "BBM_15_2.0_15",
    "MACDs_6_13_5_6_13_5",
    "BBB_20_2.0_20",
    "STOCHd_14_3_3",
    "TRIX_12_6",
    "STOCHk_14_3_3_10_3_3",
    "avg_vol_last_100",
    ## Set 2
    "BBP_5_2.0_5",
    "RVI_15",
    "close",
    "obv",
    "BBU_15_2.0_15",
    "SMA_10",
    "turnover",
    "BBM_10_2.0_10",
    "ICS_15",
    "TRIXs_10_5",
    "STOCHd_14_3_3_7_3_3",
    "PSARs_0.01_0.1",
    ## Set 1
    "BBU_20_2.0_20",
    "ROC_10",
    "low",
    "volume",
    "STOCHk_14_3_3_10_3_3",
    "MACDs_12_26_9",
    "CCI_5",
    "BBM_20_2.0_20",
    "SMA_5",
    "RSI_14",
]

## Preprocessing ##

In [218]:
df = load_data(data_path)

# Preprocess the data
df = preprocess_data(df)

# Create features (if you have the function)
df = create_features(df)

# Scale the data
df_scaled = scale_data(df)

# Prepare data for LSTM
time_steps = 60  # choose based on your needs
X_lstm, y_lstm = prepare_for_lstm(df_scaled, "color_change", time_steps)

# Prepare TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)

## Base Model ##

In [220]:
# Drop the unnecessary columns before scaling
X = df.drop("color_change", axis=1)
X = X.drop(columns=features_to_drop)

# Scale the data
X_scaled = scale_data(X)
X_scaled = X_scaled.values  # Convert DataFrame to numpy array

# Perform cross-validation
accuracies = []

for train_index, test_index in tscv.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Define and train the model
    base_model = LogisticRegression(max_iter=500)
    base_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = base_model.predict(X_test)

    # Compute accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Print the mean accuracy over all folds
print(f"Base model cross-validated accuracy: {np.mean(accuracies)}")

Base model cross-validated accuracy: 0.5793650793650793


## Feature Importance ##

In [None]:
# Prepare your data for Random Forest
# For example, use the first 60 days as features and the color change of the 61st day as the target.
X_rf = df_scaled.iloc[:-1]  # All but the last row
y_rf = df["color_change"].iloc[1:]  # All but the first row

importances = feature_importance(X_rf, y_rf)

# Output the feature importance in descending order
features = df.columns
importance_df = pd.DataFrame({"Feature": features, "Importance": importances})
importance_df = importance_df.sort_values("Importance", ascending=False)
print(importance_df)

## Correlation ##

In [221]:
# Compute pairwise correlation of columns
corr = df.corr()

# Set a lower threshold for correlations you want to display
threshold = -0.6

# Extract feature pairs with correlations below the threshold
filtered_correlations = corr.stack().reset_index()
filtered_correlations.columns = ["Feature 1", "Feature 2", "Correlation"]
filtered_correlations = filtered_correlations[
    (filtered_correlations["Feature 1"] != filtered_correlations["Feature 2"])
    & (filtered_correlations["Correlation"] <= threshold)
]

# Sort the filtered correlations by value and remove duplicates
filtered_correlations = filtered_correlations.sort_values(
    by="Correlation", ascending=True
).drop_duplicates()
print(filtered_correlations)

                 Feature 1             Feature 2  Correlation
3289        PSARs_0.01_0.1        PSARl_0.01_0.1    -0.968489
3197        PSARl_0.01_0.1        PSARs_0.01_0.1    -0.968489
6044         BBP_10_2.0_10              RSI_diff    -0.851724
8620              RSI_diff         BBP_10_2.0_10    -0.851724
8582              RSI_diff                CCI_10    -0.836207
2510                CCI_10              RSI_diff    -0.836207
8583              RSI_diff                 CCI_5    -0.815109
2603                 CCI_5              RSI_diff    -0.815109
5579           BBP_5_2.0_5              RSI_diff    -0.811188
8615              RSI_diff           BBP_5_2.0_5    -0.811188
1580                 RSI_5              RSI_diff    -0.795455
8572              RSI_diff                 RSI_5    -0.795455
8496              RVI_diff                 RVI_5    -0.771021
3160                 RVI_5              RVI_diff    -0.771021
3161                 RVI_5              RSI_diff    -0.760581
8589    

## Ablation ##

In [222]:
# # Create a dataframe to hold results
# results = pd.DataFrame(columns=["feature", "accuracy"])

# # Feature ablation
# for column in X.columns:
#     print(f"Running model without {column}")
#     X_temp = X.drop(column, axis=1)
#     X_temp = X_temp.values.reshape((X_temp.shape[0], X_temp.shape[1], 1))
#     accuracies = []  # list to hold accuracies for each split

#     # Perform TimeSeriesSplit
#     for train_index, test_index in tscv.split(X_temp):
#         X_train, X_test = X_temp[train_index], X_temp[test_index]
#         y_train, y_test = y[train_index], y[test_index]

#         # Create and fit the model
#         model = create_model((X_train.shape[1], 1))
#         model.fit(X_train, y_train, epochs=10, verbose=0)

#         # Make predictions on the test set
#         y_pred = model.predict(X_test)
#         y_pred = (y_pred > 0.5).astype(int).flatten()
#         accuracy = accuracy_score(y_test, y_pred)
#         accuracies.append(accuracy)

#     # Calculate average accuracy for this feature
#     avg_accuracy = sum(accuracies) / len(accuracies)
#     print(f"Average accuracy without {column}: {avg_accuracy}")
#     results = results.append(
#         {"feature": column, "accuracy": avg_accuracy}, ignore_index=True
#     )

#     # Save results to CSV
#     results.to_csv(f"ablation_{column}.csv", index=False)

## Utils ##

In [223]:
## Read ablation file ##

# Load the DataFrame
df = pd.read_csv("model_accuracy_results.csv")

# Sort the DataFrame by 'accuracy' from lowest to highest
df_sorted = df.sort_values(by="accuracy", ascending=False)

# # Sample 50% of the DataFrame
# df_sampled = df_sorted.sample(frac=0.5)

# Display the sampled DataFrame
print(df_sorted)

                 feature   accuracy
64                RVI_10  76.371568
8                    obv  76.309228
84                ISB_15  76.309228
76        PSARr_0.01_0.1  76.309228
4               turnover  76.246881
30         BBL_20_2.0_20  76.246881
42         BBM_10_2.0_10  76.184541
87                ICS_15  76.184541
82            TRIXs_10_5  76.122195
36           BBL_5_2.0_5  76.122195
10                RSI_10  76.059848
46         STOCHk_14_3_3  76.059848
21                 EMA_2  76.059848
34         BBP_20_2.0_20  76.059848
80            TRIXs_12_6  75.997508
60                 CCI_5  75.997508
59                CCI_10  75.997508
85                 ITS_5  75.997508
78            TRIXs_18_9  75.935161
72                    S3  75.935161
53                ATR_10  75.935161
49   STOCHd_14_3_3_7_3_3  75.935161
48   STOCHk_14_3_3_7_3_3  75.935161
54                 ATR_5  75.872821
0                   open  75.872821
14         MACDs_12_26_9  75.810474
32         BBU_20_2.0_20  75