**Imports** 🕵️‍♂️

In [57]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
from keras.callbacks import EarlyStopping
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from scipy.signal import detrend
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.seasonal import seasonal_decompose


**Functions** 🤌

In [80]:
def create_sequences(X, y, time_steps=60):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X.iloc[i:(i + time_steps)].values)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

def check_data_balance(y):
    counter = Counter(y)
    for label, count in counter.items():
        percentage = (count / len(y)) * 100
        print(f"Class {label}: {count} samples ({percentage:.2f}%)")

# Preprocessing function for train and test sets
def preprocess(df):
    # Copy the dataframe
    df = df.copy()
    
    # Fill NaNs in specific columns with 0
    df['PSARl_0.01_0.1'] = df['PSARl_0.01_0.1'].fillna(0)
    df['PSARs_0.01_0.1'] = df['PSARs_0.01_0.1'].fillna(0)

    # Identify the first non-null row
    first_valid_index = df.dropna().index[0]

    # Drop the rows before this index
    df = df.loc[first_valid_index:]

    # Use ffill to fill any remaining missing values
    df.ffill(inplace=True)

    # Return the processed dataframe
    return df

# Separate function for feature engineering
def feature_engineering(df, lag_and_window_features):
    # Convert DataFrame to numpy arrays for faster computation
    features_np = df[lag_and_window_features].to_numpy()
    n_samples, n_features = features_np.shape
    engineered_features = {}

    # Lagged Features
    for i, feature in enumerate(lag_and_window_features):
        for lag in [1, 2, 3, 5, 10]:
            lagged = np.roll(features_np[:, i], lag)
            lagged[:lag] = np.nan  # Pad with NaN
            engineered_features[f'{feature}_lag_{lag}'] = lagged

    # Rolling Window Statistics
    for i, feature in enumerate(lag_and_window_features):
        for window in [3, 5, 10]:
            rolled = np.lib.stride_tricks.sliding_window_view(features_np[:, i], window)
            rolled_mean = np.full(n_samples, np.nan)
            rolled_mean[window-1:] = np.mean(rolled, -1)  # Compute mean
            rolled_std = np.full(n_samples, np.nan)
            rolled_std[window-1:] = np.std(rolled, -1)  # Compute standard deviation
            engineered_features[f'{feature}_rolling_mean_{window}'] = rolled_mean
            engineered_features[f'{feature}_rolling_std_{window}'] = rolled_std

    # Convert engineered features back to DataFrame
    engineered_features_df = pd.DataFrame(engineered_features, index=df.index)

    # Concatenate original features with engineered features
    df = pd.concat([df, engineered_features_df], axis=1)

    # Drop rows with NaN values created by lagged features and rolling window statistics
    df = df.dropna()

    return df

**Globals** 🌎

In [77]:
# # Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

feature_names = [
    # Fundamental price data
    'open', 'high', 'low', 'close', 

    # Auxiliary data
    'turnover', 'color',

    # Volume-related
    'volume', 'avg_vol_last_100', 'obv', 

    # Momentum and trend indicators
    'RSI_5', 'RSI_10', 'RSI_14',
    'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9', 'MACD_6_13_5_6_13_5', 'MACDh_6_13_5_6_13_5', 'MACDs_6_13_5_6_13_5', 

    # Moving averages
    'SMA_20', 'SMA_5', 'SMA_10', 'EMA_2', 'EMA_5', 'EMA_10', 

    # Bollinger Bands
    'BBP_10_2.0_10', 'BBL_15_2.0_15', 'BBM_15_2.0_15', 'BBU_15_2.0_15', 'BBB_15_2.0_15', 'BBP_15_2.0_15', 'BBL_20_2.0_20', 'BBM_20_2.0_20', 'BBU_20_2.0_20', 'BBB_20_2.0_20', 'BBP_20_2.0_20', 'bollinger_bandwidth',
    'BBL_5_2.0_5', 'BBM_5_2.0_5', 'BBU_5_2.0_5', 'BBB_5_2.0_5', 'BBP_5_2.0_5', 'BBL_10_2.0_10', 'BBM_10_2.0_10', 'BBU_10_2.0_10', 'BBB_10_2.0_10', 'BBP_10_2.0_10',

    # Stochastic Oscillator
    'STOCHd_14_3_3', 'STOCHk_14_3_3_7_3_3', 'STOCHd_14_3_3_7_3_3', 'STOCHk_14_3_3_10_3_3', 'STOCHd_14_3_3_10_3_3', 

    # Volatility
    'ATR_14', 'ATR_10', 'ATR_5', 

    # Other momentum oscillators
    'ROC_14', 'ROC_10', 'ROC_5', 

    # Other versatile indicators
    'CCI_14', 'CCI_10', 'CCI_5', 

    # Money Flow Index and Chaikin Money Flow
    'cmf', 'mfi',
    
    # Relative Vigor Index (RVI)
    'RVI_15', 'RVI_10', 'RVI_5',
    
    # Pivot Points
    'PP', 'R1', 'S1', 'R2', 'S2', 'R3', 'S3',

    # Parabolic SAR (PSAR)
    'PSARl_0.01_0.1', 'PSARs_0.01_0.1', 'PSARaf_0.01_0.1', 'PSARr_0.01_0.1',

    # Triple Exponential Average (TRIX)
    'TRIX_18_9', 'TRIXs_18_9', 'TRIX_12_6', 'TRIXs_12_6', 'TRIX_10_5', 'TRIXs_10_5',

    # Ichimoku Cloud (ISA, ISB, ITS, IKS, ICS)
    'ISA_5', 'ISB_15', 'ITS_5', 'IKS_15', 'ICS_15',

]

# List of original features you have
lag_and_window_features = ['open', 'high', 'low', 'close', 'volume', 'SMA_20', 'SMA_5', 'SMA_10', 'ROC_14', 'ROC_10', 'ROC_5', 
                'RSI_5', 'RSI_10', 'RSI_14', 'BBP_10_2.0_10', 'BBL_15_2.0_15', 'BBM_15_2.0_15', 'BBU_15_2.0_15', 
                'BBB_15_2.0_15', 'BBP_15_2.0_15', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9', 'STOCHk_14_3_3', 
                'STOCHd_14_3_3', 'obv', 'ATR_14', 'ATR_10', 'ATR_5', 'color', 'avg_vol_last_100']

**Preprocessing** 👻

In [60]:
# Load the data
df = pd.read_csv('../../../data/kc/btc/heiken_ashi/with_trade_indicators/raw/kc_btc_15min_ha_ti.csv')

# Convert color to 0 for 'red' and 1 for 'green'
df['color'] = df['color'].map({'red': 0, 'green': 1})

# Add 'color_change' column: 1 if color changes from the previous row, 0 otherwise
df['color_change'] = df['color'].diff().abs()

# Fill the first row's 'color_change' with 0
df['color_change'].fillna(0, inplace=True)

# Drop 'time' and 'turnover' columns
df = df.drop(['time'], axis=1)

# Separate features and target
features_df = df.drop('color_change', axis=1)
target = df['color_change']

# Determine the split point
split_point = int(len(features_df) * 0.8)

# Split the data into train and test sets
X_train_df, X_test_df = features_df[:split_point], features_df[split_point:]
y_train, y_test = target[:split_point], target[split_point:]

# Apply preprocessing to the train and test sets
X_train_df = preprocess(X_train_df)
X_test_df = preprocess(X_test_df)

# Initialize a scaler
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform both training and test data
X_train = scaler.fit_transform(X_train_df)
X_test = scaler.transform(X_test_df)

# Reshape input to be 3D [samples, timesteps, features] for LSTM
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

check_data_balance(y_test)


Class 1.0: 866 samples (53.99%)
Class 0.0: 738 samples (46.01%)


**Feature Engineering** 🚂

In [78]:
# Apply feature engineering to the train and test sets
X_train_df = feature_engineering(X_train_df, lag_and_window_features)
X_test_df = feature_engineering(X_test_df, lag_and_window_features)

# Initialize a scaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit the scaler on the training data and transform both training and test data
X_train = scaler.fit_transform(X_train_df)
X_test = scaler.transform(X_test_df)

# Convert the scaled features back to a DataFrame
X_train = pd.DataFrame(X_train, columns=X_train_df.columns, index=X_train_df.index)
X_test = pd.DataFrame(X_test, columns=X_test_df.columns, index=X_test_df.index)

# The target needs to match the features DataFrame
y_train = y_train.loc[X_train.index]
y_test = y_test.loc[X_test.index]

# Reshape input to be 3D [samples, timesteps, features] for LSTM
X_train = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

check_data_balance(y_test)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Fit a Logistic Regression model to the data
log_reg = LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=42)
log_reg.fit(X_train, y_train)

# Use SelectFromModel to select features whose coefficients are non-zero
selector = SelectFromModel(estimator=log_reg, prefit=True)

# Transform the data
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Get the selected feature names
selected_features = features_df.columns[selector.get_support()]

print(f"Selected features: {selected_features}")

**Cross Validation** 🏴‍☠️

In [76]:
# Define the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy: %.2f' % (accuracy*100))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 74.50
