In [None]:
# imports
import os
import pickle

import numpy as np
import optuna
import tensorflow as tf
import xgboost as xgb
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, Dropout
from tensorflow.keras.models import Sequential

In [None]:
# Load the preprocessed data used by the LSTM / CNN models.
# pickle.load can execute arbitrary code stored in the file, so only load
# pickles that come from a trusted source.
with open('preprocessed_data/preprocessed_LSTM_CNN.pkl', mode='rb') as pickle_file:
    LSTM_CNN_preprocessed_data = pickle.load(pickle_file)

In [None]:
# LSTM_CNN_preprocessed_data is a nested list indexed as [bin][asset][element]:
#   - the outer list is expected to hold our 5 bins (index 0-4),
#   - each bin is expected to hold its 15 stocks or ETFs (index 0-14),
#   - each asset entry holds the features X at [0] and the target y at [1].
# NOTE(review): confirm whether an asset entry also stores a third element (e.g. the symbol).

# The neural networks consume 3D input (samples, sample_size, features_per_sample).
# NOTE(review): the code further down treats X as a pandas object (.index, .values) and
# builds the 3D windows itself with create_windows - confirm whether the stored X is already 3D.

LSTM_CNN_preprocessed_data[2][7][0].shape
# Example: this shows the shape of X (element 0) of the 8th asset (index 7) in the 3rd bin (index 2).

In [None]:
# Only pass pandas DataFrames as data_X data_y to the function below.
def create_windows(data_X, data_y, window):
    """Slice a feature table into overlapping fixed-length windows.

    For every end position ``end`` in ``range(window, len(data_X))`` the rows
    ``end - window`` .. ``end - 1`` of ``data_X`` form one sample. The sample is
    paired with the target found at row ``end - 1`` of ``data_y`` (the target
    was already shifted by one step in the general preprocessing pipeline),
    and the index label of that row is recorded alongside it.

    NOTE(review): because the range stops at ``len(data_X) - 1``, the final row
    never closes a window - confirm this is intended.

    Parameters
    ----------
    data_X : pandas object with the features (one row per time step).
    data_y : pandas object with the targets, aligned row-wise with ``data_X``.
    window : int, number of consecutive rows per sample.

    Returns
    -------
    tuple of numpy arrays ``(X, y, indices)``: the float32 windows, the float32
    targets and the index label belonging to each target.
    """
    window_ends = range(window, len(data_X))

    samples = [
        data_X[end - window:end].values.astype("float32")
        for end in window_ends
    ]
    # end - 1 is the last row inside the window; its target is already the "next step" value.
    targets = [float(data_y.iloc[end - 1]) for end in window_ends]
    labels = [data_y.index[end - 1] for end in window_ends]

    return (
        np.array(samples, dtype="float32"),
        np.array(targets, dtype="float32"),
        np.array(labels),
    )

# Create the tuning objective with according data sets
def make_lstm_objective(X, y):
    """Build an Optuna objective that tunes an LSTM regressor on one asset.

    Parameters
    ----------
    X : pandas object holding the features, one row per time step.
    y : pandas object holding the targets, aligned with ``X`` (same index).

    Returns
    -------
    callable
        ``lstm_objective(trial)``. It trains one LSTM configuration and returns
        the MAPE (in percent) on the validation slice, to be minimised. MAE,
        R2, RMSE and NRMSE of the same slice are stored as trial user attributes.
    """
    def lstm_objective(trial):
        """Train one sampled configuration and return its validation MAPE in percent."""
        # A new model is built for every trial; clearing the Keras session keeps
        # graph state and memory from piling up over a long study.
        tf.keras.backend.clear_session()

        # Suggest hyperparameters
        n_units = trial.suggest_int("lstm_units", 32, 128, step=32)
        n_layers = trial.suggest_int("lstm_layers", 1, 2, step=1)
        dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5, step=0.1)
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
        batch_size = trial.suggest_categorical("batch_size", [16, 32])
        # NOTE(review): "dense_units" is sampled but no layer uses it. The suggestion is
        # kept so trial.params keeps the same keys - either add the Dense layer or drop it.
        trial.suggest_int("dense_units", 16, 128, step=16)
        window_size = trial.suggest_int("window_size", 10, 60, step=10)

        # create windows
        X_windows, y_windows, indices = create_windows(X, y, window_size)

        # Chronological split: first 80 % train, next 10 % validation.
        # The last 10 % is the test slice and is deliberately left untouched during tuning.
        end_train_set = X.index[int(X.shape[0] * 0.8)]
        end_validation_set = X.index[int(X.shape[0] * 0.9)]

        train_mask = indices < end_train_set
        validation_mask = (indices >= end_train_set) & (indices < end_validation_set)

        X_train = X_windows[train_mask]
        y_train = y_windows[train_mask]
        X_val = X_windows[validation_mask]
        y_val = y_windows[validation_mask]

        # Build model
        model = Sequential()
        for i in range(n_layers):
            bool_return_sequences = (i < n_layers - 1)  # Return sequences for all but the last layer
            if i == 0:
                # First layer needs the input shape
                model.add(LSTM(units=n_units, input_shape=(window_size, X_train.shape[2]),
                               return_sequences=bool_return_sequences))
            else:
                model.add(LSTM(units=n_units, return_sequences=bool_return_sequences))
            model.add(Dropout(rate=dropout_rate))
        model.add(Dense(1, activation="linear"))  # regression

        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(optimizer=optimizer, loss="mse")

        # Early stopping ends the training of this trial once val_loss stops improving
        # and restores the best weights. It does not prune Optuna trials: a pruner only
        # acts when the objective reports intermediate values, which is not done here.
        early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

        # Train
        model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=50,
            batch_size=batch_size,
            callbacks=[early_stop],
            verbose=0
        )

        # evaluate and calculate errors
        # predict() returns shape (n, 1) while y_val is (n,). Flatten first, otherwise
        # (y_pred - y_val) broadcasts to an (n, n) matrix and the MAPE is wrong.
        y_pred = model.predict(X_val, batch_size=batch_size, verbose=0).reshape(-1)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        mape = np.mean(np.abs((y_pred - y_val) / y_val)) * 100
        nrmse = rmse / np.mean(y_val) * 100  # in percent

        # Store results in trial (plain floats so any Optuna storage can serialise them)
        trial.set_user_attr("mae", float(mae))
        trial.set_user_attr("r2", float(r2))
        trial.set_user_attr("rmse", float(rmse))
        trial.set_user_attr("nrmse", float(nrmse))

        return float(mape)
    return lstm_objective

In [None]:
# Run the study: for every asset in every bin, tune the LSTM with Optuna, retrain it
# with the best hyperparameters on train + validation data and score it on the test slice.

# outer list to store results for each bin
LSTM_study_results = []

N_TRIALS = 70
for i in range(0, len(LSTM_CNN_preprocessed_data)):
    # list per bin to store results of single asset
    LSTM_bin_results = []

    # Create the output folder up front so a missing directory cannot throw away
    # the tuning work when the results are written at the end.
    os.makedirs(f'results/LSTM/bin{i+1}', exist_ok=True)

    for j in range(0, len(LSTM_CNN_preprocessed_data[i])):

        print(f"Running LSTM study for bin {i+1}, stock {j+1}...")
        # Index instead of unpacking: works whether the entry is [X, y] or [X, y, symbol].
        asset_entry = LSTM_CNN_preprocessed_data[i][j]
        X, y = asset_entry[0], asset_entry[1]

        # create LSTM study
        # NOTE(review): MedianPruner only has an effect if the objective calls
        # trial.report() / trial.should_prune() - confirm whether pruning is wanted.
        lstm_objective = make_lstm_objective(X, y)
        lstm_study = optuna.create_study(
            direction="minimize",
            study_name="lstm_regression_study",
            sampler=optuna.samplers.TPESampler(),
            pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)
        )
        lstm_study.optimize(lstm_objective, n_trials=N_TRIALS)

        best_params = lstm_study.best_trial.params
        window_size = best_params["window_size"]

        # create windows
        # split data in train 0.9 (train + validation) and test 0.1
        X_windows, y_windows, indices = create_windows(X, y, window_size)

        end_train_validation_set = X.index[int(X.shape[0] * 0.9)]

        train_mask = indices < end_train_validation_set
        test_mask = indices >= end_train_validation_set

        X_train_val = X_windows[train_mask]
        y_train_val = y_windows[train_mask]
        X_test = X_windows[test_mask]
        y_test = y_windows[test_mask]

        # Build the model
        # Clear the Keras state left behind by the tuning trials before building the final model.
        tf.keras.backend.clear_session()
        n_lstm_layers = best_params["lstm_layers"]
        model = Sequential()
        for k in range(n_lstm_layers):
            return_seq = (k < n_lstm_layers - 1)  # only the last LSTM layer returns a single vector
            if k == 0:
                model.add(LSTM(best_params["lstm_units"],
                               input_shape=(window_size, X_train_val.shape[2]),
                               return_sequences=return_seq))
            else:
                model.add(LSTM(best_params["lstm_units"], return_sequences=return_seq))
            model.add(Dropout(best_params["dropout_rate"]))
        model.add(Dense(1, activation="linear"))

        optimizer = tf.keras.optimizers.Adam(learning_rate=best_params["learning_rate"])
        # NOTE(review): the tuning objective trains with loss="mse" while the final model
        # uses MAPE - confirm that training with a different loss here is intentional.
        model.compile(optimizer=optimizer, loss="mean_absolute_percentage_error")

        # Train the model based on the best parameters for the particular asset
        model.fit(
            X_train_val, y_train_val,
            epochs=50,
            batch_size=best_params['batch_size'],
            verbose=0
        )

        # calculate errors
        # predict() returns shape (n, 1) while y_test is (n,). Flatten first, otherwise
        # (y_pred - y_test) broadcasts to an (n, n) matrix and the MAPE is wrong.
        y_pred = model.predict(X_test, batch_size=best_params["batch_size"], verbose=0).reshape(-1)
        test_mape   = np.mean(np.abs((y_pred - y_test) / y_test)) * 100
        test_rmse   = np.sqrt(mean_squared_error(y_test, y_pred))
        test_mae    = mean_absolute_error(y_test, y_pred)
        test_r2     = r2_score(y_test, y_pred)

        results = {
            "test_mape": test_mape,
            "test_rmse": test_rmse,
            "test_mae":  test_mae,
            "test_r2":   test_r2
        }

        LSTM_bin_results.append(results)
        # write out the results of this asset
        with open(f'results/LSTM/bin{i+1}/Performance_metrices_LSTM_bin_{i+1}_stock_{j+1}.pkl', 'wb') as f:
            pickle.dump(results, f)
    LSTM_study_results.append(LSTM_bin_results)
    # write out the results of the whole bin
    with open(f'results/LSTM/Performance_metrices_LSTM_bin_{i+1}.pkl', 'wb') as f:
        pickle.dump(LSTM_bin_results, f)