In [None]:
# imports
import pickle 
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
# load generally preprocessed data
with open('preprocessed_data/preprocessed_general.pkl', 'rb') as f:
    chosen_stocks = pickle.load(f)

We now have our stock data with features for each stock. The next step for us is to split the data into train and test/validation to fit a scaler only on the training set.
We do that so that the scaler does not get any information about the range the values may lie in in the test or validation set.

In [None]:
# we split after dates
end_train_set = chosen_stocks[0][0].index[int((chosen_stocks[0][0].shape[0] - 1) * 0.8)]
end_validation_set = chosen_stocks[0][0].index[int((chosen_stocks[0][0].shape[0] - 1) * 0.9)]
print(end_train_set)
print(end_validation_set)

2024-05-13 00:00:00
2024-11-05 00:00:00


In [None]:
# create outer list to store preprocessed data per bin
preprocessed_NN_data = []
for i in range(len(chosen_stocks)):
    # inner lest to store preprocessed data per asset
    preprocessed_NN_bin = []
    for j in range(len(chosen_stocks[i])):
        #Split into train 0.9, validation 0.1 and test 0.1 sets
        X_train = chosen_stocks[i][j].loc[:end_train_set].drop(columns=['target', 'target_ret', 'target_ret_log'])
        y_train = chosen_stocks[i][j].loc[:end_train_set, 'target_ret_log']
        X_validation = chosen_stocks[i][j].loc[end_train_set:end_validation_set].drop(columns=['target', 'target_ret', 'target_ret_log'])
        y_validation = chosen_stocks[i][j].loc[end_train_set:end_validation_set, 'target_ret_log']
        X_test = chosen_stocks[i][j].loc[end_validation_set:].drop(columns=['target', 'target_ret', 'target_ret_log'])
        y_test = chosen_stocks[i][j].loc[end_validation_set:, 'target_ret_log']


        # Scale Features based on train set
        # We drop the 'symbol' column before scaling and store it seperately
        symbol = X_train.iloc[0]['symbol']
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train.drop(columns=['symbol'], inplace=False))
        X_validation_scaled = scaler.transform(X_validation.drop(columns=['symbol'], inplace=False))
        X_test_scaled = scaler.transform(X_test.drop(columns=['symbol'], inplace=False))

        # Put back into dfs so we have the original indices and columns for windowing
        columns = X_train.drop(columns=['symbol']).columns
        X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train.index, columns=columns)
        X_validation_scaled = pd.DataFrame(X_validation_scaled, index=X_validation.index, columns=columns) 
        X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test.index, columns=columns)

        # We concatenate the dataframes to create a single dataset for windowing so that we dont lose window.size() windows at the start of each set but only once.
        X = pd.concat([X_train_scaled, X_validation_scaled, X_test_scaled])
        y = pd.concat([y_train, y_validation, y_test])

        # store the preprocessed data along with the symbol
        entry = (X, y, symbol)
        preprocessed_NN_bin.append(entry)
    preprocessed_NN_data.append(preprocessed_NN_bin)

# write out the preprocessed data
with open(f'preprocessed_data/preprocessed_LSTM_CNN.pkl', 'wb') as f:
    pickle.dump(preprocessed_NN_data, f)

In [6]:
print(X_train.shape, X_validation.shape, X_test.shape)

(981, 61) (123, 61) (124, 61)
