In [1]:
# imports
import pickle 
import pandas as pd
import numpy as np

In [2]:
# load generally preprocessed data
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that were produced by a trusted preprocessing step.
# Later cells index the result as chosen_stocks[i][j] and treat each element
# as a pandas DataFrame.
with open('preprocessed_data/preprocessed_general.pkl', 'rb') as f:
    chosen_stocks = pickle.load(f)

In [3]:
# Sanity check: (rows, columns) of one loaded frame (outer index 2, inner
# index 13) before any lag features are added.
chosen_stocks[2][13].shape

(1256, 65)

In [4]:
def create_lag_values(data, columns, lag):
    """Return a DataFrame that holds only lagged copies of selected columns.

    For every step k in 1..lag, each name c in `columns` yields a column
    called "<c>_lag_<k>" containing data[c] shifted down by k rows. Columns
    are ordered by lag first and by position in `columns` second. The result
    uses data's index, and the first k rows of every lag-k column are missing
    because nothing precedes them. When lag is below 1 the result has no
    columns at all.
    """
    shifted_frames = []
    for step in range(1, lag + 1):
        shifted = data[columns].shift(step)
        shifted.columns = [f"{name}_lag_{step}" for name in columns]
        shifted_frames.append(shifted)

    # pd.concat refuses an empty list, so the zero-lag case is handled here.
    if not shifted_frames:
        return pd.DataFrame(index=data.index)
    return pd.concat(shifted_frames, axis=1)
# Different lag depths for different columns: every column in lags_5 gets
# lags 1..5, every column in lags_3 gets lags 1..3.
lags_5 = ['close', 'open', 'high', 'low', 'volume', 'vwap']
lags_3 = ['rsi', 'roc_9', 'roc_14', 'roc_21', 'macd', 'macd_signal', 'macd_hist',
          'stoch_%K', 'stoch_%D', 'stoch_diff', 'cci', 'plus_di', 'minus_di', 'dx', 'adx',
          'obv', 'obv_norm', 'obv_momentum', 'vpt', 'vpt_norm', 'vpt_momentum']

# All target-like columns are removed from the feature matrices; only
# LABEL_COLUMN is kept as the label.
TARGET_COLUMNS = ['target', 'target_ret', 'target_ret_log']
LABEL_COLUMN = 'target_ret'

# Cumulative split boundaries. Row order is preserved (no shuffling):
# rows [0, 80%) -> train, [80%, 90%) -> validation, [90%, 100%] -> test.
TRAIN_END_FRACTION = 0.8
VALIDATION_END_FRACTION = 0.9

# One inner list per bin; each inner list holds one
# (X_train, y_train, X_validation, y_validation, X_test, y_test) tuple per frame.
preprocessed_Tree_data = []
# Positional indexing is kept on purpose: the container type of the unpickled
# object is not visible here, and chosen_stocks[i][j] works both for lists and
# for dicts keyed 0..n-1.
for i in range(len(chosen_stocks)):
    preprocessed_Tree_bin = []
    for j in range(len(chosen_stocks[i])):
        # Build a new frame instead of mutating the loaded one in place, so
        # this cell can be re-run without a KeyError on an already-dropped
        # 'symbol' column or duplicated lag columns.
        base_df = chosen_stocks[i][j].drop(columns=['symbol'])

        # create dfs with lagged values
        lags_5_df = create_lag_values(base_df, lags_5, 5)
        lags_3_df = create_lag_values(base_df, lags_3, 3)

        # Append the lagged values to the base columns and drop every row that
        # has a missing value (this includes the leading rows without full lag history).
        featured_df = pd.concat([base_df, lags_5_df, lags_3_df], axis=1).dropna()

        # split the data into train 0.8, validation 0.1 and test 0.1 sets
        n = len(featured_df)
        end_train = int(n * TRAIN_END_FRACTION)
        end_val = int(n * VALIDATION_END_FRACTION)

        train_df = featured_df.iloc[:end_train]
        val_df = featured_df.iloc[end_train:end_val]
        test_df = featured_df.iloc[end_val:]

        # prepare input and target variables
        X_train = train_df.drop(columns=TARGET_COLUMNS)
        y_train = train_df[LABEL_COLUMN]
        X_validation = val_df.drop(columns=TARGET_COLUMNS)
        y_validation = val_df[LABEL_COLUMN]
        X_test = test_df.drop(columns=TARGET_COLUMNS)
        y_test = test_df[LABEL_COLUMN]

        # store the preprocessed data
        entry = (X_train, y_train, X_validation, y_validation, X_test, y_test)
        preprocessed_Tree_bin.append(entry)
    preprocessed_Tree_data.append(preprocessed_Tree_bin)

# Persist the nested list of split tuples for the model-training notebook(s).
with open('preprocessed_data/preprocessed_XGBoost.pkl', 'wb') as f:
    pickle.dump(preprocessed_Tree_data, f)