In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the dataset from disk; the first CSV column is used as the initial index.
data_df = pd.read_csv(filepath_or_buffer='finaldata.csv', index_col=0)

In [3]:
# Parse the day/month/year strings in 'date' into timestamps, then promote
# that column to the frame's index (it is removed from the regular columns).
parsed_dates = pd.to_datetime(data_df['date'], format='%d/%m/%Y')
data_df = data_df.assign(date=parsed_dates).set_index('date')

In [4]:
# Separate the label column ('ERP_Sign') from the remaining feature columns.
y = data_df.loc[:, 'ERP_Sign']
X = data_df.drop(columns='ERP_Sign')

In [5]:
# Configuration for the lag-feature construction.
max_lags = 10                   # combined with max_horizon to bound the lag range
fcst_horizon = [5, 10, 15, 20]  # candidate horizons; only the largest is used here
# Take the true maximum rather than the last element, so the value stays
# correct even if the horizon list is reordered or extended out of order.
max_horizon = max(fcst_horizon)
# Column labels of the un-lagged feature frame, captured before any lagged
# columns are appended to X.
model_vars = X.columns

In [6]:
# Create lagged variables for features
#
# For every lag k in 1 .. max_lags + max_horizon - 1 (the range end is
# exclusive), all base feature columns are shifted down by k rows and their
# names get the suffix 'L<k>'; the first k rows of each shifted copy are NaN.
# NOTE(review): confirm that excluding lag max_lags + max_horizon is intended.
#
# The shifted frames are collected first and joined with ONE concat, instead
# of re-copying the ever-growing frame on every loop iteration.
# Starting from X[model_vars] (the base columns only) makes the cell
# idempotent: re-running it rebuilds the same frame rather than appending a
# second set of duplicate lag columns.
base_features = X[model_vars]
lagged_frames = [
    base_features.shift(lag).add_suffix(f'L{lag}')
    for lag in range(1, max_lags + max_horizon)
]

X = pd.concat([base_features, *lagged_frames], axis=1)

In [None]:
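# Feature pairs whose absolute correlation coefficient exceeds 0.8 (first 23 entries).
# The code is left commented out; the output shown below comes from an earlier run.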

# corr_matrix = X.corr()
# corr_matrix.where(corr_matrix.abs().gt(0.8)).stack()[:23]

close  close         1.000000
       adjclose      0.999270
       ma50d         0.995358
       ma200d        0.981347
       ema50d        0.996596
       ema200d       0.986725
       gdp           0.949610
       closeL1       0.999644
       adjcloseL1    0.998949
       ma50dL1       0.995083
       ma200dL1      0.981041
       ema50dL1      0.996309
       ema200dL1     0.986440
       gdpL1         0.949418
       closeL2       0.999345
       adjcloseL2    0.998680
       ma50dL2       0.994812
       ma200dL2      0.980736
       ema50dL2      0.996025
       ema200dL2     0.986156
       gdpL2         0.949225
       closeL3       0.999017
       adjcloseL3    0.998381
dtype: float64

In [7]:
from sktime.forecasting.model_selection import temporal_train_test_split

# Split features and labels with the same test fraction (20% of the rows);
# each input is split by its own call, so both must have the same length
# for the resulting pieces to line up.
(X_train, X_test), (y_train, y_test) = (
    temporal_train_test_split(dataset, test_size=0.2) for dataset in (X, y)
)

# Report the shapes of the four resulting pieces.
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}')

X_train: (3416, 1200), X_test: (855, 1200), y_train: (3416,), y_test: (855,)


In [8]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min-max scaling parameters from the training rows only, then
# apply that same fitted scaler to both the training and the test features.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
def _with_labels_of(values, template):
    """Wrap `values` in a DataFrame that reuses `template`'s index and columns."""
    return pd.DataFrame(data=values, index=template.index, columns=template.columns)


# Re-attach the original row index and column names to the scaled values.
X_train_scaled = _with_labels_of(X_train_scaled, X_train)
X_test_scaled = _with_labels_of(X_test_scaled, X_test)

In [10]:
# Write the scaled features and their labels to CSV files in the working
# directory, one file per dataset.
for csv_path, dataset in (
    ('train.csv', X_train_scaled),
    ('train_labels.csv', y_train),
    ('test.csv', X_test_scaled),
    ('test_labels.csv', y_test),
):
    dataset.to_csv(csv_path)