# Data transformation & train-test split

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../src/')
from preprocessing import split_data, preprocess_3d_data
from matplotlib import pyplot as plt
#from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the processed CSV of every currency pair.
# `titles` is the single source of truth: each file path is derived from the
# title (lower-cased), so the order of `currency_pairs` can never drift away
# from the order of `titles`, which later cells rely on when zipping them.
titles = ['EUR_USD','USD_JPY','AUD_USD','GBP_USD','NZD_USD','USD_CAD','USD_CHF']
currency_pairs = [
    pd.read_csv(f'../data/processed/{title.lower()}_processed.csv', header=0)
    for title in titles
]

# Keep the individual per-pair names bound, in the same order as `titles`,
# for any cell that refers to a single frame directly.
eur_usd, usd_jpy, aud_usd, gbp_usd, nzd_usd, usd_cad, usd_chf = currency_pairs

# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSVs were written together with their index and it is read back here as
# an ordinary column. Left unchanged on purpose; confirm whether
# `index_col=0` is wanted before altering what downstream code receives.
eur_usd.head()

Unnamed: 0,Open,High,Low,Close,RSI,EMA_5,EMA_10,EMA_20,NextClose,Target,Open_s,High_s,Low_s,Close_s,EMA_5_s,EMA_10_s,EMA_20_s
0,1.2249,1.22498,1.22435,1.22466,36.721191,1.225017,1.225094,1.22484,1.22489,1,-0.00023,-0.00018,-0.00045,-0.00022,-0.000179,-9.7e-05,-1.9e-05
1,1.22466,1.22489,1.22449,1.22489,45.099152,1.224975,1.225057,1.224845,1.22461,0,-0.00024,-9e-05,0.00014,0.00023,-4.2e-05,-3.7e-05,5e-06
2,1.22489,1.22494,1.22461,1.22461,37.960877,1.224853,1.224976,1.224823,1.22446,0,0.00023,5e-05,0.00012,-0.00028,-0.000122,-8.1e-05,-2.2e-05
3,1.22461,1.22466,1.22413,1.22446,34.543656,1.224722,1.224882,1.224788,1.22462,1,-0.00028,-0.00028,-0.00048,-0.00015,-0.000131,-9.4e-05,-3.5e-05
4,1.22446,1.22483,1.22446,1.22462,41.13767,1.224688,1.224834,1.224772,1.2255,1,-0.00015,0.00017,0.00033,0.00016,-3.4e-05,-4.8e-05,-1.6e-05


## 3D transformation (sample, timestep, features)

In [3]:
# --- Configuration for the windowing step ---------------------------------
# Columns fed to the model as features, and the column used as the label.
# NOTE(review): in the preview above 'RSI' is on a much larger scale than the
# '*_s' columns (tens vs. 1e-4); confirm whether it is meant to be scaled.
feature_col = ['Open_s','High_s','Low_s','Close_s','RSI','EMA_5_s','EMA_10_s','EMA_20_s']
target_col = 'Target'
sequence_length = 30 #past data per prediction

# One entry per currency pair, in the order of `currency_pairs`.
datas = currency_pairs
Xs = [] #input set
ys = [] #output set

# --- Build the (sample, timestep, features) arrays -------------------------
# preprocess_3d_data is imported from ../src/preprocessing; its internals are
# not visible in this notebook.
for data in datas:
    X, y = preprocess_3d_data(
        data=data,
        feature_col=feature_col,
        target_col=target_col,
        sequence_length=sequence_length,
    )
    Xs += [X]
    ys += [y]
    # Print both shapes, one per line, to eyeball the 3D layout of X and the
    # matching sample count of y.
    print(X.shape, y.shape, sep='\n')

(74592, 30, 8)
(74592,)
(74591, 30, 8)
(74591,)
(74586, 30, 8)
(74586,)
(74592, 30, 8)
(74592,)
(74606, 30, 8)
(74606,)
(74585, 30, 8)
(74585,)
(74585, 30, 8)
(74585,)


## Split data 
Split each currency pair's input (X) and output (y) arrays into training and testing sets.

In [4]:
# Per-pair train/test splits, stored in the same order as `Xs` / `ys`.
X_trains = []
X_tests = []
y_trains = []
y_tests = []

# NOTE(review): split_data is imported from ../src/preprocessing and its
# implementation is not visible here. Because the samples are sequences built
# from time-ordered rows, confirm that it splits chronologically (no shuffle).
for X,y in zip(Xs,ys):
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Inline sanity checks: inputs and labels must stay aligned in each half,
    # and the two halves together must account for every sample.
    assert len(X_train) == len(y_train), 'X_train / y_train sample counts differ'
    assert len(X_test) == len(y_test), 'X_test / y_test sample counts differ'
    assert len(X_train) + len(X_test) == len(X), 'split lost or duplicated samples'

    X_trains.append(X_train)
    X_tests.append(X_test)
    y_trains.append(y_train)
    y_tests.append(y_test)

    # Printed shapes are (sample, timestep, features).
    print(X_train.shape)
    print(X_test.shape)
# sample, timestep, features

(59673, 30, 8)
(14919, 30, 8)
(59672, 30, 8)
(14919, 30, 8)
(59668, 30, 8)
(14918, 30, 8)
(59673, 30, 8)
(14919, 30, 8)
(59684, 30, 8)
(14922, 30, 8)
(59668, 30, 8)
(14917, 30, 8)
(59668, 30, 8)
(14917, 30, 8)


## Save data to .npy format

In [5]:
# Persist every split as '<pair>_{X,Y}_{train,test}.npy' under ../data/processed/.
# zip() stops at the shortest input, so a missing split would silently skip
# saving some currency pairs; fail loudly instead.
split_lists = {
    'X_trains': X_trains,
    'X_tests': X_tests,
    'y_trains': y_trains,
    'y_tests': y_tests,
}
for list_name, split_list in split_lists.items():
    if len(split_list) != len(titles):
        raise ValueError(
            f'{list_name} has {len(split_list)} entries but there are {len(titles)} titles'
        )

for title, X_train, X_test, y_train, y_test in zip(titles, X_trains, X_tests, y_trains, y_tests):
    # Shared path prefix for the four files of this pair, e.g. '../data/processed/eur_usd'.
    prefix = f'../data/processed/{title.lower()}'
    np.save(f'{prefix}_X_train.npy', X_train)
    np.save(f'{prefix}_X_test.npy', X_test)
    # The label files use an upper-case 'Y' in their names; kept as-is so
    # existing readers of these files keep working.
    np.save(f'{prefix}_Y_train.npy', y_train)
    np.save(f'{prefix}_Y_test.npy', y_test)

End of section