In [1]:
import pickle
import os

import pandas as pd
import numpy as np

from pymlpipe.tabular import PyMLPipe
from pymlpipe.pymlpipeUI import start_ui

import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import MinMaxScaler
import optuna

from helper import objective
from helper import get_model 
from helper import evaluate_for_testing

In [2]:
# Directory names, relative to the notebook's working directory.
# Folder holding the raw CSV exports.
REPOSITORY_DATA_RAW = 'data raw'
# Folder holding the preprocessed datasets.
REPOSITORY_DATA_PREPROCESSED = 'data preprocessed'
# Root folder of the pickled Optuna studies (one sub-folder per target).
REPOSITORY_STUDIES = 'studies'

In [3]:
# Experiment tracker; this single instance is reused further down to select
# the experiment, set the model version and log the evaluation run.
mlp = PyMLPipe()

# Preprocessing

In [4]:
# Number of past steps kept per time scale when the history features are built.
N_PREVIOUS_HOUR_VALUES = 720
N_PREVIOUS_DAY_VALUES = 90
N_PREVIOUS_WEEK_VALUES = 50
N_PREVIOUS_MONTH_VALUES = 36
# Values stored per step: one per ask/bid column selected for X_now (8 columns).
N_FEATURES = 8

Read raw dataframes 

In [5]:
# Load the raw ask and bid exports from the configured raw-data directory
# (REPOSITORY_DATA_RAW) instead of repeating the directory name in each path.
df_ask = pd.read_csv(os.path.join(REPOSITORY_DATA_RAW, 'FOREX_EUR_CHF_ASK.csv'))
df_bid = pd.read_csv(os.path.join(REPOSITORY_DATA_RAW, 'FOREX_EUR_CHF_BID.csv'))

Rename columns with bid and ask prefix

In [6]:
# Prefix the column names shared by both exports so that the ask and bid
# frames can be merged without name clashes. Both frames are renamed in place.
_PRICE_COLUMNS = ('Open', 'High', 'Low', 'Close', 'Volume')

for _frame, _prefix in ((df_ask, 'Ask_'), (df_bid, 'Bid_')):
    _frame.rename(
        columns={_name: _prefix + _name for _name in _PRICE_COLUMNS},
        inplace=True)

Merge dataframes together

In [7]:
# Join the two frames on their shared timestamp column. The default join is
# an inner join, so only timestamps present in both exports are kept.
df_forex = pd.merge(df_ask, df_bid, on='Gmt time')

Extract year, month, day and hour from GMT string time

In [8]:
# The timestamp is read as a fixed-width string: characters 6-9 hold the year,
# 3-4 the month, 0-1 the day and 11-12 the hour.
_gmt_time = df_forex['Gmt time']
_date_fields = (
    ('Year', slice(6, 10)),
    ('Month', slice(3, 5)),
    ('Day', slice(0, 2)),
    ('Hour', slice(11, 13)),
)
for _column, _chars in _date_fields:
    # Bind the slice as a default argument so each lambda keeps its own slice.
    df_forex[_column] = _gmt_time.apply(lambda stamp, _chars=_chars: int(stamp[_chars]))

# The raw string is no longer needed once it has been split into integers.
df_forex.drop(columns=['Gmt time'], inplace=True)

Create the target vectors

In [9]:
# The target of a row is the close of the following row. The last row has no
# successor: it is padded with 0 and then dropped together with its padding.
for _side in ('Bid', 'Ask'):
    _close = df_forex[_side + '_Close'].to_numpy()
    df_forex[_side + '_Close_next'] = np.append(_close[1:], 0)
df_forex = df_forex.iloc[:-1]

y_bid = df_forex['Bid_Close_next'].to_numpy()
y_ask = df_forex['Ask_Close_next'].to_numpy()

Create the feature vectors

In [10]:
def concat_previous_data_from_timelaps(X):
    """Collapse one window of rows into a single aggregated row.

    X is indexed as X[row, column] and its first 8 columns are used. The
    result is a (1, 8) array whose columns are, by position:
    0 and 4 -> column maximum, 1 and 5 -> column minimum,
    2 and 6 -> value found in the first row, 3 and 7 -> column sum.
    """
    # NOTE(review): in this notebook columns 2 and 6 are the ask/bid close
    # prices and rows look chronological, so taking the FIRST row gives the
    # oldest close of the window -- confirm this is intended rather than the
    # last row.
    first_row = lambda column: column[0]
    reducers = (np.max, np.min, first_row, np.sum) * 2

    return np.array([[reduce_(X[:, k]) for k, reduce_ in enumerate(reducers)]])

In [11]:
def _aggregate_previous_windows(X, n_windows, window_length):
    """Build, for every sample that has a full history, its aggregated past windows.

    Output row `line` describes the sample at row `line + n_windows * window_length`
    of X: the `n_windows` consecutive windows of `window_length` rows that end
    right before that sample are each reduced to 8 values by
    `concat_previous_data_from_timelaps` and laid side by side, most recent
    window first. The result has `X.shape[0] - n_windows * window_length` rows
    and `n_windows * 8` columns.
    """
    n_samples = X.shape[0] - n_windows * window_length
    return np.concatenate([
        np.concatenate([
            concat_previous_data_from_timelaps(
                X[(n_windows - 1 - i) * window_length + line: (n_windows - i) * window_length + line])
            for i in range(n_windows)],
            axis=1)
        for line in range(n_samples)],
        axis=0)


X_date = df_forex[['Year', 'Month', 'Day', 'Hour']].to_numpy()
X_now = df_forex[['Ask_High', 'Ask_Low', 'Ask_Close', 'Ask_Volume', 'Bid_High', 'Bid_Low', 'Bid_Close', 'Bid_Volume']].to_numpy()

# Row r holds the N_PREVIOUS_HOUR_VALUES rows that precede sample
# r + N_PREVIOUS_HOUR_VALUES, flattened side by side, most recent row first.
X_previous_hour = np.concatenate([X_now[N_PREVIOUS_HOUR_VALUES - 1 - i: X_now.shape[0] - 1 - i] for i in range(N_PREVIOUS_HOUR_VALUES)], axis=1)

# Same construction at coarser scales: windows of 24 rows (a day),
# 24 * 7 rows (a week) and 24 * 30 rows (a month).
X_previous_day = _aggregate_previous_windows(X_now, N_PREVIOUS_DAY_VALUES, 24)
X_previous_week = _aggregate_previous_windows(X_now, N_PREVIOUS_WEEK_VALUES, 24 * 7)
X_previous_month = _aggregate_previous_windows(X_now, N_PREVIOUS_MONTH_VALUES, 24 * 30)

Subtract the offsets from year, month and day (2013 from the year, 1 from the month and the day)

In [12]:
# Shift the first three date columns in place: 2013 is subtracted from the
# year, 1 from the month and 1 from the day. The hour column is left untouched.
X_date[:, :3] -= np.array([2013, 1, 1])

Drop the leading rows of the feature and target vectors that do not have a full history of N_PREVIOUS_MONTH_VALUES months, so that every array has the same number of rows as X_previous_month

In [13]:
# Every array is aligned on its last row, and the monthly history is the
# shortest one. Keep only the trailing rows for which that history exists so
# that row k refers to the same sample in every array.
n_samples_with_history = X_previous_month.shape[0]

X_date = X_date[X_date.shape[0] - n_samples_with_history:]
X_now = X_now[X_now.shape[0] - n_samples_with_history:]
X_previous_hour = X_previous_hour[X_previous_hour.shape[0] - n_samples_with_history:]
X_previous_day = X_previous_day[X_previous_day.shape[0] - n_samples_with_history:]
# Fix: this array used to be sliced by its own length (a no-op), which left it
# longer than the others and shifted against them when indexed.
X_previous_week = X_previous_week[X_previous_week.shape[0] - n_samples_with_history:]
y_bid = y_bid[y_bid.shape[0] - n_samples_with_history:]
y_ask = y_ask[y_ask.shape[0] - n_samples_with_history:]

assert all(
    array.shape[0] == n_samples_with_history
    for array in (X_date, X_now, X_previous_hour, X_previous_day, X_previous_week, X_previous_month, y_bid, y_ask)
), "feature and target arrays must all have the same number of rows"

Create the indices of the training/validation/testing splits used for tuning and evaluation

In [14]:
def _first_hour_position(year, month):
    """Position, in the trimmed arrays, of the row stamped day 1, hour 0 of `year`/`month`.

    The row label is looked up in df_forex and then shifted by the
    N_PREVIOUS_MONTH_VALUES * 24 * 30 leading rows that were dropped from the
    feature and target arrays.

    Raises:
        IndexError: if df_forex contains no such row.
    """
    matches = df_forex[(df_forex.Year == year) & (df_forex.Month == month) & (df_forex.Day == 1) & (df_forex.Hour == 0)].index
    if len(matches) == 0:
        raise IndexError("No row for day 1, hour 0 of {}-{:02d} in df_forex".format(year, month))
    return matches[0] - N_PREVIOUS_MONTH_VALUES * 24 * 30


# Half-year boundaries used for tuning: 2020-01, 2020-07, 2021-01, 2021-07, 2022-01.
_tuning_bounds = [_first_hour_position(int(2020 + k / 2), 1 + ((6 * k) % 12)) for k in range(5)]

# Fold k trains on everything before boundary k and validates on the
# half-year between boundary k and boundary k + 1.
idx_tuning_trains = [range(0, stop) for stop in _tuning_bounds[:4]]
idx_tuning_validations = [range(start, stop) for start, stop in zip(_tuning_bounds[:4], _tuning_bounds[1:])]

# Final evaluation: train before 2021, validate on 2021, test on 2022.
_eval_bounds = [_first_hour_position(year, 1) for year in (2021, 2022, 2023)]

idx_eval_train = range(0, _eval_bounds[0])

idx_eval_validation = range(_eval_bounds[0], _eval_bounds[1])

idx_eval_test = range(_eval_bounds[1], _eval_bounds[2])

Dataset class

In [15]:
class ForexDataset(Dataset):
    """In-memory dataset of date, current and history features with bid/ask targets.

    The constructor copies the rows selected by `idx` from every array. The
    arrays start as NumPy arrays and are then meant to go through `scale`
    (which also reshapes the flat history arrays to
    (rows, n_previous_*_values, n_features)), `transfer_to_tensor`, and
    optionally `cuda` / `cpu`.
    """

    # Attributes cast to float32, and all data attributes in the order
    # in which __getitem__ returns them.
    _FLOAT_FIELDS = ('X_now', 'X_previous_hour', 'X_previous_day', 'X_previous_week', 'X_previous_month', 'y_bid', 'y_ask')
    _ALL_FIELDS = ('X_date',) + _FLOAT_FIELDS

    def __init__(self, X_date, X_now, X_previous_hour, X_previous_day, X_previous_week, X_previous_month, y_bid, y_ask, idx,
                 n_previous_hour_values, n_previous_day_values, n_previous_week_values, n_previous_month_values, n_features):
        """Select rows `idx` of each array; dates become int32, everything else float32."""
        self.X_date = X_date[idx].astype(np.int32)
        float_sources = (X_now, X_previous_hour, X_previous_day, X_previous_week, X_previous_month, y_bid, y_ask)
        for field, source in zip(self._FLOAT_FIELDS, float_sources):
            setattr(self, field, source[idx].astype(np.float32))

        # Sizes used by `scale` to reshape the flat history arrays.
        self.n_previous_hour_values = n_previous_hour_values
        self.n_previous_day_values = n_previous_day_values
        self.n_previous_week_values = n_previous_week_values
        self.n_previous_month_values = n_previous_month_values
        self.n_features = n_features

    def __len__(self):
        """Number of samples, taken from the bid target."""
        return self.y_bid.shape[0]

    def __getitem__(self, idx):
        """Return the 8-tuple (X_date, X_now, hour, day, week, month history, y_bid, y_ask) at `idx`."""
        return tuple(getattr(self, field)[idx] for field in self._ALL_FIELDS)

    def _replace_fields(self, convert):
        """Replace every data attribute by `convert(attribute)`."""
        for field in self._ALL_FIELDS:
            setattr(self, field, convert(getattr(self, field)))

    def fit_scalers(self, scaler_now, scaler_previous_hour, scaler_previous_day, scaler_previous_week, scaler_previous_month, scaler_y_bid, scaler_y_ask):
        """Fit each scaler on its array and return the seven `fit` results as a tuple.

        The targets are passed as single-column 2-D arrays.
        """
        fit_plan = (
            (scaler_now, self.X_now),
            (scaler_previous_hour, self.X_previous_hour),
            (scaler_previous_day, self.X_previous_day),
            (scaler_previous_week, self.X_previous_week),
            (scaler_previous_month, self.X_previous_month),
            (scaler_y_bid, self.y_bid[:, np.newaxis]),
            (scaler_y_ask, self.y_ask[:, np.newaxis]),
        )
        return tuple(scaler.fit(values) for scaler, values in fit_plan)

    def scale(self, scaler_now, scaler_previous_hour, scaler_previous_day, scaler_previous_week, scaler_previous_month, scaler_y_bid, scaler_y_ask):
        """Transform every array with its scaler, in place on the dataset.

        History arrays are additionally reshaped from flat rows to
        (rows, number of past values, n_features); targets are squeezed back
        after being transformed as single-column arrays.
        """
        self.X_now = scaler_now.transform(self.X_now)

        history_plan = (
            ('X_previous_hour', scaler_previous_hour, self.n_previous_hour_values),
            ('X_previous_day', scaler_previous_day, self.n_previous_day_values),
            ('X_previous_week', scaler_previous_week, self.n_previous_week_values),
            ('X_previous_month', scaler_previous_month, self.n_previous_month_values),
        )
        for field, scaler, n_values in history_plan:
            flat = getattr(self, field)
            setattr(self, field, scaler.transform(flat).reshape(flat.shape[0], n_values, self.n_features))

        self.y_bid = np.squeeze(scaler_y_bid.transform(self.y_bid[:, np.newaxis]))
        self.y_ask = np.squeeze(scaler_y_ask.transform(self.y_ask[:, np.newaxis]))

    def transfer_to_tensor(self):
        """Convert every NumPy data attribute to a torch tensor."""
        self._replace_fields(torch.from_numpy)

    def cuda(self):
        """Move every tensor attribute to the GPU."""
        self._replace_fields(lambda tensor: tensor.cuda())

    def cpu(self):
        """Move every tensor attribute to the CPU."""
        self._replace_fields(lambda tensor: tensor.cpu())

Create datasets

In [16]:
def _build_dataset(idx):
    """Wrap the rows `idx` of every feature/target array in a ForexDataset."""
    return ForexDataset(
        X_date, X_now, X_previous_hour, X_previous_day, X_previous_week, X_previous_month, y_bid, y_ask, idx,
        N_PREVIOUS_HOUR_VALUES, N_PREVIOUS_DAY_VALUES, N_PREVIOUS_WEEK_VALUES, N_PREVIOUS_MONTH_VALUES, N_FEATURES)


# One train/validation pair per tuning fold, then the three evaluation splits.
dataset_tuning_trains = list(map(_build_dataset, idx_tuning_trains))
dataset_tuning_validations = list(map(_build_dataset, idx_tuning_validations))
dataset_eval_train = _build_dataset(idx_eval_train)
dataset_eval_validation = _build_dataset(idx_eval_validation)
dataset_eval_test = _build_dataset(idx_eval_test)

Scale datasets

In [17]:
def _fresh_scalers():
    """One unfitted MinMaxScaler per scaled array (5 feature groups and 2 targets)."""
    return [MinMaxScaler() for _ in range(7)]


# Tuning folds: scalers are fitted on the fold's training split only, then
# applied to both the training and the validation split of that fold.
for dataset_tuning_train, dataset_tuning_validation in zip(dataset_tuning_trains, dataset_tuning_validations):
    _fold_scalers = dataset_tuning_train.fit_scalers(*_fresh_scalers())
    for _dataset in (dataset_tuning_train, dataset_tuning_validation):
        _dataset.scale(*_fold_scalers)

# Evaluation splits: fitted on the evaluation training split. These scalers
# keep their module-level names because the target scalers are needed again
# at evaluation time.
scaler_now, scaler_previous_hour, scaler_previous_day, scaler_previous_week, scaler_previous_month, scaler_y_bid, scaler_y_ask = \
    dataset_eval_train.fit_scalers(*_fresh_scalers())
_eval_scalers = (scaler_now, scaler_previous_hour, scaler_previous_day, scaler_previous_week, scaler_previous_month, scaler_y_bid, scaler_y_ask)
for _dataset in (dataset_eval_train, dataset_eval_validation, dataset_eval_test):
    _dataset.scale(*_eval_scalers)

Convert to PyTorch tensors

In [18]:
# Convert the NumPy arrays of every dataset to torch tensors: first each
# tuning train/validation pair, then the three evaluation splits.
for _fold in zip(dataset_tuning_trains, dataset_tuning_validations):
    for _dataset in _fold:
        _dataset.transfer_to_tensor()

for _dataset in (dataset_eval_train, dataset_eval_validation, dataset_eval_test):
    _dataset.transfer_to_tensor()

Save datasets

In [19]:
# Persist every prepared dataset as <name>.pt under the preprocessed-data
# directory (REPOSITORY_DATA_PREPROCESSED), creating the directory if needed.
os.makedirs(REPOSITORY_DATA_PREPROCESSED, exist_ok=True)

# File stem -> dataset. Tuning folds are numbered by their position in the lists.
datasets_to_save = {}
for fold, dataset in enumerate(dataset_tuning_trains):
    datasets_to_save['dataset_tuning_train_' + str(fold)] = dataset
for fold, dataset in enumerate(dataset_tuning_validations):
    datasets_to_save['dataset_tuning_validation_' + str(fold)] = dataset
datasets_to_save['dataset_eval_train'] = dataset_eval_train
datasets_to_save['dataset_eval_validation'] = dataset_eval_validation
datasets_to_save['dataset_eval_test'] = dataset_eval_test

for name, dataset in datasets_to_save.items():
    with open(os.path.join(REPOSITORY_DATA_PREPROCESSED, name + '.pt'), 'wb') as file:
        # The whole ForexDataset object is pickled, so loading the file
        # requires the class definition to be importable.
        torch.save(dataset, file, pickle_module=pickle, pickle_protocol=4)

# Y bid

In [20]:
# Target selected for the cells below; it is used in the study file path and
# to pick the matching target scaler at evaluation time.
TARGET = 'y_bid'
mlp.set_experiment("Forex EUR CHF Bid")

## MLP 0 layer hour memory

In [21]:
# Model identifier: passed to get_model/objective, used in the study file name
# and as the name under which the trained model is registered.
MODEL_NAME = 'MLP0HourMemory'
MODEL_VERSION = 1.0
# Patience and epoch budget passed to `objective` during tuning.
TUNING_PATIENCE = 5
TUNING_EPOCHS = 50
# Patience and epoch budget passed to `evaluate_for_testing` for the final run.
TESTING_PATIENCE = 10
TESTING_EPOCHS = 100

### Tuning

In [None]:
study_path = os.path.join(REPOSITORY_STUDIES, TARGET, 'study ' + TARGET + ' ' + MODEL_NAME + '.pkl')

# Make sure the per-target folder exists, otherwise the first checkpoint
# would fail after a full trial has already been computed.
os.makedirs(os.path.dirname(study_path), exist_ok=True)

# Resume a previously saved study when there is one, otherwise start fresh.
if os.path.exists(study_path):
    # NOTE: pickle.load can execute code embedded in the file; only load
    # study files produced by this notebook.
    with open(study_path, 'rb') as file:
        study = pickle.load(file)
else:
    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))

# Runs until the kernel is interrupted: one trial per iteration, with the
# study checkpointed to disk after every trial.
while True:

    study.optimize(
        lambda trial: objective(
            trial,
            dataset_tuning_trains,
            dataset_tuning_validations,
            MODEL_NAME,
            TARGET,
            patience=TUNING_PATIENCE,
            epochs=TUNING_EPOCHS),
        n_trials=1,
        timeout=None,
        n_jobs=1)

    # Dump to a temporary file and swap it into place, so that interrupting
    # the kernel in the middle of a dump cannot leave a truncated study file.
    study_tmp_path = study_path + '.tmp'
    with open(study_tmp_path, 'wb') as file:
        pickle.dump(study, file)
    os.replace(study_tmp_path, study_path)

### Visualization

In [None]:
# Load the saved study for the current TARGET / MODEL_NAME, failing early
# when no study file has been written yet.
_study_file = os.path.join(REPOSITORY_STUDIES, TARGET, 'study ' + TARGET + ' ' + MODEL_NAME + '.pkl')

if not os.path.exists(_study_file):
    raise Exception("Study do not exists")

with open(_study_file, 'rb') as file:
    study = pickle.load(file)

In [None]:
# Optimization history of the loaded study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Hyperparameter importances of the loaded study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Slice plot of the loaded study's hyperparameters.
optuna.visualization.plot_slice(study)

### Evaluation

In [22]:
# Load the tuned study for the current TARGET / MODEL_NAME.
if os.path.exists(os.path.join(REPOSITORY_STUDIES, TARGET, 'study ' + TARGET + ' ' + MODEL_NAME + '.pkl')):
    with open(os.path.join(REPOSITORY_STUDIES, TARGET, 'study ' + TARGET + ' ' + MODEL_NAME + '.pkl'), 'rb') as file:
        study = pickle.load(file)
else:
    raise Exception("Study do not exists")

# Build the model from the best hyperparameters found during tuning.
model = get_model(MODEL_NAME, **study.best_params)

# Train on the evaluation split and measure on the held-out test split.
# Validation and test are each evaluated as a single full-size batch.
model, loss_test_mse, loss_test_mae, loss_test_mse_unscaled, loss_test_mae_unscaled, relative_error, max_error_absolute, max_error_relative, epoch = evaluate_for_testing(
    model,
    dataset_eval_train,
    dataset_eval_validation,
    dataset_eval_test,
    scaler_target=scaler_y_bid if TARGET == 'y_bid' else scaler_y_ask,
    target=TARGET,
    optimizer=study.best_params['optimizer'],
    batch_size_train=study.best_params['batch_size_train'],
    batch_size_validation=len(dataset_eval_validation),
    batch_size_test=len(dataset_eval_test),
    learning_rate=study.best_params['learning_rate'],
    weight_decay=study.best_params['weight_decay'],
    patience=TESTING_PATIENCE,
    epochs=TESTING_EPOCHS)

mlp.set_version(MODEL_VERSION)

with mlp.run():

    mlp.pytorch.register_model(MODEL_NAME, model)
    mlp.log_params({
        "n_previous_hour_values": study.best_params.get("n_previous_hour_values", 0),
        "n_previous_day_values": study.best_params.get("n_previous_day_values", 0),
        "n_previous_week_values": study.best_params.get("n_previous_week_values", 0),
        "n_previous_month_values": study.best_params.get("n_previous_month_values", 0),
        "optimizer": study.best_params['optimizer'],
        # NOTE(review): logged as 2 ** value while the raw value is passed to
        # evaluate_for_testing above -- presumably the helper applies the same
        # power of two; confirm against helper.py.
        "batch_size_train": int(2 ** study.best_params['batch_size_train']),
        "learning_rate": study.best_params['learning_rate'],
        "weight_decay": study.best_params['weight_decay'],
        "patience": TESTING_PATIENCE,
        "epochs": TESTING_EPOCHS,
        "effective epochs": epoch
    })
    # Fix: the MAE and MSE labels used to be attached to each other's values.
    mlp.log_metrics({
        "MAE normalized * 1e6": loss_test_mae * 1e6,
        "MSE normalized * 1e6": loss_test_mse * 1e6,
        "MAE absolute * 1e6": loss_test_mae_unscaled * 1e6,
        "MSE absolute * 1e6": loss_test_mse_unscaled * 1e6,
        "Relative error * 1e6": relative_error * 1e6,
        "Max error absolute * 1e6": max_error_absolute * 1e6,
        "Max error relative * 1e6": max_error_relative * 1e6
    })

# Y ask

In [None]:
# Switch the target (and the tracker experiment) to the ask price for the
# cells that follow.
TARGET = 'y_ask'
mlp.set_experiment("Forex EUR CHF Ask")

# UI MlOps

In [None]:
# Start the PyMLPipe web UI on port 8085.
# NOTE(review): host '0.0.0.0' listens on every network interface, so the UI
# is reachable from other machines; use '127.0.0.1' if only local access is
# wanted -- confirm the intended exposure.
start_ui(host='0.0.0.0', port=8085)