Approaches

1. Training a model with simulation results
1. Training a model with other symbols
1. Training a model with perturbation
1. Training a model with the output of a statistical model
1. Training a model with weight averaging

In [17]:
import os

# Detect whether the notebook runs on Google Colab: the `google.colab` package
# is only importable there.
try:
  from google.colab import drive
except ImportError:
  IS_GOOGLE_COLAB = False
else:
  IS_GOOGLE_COLAB = True

# Resolve the data locations for the detected runtime.
if not IS_GOOGLE_COLAB:
  # Local run: data lives in a folder relative to the notebook.
  base_folder = '../../../Data'
  data_folder = os.path.join(base_folder, "FX", "OANDA-Japan MT5 Live")
else:
  # Colab run: data lives under the Google Drive mount point.
  mount_path = '/content/drive'
  base_folder = os.path.join(mount_path, "My Drive", "Data")
  data_folder = os.path.join(base_folder, "FX")

In [18]:
import os
import shutil
import zipfile
import requests

def download_modlue_from_gh(repository, github_account='Naradice', branch='master', folder=None, module_path='/gdrive/My Drive/modules', **kwargs):
  """Download one folder of a GitHub repository archive into ``module_path``.

  The branch archive (``.../archive/refs/heads/<branch>.zip``) is downloaded,
  extracted into a private temporary directory, and ``<folder>`` from the
  extracted tree is copied to ``<module_path>/<folder>`` (existing files are
  overwritten, see ``dirs_exist_ok``).

  Args:
    repository: Name of the GitHub repository.
    github_account: Owner of the repository.
    branch: Branch whose archive is downloaded.
    folder: Folder inside the repository to copy; defaults to ``repository``.
    module_path: Destination directory that receives ``folder``.
    **kwargs: Extra keys are accepted so a repository description dict can be
      splatted into the call. Only ``timeout`` (seconds, default 60) is read;
      every other key is ignored.

  Returns:
    None. On a non-200 response a message is printed and nothing is copied.
  """
  # Function-scope import keeps the notebook's import cell unchanged.
  import tempfile

  if folder is None:
    folder = repository

  zip_url = f"https://github.com/{github_account}/{repository}/archive/refs/heads/{branch}.zip"
  # Without a timeout a stalled connection would block the cell forever.
  response = requests.get(zip_url, timeout=kwargs.get("timeout", 60))
  if response.status_code != 200:
    print(f"filed to download {zip_url}: {response.status_code}, {response.text}")
    return

  # Work inside a unique temporary directory: it is removed even when extraction
  # or copying raises, and it cannot clash with (or delete) a pre-existing
  # "temp.zip" / "temp_dir" in the current working directory.
  with tempfile.TemporaryDirectory() as work_dir:
    archive_path = os.path.join(work_dir, "temp.zip")
    extract_dir = os.path.join(work_dir, "temp_dir")

    with open(archive_path, "wb") as f:
      f.write(response.content)
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
      zip_ref.extractall(extract_dir)

    # The archive's top-level directory is named "<repository>-<branch>".
    source_folder = os.path.join(extract_dir, f"{repository}-{branch}", folder)
    destination_folder = os.path.join(module_path, folder)
    shutil.copytree(source_folder, destination_folder, dirs_exist_ok=True)

In [12]:
# Decide where the helper packages are stored: on the mounted Google Drive when
# running on Colab, otherwise in a folder relative to the notebook.
if IS_GOOGLE_COLAB:
  drive.mount(mount_path)
  module_path = f"{mount_path}/My Drive/modules"
else:
  module_path = '../../modules'

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(module_path, exist_ok=True)

# Repositories to fetch. 'folder' is the directory copied out of each archive;
# set 'refresh' to True to download again even if the folder already exists.
repositories = [
    {'repository': 'stocknet_study', 'branch': 'master', 'folder': 'Dataset', 'refresh': False},
    {'repository': 'finance_process', 'branch': 'master', 'folder': 'fprocess', 'refresh': False},
    {'repository': 'cloud_storage_handler', 'branch': 'main', 'folder': 'cloud_storage_handler', 'refresh': False},
]

# Create an empty __init__.py in the modules folder if it is missing.
destination = os.path.join(module_path, '__init__.py')
if not os.path.exists(destination):
  # The context manager closes the file; no explicit close() is needed.
  with open(destination, mode='w'):
    pass

# Download every repository folder that is missing or flagged for refresh.
for repo_kwargs in repositories:
  destination = os.path.join(module_path, repo_kwargs['folder'])
  if repo_kwargs['refresh'] or not os.path.exists(destination):
    download_modlue_from_gh(**repo_kwargs, module_path=module_path)

In [13]:
import math

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import (TransformerDecoder, TransformerDecoderLayer,
                      TransformerEncoder, TransformerEncoderLayer)

In [14]:
import sys, os

# Make the packages stored under `module_path` importable.
sys.path.append(module_path)
# Prefer the `fprocess` submodule inside the `fprocess` package; if that import
# fails, bind the top-level `fprocess` package under the same name instead.
try:
    from fprocess import fprocess
except ImportError:
    import fprocess
    
from Dataset import utils

In [15]:
# Initialize cloud storage handler if needed.
# `storage_handler` is later passed to the logger and to the checkpoint loader.
from cloud_storage_handler import DropboxHandler


# To use Dropbox, uncomment the two lines below.
# NOTE(review): the first argument looks like an application key/identifier —
# consider loading it from an environment variable instead of committing it; confirm.
# storage_handler = DropboxHandler("nhjrq1cjpugk5hc", "http://localhost")
# storage_handler.authenticate()
# Otherwise, specify None
storage_handler = None

## Agent Simulation

### Deterministic Simulation Dataset

In [19]:
from Dataset.generator import AgentSimulationTrainDataGenerator

agent_num = 300
parallel_model_count = 3

# Settings shared by all three configurations; only "initial_positions" differs.
_common_agent_config = {"spread": 0.1, "max_volatility": 0.05, "min_volatility": 0.001}

# Number of leading agents that start at +1 in the 60% / 40% configurations.
# Derived from agent_num so the split follows the agent count when it changes.
_plus_count_02 = int(agent_num*0.6)
_plus_count_03 = int(agent_num*0.4)

# Alternating start: +1, -1, +1, ...
agent_config_01 = {
    **_common_agent_config, "initial_positions": [(-1)**i for i in range(agent_num)]
}
# The first 60% of agents start at +1, the rest at -1.
agent_config_02 = {
    **_common_agent_config, "initial_positions": [1 if i < _plus_count_02 else -1 for i in range(agent_num)]
}
# The first 40% of agents start at +1, the rest at -1.
agent_config_03 = {
    **_common_agent_config, "initial_positions": [1 if i < _plus_count_03 else -1 for i in range(agent_num)]
}
# One configuration per model (parallel_model_count entries).
agent_config = [agent_config_01, agent_config_02, agent_config_03]

# Each generated sample holds `observation_length` steps followed by
# `prediction_length` steps.
observation_length = 60
prediction_length = 10
total_length = observation_length + prediction_length
batch_size = 64

In [20]:
# Load the tick archive; the first column becomes the (date-parsed) index.
df = pd.read_csv(f"{data_folder}/OANDA_2021_tick.zip", parse_dates=True, index_col=0)
# Keep the raw tick index; it is handed to the data generator later.
index = df.index

# Resample to OHLC bars, drop empty bars, and take the row-to-row difference
# of the "price" OHLC columns.
ohlc_diff = df.resample("MIN").ohlc().dropna()["price"].diff()
del df

# Only the summary statistics and the column names are needed afterwards.
stats = ohlc_diff.describe()
columns = list(ohlc_diff.columns)
del ohlc_diff

In [21]:
# Per-column minimum and maximum taken from the summary statistics.
min_values, max_values = stats.loc["min"], stats.loc["max"]

# Pre-processing pipeline: a diff step followed by a min-max step that is
# configured with the statistics above.
diff_p = fprocess.DiffPreProcess(columns=columns)
standalization_p = fprocess.MinMaxPreProcess(
    columns=columns,
    min_values=min_values,
    max_values=max_values,
)
processes = [diff_p, standalization_p]

In [22]:
# Build the simulation-backed training data generator from the settings above.
data_generator = AgentSimulationTrainDataGenerator(
    agent_per_model=agent_num,
    output_length=total_length,
    model_count=parallel_model_count,
    sample_timeindex=index,
    model_config=agent_config,
    processes=processes,
    batch_first=True,
    batch_size=batch_size,
)

In [14]:
# Feed previously generated simulation data to the generator.
# Remove this cell if earlier data should not influence the training.
import glob

threathold = 1/10
files = glob.glob(f"{data_folder}/simulations/*.csv")

# A saved series is kept only if it is longer than this fraction of the tick index.
min_rows = len(index) * threathold
tick_data_list = [
    tick_srs
    for tick_srs in (pd.read_csv(file, index_col=0, parse_dates=True)["0"] for file in files)
    if len(tick_srs) > min_rows
]

if tick_data_list:
    print(f"load {len(tick_data_list)} data")
    data_generator.add_multiple_data(tick_data_list)
del tick_data_list

### Define Model

In [35]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding added to encoder and decoder inputs.

    A table of sine (even columns) and cosine (odd columns) values is
    precomputed for ``max_len`` positions and stored as the buffer ``pe``:
    shape ``(1, max_len, d_model)`` when ``batch_first`` is True, otherwise
    ``(max_len, 1, d_model)``.

    ``forward(src, tgt)`` adds positions ``0 .. src_len-1`` to ``src`` and
    positions ``src_len-1 .. src_len+tgt_len-2`` to ``tgt`` (the first target
    step shares its position with the last source step), then applies dropout
    to both.

    Args:
        d_model: Size of the feature dimension. Odd values are supported.
        max_len: Number of positions in the precomputed table.
        dropout: Dropout probability applied after adding the encoding.
        batch_first: Whether inputs are ``(batch, seq, feature)`` (True) or
            ``(seq, batch, feature)`` (False).
    """

    def __init__(self, d_model, max_len=5000, dropout=0.05, batch_first=True):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one cosine column fewer than sine columns;
        # slicing keeps the shapes aligned (a no-op for even d_model).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(-2)
        if batch_first:
            pe = pe.transpose(0, 1)

        self.register_buffer("pe", pe)

    def forward(self, src, tgt):
        """Return ``(src, tgt)`` with positional encoding and dropout applied."""
        # Dispatch in a regular method rather than rebinding ``self.forward`` to
        # a name-mangled bound method on each instance.
        if self.batch_first:
            return self.__fforward(src, tgt)
        return self.__mforward(src, tgt)

    def __mforward(self, src, tgt):
        # Sequence-first layout: the sequence axis is dimension 0.
        src_pos = src.size(0)
        tgt_pos = src_pos + tgt.size(0) - 1
        return self.dropout(src + self.pe[:src_pos, :]), self.dropout(tgt + self.pe[src_pos-1:tgt_pos, :])

    def __fforward(self, src, tgt):
        # Batch-first layout: the sequence axis is dimension 1.
        src_pos = src.size(1)
        tgt_pos = src_pos + tgt.size(1) - 1
        return self.dropout(src + self.pe[:, :src_pos, :]), self.dropout(tgt + self.pe[:, src_pos-1:tgt_pos, :])

In [39]:
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder transformer operating directly on feature vectors.

    The model adds positional encoding to the source and target sequences,
    encodes the source, and decodes the target against the encoder memory.
    No input embedding or output projection is applied, so the output has
    ``feature_size`` features per step.
    """

    def __init__(
        self, num_encoder_layers: int, num_decoder_layers: int,
        feature_size: int, batch_first=True,
        dim_feedforward:int = 512, dropout:float = 0.1, nhead:int = 8
    ):
        """Build the positional encoding, the encoder stack and the decoder stack."""
        super().__init__()

        # Settings shared by the encoder and decoder layers.
        layer_kwargs = dict(
            d_model=feature_size,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=batch_first,
        )

        self.positional_encoding = PositionalEncoding(feature_size, dropout=dropout, batch_first=batch_first)
        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(**layer_kwargs), num_layers=num_encoder_layers
        )
        self.transformer_decoder = TransformerDecoder(
            TransformerDecoderLayer(**layer_kwargs), num_layers=num_decoder_layers
        )

    def forward(
        self, src: Tensor, tgt: Tensor, mask_tgt: Tensor,
        mask_src: Tensor=None, padding_mask_src: Tensor=None, padding_mask_tgt: Tensor=None,
        memory_key_padding_mask: Tensor=None
    ):
        """Run the encoder on ``src`` and the decoder on ``tgt``.

        ``mask_tgt`` is the attention mask for the target sequence; the
        remaining masks are optional and forwarded unchanged. No memory mask
        is used.
        """
        encoded_src, encoded_tgt = self.positional_encoding(src, tgt)
        memory = self.transformer_encoder(
            encoded_src, mask=mask_src, src_key_padding_mask=padding_mask_src
        )
        return self.transformer_decoder(
            encoded_tgt,
            memory,
            tgt_mask=mask_tgt,
            memory_mask=None,
            tgt_key_padding_mask=padding_mask_tgt,
            memory_key_padding_mask=memory_key_padding_mask,
        )

In [40]:
# Window size (in batches) of the moving average used for early stopping in train().
SMA=100

def train(obs_length, model, generator, optimizer, criterion, device, logger=None):
    """Train ``model`` on batches from ``generator`` until its loss stops improving.

    Each batch is split along the sequence axis (batch-first layout) into an
    observed part (first ``obs_length`` steps) and a target part. The model
    receives the target shifted right and is scored against the target shifted
    left (teacher forcing) under a causal mask.

    Once at least ``SMA`` batches were processed, the mean of the last ``SMA``
    losses is evaluated every 10 batches; training stops the first time this
    moving average is larger than the previously recorded one.

    Args:
        obs_length: Number of leading time steps used as encoder input.
        model: Module called as ``model(src=..., tgt=..., mask_tgt=...)``.
        generator: Iterable yielding batch tensors indexed ``[batch, time, ...]``.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, target)``.
        device: Device for the causal mask and the batch tensors.
        logger: Optional object with ``add_training_log(loss, value)``; it
            receives every evaluated moving-average loss.

    Returns:
        Mean of all per-batch losses (NaN when the generator yields nothing).
    """
    model.train()
    # A plain list avoids re-allocating an array on every batch (np.append is O(n)).
    losses = []
    sma_loss = np.inf
    for observations in generator:
        # The mask is created on `device`, so the batch must live there as well
        # (a no-op when the generator already yields tensors on `device`).
        observations = observations.to(device)

        # assume batch_first=True
        src = observations[:, :obs_length]
        tgt = observations[:, obs_length:]

        # Decoder input is the target without its last step; the expected
        # output is the target without its first step.
        input_tgt = tgt[:, :-1]
        output_tgt = tgt[:, 1:]

        mask_tgt = nn.Transformer.generate_square_subsequent_mask(input_tgt.size(1)).to(device)

        optimizer.zero_grad()
        logits = model(
            src=src, tgt=input_tgt,  mask_tgt=mask_tgt
        )
        loss = criterion(logits, output_tgt)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        if len(losses) >= SMA and len(losses) % 10 == 0:
            mean_loss = np.mean(losses[-SMA:])
            if logger is not None:
                logger.add_training_log(mean_loss, 0)
            if sma_loss >= mean_loss:
                sma_loss = mean_loss
            else:
                # The moving average got worse: stop this training pass.
                break

    if not losses:
        # Nothing was trained; return NaN without numpy's empty-mean warning.
        return np.float64(np.nan)
    return np.mean(losses)

In [41]:
def create_model(num_encoder_layers, num_decoder_layers, feature_size, dim_feedforward, dropout, nhead, **kwargs):
    """Build a ``Seq2SeqTransformer`` from a hyper-parameter dictionary.

    Extra keyword arguments are accepted and ignored, so a larger parameter
    dictionary can be splatted into the call.
    """
    hyper_params = dict(
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        feature_size=feature_size,
        dim_feedforward=dim_feedforward,
        dropout=dropout,
        nhead=nhead,
    )
    return Seq2SeqTransformer(**hyper_params)

In [42]:
model_version = 1
model_name = f"pretrainiing_ohlc_{data_generator.sampler_rule}"
# Hyper parameters. If model name and version is already used, load it from params file instead.
nhead = 2
dim_feedforward = 1
num_encoder_layers = 1
num_decoder_layers = 1
dropout = 0.1

criterion = nn.MSELoss()
optimizer_class=torch.optim.Adam
lr = 0.005
scheduler_class=torch.optim.lr_scheduler.StepLR
gamma = 0.95
device = utils.get_device()
logger = utils.Logger(model_name, model_version, base_folder, storage_handler=storage_handler)

# Try to resume from an existing checkpoint of this model name/version.
success, model_params, model, optimizer, scheduler, best_train_loss = logger.load_model_checkpoint(create_model, model_name, model_version, 
                                                                        train=True, storage_handler=storage_handler,
                                                                        optimizer_class=optimizer_class,
                                                                        scheduler_class=scheduler_class)

if success is False:
    print("Initialize a new model.")
    if model_params is None:
        model_params = {
            "nhead": nhead,
            "dim_feedforward": dim_feedforward,
            "num_encoder_layers": num_encoder_layers,
            "num_decoder_layers": num_decoder_layers,
            "dropout": dropout,
            # One model feature per data column (the OHLC columns collected earlier).
            "feature_size": len(columns)
        }
    if model is None:
        model = create_model(
            **model_params
        ).to(device)
        optimizer = optimizer_class(model.parameters(), lr=lr)
        scheduler = scheduler_class(optimizer, step_size=1, gamma=gamma)

    # Xavier initialisation is applied only when no checkpoint was loaded;
    # running it after a successful load would overwrite the restored weights.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

# Count the trainable parameters.
params_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"params: {params_num}")

exsisting model not found.
Initialize a new model.
params: 306


In [43]:
# Collect everything needed to reproduce this run: pre-processing, data source,
# data layout and model hyper parameters.
params = {
    "processes": fprocess.preprocess_to_params(processes),
    "source": {"type": "simulation", "agent_num": agent_num, "agent_config": agent_config},
    "feature_size": len(columns),
    "features": columns,
    "batch_size": batch_size,
    "observation_length": observation_length,
    "prediction_length": prediction_length,
}
# Model hyper parameters override same-named entries above (e.g. "feature_size").
params.update(model_params)
params.update({"params_num": params_num, "version": 2})

logger.save_params(params, model_name, model_version)

print("training log will be saved on ", logger.log_file_path)

training log will be saved on  ../../../Data\pretrainiing_ohlc_MIN\pretrainiing_ohlc_MIN_v1.csv


### Training

In [44]:
import copy

# Maximum number of training passes and early-stopping settings.
epoch = 500
# NOTE(review): this overwrites the best_train_loss returned by the checkpoint
# loader, so a resumed run always treats its first pass as the best — confirm intent.
best_train_loss = np.inf
best_train_model = None
patience = 3
counter = 0

for loop in range(1, epoch + 1):
  loss_train = train(
      obs_length=observation_length,
      model=model, generator=data_generator, optimizer=optimizer,
      criterion=criterion, device=device, logger=logger
  )

  if best_train_loss > loss_train:
    best_train_loss = loss_train
    # Snapshot the weights. Keeping a plain reference to `model` would be
    # mutated by later passes, so the "best" model would always be the latest.
    best_train_model = copy.deepcopy(model)
    counter = 0
  else:
    # No improvement: count it and decay the learning rate.
    counter += 1
    scheduler.step()
  print(f"epoc: {loop}, loss: {loss_train}")
  logger.add_training_log(loss_train, 0.0)

  # Stop once more than `patience` consecutive passes failed to improve.
  if counter > patience:
    break

In [19]:
# Persist the best model from the training loop together with the optimizer and
# scheduler objects and the best training loss.
logger.save_checkpoint(best_train_model, optimizer, scheduler, model_name, model_version, best_train_loss)
# Save the generator's tick data into the simulations folder; CSV files in that
# folder are read back by the "previously generated data" cell.
data_generator.save_ticks(f"{data_folder}/simulations")

In [None]:
if IS_GOOGLE_COLAB:
    # Run this cell if you want to disconnect the runtime after the training ends.
    from google.colab import runtime

    # Flush and unmount Google Drive before releasing the runtime.
    drive.flush_and_unmount()
    runtime.unassign()