Approraches

1. Training a model with simulation results
1. Training a model with another symbols
1. Training a model with pertubation
1. Training a model with output of statistic model
1. Training a model with weight average

In [8]:
import os

try:
  from google.colab import drive
  IS_GOOGLE_COLAB = True
except ImportError:
  IS_GOOGLE_COLAB = False

if IS_GOOGLE_COLAB:
  mount_path = '/content/drive'
  base_folder = os.path.join(mount_path, "My Drive", "Data")
  data_folder = os.path.join(base_folder, "FX")
else:
  base_folder = '../../../Data'
  data_folder = os.path.join(base_folder, "FX", "OANDA-Japan MT5 Live")

In [2]:
import os
import shutil
import sys
import zipfile
import requests

def download_modlue_from_gh(repository, github_account='Naradice', branch='master', folder=None, module_path='/gdrive/My Drive/modules', **kwargs):
  if folder is None:
    folder = repository

  zip_url = f"https://github.com/{github_account}/{repository}/archive/refs/heads/{branch}.zip"
  response = requests.get(zip_url)
  if response.status_code == 200:
    with open("temp.zip", "wb") as f:
      f.write(response.content)
    with zipfile.ZipFile("temp.zip", "r") as zip_ref:
      zip_ref.extractall("temp_dir")

    source_folder = f"temp_dir/{repository}-{branch}/{folder}"
    destination_folder = os.path.join(module_path, folder)
    shutil.copytree(source_folder, destination_folder, dirs_exist_ok=True)
    os.remove("temp.zip")
    shutil.rmtree("temp_dir")
  else:
    print(f"filed to download {zip_url}: {response.status_code}, {response.text}")

In [3]:
if IS_GOOGLE_COLAB:
  drive.mount(mount_path)
  module_path = f"{mount_path}/My Drive/modules"
else:
  module_path = '../../modules'

if os.path.exists(module_path) is False:
  os.makedirs(module_path)

repositories = [
    {'repository': 'stocknet_study', 'branch': 'master', 'folder': 'Dataset', 'refresh': False},
    {'repository': 'finance_process', 'branch': 'master', 'folder': 'fprocess', 'refresh': False},
    {'repository': 'cloud_storage_handler', 'branch': 'main', 'folder': 'cloud_storage_handler', 'refresh': False},
]

destination = os.path.join(module_path, '__init__.py')
if os.path.exists(destination) is False:
  with open(destination, mode='w') as fp:
    fp.close()

for repo_kwargs in repositories:
  destination = os.path.join(module_path, repo_kwargs['folder'])
  if repo_kwargs['refresh'] or os.path.exists(destination) is False:
    download_modlue_from_gh(**repo_kwargs, module_path=module_path)

In [4]:
import math

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import (TransformerDecoder, TransformerDecoderLayer,
                      TransformerEncoder, TransformerEncoderLayer)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import sys, os


if IS_GOOGLE_COLAB:
    module_path = os.path.abspath("/content/drive/My Drive/modules")
    sys.path.append(module_path)
    import fprocess
else:
    module_path = os.path.abspath("../..")
    sys.path.append(module_path)
    from fprocess import fprocess

## Agent Simulation

### Determinisitc Simulation Dataset

In [6]:
from Dataset.generator import AgentSimulationTrainDataGenerator

agent_num = 300
parallel_model_count = 3
agent_config_01 = {
    "spread":0.1, "max_volatility":0.05, "min_volatility":0.001, "initial_positions": [(-1)**i for i in range(agent_num)]
}
agent_config_02 = {
    "spread":0.1, "max_volatility":0.05, "min_volatility":0.001, "initial_positions": [1 if i < int(300*0.6) else -1 for i in range(agent_num)]
}
agent_config_03 = {
    "spread":0.1, "max_volatility":0.05, "min_volatility":0.001, "initial_positions": [1 if i < int(300*0.4) else -1 for i in range(agent_num)]
}
agent_config = [agent_config_01, agent_config_02, agent_config_03]
obs_total_seconds = 60 * 30
batch_size = 64

In [10]:
df = pd.read_csv(f"{data_folder}/OANDA_2021_tick.zip", parse_dates=True, index_col=0)
index = df.index
df = df.resample("MIN").ohlc().dropna()
df = df.price.diff()
stats = df.describe()
columns = df.columns
del df

In [11]:
min_values = stats.loc["min"]
max_values = stats.loc["max"]
diff_p = fprocess.DiffPreProcess(columns=["open", "high", "low", "close"])
standalization_p = fprocess.MinMaxPreProcess(columns=["open", "high", "low", "close"], min_values=min_values, max_values=max_values)
processes = [diff_p, standalization_p]

In [12]:
data_generator = AgentSimulationTrainDataGenerator(agent_per_model = 300, output_length=70, model_count=parallel_model_count, sample_timeindex=index, model_config=agent_config,
                                                   processes=processes, batch_first=True, batch_size=batch_size)

### Define Model

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000, dropout=0.05):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(-2)
        #pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, src, tgt):
        src_pos = src.size(0)
        tgt_pos = src_pos + tgt.size(0) - 1
        return self.dropout(src + self.pe[:src_pos, :]), self.dropout(tgt + self.pe[src_pos-1:tgt_pos, :])

In [None]:
class Seq2SeqTransformer(nn.Module):

    def __init__(
        self, num_encoder_layers: int, num_decoder_layers: int,
        feature_size: int, batch_first=True,
        dim_feedforward:int = 512, dropout:float = 0.1, nhead:int = 8
    ):

        super(Seq2SeqTransformer, self).__init__()

        self.positional_encoding = PositionalEncoding(feature_size, dropout=dropout)

        encoder_layer = TransformerEncoderLayer(
            d_model=feature_size, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=batch_first
        )
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        decoder_layer = TransformerDecoderLayer(
            d_model=feature_size, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=batch_first
        )
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

    def forward(
        self, src: Tensor, tgt: Tensor, mask_tgt: Tensor,
        mask_src: Tensor=None, padding_mask_src: Tensor=None, padding_mask_tgt: Tensor=None,
        memory_key_padding_mask: Tensor=None
    ):

        src, tgt = self.positional_encoding(src, tgt)
        memory = self.transformer_encoder(src, mask_src, padding_mask_src)
        outs = self.transformer_decoder(
            tgt, memory, mask_tgt, None,
            padding_mask_tgt, memory_key_padding_mask
        )
        return outs

In [None]:
SMA=100

def train(obs_length, model, generator, optimizer, criterion, batch_size, device):

    model.train()
    losses = np.array([])
    sma_loss = np.inf
    for observations in generator:

        # assume batch_first=True
        src = observations[:, :obs_length]
        tgt = observations[:, obs_length:]

        input_tgt = tgt[:, :-1]

        mask_tgt = nn.Transformer.generate_square_subsequent_mask(input_tgt.size(1)).to(device)
        logits = model(
            src=src, tgt=input_tgt,  mask_tgt=mask_tgt
        )

        optimizer.zero_grad()

        output_tgt = tgt[:, 1:]
        loss = criterion(logits, output_tgt)
        loss.backward()
        optimizer.step()
        loss_value = loss.item()
        losses = np.append(losses, loss_value)
        if len(losses) >= SMA:
          if len(losses) % 10 == 0:
            mean_loss = losses[-SMA:].mean()
            if sma_loss >= mean_loss:
              sma_loss = mean_loss
            else:
              break
    return losses.mean()

In [None]:
model_version = 1
model_name = f"pretrainiing_ohlc_{data_generator.sampler_rule}_v{model_version}"

nhead = 2
dim_feedforward = 1
num_encoder_layers = 1
num_decoder_layers = 1
dropout = 0.1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = Seq2SeqTransformer(
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    feature_size=4,
    dim_feedforward=dim_feedforward,
    dropout=dropout, nhead=nhead,
).to(device)

In [None]:
lr = 0.0005

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma = 0.95)

### Training

In [None]:
model_path = f"{data_folder}/{model_name}.state"

if os.path.exists(model_path):
    if torch.cuda.is_available():
        check_point = torch.load(model_path)
    else:
        check_point = torch.load(model_path, map_location=torch.device('cpu'))
    model.load_state_dict(check_point['model_state_dict'])
    optimizer.load_state_dict(check_point['optimizer_state_dict'])
    scheduler.load_state_dict(check_point['scheduler_state_dict'])
else:
    print("start training a new model")

In [None]:
epoch = 500
best_train_loss = np.inf
best_train_model = None
patience = 3
counter = 0

for loop in range(1, epoch + 1):
  loss_train = train(
      obs_length=60,
      model=model, generator=data_generator, optimizer=optimizer,
      criterion=criterion, batch_size=batch_size,
      device=device
  )

  if best_train_loss > loss_train:
    best_train_loss = loss_train
    best_train_model = model
    counter = 0
  else:
    counter += 1
    scheduler.step()
  print(f"epoc: {loop}, loss: {loss_train}")

  if counter > patience:
    break

40
epoc: 1, loss: 0.0008273472718428821
80
epoc: 2, loss: 0.0007707675016717985
80
epoc: 3, loss: 0.0007932903674372938
40
epoc: 4, loss: 0.0007469542222679592
50
epoc: 5, loss: 0.0007474757928866893
40
epoc: 6, loss: 0.0007578721852041781
40
epoc: 7, loss: 0.0007321291137486696
40
epoc: 8, loss: 0.0006980493431910873
70
epoc: 9, loss: 0.0006928286881053022
40
epoc: 10, loss: 0.0007172456593252718
80
epoc: 11, loss: 0.0006900915890582837
50
epoc: 12, loss: 0.0006746361823752522
40
epoc: 13, loss: 0.0006745217106072232
40
epoc: 14, loss: 0.0006751551729394123
110
epoc: 15, loss: 0.0006756118350577625
60
epoc: 16, loss: 0.0006489181279903278
40
epoc: 17, loss: 0.0006655260833213106
40
epoc: 18, loss: 0.0006402997649274766
50
epoc: 19, loss: 0.0006138340005418286
40
epoc: 20, loss: 0.0006358670769259333
40
epoc: 21, loss: 0.0006139543402241543
50
epoc: 22, loss: 0.0006232692324556411
40
epoc: 23, loss: 0.0006505652767373249


In [None]:
torch.save({
  'model_state_dict': best_train_model.state_dict(),
  'optimizer_state_dict': optimizer.state_dict(),
  'scheduler_state_dict': scheduler.state_dict(),
}, f"{data_folder}/{model_name}.state")

In [None]:
import datetime

os.makedirs(f"{data_folder}/simulations", exist_ok=True)

date_str = datetime.datetime.now().isoformat()
for index, data in enumerate(data_generator.row_data):
    sim_srs = pd.Series(data, index=data_generator.sample_timemindex[:len(data)])
    sim_srs.to_csv(f"{data_folder}/simulations/{date_str}_{index}.csv")