# Another BaseLine Transformer model

Caution: This is not a typical use case for a Transformer.

The Transformer developed by Google usually uses an embedding layer to represent a target such as a word.

In this model, I use the Close values directly as the state vector, although the same embedding approach could also be taken for finance data.
As the features are very limited, this serves as a baseline for Transformer models.

Parameters:
- Input:  ohlc_df["close"].iloc[index: index+data_length], repeated for observation_length consecutive start indices
- Target: ohlc_df["close"].iloc[index + observation_length:index + observation_length + data_length]

## Data Preparation

In [2]:
import os
# Names of the four OHLC price columns present in the CSV.
ohlc_column = ('open','high','low','close')
# Absolute path of the USDJPY data file. The relative part is resolved
# against the current working directory, so the result depends on where the
# notebook kernel was started.
file_path = os.path.abspath('../Data/mt5_USDJPY_min30.csv')

In [3]:
import pandas as pd
# Load the whole CSV into memory; file_path is defined in the previous cell.
# NOTE(review): the displayed output contains an 'Unnamed: 0' column, which
# suggests the file's first column is a saved index (index_col=0 would absorb
# it) - confirm before changing, since later cells may rely on the layout.
df = pd.read_csv(file_path)
# Bare expression: in a notebook this renders the DataFrame as the cell output.
df

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume
0,2014-07-07 08:30:00,102.086,102.122,102.081,102.102,738,3,0
1,2014-07-07 09:00:00,102.102,102.146,102.098,102.113,1036,3,0
2,2014-07-07 09:30:00,102.113,102.115,102.042,102.044,865,3,0
3,2014-07-07 10:00:00,102.047,102.052,102.005,102.019,983,3,0
4,2014-07-07 10:30:00,102.017,102.025,101.918,101.941,1328,3,0
...,...,...,...,...,...,...,...,...
100715,2022-08-12 21:30:00,133.461,133.506,133.439,133.484,1125,3,0
100716,2022-08-12 22:00:00,133.484,133.530,133.437,133.475,1277,3,0
100717,2022-08-12 22:30:00,133.475,133.486,133.433,133.483,1506,3,0
100718,2022-08-12 23:00:00,133.484,133.536,133.465,133.521,1038,3,0


## Define Dataset

In [None]:
import math
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import (TransformerDecoder, TransformerDecoderLayer,
                      TransformerEncoder, TransformerEncoderLayer)
from tqdm import tqdm

In [1]:
from CommonDataset import Dataset


class CloseDataset(Dataset):
    """Sliding-window dataset built from a single price column.

    Every sample is identified by a row position ``i`` of the underlying data:

    * the source is a sequence of ``observation_length`` windows whose start
      rows are ``i - observation_length, ..., i - 1``;
    * the target is a sequence of ``future_step_size`` windows whose start
      rows are ``i, ..., i + future_step_size - 1``.

    Each window spans ``data_length`` consecutive rows of the selected columns
    and is flattened into one vector of ``data_length * len(columns)`` values.

    Args:
        df: Source data; forwarded to the base ``Dataset``.
        close_column: Name of the price column; forwarded to the base class
            wrapped in a one-element list.
        volume_columns: Optional list of volume column names forwarded to the
            base class. ``None`` (the default) means "no volume columns".
        data_length: Number of consecutive rows in each window.
        observation_length: Number of windows in each source sequence.
        device: Forwarded to the base class (presumably the torch device the
            batches are created on - see ``self.device`` usage below).
        init_type: Forwarded unchanged to the base class.
        future_step_size: Number of windows in each target sequence.
        seed: Forwarded unchanged to the base class.
        is_training: Selects the training part (``True``) or the held-out part
            (``False``) of the shuffled sample positions.
    """

    def __init__(
        self,
        df,
        close_column,
        volume_columns=None,
        data_length=15,
        observation_length: int = 15,
        device="cuda",
        init_type="log",
        future_step_size=1,
        seed=1017,
        is_training=True,
    ):
        # Assigned before the base initialiser runs because _init_indicies
        # reads it. NOTE(review): presumably the base __init__ is what calls
        # _init_indicies - confirm against CommonDataset.
        self.data_length = data_length
        # A fresh list per instance instead of a shared mutable default.
        if volume_columns is None:
            volume_columns = []
        super().__init__(
            df,
            [close_column],
            volume_columns,
            observation_length,
            device,
            init_type,
            future_step_size,
            seed,
            is_training,
        )

    def _init_indicies(self, data, split_ratio=0.7):
        """Shuffle the valid sample positions and select the active split.

        Valid positions are ``observation_length + 1 .. length - 1`` where
        ``length = len(data) - future_step_size - data_length``, so that every
        source and target window stays inside ``data``.

        The method name keeps its original spelling because it is part of the
        interface shared with the base class.

        Args:
            data: Sized container holding the rows of the dataset.
            split_ratio: Fraction of ``length`` used as the split position
                between the training part and the held-out part.

        Raises:
            ValueError: If ``data`` is too short to provide any valid range of
                sample positions for the configured window sizes.
        """
        length = len(data) - self._future_step_size - self.data_length
        # The original guard (length <= 0) let 0 < length <= observation_length
        # through, which then failed inside random.sample with a negative k.
        if length <= 0 or length <= self.observation_length:
            raise ValueError(
                f"usable data length {length} must be greater than "
                f"observation_length {self.observation_length}"
            )

        candidates = range(self.observation_length + 1, length)
        # Sampling the whole population yields a random permutation.
        # NOTE(review): this draws from the global `random` state; whether it
        # was seeded with `seed` beforehand is decided by the base class.
        indices = random.sample(candidates, k=len(candidates))

        split_index = int(length * split_ratio)
        if self.is_training:
            selected = indices[:split_index]
        else:
            # One position is skipped between the two parts, as before.
            selected = indices[split_index + 1:]

        # Remembered so that eval() uses exactly the same boundary.
        self._split_index = split_index
        self._indices = selected
        self._entire_indices = indices

    @staticmethod
    def _index_to_slice(batch_size):
        """Convert a single integer position into a one-element slice.

        Anything that is not an ``int`` (normally a ``slice``) is returned
        unchanged.
        """
        if isinstance(batch_size, int):
            stop = batch_size + 1
            # -1 + 1 == 0 would produce an empty slice; None means "to the end".
            return slice(batch_size, stop if stop != 0 else None)
        return batch_size

    def _stack_windows(self, batch_size, offsets):
        """Build a tensor of flattened windows for the selected samples.

        Args:
            batch_size: Integer position or slice into the active indices.
            offsets: Row offsets, relative to each sample position, at which
                the windows of one sequence start.

        Returns:
            A float tensor on ``self.device`` of shape
            ``(len(offsets), n_samples, data_length * len(self._columns))``.
        """
        # Select the columns once instead of once per window.
        frame = self._data[self._columns]
        width = self.data_length
        flat_size = width * len(self._columns)

        batch = [
            [
                frame[index + offset: index + offset + width]
                .values.reshape(flat_size)
                .tolist()
                for offset in offsets
            ]
            for index in self._indices[self._index_to_slice(batch_size)]
        ]
        # (sample, step, feature) -> (step, sample, feature)
        return torch.tensor(batch, device=self.device, dtype=torch.float).transpose(0, 1)

    def _outputFunc(self, batch_size):
        """Return the target windows for the given position or slice.

        Window ``k`` of a sample at row ``i`` starts at row ``i + k`` for
        ``k`` in ``0 .. future_step_size - 1``.
        """
        return self._stack_windows(batch_size, range(0, self._future_step_size))

    def _inputFunc(self, batch_size):
        """Return the source windows for the given position or slice.

        The windows of a sample at row ``i`` start at rows
        ``i - observation_length, ..., i - 1`` (oldest first).
        """
        return self._stack_windows(batch_size, range(-self.observation_length, 0))

    def eval(self):
        """Switch the active indices to the held-out part of the shuffle.

        The boundary stored by ``_init_indicies`` is reused, so the selection
        is identical to constructing the dataset with ``is_training=False``
        and never overlaps the training part. The previous implementation
        recomputed the boundary from ``len(self._entire_indices)``, which is
        ``observation_length + 1`` smaller than the length used for the
        training split, so a few training samples leaked into evaluation.
        """
        self._indices = self._entire_indices[self._split_index + 1:]
        self.is_training = False

    def get_actual_index(self, ndx):
        """Map positions in the active split to row positions in the data.

        Args:
            ndx: Integer position or slice into the active indices.

        Returns:
            A new list of row positions for a slice, otherwise the single row
            position stored at ``ndx``.
        """
        if isinstance(ndx, slice):
            return list(self._indices[ndx])
        return self._indices[ndx]