# Another BaseLine Transformer model

Caution: This is not a usual use case of Transformer.

The Transformer developed by Google usually uses an embedding layer to represent a target token such as a word.

In this model, I use a window of Close values directly as the state vector, although the same embedding approach could also be taken for finance data.
As the features are very limited, this serves as a baseline Transformer.

Parameters:
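- Input:  `observation_length` sliding windows of the scaled close series, i.e. `close.iloc[index - data_length + bias: index + bias]` for `bias` in `range(observation_length)`
- Target: `future_step_size + 1` sliding windows, i.e. `close.iloc[index + observation_length - data_length + bias: index + observation_length + bias]` for `bias` in `range(-1, future_step_size)`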

## Data Preparation

In [1]:
import os
# Names of the OHLC price columns found in the csv.
ohlc_column = ('open','high','low','close')
# Absolute path of the csv, resolved against the current working directory.
file_path = os.path.abspath('mt5_USDJPY_min30.csv')

In [8]:
import pandas as pd
# Load the csv, using its first column as a parsed datetime index.
df = pd.read_csv(file_path, index_col=0, parse_dates=True)
# Bare expression: displays the frame as the notebook cell output.
df

Unnamed: 0_level_0,open,high,low,close,tick_volume,spread,real_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-07-07 08:30:00,102.086,102.122,102.081,102.102,738,3,0
2014-07-07 09:00:00,102.102,102.146,102.098,102.113,1036,3,0
2014-07-07 09:30:00,102.113,102.115,102.042,102.044,865,3,0
2014-07-07 10:00:00,102.047,102.052,102.005,102.019,983,3,0
2014-07-07 10:30:00,102.017,102.025,101.918,101.941,1328,3,0
...,...,...,...,...,...,...,...
2022-08-12 21:30:00,133.461,133.506,133.439,133.484,1125,3,0
2022-08-12 22:00:00,133.484,133.530,133.437,133.475,1277,3,0
2022-08-12 22:30:00,133.475,133.486,133.433,133.483,1506,3,0
2022-08-12 23:00:00,133.484,133.536,133.465,133.521,1038,3,0


## Define Dataset

In [5]:
import math
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import (TransformerDecoder, TransformerDecoderLayer,
                      TransformerEncoder, TransformerEncoderLayer)
from tqdm import tqdm

In [9]:
import random

class CloseDataset:
    """Sliding-window dataset built from the close column of an OHLC frame.

    The close series is log-differenced (optionally cumulatively summed when
    ``init_type == "cumsum"``) and min-max scaled into ``[-1, 1]``.  Indexing
    with a slice (or a single int) returns ``(src, tgt)`` where each element
    is a ``(values, time_ids)`` pair of tensors laid out sequence-first:

    * ``src`` values: ``(observation_length, batch, data_length)``
    * ``tgt`` values: ``(future_step_size + 1, batch, data_length)``
    * time ids:       ``(steps, batch, 1)``

    The first target window is identical to the last source window, so it
    can be used as the decoder start step.

    Args:
        df: frame with a DatetimeIndex and a ``close_column`` column.
        close_column: name of the price column to use.
        date_column: when truthy, time ids are derived from ``df.index``;
            the value itself is not used as a column name.  When falsy no
            time ids are prepared and indexing will fail.
        data_length: number of values per window (the model's ``d_model``).
        observation_length: number of source windows per sample.
        device: device for the returned tensors.  A CUDA device falls back
            to CPU when CUDA is not available.
        init_type: ``"cumsum"`` accumulates the log differences; any other
            value keeps the plain log differences.
        future_step_size: number of future windows to predict.
        seed: seed forwarded to :meth:`seed`.
        is_training: start on the training split (else validation split).
    """

    key = "close"

    def __init__(self, df, close_column="close", date_column="index", data_length=30, observation_length:int=31,
                 device="cuda", init_type="log", future_step_size=10, seed=1192, is_training = True):
        # Seed first: the train/validation shuffles below consume `random`.
        self.seed(seed)

        close_df = df[[close_column]]
        close_df = close_df.apply(np.log).diff()
        if init_type == "cumsum":
            close_df = close_df.cumsum()
        # Min-max scale into [-1, 1]; row 0 stays NaN because of diff().
        scale = (-1, 1)
        min_series = close_df.min()
        max_series = close_df.max()
        std = (close_df - min_series)/(max_series - min_series)
        close_df = std * (scale[1] - scale[0]) + scale[0]

        if date_column:
            # Half-hour slot within the week; the (60/30) factor hard-codes
            # 30-minute bars.
            hours_of_week = (df.index.weekday*24 + df.index.hour + df.index.minute/60)*(60/30)
            # NOTE(review): .iloc[1:] drops the first row here while the
            # close frame keeps its NaN first row, so time ids are offset by
            # one position relative to the prices -- confirm this is intended.
            # convert_dtypes() turns the whole-number floats into integers.
            self.hours_of_week = hours_of_week.to_frame().iloc[1:].convert_dtypes()

        # Mirror the notebook's own "cuda if available else cpu" convention
        # instead of failing on the first tensor creation.
        if str(device).startswith("cuda") and not torch.cuda.is_available():
            device = "cpu"

        self.observation_length = observation_length
        self.data_length = data_length
        self.is_training = is_training
        self.device = device
        self._data = close_df
        self._columns = close_column
        self._future_step_size = future_step_size
        self._init_indicies(close_df)

    def __get_timeindex(self, index, length):
        """Return the time ids of ``length`` consecutive rows after a window."""
        indices = [index+ bias+ self.data_length for bias in range(length)]
        return self.hours_of_week.iloc[indices].values.tolist()

    def _resolve_indices(self, ndx):
        """Map a slice or int over the active split to a list of row positions."""
        if isinstance(ndx, slice):
            return self._indices[ndx]
        if isinstance(ndx, (int, np.integer)):
            # A single sample is returned as a batch of one; an out-of-range
            # int raises IndexError, which also terminates iteration.
            return [self._indices[ndx]]
        raise TypeError(f"indices must be int or slice, not {type(ndx).__name__}")

    def _to_tensors(self, values, times):
        """Convert nested lists to sequence-first value / time-id tensors."""
        value_tensor = torch.tensor(values, device=self.device, dtype=torch.float)
        time_tensor = torch.tensor(times, device=self.device)
        return value_tensor.transpose(0, 1), time_tensor.transpose(0, 1)

    def _outputFunc(self, batch_size):
        """Build the target windows for the samples selected by ``batch_size``.

        Despite its name, ``batch_size`` is the index (slice or int) passed to
        ``__getitem__``.
        """
        ndx = self._resolve_indices(batch_size)
        closes = self._data[self._columns]
        chunk_tgt = []
        time_chunk_tgt = []

        for index in ndx:
            future_tgt = []
            future_time = []
            # bias starts at -1 so the first target window repeats the last
            # source window (shifted decoder input).
            for bias in range(-1, self._future_step_size):
                stop_index = index + self.observation_length + bias
                start_index = stop_index - self.data_length
                future_tgt.append(closes.iloc[start_index:stop_index].values.tolist())
                future_time.append(self.hours_of_week.iloc[stop_index].values.tolist())
            chunk_tgt.append(future_tgt)
            time_chunk_tgt.append(future_time)

        return self._to_tensors(chunk_tgt, time_chunk_tgt)

    def _inputFunc(self, batch_size):
        """Build the source windows for the samples selected by ``batch_size``.

        Despite its name, ``batch_size`` is the index (slice or int) passed to
        ``__getitem__``.
        """
        ndx = self._resolve_indices(batch_size)
        closes = self._data[self._columns]
        chunk_src = []
        time_chunk_src = []

        for index in ndx:
            obs_src = []
            obs_time = []
            for bias in range(self.observation_length):
                stop_index = index + bias
                start_index = stop_index - self.data_length
                obs_src.append(closes.iloc[start_index:stop_index].values.tolist())
                obs_time.append(self.hours_of_week.iloc[stop_index].values.tolist())
            chunk_src.append(obs_src)
            time_chunk_src.append(obs_time)

        return self._to_tensors(chunk_src, time_chunk_src)

    def __len__(self):
        """Number of samples in the active split."""
        return len(self._indices)

    def __getitem__(self, ndx):
        """Return ``((src, src_time), (tgt, tgt_time))`` for a slice or int."""
        return self._inputFunc(ndx), self._outputFunc(ndx)

    def seed(self, seed=None):
        """Seed torch, random and numpy.

        When ``seed`` is None, 1192 is used and the cuDNN flags are left
        untouched; otherwise cuDNN is also switched to deterministic mode.
        """
        if seed is None:
            seed = 1192
        else:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)
        self.seed_value = seed

    @staticmethod
    def seed_worker(worker_id):
        """Seed numpy and random from torch's initial seed (worker init hook)."""
        worker_seed = torch.initial_seed() % 2**32
        np.random.seed(worker_seed)
        random.seed(worker_seed)

    def render(self, mode='human', close=False):
        """No-op placeholder."""
        pass

    def get_actual_index(self, ndx):
        """Translate split positions into row positions of the data frame."""
        if isinstance(ndx, slice):
            return list(self._indices[ndx])
        return self._indices[ndx]

    def get_row_data(self, ndx):
        """Return ``observation_length`` scaled rows per requested sample."""
        if isinstance(ndx, slice):
            return [
                self._data[index: index + self.observation_length]
                for index in self._indices[ndx]
            ]
        # NOTE(review): unlike the slice branch, a scalar is used as a raw row
        # position and is not mapped through the active split -- confirm.
        return self._data[ndx: ndx + self.observation_length]

    def _init_indicies(self, data, split_ratio=0.7):
        """Create the shuffled train / validation start positions.

        Raises:
            ValueError: when ``data`` is too short to hold a single sample.
        """
        length = len(data) - self.observation_length -self.data_length - self._future_step_size
        if length < 0:
            raise ValueError(
                f"data length {len(data)} is too short for observation_length {self.observation_length}, "
                f"data_length {self.data_length} and future_step_size {self._future_step_size}"
            )

        # Start after data_length + 1 rows so no window touches the NaN that
        # diff() leaves in row 0.
        from_index = self.data_length + 1
        to_index = int(length*split_ratio)
        self.train_indices = random.sample(range(from_index, to_index), k=to_index - from_index)

        # Leave a gap of one full sample span so validation windows do not
        # overlap training windows.
        from_index = int(length*split_ratio) + self.observation_length + self.data_length + self._future_step_size
        to_index = length
        self.val_indices = random.sample(range(from_index, to_index), k=to_index - from_index)

        if self.is_training:
            self._indices = self.train_indices
        else:
            self._indices = self.val_indices

    def eval(self):
        """Switch indexing to the validation split."""
        self._indices = self.val_indices
        self.is_training = False

    def train(self):
        """Switch indexing to the training split."""
        self._indices = self.train_indices
        self.is_training = True

In [10]:
# Build the dataset with its default settings (tensors go to the default device "cuda").
ds = CloseDataset(df)

In [11]:
# Take the first 16 samples of the active (training) split; src and tgt are each a (values, time_ids) pair.
src, tgt = ds[:16]

In [14]:
# Source shapes: values (observation_length, batch, data_length) and time ids (observation_length, batch, 1).
src[0].shape, src[1].shape

(torch.Size([31, 16, 30]), torch.Size([31, 16, 1]))

In [15]:
# Target shapes: values (future_step_size + 1, batch, data_length) and time ids (future_step_size + 1, batch, 1).
tgt[0].shape, tgt[1].shape

(torch.Size([11, 16, 30]), torch.Size([11, 16, 1]))

In [16]:
# Sanity check: walk the whole training split in batches of 16 to confirm every batch can be built.
ds.train()
print(len(ds))

for index in range(0, len(ds)-16, 16):
    src, tgt = ds[index: index+16]
    
# Shapes of the last batch that was fetched.
print(src[0].shape, src[1].shape)
print(tgt[0].shape, tgt[1].shape)

70423
torch.Size([31, 16, 30]) torch.Size([31, 16, 1])
torch.Size([11, 16, 30]) torch.Size([11, 16, 1])


In [17]:
# Same sanity check on the validation split.
ds.eval()
print(len(ds))

for index in range(0, len(ds)-16, 16):
    src, tgt = ds[index: index+16]
    
# Shapes of the last batch that was fetched.
print(src[0].shape, src[1].shape)
print(tgt[0].shape, tgt[1].shape)

30124
torch.Size([31, 16, 30]) torch.Size([31, 16, 1])
torch.Size([11, 16, 30]) torch.Size([11, 16, 1])


In [18]:
# Last source window of the first sample in the batch.
src[0][-1, 0, :]

tensor([0.2735, 0.2639, 0.2650, 0.2623, 0.2666, 0.2703, 0.2655, 0.2719, 0.2496,
        0.2687, 0.2629, 0.2767, 0.2655, 0.2698, 0.2618, 0.2517, 0.2597, 0.2533,
        0.2857, 0.2719, 0.2341, 0.2942, 0.2724, 0.2698, 0.2730, 0.2543, 0.2862,
        0.3032, 0.2793, 0.2618], device='cuda:0')

In [22]:
# First target window of the same sample; it matches the last source window shown above in the output.
tgt[0][0, 0, :]

tensor([0.2735, 0.2639, 0.2650, 0.2623, 0.2666, 0.2703, 0.2655, 0.2719, 0.2496,
        0.2687, 0.2629, 0.2767, 0.2655, 0.2698, 0.2618, 0.2517, 0.2597, 0.2533,
        0.2857, 0.2719, 0.2341, 0.2942, 0.2724, 0.2698, 0.2730, 0.2543, 0.2862,
        0.3032, 0.2793, 0.2618], device='cuda:0')

In [23]:
class VariationalDropout(nn.Dropout):
    """Dropout that draws its own element-wise, rescaled Bernoulli mask.

    In training mode the input is multiplied by a mask whose entries are
    either 0 or ``1 / (1 - p)``; in evaluation mode the input is returned
    unchanged.
    """

    def get_mask(self, input):
        """Return a mask shaped like ``input`` with keep probability ``1 - p``."""
        keep_prob = 1 - self.p
        kept = torch.ones_like(input).bernoulli_(keep_prob)
        return kept / keep_prob

    def forward(self, input):
        """Apply the mask while training; pass the input through otherwise."""
        if not self.training:
            return input
        return input * self.get_mask(input)

In [26]:
class TimePositionalEncoding(nn.Module):
    """Learned positional encoding looked up by a time-slot id.

    Args:
        time_size: number of distinct time ids (rows of the embedding table).
        d_model: size of each embedding vector.
        device: device on which the embedding table is created.  A CUDA
            device falls back to CPU when CUDA is not available, so the
            default no longer crashes on CPU-only machines.
    """

    def __init__(self, time_size, d_model, device="cuda"):
        super().__init__()
        # Same "cuda if available else cpu" convention the notebook uses
        # when it picks the training device.
        if str(device).startswith("cuda") and not torch.cuda.is_available():
            device = "cpu"
        self.pe = nn.Embedding(time_size, d_model, device=device)

    def forward(self,time_ids):
        """Return the embeddings of ``time_ids`` with axis 2 squeezed out."""
        position = self.pe(time_ids)
        # Tentative approach to fix the shape: squeeze(2) drops axis 2 when it
        # has size 1 (presumably the trailing singleton of (seq, batch, 1)
        # time ids -- TODO confirm for other input layouts).
        position = position.squeeze(2)
        return position

In [35]:
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer that consumes value windows directly.

    Instead of token embeddings, the inputs are already ``d_model``-wide
    windows; a learned time encoding is added to them before the encoder
    and decoder stacks, and a linear head maps decoder outputs to
    ``feature_size`` values.
    """

    def __init__(
        self, num_encoder_layers: int, num_decoder_layers: int, 
        feature_size: int=1, time_size: int=48*7, d_model=30,
        dim_feedforward:int = 512, dropout:float = 0.1, nhead:int = 8,
    ):
        super().__init__()

        self.positional_encoding = TimePositionalEncoding(time_size, d_model)
        self.src_dropaut_layer = nn.Dropout(dropout)
        self.tgt_dropaut_layer = nn.Dropout(dropout)

        # NOTE(review): `dropout` is only applied to the inputs; the encoder
        # and decoder layers keep their own default dropout -- confirm.
        layer_kwargs = dict(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)

        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(**layer_kwargs), num_layers=num_encoder_layers
        )
        self.transformer_decoder = TransformerDecoder(
            TransformerDecoderLayer(**layer_kwargs), num_layers=num_decoder_layers
        )

        self.output = nn.Linear(d_model, feature_size)

    def forward(
        self, src: Tensor, src_time, tgt: Tensor, tgt_time,
        mask_tgt: Tensor, mask_src: Tensor=None, padding_mask_src: Tensor=None, padding_mask_tgt: Tensor=None,
        memory_key_padding_mask: Tensor=None
    ):
        """Encode ``src``, decode ``tgt`` against it and project the result.

        ``src_time`` / ``tgt_time`` are time ids fed to the positional
        encoding; the masks are forwarded to the encoder / decoder stacks
        (no memory mask is used).
        """
        # Add the time encoding to each stream, then apply input dropout.
        src = self.src_dropaut_layer(src + self.positional_encoding(src_time))
        tgt = self.tgt_dropaut_layer(tgt + self.positional_encoding(tgt_time))

        memory = self.transformer_encoder(
            src, mask=mask_src, src_key_padding_mask=padding_mask_src
        )
        decoded = self.transformer_decoder(
            tgt, memory,
            tgt_mask=mask_tgt, memory_mask=None,
            tgt_key_padding_mask=padding_mask_tgt,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.output(decoded)

In [42]:
def train(model, ds, optimizer, criterion, batch_size):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: called with ``src``, ``src_time``, ``tgt``, ``tgt_time`` and
            ``mask_tgt`` keyword arguments.
        ds: dataset supporting ``train()``, ``len()`` and slice indexing that
            yields ``((src, src_time), (tgt, tgt_time))`` sequence-first.
        optimizer: optimizer stepping the model parameters.
        criterion: loss comparing the model output with the last value of
            each shifted target window.
        batch_size: number of samples per batch; a trailing partial batch is
            skipped.

    Raises:
        ValueError: if the training split holds fewer than ``batch_size``
            samples, so no batch could be processed.
    """
    model.train()
    ds.train()
    losses = 0.0
    n_batches = 0

    # "+ 1" keeps the final full batch when len(ds) is an exact multiple of
    # batch_size; only a trailing partial batch is dropped.
    end_index = len(ds) - batch_size + 1
    for index in tqdm(range(0, end_index, batch_size)):
        (src, src_time), (tgt, tgt_time) = ds[index:index+batch_size]

        # Teacher forcing: feed all target steps but the last ...
        input_tgt = tgt[:-1, :]
        input_time_tgt = tgt_time[:-1, :]

        # Use the batch's own device instead of a notebook-level global.
        mask_tgt = nn.Transformer.generate_square_subsequent_mask(input_tgt.size(0)).to(src.device)

        optimizer.zero_grad()
        logits = model(
            src=src, src_time=src_time, 
            tgt=input_tgt, tgt_time=input_time_tgt,
            mask_tgt=mask_tgt
        )

        # ... and predict the newest value of each following window.
        output_tgt = tgt[1:, :, -1:]
        loss = criterion(logits, output_tgt)
        loss.backward()

        optimizer.step()
        losses += loss.item()
        n_batches += 1

    if n_batches == 0:
        raise ValueError(
            f"training split has {len(ds)} samples, fewer than batch_size {batch_size}"
        )
    return losses / n_batches

In [43]:
def evaluate(model, ds, criterion, batch_size):
    """Return the mean batch loss over the validation split.

    Args:
        model: called with ``src``, ``src_time``, ``tgt``, ``tgt_time`` and
            ``mask_tgt`` keyword arguments.
        ds: dataset supporting ``eval()``, ``len()`` and slice indexing that
            yields ``((src, src_time), (tgt, tgt_time))`` sequence-first.
        criterion: loss comparing the model output with the last value of
            each shifted target window.
        batch_size: number of samples per batch; a trailing partial batch is
            skipped.

    Raises:
        ValueError: if the validation split holds fewer than ``batch_size``
            samples, so no batch could be processed.
    """
    model.eval()
    ds.eval()
    losses = 0.0
    n_batches = 0

    # No gradients are needed for validation; skipping them saves memory.
    with torch.no_grad():
        # "+ 1" keeps the final full batch when len(ds) is an exact multiple
        # of batch_size; only a trailing partial batch is dropped.
        for index in range(0, len(ds) - batch_size + 1, batch_size):
            (src, src_time), (tgt, tgt_time) = ds[index:index+batch_size]

            input_tgt = tgt[:-1, :]
            input_time_tgt = tgt_time[:-1, :]

            # Use the batch's own device instead of a notebook-level global.
            mask_tgt = nn.Transformer.generate_square_subsequent_mask(input_tgt.size(0)).to(src.device)
            logits = model(
                src=src, src_time=src_time, 
                tgt=input_tgt, tgt_time=input_time_tgt,
                mask_tgt=mask_tgt
            )

            output_tgt = tgt[1:, :, -1:]
            losses += criterion(logits, output_tgt).item()
            n_batches += 1

    if n_batches == 0:
        raise ValueError(
            f"validation split has {len(ds)} samples, fewer than batch_size {batch_size}"
        )
    return losses / n_batches

In [34]:
from torch.utils.data import DataLoader

# Dataset / training configuration.
standalization = "log"  # forwarded as the dataset's init_type
batch_size = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
d_model = 30  # window width; doubles as the Transformer model dimension
observation_length = 31
future_step_size = 10
column = "close"

# Pass every setting explicitly so the dataset uses the device selected above
# (instead of its hard-coded "cuda" default) and the values defined here are
# the ones actually in effect.
ds = CloseDataset(
    df, close_column=column, data_length=d_model, observation_length=observation_length,
    device=device, init_type=standalization, future_step_size=future_step_size,
)

In [39]:
# Model hyper-parameters.
feature_size = 1
d_model=30
nhead = 2
dim_feedforward = 1000
num_encoder_layers = 6
num_decoder_layers = 6
dropout = 0.01
# Number of 30-minute slots in one week (24h * 2 * 7 days).
time_size = int(24*(60/30)*7)

model = Seq2SeqTransformer(
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    feature_size=feature_size,
    time_size=time_size,
    d_model=d_model,
    dim_feedforward=dim_feedforward,
    dropout=dropout, nhead=nhead
)

# Xavier-initialise every parameter with more than one dimension;
# 1-D parameters keep their default initialisation.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

model = model.to(device)

In [40]:
lr = 0.00005

# Mean squared error loss on the predicted values.
#criterion = torch.nn.CrossEntropyLoss()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# step_size is documented as an int number of scheduler steps: decay the
# learning rate by 5% on every scheduler.step() call.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma = 0.95)

In [44]:
import copy

# Training loop with early stopping on the validation loss.
epoch = 500
best_loss = float('Inf')
best_model = None
patience = 5
counter = 0

for loop in range(1, epoch + 1):
    
    start_time = time.time()
    
    loss_train = train(
        model=model, ds=ds, optimizer=optimizer,
        criterion=criterion, batch_size=batch_size
    )
    
    # Only the training pass is timed.
    elapsed_time = time.time() - start_time
    
    loss_valid = evaluate(
        model=model, ds=ds, criterion=criterion,batch_size=batch_size
    )
    
    # `counter` is printed before it is updated for this epoch; '**' marks a
    # new best validation loss.
    print('[{}/{}] train loss: {:.10f}, valid loss: {:.10f}  [{}{:.0f}s] count: {}, {}'.format(
        loop, epoch,
        loss_train, loss_valid,
        str(int(math.floor(elapsed_time / 60))) + 'm' if math.floor(elapsed_time / 60) > 0 else '',
        elapsed_time % 60,
        counter,
        '**' if best_loss > loss_valid else ''
    ))
    
    if best_loss > loss_valid:
        best_loss = loss_valid
        # Snapshot the weights: keeping a bare reference to `model` would let
        # later epochs overwrite the "best" model in place.
        best_model = copy.deepcopy(model)
        counter = 0
    else:
        counter += 1
        # The learning rate is decayed only on epochs without improvement.
        scheduler.step()
        
    # Stop once more than `patience` consecutive epochs failed to improve.
    if counter > patience:
        break

 82%|████████▏ | 903/1100 [12:48<02:45,  1.19it/s]