# Transformer Model
Training a transformer model to forecast time series sequence of stock closing price  
Using 10 timesteps to forecast 1 forward timestep

Imports

In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import time
import math
import matplotlib.pyplot as plt

Load data and set global variables

In [2]:
input_window = 10 # number of input steps
output_window = 1 # number of prediction steps, in this model its fixed to one
batch_size = 250
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

df = pd.read_csv('FB_raw.csv') # data path of facebook stock price (Apr 2019 - Nov 2020)
close = np.array(df['close'])
logreturn = np.diff(np.log(close)) # Transform closing price to log returns, instead of using min-max scaler

csum_logreturn = logreturn.cumsum() # Cumulative sum of log returns

Plot shows the reduced scale of the closing prices

In [None]:
fig, axs = plt.subplots(2, 1)
axs[0].plot(close, color='red')
axs[0].set_title('Closing Price')
axs[0].set_ylabel('Close Price')
axs[0].set_xlabel('Time Steps')

axs[1].plot(csum_logreturn, color='green')
axs[1].set_title('Cumulative Sum of Log Returns')
axs[1].set_xlabel('Time Steps')

fig.tight_layout()
plt.show()

Positional Encoder

In [3]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()       
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]        

Transformer Model

In [4]:
class TransAm(nn.Module):
    def __init__(self, feature_size=250, num_layers=1, dropout=0.1):
        super(TransAm, self).__init__()
        self.model_type = 'Transformer'
        
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(feature_size)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=feature_size, nhead=10, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)        
        self.decoder = nn.Linear(feature_size,1)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1    
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self,src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.pos_encoder(src)
        output = self.transformer_encoder(src,self.src_mask)
        output = self.decoder(output)
        return output

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

Window function, split data into sequence window

In [5]:
def create_inout_sequences(input_data, tw):
    inout_seq = []
    L = len(input_data)
    for i in range(L-tw):
        train_seq = input_data[i:i+tw]
        train_label = input_data[i+output_window:i+tw+output_window]
        inout_seq.append((train_seq ,train_label))
    return torch.FloatTensor(inout_seq)

Split data in training and testing, prepared in windowed sequences and pass through GPU

In [6]:
def get_data(data, split):
    """Split ratio of training data"""

    series = data
    
    split = round(split*len(series))
    train_data = series[:split]
    test_data = series[split:]

    train_data = train_data.cumsum()
    train_data = 2*train_data # Training data augmentation, increase amplitude for the model to better generalize.(Scaling by 2 is aribitrary)
                              # Similar to image transformation to allow model to train on wider data sets

    test_data = test_data.cumsum()

    train_sequence = create_inout_sequences(train_data,input_window)
    train_sequence = train_sequence[:-output_window]

    test_data = create_inout_sequences(test_data,input_window)
    test_data = test_data[:-output_window]

    return train_sequence.to(device), test_data.to(device)

Split into training batches

In [7]:
def get_batch(source, i, batch_size):
    seq_len = min(batch_size, len(source) - 1 - i)
    data = source[i:i+seq_len]    
    input = torch.stack(torch.stack([item[0] for item in data]).chunk(input_window, 1))
    target = torch.stack(torch.stack([item[1] for item in data]).chunk(input_window, 1))
    return input, target

Training function

In [8]:
def train(train_data):
    model.train() # Turn on the evaluation mode
    total_loss = 0.
    start_time = time.time()

    for batch, i in enumerate(range(0, len(train_data) - 1, batch_size)):
        data, targets = get_batch(train_data, i,batch_size)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.7)
        optimizer.step()

        total_loss += loss.item()
        log_interval = int(len(train_data) / batch_size / 5)
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.10f} | {:5.2f} ms | '
                  'loss {:5.7f}'.format(
                    epoch, batch, len(train_data) // batch_size, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss))
            total_loss = 0
            start_time = time.time()

Evaluation function for model after training

In [9]:
def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    eval_batch_size = 1000
    with torch.no_grad():
        for i in range(0, len(data_source) - 1, eval_batch_size):
            data, targets = get_batch(data_source, i, eval_batch_size)
            output = eval_model(data)            
            total_loss += len(data[0])* criterion(output, targets).cpu().item()
    return total_loss / len(data_source)

Function to forecast 1 time step from window sequence

In [10]:
def model_forecast(model, seqence):
    model.eval() 
    total_loss = 0.
    test_result = torch.Tensor(0)    
    truth = torch.Tensor(0)

    seq = np.pad(seqence, (0, 3), mode='constant', constant_values=(0, 0))
    seq = create_inout_sequences(seq, input_window)
    seq = seq[:-output_window].to(device)

    seq, _ = get_batch(seq, 0, 1)
    with torch.no_grad():
        for i in range(0, output_window):            
            output = model(seq[-output_window:])                        
            seq = torch.cat((seq, output[-1:]))

    seq = seq.cpu().view(-1).numpy()

    return seq

Function to forecast entire sequence

In [11]:
def forecast_seq(model, sequences):
    """Sequences data has to been windowed and passed through device"""
    start_timer = time.time()
    model.eval() 
    forecast_seq = torch.Tensor(0)    
    actual = torch.Tensor(0)
    with torch.no_grad():
        for i in range(0, len(sequences) - 1):
            data, target = get_batch(sequences, i, 1)
            output = model(data)            
            forecast_seq = torch.cat((forecast_seq, output[-1].view(-1).cpu()), 0)
            actual = torch.cat((actual, target[-1].view(-1).cpu()), 0)
    timed = time.time()-start_timer
    print(f"{timed} sec")

    return forecast_seq, actual

Prepare data for training model

In [12]:
train_data, val_data = get_data(logreturn, 0.6) # 60% train, 40% test split
model = TransAm().to(device)

Model parameters

In [13]:
criterion = nn.MSELoss() # Loss function
lr = 0.00005 # learning rate

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

epochs =  150 # Number of epochs

Training loop

In [14]:
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(train_data)
    
    if(epoch % epochs == 0): # Valid model after last training epoch
        val_loss = evaluate(model, val_data)
        print('-' * 80)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss: {:5.7f}'.format(epoch, (time.time() - epoch_start_time), val_loss))
        print('-' * 80)

    else:   
        print('-' * 80)
        print('| end of epoch {:3d} | time: {:5.2f}s'.format(epoch, (time.time() - epoch_start_time)))
        print('-' * 80)

    scheduler.step() 




| epoch   1 |    77/  385 batches | lr 0.0000500000 | 16.21 ms | loss 0.0593711
| epoch   1 |   154/  385 batches | lr 0.0000500000 |  9.70 ms | loss 0.0091353
| epoch   1 |   231/  385 batches | lr 0.0000500000 |  9.81 ms | loss 0.0070163
| epoch   1 |   308/  385 batches | lr 0.0000500000 |  9.77 ms | loss 0.0040359
| epoch   1 |   385/  385 batches | lr 0.0000500000 | 10.12 ms | loss 0.0143901
--------------------------------------------------------------------------------
| end of epoch   1 | time:  4.28s
--------------------------------------------------------------------------------
| epoch   2 |    77/  385 batches | lr 0.0000451250 |  9.92 ms | loss 0.0046685
| epoch   2 |   154/  385 batches | lr 0.0000451250 |  9.79 ms | loss 0.0031082
| epoch   2 |   231/  385 batches | lr 0.0000451250 |  9.84 ms | loss 0.0026059
| epoch   2 |   308/  385 batches | lr 0.0000451250 |  9.75 ms | loss 0.0016253
| epoch   2 |   385/  385 batches | lr 0.0000451250 |  9.81 ms | loss 0.0079709
----

| epoch  15 |    77/  385 batches | lr 0.0000231646 |  9.94 ms | loss 0.0062810
| epoch  15 |   154/  385 batches | lr 0.0000231646 |  9.70 ms | loss 0.0021632
| epoch  15 |   231/  385 batches | lr 0.0000231646 |  9.69 ms | loss 0.0021351
| epoch  15 |   308/  385 batches | lr 0.0000231646 |  9.60 ms | loss 0.0009584
| epoch  15 |   385/  385 batches | lr 0.0000231646 |  9.72 ms | loss 0.0207240
--------------------------------------------------------------------------------
| end of epoch  15 | time:  3.75s
--------------------------------------------------------------------------------
| epoch  16 |    77/  385 batches | lr 0.0000220063 |  9.97 ms | loss 0.0051010
| epoch  16 |   154/  385 batches | lr 0.0000220063 |  9.64 ms | loss 0.0018938
| epoch  16 |   231/  385 batches | lr 0.0000220063 | 10.28 ms | loss 0.0019386
| epoch  16 |   308/  385 batches | lr 0.0000220063 |  9.92 ms | loss 0.0008964
| epoch  16 |   385/  385 batches | lr 0.0000220063 | 10.00 ms | loss 0.0208446
----

| epoch  29 |    77/  385 batches | lr 0.0000112968 |  9.57 ms | loss 0.0007369
| epoch  29 |   154/  385 batches | lr 0.0000112968 |  9.56 ms | loss 0.0004317
| epoch  29 |   231/  385 batches | lr 0.0000112968 |  9.36 ms | loss 0.0003335
| epoch  29 |   308/  385 batches | lr 0.0000112968 |  9.56 ms | loss 0.0004462
| epoch  29 |   385/  385 batches | lr 0.0000112968 |  9.68 ms | loss 0.0039794
--------------------------------------------------------------------------------
| end of epoch  29 | time:  3.68s
--------------------------------------------------------------------------------
| epoch  30 |    77/  385 batches | lr 0.0000107319 |  9.66 ms | loss 0.0006762
| epoch  30 |   154/  385 batches | lr 0.0000107319 |  9.46 ms | loss 0.0003914
| epoch  30 |   231/  385 batches | lr 0.0000107319 |  9.47 ms | loss 0.0002996
| epoch  30 |   308/  385 batches | lr 0.0000107319 |  9.44 ms | loss 0.0004212
| epoch  30 |   385/  385 batches | lr 0.0000107319 |  9.77 ms | loss 0.0036301
----

| epoch  43 |    77/  385 batches | lr 0.0000055092 |  9.62 ms | loss 0.0003684
| epoch  43 |   154/  385 batches | lr 0.0000055092 |  9.79 ms | loss 0.0001638
| epoch  43 |   231/  385 batches | lr 0.0000055092 |  9.53 ms | loss 0.0001201
| epoch  43 |   308/  385 batches | lr 0.0000055092 |  9.40 ms | loss 0.0002199
| epoch  43 |   385/  385 batches | lr 0.0000055092 |  9.66 ms | loss 0.0013683
--------------------------------------------------------------------------------
| end of epoch  43 | time:  3.70s
--------------------------------------------------------------------------------
| epoch  44 |    77/  385 batches | lr 0.0000052337 | 11.06 ms | loss 0.0003627
| epoch  44 |   154/  385 batches | lr 0.0000052337 | 11.51 ms | loss 0.0001584
| epoch  44 |   231/  385 batches | lr 0.0000052337 | 10.27 ms | loss 0.0001150
| epoch  44 |   308/  385 batches | lr 0.0000052337 |  9.97 ms | loss 0.0002176
| epoch  44 |   385/  385 batches | lr 0.0000052337 |  9.82 ms | loss 0.0013059
----

| epoch  57 |    77/  385 batches | lr 0.0000026867 |  9.78 ms | loss 0.0001863
| epoch  57 |   154/  385 batches | lr 0.0000026867 |  9.75 ms | loss 0.0000925
| epoch  57 |   231/  385 batches | lr 0.0000026867 |  9.65 ms | loss 0.0000665
| epoch  57 |   308/  385 batches | lr 0.0000026867 |  9.95 ms | loss 0.0001847
| epoch  57 |   385/  385 batches | lr 0.0000026867 |  9.64 ms | loss 0.0006785
--------------------------------------------------------------------------------
| end of epoch  57 | time:  3.75s
--------------------------------------------------------------------------------
| epoch  58 |    77/  385 batches | lr 0.0000025523 | 10.13 ms | loss 0.0001770
| epoch  58 |   154/  385 batches | lr 0.0000025523 |  9.73 ms | loss 0.0000888
| epoch  58 |   231/  385 batches | lr 0.0000025523 |  9.88 ms | loss 0.0000641
| epoch  58 |   308/  385 batches | lr 0.0000025523 |  9.88 ms | loss 0.0001829
| epoch  58 |   385/  385 batches | lr 0.0000025523 |  9.43 ms | loss 0.0006541
----

| epoch  71 |    77/  385 batches | lr 0.0000013102 |  9.54 ms | loss 0.0001306
| epoch  71 |   154/  385 batches | lr 0.0000013102 | 10.35 ms | loss 0.0000727
| epoch  71 |   231/  385 batches | lr 0.0000013102 |  9.98 ms | loss 0.0000545
| epoch  71 |   308/  385 batches | lr 0.0000013102 |  9.59 ms | loss 0.0001726
| epoch  71 |   385/  385 batches | lr 0.0000013102 | 10.14 ms | loss 0.0005132
--------------------------------------------------------------------------------
| end of epoch  71 | time:  3.82s
--------------------------------------------------------------------------------
| epoch  72 |    77/  385 batches | lr 0.0000012447 | 10.04 ms | loss 0.0001284
| epoch  72 |   154/  385 batches | lr 0.0000012447 |  9.54 ms | loss 0.0000711
| epoch  72 |   231/  385 batches | lr 0.0000012447 |  9.94 ms | loss 0.0000540
| epoch  72 |   308/  385 batches | lr 0.0000012447 |  9.33 ms | loss 0.0001739
| epoch  72 |   385/  385 batches | lr 0.0000012447 |  9.74 ms | loss 0.0005080
----

| epoch  85 |    77/  385 batches | lr 0.0000006390 |  9.51 ms | loss 0.0001141
| epoch  85 |   154/  385 batches | lr 0.0000006390 |  9.43 ms | loss 0.0000669
| epoch  85 |   231/  385 batches | lr 0.0000006390 |  9.42 ms | loss 0.0000533
| epoch  85 |   308/  385 batches | lr 0.0000006390 |  9.38 ms | loss 0.0001693
| epoch  85 |   385/  385 batches | lr 0.0000006390 |  9.36 ms | loss 0.0004660
--------------------------------------------------------------------------------
| end of epoch  85 | time:  3.63s
--------------------------------------------------------------------------------
| epoch  86 |    77/  385 batches | lr 0.0000006070 |  9.48 ms | loss 0.0001132
| epoch  86 |   154/  385 batches | lr 0.0000006070 |  9.38 ms | loss 0.0000672
| epoch  86 |   231/  385 batches | lr 0.0000006070 |  9.43 ms | loss 0.0000536
| epoch  86 |   308/  385 batches | lr 0.0000006070 |  9.36 ms | loss 0.0001692
| epoch  86 |   385/  385 batches | lr 0.0000006070 |  9.30 ms | loss 0.0004716
----

| epoch  99 |    77/  385 batches | lr 0.0000003116 |  9.54 ms | loss 0.0001090
| epoch  99 |   154/  385 batches | lr 0.0000003116 |  9.33 ms | loss 0.0000649
| epoch  99 |   231/  385 batches | lr 0.0000003116 |  9.40 ms | loss 0.0000533
| epoch  99 |   308/  385 batches | lr 0.0000003116 |  9.54 ms | loss 0.0001669
| epoch  99 |   385/  385 batches | lr 0.0000003116 |  9.33 ms | loss 0.0004487
--------------------------------------------------------------------------------
| end of epoch  99 | time:  3.63s
--------------------------------------------------------------------------------
| epoch 100 |    77/  385 batches | lr 0.0000002960 |  9.54 ms | loss 0.0001094
| epoch 100 |   154/  385 batches | lr 0.0000002960 |  9.33 ms | loss 0.0000648
| epoch 100 |   231/  385 batches | lr 0.0000002960 |  9.33 ms | loss 0.0000535
| epoch 100 |   308/  385 batches | lr 0.0000002960 |  9.20 ms | loss 0.0001678
| epoch 100 |   385/  385 batches | lr 0.0000002960 |  9.94 ms | loss 0.0004446
----

| epoch 113 |    77/  385 batches | lr 0.0000001520 |  9.56 ms | loss 0.0001073
| epoch 113 |   154/  385 batches | lr 0.0000001520 |  9.43 ms | loss 0.0000656
| epoch 113 |   231/  385 batches | lr 0.0000001520 |  9.48 ms | loss 0.0000540
| epoch 113 |   308/  385 batches | lr 0.0000001520 |  9.49 ms | loss 0.0001653
| epoch 113 |   385/  385 batches | lr 0.0000001520 |  9.44 ms | loss 0.0004338
--------------------------------------------------------------------------------
| end of epoch 113 | time:  3.65s
--------------------------------------------------------------------------------
| epoch 114 |    77/  385 batches | lr 0.0000001444 |  9.46 ms | loss 0.0001062
| epoch 114 |   154/  385 batches | lr 0.0000001444 |  9.49 ms | loss 0.0000649
| epoch 114 |   231/  385 batches | lr 0.0000001444 |  9.78 ms | loss 0.0000538
| epoch 114 |   308/  385 batches | lr 0.0000001444 |  9.51 ms | loss 0.0001658
| epoch 114 |   385/  385 batches | lr 0.0000001444 |  9.56 ms | loss 0.0004339
----

| epoch 127 |    77/  385 batches | lr 0.0000000741 |  9.44 ms | loss 0.0001065
| epoch 127 |   154/  385 batches | lr 0.0000000741 |  9.38 ms | loss 0.0000659
| epoch 127 |   231/  385 batches | lr 0.0000000741 |  9.39 ms | loss 0.0000538
| epoch 127 |   308/  385 batches | lr 0.0000000741 |  9.43 ms | loss 0.0001649
| epoch 127 |   385/  385 batches | lr 0.0000000741 |  9.34 ms | loss 0.0004281
--------------------------------------------------------------------------------
| end of epoch 127 | time:  3.62s
--------------------------------------------------------------------------------
| epoch 128 |    77/  385 batches | lr 0.0000000704 |  9.44 ms | loss 0.0001062
| epoch 128 |   154/  385 batches | lr 0.0000000704 |  9.65 ms | loss 0.0000657
| epoch 128 |   231/  385 batches | lr 0.0000000704 |  9.51 ms | loss 0.0000540
| epoch 128 |   308/  385 batches | lr 0.0000000704 |  9.43 ms | loss 0.0001627
| epoch 128 |   385/  385 batches | lr 0.0000000704 |  9.79 ms | loss 0.0004304
----

| epoch 141 |    77/  385 batches | lr 0.0000000361 |  9.44 ms | loss 0.0001056
| epoch 141 |   154/  385 batches | lr 0.0000000361 |  9.38 ms | loss 0.0000649
| epoch 141 |   231/  385 batches | lr 0.0000000361 |  9.34 ms | loss 0.0000543
| epoch 141 |   308/  385 batches | lr 0.0000000361 |  9.30 ms | loss 0.0001626
| epoch 141 |   385/  385 batches | lr 0.0000000361 |  9.44 ms | loss 0.0004279
--------------------------------------------------------------------------------
| end of epoch 141 | time:  3.61s
--------------------------------------------------------------------------------
| epoch 142 |    77/  385 batches | lr 0.0000000343 |  9.56 ms | loss 0.0001051
| epoch 142 |   154/  385 batches | lr 0.0000000343 |  9.48 ms | loss 0.0000651
| epoch 142 |   231/  385 batches | lr 0.0000000343 |  9.39 ms | loss 0.0000545
| epoch 142 |   308/  385 batches | lr 0.0000000343 |  9.30 ms | loss 0.0001621
| epoch 142 |   385/  385 batches | lr 0.0000000343 |  9.46 ms | loss 0.0004257
----

In [15]:
test_result, truth = forecast_seq(model, val_data)

95.75811314582825 sec


Plot forecasted sequence vs actual

In [1]:
plt.plot(truth, color='red', alpha=0.7)
plt.plot(test_result, color='blue', linewidth=0.7)
plt.title('Actual vs Forecast')
plt.legend(['Actual', 'Forecast'])
plt.xlabel('Time Steps')
plt.show()

NameError: name 'plt' is not defined

Test random sequence

In [None]:
r = np.random.randint(100000, 160000)
test_forecast = model_forecast(model, csum_logreturn[r: r+10]) # random 10 sequence length

print(f"forecast sequence: {test_forecast}")
print(f"Actual sequence: {csum_logreturn[r: r+11]}")

Save model

In [None]:
torch.save(model.state_dict(), "transformer_ts.pth")

Load model

In [None]:
model2 = TransAm() # rename as model2
model2.load_state_dict(torch.load("transformer_ts.pth"))
model2.to(device)

Testing model on Boeing stock from the same time period

In [None]:
df2 = pd.read_csv('BA_raw.csv') # Boeing Co stock
close2 = df2['close'].fillna(method = 'ffill')
close2 = np.array(close2)
logreturn2 = np.diff(np.log(close2))

In [None]:
train_data2, val_data2 = get_data(logreturn2, 0.6)
test2_eval = evaluate(model2, val_data2)
print(f'Test 2 loss: {test2_eval :.5f}')

In [None]:
test_result2, truth2 = forecast_seq(model2, val_data2)

plt.plot(truth2, color='red', alpha=0.7)
plt.plot(test_result2, color='blue', linewidth=0.7)
plt.title('Actual vs Forecast')
plt.legend(['Actual', 'Forecast'])
plt.xlabel('Time Steps')
plt.show()

Testing model on JPMorgan stock from the same time period

In [None]:
df3 = pd.read_csv('JPM_raw.csv') # JPMorgan Chase & Co stock
close3 = df3['close'].fillna(method = 'ffill')
close3 = np.array(close3)
logreturn3 = np.diff(np.log(close3))

In [None]:
train_data3, val_data3 = get_data(logreturn3, 0.6)
test3_eval = evaluate(model2, val_data3)
print(f'Test 3 loss: {test3_eval :.5f}')

In [None]:
test_result3, truth3 = forecast_seq(model2, val_data3)

plt.plot(truth3, color='red', alpha=0.7)
plt.plot(test_result3, color='blue', linewidth=0.7)
plt.title('Actual vs Forecast')
plt.legend(['Actual', 'Forecast'])
plt.xlabel('Time Steps')
plt.show()

## Conclusion
Transformer model is able to train on a longer sequence compared to LSTM models (5 vs 10)  
Transformer model also trained faster, given the longer sequence length, and took lesser epochs to train the model

### Data Transforming and Normalising
Advantages of using cumulative sum of log returns compared to min-max scaler in normalising stocks prices,  
create better standardisation across stocks  
Training data augmentation allowed model to be trained on wider data points, resulting in the model generalising well across unseen data from test 2 and test 3  
Previous testing without data augmentation had models underperforming with higher loss