### Imports

In [1]:
import os
import numpy as np
import random
import joblib
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore') 
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # hopefully nothing explodes

import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
print(tf.version)
print(tf.config.list_physical_devices('GPU'))

<module 'tensorflow._api.v2.version' from '/home/zyzz/anaconda3/lib/python3.11/site-packages/tensorflow/_api/v2/version/__init__.py'>
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# For reproducible results
seed = 42
os.environ['PYTHONHASHSEED']=str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)  

## Data

In [3]:
# Constants
val_size = 0.2
data_path = 'training_dataset'
seq_length = 128     # predictions based on previous seq_length data entries
forecast_length = 9  # predicting forecast_length time steps into the future
sample_length = seq_length + forecast_length

In [4]:
# Read data
categories = np.load(os.path.join(data_path, 'categories.npy'))
training_data = np.load(os.path.join(data_path, 'training_data.npy'))
valid_periods = np.load(os.path.join(data_path, 'valid_periods.npy'))

print(categories.shape, training_data.shape, valid_periods.shape)

(48000,) (48000, 2776) (48000, 2)


In [5]:
# Create a dictionary mapping each category to corresponding data
data = {category: [] for category in np.unique(categories)}
for i, time_series in enumerate(training_data):
    data[categories[i]].append(time_series[valid_periods[i][0]:valid_periods[i][1]])

# Print amount of time series for each category
formatted_strings = [f"{category}: {len(time_series_list)}" for category, time_series_list in data.items()]
print(", ".join(formatted_strings))

A: 5728, B: 10987, C: 10017, D: 10016, E: 10975, F: 277


In [6]:
# Convert time series to {x: sequences of length seq_length, y: values to be predicted from previous sequence}
def to_sequences(time_series):
    
    x = []
    y = []
    
    for i in range(time_series.shape[0]-seq_length-forecast_length+1):
        x.append(time_series[i:i+seq_length])
        y.append(time_series[i+seq_length:i+seq_length+forecast_length])  
    
    x = np.array(x)
    y = np.array(y)
    
    return {'x': x, 'y': y}

In [7]:
for category in data.keys():
    
    # Shuffle the lists of time series (we don't want to make any assumptions about the order)
    random.shuffle(data[category])
    
    # Build sequences from the time series
    X, y = [], []
    for i, time_series in enumerate(data[category]): 
        if (len(time_series) >= sample_length): # assert we can draw at least one sample from the time_series
            sequences = to_sequences(time_series)
            X.append(sequences['x']) 
            y.append(sequences['y'])   

    # Build our data sets                
    # Note: there is no overlap between train and validation; each processed time series is used in train xor val    
    split_index = int((1-val_size)*len(X))
    X_train = np.concatenate(X[0:split_index], axis=0)
    X_val = np.concatenate(X[split_index:], axis=0)
    y_train = np.concatenate(y[0:split_index], axis=0)
    y_val = np.concatenate(y[split_index:], axis=0)
    
    # Apply robust scaling (fit only to training data to avoid bias)
    rscaler_X = joblib.load('LSTM_v4/rscaler_X.save')
    rscaler_y = joblib.load('LSTM_v4/rscaler_y.save')
    X_train = rscaler_X.transform(X_train)
    X_val = rscaler_X.transform(X_val)
    y_train = rscaler_y.transform(y_train)
    y_val = rscaler_y.transform(y_val)

    # Add the time dimension to the data sets
    X_train = X_train.reshape((-1, seq_length, 1))
    X_val = X_val.reshape((-1, seq_length, 1))                      
    y_train = y_train.reshape((-1, forecast_length, 1)) 
    y_val = y_val.reshape((-1, forecast_length, 1))   
    print(category, ': ', X_train.shape, X_val.shape, y_train.shape, y_val.shape)
     
    # Replace the list of time series with a dictionary with the data sets
    data[category] = {'X_train': X_train, 'X_val': X_val, 'y_train': y_train, 'y_val': y_val}

A :  (661911, 128, 1) (161525, 128, 1) (661911, 9, 1) (161525, 9, 1)
B :  (525289, 128, 1) (130754, 128, 1) (525289, 9, 1) (130754, 9, 1)
C :  (729993, 128, 1) (184536, 128, 1) (729993, 9, 1) (184536, 9, 1)
D :  (860625, 128, 1) (211567, 128, 1) (860625, 9, 1) (211567, 9, 1)
E :  (586833, 128, 1) (144234, 128, 1) (586833, 9, 1) (144234, 9, 1)
F :  (17665, 128, 1) (5476, 128, 1) (17665, 9, 1) (5476, 9, 1)


## ML

In [8]:
# Constants
batch_size = 128
epochs = 1000

# Stop training when validation loss stops improving, maintain best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,         # how many epochs to check for improvement before stopping
    restore_best_weights=True,
)
    
final_val_losses = {}
final_val_losses_per_step = {}

for category in data.keys():
    print(f"Training model for CATEGORY {category}...")
    
    # Load model trained on all data
    model = tfk.models.load_model('LSTM_v4/model')
    model.compile(optimizer='adam', loss='mse')
    
    # "Transfer learning"
    history = model.fit(data[category]['X_train'],
                        data[category]['y_train'], 
                        batch_size=batch_size, 
                        epochs=epochs, 
                        validation_data=(data[category]['X_val'], data[category]['y_val']),
                        callbacks=early_stopping,
                        verbose=1)
    
    # Save model
    model.save(os.path.join('LSTM_v5', category))
    
    # Evaluate on original validation data 
    y_val_org = rscaler_y.inverse_transform(data[category]['y_val'].reshape((-1, forecast_length)))
    y_pred = model.predict(data[category]['X_val'])
    y_pred_iscaled = rscaler_y.inverse_transform(y_pred.reshape((-1, forecast_length)))
    mse = tfk.losses.MeanSquaredError()
    final_val_losses[category] = mse(y_val_org, y_pred_iscaled).numpy()
       
    # Val loss for each prediction step
    final_val_losses_per_step[category] = []
    for t in range(forecast_length):
        mse = tfk.metrics.MeanSquaredError()
        mse.update_state(y_val_org[:, t], y_pred_iscaled[:, t])
        final_val_losses_per_step[category].append(mse.result().numpy())

Training model for CATEGORY A...
Epoch 1/1000


I0000 00:00:1703205112.089337     780 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
INFO:tensorflow:Assets written to: LSTM_v5/A/assets


INFO:tensorflow:Assets written to: LSTM_v5/A/assets


Training model for CATEGORY B...
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
INFO:tensorflow:Assets written to: LSTM_v5/B/assets


INFO:tensorflow:Assets written to: LSTM_v5/B/assets


Training model for CATEGORY C...
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
INFO:tensorflow:Assets written to: LSTM_v5/C/assets


INFO:tensorflow:Assets written to: LSTM_v5/C/assets


Training model for CATEGORY D...
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
INFO:tensorflow:Assets written to: LSTM_v5/D/assets


INFO:tensorflow:Assets written to: LSTM_v5/D/assets


Training model for CATEGORY E...
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
INFO:tensorflow:Assets written to: LSTM_v5/E/assets


INFO:tensorflow:Assets written to: LSTM_v5/E/assets


Training model for CATEGORY F...
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
INFO:tensorflow:Assets written to: LSTM_v5/F/assets


INFO:tensorflow:Assets written to: LSTM_v5/F/assets




In [9]:
# Print stats for each model 
for category in data.keys():    
    print(f"Category {category} val MSE: {final_val_losses[category]}")
    print(f"Val MSE per step: {final_val_losses_per_step[category]}")
    print()

Category A val MSE: 0.005164689384400845
Val MSE per step: [0.0024824727, 0.0034169152, 0.0041555474, 0.004824764, 0.00535913, 0.0058594467, 0.0063155144, 0.006822516, 0.007245894]

Category B val MSE: 0.006230644881725311
Val MSE per step: [0.00309717, 0.0040435004, 0.004834892, 0.005655216, 0.006334659, 0.0070043067, 0.007676845, 0.008366125, 0.009063091]

Category C val MSE: 0.004625280387699604
Val MSE per step: [0.0020232128, 0.0027910173, 0.003524516, 0.0041232198, 0.004710779, 0.0052975304, 0.0058630677, 0.006356482, 0.006937697]

Category D val MSE: 0.005008918698877096
Val MSE per step: [0.0028908388, 0.0035152617, 0.0040775607, 0.004632967, 0.005087474, 0.0055332016, 0.0060181683, 0.0064534172, 0.0068713804]

Category E val MSE: 0.004516208078712225
Val MSE per step: [0.0020334437, 0.0028191747, 0.0034218228, 0.0040717577, 0.004572788, 0.0051004975, 0.005668595, 0.0061783274, 0.0067794626]

Category F val MSE: 0.003445324255153537
Val MSE per step: [0.0013405962, 0.0020186729

In [10]:
# Print average val loss
print(f"Avg val MSE: {sum(final_val_losses.values())/len(data.keys())}")

Avg val MSE: 0.00483184428109477
