### Import

In [None]:
# Mount Google Drive in the Colab runtime (forcing a fresh mount) and make
# "My Drive" the working directory, so the relative ./datasets/ paths used
# by later cells resolve against it.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)
%cd /gdrive/My Drive/

Mounted at /gdrive
/gdrive/My Drive


In [None]:
# Fix randomness and hide warnings
# Single seed shared by every random number generator used in the notebook
seed = 42

import os
# Reduce TensorFlow's C++ log output; set here, before TensorFlow is imported in the next cell
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only affects child processes - confirm it has the intended effect.
os.environ['PYTHONHASHSEED'] = str(seed)
# Point matplotlib's configuration directory to ./configs/ under the current working directory
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# `Warning` is the base class of every warning category, so this filter also covers the one above
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
# Seed NumPy's legacy global random generator
np.random.seed(seed)

# Used in the next cell to lower the TensorFlow logger level
import logging

import random
# Seed Python's built-in random module
random.seed(seed)

In [None]:
# Import tensorflow
import tensorflow as tf
# Short aliases used throughout the notebook: tfk = tf.keras, tfkl = tf.keras.layers
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
# Keep only error-level messages from autograph and from the TF / TF1-compat loggers
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Seed TensorFlow's random generators (TF2 API and TF1-compat API) with the shared seed
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
# Record the TensorFlow version the notebook was run with
print(tf.__version__)

2.15.0


In [None]:
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
plt.rc('font', size=16)
from sklearn.preprocessing import MinMaxScaler

# splitting data w/ stratify
from sklearn.model_selection import train_test_split

### Load data


In [None]:
# Load the three dataset arrays from ./datasets/ (relative to the current working directory).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects, which can
# execute code from an untrusted file. The dtypes shown by the following cell ('<U1',
# float64, int64) suggest pickling is not needed here - confirm and consider allow_pickle=False.
categories = np.load ("./datasets/categories.npy", allow_pickle = True)
training_data = np.load ("./datasets/training_data.npy", allow_pickle = True)
valid_periods = np.load ("./datasets/valid_periods.npy", allow_pickle = True)

In [None]:
# Inspect the loaded arrays. Only the last expression of a cell is rendered
# automatically, so the shapes are printed explicitly to keep them visible.
print(categories.shape, training_data.shape, valid_periods.shape)

categories.dtype, training_data.dtype, valid_periods.dtype

(dtype('<U1'), dtype('float64'), dtype('int64'))

In [None]:
# Define categories
# Sorted array of the distinct category labels present in the dataset
unique_categories = np.unique(categories)
print(unique_categories)

['A' 'B' 'C' 'D' 'E' 'F']


### Data prep + splitting

REMOVING DUPLICATES

In [None]:
def remove_duplicates(training_data, categories, valid_periods):
  """Drop duplicated sequences (rows) together with their metadata.

  Rows of `training_data` are compared as a whole. For every group of
  identical rows only the first occurrence is kept, and its category and
  valid period are the ones returned.

  Note that `np.unique(..., axis=0)` returns the rows in sorted order, so the
  output order differs from the input order; the three returned arrays stay
  aligned with each other.

  Args:
    training_data: 2D array, one sequence per row.
    categories: array with one entry per row of `training_data`.
    valid_periods: array with one entry (row) per row of `training_data`.

  Returns:
    Tuple `(unique_sequences, unique_categories, unique_valid_periods)`.
  """
  # A single pass yields both the unique rows and the index of the first
  # occurrence of each of them in the original array.
  unique_sequences, indexes = np.unique(training_data, return_index=True, axis=0)
  print("Unique sequences:", len(unique_sequences))

  # Metadata of the first occurrence of each unique sequence
  unique_categories = categories[indexes]
  unique_valid_periods = valid_periods[indexes]

  print(len(training_data)-len(unique_sequences), "duplicates deleted from", len(training_data), "sequences in the dataset")

  return unique_sequences, unique_categories,  unique_valid_periods

In [None]:
# Replace the loaded arrays with their de-duplicated versions (re-running this cell
# operates on the already de-duplicated data).
training_data, categories, valid_periods = remove_duplicates(training_data,categories, valid_periods)

Unique sequences: 47974
26 duplicates deleted from 48000 sequences in the dataset


CUTTING TIME SERIES W/ SIZE LESS THAN X

In [None]:
# Windowing parameters consumed by build_sequences
window = 50     # number of consecutive values in each input sequence
telescope = 1   # number of values following the window used as training target
stride = 5      # distance between the start indexes of consecutive windows

In [None]:
# Whether to discard the series that are too short
CUTTING = True
# Minimum valid length kept: one full input window plus its target
THRESHOLD = window+telescope

In [None]:
if CUTTING:
  # Length of the valid part of every series: end index minus start index
  valid_lengths = valid_periods[:, 1] - valid_periods[:, 0]

  # Keep only the series whose valid part is not shorter than THRESHOLD
  long_enough = ~(valid_lengths < THRESHOLD)

  training_data = training_data[long_enough]
  valid_periods = valid_periods[long_enough]
  categories = categories[long_enough]

  print (training_data.shape)
  print (valid_periods.shape)
  print (categories.shape)

(37337, 2776)
(37337, 2)
(37337,)


SPLITTING DATA MAINTAINING PROPORTION FOR CATEGORY

In [None]:
def split_train_val(training_data, categories, split_size=0.1, seed=seed):
  """Split the sequences into train and validation sets, stratified by category.

  Args:
    training_data: 2D array, one sequence per row.
    categories: array with the category label of each row; used only to
      stratify the split so both sets keep the same category proportions.
    split_size: fraction of the rows assigned to the validation set.
    seed: random state of the shuffled split.

  Returns:
    Tuple `(train_data, val_data)` of 2D NumPy arrays without the category.
  """
  df = pd.DataFrame(training_data)
  df['category'] = categories

  # Proportionate sampling; honour `split_size` instead of a hard-coded 0.1
  df_train, df_val = train_test_split(df, test_size=split_size, random_state=seed, stratify=df[['category']], shuffle=True)

  # The label column was needed only for stratification: drop it again
  train_data = df_train.drop(columns=['category']).to_numpy()
  val_data = df_val.drop(columns=['category']).to_numpy()

  return  train_data, val_data

In [None]:
def split_train_val_test(training_data, categories, val_size=0.1, test_size=0.1, seed=seed):
  """Split the sequences into train, validation and test sets, stratified by category.

  The test set is carved out first as a `test_size` fraction of all rows; the
  validation set is then a `val_size` fraction of the remaining rows (not of
  the whole dataset). Both splits are shuffled with the same `seed`.

  Args:
    training_data: 2D array, one sequence per row.
    categories: array with the category label of each row (stratification key).
    val_size: fraction of the non-test rows assigned to validation.
    test_size: fraction of all rows assigned to test.
    seed: random state used for both splits.

  Returns:
    Tuple `(train_data, val_data, test_data)` of 2D NumPy arrays without the category.
  """
  labelled = pd.DataFrame(training_data)
  labelled['category'] = categories

  # First hold out the test rows, then split what is left into train / validation
  remainder, test_part = train_test_split(labelled, test_size=test_size, random_state=seed, stratify=labelled[['category']], shuffle=True)
  train_part, val_part = train_test_split(remainder, test_size=val_size, random_state=seed, stratify=remainder[['category']], shuffle=True)

  # Strip the helper label column and go back to plain arrays
  train_data, val_data, test_data = (
      part.drop(columns=['category']).to_numpy()
      for part in (train_part, val_part, test_part)
  )

  return  train_data, val_data, test_data

In [None]:
# take only the last 700 values from each TS
maintains = 700

# Keep the trailing `maintains` columns of every row
smaller_training_data = training_data[:, -maintains:]

In [None]:
# padding to add to not lose values from the TS
# Number of values to prepend to each series: `window` minus the remainder of
# (maintains - (window + telescope)) divided by `stride`.
# NOTE(review): with this amount, (padded length - window - telescope) % stride equals
# window % stride, so the window starts line up with index 0 only when `stride`
# divides `window` (true for 50 and 5) - confirm this is the intended alignment.
padding = window - ((maintains-(window+telescope)) % stride)
padding

In [None]:
# adding the padding
# Prepend `padding` constant values (np.pad's default constant, 0) to the left of every row;
# no rows are added and nothing is appended on the right.
padded_training_data = np.pad(smaller_training_data, ((0, 0), (padding, 0)), 'constant')

padded_training_data.shape

CREATE TRAIN-VAL OR TRAIN-VAL-TEST SETS

In [None]:
# Stratified three-way split: 10% of all series go to test, then 10% of the
# remaining series go to validation; the rest is the training set.
X_train_raw, X_val_raw, X_test_raw = split_train_val_test(padded_training_data, categories, 0.1, 0.1, seed)

print (X_train_raw.shape, X_val_raw.shape, X_test_raw.shape)

In [None]:
# Alternative two-way split (train / validation only, no held-out test set).
# It is disabled by default: when the whole notebook is run top to bottom,
# executing it after the three-way split above would overwrite X_train_raw and
# X_val_raw with a fresh 90/10 split of ALL the series while X_test_raw is kept,
# so the test series would also appear in the training data (data leakage).
# Enable it only when no test evaluation is going to be performed.
USE_TRAIN_VAL_ONLY = False

if USE_TRAIN_VAL_ONLY:
  X_train_raw , X_val_raw = split_train_val(padded_training_data, categories, 0.1, seed)

  print (X_train_raw.shape, X_val_raw.shape)


BUILDING SEQUENCES

In [None]:
def build_sequences (training_data, window, telescope,stride):
  """Cut every series into (input window, target) pairs.

  For each series the windows are generated backwards: the first one ends
  exactly `telescope` values before the end of the series, and each following
  one starts `stride` positions earlier, down to start index 0 included.

  Args:
    training_data: 2D array (or sequence of 1D arrays), one series per row.
    window: number of values in each input sequence.
    telescope: number of values right after the window used as target.
    stride: distance between the start indexes of consecutive windows.

  Returns:
    Tuple `(inputs, targets)` of NumPy arrays with one row per window:
    `inputs` rows hold `window` values, `targets` rows hold `telescope` values.
    Series shorter than `window + telescope` contribute no rows.
  """
  seq_train=[]
  seq_predict=[]

  for series in training_data:
    # Start index of the last window that still leaves room for its target
    last_start = len(series)-window-telescope

    # The stop value is -1 (not 0) so that a window starting at index 0 is
    # produced too; otherwise a series of exactly window+telescope values
    # would yield no sample and the first aligned window would be dropped.
    for idx in range(last_start, -1, -stride):
      seq_train.append(series[idx:idx+window])
      seq_predict.append(series[idx+window:idx+window+telescope])

  return np.array(seq_train), np.array(seq_predict)

In [None]:
# Number of future values contained in each validation / test target
to_predict = 9

# Training targets hold `telescope` values per window, while validation and test
# targets hold `to_predict` values per window.
# NOTE(review): when telescope != to_predict, y_val is wider than y_train (and hence than
# the model output sized from y_train) - confirm the validation loss is meaningful then.
X_train, y_train = build_sequences(X_train_raw, window, telescope,stride)
X_val, y_val = build_sequences(X_val_raw, window, to_predict,stride)
X_test, y_test = build_sequences(X_test_raw, window, to_predict,stride)
print (X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

#print(y_train[0][0])

# THE SHAPES ARE NOW CORRECT :)

In [None]:
# free some memory
# Only the windowed X_* / y_* arrays are used from here on. After this cell the
# earlier data-preparation cells cannot be re-run without reloading the data.
del padded_training_data
del smaller_training_data
del X_train_raw
del X_val_raw
del X_test_raw
del training_data
del categories
del valid_periods

### Helper functions

In [None]:
# Plot model's history graph
def plot_history(history):
  """Plot the training curves stored in a Keras history dictionary.

  Draws one figure with training and validation loss, marking the epoch with
  the lowest validation loss, and - when the learning rate was logged - a
  second figure with the learning rate per epoch.

  Args:
    history: dict mapping metric names to per-epoch lists (the `.history`
      attribute of the object returned by `model.fit`). Must contain 'loss'
      and 'val_loss'; 'lr' (or 'learning_rate') is optional.
  """
  best_epoch = np.argmin(history['val_loss'])
  plt.figure(figsize=(17,4))
  plt.plot(history['loss'], label='Training loss', alpha=.8, color='#ff7f0e')
  plt.plot(history['val_loss'], label='Validation loss', alpha=.9, color='#5a9aa5')
  plt.axvline(x=best_epoch, label='Best epoch', alpha=.3, ls='--', color='#5a9aa5')
  plt.title('Mean Squared Error')
  plt.legend()
  plt.grid(alpha=.3)
  plt.show()

  # The learning rate is present only when a callback logged it (e.g.
  # ReduceLROnPlateau); the key is 'lr' here, 'learning_rate' is accepted as
  # a fallback for Keras versions that use that name.
  learning_rates = history.get('lr', history.get('learning_rate'))
  if learning_rates is None:
    return

  plt.figure(figsize=(18,3))
  plt.plot(learning_rates, label='Learning Rate', alpha=.8, color='#ff7f0e')
  plt.axvline(x=best_epoch, label='Best epoch', alpha=.3, ls='--', color='#5a9aa5')
  plt.legend()
  plt.grid(alpha=.3)
  plt.show()

In [None]:
# Print model's MSE + MAE for autoregression models
def prediction_performance_auto(X_test, y_test, model, steps=None):
  """Evaluate a model autoregressively and print / return its MSE and MAE.

  The model is called repeatedly: after each call the predicted values are
  appended to the right of the input window and the same number of oldest
  values is dropped from the left, so the window width stays constant. The
  loop stops once `steps` future values have been produced.

  Args:
    X_test: 2D array of input windows, one per row.
    y_test: 2D array of true future values, one row per window.
    model: object with a `predict(X)` method returning a 2D array with one
      row per input window.
    steps: number of future values to forecast. Defaults to the number of
      columns of `y_test`.

  Returns:
    Tuple `(mean_squared_error, mean_absolute_error)`.

  Raises:
    ValueError: if `steps` is smaller than 1.
  """
  if steps is None:
    steps = y_test.shape[1]
  if steps < 1:
    raise ValueError("steps must be at least 1")

  current_window = X_test
  forecast_chunks = []
  n_forecast = 0
  while n_forecast < steps:
    res = model.predict(current_window)
    forecast_chunks.append(res)
    n_forecast += res.shape[1]
    # Slide the window by as many positions as values were predicted
    current_window = np.concatenate([current_window[:, res.shape[1]:], res], axis=1)

  # Discard any surplus values of the last call; float64 keeps the metrics in double precision
  predictions = np.concatenate(forecast_chunks, axis=1)[:, :steps].astype(np.float64)

  # Print the shape of the predictions
  print(f"Predictions shape: {predictions.shape}")

  # Calculate and print Mean Squared Error (MSE)
  mean_squared_error = tfk.metrics.mean_squared_error(y_test.flatten(), predictions.flatten()).numpy()
  print(f"Mean Squared Error: {mean_squared_error}")

  # Calculate and print Mean Absolute Error (MAE)
  mean_absolute_error = tfk.metrics.mean_absolute_error(y_test.flatten(), predictions.flatten()).numpy()
  print(f"Mean Absolute Error: {mean_absolute_error}")

  return mean_squared_error, mean_absolute_error

### GRU model

In [None]:
# Per-sample shapes (batch dimension dropped) used to size the model's input and output
input_shape = X_train.shape[1:]
output_shape = y_train.shape[1:]

print(input_shape)
print(output_shape)

(50,)
(9,)


In [None]:
def build_gru(input_shape, output_shape, units):
    """Build and compile a two-layer GRU forecasting model.

    The flat input window is reshaped to (timesteps, 1), passed through two
    stacked GRU layers of `units` units, each followed by a 0.2 dropout, and
    projected by a dense layer to `output_shape[0]` values.

    Args:
        input_shape: per-sample input shape; its first entry is the window length.
        output_shape: per-sample output shape; its first entry is the number of
            predicted values.
        units: number of units of each GRU layer.

    Returns:
        The model named 'GRU_model', compiled with mean squared error loss and
        an Adam optimizer with default settings.
    """
    timesteps = input_shape[0]

    inputs = tfkl.Input(shape = input_shape)
    # Add a trailing feature axis: one value per time step
    sequence = tfkl.Reshape((timesteps, 1), input_shape = input_shape)(inputs)

    # First recurrent layer returns the whole sequence to feed the second one
    sequence = tfkl.GRU(units = units, return_sequences = True)(sequence)
    sequence = tfkl.Dropout(0.2)(sequence)

    # Second recurrent layer keeps only its final state
    encoded = tfkl.GRU(units = units)(sequence)
    encoded = tfkl.Dropout(0.2)(encoded)

    outputs = tfkl.Dense(units = output_shape[0])(encoded)

    gru_model = tfk.Model(inputs=inputs, outputs=outputs, name='GRU_model')
    gru_model.compile(loss=tfk.losses.MeanSquaredError(), optimizer=tfk.optimizers.Adam())

    return gru_model

In [None]:
# Build the GRU model with 128 units per recurrent layer, then show its
# layer summary and a diagram of the architecture with tensor shapes
model = build_gru(input_shape, output_shape,128)
model.summary()
tfk.utils.plot_model(model, expand_nested=True, show_shapes=True)

In [None]:
# Train the model
# Train for at most 30 epochs; `.history` keeps only the dict of per-epoch metrics.
# Callbacks, both driven by the validation loss:
#  - EarlyStopping: stop after 6 epochs without improvement and restore the best weights
#  - ReduceLROnPlateau: multiply the learning rate by 0.1 after 4 epochs without
#    improvement, never going below 1e-5
# NOTE(review): y_val is built with `to_predict` targets per window while the model output
# is sized from y_train (`telescope` targets) - confirm both widths match for this run.
history = model.fit(
    x = X_train,
    y = y_train,
    batch_size = 256,
    epochs = 30,
    validation_data=(X_val, y_val),
    callbacks = [
        tfk.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=6, restore_best_weights=True),
        tfk.callbacks.ReduceLROnPlateau(monitor='val_loss', mode='min', patience=4, factor=0.1, min_lr=1e-5)
    ]
).history

In [None]:
 # Predict the validation set using the model (single forward pass, no autoregression)
 predictions = model.predict(X_val, verbose=0)
 # Print the shape of the predictions
 print(f"Predictions shape: {predictions.shape}")
 # Calculate and print Mean Squared Error (MSE) over all flattened validation targets
 mean_squared_error = tfk.metrics.mean_squared_error(y_val.flatten(), predictions.flatten()).numpy()
 print(f"Mean Squared Error: {mean_squared_error}")
 # Calculate and print Mean Absolute Error (MAE) over all flattened validation targets
 mean_absolute_error = tfk.metrics.mean_absolute_error(y_val.flatten(), predictions.flatten()).numpy()
 print(f"Mean Absolute Error: {mean_absolute_error}")

In [None]:
# Plot the loss curves (and learning rate) recorded during training
plot_history(history)

In [None]:
# Save the trained model under 'Forecasting_X-GRU', relative to the current working directory
model.save('Forecasting_X-GRU')

In [None]:
# Optionally reload a previously saved model instead of using the one trained above
#model = tfk.models.load_model('Forecasting_1-GRU')
# Autoregressive evaluation on the test windows; returns (MSE, MAE)
mse, mae = prediction_performance_auto(X_test,y_test,model)