In [21]:
# standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
import math
import time

# create lstm models
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras import metrics
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_absolute_error, mean_squared_error

# hyperparameter tuning
from kerastuner.tuners import RandomSearch
from tensorflow.keras.callbacks import EarlyStopping, Callback

# 1-Load Data

In [2]:
# load interim data set
data_set = pd.read_csv('../data/processed/international-airline-passengers.csv')
data_set

Unnamed: 0,passenger_number,month,previous_passenger_number
0,118.0,2,112.0
1,132.0,3,118.0
2,129.0,4,132.0
3,121.0,5,129.0
4,135.0,6,121.0
...,...,...,...
138,606.0,8,622.0
139,508.0,9,606.0
140,461.0,10,508.0
141,390.0,11,461.0


# 2-Perform Train, Validation (2 years), Test Split (2 years)

In [10]:
train_size = len(data_set) - 48
validation_size = 24
test_size = 24
train_set, validation_set, test_set = data_set[0:train_size+1], data_set[train_size:train_size+validation_size], data_set[train_size+validation_size:]
print(f"Training set: {len(train_set)}, validation set: {len(validation_set)}, test set: {len(test_set)}")

Training set: 96, validation set: 24, test set: 24


# 3-Normalize Data

In [18]:
# get the columns that have to be normalized
i = 0
temp = train_set.drop(columns="passenger_number")
for col in temp.columns:
    print(i)
    print(col)
    i = i + 1

0
month
1
previous_passenger_number


In [19]:
features_to_normalize = [0, 1]

In [23]:
def normalize_lookahead(train_set, validation_set, test_set, features_to_normalize):
    passenger_number_normalized = []
    all_features_normalized = []
    
    # fit & save demand scaler in order to denormalize predictions later on
    passenger_number_scaler = MinMaxScaler(feature_range=(0, 1))
    passenger_number_train = train_set['passenger_number'].values.reshape(len(train_set), 1)
    passenger_number_scaler = passenger_number_scaler.fit(passenger_number_train)
    
    # normalize passenger_number
    passenger_number_val =  validation_set['passenger_number'].values.reshape(len(validation_set), 1)
    passenger_number_test = test_set['passenger_number'].values.reshape(len(test_set), 1)

    passenger_number_normalized_train = passenger_number_scaler.transform(passenger_number_train)
    passenger_number_normalized.append(passenger_number_normalized_train)
    
    passenger_number_normalized_val = passenger_number_scaler.transform(passenger_number_val)
    passenger_number_normalized.append(passenger_number_normalized_val)
    
    passenger_number_normalized_test = passenger_number_scaler.transform(passenger_number_test)
    passenger_number_normalized.append(passenger_number_normalized_test)
    
    
    # fit & save a scaler for all features
    all_features_scaler = ColumnTransformer(
        remainder='passthrough', #passthough features not listed
        transformers=[
            ('mm', MinMaxScaler(feature_range=(0, 1)) , features_to_normalize)
        ])
    
    # reshape train set
    temp_train = train_set.drop(columns=["passenger_number"])
    train_reshaped_df = temp_train.values.reshape((len(temp_train), len(temp_train.columns)))
    
    # fit & save the scaler for all features
    all_features_scaler = all_features_scaler.fit(train_reshaped_df)
    
    # normalize train set
    normalized_train_set = all_features_scaler.transform(train_reshaped_df)
    all_features_normalized.append(normalized_train_set)
    
    # reshape & normalize validation set
    temp_val = validation_set.drop(columns=["passenger_number"])
    val_reshaped_df = temp_val.values.reshape((len(temp_val), len(temp_val.columns)))
    normalized_val_set = all_features_scaler.transform(val_reshaped_df)
    all_features_normalized.append(normalized_val_set)
    
    # reshape & normalize test set
    temp_test = test_set.drop(columns=["passenger_number"])
    test_reshaped_df = temp_test.values.reshape((len(temp_test), len(temp_test.columns)))
    normalized_test_set = all_features_scaler.transform(test_reshaped_df)
    all_features_normalized.append(normalized_test_set)
    
    return passenger_number_scaler, passenger_number_normalized, all_features_normalized

In [24]:
passenger_number_scaler, passenger_number_normalized, all_features_normalized = normalize_lookahead(train_set, validation_set, test_set, features_to_normalize)

# 4-Create samples using Moving Window Technique

In [29]:
def split_sequences(sequences, n_steps_in, n_steps_out):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out-1
        # check if we are beyond the dataset
        if out_end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1:out_end_ix, -1] #[lookahead]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [31]:
def create_samples(passenger_number_normalized, all_features_normalized, input_window_size, output_window_size):
    train_set = np.hstack((all_features_normalized[0], passenger_number_normalized[0]))
    train_samples = split_sequences(train_set, input_window_size, output_window_size)
    
    val_set = np.hstack((all_features_normalized[1], passenger_number_normalized[1]))
    val_samples = split_sequences(val_set, input_window_size, output_window_size)
    
    test_set = np.hstack((all_features_normalized[2], passenger_number_normalized[2]))
    test_samples = split_sequences(test_set, input_window_size, output_window_size)

    print(f"Number of training samples: {len(train_samples[0])}, validation samples: {len(val_samples[0])}, test samples: {len(test_samples[0])}")
    return train_samples, val_samples, test_samples

## Let's set size of the input window width to 6 since this was the best performing size for the univariate model

In [32]:
input_window_size = 6
output_window_size = 1

train_samples, val_samples, test_samples = create_samples(passenger_number_normalized, all_features_normalized, input_window_size, output_window_size)

Number of training samples: 91, validation samples: 19, test samples: 19


In [33]:
train_samples

(array([[[0.09090909, 0.02588997],
         [0.18181818, 0.04530744],
         [0.27272727, 0.09061489],
         [0.36363636, 0.08090615],
         [0.45454545, 0.05501618],
         [0.54545455, 0.10032362]],
 
        [[0.18181818, 0.04530744],
         [0.27272727, 0.09061489],
         [0.36363636, 0.08090615],
         [0.45454545, 0.05501618],
         [0.54545455, 0.10032362],
         [0.63636364, 0.14239482]],
 
        [[0.27272727, 0.09061489],
         [0.36363636, 0.08090615],
         [0.45454545, 0.05501618],
         [0.54545455, 0.10032362],
         [0.63636364, 0.14239482],
         [0.72727273, 0.14239482]],
 
        ...,
 
        [[0.45454545, 0.69255663],
         [0.54545455, 0.87378641],
         [0.63636364, 1.        ],
         [0.72727273, 0.97411003],
         [0.81818182, 0.81229773],
         [0.90909091, 0.65372168]],
 
        [[0.54545455, 0.87378641],
         [0.63636364, 1.        ],
         [0.72727273, 0.97411003],
         [0.81818182, 0.8122