In [1]:
import os
import json
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [2]:
print("Num GPUs Available: ", tf.config.list_physical_devices())

Num GPUs Available:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# 1. No outlier treatment
Modelos entrenados sin tratamiento de outliers, hay dos casos:
- Datos normalizados entre 0 y 1
- Datos normalizados entre -1 y 1

Las imagenes en la carpeta plots se identifican de la siguiente forma:

{normalización}\_{variable}\_{neuronas}\_{ventana temporal}\_{epochs}ep_{batch size}bs.png

Por ejemplo: **01_T_64_24_15ep_32bs.png**
- Los datos son normalizados entre 0 y 1. 
- La variable que muestra la grafica es la Temperatura (T).
- La capa LSTM tiene 64 neuronas.
- La ventana temporal es de 24 (24 datos previos al instante predicho, cada una de estas 24 representa una medición cada 30min)
- Entrenado durante 15 epochs
- Batch size de 32.

## 1.1. Training phase
In here the training parameters for the sesion are decided. A list of dicts that contains the parameters of each model is created.

In [9]:
input_width = [24]
prediction_width = [1]
batch_size = [8]
epochs = [15]
dropout = [0.05]
neurons = [64]
optimizer = ['adam']
normalization = [[-1, 1]]#, [0, 1]]

station = 'C6.zip'
variables = ['T', 'HR', 'P', 'u10', 'v10', 'day', 'time', 'date']
input_vars = ['T', 'HR', 'P', 'u10', 'v10', 'day', 'time']
cols = ['T', 'HR', 'P', 'u10', 'v10']

df_initial = pd.read_csv(f'data/data_by_station/{station}', compression='zip', header=0, sep=',')
df_initial['date'] = pd.to_datetime(df_initial['date'], format='%Y-%m-%d %H:%M:%S')
df_initial['day'] = df_initial['date'].dt.dayofyear / 365
df_initial['time'] = df_initial['date'].dt.hour / 24
df_initial = df_initial.astype({'T': 'float', 'HR': 'float', 'P': 'float', 'u2': 'float', 'v2': 'float', 'u6': 'float', 'v6': 'float', 'u10': 'float', 'v10': 'float', 'altitud': 'float', 'latitud': 'float', 'longitud': 'float'})
df_initial = df_initial[variables]

parameters = []
for i in input_width:
    for j in prediction_width:
        for k in batch_size:
            for l in epochs:
                for m in dropout:
                    for n in neurons:
                            for p in optimizer:
                                for s in normalization:
                                    parameters.append({'width':i, 'output': j, 'batch': k, 'epochs': l, 'dropout': m, 'neurons': n, 'opt': p, 'norm': s})

In [10]:
last_params = {'width': 0, 'norm': []}
min_maxs = {}

for params in parameters:
    df = df_initial.copy()
    for col in cols:
        iqr = df_initial[col].quantile(0.75) - df_initial[col].quantile(0.25)
        min_maxs[col] = [df_initial[col].quantile(0.25) - 1.5 * iqr, df_initial[col].quantile(0.75) + 1.5 * iqr]
        df[col] = ((params['norm'][1] - params['norm'][0]) * (df[col] - min_maxs[col][0]) / (min_maxs[col][1] - min_maxs[col][0])) + params['norm'][0]
        df.loc[df[col] < params['norm'][0], col] = params['norm'][0]
        df.loc[df[col] > params['norm'][1], col] = params['norm'][1]
        
    df_train = df[df['date'] < '2019-01-01'].copy()
    df_test = df[df['date'] >= '2019-01-01'].copy()

    train_X = []
    train_Y = []
    for i in range(params['width'], len(df_train) - params['output']):
        train_X.append(df_train.iloc[i - params['width']:i][input_vars].values)
        train_Y.append(df_train.iloc[i:i + params['output']][cols].values)
    train_X = np.array(train_X)
    train_Y = np.array(train_Y)

    test_X = []
    test_Y = []
    for i in range(params['width'], len(df_test) - params['output']):
        test_X.append(df_test.iloc[i - params['width']:i][input_vars].values)
        test_Y.append(df_test.iloc[i:i + params['output']][cols].values)
    test_X = np.array(test_X)
    test_Y = np.array(test_Y)
        
    model = Sequential()
    model.add(LSTM(params['neurons'], activation='tanh', input_shape=(train_X.shape[1], train_X.shape[2]), return_sequences=False))
    model.add(Dropout(params['dropout']))
    model.add(Dense(units=len(cols), activation='linear'))
    model.compile(optimizer=params['opt'], loss='mse', metrics=['mae'])

    with tf.device('/device:GPU:0'):
        history = model.fit(train_X, train_Y, epochs=params['epochs'], batch_size=params['batch'], validation_split=0.1, verbose=1, shuffle=False)
    model.save(f'models/{station.strip(".zip")}_{params["width"]}_{params["output"]}_{params["batch"]}_{params["epochs"]}_{params["dropout"]}_{params["neurons"]}_{params["opt"]}_{params["norm"][0]}{params["norm"][1]}.h5')

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## 1.2. Prediction and plotting phase 
In this phase the predictions of the models trained are done and saved to plot afterwards

In [11]:
directories = os.listdir('models')

norm = [model for model in directories if '11' in model]

results = []

for model_path in norm:
    params = model_path.strip('.h5').split('_')
    if '01' in params[-1]:
        norm_min, norm_max = 0, 1
    else:
        norm_min, norm_max = -1, 1
    df_test = df_initial[df_initial['date'] >= '2019-01-01'].copy()

    for col in cols:
        iqr = df_initial[col].quantile(0.75) - df_initial[col].quantile(0.25)
        min_maxs[col] = [df_initial[col].quantile(0.25) - 1.5 * iqr, df_initial[col].quantile(0.75) + 1.5 * iqr]
        df_test[col] = ((norm_max - norm_min) * (df_test[col] - min_maxs[col][0]) / (min_maxs[col][1] - min_maxs[col][0])) + norm_min
        df_test.loc[df_test[col] < norm_min, col] = norm_min
        df_test.loc[df_test[col] > norm_max, col] = norm_max
    
    test_X = []
    test_Y = []
    for i in range(int(params[1]), len(df_test) - int(params[2])):
        test_X.append(df_test.iloc[i - int(params[1]):i][input_vars].values)
        test_Y.append(df_test.iloc[i:i + int(params[2])][cols].values)
    test_X = np.array(test_X)
    test_Y = np.array(test_Y)

    model = keras.models.load_model(f'models/{model_path}')
    y_pred = model.predict(test_X)
    for idx, col in enumerate(cols):
        y_pred[:, idx] = ((y_pred[:, idx] - norm_min) * (min_maxs[col][1] - min_maxs[col][0]) / (norm_max - norm_min)) + min_maxs[col][0]
    
    results.append(y_pred)

for idx, col in enumerate(cols):
    test_Y[:, 0, idx] = ((test_Y[:, 0, idx] - norm_min) * (min_maxs[col][1] - min_maxs[col][0]) / (norm_max - norm_min)) + min_maxs[col][0]



In [12]:
minimum = np.inf
for idx, result in enumerate(results):
    if len(result) < minimum:
        minimum = len(result)
for idx, result in enumerate(results):
    results[idx] = result[-minimum:]
test_Y = test_Y[-minimum:, 0, :]

for idx, col in enumerate(cols):
    # plot each column
    plt.figure(figsize=(10, 6))
    plt.plot(test_Y[:480, idx], label='Real', color='blue')
    for idx2, result in enumerate(results):
        params = norm[idx2].strip('.h5').split('_')
        rmse = np.sqrt(np.mean(np.power((test_Y[:, idx] - result[:, idx]), 2)))
        plt.plot(result[:480, idx], label=f'RMSE {rmse:.4f}')
    plt.legend()
    plt.title(f'{col} - Drop out - {params[-1]} Normalization (No outliers)')
    plt.savefig(f'plots/{col}_dropOut_o{params[-1]}norm.png')
    plt.clf()


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

# 2. Treated outliers

Modelos entrenados usando IQR como tratamiento de outliers, hay dos casos:
- Datos normalizados entre 0 y 1
- Datos normalizados entre -1 y 1

Las imagenes en la carpeta plots se identifican de la siguiente forma:

{normalización}\_{variable}\_{neuronas}\_{ventana temporal}\_{epochs}ep_{batch size}bs.png

Por ejemplo: **o01_T_64_24_15ep_32bs.png**
- Los datos son normalizados entre 0 y 1, la letra o indica que se han tratado outliers. 
- La variable que muestra la grafica es la Temperatura (T).
- La capa LSTM tiene 64 neuronas.
- La ventana temporal es de 24 (24 datos previos al instante predicho, cada una de estas 24 representa una medición cada 30min)
- Entrenado durante 15 epochs
- Batch size de 32.

In [5]:
input_width = [24]
prediction_width = [1]
batch_size = [8]
epochs = [20]
dropout = [0.05, 0.1, 0.15, 0.3]
neurons = [64]
optimizer = ['adam']
normalization = [[-1, 1]]#, [0, 1]]

station = 'C6.zip'
variables = ['T', 'HR', 'P', 'u10', 'v10', 'day', 'time', 'date']
input_vars = ['T', 'HR', 'P', 'u10', 'v10', 'day', 'time']
cols = ['T', 'HR', 'P', 'u10', 'v10']

df_initial = pd.read_csv(f'data/data_by_station/{station}', compression='zip', header=0, sep=',')
df_initial['date'] = pd.to_datetime(df_initial['date'], format='%Y-%m-%d %H:%M:%S')
df_initial['day'] = df_initial['date'].dt.dayofyear / 365
df_initial['time'] = df_initial['date'].dt.hour / 24
df_initial = df_initial.astype({'T': 'float', 'HR': 'float', 'P': 'float', 'u2': 'float', 'v2': 'float', 'u6': 'float', 'v6': 'float', 'u10': 'float', 'v10': 'float', 'altitud': 'float', 'latitud': 'float', 'longitud': 'float'})
df_initial = df_initial[variables]

parameters = []
for i in input_width:
    for j in prediction_width:
        for k in batch_size:
            for l in epochs:
                for m in dropout:
                    for n in neurons:
                            for p in optimizer:
                                for s in normalization:
                                    parameters.append({'width':i, 'output': j, 'batch': k, 'epochs': l, 'dropout': m, 'neurons': n, 'opt': p, 'norm': s})

In [6]:
last_params = {'width': 0, 'norm': []}
for params in parameters:
    df = df_initial.copy()
    for col in cols:
        df[col] = ((params['norm'][1] - params['norm'][0]) * (df[col] - df_initial[col].min()) / (df_initial[col].max() - df_initial[col].min())) + params['norm'][0]

    df_train = df[df['date'] < '2019-01-01'].copy()
    df_test = df[df['date'] >= '2019-01-01'].copy()

    train_X = []
    train_Y = []
    for i in range(params['width'], len(df_train) - params['output']):
        train_X.append(df_train.iloc[i - params['width']:i][input_vars].values)
        train_Y.append(df_train.iloc[i:i + params['output']][cols].values)
    train_X = np.array(train_X)
    train_Y = np.array(train_Y)

    test_X = []
    test_Y = []
    for i in range(params['width'], len(df_test) - params['output']):
        test_X.append(df_test.iloc[i - params['width']:i][input_vars].values)
        test_Y.append(df_test.iloc[i:i + params['output']][cols].values)
    test_X = np.array(test_X)
    test_Y = np.array(test_Y)
        
    model = Sequential()
    model.add(LSTM(params['neurons'], activation='tanh', input_shape=(train_X.shape[1], train_X.shape[2]), return_sequences=False))
    model.add(Dropout(params['dropout']))
    model.add(Dense(units=len(cols), activation='linear'))
    model.compile(optimizer=params['opt'], loss='mse', metrics=['mae'])

    with tf.device('/device:GPU:0'):
        history = model.fit(train_X, train_Y, epochs=params['epochs'], batch_size=params['batch'], validation_split=0.1, verbose=1, shuffle=False)
    model.save(f'models/{station.strip(".zip")}_{params["width"]}_{params["output"]}_{params["batch"]}_{params["epochs"]}_{params["dropout"]}_{params["neurons"]}_{params["opt"]}_{params["norm"][0]}{params["norm"][1]}.h5')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## 1.2. Prediction and plotting phase 
In this phase the predictions of the models trained are done and saved to plot afterwards

In [None]:
directories = os.listdir('models')

norm = [model for model in directories if '11' in model]

results = []

for model_path in norm:
    params = model_path.strip('.h5').split('_')
    if '01' in params[-1]:
        norm_min, norm_max = 0, 1
    else:
        norm_min, norm_max = -1, 1
    df_test = df_initial[df_initial['date'] >= '2019-01-01'].copy()

    for col in cols:
        df_test[col] = ((norm_max - norm_min) * (df_test[col] - df_initial[col].min()) / (df_initial[col].max() - df_initial[col].min())) + norm_min
    
    test_X = []
    test_Y = []
    for i in range(int(params[1]), len(df_test) - int(params[2])):
        test_X.append(df_test.iloc[i - int(params[1]):i][input_vars].values)
        test_Y.append(df_test.iloc[i:i + int(params[2])][cols].values)
    test_X = np.array(test_X)
    test_Y = np.array(test_Y)

    model = keras.models.load_model(f'models/{model_path}')
    y_pred = model.predict(test_X)
    for idx, col in enumerate(cols):
        y_pred[:, idx] = ((y_pred[:, idx] - norm_min) * (df_initial[col].max() - df_initial[col].min()) / (norm_max - norm_min)) + df_initial[col].min()
    
    results.append(y_pred)

for idx, col in enumerate(cols):
    test_Y[:, 0, idx] = ((test_Y[:, 0, idx] - norm_min) * (df_initial[col].max() - df_initial[col].min()) / (norm_max - norm_min)) + df_initial[col].min()



In [None]:
minimum = np.inf
for idx, result in enumerate(results):
    if len(result) < minimum:
        minimum = len(result)
for idx, result in enumerate(results):
    results[idx] = result[-minimum:]
test_Y = test_Y[-minimum:, 0, :]

for idx, col in enumerate(cols):
    # plot each column
    plt.figure(figsize=(10, 6))
    plt.plot(test_Y[:480, idx], label='Real', color='blue')
    for idx2, result in enumerate(results):
        params = norm[idx2].strip('.h5').split('_')
        rmse = np.sqrt(np.mean(np.power((test_Y[:, idx] - result[:, idx]), 2)))
        plt.plot(result[:480, idx], label=f'{params[3]} Batch size: {rmse:.4f}')
    plt.legend()
    plt.title(f'{col} - Batch size - {params[-1]} Normalization')
    plt.savefig(f'plots/{col}_batchSize_{params[-1]}norm.png')
    plt.clf()


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [None]:
%reset -f