In [1]:
import math

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from functions.checks_and_preprocessing.lagging_and_splitting import split_dataframe, sliding_window
from functions.data_load_and_transform.sql_connections import get_database_connector, get_beach_data
from functions.models.models_and_training import create_multiple_LSTM, train_model
from functions.plotting.forecast_plot import plot_forecast


Standardization assumes that your observations fit a Gaussian distribution (bell curve) with a well behaved mean and standard deviation. 
You can still standardize your time series data if this expectation is not met, but you may not get reliable results.

Make a histogram of each variable to check whether standardization or normalization should be applied.

In [None]:
# TODO: LSTM training is stochastic, so either repeat the experiment (e.g. 10 runs) and average the results, or set a fixed random seed.

In [2]:
# Obtain the database connector first, then hand it to get_beach_data, which returns
# a pair: unpacked here as the beach data and (going by the variable name) the name
# of the SQL table it came from.
db_connector = get_database_connector()
single_beach_data, beach_name_sql_table = get_beach_data(db_connector)


Selected Beach Details:
силистар


In [None]:
# TODO: write a function that takes the desired temporal scale of the data and returns the normalized data at that scale.

In [None]:
# Aggregate to weekly frequency ("W") by averaging the observations in each week.
single_beach_data_w = (
    single_beach_data
    .resample(rule="W")
    .mean()
)

In [None]:
# Split the weekly data into train / validation / test parts (split_dataframe also
# returns the index of the test part).
train, valid, test, test_index = split_dataframe(single_beach_data_w)

# Number of columns in the weekly frame, i.e. features per time step.
features = single_beach_data_w.shape[1]

# TODO: move scaling into a function.
# The scaler is fitted on the training part only; validation and test are
# transformed with the training minima/maxima.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train)
train_scaled = scaler.transform(train)
valid_scaled = scaler.transform(valid)
test_scaled = scaler.transform(test)

# Whole weekly series scaled with the same train-fitted scaler, so values outside
# the training range can fall outside [0, 1].
df_scaled = scaler.transform(single_beach_data_w.to_numpy())
# TODO: reverse scaling should also be a function.

In [None]:
# Number of consecutive time steps handed to sliding_window for each sample.
window_size = 21

# Build the (X, Y) pair for each split with the same window length.
(trainX, trainY), (valX, valY), (testX, testY) = [
    sliding_window(scaled_part, window_size)
    for scaled_part in (train_scaled, valid_scaled, test_scaled)
]

In [None]:
def generate_models(layers: list, units: list, window: int = window_size, features: int = features) -> dict:
    """Build one untrained LSTM candidate for every (layer count, unit count) pair.

    Args:
        layers: Layer counts to try.
        units: Unit counts to try; each is combined with every entry of ``layers``.
        window: Forwarded to ``create_multiple_LSTM`` as ``window``.
        features: Forwarded to ``create_multiple_LSTM`` as ``features``.

    Returns:
        Dict keyed by ``'<n_layers> layers, <n_units> units'``. Each value is a dict
        with the created model under ``'model'`` and ``'history'`` set to ``None``.

    Note:
        The defaults of ``window`` and ``features`` are the notebook-level
        ``window_size`` and ``features`` values at the moment this cell is run
        (Python evaluates defaults once, at definition time). Pass both explicitly
        if those variables may change afterwards.
    """
    return {
        f'{n_layers} layers, {n_units} units': {
            'model': create_multiple_LSTM(n_layers=n_layers, units=n_units, window=window, features=features),
            'history': None,
        }
        for n_layers in layers
        for n_units in units
    }

In [None]:
# Hyper-parameter grid: every layer count is paired with every unit count.
layers = list(range(1, 6))          # 1, 2, 3, 4, 5
units = list(range(100, 301, 50))   # 100, 150, 200, 250, 300

models = generate_models(
    layers,
    units,
    window=window_size,
    features=features,
)

In [None]:
# Train every candidate in turn and store what train_model returns next to its model.
for candidate in models.values():
    candidate['history'] = train_model(
        model=candidate['model'],
        trainX=trainX,
        trainY=trainY,
        valX=valX,
        valY=valY,
        patience=20,
    )

In [None]:
def get_best_model(models: dict, metric: str = 'root_mean_squared_error') -> dict:
    """Pick the trained candidate whose ``metric`` reached the lowest value.

    Args:
        models: Mapping of model name to a dict that holds a ``'history'`` entry.
            Entries whose history is ``None`` (not trained yet) are ignored.
        metric: Key looked up in ``history.history``; the minimum over the recorded
            values is what gets compared.
            NOTE(review): the default has no ``val_`` prefix, so it is presumably
            the training metric rather than the validation one -- confirm.

    Returns:
        A single-entry dict ``{best_name: best_history}``. On equal minima the first
        candidate in iteration order wins. If nothing qualifies, ``{None: None}``.
    """
    winner_name, winner_history = None, None
    lowest_so_far = float('inf')

    for candidate_name, candidate in models.items():
        history = candidate['history']
        if history is None:
            # Untrained candidate: nothing to compare.
            continue

        candidate_low = min(history.history[metric])
        # Strict comparison keeps the earliest candidate on ties.
        if candidate_low < lowest_so_far:
            lowest_so_far = candidate_low
            winner_name, winner_history = candidate_name, history

    return {winner_name: winner_history}

In [None]:
# Single-entry dict {model name: training history} for the lowest-RMSE candidate.
best_model_history = get_best_model(models, metric='root_mean_squared_error')

In [None]:
# TODO: extend get_best_model so that every trained model is stored in a dictionary together with its best metric value, the top values are highlighted, and the best model is selected.

In [None]:
#Fix the plot_forecast

In [None]:
# get_best_model returns a one-entry dict; its only key is the selected model's name.
best_model_name = list(best_model_history)[0]
plot_forecast(models, best_model_name, testX, testY)

In [None]:
# TODO: check (e.g. ask ChatGPT) whether a MultiIndex DataFrame can be used as input for the LSTM.

In [None]:
# TODO: everything below this line still needs to be updated for the multi-model change.

In [None]:
# TODO: wrap in a function.
# `model` was never defined after the switch to a dict of candidate models, so this
# cell raised NameError on a fresh kernel. Use the candidate picked by get_best_model:
# its name is the only key of `best_model_history`, and the model object is stored
# under 'model' in `models` (the same object that was passed to train_model).
best_model_name = next(iter(best_model_history))
model = models[best_model_name]['model']

trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
valPredict = model.predict(valX)

In [None]:
# TODO: wrap in a function.
# TODO: check which of these arrays are actually used downstream (unsure about testY, valY, ...).
# Map predictions and targets back to the original value range.
# NOTE: the names are rebound in place, so running this cell a second time would
# inverse-transform values that are already un-scaled. Re-run the windowing and
# prediction cells first if it has to be repeated.
trainPredict, testPredict, valPredict = [
    scaler.inverse_transform(scaled_values)
    for scaled_values in (trainPredict, testPredict, valPredict)
]
trainY, testY, valY = [
    scaler.inverse_transform(scaled_values)
    for scaled_values in (trainY, testY, valY)
]

In [None]:
# RMSE per split, pooled over all columns (ravel() flattens targets and predictions).
# `math` and `mean_squared_error` are imported in the notebook's top import cell.
trainScore = math.sqrt(mean_squared_error(trainY.ravel(), trainPredict.ravel()))
print(f'Train Score: {trainScore:.2f} RMSE')

# Validation predictions were computed and inverse-transformed but never scored.
valScore = math.sqrt(mean_squared_error(valY.ravel(), valPredict.ravel()))
print(f'Validation Score: {valScore:.2f} RMSE')

testScore = math.sqrt(mean_squared_error(testY.ravel(), testPredict.ravel()))
print(f'Test Score: {testScore:.2f} RMSE')

In [None]:
# TODO: also plot training vs. validation loss, and training vs. validation accuracy.

In [None]:
#check this
#train_loss, train_acc = model.evaluate_generator(train_generator, steps=16) not _generator
#validation_loss, test_acc = model.evaluate_generator(validation_generator, steps=16)
#print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

In [None]:
#batch normalization 

In [None]:
# Plot actual vs. predicted training values, one subplot per target column.
num_columns = trainY.shape[1]

# squeeze=False keeps `axes` a 2-D array even when there is only one column.
# With the default squeeze=True a single subplot comes back as a bare Axes object,
# and indexing it with axes[col] fails.
fig, axes = plt.subplots(
    num_columns, 1,
    figsize=(10, 5 * num_columns),
    sharex=True,
    squeeze=False,
)

# axes has shape (num_columns, 1); walk down its only column.
for col, ax in enumerate(axes[:, 0]):
    ax.plot(trainY[:, col], label='Actual', color='blue')
    ax.plot(trainPredict[:, col], label='Predicted', color='orange')

    ax.set_title(f'Column {col+1}')
    ax.set_xlabel('Sample')
    ax.set_ylabel('Value')
    ax.legend()

# Adjust spacing so titles and labels of stacked subplots do not overlap.
plt.tight_layout()
plt.show()

In [None]:
#Tip: ACF - window size that aligns with these significant lags to capture these patterns of seasonality or temporality 
#Tip2: PACF - how many previous time steps to include in your window

In [None]:
#Feature Engineering:

#Lag Features: Create lag features (i.e., features with past values of the target variable) to capture autocorrelation. Experiment with different lag values to see which ones are most informative.