In [None]:
import math
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pprint
import sys
# Add ./functions to the module search path before the project imports below.
# NOTE(review): the imports below already go through the `functions.` package prefix,
# so this entry presumably serves bare-name imports inside that package - confirm it
# is still needed.
sys.path.append('./functions')

import warnings
# Suppress all Python warnings from this point on. This also hides any warnings the
# statistical tests and plotting calls further down may raise.
warnings.filterwarnings('ignore')

# Project-local helpers: data checks, statistical tests, SQL loading and plotting.
from functions.checks_and_preprocessing.missing_or_nan import check_missing_or_nan
from functions.checks_and_preprocessing.stationarity_normality import kpss_adf_stationarity, normality_testing
from functions.data_load_and_transform.sql_connections import get_database_connector, get_beach_data
from functions.plotting.data_and_acf import create_widgets_and_plot

In [None]:
# Select one beach (via index), read its data from the SQL database, and check the
# expected date range for missing hours and NaN values.
DATA_STARTDATE = "1979-01-01"
DATA_ENDDATE = "2021-12-31"

db_connector = get_database_connector()
single_beach_data, beach_name_sql_table = get_beach_data(db_connector)
check_missing_or_nan(
    single_beach_data,
    beach_name_sql_table,
    DATA_STARTDATE,
    DATA_ENDDATE,
)

In [None]:
# Summary statistics of the beach data, reported at a wider set of percentiles
# than the describe() default (25%, 50%, 75%).
percentiles = [0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95]
single_beach_data.describe(percentiles=percentiles)

In [None]:
# Report which features contain at least one zero value and which contain at least
# one negative value.
# Each comparison yields a boolean Series; reducing it with Series.any() keeps the
# check vectorised, whereas the builtin any() would walk the Series element by
# element in Python.
zero_var_list = [
    variable
    for variable in single_beach_data.columns
    if (single_beach_data[variable] == 0).any()
]
negative_var_list = [
    variable
    for variable in single_beach_data.columns
    if (single_beach_data[variable] < 0).any()
]

print(f'Features with zero values: {zero_var_list}')
print(f'Features with negative values: {negative_var_list}')


In [None]:
# Run the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) and Augmented Dickey-Fuller (ADF)
# stationarity tests on every column and keep both outcomes per column.
def _kpss_adf_pair(series):
    """Return the two values produced by kpss_adf_stationarity as a two-item list.

    Presumably the KPSS outcome comes first and the ADF outcome second, going by
    the helper's name - confirm against its definition.
    """
    kpss_outcome, adf_outcome = kpss_adf_stationarity(series)
    return [kpss_outcome, adf_outcome]


stationarity_dict = {
    column: _kpss_adf_pair(single_beach_data[column])
    for column in single_beach_data.columns
}

pprint.pprint(stationarity_dict)

In their paper “Applying LSTM to Time Series Predictable through Time-Window Approaches”, Gers, Eck and Schmidhuber claim:
"Our results suggest to use LSTM only on tasks where traditional time window-based approaches must fail.
LSTM’s ability to track slow oscillations in the chaotic signal may be applicable to cognitive domains such as rhythm detection in speech and music."
Based on that, we do not apply differencing to the non-stationary columns at this point, since LSTMs could excel at handling such datasets.

In [None]:
# Run the normality tests on every column and collect the results per column.
# (normality_testing is a project helper; it is expected to carry out 4 tests.)
normality_dict = {
    column: normality_testing(single_beach_data[column])
    for column in single_beach_data.columns
}

pprint.pprint(normality_dict)

In [None]:
# Checking distribution of each column, one histogram per subplot.
# Binning done with the Freedman-Diaconis rule.

def _freedman_diaconis_bins(series):
    """Return a histogram bin count for ``series`` using the Freedman-Diaconis rule.

    The bin width is ``2 * IQR * n ** (-1/3)`` and the bin count is the data range
    divided by that width, rounded up so the bins span the whole range. ``n`` is
    the number of non-NaN observations, matching the quantile/min/max calls,
    which skip NaN as well.

    Falls back to Sturges' rule (``ceil(log2(n)) + 1``) when the width cannot be
    used: a zero IQR (constant or heavily tied data, e.g. a mostly-zero column)
    or a NaN / non-finite intermediate value. The result is always at least 1,
    so it is a valid ``bins`` argument.
    """
    values = series.dropna()
    n_obs = len(values)
    if n_obs < 2:
        return 1

    iqr = values.quantile(0.75) - values.quantile(0.25)
    bin_width = 2 * iqr * n_obs ** (-1 / 3)
    # A NaN width compares False here and drops through to the fallback.
    if bin_width > 0:
        raw_bins = (values.max() - values.min()) / bin_width
        if math.isfinite(raw_bins):
            return max(1, math.ceil(raw_bins))

    return max(1, math.ceil(math.log2(n_obs)) + 1)


# Calculate the number of rows needed for a three-column grid of subplots
num_vars = len(single_beach_data.columns)
num_rows = math.ceil(num_vars / 3)

fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, num_rows*5))

# Flatten the axes grid once and pair each axis with a column
flat_axes = axs.flatten()
for ax, column in zip(flat_axes, single_beach_data.columns):
    num_bins = _freedman_diaconis_bins(single_beach_data[column])

    single_beach_data[column].plot(kind='hist', bins=num_bins, ax=ax)
    ax.set_title(f'Distribution of {column}')

# Remove unused subplots (the grid may have more cells than there are columns)
for unused_ax in flat_axes[num_vars:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the columns, drawn as an annotated heatmap.
corr = single_beach_data.corr()

# Mask the upper triangle (diagonal included) so each pair is shown only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

fig, ax = plt.subplots(figsize=(16, 11))
sns.heatmap(corr, mask=mask, cmap=cmap, annot=True, ax=ax)
plt.show()

In [None]:
# Interactive plot to help visualize the data and its ACF/PACF, with customizable options.
# create_widgets_and_plot is imported from functions.plotting.data_and_acf.
# NOTE(review): presumably for visual exploration only - confirm that no later result
# depends on the widget state, so the notebook stays reproducible from code alone.
create_widgets_and_plot(single_beach_data)