## Import packages

In [1]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from helper import series_to_supervised
from tensorflow.keras.models import load_model
from preprocess import water_postprocess
from scipy.stats import mannwhitneyu, wilcoxon, ttest_rel
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from performance import metrics_s1_t1

2024-02-03 15:03:12.584480: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

## Preprocess data

In [4]:
# https://keras.io/examples/timeseries/timeseries_weather_forecasting/#climate-data-timeseries
data = pd.read_csv("../data/pollution.csv", index_col=0)
data.fillna(0, inplace=True)
data.head()

Unnamed: 0_level_0,pollution,dew,temp,press,wnd_spd,snow,rain,NE,NW,SE,cv
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,1.79,0,0,0,0,1,0
2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,2.68,0,0,0,0,1,0
2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,3.57,0,0,0,0,1,0
2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,5.36,1,0,0,0,1,0
2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,6.25,2,0,0,0,1,0


In [5]:
values = data.values

# specify the number of lag hours
n_hours = 24*3
n_features = data.shape[-1]
k = 12
split1 = 0.7
split2 = 0.85

# frame as supervised learning
reframed = series_to_supervised(values, n_hours, k)
print("reframed.shape:", reframed.shape)

reframed.shape: (43717, 924)


In [6]:
# split into train and test sets
reframed_values = reframed.values
n_train_hours = int(len(reframed_values)*split1)
n_valid_hours = int(len(reframed_values)*split2)

train = reframed_values[:n_train_hours, :]
val = reframed_values[n_train_hours:n_valid_hours, :]
test = reframed_values[n_valid_hours:, :]


# split into input and outputs
n_obs = n_hours * n_features
feature_idx = 0
train_X, train_y = train[:, :n_obs], train[:, [n_obs + feature_idx + n_features * i for i in range(k)]]
val_X, val_y = val[:, :n_obs], val[:, [n_obs + feature_idx + n_features * i for i in range(k)]]
test_X, test_y = test[:, :n_obs], test[:, [n_obs + feature_idx + n_features * i for i in range(k)]]


print("train_X.shape, train_y.shape, val_X.shape, val_y.shape, test_X.shape, test_y.shape", 
      train_X.shape, train_y.shape, val_X.shape, val_y.shape, test_X.shape, test_y.shape
     )

train_X.shape, train_y.shape, val_X.shape, val_y.shape, test_X.shape, test_y.shape (30601, 792) (30601, 12) (6558, 792) (6558, 12) (6558, 792) (6558, 12)


In [7]:
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))

train_X = scaler.fit_transform(train_X)
train_y = scaler.fit_transform(train_y)

val_X = scaler.fit_transform(val_X)
val_y = scaler.fit_transform(val_y)

test_X = scaler.fit_transform(test_X)
test_y = scaler.fit_transform(test_y)

# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], n_hours, n_features))
val_X = val_X.reshape((val_X.shape[0], n_hours, n_features))
test_X = test_X.reshape((test_X.shape[0], n_hours, n_features))

print("train_X.shape, train_y.shape, val_X.shape, val_y.shape, test_X.shape, test_y.shape", 
      train_X.shape, train_y.shape, val_X.shape, val_y.shape, test_X.shape, test_y.shape
     )

train_X.shape, train_y.shape, val_X.shape, val_y.shape, test_X.shape, test_y.shape (30601, 72, 11) (30601, 12) (6558, 72, 11) (6558, 12) (6558, 72, 11) (6558, 12)


### PM threshold

In [8]:
train_X_pm = train_X[:, 0, feature_idx]
print(train_X_pm.shape)

val_X_pm = val_X[:, 0, feature_idx]
print(val_X_pm.shape)

test_X_pm = test_X[:, 0, feature_idx]
print(test_X_pm.shape)

(30601,)
(6558,)
(6558,)


In [9]:
percentile = 95

merged_array = np.concatenate((train_X_pm, val_X_pm, test_X_pm))

percentile_pm = np.percentile(merged_array, percentile)

print("{}th Percentile of Daily Rain:".format(percentile), percentile_pm)

95th Percentile of Daily Rain: 0.33400402414486924


### train_X_filter

In [10]:
train_X_extreme = train_X[train_X_pm > percentile_pm]
print(train_X_extreme.shape)

train_y_extreme = train_y[train_X_pm > percentile_pm]
print(train_y_extreme.shape)

(848, 72, 11)
(848, 12)


In [11]:
train_X_normal = train_X[train_X_pm <= percentile_pm]
print(train_X_normal.shape)

train_y_normal = train_y[train_X_pm <= percentile_pm]
print(train_y_normal.shape)

(29753, 72, 11)
(29753, 12)


### val_X_filter

In [12]:
val_X_extreme = val_X[val_X_pm > percentile_pm]
print(val_X_extreme.shape)

val_y_extreme = val_y[val_X_pm > percentile_pm]
print(val_y_extreme.shape)

(752, 72, 11)
(752, 12)


In [13]:
val_X_normal = val_X[val_X_pm <= percentile_pm]
print(val_X_normal.shape)

val_y_normal = val_y[val_X_pm <= percentile_pm]
print(val_y_normal.shape)

(5806, 72, 11)
(5806, 12)


### test_X_filter

In [14]:
test_X_extreme = test_X[test_X_pm > percentile_pm]
print(test_X_extreme.shape)

test_y_extreme = test_y[test_X_pm > percentile_pm]
print(test_y_extreme.shape)

(581, 72, 11)
(581, 12)


In [15]:
test_X_normal = test_X[test_X_pm <= percentile_pm]
print(test_X_normal.shape)

test_y_normal = test_y[test_X_pm <= percentile_pm]
print(test_y_normal.shape)

(5977, 72, 11)
(5977, 12)


## Test model

In [16]:
# ws_threshold = 2.58
time_index = 0

#### Extreme

In [33]:
# saved_model = load_model('../saved_models_mlp/pm_N.h5') 
# saved_model = load_model('../saved_models_mlp/pm_E.h5') 


# saved_model = load_model('../saved_models_mlp/pm_all.h5') 
# saved_model = load_model('../saved_models_mlp/pm_all_95_ft.h5')


# saved_model = load_model('../saved_models_mlp/pm_all_weighted_IPF_95.h5')
# saved_model = load_model('../saved_models_mlp/pm_all_weighted_IPF_95_ft.h5')


# saved_model = load_model('../saved_models_mlp/pm_all_weighted_EVT_95.h5')
# saved_model = load_model('../saved_models_mlp/pm_all_weighted_EVT_95_ft.h5')


saved_model = load_model('../saved_models_mlp/pm_all_weighted_META_95.h5')
# saved_model = load_model('../saved_models_mlp/pm_all_weighted_META_95_ft.h5')


yhat_extreme = saved_model.predict(test_X_extreme)

inv_yhat_extreme = scaler.inverse_transform(yhat_extreme)
inv_y_extreme = scaler.inverse_transform(test_y_extreme)
test_errors_extreme = inv_yhat_extreme - inv_y_extreme

# print('MAE = {}'.format(float("{:.6f}".format(mae(inv_y_extreme, inv_yhat_extreme)))))
# print('RMSE = {}'.format(float("{:.6f}".format(np.sqrt(mse(inv_y_extreme, inv_yhat_extreme))))))

In [34]:
metrics_s1_t1(inv_y_extreme.min(), time_index, inv_y_extreme, inv_yhat_extreme, test_errors_extreme)

Peformance when water level is over 2.0 ft 

------ MAE & RMSE ------
MAE = 36.90641
RMSE = 56.602341 

------ Max Errors (t+1 at S1) ------
Max Error of Over Estimation: 130.31353759765625
Max Error of Under Estimation: -123.68858337402344
Max Abs Error of Under Estimation: 130.31353759765625 

------ Time # (t+1 at S1) ------
Time# of Over Estimation: 308
Time# of Under Estimation: 273 

------ Area (t+1 at S1) ------
Area of Over Estimation: 5037.882924079895
Area of Under Estimation: -4345.403095543385


#### Normal & Extreme hen water level is over threshold 2.58 feet (95 percentile)

In [23]:
# saved_model = load_model('../saved_models_mlp/pm_all.h5') 
# saved_model = load_model('../saved_models_mlp/pm_all_95_ft.h5')


# saved_model = load_model('../saved_models_mlp/pm_all_weighted_IPF_95.h5')
# saved_model = load_model('../saved_models_mlp/pm_all_weighted_IPF_95_ft.h5')


# saved_model = load_model('../saved_models_mlp/pm_all_weighted_EVT_95.h5')
# saved_model = load_model('../saved_models_mlp/pm_all_weighted_EVT_95_ft.h5')


saved_model = load_model('../saved_models_mlp/pm_all_weighted_META_95.h5')
# saved_model = load_model('../saved_models_mlp/pm_all_weighted_META_95_ft.h5')


yhat = saved_model.predict(test_X)


inv_yhat = scaler.inverse_transform(yhat)
inv_y = scaler.inverse_transform(test_y)
test_errors = inv_yhat - inv_y

metrics_s1_t1(inv_y.min(), time_index, inv_y, inv_yhat, test_errors)

Peformance when water level is over 0.0 ft 

------ MAE & RMSE ------
MAE = 32.433259
RMSE = 49.427912 

------ Max Errors (t+1 at S1) ------
Max Error of Over Estimation: 170.5828399658203
Max Error of Under Estimation: -508.7136459350586
Max Abs Error of Under Estimation: 508.7136459350586 

------ Time # (t+1 at S1) ------
Time# of Over Estimation: 3136
Time# of Under Estimation: 3334 

------ Area (t+1 at S1) ------
Area of Over Estimation: 41364.09931087494
Area of Under Estimation: -55552.61905416846


#### hyperparameter - frozen layers

In [23]:
layers = [1, 5, 9, 13, 17]
for layer in layers:
    print('layer: {}'.format(layer))
    saved_model = load_model('../saved_models_hyper/pm_all_weighted_META_ft_{}.h5'.format(layer))

    yhat_extreme = saved_model.predict(test_X_extreme)

    inv_yhat_extreme = scaler.inverse_transform(yhat_extreme)
    inv_y_extreme = scaler.inverse_transform(test_y_extreme)
    test_errors_extreme = inv_yhat_extreme - inv_y_extreme

    print('MAE = {}'.format(float("{:.6f}".format(mae(inv_y_extreme, inv_yhat_extreme)))))
    print('RMSE = {}'.format(float("{:.6f}".format(np.sqrt(mse(inv_y_extreme, inv_yhat_extreme))))))
    print('-------------------------')

layer: 1
MAE = 37.315949
RMSE = 56.256228
-------------------------
layer: 5
MAE = 35.6374
RMSE = 54.496486
-------------------------
layer: 9
MAE = 35.942333
RMSE = 54.47041
-------------------------
layer: 13
MAE = 35.315476
RMSE = 53.917494
-------------------------
layer: 17
MAE = 35.263393
RMSE = 53.878661
-------------------------


### p-values

In [18]:
saved_model = load_model('../saved_models_mlp/pm_all.h5') 

yhat_extreme = saved_model.predict(test_X_extreme)
inv_yhat_extreme = scaler.inverse_transform(yhat_extreme)
inv_y_extreme = scaler.inverse_transform(test_y_extreme)
test_errors_extreme_ori = inv_yhat_extreme - inv_y_extreme
test_errors_extreme_ori.shape

(581, 12)

In [19]:
saved_model = load_model('../saved_models_mlp/pm_all_weighted_META_95.h5')

yhat_extreme = saved_model.predict(test_X_extreme)
inv_yhat_extreme = scaler.inverse_transform(yhat_extreme)
inv_y_extreme = scaler.inverse_transform(test_y_extreme)
test_errors_extreme_re_meta = inv_yhat_extreme - inv_y_extreme

In [23]:
t_index = 5

# ========= Mann-Whitney U test =========
stat_mann, p_value_mann = mannwhitneyu(test_errors_extreme_ori[:,t_index], test_errors_extreme_re_meta[:,t_index], alternative='two-sided')
print(f"p_value_mann: {p_value_mann:.4e}")


# ========= wilcoxon U test =========
stat_wilcoxon, p_value_wilcoxon = wilcoxon(test_errors_extreme_ori[:, t_index], test_errors_extreme_re_meta[:, t_index])
print(f"p_value_wilcoxon: {p_value_wilcoxon:.4e}")


# ========= t-test =========
t_statistic, p_value = ttest_rel(test_errors_extreme_ori[:,t_index], test_errors_extreme_re_meta[:,t_index])
print(f"p_value_ttest: {p_value:.4e}")

p_value_mann: 1.3371e-02
p_value_wilcoxon: 7.1949e-06
p_value_ttest: 1.2713e-04
