In [1]:
# from prophet import Prophet
import os
import pandas as pd
import numpy as np
import datetime, itertools
# from prophet.plot import plot_yearly, plot_weekly, plot_plotly, plot_components_plotly
# import plotly.graph_objs as go
from sklearn.metrics import mean_squared_error

In [4]:
# import cv2 as cv

# Transform Data

In [7]:
# DATA_DIR = os.path.join("..", "data")
# fp = os.path.join(DATA_DIR, "hydrograph-excel-sheet-tp-cleaned.xlsx")
fp = "hydrograph-excel-sheet-tp-cleaned.xlsx"
xl = pd.ExcelFile(fp)
gages = xl.sheet_names
hydro_data = {s: xl.parse(s) for s in gages}

In [8]:
def flatten_sheet(sheet_name: str, src_data: dict):
    src_df = src_data[sheet_name]
    
    # Check lengths of columns, some contain only notes so will be
    # much less than 100 and need to be dropped. Most columns
    # should have 365/366 values but a few are missing and need to be filled.
    col_lengths = {c: sum(src_df[c].notna()) for c in src_df.columns}
    keep_cols = [c for c, l in col_lengths.items() if l > 100]
    
    # Check columns are all in the correct order to combine:
    assert "time" in keep_cols[0].lower()
    correct_order = {"time": "ft", "ft": "discharge", "discharge": "time"}
    for i, col in enumerate(keep_cols[:-1]):
        next_col = keep_cols[i+1]
        for key in correct_order.keys():
            if key in col.lower():
                should_be = correct_order[key]
                assert should_be in next_col.lower(), sheet_name
    
    # Iterate through columns and collect data:
    data_subsets = list()
    for start_col in range(0, len(keep_cols), 3):
        df_columns = keep_cols[start_col: start_col+3]
        subset = src_df[df_columns]
        rename = dict(zip(subset.columns, ["time", "ft", "m3"]))
        subset = subset.rename(columns=rename).dropna(how="all")
        data_subsets.append(subset)
        
    # Combine to a single df:
    final =  pd.concat(data_subsets).reset_index(drop=True)
    final["gage"] = sheet_name
    return final

In [9]:
all_sheets = list()
for sname in gages:
    all_sheets.append(flatten_sheet(sname, hydro_data)) 
df = pd.concat(all_sheets).reset_index(drop=True)

In [10]:
min_date, max_date = min(df["time"]), max(df["time"])
all_dates = [min_date.to_pydatetime()]
while all_dates[-1] < max_date:
    all_dates.append(all_dates[-1] + datetime.timedelta(days=1))
    
full_index = list(itertools.product(df["gage"].unique(), all_dates))
df = df.set_index(["gage", "time"])
df = df.reindex(full_index)

In [11]:
gage_list = df.index.get_level_values('gage').unique().tolist()

In [12]:
gage_list

['11402000',
 '11318500',
 '11266500',
 '11208000',
 '11202710',
 '11185500',
 '11189500']

In [13]:
gage_ts = df.loc[gage_list[0]].reset_index(drop = False)
gage_ts.rename(columns = {'time':'ds', 'ft': 'y'}, inplace = True)
min_date = gage_ts['ds'].min()
max_date = gage_ts['ds'].max()

In [14]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ft,m3
gage,time,Unnamed: 2_level_1,Unnamed: 3_level_1
11402000,1984-10-01,54.00,1.529110
11402000,1984-10-02,52.00,1.472476
11402000,1984-10-03,49.00,1.387525
11402000,1984-10-04,49.00,1.387525
11402000,1984-10-05,48.00,1.359209
...,...,...,...
11189500,2018-09-26,2.86,0.080986
11189500,2018-09-27,2.78,0.078721
11189500,2018-09-28,2.99,0.084667
11189500,2018-09-29,3.12,0.088349


# Generate Multivariate Time Series Data

In [15]:
gage_df = df.loc[gage_list[0]].reset_index(drop = False)[['time','ft']]
gage_df.rename(columns = {'ft':f'ft_{gage_list[0]}'}, inplace = True)
for gage_num in gage_list[1:]:
    new_gage_df =  df.loc[gage_num].reset_index(drop = False)[['time','ft']]
    new_gage_df.rename(columns = {'ft':f'ft_{gage_num}'}, inplace = True)
    gage_df = gage_df.merge(new_gage_df, on = 'time', how = 'outer')

In [16]:
gage_df.head()

Unnamed: 0,time,ft_11402000,ft_11318500,ft_11266500,ft_11208000,ft_11202710,ft_11185500,ft_11189500
0,1984-10-01,54.0,10.0,53.0,1.7,,256.0,39.0
1,1984-10-02,52.0,12.0,52.0,1.4,,279.0,42.0
2,1984-10-03,49.0,14.0,51.0,1.4,,284.0,45.0
3,1984-10-04,49.0,13.0,49.0,1.4,,291.0,47.0
4,1984-10-05,48.0,14.0,46.0,1.4,,281.0,50.0


In [49]:
# most 4 years data seems to have no missing values
gage_df.iloc[-365 * 4:].isnull().sum()

time           0
ft_11402000    0
ft_11318500    0
ft_11266500    0
ft_11208000    0
ft_11202710    0
ft_11185500    0
ft_11189500    0
dtype: int64

In [50]:
gage_df = gage_df.iloc[-365 * 4:]

## Baseline Prophet Model

In [28]:
selected_params = {'seasonality_mode':'multiplicative'}
horizon = 30

In [31]:
gage_df.rename(columns = {'time':'ds', 'ft_11402000': 'y'}, inplace = True)
gage_train = gage_df.iloc[: -horizon,:]
gage_test = gage_df.iloc[-horizon:,:]
m = Prophet(**selected_params).fit(gage_train) # **best_params
future = m.make_future_dataframe(periods=horizon)
forecast = m.predict(future)
gage_test['yhat_corrected'] = forecast.iloc[-horizon:]['yhat'].apply(lambda x : max(x,0)).values
gage_test['yhat'] = forecast.iloc[-horizon:]['yhat'].values
forecast['yhat_corrected'] = forecast['yhat'].apply(lambda x : max(x,0))

# gage_test.drop(columns = ['m3'], inplace = True)

# if show_plots:
#     fig1 = m.plot_components(forecast)
#     fig2 = m.plot(forecast)

rmse = np.sqrt(mean_squared_error(gage_test['y'], gage_test['yhat_corrected']))

In [32]:
rmse

22.947919918866564

# Multivariate LSTM Seq2Seq Forecasting

In [4]:
import matplotlib.pyplot as plt
import tensorflow as tf

In [191]:
# def split_series(series, n_past, n_future):
#   #
#   # n_past ==> no of past observations
#   #
#   # n_future ==> no of future observations 
#   #
#     X, y = list(), list()
#     for window_start in range(len(series)):
#         past_end = window_start + n_past
#         future_end = past_end + n_future
#         if future_end > len(series):
#             break
#         # slicing the past and future parts of the window
#         past, future = series[window_start:past_end, :], series[past_end:future_end, :]
#         X.append(past)
#         y.append(future)
#     return np.array(X), np.array(y)


def transform_seq2seq_data(df, target_col, feature_col, n_past, n_future):
    date_list = []
    X_list = []
    y_list = []
    for i in range(n_past,len(df) - n_future):
        y = df.iloc[i:i + n_future][target_col].values.tolist()
        y_list.append(y)

        X = df.iloc[i - n_past: i][feature_cols].values.tolist()
        X_list.append(X)
        
        date_list.append(df.iloc[i - n_past: i].index)
        
    return np.array(X_list), np.array(y_list), date_list 
    

In [48]:
gage_df = pd.read_csv('11402000_complete_ts.csv')
gage_df.set_index('time', inplace = True)
# gage_df['target'] = gage_df['m3'].shift()
gage_df.head(28)

Unnamed: 0_level_0,m3,swe_avg,swe_max,tp_avg,t2m_avg,tp_max,t2m_max,pixel_sum,pixel_mean,pixel_min,pixel_max
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-01-01,1.599902,0.010051,10.0,36.59678,0.277207,54.22773,1.021534,12514.0,49.07451,34.0,61.0
2010-01-02,1.817942,0.003433,6.0,19.43233,2.140078,40.22646,2.642189,12514.0,49.07451,34.0,61.0
2010-01-03,1.758476,0.000796,4.0,0.766869,-2.116478,1.752489,-0.916607,12514.0,49.07451,34.0,61.0
2010-01-04,1.693347,0.000467,4.0,0.016986,-1.963441,0.045573,-0.204727,12514.0,49.07451,34.0,61.0
2010-01-05,1.662199,0.000412,4.0,0.218129,0.024585,0.302439,0.961063,12514.0,49.07451,34.0,61.0
2010-01-06,1.61406,0.000275,3.0,0.852008,-0.169998,1.713131,0.659763,12514.0,49.07451,34.0,61.0
2010-01-07,1.599902,0.000192,3.0,26.847263,1.220284,30.620916,2.069096,12514.0,49.07451,34.0,61.0
2010-01-08,1.608397,0.00011,2.0,15.534594,0.431471,17.719612,1.665302,12514.0,49.07451,34.0,61.0
2010-01-09,1.699011,8.2e-05,2.0,6.499332,1.175823,7.467758,1.933275,145640.0,44.24058,27.0,82.0
2010-01-10,1.772635,5.5e-05,1.0,1.533739,0.904211,2.409155,1.725673,145640.0,44.24058,27.0,82.0


In [119]:
n_past = 28
n_future = 14 


In [50]:
target_gage = '11402000'

In [51]:
gage_df

Unnamed: 0_level_0,m3,swe_avg,swe_max,tp_avg,t2m_avg,tp_max,t2m_max,pixel_sum,pixel_mean,pixel_min,pixel_max
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-01-01,1.599902,0.010051,10.0,36.596780,0.277207,54.227730,1.021534,12514.0,49.074510,34.0,61.0
2010-01-02,1.817942,0.003433,6.0,19.432330,2.140078,40.226460,2.642189,12514.0,49.074510,34.0,61.0
2010-01-03,1.758476,0.000796,4.0,0.766869,-2.116478,1.752489,-0.916607,12514.0,49.074510,34.0,61.0
2010-01-04,1.693347,0.000467,4.0,0.016986,-1.963441,0.045573,-0.204727,12514.0,49.074510,34.0,61.0
2010-01-05,1.662199,0.000412,4.0,0.218129,0.024585,0.302439,0.961063,12514.0,49.074510,34.0,61.0
...,...,...,...,...,...,...,...,...,...,...,...
2016-12-28,4.389111,0.255643,65.0,0.000207,-3.150443,0.002071,-1.477888,189237.0,34.921020,19.0,76.0
2016-12-29,4.105943,0.248476,64.0,0.123876,-3.386356,0.263080,-1.181665,189237.0,34.921020,19.0,76.0
2016-12-30,3.936042,0.243340,63.0,0.051373,-3.080120,0.084931,-1.323266,189237.0,34.921020,19.0,76.0
2016-12-31,3.794457,0.243340,63.0,0.021336,-4.441323,0.033144,-2.820215,189237.0,34.921020,19.0,76.0


In [52]:
gage_df.index = pd.to_datetime(gage_df.index)

In [53]:
gage_df[gage_df.index <= pd.to_datetime('2014-12-31')]

Unnamed: 0_level_0,m3,swe_avg,swe_max,tp_avg,t2m_avg,tp_max,t2m_max,pixel_sum,pixel_mean,pixel_min,pixel_max
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-01-01,1.599902,0.010051,10.0,36.596780,0.277207,54.227730,1.021534,12514.0,49.074510,34.0,61.0
2010-01-02,1.817942,0.003433,6.0,19.432330,2.140078,40.226460,2.642189,12514.0,49.074510,34.0,61.0
2010-01-03,1.758476,0.000796,4.0,0.766869,-2.116478,1.752489,-0.916607,12514.0,49.074510,34.0,61.0
2010-01-04,1.693347,0.000467,4.0,0.016986,-1.963441,0.045573,-0.204727,12514.0,49.074510,34.0,61.0
2010-01-05,1.662199,0.000412,4.0,0.218129,0.024585,0.302439,0.961063,12514.0,49.074510,34.0,61.0
...,...,...,...,...,...,...,...,...,...,...,...
2014-12-27,5.691686,0.000137,1.0,0.011600,-8.077459,0.026930,-5.313604,88355.0,17.799154,7.0,49.0
2014-12-28,5.097032,0.001126,1.0,0.000829,-2.481084,0.002071,-0.893384,88355.0,17.799154,7.0,49.0
2014-12-29,4.643963,0.000220,1.0,0.075610,-4.113014,0.145005,-2.107430,88355.0,17.799154,7.0,49.0
2014-12-30,4.247527,0.000165,1.0,32.732810,-6.328136,42.861410,-5.436406,88355.0,17.799154,7.0,49.0


In [381]:
# train_size = int(len(gage_df) * 0.8)
# train_df,test_df = gage_df[:train_size], gage_df[train_size:] 

train_df = gage_df[gage_df.index < pd.to_datetime('2015-01-01')]
validation_df = gage_df[(gage_df.index >= pd.to_datetime('2015-01-01')) & (gage_df.index < pd.to_datetime('2016-01-01'))]
test_df = gage_df[gage_df.index >= pd.to_datetime('2016-01-01')]

feature_cols = ['m3','swe_avg','swe_max','tp_avg','t2m_avg','tp_max','t2m_max','pixel_sum','pixel_mean','pixel_min','pixel_max']

train = train_df[feature_cols]
validation = validation_df[feature_cols]
test = test_df[feature_cols]

train_index = train.index.tolist()
validation_index = validation.index.tolist()
test_index = test.index.tolist()



In [382]:
len(test_df)

367

In [383]:
# Rescaling
from sklearn.preprocessing import MinMaxScaler

# train = train_df
scalers={}
for i in train_df.columns:
    scaler = MinMaxScaler(feature_range=(-1,1))
    s_s = scaler.fit_transform(train[i].values.reshape(-1,1))
    s_s=np.reshape(s_s,len(s_s))
    scalers['scaler_'+ i] = scaler
    train[i]=s_s

# validation = validation_df
for i in train_df.columns:
    scaler = scalers['scaler_'+i]
    s_s = scaler.transform(validation[i].values.reshape(-1,1))
    s_s=np.reshape(s_s,len(s_s))
    scalers['scaler_'+i] = scaler
    validation[i]=s_s    
    
    
# test = test_df
for i in train_df.columns:
    scaler = scalers['scaler_'+i]
    s_s = scaler.transform(test[i].values.reshape(-1,1))
    s_s=np.reshape(s_s,len(s_s))
    scalers['scaler_'+i] = scaler
    test[i]=s_s

In [384]:
# combined_df = pd.concat([train, validation, test], axis = 0)

In [385]:
feature_cols = ['m3','swe_avg','swe_max','tp_avg','t2m_avg','tp_max','t2m_max','pixel_sum','pixel_mean','pixel_min','pixel_max']

# X, y, dates = transform_seq2seq_data(combined_df, target_col = 'm3',
#                               feature_col = feature_cols, 
#                               n_past = 28, 
#                               n_future = 14)

In [386]:
prediction_date = np.array(dates)[:,0]

In [472]:
feature_cols = ['m3','swe_avg','swe_max','tp_avg','t2m_avg','tp_max','t2m_max','pixel_sum','pixel_mean','pixel_min','pixel_max']
X_train, y_train, dates_train = transform_seq2seq_data(train, target_col = 'm3',
                                          feature_col = feature_cols, 
                                          n_past = 28, 
                                          n_future = 14)

X_validation, y_validation, dates_validation = transform_seq2seq_data(validation, target_col = 'm3',
                                          feature_col = feature_cols, 
                                          n_past = 28, 
                                          n_future = 14)

X_test, y_test, dates_test = transform_seq2seq_data(test, target_col = 'm3',
                                          feature_col = feature_cols, 
                                          n_past = 28, 
                                          n_future = 14)


In [479]:
# E1D1
# n_features ==> no of features at each timestep in the data.
#
# num_target_col = 1
# n_features = len(feature_cols)
# n_future = 14



# encoder_inputs = tf.keras.layers.Input(shape=(n_past, n_features))
# encoder_l1 = tf.keras.layers.LSTM(100, return_state=True)
# encoder_outputs1 = encoder_l1(encoder_inputs)

# encoder_states1 = encoder_outputs1[1:]

# #
# decoder_inputs = tf.keras.layers.RepeatVector(n_future)(encoder_outputs1[0])

# #
# decoder_l1 = tf.keras.layers.LSTM(100, return_sequences=True)(decoder_inputs,initial_state = encoder_states1)
# decoder_outputs1 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_target_col))(decoder_l1)

# #
# model_e1d1 = tf.keras.models.Model(encoder_inputs,decoder_outputs1)

# #
# model_e1d1.summary()


In [480]:
# reduce_lr = tf.keras.callbacks.LearningRateScheduler(lambda x: 0.01 * 0.90 ** x)
# model_e1d1.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss=tf.keras.losses.Huber())
# history_e1d1=model_e1d1.fit(X_train,y_train,epochs=25,validation_data=(X_validation,y_validation),batch_size=32,verbose=2
   
#                             # callbacks=[reduce_lr]
#                            )

In [445]:
# E2D2
# n_features ==> no of features at each timestep in the data.
#
encoder_inputs = tf.keras.layers.Input(shape=(n_past, n_features))
encoder_l1 = tf.keras.layers.LSTM(100,return_sequences = True, return_state=True)
encoder_outputs1 = encoder_l1(encoder_inputs)
encoder_states1 = encoder_outputs1[1:]
encoder_l2 = tf.keras.layers.LSTM(100, return_state=True)
encoder_outputs2 = encoder_l2(encoder_outputs1[0])
encoder_states2 = encoder_outputs2[1:]
#
decoder_inputs = tf.keras.layers.RepeatVector(n_future)(encoder_outputs2[0])
#
decoder_l1 = tf.keras.layers.LSTM(100, return_sequences=True)(decoder_inputs,initial_state = encoder_states1)
decoder_l2 = tf.keras.layers.LSTM(100, return_sequences=True)(decoder_l1,initial_state = encoder_states2)
decoder_outputs2 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_target_col))(decoder_l2)
#
model_e2d2 = tf.keras.models.Model(encoder_inputs,decoder_outputs2)
#
model_e2d2.summary()

Model: "model_37"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_38 (InputLayer)          [(None, 28, 11)]     0           []                               
                                                                                                  
 lstm_116 (LSTM)                [(None, 28, 100),    44800       ['input_38[0][0]']               
                                 (None, 100),                                                     
                                 (None, 100)]                                                     
                                                                                                  
 lstm_117 (LSTM)                [(None, 100),        80400       ['lstm_116[0][0]']               
                                 (None, 100),                                              

In [446]:
reduce_lr = tf.keras.callbacks.LearningRateScheduler(lambda x: 0.001 * 0.90 ** x)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
model_e2d2.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss=tf.keras.losses.Huber())
history_e2d2=model_e2d2.fit(X_train,y_train,epochs=25,validation_data=(X_validation,y_validation),batch_size=32,
                            verbose=2,
                            callbacks=[reduce_lr,early_stopping_callback]
                           
                           )

Epoch 1/25
56/56 - 8s - loss: 0.0284 - val_loss: 0.0027 - lr: 0.0010 - 8s/epoch - 137ms/step
Epoch 2/25
56/56 - 2s - loss: 0.0050 - val_loss: 0.0025 - lr: 9.0000e-04 - 2s/epoch - 37ms/step
Epoch 3/25
56/56 - 2s - loss: 0.0046 - val_loss: 0.0025 - lr: 8.1000e-04 - 2s/epoch - 37ms/step
Epoch 4/25
56/56 - 2s - loss: 0.0045 - val_loss: 0.0029 - lr: 7.2900e-04 - 2s/epoch - 38ms/step
Epoch 5/25
56/56 - 2s - loss: 0.0045 - val_loss: 0.0024 - lr: 6.5610e-04 - 2s/epoch - 39ms/step
Epoch 6/25
56/56 - 2s - loss: 0.0044 - val_loss: 0.0025 - lr: 5.9049e-04 - 2s/epoch - 37ms/step
Epoch 7/25
56/56 - 2s - loss: 0.0044 - val_loss: 0.0023 - lr: 5.3144e-04 - 2s/epoch - 37ms/step
Epoch 8/25
56/56 - 2s - loss: 0.0044 - val_loss: 0.0024 - lr: 4.7830e-04 - 2s/epoch - 37ms/step
Epoch 9/25
56/56 - 2s - loss: 0.0044 - val_loss: 0.0025 - lr: 4.3047e-04 - 2s/epoch - 37ms/step
Epoch 10/25
56/56 - 2s - loss: 0.0043 - val_loss: 0.0022 - lr: 3.8742e-04 - 2s/epoch - 37ms/step
Epoch 11/25
56/56 - 2s - loss: 0.0043 - va

In [481]:
# pred_e1d1=model_e1d1.predict(X_test)
pred_e2d2=model_e2d2.predict(X_test)

## Evaluate Results

In [448]:
# pred_e1d1 = pred_e1d1.reshape(y_test.shape[0], y_test.shape[1])
pred_e2d2 = pred_e2d2.reshape(y_test.shape[0], y_test.shape[1])

# pred_e1d1_inverse = scalers['scaler_m3'].inverse_transform(pred_e1d1)
pred_e2d2_inverse = scalers['scaler_m3'].inverse_transform(pred_e2d2)
y_test_inverse = scalers['scaler_m3'].inverse_transform(y_test)

In [449]:
def relative_root_mean_squared_error(true, pred):
    num = np.sum(np.square(true - pred))
    den = np.sum(np.square(pred))
    squared_error = num/den
    rrmse_loss = np.sqrt(squared_error)
    return rrmse_loss

In [450]:
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

In [482]:
# rmse_list = []
# mape_list = []
# rrmse_list = []

# for pred, truth in zip(pred_e1d1_inverse, y_test_inverse):
#     rmse_list.append(np.sqrt(mean_squared_error(truth, pred)))
#     mape_list.append(mean_absolute_percentage_error(truth, pred))
#     rrmse_list.append(relative_root_mean_squared_error(truth, pred))
    
# final_rmse = np.mean(rmse_list)
# final_mape = np.mean(mape_list)
# final_rrmse = np.mean(rrmse_list)

# print(f'RMSE = {final_rmse}')
# print(f'MAPE = {final_mape}')
# print(f'RRMSE = {final_rrmse}')

    

In [452]:
rmse_list = []
mape_list = []
rrmse_list = []

for pred, truth in zip(pred_e2d2_inverse, y_test_inverse):
    rmse_list.append(np.sqrt(mean_squared_error(truth, pred)))
    mape_list.append(mean_absolute_percentage_error(truth, pred))
    rrmse_list.append(relative_root_mean_squared_error(truth, pred))
    
final_rmse = np.mean(rmse_list)
final_mape = np.mean(mape_list)
final_rrmse = np.mean(rrmse_list)

print(f'RMSE = {final_rmse}')
print(f'MAPE = {final_mape}')
print(f'RRMSE = {final_rrmse}')

RMSE = 5.903185593126763
MAPE = 0.6777889380154445
RRMSE = 1.4167050655125675


In [473]:
test_csv = pd.DataFrame(data = pred_e2d2_inverse,
             columns = [ 
                        'day1_pred',
                        'day2_pred',
                        'day3_pred',
                        'day4_pred',
                        'day5_pred',
                        'day6_pred',
                        'day7_pred',
                        'day8_pred',
                        'day9_pred',
                        'day10_pred',
                        'day11_pred',
                        'day12_pred',
                        'day13_pred',
                        'day14_pred'
                       ]
             )

test_csv['pred_date'] = np.array(dates_test)[:,0]


In [474]:
test_csv.to_csv('LSTM_seq2seq_11402000_test_pred.csv', index = False)

In [475]:
test_csv

Unnamed: 0,day1_pred,day2_pred,day3_pred,day4_pred,day5_pred,day6_pred,day7_pred,day8_pred,day9_pred,day10_pred,day11_pred,day12_pred,day13_pred,day14_pred,pred_date
0,11.798745,10.855339,10.286883,9.905237,9.628758,9.434842,9.313064,9.254159,9.247949,9.283724,9.351079,9.440514,9.543841,9.654311,2016-01-01
1,12.058352,11.292212,10.785560,10.398519,10.082318,9.832934,9.650261,9.530663,9.466890,9.449650,9.469055,9.515676,9.581100,9.658252,2016-01-02
2,14.051343,13.468255,12.952317,12.451776,11.979128,11.562104,11.216535,10.946049,10.746178,10.608015,10.521000,10.474483,10.458760,10.465426,2016-01-03
3,13.828472,13.082376,12.505026,12.006319,11.570020,11.206116,10.919865,10.708847,10.565239,10.478546,10.437530,10.431528,10.451058,10.488154,2016-01-04
4,12.857921,12.003736,11.426052,10.990998,10.648526,10.389007,10.206843,10.093958,10.039886,10.033144,10.062488,10.117726,10.190126,10.272556,2016-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,9.503781,9.127569,8.875995,8.645601,8.420792,8.221723,8.064005,7.953970,7.890677,7.868852,7.881085,7.919511,7.976618,8.045862,2016-11-16
321,10.341876,9.950187,9.638951,9.330049,9.024297,8.749447,8.524110,8.355321,8.241784,8.177367,8.153749,8.162100,8.194076,8.242352,2016-11-17
322,12.103658,11.617126,11.174002,10.716412,10.264745,9.856021,9.512370,9.241190,9.040057,8.901163,8.814378,8.769203,8.755836,8.765705,2016-11-18
323,15.394838,14.881818,14.266912,13.571085,12.868525,12.221491,11.662682,11.202472,10.837710,10.558268,10.351229,10.203316,10.102208,10.037139,2016-11-19


# Multivariate LSTM Single-Step Forecasting

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
tf.random.set_seed(7)

In [456]:
# feature_cols = ['m3','swe_avg','swe_max','tp_avg','t2m_avg','tp_max','t2m_max','pixel_sum','pixel_mean','pixel_min','pixel_max']
# X_train, y_train = transform_seq2seq_data(train, target_col = 'm3',
#                                           feature_col = feature_cols, 
#                                           n_past = 28, 
#                                           n_future = 1)

# X_test, y_test = transform_seq2seq_data(test, target_col = 'm3',
#                                           feature_col = feature_cols, 
#                                           n_past = 28, 
#                                           n_future = 1)

# y_train = y_train.reshape(-1)
# y_test = y_test.reshape(-1)

feature_cols = ['m3','swe_avg','swe_max','tp_avg','t2m_avg','tp_max','t2m_max','pixel_sum','pixel_mean','pixel_min','pixel_max']
X_train, y_train, dates_train = transform_seq2seq_data(train, target_col = 'm3',
                                          feature_col = feature_cols, 
                                          n_past = 28, 
                                          n_future = 1)

X_validation, y_validation, dates_validation = transform_seq2seq_data(validation, target_col = 'm3',
                                          feature_col = feature_cols, 
                                          n_past = 28, 
                                          n_future = 1)

X_test, y_test, dates_test = transform_seq2seq_data(test, target_col = 'm3',
                                          feature_col = feature_cols, 
                                          n_past = 28, 
                                          n_future = 1)


y_train = y_train.reshape(-1)
y_test = y_test.reshape(-1)
y_validation = y_validation.reshape(-1)



In [256]:
# feature_cols = ['m3','swe_avg','swe_max','tp_avg','t2m_avg','tp_max','t2m_max','pixel_sum','pixel_mean','pixel_min','pixel_max']

# X, y, dates = transform_seq2seq_data(combined_df, target_col = 'm3',
#                               feature_col = feature_cols, 
#                               n_past = 28, 
#                               n_future = 1)

# prediction_date = np.array(dates)[:,0]



In [364]:
# X_train = X[:len(train)]
# X_validation = X[len(train): len(train) + len(validation)]
# X_test = X[len(train) + len(validation):]

# y_train = y[:len(train)]
# y_validation = y[len(train): len(train) + len(validation)]
# y_test = y[len(train) + len(validation):]

# dates_train = prediction_date[:len(train)]
# dates_validation = prediction_date[len(train): len(train) + len(validation)]
# dates_test = prediction_date[len(train) + len(validation):]

In [365]:
n_past = 28

In [367]:
# create and fit the LSTM network
model = Sequential()
model.add(LSTM(4, input_shape=(n_past,11)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=2)

Epoch 1/10
113/113 - 2s - loss: 0.1226 - 2s/epoch - 17ms/step
Epoch 2/10
113/113 - 1s - loss: 0.0116 - 620ms/epoch - 5ms/step
Epoch 3/10
113/113 - 1s - loss: 0.0106 - 632ms/epoch - 6ms/step
Epoch 4/10
113/113 - 1s - loss: 0.0098 - 645ms/epoch - 6ms/step
Epoch 5/10
113/113 - 1s - loss: 0.0092 - 634ms/epoch - 6ms/step
Epoch 6/10
113/113 - 1s - loss: 0.0087 - 647ms/epoch - 6ms/step
Epoch 7/10
113/113 - 1s - loss: 0.0083 - 655ms/epoch - 6ms/step
Epoch 8/10
113/113 - 1s - loss: 0.0079 - 671ms/epoch - 6ms/step
Epoch 9/10
113/113 - 1s - loss: 0.0076 - 624ms/epoch - 6ms/step
Epoch 10/10
113/113 - 1s - loss: 0.0073 - 649ms/epoch - 6ms/step


<keras.callbacks.History at 0x7fde3e3ccdf0>

In [369]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_85 (LSTM)              (None, 4)                 256       
                                                                 
 dense_38 (Dense)            (None, 1)                 5         
                                                                 
Total params: 261
Trainable params: 261
Non-trainable params: 0
_________________________________________________________________


In [370]:
single_step_lstm_pred = model.predict(X_test)



In [371]:

single_step_lstm_pred = single_step_lstm_pred.reshape(y_test.shape[0])

single_step_lstm_pred_inverse = scalers['scaler_m3'].inverse_transform([single_step_lstm_pred])[0]
y_test_inverse = scalers['scaler_m3'].inverse_transform([y_test.reshape(-1)])[0]


In [372]:
# rmse_list = []
# mape_list = []
# rrmse_list = []

# for pred, truth in zip(single_step_lstm_pred, y_test_inverse):
#     rmse_list.append(np.sqrt(mean_squared_error(truth, pred)))
#     mape_list.append(mean_absolute_percentage_error(truth, pred))
#     rrmse_list.append(relative_root_mean_squared_error(truth, pred))
    
final_rmse = np.sqrt(mean_squared_error(y_test_inverse, single_step_lstm_pred_inverse))
final_mape = mean_absolute_percentage_error(y_test_inverse, single_step_lstm_pred_inverse)
final_rrmse = relative_root_mean_squared_error(y_test_inverse, single_step_lstm_pred_inverse)

print(f'RMSE = {final_rmse}')
print(f'MAPE = {final_mape}')
print(f'RRMSE = {final_rrmse}')


RMSE = 10.481814761483466
MAPE = 1.537144459102105
RRMSE = 1.159337182079452


In [470]:
test_csv = pd.DataFrame({'pred_date': np.array(dates_test)[:,0], 
                         'day1_pred': single_step_lstm_pred_inverse
                        }
             )
test_csv.to_csv('LSTM_next_day_11402000_test_pred.csv', index = False)
# test_csv['pred_date'] = dates_test

In [476]:
test_csv

Unnamed: 0,day1_pred,day2_pred,day3_pred,day4_pred,day5_pred,day6_pred,day7_pred,day8_pred,day9_pred,day10_pred,day11_pred,day12_pred,day13_pred,day14_pred,pred_date
0,11.798745,10.855339,10.286883,9.905237,9.628758,9.434842,9.313064,9.254159,9.247949,9.283724,9.351079,9.440514,9.543841,9.654311,2016-01-01
1,12.058352,11.292212,10.785560,10.398519,10.082318,9.832934,9.650261,9.530663,9.466890,9.449650,9.469055,9.515676,9.581100,9.658252,2016-01-02
2,14.051343,13.468255,12.952317,12.451776,11.979128,11.562104,11.216535,10.946049,10.746178,10.608015,10.521000,10.474483,10.458760,10.465426,2016-01-03
3,13.828472,13.082376,12.505026,12.006319,11.570020,11.206116,10.919865,10.708847,10.565239,10.478546,10.437530,10.431528,10.451058,10.488154,2016-01-04
4,12.857921,12.003736,11.426052,10.990998,10.648526,10.389007,10.206843,10.093958,10.039886,10.033144,10.062488,10.117726,10.190126,10.272556,2016-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,9.503781,9.127569,8.875995,8.645601,8.420792,8.221723,8.064005,7.953970,7.890677,7.868852,7.881085,7.919511,7.976618,8.045862,2016-11-16
321,10.341876,9.950187,9.638951,9.330049,9.024297,8.749447,8.524110,8.355321,8.241784,8.177367,8.153749,8.162100,8.194076,8.242352,2016-11-17
322,12.103658,11.617126,11.174002,10.716412,10.264745,9.856021,9.512370,9.241190,9.040057,8.901163,8.814378,8.769203,8.755836,8.765705,2016-11-18
323,15.394838,14.881818,14.266912,13.571085,12.868525,12.221491,11.662682,11.202472,10.837710,10.558268,10.351229,10.203316,10.102208,10.037139,2016-11-19


# Ignore Below