# preprocessing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler


import tensorflow as tf
import keras
import shap
from tensorflow.keras import backend as K
from keras import layers, models, Input, Sequential
from module import BiGRU_module, BiLSTM_module, AttentionLayer, build_model, build_model_withcyc


In [None]:
def create_sequences(data_nomalize, data, seq_length):
    """Build sliding input windows and their next-step targets.

    Parameters
    ----------
    data_nomalize : sliceable sequence (e.g. ``np.ndarray``)
        Source of the input windows; sliced as ``[i:i + seq_length]``.
    data : pandas.DataFrame
        Source of the targets; converted with ``to_numpy()`` and read from
        column position 0.
    seq_length : int
        Number of consecutive rows in each input window.

    Returns
    -------
    tuple of np.ndarray
        ``(X, y)`` where ``X[i]`` is ``data_nomalize[i:i + seq_length]`` and
        ``y[i]`` is the column-0 value of ``data`` at row ``i + seq_length``.
        Both are empty when ``len(data) <= seq_length``.
    """
    raw_values = data.to_numpy()
    n_windows = len(raw_values) - seq_length

    # Input windows come from the first argument, targets from the second.
    windows = [data_nomalize[start:start + seq_length] for start in range(n_windows)]
    targets = [raw_values[start + seq_length, 0] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

## training set

In [None]:
# Load the training frame, index it by the parsed 'Timestamp' column and drop
# the 'Unnamed: 0' column (presumably a saved row index -- verify against the file).
train_data = (
    pd.read_csv("./new_dataset/train_2years_set")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .set_index('Timestamp')
    .drop(columns=['Unnamed: 0'])
)

# The scaler is fitted on the training frame only; the validation cell reuses it.
scaler = MinMaxScaler(feature_range=(0, 1), clip=True)
train_data_nomal = scaler.fit(train_data).transform(train_data)

# Each model input window spans `seq_length` consecutive rows.
seq_length = 36
X_train_time, y_train = create_sequences(train_data_nomal, train_data, seq_length)

## validation set

In [None]:
# Load the validation frame, index it by the parsed 'Timestamp' column and drop
# every column listed below before scaling.
val_data = (
    pd.read_csv("./new_dataset/val_2years_set")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .set_index('Timestamp')
    .drop(
        columns=[
            'Unnamed: 0', 'Station', 'District', 'Freeway', 'DirectionofTravel',
            'LaneType', 'Station Length', 'Samples', '%Observed', 'AvgOccupancy',
            'AvgSpeed',
        ]
    )
)

# Scale with the scaler fitted on the training frame (no refit on validation data).
val_data_nomal = scaler.transform(val_data)
X_val_time, y_val = create_sequences(val_data_nomal, val_data, seq_length)

In [None]:
# test_data = pd.read_csv("testset")
# test_data['Timestamp'] = pd.to_datetime(test_data['Timestamp'])
# test_data.set_index('Timestamp', inplace=True)
# test_data = test_data.drop(['Unnamed: 0', 'Station', 'District', 'Freeway', 'DirectionofTravel', 'LaneType', 'Station Length', 'Samples', '%Observed', 'AvgOccupancy', 'AvgSpeed'], axis=1)
# # test_data = test_data.drop(['Unnamed: 0', 'Station', 'District', 'Freeway', 'DirectionofTravel', 'LaneType', 'Station Length', 'Samples', '%Observed', 'AvgSpeed'], axis=1)
# # test_scaler = MinMaxScaler(feature_range=(0, 1))
# test_data_nomal = scaler.transform(test_data)
# X_test_time, y_test = create_sequences(test_data_nomal, test_data, seq_length)
# # print(test_data[0:24])
# # print(test_data_nomal[0:24])
# # print(X_test_time[0])

In [None]:
# X_test_time.reshape((10056, 24,1,1))[0]

## cyclical data

In [None]:
def create_cyc_sequences(data, period_length, seq_length, steps_per_day=288):
    """Collect, per sample, 'TotalFlow' values observed at one time of day.

    For sample ``i`` the rows whose ``'time_of_day'`` equals the value at row
    ``i + seq_length + 1`` are selected, and ``period_length`` consecutive
    entries of their ``'TotalFlow'`` are taken starting at position
    ``i // steps_per_day``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'time_of_day'`` and ``'TotalFlow'``.
    period_length : int
        Number of entries taken from the matching rows for each sample.
    seq_length : int
        Offset added to the sample index when choosing the reference row.
    steps_per_day : int, default 288
        Number of rows treated as one period when computing the slice start
        (presumably one day of 5-minute samples -- confirm against the data).

    Returns
    -------
    list of pandas.Series
        One ``'TotalFlow'`` slice per sample;
        ``len(data) - steps_per_day * period_length - seq_length`` entries
        (empty if that is not positive).
    """
    # NOTE(review): the reference row is i + seq_length + 1, whereas
    # create_sequences takes its target from row i + seq_length, and the slice
    # start is derived from i rather than from the reference row. Confirm this
    # alignment is intended; it is preserved here unchanged.
    tod_values = data['time_of_day'].values
    n_samples = len(data) - (steps_per_day * period_length) - seq_length

    # Filtering the whole frame is the expensive step and depends only on the
    # time-of-day key, so do it once per distinct key instead of once per sample.
    flow_by_tod = {}
    X = []
    for i in range(n_samples):
        key = tod_values[i + seq_length + 1]
        if key not in flow_by_tod:
            flow_by_tod[key] = data.loc[data['time_of_day'] == key, 'TotalFlow']
        start = i // steps_per_day
        # .iloc makes the positional slicing explicit regardless of index type.
        X.append(flow_by_tod[key].iloc[start:start + period_length])
    return X

In [None]:
# Number of entries collected per sample by create_cyc_sequences.
period_len = 7

# Load the cyclical training frame and add a clock-time column derived from the index.
train_cyc = (
    pd.read_csv("./new_dataset/train_2years_cyc")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .set_index('Timestamp')
    .drop(columns=['Unnamed: 0'])
    .assign(time_of_day=lambda frame: frame.index.time)
)

X_train_cyc = create_cyc_sequences(train_cyc, period_len, seq_length)

# Flatten to a single column so one min/max pair is fitted over all values,
# then restore the (samples, period) layout.
X_train_cyc_arr = np.array(X_train_cyc)
shape_x, shape_y = X_train_cyc_arr.shape
X_train_cyc_arr = X_train_cyc_arr.reshape(-1, 1)
cyc_scaler = MinMaxScaler(feature_range=(0, 1), clip=True)
X_train_cyc_nomal = cyc_scaler.fit_transform(X_train_cyc_arr).reshape(shape_x, shape_y)

In [None]:
# Load the cyclical validation frame, drop the columns listed below and add a
# clock-time column derived from the index.
val_cyc = (
    pd.read_csv("./new_dataset/val_2years_cyc")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .set_index('Timestamp')
    .drop(
        columns=[
            'Unnamed: 0', 'Station', 'District', 'Freeway', 'DirectionofTravel',
            'LaneType', 'Station Length', 'Samples', '%Observed', 'AvgOccupancy',
            'AvgSpeed',
        ]
    )
    .assign(time_of_day=lambda frame: frame.index.time)
)

X_val_cyc = create_cyc_sequences(val_cyc, period_len, seq_length)

# Flatten to one column, scale with the scaler fitted on the training values
# (transform only), then restore the (samples, period) layout.
X_val_cyc_arr = np.array(X_val_cyc)
shape_x, shape_y = X_val_cyc_arr.shape
X_val_cyc_arr = X_val_cyc_arr.reshape(-1, 1)
X_val_cyc_nomal = cyc_scaler.transform(X_val_cyc_arr).reshape(shape_x, shape_y)

## cyclical data2

In [None]:
def create_cyc_sequences_timeseries(data, period_length, seq_length, steps_per_day=288):
    """Collect, per sample, ``period_length`` windows of 'TotalFlow' values.

    For sample ``i`` and each ``j`` in ``range(period_length)``, the window
    holds the ``'TotalFlow'`` values of all rows whose ``'time_of_day'`` lies in
    ``[stamp[a], stamp[a + seq_length])`` with ``a = i + j * steps_per_day``,
    where ``stamp`` is the ``'time_of_day'`` column in row order.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'time_of_day'`` and ``'TotalFlow'``. The
        callers in this notebook store the full timestamp index in
        ``'time_of_day'``.
    period_length : int
        Number of windows per sample. It now also drives the sample count;
        previously the notebook global ``period_len`` and a hard-coded 7 were
        used and this argument was ignored.
    seq_length : int
        Row distance between the lower and upper bound of each window.
    steps_per_day : int, default 288
        Row stride between consecutive windows of one sample (presumably one
        day of 5-minute samples -- confirm against the data).

    Returns
    -------
    np.ndarray
        ``np.array`` over the nested windows; 3-D
        ``(n_samples, period_length, window_len)`` when every window has the
        same length, with
        ``n_samples = len(data) - steps_per_day * period_length - seq_length``.
    """
    stamps = data['time_of_day']
    stamp_values = stamps.values
    flow_values = data['TotalFlow'].to_numpy()
    n_samples = len(data) - (steps_per_day * period_length) - seq_length

    if stamps.is_monotonic_increasing:
        # With sorted stamps, "stamp >= v" is every row from the first
        # occurrence of v onward, so each window is a contiguous slice and no
        # full-frame boolean mask is needed per window.
        first_pos = np.searchsorted(stamp_values, stamp_values, side='left')

        def window(start, stop):
            return flow_values[first_pos[start]:first_pos[stop]]
    else:
        # Unsorted stamps: fall back to the explicit range mask.
        def window(start, stop):
            in_range = (stamps >= stamp_values[start]) & (stamps < stamp_values[stop])
            return flow_values[in_range.to_numpy()]

    Xs = []
    for i in range(n_samples):
        starts = [i + j * steps_per_day for j in range(period_length)]
        Xs.append([window(start, start + seq_length) for start in starts])
    return np.array(Xs)

In [None]:
# Load the cyclical training frame again; here 'time_of_day' holds the full
# timestamp index rather than the clock time.
train_cyc_ts = (
    pd.read_csv("./new_dataset/train_2years_cyc")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .set_index('Timestamp')
    .drop(columns=['Unnamed: 0'])
    .assign(time_of_day=lambda frame: frame.index)
)

X_train_cyc_ts = create_cyc_sequences_timeseries(train_cyc_ts, period_len, seq_length)

# Flatten to one column so a single min/max pair is fitted over all values,
# then restore the original 3-D layout.
shape_x, shape_y, channel = X_train_cyc_ts.shape
X_train_cyc_ts_arr = X_train_cyc_ts.reshape(-1, 1)
cyc_scaler_ts = MinMaxScaler(feature_range=(0, 1), clip=True)
X_train_cyc_ts_nomal = cyc_scaler_ts.fit_transform(X_train_cyc_ts_arr).reshape(
    shape_x, shape_y, channel
)

In [None]:
# Load the cyclical validation frame again, drop the columns listed below and
# store the full timestamp index in 'time_of_day'.
val_cyc_ts = (
    pd.read_csv("./new_dataset/val_2years_cyc")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .set_index('Timestamp')
    .drop(
        columns=[
            'Unnamed: 0', 'Station', 'District', 'Freeway', 'DirectionofTravel',
            'LaneType', 'Station Length', 'Samples', '%Observed', 'AvgOccupancy',
            'AvgSpeed',
        ]
    )
    .assign(time_of_day=lambda frame: frame.index)
)

X_val_cyc_ts = create_cyc_sequences_timeseries(val_cyc_ts, period_len, seq_length)

# Flatten to one column, scale with the scaler fitted on the training windows
# (transform only), then restore the original 3-D layout.
shape_x, shape_y, channel = X_val_cyc_ts.shape
X_val_cyc_ts_arr = X_val_cyc_ts.reshape(-1, 1)
X_val_cyc_ts_nomal = cyc_scaler_ts.transform(X_val_cyc_ts_arr).reshape(
    shape_x, shape_y, channel
)

In [None]:
# Hyperparameters. `windows_size` is derived from the input window length and `p`.
hidden_size = 64
p = 30
windows_size = seq_length - p
# lambda1 / lambda2 are not referenced by any other visible cell.
lambda1, lambda2 = 0.0001, 0.0005

model = build_model_withcyc(hidden_size, windows_size)

# Train on MSE with Nadam; additionally report MAE, RMSE and MAPE.
nadam_optimizer = keras.optimizers.Nadam(learning_rate=1e-3)
mse_loss = keras.losses.MeanSquaredError(name="mean_squared_error")
tracked_metrics = [
    keras.metrics.MeanAbsoluteError(name="mean_absolute_error"),
    keras.metrics.RootMeanSquaredError(name="root_mean_squared_error"),
    keras.metrics.MeanAbsolutePercentageError(name="mean_absolute_percentage_error"),
]
model.compile(optimizer=nadam_optimizer, loss=mse_loss, metrics=tracked_metrics)
model.summary()

In [None]:

# Keep only the weights/model with the lowest validation loss seen so far.
# NOTE(review): some Keras versions restrict the checkpoint file extension --
# confirm that '.hdf5' is accepted by the installed version.
checkpoint_filepath = 'checkpoint/pretrain_model.hdf5'
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor='val_loss',
        mode='min',
        verbose=1,
        save_best_only=True,
    ),
]

# Inputs are passed in the order: recent window, cyclical values, cyclical windows.
# `callbacks` is already a list, so it is passed as-is rather than wrapped in
# another list.
history = model.fit(
    [X_train_time, X_train_cyc_nomal, X_train_cyc_ts_nomal],
    y_train,
    epochs=30,
    batch_size=128,
    validation_data=([X_val_time, X_val_cyc_nomal, X_val_cyc_ts_nomal], y_val),
    callbacks=callbacks,
)