In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import KFold, StratifiedKFold

In [None]:
# Load the five daily station files (2019-09-01 .. 2019-09-05) and stack them.
# Frames are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies all previously loaded rows on every
# iteration and relied on concat silently skipping the initial None.
day_frames = []
for day in range(1, 6):
    data_day = pd.read_csv(f'/kaggle/input/traffic-prediction-data/d11_text_station_5min_2019_09_0{day}.csv',header=None)
    # Keep only the columns at positions 0, 1 and 9 and give them names.
    data_day = data_day.iloc[:,[0,1,9]]
    data_day.columns=['time','station_id','flow']
    day_frames.append(data_day)
data_flow = pd.concat(day_frames, ignore_index=True)
data_flow

In [None]:
# Quick visual check of one station: select station 1100326, drop the rows
# whose flow is missing, and plot flow against the time column.
is_station_1100326 = data_flow['station_id'].eq(1100326)
data_flow_1100326 = data_flow.loc[is_station_1100326].dropna(subset=['flow'])
plt.plot(data_flow_1100326['time'], data_flow_1100326['flow'])

In [None]:
# Drop every station that has no non-null flow reading at all
# (count() counts only non-NA values).
data_flow = data_flow.groupby('station_id').filter(lambda x : x.flow.count()!=0)
# Forward-fill the remaining gaps. ffill() replaces
# fillna(method='pad', inplace=True): the `method` argument of fillna is
# deprecated in recent pandas, and assigning the result avoids mutating a
# derived frame in place.
# NOTE(review): the fill runs down the rows of the whole frame, not per
# station, so a gap may be filled from whatever row precedes it - confirm
# this is intended.
data_flow = data_flow.ffill(axis=0)
# One group per timestamp; count() shows how many values each timestamp has.
grouped = data_flow.groupby('time')
grouped.count()

In [None]:
# Turn the per-timestamp groups into a 2-D array: one row per timestamp,
# holding the flow values of that timestamp's rows in their original order.
# NOTE(review): assumes every timestamp has the same number of rows;
# otherwise the list is ragged and np.array cannot build a regular matrix.
flows_per_timestamp = [time_group['flow'].tolist() for _, time_group in grouped]
data = np.array(flows_per_timestamp)
pd.DataFrame(data)

In [None]:
def max_min_normalization(x, _max, _min):
    """Linearly map ``x`` from the interval ``[_min, _max]`` onto ``[-1, 1]``.

    ``x`` may be a scalar or anything supporting elementwise arithmetic
    (e.g. a numpy array). ``_min`` maps to -1 and ``_max`` maps to 1.
    No guard is applied for ``_max == _min``.
    """
    span = _max - _min
    # First rescale to [0, 1], then stretch and shift to [-1, 1].
    unit = 1. * (x - _min) / span
    return unit * 2. - 1.


def re_max_min_normalization(x, _max, _min):
    """Map ``x`` from ``[-1, 1]`` back onto the interval ``[_min, _max]``.

    This is the inverse of ``max_min_normalization`` when called with the
    same ``_max`` and ``_min``: -1 maps to ``_min`` and 1 maps to ``_max``.
    """
    # Undo the [-1, 1] stretch first, then restore the original range.
    unit = (x + 1.) / 2.
    return 1. * unit * (_max - _min) + _min
def min_max(data_seq):
    """Normalise every sequence in ``data_seq`` to ``[-1, 1]``.

    Each sequence is scaled with its own minimum and maximum via
    ``max_min_normalization``. The elements are overwritten in place, so the
    caller's sequences are modified; the returned list holds those same
    (now normalised) sequence objects in their original order.
    """
    normalized = []
    for series in data_seq:
        # Capture the bounds before any element is overwritten.
        lo = np.min(series)
        hi = np.max(series)
        for idx in range(len(series)):
            series[idx] = max_min_normalization(series[idx], hi, lo)
        normalized.append(series)
    return normalized

def re_min_max(data_seq, bounds=None):
    """Undo the ``[-1, 1]`` scaling of every sequence in ``data_seq``.

    Parameters
    ----------
    data_seq : iterable of mutable sequences
        Sequences to de-normalise. Elements are overwritten in place and the
        same sequence objects are returned.
    bounds : sequence of (min, max) pairs, optional
        The original minimum and maximum of each sequence, in the same order
        as ``data_seq``. Supply this to restore the original scale.

        When omitted (the previous behaviour), the minimum and maximum are
        taken from the sequence that is passed in. For a sequence that
        already spans exactly ``[-1, 1]`` that gives ``_min=-1`` and
        ``_max=1``, for which ``re_max_min_normalization`` returns its input
        unchanged - so the original scale cannot be recovered without
        ``bounds``.

    Returns
    -------
    list
        The (modified) sequences, in input order.
    """
    all_data = []
    for seq_idx, series in enumerate(data_seq):
        if bounds is None:
            # Legacy fallback: bounds of the data as given.
            lo = np.min(series)
            hi = np.max(series)
        else:
            lo, hi = bounds[seq_idx]
        for idx in range(len(series)):
            series[idx] = re_max_min_normalization(series[idx], hi, lo)
        all_data.append(series)
    return all_data

In [None]:
# Scale every column of `data` to [0, 1]. The fitted scaler is kept so that
# values can be mapped back to original units later with inverse_transform.
# NOTE(review): the scaler is fitted on all rows, before the
# train/validation/test split, so the later splits influence the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data)
data_input = scaler.transform(data)
pd.DataFrame(data_input)

In [None]:
def read_and_generate_dataset(data_seq, num_of_depend, val_start=0.6, test_start=0.8):
    """Build sliding-window samples and split them into train/val/test.

    For every index ``i`` from ``num_of_depend`` to ``len(data_seq) - 1`` the
    input sample is ``data_seq[i - num_of_depend:i]`` and the target is
    ``data_seq[i]``. The samples are then split, in order and without
    shuffling, at the given fractions.

    Parameters
    ----------
    data_seq : sequence or numpy array
        Ordered observations; indexed and sliced along its first axis.
    num_of_depend : int
        Number of preceding observations that form one input sample.
    val_start : float, default 0.6
        Fraction of the samples at which the validation split begins
        (everything before it is training data).
    test_start : float, default 0.8
        Fraction of the samples at which the test split begins.

    Returns
    -------
    tuple of numpy arrays or None
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``, or ``None``
        when ``data_seq`` has no more than ``num_of_depend`` entries.

    Raises
    ------
    ValueError
        If the split fractions are not ordered within ``[0, 1]``.
    """
    if not 0 <= val_start <= test_start <= 1:
        raise ValueError('expected 0 <= val_start <= test_start <= 1')
    if len(data_seq) <= num_of_depend:
        return None

    target_indices = range(num_of_depend, len(data_seq))
    X_data = [data_seq[i - num_of_depend:i] for i in target_indices]
    y_data = [data_seq[i] for i in target_indices]

    # Split positions are truncated towards zero, as before.
    val_line = int(len(X_data) * val_start)
    test_line = int(len(X_data) * test_start)
    return (
        np.array(X_data[:val_line]),
        np.array(X_data[val_line:test_line]),
        np.array(X_data[test_line:]),
        np.array(y_data[:val_line]),
        np.array(y_data[val_line:test_line]),
        np.array(y_data[test_line:]),
    )
# Build the samples from the scaled matrix: each input is the 5 preceding rows
# of data_input and the target is the row that follows them. The samples are
# split in row order into train / validation / test parts.
X_train, X_val,X_test, y_train,y_val, y_test = read_and_generate_dataset(data_input, 5)

In [None]:
# Shape of the training inputs. Entries 1 and 2 are used by the model cells
# below as the LSTM input shape, and entry 2 as the Dense output width.
data_shape = X_train.shape
data_shape

In [None]:
# 3-fold cross-validation of the stacked-LSTM model on the training split.
# For each fold a fresh model is built, trained on that fold's training
# indices only, and scored (MSE) on the fold's held-out indices.
kfold = KFold(n_splits=3, shuffle=True)
fold_no = 1
cv_scores = []
for train, test in kfold.split(X_train, y_train):
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')
    model = Sequential()
    model.add(LSTM(128, input_shape=(data_shape[1], data_shape[2]),activation='relu',return_sequences = True))
    model.add(LSTM(128, input_shape=(data_shape[1], 128),activation='relu'))
    model.add(Dense(data_shape[2],activation='relu'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # Fit on this fold's training rows only. Fitting on all of X_train (as
    # before) meant the held-out rows had been seen during training, so the
    # fold score did not measure generalisation.
    history = model.fit(X_train[train],
                        y_train[train],
                        epochs=30,
                        batch_size=32,
                        verbose=2,
                        shuffle=True)

    # Generate generalization metrics on the held-out rows of this fold
    score = model.evaluate(X_train[test], y_train[test], verbose=0)
    cv_scores.append(score)
    # Increase fold number
    fold_no = fold_no + 1

pd.DataFrame(cv_scores)

In [None]:
# Tabulate the per-fold scores. Row labels are generated from the number of
# scores actually collected (cv1, cv2, ...), so the cell keeps working if the
# number of folds is changed instead of failing on a fixed 3-label index.
cv_scores_df = pd.DataFrame(
    {'cv_score': cv_scores},
    index=[f'cv{fold}' for fold in range(1, len(cv_scores) + 1)],
)
cv_scores_df

In [None]:
# Final model: two stacked 128-unit LSTM layers followed by a Dense layer with
# one output per column of the input, trained with MSE loss and Adam.
model = Sequential([
    LSTM(128, input_shape=(data_shape[1], data_shape[2]), activation='relu', return_sequences=True),
    LSTM(128, input_shape=(data_shape[1], 128), activation='relu'),
    Dense(data_shape[2], activation='relu'),
])
model.compile(loss='mean_squared_error', optimizer='adam')
# Train on the training split; the validation split is only monitored.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=2,
    shuffle=True,
)

In [None]:
# Score the final model (MSE, in scaled units) on the held-out test split.
# The previous version indexed X_train with `test`, the index array left over
# from the last cross-validation fold: that depended on hidden loop state and
# scored the model on rows it had just been trained on.
# Run this before the prediction cell below, which converts y_test back to
# original units.
score = model.evaluate(X_test, y_test, verbose=0)
score

In [None]:
# Training vs. validation loss per epoch for the final model.
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'val')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Predict on the test inputs and convert the predictions back to original units.
prediction = scaler.inverse_transform(model.predict(X_test))

# Store the unscaled targets under a new name instead of overwriting y_test:
# with the overwrite, re-running this cell inverse-transformed y_test a second
# time and silently produced wrong values.
y_test_inv = scaler.inverse_transform(y_test)

# First column of each array, in original units, for plotting.
prediction_s = prediction[:,0]
y_test_s = y_test_inv[:,0]
y_train_s=scaler.inverse_transform(y_train)[:,0]
y_val_s=scaler.inverse_transform(y_val)[:,0]

In [None]:
# Lay the three splits out one after another on a shared x-axis, giving each
# sample a width of w, then plot prediction (red) against truth (blue) for
# the test segment.
w = 3
n_train = y_train_s.shape[0]
n_val = y_val_s.shape[0]
n_test = y_test_s.shape[0]

val_begin = n_train * w
test_begin = (n_train + n_val) * w
test_end = (n_train + n_val + n_test) * w

# x1 / x2 are the x-positions of the train / validation segments; they are
# only needed when those series are drawn as well:
# plt.plot(x1,y_train_s)
# plt.plot(x2,y_val_s)
x1 = np.linspace(0, val_begin, n_train)
x2 = np.linspace(val_begin, test_begin, n_val)
x3 = np.linspace(test_begin, test_end, n_test)

plt.plot(x3, prediction_s, 'r')
plt.plot(x3, y_test_s, 'b')