In [1]:
import os
import sys

MODULE_PATH = '/content/drive/MyDrive/GitHub/DL_Study/Base'

sys.path.insert(0, MODULE_PATH)
sys.path

['/content/drive/MyDrive/GitHub/DL_Study/Base',
 '',
 '/content',
 '/env/python',
 '/usr/lib/python37.zip',
 '/usr/lib/python3.7',
 '/usr/lib/python3.7/lib-dynload',
 '/usr/local/lib/python3.7/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.7/dist-packages/IPython/extensions',
 '/root/.ipython']

In [2]:
# import
import numpy
from config import *
from optim import Adam
from models import CnnModelReg

# for time series split
!pip install scikit-learn==0.24.2



In [3]:
# configuration setting
def model_config():
    # parameter for CNN Model
    filter_num = [30]
    filter_size = [3]
    epochs = [30]
    batch_size = [64]
    learning_rate = [0.01, 0.001]
    
    # create config data
    configs = []
    for i in filter_num:
        for j in filter_size:
            for k in epochs:
                for l in batch_size:
                    for m in learning_rate:
                        config = [i, j, k, l, m]
                        configs.append(config)
    return configs

# fucntion for fit cnn model using configs
def model_fit(train_X, train_y, config):
    # unpack config
    n_filter, n_fsize, n_epochs, n_batch, learning_rate = config
    cnn_params = {'filter_num':n_filter, 'filter_size':n_fsize, 
                  'stride':1, 'padding':0}
    model = CnnModelReg(input_dim=(1, 24, 8),params=cnn_params)
    # fit model and return
    model.fit(train_X=train_X, train_y=train_y, epochs=n_epochs, 
              batch_size=n_batch, learning_rate=learning_rate)
    return model

def MAE_metric(x, t):
    t = np.array(t)
    return np.mean(numpy.abs(x-t))

def MSE_metric(x, t):
    t = np.array(t)
    return np.mean((x-t)**2)

In [4]:
import pandas as pd
import numpy
import time
from datetime import datetime

np.random.seed(42)
numpy.random.seed(42)

df_parser = lambda x: datetime.strptime(x, '%Y %m %d %H')

data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00381/'
data_name = 'PRSA_data_2010.1.1-2014.12.31.csv'
df = pd.read_csv(data_url+data_name, sep=',', parse_dates=[['year', 'month', 'day', 'hour']], date_parser=df_parser, index_col=0)
del df['No']
df = df[24:]
df.head()

Unnamed: 0_level_0,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
year_month_day_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0
2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0


In [5]:
from scipy.stats import skew, kurtosis
from statsmodels.tsa.stattools import adfuller

# jb = (n/6)*(skewness**2 + (kurtosis**2/4))

def data_statistics(df):
    df = df.dropna()
    data = df.values
    num = len(df)
    skewness_ = skew(data)
    kurtosis_ = kurtosis(data)
    jarque_bera_ = (num/6)*(skewness_**2 + (kurtosis_**2/4))
    result = adfuller(data)
    adf_ = result[0]
    print(f'skewness : {skewness_}')
    print(f'kurtosis : {kurtosis_}')
    print(f'jarque bera : {jarque_bera_}')
    print(f'ADF : {adf_}')

data_statistics(df['pm2.5'])

  import pandas.util.testing as tm


skewness : 1.8022466754707596
kurtosis : 4.768218621208263
jarque bera : 62162.74314257471
ADF : -20.606824646403773


In [6]:
df.describe()

Unnamed: 0,pm2.5,DEWP,TEMP,PRES,Iws,Is,Ir
count,41757.0,43800.0,43800.0,43800.0,43800.0,43800.0,43800.0
mean,98.613215,1.828516,12.459041,1016.447306,23.894307,0.052763,0.195023
std,92.050387,14.429326,12.193384,10.271411,50.022729,0.760582,1.416247
min,0.0,-40.0,-19.0,991.0,0.45,0.0,0.0
25%,29.0,-10.0,2.0,1008.0,1.79,0.0,0.0
50%,72.0,2.0,14.0,1016.0,5.37,0.0,0.0
75%,137.0,15.0,23.0,1025.0,21.91,0.0,0.0
max,994.0,28.0,42.0,1046.0,585.6,27.0,36.0


In [7]:
df.isnull().sum()

pm2.5    2043
DEWP        0
TEMP        0
PRES        0
cbwd        0
Iws         0
Is          0
Ir          0
dtype: int64

In [8]:
# series data to img function
def series_to_img(dataset, time_step=1):
    num = dataset.shape[1]      # features num
    df = pd.DataFrame(dataset)
    cols, names = list(), list()
    # sequence t-n to t-1
    for i in range(time_step, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(num)]

    for i in range(0, 1):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(num)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(num)]

    agg = pd.concat(cols, axis=1)
    agg.columns = names
    agg.dropna(inplace=True)
    return agg

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Last observation carried forward (LOCF)
df.fillna(method='ffill', inplace=True)

dataset = df.values
label_encoder = LabelEncoder()
dataset[:, 4] = label_encoder.fit_transform(dataset[:, 4])  # for wind direction
dataset = dataset.astype('float')

n_inputs = 24
n_features = 8
del_idx = n_inputs * n_features + 1
del_cols = [i for i in range(del_idx, del_idx+n_features-1)]
new_df = series_to_img(dataset, n_inputs)
new_df.drop(new_df.columns[del_cols], axis=1, inplace=True)
new_df.head()

Unnamed: 0,var1(t-24),var2(t-24),var3(t-24),var4(t-24),var5(t-24),var6(t-24),var7(t-24),var8(t-24),var1(t-23),var2(t-23),var3(t-23),var4(t-23),var5(t-23),var6(t-23),var7(t-23),var8(t-23),var1(t-22),var2(t-22),var3(t-22),var4(t-22),var5(t-22),var6(t-22),var7(t-22),var8(t-22),var1(t-21),var2(t-21),var3(t-21),var4(t-21),var5(t-21),var6(t-21),var7(t-21),var8(t-21),var1(t-20),var2(t-20),var3(t-20),var4(t-20),var5(t-20),var6(t-20),var7(t-20),var8(t-20),...,var2(t-5),var3(t-5),var4(t-5),var5(t-5),var6(t-5),var7(t-5),var8(t-5),var1(t-4),var2(t-4),var3(t-4),var4(t-4),var5(t-4),var6(t-4),var7(t-4),var8(t-4),var1(t-3),var2(t-3),var3(t-3),var4(t-3),var5(t-3),var6(t-3),var7(t-3),var8(t-3),var1(t-2),var2(t-2),var3(t-2),var4(t-2),var5(t-2),var6(t-2),var7(t-2),var8(t-2),var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var1(t)
24,129.0,-16.0,-4.0,1020.0,2.0,1.79,0.0,0.0,148.0,-15.0,-4.0,1020.0,2.0,2.68,0.0,0.0,159.0,-11.0,-5.0,1021.0,2.0,3.57,0.0,0.0,181.0,-7.0,-5.0,1022.0,2.0,5.36,1.0,0.0,138.0,-7.0,-5.0,1022.0,2.0,6.25,2.0,0.0,...,-8.0,-5.0,1028.0,2.0,44.25,0.0,0.0,154.0,-7.0,-5.0,1028.0,2.0,46.04,0.0,0.0,164.0,-7.0,-5.0,1027.0,2.0,49.17,1.0,0.0,156.0,-8.0,-6.0,1028.0,2.0,52.3,2.0,0.0,126.0,-8.0,-6.0,1027.0,2.0,55.43,3.0,0.0,90.0
25,148.0,-15.0,-4.0,1020.0,2.0,2.68,0.0,0.0,159.0,-11.0,-5.0,1021.0,2.0,3.57,0.0,0.0,181.0,-7.0,-5.0,1022.0,2.0,5.36,1.0,0.0,138.0,-7.0,-5.0,1022.0,2.0,6.25,2.0,0.0,109.0,-7.0,-6.0,1022.0,2.0,7.14,3.0,0.0,...,-7.0,-5.0,1028.0,2.0,46.04,0.0,0.0,164.0,-7.0,-5.0,1027.0,2.0,49.17,1.0,0.0,156.0,-8.0,-6.0,1028.0,2.0,52.3,2.0,0.0,126.0,-8.0,-6.0,1027.0,2.0,55.43,3.0,0.0,90.0,-7.0,-6.0,1027.0,2.0,58.56,4.0,0.0,63.0
26,159.0,-11.0,-5.0,1021.0,2.0,3.57,0.0,0.0,181.0,-7.0,-5.0,1022.0,2.0,5.36,1.0,0.0,138.0,-7.0,-5.0,1022.0,2.0,6.25,2.0,0.0,109.0,-7.0,-6.0,1022.0,2.0,7.14,3.0,0.0,105.0,-7.0,-6.0,1023.0,2.0,8.93,4.0,0.0,...,-7.0,-5.0,1027.0,2.0,49.17,1.0,0.0,156.0,-8.0,-6.0,1028.0,2.0,52.3,2.0,0.0,126.0,-8.0,-6.0,1027.0,2.0,55.43,3.0,0.0,90.0,-7.0,-6.0,1027.0,2.0,58.56,4.0,0.0,63.0,-8.0,-6.0,1026.0,2.0,61.69,5.0,0.0,65.0
27,181.0,-7.0,-5.0,1022.0,2.0,5.36,1.0,0.0,138.0,-7.0,-5.0,1022.0,2.0,6.25,2.0,0.0,109.0,-7.0,-6.0,1022.0,2.0,7.14,3.0,0.0,105.0,-7.0,-6.0,1023.0,2.0,8.93,4.0,0.0,124.0,-7.0,-5.0,1024.0,2.0,10.72,0.0,0.0,...,-8.0,-6.0,1028.0,2.0,52.3,2.0,0.0,126.0,-8.0,-6.0,1027.0,2.0,55.43,3.0,0.0,90.0,-7.0,-6.0,1027.0,2.0,58.56,4.0,0.0,63.0,-8.0,-6.0,1026.0,2.0,61.69,5.0,0.0,65.0,-8.0,-7.0,1026.0,2.0,65.71,6.0,0.0,55.0
28,138.0,-7.0,-5.0,1022.0,2.0,6.25,2.0,0.0,109.0,-7.0,-6.0,1022.0,2.0,7.14,3.0,0.0,105.0,-7.0,-6.0,1023.0,2.0,8.93,4.0,0.0,124.0,-7.0,-5.0,1024.0,2.0,10.72,0.0,0.0,120.0,-8.0,-6.0,1024.0,2.0,12.51,0.0,0.0,...,-8.0,-6.0,1027.0,2.0,55.43,3.0,0.0,90.0,-7.0,-6.0,1027.0,2.0,58.56,4.0,0.0,63.0,-8.0,-6.0,1026.0,2.0,61.69,5.0,0.0,65.0,-8.0,-7.0,1026.0,2.0,65.71,6.0,0.0,55.0,-8.0,-7.0,1025.0,2.0,68.84,7.0,0.0,65.0


In [9]:
n_splits = 3
test_size = (int)(len(new_df)*0.2)
train_test_split = TimeSeriesSplit(n_splits=n_splits+1, gap=n_inputs, test_size=test_size).split(new_df)
next(train_test_split)

configs = model_config()
history = []

best_rmse, best_mse, best_mae = [], [], []

i = 1

print('config : filter_num, filter_size, epochs, batch_size, learning_rate')

# nested cross validation for time series model
for train_cv_indices, test_cv_indices in train_test_split:
    print(f'fold : {i}/{n_splits}')
    i+=1

    # split x, y data
    train_cv_X, train_cv_y = new_df.iloc[train_cv_indices, :-1].values, new_df.iloc[train_cv_indices,-1].values
    test_cv_X, test_cv_y = new_df.iloc[test_cv_indices, :-1].values, new_df.iloc[test_cv_indices, -1].values

    # length for validation set
    test_length = len(test_cv_X)

    # scaling data
    scaler_x = MinMaxScaler()
    train_cv_X = scaler_x.fit_transform(train_cv_X)
    test_cv_X = scaler_x.transform(test_cv_X)

    train_X, val_X = train_cv_X[:-test_length, :], train_cv_X[-test_length:, :]
    train_y, val_y = train_cv_y[:-test_length], train_cv_y[-test_length:]

    # reshape
    # inner loop
    train_X = train_X.reshape(-1, 1, n_inputs, n_features)
    val_X = val_X.reshape(-1, 1, n_inputs, n_features)
    train_y = train_y.reshape(-1, 1)
    val_y = val_y.reshape(-1, 1)

    # outer loop
    train_cv_X = train_cv_X.reshape(-1, 1, n_inputs, n_features)
    test_cv_X = test_cv_X.reshape(-1, 1, n_inputs, n_features)
    train_cv_y = train_cv_y.reshape(-1, 1)
    test_cv_y = test_cv_y.reshape(-1, 1)

    # model fit, inner
    errors = []
    for idx, cfg in enumerate(configs):
        print(f' == train {cfg} model == ', end=' ')
        model = model_fit(train_X, train_y, cfg)
        predicted = model.predict(val_X)
        error = np.sqrt(MSE_metric(predicted, val_y))   # rmse
        print(f'error(rmse):{error.item():.2f}')
        if errors:
            if error < min(errors):
                param = idx
        else:
            param = idx
        errors.append(error)

    history.append(errors)

    # outer
    selected_model = model_fit(train_cv_X,train_cv_y, configs[param])
    predicted = selected_model.predict(test_cv_X)
    rmse = np.sqrt(MSE_metric(predicted, test_cv_y))
    mse = MSE_metric(predicted, test_cv_y)
    mae = MAE_metric(predicted, test_cv_y)
    best_rmse.append(rmse)
    best_mse.append(mse)
    best_mae.append(mae)

    # model eval
    print(f'train-size:{train_X.shape[0]}, val-size:{val_X.shape[0]}, test-size:{test_cv_X.shape[0]}')
    print(f'best_model => error(rmse) : {error.item():.2f}, param:{configs[param]}')
    print()

config : filter_num, filter_size, epochs, batch_size, learning_rate
fold : 1/3
 == train [30, 3, 30, 64, 0.01] model ==  error(rmse):28.12
 == train [30, 3, 30, 64, 0.001] model ==  error(rmse):34.15
train-size:8732, val-size:8755, test-size:8755
best_model => error(rmse) : 34.15, param:[30, 3, 30, 64, 0.01]

fold : 2/3
 == train [30, 3, 30, 64, 0.01] model ==  error(rmse):27.87
 == train [30, 3, 30, 64, 0.001] model ==  error(rmse):30.32
train-size:17487, val-size:8755, test-size:8755
best_model => error(rmse) : 30.32, param:[30, 3, 30, 64, 0.01]

fold : 3/3
 == train [30, 3, 30, 64, 0.01] model ==  error(rmse):28.53
 == train [30, 3, 30, 64, 0.001] model ==  error(rmse):30.12
train-size:26242, val-size:8755, test-size:8755
best_model => error(rmse) : 30.12, param:[30, 3, 30, 64, 0.01]



In [12]:
predicted = selected_model.predict(test_cv_X)

def model_evaluation(mse, rmse, mae):
    mse = np.array(mse)
    rmse = np.array(rmse)
    mae = np.array(mae)
    print(f'MSE: mean={np.mean(mse)}, std={np.std(mse)}')
    print(f'RMSE: mean={np.mean(rmse)}, std={np.std(rmse)}')
    print(f'MAE: mean={np.mean(mae)}, std={np.std(mae)}')

model_evaluation(best_mse, best_rmse, best_mae)

MSE: mean=796.914573729473, std=169.52315409772922
RMSE: mean=28.076025252033762, std=2.941322793682368
MAE: mean=16.333908802392926, std=0.6531725098096078
