In [1]:
# Data preparation: load the hourly pollution CSV and index it by timestamp
import pandas as pd
from datetime import datetime

# Kept for backward compatibility with any later cell that still calls it; it is
# no longer handed to read_csv because `date_parser=` is deprecated in pandas 2.x.
df_parser = lambda x: datetime.strptime(x, '%Y %m %d %H')    # string to datetime

# Alternative location of the CSV, left here for reference:
# data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00381/PRSA_data_2010.1.1-2014.12.31.csv'
data_url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pollution.csv'

# Read the file as-is, then assemble the timestamp from its four date-part columns.
# This replaces the deprecated nested-list form parse_dates=[[...]] + date_parser=.
pollution_raw = pd.read_csv(data_url, sep=',')
date_cols = ['year', 'month', 'day', 'hour']
pollution_raw.index = pd.DatetimeIndex(
    pd.to_datetime(pollution_raw[date_cols]),
    name='year_month_day_hour',     # same index name the combined-column parse produced
)

# Drop the row counter and the date parts that now live in the index.
df = pollution_raw.drop(columns=['No'] + date_cols)
df.columns = ['pm2.5', 'dewp', 'temp', 'pres', 'cbwd','wind_speed', 'snow', 'rain']
df = df.iloc[24:]            # NaN values in first 24hours
df.head()

Unnamed: 0_level_0,pm2.5,dewp,temp,pres,cbwd,wind_speed,snow,rain
year_month_day_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0
2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0


In [None]:
def series_to_img(dataset, time_step=1, n_out=1):
    """Reframe a multivariate series as a table of lagged and lead copies.

    Every row of the result holds, side by side, the observations from
    ``time_step`` rows earlier up to ``n_out - 1`` rows later, so that a row
    can be used as one supervised-learning sample.

    Parameters
    ----------
    dataset : 2-D array-like
        Anything ``pd.DataFrame`` accepts; one row per step, one column per variable.
    time_step : int, default 1
        Number of lagged copies to include (columns ``varK(t-time_step)`` ...
        ``varK(t-1)``). Values <= 0 add no lag columns.
    n_out : int, default 1
        Number of current/lead copies to include: ``varK(t)``, then
        ``varK(t+1)`` ... ``varK(t+n_out-1)``. The default reproduces the
        previous fixed behaviour of emitting only the ``(t)`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ordered oldest lag first, ``(t)`` next, leads last; variables
        are numbered from 1 within each group. Rows containing any NaN --
        those introduced by shifting as well as those already present in the
        input -- are dropped.

    Raises
    ------
    ValueError
        If ``n_out`` is smaller than 1.
    """
    if n_out < 1:
        raise ValueError('n_out must be >= 1, got %r' % (n_out,))

    frame = pd.DataFrame(dataset)
    num = frame.shape[1]      # features num
    cols, names = [], []

    # sequence t-n to t-1
    for lag in range(time_step, 0, -1):
        cols.append(frame.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(num)]

    # sequence t to t+n_out-1
    for lead in range(n_out):
        cols.append(frame.shift(-lead))
        suffix = '(t)' if lead == 0 else '(t+%d)' % lead
        names += ['var%d%s' % (j + 1, suffix) for j in range(num)]

    agg = pd.concat(cols, axis=1)
    agg.columns = names
    return agg.dropna()

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy

# --- Encode the string-valued column and cast the whole table to float32 ---
dataset = df.values
encoder = LabelEncoder()
cbwd_idx = df.columns.get_loc('cbwd')       # look the column up by name instead of a magic index
dataset[:, cbwd_idx] = encoder.fit_transform(dataset[:, cbwd_idx])
dataset = dataset.astype('f')

# --- Build sliding windows: last_hours past rows of every feature -> var1 at time t ---
last_hours = 24                 # look-back window length, in rows
features = dataset.shape[1]     # number of columns per time step
# series_to_img lays columns out as [lags t-last_hours .. t-1 | var1(t) .. varN(t)].
# Position last_hours*features is var1(t), which is kept as the target; the
# remaining (t) columns, var2(t)..varN(t), are removed.
del_idx = last_hours*features + 1
del_cols = list(range(del_idx, del_idx + features - 1))
reframed = series_to_img(dataset, last_hours)
reframed = reframed.drop(columns=reframed.columns[del_cols])

# --- Chronological split: the first train_hours windows train, the rest test ---
dataset = reframed.values
train_hours = 365 * 24
train, test = dataset[:train_hours, :], dataset[train_hours:, :]
# split: every column but the last is an input, the last one is the target
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

# scaling: fit on the training inputs only, then apply the same transform to test
scaler_x = MinMaxScaler()
scaled_x = scaler_x.fit_transform(train_X)
test_X = scaler_x.transform(test_X)

# reshape: flat rows -> (samples, 1, last_hours, features) in a single step
image_shape = (-1, 1, last_hours, features)
train_X = scaled_x.reshape(image_shape)
test_X = test_X.reshape(image_shape)

print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

In [2]:
import numpy as np

nRows = 365     #days
SEED = 42       # fixed seed so a fresh kernel regenerates the same frame

# Daily-indexed toy frame of random integers in [0, 5); it only serves to
# demonstrate the time-series split in the next cell.
rng = np.random.default_rng(SEED)
df = pd.DataFrame(rng.integers(0, 5, size=(nRows, 2)), columns=["X", "y"], index=pd.date_range("20210101", periods=nRows))
df.head()

Unnamed: 0,X,y
2021-01-01,4,3
2021-01-02,1,1
2021-01-03,4,4
2021-01-04,4,3
2021-01-05,1,2


In [6]:
from sklearn.model_selection import TimeSeriesSplit

n_splits = 3

# Ask for one fold more than needed and discard the first: each remaining fold
# has its last `test_length` training rows carved off as a Cv window, and with
# the default TimeSeriesSplit settings the first fold's training window is only
# about one test window long, leaving nothing to train on.
trainTestSplit = TimeSeriesSplit(n_splits+1).split(df)
next(trainTestSplit) #Skip the first fold

for trainCvIndices, testIndices in trainTestSplit:
    # split Train, Cv, Test
    XTrainCv, yTrainCv = df.iloc[trainCvIndices, 0], df.iloc[trainCvIndices, 1]
    XTest, yTest = df.iloc[testIndices, 0], df.iloc[testIndices, 1]

    # The Cv window is the tail of the train+cv block, as long as the test window.
    test_length = len(XTest)
    cv_start = trainCvIndices[-test_length]
    trainCvSplit = [(list(range(trainCvIndices[0], cv_start)),
                     list(range(cv_start, trainCvIndices[-1]+1)))]

    # Adjacent f-string literals instead of backslash continuations, so the
    # source indentation no longer leaks into the printed line.
    print(
        f'Training : {XTrainCv.index[0].date()} -- {XTrainCv.index[-test_length-1].date()}'
        f', Cv : {XTrainCv.index[-test_length].date()} -- {XTrainCv.index[-1].date()}'
        f', Test : {XTest.index[0].date()} -- {XTest.index[-1].date()}'
    )

Training : 2021-01-01 -- 2021-03-14          , Cv : 2021-03-15 -- 2021-05-26          , Test : 2021-05-27 -- 2021-08-07
Training : 2021-01-01 -- 2021-05-26          , Cv : 2021-05-27 -- 2021-08-07          , Test : 2021-08-08 -- 2021-10-19
Training : 2021-01-01 -- 2021-08-07          , Cv : 2021-08-08 -- 2021-10-19          , Test : 2021-10-20 -- 2021-12-31
