In [5]:
import numpy as np
import pandas as pd

nRows = 365     #days
SEED = 42       # fixed seed so the synthetic frame is identical on every run

# Synthetic daily series: two integer columns drawn uniformly from {0, ..., 4},
# indexed by consecutive calendar days starting 2021-01-01.
rng = np.random.default_rng(SEED)
df = pd.DataFrame(rng.integers(0, 5, size=(nRows, 2)), columns=["X", "y"], index = pd.date_range("20210101", periods=nRows))
df.head()

from sklearn.model_selection import TimeSeriesSplit

n_splits = 3

# Ask for one fold more than needed: the first fold is consumed (and thereby
# skipped) right after the generator is created.
splitter = TimeSeriesSplit(n_splits=n_splits + 1)
trainTestSplit = splitter.split(df)
next(trainTestSplit) #Skip the first fold

for trainCvIndices, testIndices in trainTestSplit:
    # Column 0 is the feature, column 1 the target, for both partitions.
    trainCvFrame = df.iloc[trainCvIndices]
    testFrame = df.iloc[testIndices]
    XTrainCv, yTrainCv = trainCvFrame.iloc[:, 0], trainCvFrame.iloc[:, 1]
    XTest, yTest = testFrame.iloc[:, 0], testFrame.iloc[:, 1]

    # The last `test_length` positions of the train+cv range form the
    # validation part; everything before them is the training part.
    test_length = len(XTest)
    cvStart = trainCvIndices[-test_length]
    trainPositions = list(range(trainCvIndices[0], cvStart))
    cvPositions = list(range(cvStart, trainCvIndices[-1]+1))
    trainCvSplit = [(trainPositions, cvPositions)]

    # train model & evaluate

    print(f'Training : {XTrainCv.index[0].date()} -- {XTrainCv.index[-test_length-1].date()}\
          , Cv : {XTrainCv.index[-test_length].date()} -- {XTrainCv.index[-1].date()}\
          , Test : {XTest.index[0].date()} -- {XTest.index[-1].date()}')

Training : 2021-01-01 -- 2021-03-14          , Cv : 2021-03-15 -- 2021-05-26          , Test : 2021-05-27 -- 2021-08-07
Training : 2021-01-01 -- 2021-05-26          , Cv : 2021-05-27 -- 2021-08-07          , Test : 2021-08-08 -- 2021-10-19
Training : 2021-01-01 -- 2021-08-07          , Cv : 2021-08-08 -- 2021-10-19          , Test : 2021-10-20 -- 2021-12-31


In [10]:
def series_to_img(dataset, time_step=1, n_out=1):
    """Reframe a multivariate series into lagged (sliding-window) rows.

    Each output row holds the `time_step` previous observations followed by
    the current observation and, when `n_out > 1`, the following ones.

    Parameters
    ----------
    dataset : array-like
        Observations ordered in time, one row per step. Anything accepted by
        ``pd.DataFrame`` works; 1-D input is treated as a single feature.
    time_step : int, default 1
        Number of lagged steps (t-time_step ... t-1) to place before step t.
    n_out : int, default 1
        Number of steps starting at t to append (t, t+1, ..., t+n_out-1).
        The default reproduces the original single-step layout.

    Returns
    -------
    pd.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``.
        Every row containing a NaN is dropped, which removes the incomplete
        windows at the edges and also any window touching a NaN in the input.
    """
    frame = pd.DataFrame(dataset)
    num = frame.shape[1]      # features num
    cols, names = [], []

    # input sequence: t-n ... t-1
    for lag in range(time_step, 0, -1):
        cols.append(frame.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(num)]

    # output sequence: t ... t+n_out-1
    for lead in range(n_out):
        cols.append(frame.shift(-lead))
        if lead == 0:
            names += ['var%d(t)' % (j + 1) for j in range(num)]
        else:
            names += ['var%d(t+%d)' % (j + 1, lead) for j in range(num)]

    agg = pd.concat(cols, axis=1)
    agg.columns = names
    return agg.dropna()

In [17]:
# use beijing air pollution data
from datetime import datetime

# data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00381/PRSA_data_2010.1.1-2014.12.31.csv'
data_url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pollution.csv'

# Build the hourly timestamp from the separate year/month/day/hour columns
# after loading. This replaces read_csv's `date_parser` and nested-list
# `parse_dates`, both of which are deprecated in recent pandas releases.
date_cols = ['year', 'month', 'day', 'hour']
raw = pd.read_csv(data_url, sep=',')
timestamps = pd.to_datetime(raw[date_cols])

# Drop the date parts and the running row number 'No'; the remaining eight
# columns are renamed below.
df = raw.drop(columns=date_cols + ['No'])
df.index = pd.DatetimeIndex(timestamps, name='year_month_day_hour')
df.columns = ['pm2.5', 'dewp', 'temp', 'pres', 'cbwd','wind_speed', 'snow', 'rain']
df = df.iloc[24:]       # NaN values in first 24hours

# sklearn library for time series split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Encode the categorical wind direction inside `df` itself: the fold loop
# below reads its rows from `df`, so encoding only an array obtained from
# `df.values` would leave the strings in place for the scaler.
label_encoder = LabelEncoder()
df = df.assign(cbwd=label_encoder.fit_transform(df['cbwd']))  # for wind direction
dataset = df.values

n_inputs = 24       # lagged steps per sample
n_features = 8      # columns in df
# After reframing there are n_inputs * n_features lagged columns followed by
# n_features current-step columns. Keep only the first current-step column
# (the target) and mark the remaining n_features - 1 for removal.
del_idx = n_inputs * n_features + 1
del_cols = list(range(del_idx, del_idx + n_features - 1))

n_splits = 10

# Request one extra fold and discard the first one.
train_test_split = TimeSeriesSplit(n_splits+1).split(df)
next(train_test_split)

for train_cv_indices, test_cv_indices in train_test_split:
    train_cv = df.iloc[train_cv_indices, :]
    test_cv = df.iloc[test_cv_indices, :]

    # The last `test_length` positions of the train+cv range become the
    # validation part; everything before them is the training part.
    test_length = len(test_cv)
    train_cv_split = [(list(range(train_cv_indices[0], train_cv_indices[-test_length])),
                       list(range(train_cv_indices[-test_length], train_cv_indices[-1]+1)))]

    train = df.iloc[train_cv_split[0][0]].values
    val = df.iloc[train_cv_split[0][1]].values
    test = test_cv.values

    # reframed: each row = n_inputs lagged steps followed by the current step
    train = series_to_img(train, n_inputs)
    val = series_to_img(val, n_inputs)
    test = series_to_img(test, n_inputs)

    # drop the current-step columns listed in del_cols, so the last
    # remaining column is the first variable at step t (the target)
    train = train.drop(columns=train.columns[del_cols])
    val = val.drop(columns=val.columns[del_cols])
    test = test.drop(columns=test.columns[del_cols])

    # values
    train, val, test = train.values, val.values, test.values

    # split: each partition is sliced from its own array
    train_X, train_y = train[:, :-1], train[:, -1]
    val_X, val_y = val[:, :-1], val[:, -1]
    test_X, test_y = test[:, :-1], test[:, -1]

    # scaling: fit on the training part only, then apply to val and test
    scaler_x = MinMaxScaler()
    train_X = scaler_x.fit_transform(train_X)
    val_X = scaler_x.transform(val_X)
    test_X = scaler_x.transform(test_X)

    # reshape to (samples, 1, n_inputs, n_features)
    train_X = train_X.reshape(-1, 1, n_inputs, n_features)
    val_X = val_X.reshape(-1, 1, n_inputs, n_features)
    test_X = test_X.reshape(-1, 1, n_inputs, n_features)

    # model fit

    # model selection

    # model eval
    print(f'train : {train_cv.index[0].date()} -- {train_cv.index[-test_length-1].date()}')



train : 2010-01-02 -- 2010-06-03
train : 2010-01-02 -- 2010-11-02
train : 2010-01-02 -- 2011-04-03
train : 2010-01-02 -- 2011-09-02
train : 2010-01-02 -- 2012-02-01
train : 2010-01-02 -- 2012-07-02
train : 2010-01-02 -- 2012-12-01
train : 2010-01-02 -- 2013-05-02
train : 2010-01-02 -- 2013-10-01
train : 2010-01-02 -- 2014-03-02


In [9]:
# Load the Horton General Hospital CSV from the UCI repository.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00549/HortonGeneralHospital.csv'
df = pd.read_csv(
    data_url,
    index_col=0,    # the first CSV column becomes the row index
    sep=',',
)
df.head()

Unnamed: 0,Cardio,Resp,Hypo,Adm,year,month
1,1,0,0,413,1999,11
2,1,0,0,443,1999,12
3,1,0,0,378,2000,1
4,0,0,0,385,2000,2
5,1,0,0,388,2000,3
