In [4]:
import glob
import numpy as np
import pandas as pd

In [8]:
# Collect every .npz archive that belongs to the NYC bike dataset.
# NOTE: glob returns paths in arbitrary (filesystem-dependent) order.
npz_pattern = './datasets/NYC_bike/*.npz'
filenames = glob.glob(npz_pattern)
print(filenames)

['./datasets/NYC_bike/coord_test.npz', './datasets/NYC_bike/x_st_train.npz', './datasets/NYC_bike/NYC_bike_prev_holiday_dataset.npz', './datasets/NYC_bike/NYC_bike_day_of_week_dataset.npz', './datasets/NYC_bike/y_st_test.npz', './datasets/NYC_bike/NYC_bike_holiday_dataset.npz', './datasets/NYC_bike/temporal_train.npz', './datasets/NYC_bike/y_st_train.npz', './datasets/NYC_bike/temporal_test.npz', './datasets/NYC_bike/x_end_train.npz', './datasets/NYC_bike/x_end_test.npz', './datasets/NYC_bike/NYC_bike_min_30_dataset.npz', './datasets/NYC_bike/coord_train.npz', './datasets/NYC_bike/x_st_test.npz']


## Load Dataset

In [5]:
def load_npz(filename, col='volume'):
    """Load a single array from a ``.npz`` archive.

    args:
    - filename : path of the ``.npz`` file
    - col : name of the array stored inside the archive
    return:
    - the array stored under ``col``
    raises:
    - KeyError if the archive has no array named ``col``
    """
    # np.load returns a lazily-reading NpzFile that keeps the file open;
    # the context manager closes it once the requested array has been read.
    with np.load(filename) as archive:
        return archive[col]

In [8]:
# NOTE(review): the archives are selected by their position in the glob result,
# whose order is not guaranteed -- confirm that positions 1 and 0 really are the
# intended raw train / test volume archives.
train_path, test_path = filenames[1], filenames[0]
train_dataset = load_npz(train_path)
test_dataset = load_npz(test_path)
print("Train shape: ", np.shape(train_dataset), ", Test shape: ", np.shape(test_dataset))

# The last axis carries two channels: index 0 -> "start", index 1 -> "end".
start_train = train_dataset[:, :, :, 0]
end_train = train_dataset[:, :, :, 1]
start_test = test_dataset[:, :, :, 0]
end_test = test_dataset[:, :, :, 1]

Train shape:  (1920, 10, 20, 2) , Test shape:  (960, 10, 20, 2)


In [11]:
# Basic sizes: number of time steps in each split and in total.
num_train = len(start_train)
num_test = len(start_test)
num_row = num_train + num_test  # 1920 + 960 = 2880 for the shapes printed above

# Holidays and days preceding a holiday, keyed by month -> list of days of that month.
holiday_dict = {1: [1, 19], 2: [16]}
prev_holiday_dict = {1: [18], 2: [15]}

## Make Temporal Information

In [12]:
# Zero-filled containers for the temporal features, one row per time step:
# 48 half-hour slots, 7 day-of-week slots and two single-column 0/1 flags.
datasets_min_30 = np.zeros((num_row, 48))
datasets_dow = np.zeros((num_row, 7))
datasets_holiday = np.zeros((num_row, 1))
datasets_prev_holiday = np.zeros((num_row, 1))

In [13]:
# One-hot encode, for every row, its 30-minute slot within the day (row % 48)
# and its day slot within the week ((row // 48) % 7, slot 0 = first day of the data).
row_ids = np.arange(num_row)
datasets_min_30[row_ids, row_ids % 48] = 1
datasets_dow[row_ids, (row_ids // 48) % 7] = 1

In [19]:
# Convert (month, day) holidays into 1-based day-of-year numbers.
# prev_month_dict maps a month to the number of days that precede it in the year
# (the values correspond to a non-leap year: 31 + 28 = 59).
prev_month_dict = {1: 0, 2: 31, 3: 59, 4: 90, 5: 120, 6: 151}
holiday_list = [
    prev_month_dict[month] + day
    for month in holiday_dict
    for day in holiday_dict[month]
]
# Months are taken from holiday_dict for both lists.
prev_holiday_list = [
    prev_month_dict[month] + day
    for month in holiday_dict
    for day in prev_holiday_dict[month]
]
print("Day of Holiday 0101 ~: ", holiday_list)
print("Day of Prev.Holiday (2018/01/01 ~): ", prev_holiday_list)

Day of Holiday 01/01 ~:  [1, 19, 47]
Day of Prev.Holiday (2018/01/01 ~):  [18, 46]


In [20]:
# For convenience, subtract 1 so that the values can be used as 0-based day indices.
# NOTE: the lists are rebound to shifted copies, so running this cell twice
# shifts the days twice.
holiday_list = [day - 1 for day in holiday_list]
prev_holiday_list = [day - 1 for day in prev_holiday_list]
print("Day of Holiday (01/01 ~): ", holiday_list)
print("Day of Prev.Holiday (01/01 ~): ", prev_holiday_list)

Day of Holiday (01/01 ~):  [0, 18, 46]
Day of Prev.Holiday (01/01 ~):  [17, 45]


In [24]:
# Flag every row whose day index (48 rows per day) is a holiday / a day before a holiday.
day_of_row = np.arange(num_row) // 48
datasets_holiday[np.isin(day_of_row, holiday_list), 0] = 1
datasets_prev_holiday[np.isin(day_of_row, prev_holiday_list), 0] = 1

In [25]:
# Persist each temporal feature array; positional arrays are stored under 'arr_0'.
temporal_archives = {
    './NYC_dataset/NYC_min_30_dataset.npz': datasets_min_30,
    './NYC_dataset/NYC_day_of_week_dataset.npz': datasets_dow,
    './NYC_dataset/NYC_holiday_dataset.npz': datasets_holiday,
    './NYC_dataset/NYC_prev_holiday_dataset.npz': datasets_prev_holiday,
}
for archive_path, feature_array in temporal_archives.items():
    np.savez(archive_path, feature_array)

In [27]:
# Read the temporal feature arrays back from disk; each archive holds one
# positional array, hence the 'arr_0' key.
# NOTE: `dataset_holiay` (sic) is kept as-is because later cells use that spelling.
ARR_KEY = 'arr_0'
dataset_min_30 = load_npz('./NYC_dataset/NYC_min_30_dataset.npz', ARR_KEY)
dataset_dow = load_npz('./NYC_dataset/NYC_day_of_week_dataset.npz', ARR_KEY)
dataset_holiay = load_npz('./NYC_dataset/NYC_holiday_dataset.npz', ARR_KEY)
dataset_prev_holiday = load_npz('./NYC_dataset/NYC_prev_holiday_dataset.npz', ARR_KEY)

In [29]:
# Sanity check: all four feature arrays must have the same number of rows.
reloaded_features = (dataset_min_30, dataset_dow, dataset_holiay, dataset_prev_holiday)
print(*(np.shape(feature) for feature in reloaded_features))

(2880, 48) (2880, 7) (2880, 1) (2880, 1)


In [30]:
def train_test_split(data, idx):
    """Cut ``data`` in two at position ``idx`` along its first axis.

    args:
    - data : any sliceable sequence / array
    - idx : number of leading elements that go into the first part
    return:
    - (data[:idx], data[idx:])
    """
    head = data[:idx]
    tail = data[idx:]
    return head, tail

## Split Train&Test Period

In [31]:
# The first `num_train` rows form the training period, the remaining rows the test period.
train_index = num_train
min_30_train, min_30_test = train_test_split(dataset_min_30, train_index)
dow_train, dow_test = train_test_split(dataset_dow, train_index)
holiday_train, holiday_test = train_test_split(dataset_holiay, train_index)
prev_holiday_train, prev_holiday_test = train_test_split(dataset_prev_holiday, train_index)

## Sampler

In [32]:
# Sampling constants consumed by the sampler calls in the cells below.
LAG = 8  # window length passed as `lag` for the start-volume samples
END_LAG = 16  # window length passed as `lag` for the end-volume and temporal samples
STEP = 1  # forecasting horizon passed as `step` (an earlier value of 6 was noted here)
BIAS = END_LAG - LAG  # start offset passed as `bias` for the start-volume samples

In [33]:
def sampler(data, lag=8, bias=0, step=1, temp=False):
    """This function makes sliding-window samples of the time series data.

    Sample ``k`` starts at ``start = k + bias``; its input window is
    ``data[start:start + lag]`` and its label is
    ``data[start + lag + (step - 1)]``. Samples are produced for as long as the
    label index stays inside ``data``.

    args:
    - data : (# of data, height, width); with temp=True any array whose first
      axis is time, e.g. (# of data, # of features)
    - lag : the length of sampling (time steps per input window)
    - bias : offset added to the start index of every window (expected >= 0)
    - step : (step)-ahead forecasting label
    - temp : if True only the labels are built and returned
    return:
    - data_x (# of sample, height, width, lag)
    - data_y (# of sample, height, width, 1)
    - with temp=True only data_y is returned, without the extra trailing axis
    raises:
    - ValueError (from np.transpose) if temp is False and the windows are not
      3-dimensional; previously this was silently swallowed.
    """
    num_row = len(data)
    # Distance between the start of a window and its label.
    label_shift = lag + (step - 1)

    data_x, data_y = [], []
    # An explicit upper bound replaces the former bare ``except`` that ended the
    # loop on the first IndexError and also hid every unrelated error.
    for start_idx in range(bias, num_row - label_shift):
        data_y.append(np.array(data[start_idx + label_shift]))
        if not temp:
            # (lag, height, width) -> (height, width, lag)
            window = np.transpose(data[start_idx:start_idx + lag], [1, 2, 0])
            data_x.append(window)

    data_x, data_y = np.array(data_x), np.array(data_y)
    # Give spatial labels a trailing singleton axis so they line up with data_x.
    if not temp and data_y.ndim < 4:
        data_y = np.expand_dims(data_y, axis=-1)
    print("Sampler Return", np.shape(data_x), np.shape(data_y))

    if not temp:
        return data_x, data_y
    else:
        return data_y

In [34]:
# Start volumes use a window of LAG steps shifted by BIAS; end volumes use a
# window of END_LAG steps with no shift.
start_kwargs = dict(lag=LAG, bias=BIAS, step=STEP)
end_kwargs = dict(lag=END_LAG, step=STEP)

start_train_x, start_train_y = sampler(start_train, **start_kwargs)
start_test_x, start_test_y = sampler(start_test, **start_kwargs)
end_train_x, end_train_y = sampler(end_train, **end_kwargs)
end_test_x, end_test_y = sampler(end_test, **end_kwargs)

Sampler Return (1904, 10, 20, 8) (1904, 10, 20, 1)
Sampler Return (944, 10, 20, 8) (944, 10, 20, 1)
Sampler Return (1904, 10, 20, 16) (1904, 10, 20, 1)
Sampler Return (944, 10, 20, 16) (944, 10, 20, 1)


In [35]:
# Start- and end-volume samples must stay aligned one-to-one with each other and
# with their labels, in the train split as well as in the test split.
assert np.shape(start_train_y)[0] == np.shape(end_train_y)[0], "start/end train labels are misaligned"
assert np.shape(start_test_y)[0] == np.shape(end_test_y)[0], "start/end test labels are misaligned"
assert len(start_train_x) == len(end_train_x) == len(start_train_y), "train inputs and labels differ in length"
assert len(start_test_x) == len(end_test_x) == len(start_test_y), "test inputs and labels differ in length"

In [36]:
# Temporal features only need the rows aligned with the labels, so the sampler is
# called with temp=True (labels only) and the END_LAG window length.
temporal_kwargs = dict(lag=END_LAG, step=STEP, temp=True)

min_30_train_y = sampler(min_30_train, **temporal_kwargs)
min_30_test_y = sampler(min_30_test, **temporal_kwargs)

dow_train_y = sampler(dow_train, **temporal_kwargs)
dow_test_y = sampler(dow_test, **temporal_kwargs)

holiday_train_y = sampler(holiday_train, **temporal_kwargs)
holiday_test_y = sampler(holiday_test, **temporal_kwargs)

prev_holiday_train_y = sampler(prev_holiday_train, **temporal_kwargs)
prev_holiday_test_y = sampler(prev_holiday_test, **temporal_kwargs)

Sampler Return (0,) (1904, 48)
Sampler Return (0,) (944, 48)
Sampler Return (0,) (1904, 7)
Sampler Return (0,) (944, 7)
Sampler Return (0,) (1904, 1)
Sampler Return (0,) (944, 1)
Sampler Return (0,) (1904, 1)
Sampler Return (0,) (944, 1)


In [37]:
# Stack the temporal features column-wise in the order:
# day-of-week, 30-minute slot, previous-holiday flag, holiday flag.
train_parts = (dow_train_y, min_30_train_y, prev_holiday_train_y, holiday_train_y)
test_parts = (dow_test_y, min_30_test_y, prev_holiday_test_y, holiday_test_y)
temporal_train = np.concatenate(train_parts, axis=-1)
temporal_test = np.concatenate(test_parts, axis=-1)
print(temporal_train.shape, temporal_test.shape)

(1904, 57) (944, 57)


# Important Remark
For a fair comparison with the STDN setting, we use only the last 477 samples of the test dataset.

In [38]:
# Write the model inputs / labels to disk. Train arrays are saved in full,
# test arrays are cut down to their last 477 samples.
test_tail = slice(-477, None)
output_archives = {
    './NYC_taxi_dataset/x_st_train.npz': start_train_x,
    './NYC_taxi_dataset/x_st_test.npz': start_test_x[test_tail],
    './NYC_taxi_dataset/x_end_train.npz': end_train_x,
    './NYC_taxi_dataset/x_end_test.npz': end_test_x[test_tail],
    './NYC_taxi_dataset/y_st_train.npz': start_train_y,
    './NYC_taxi_dataset/y_st_test.npz': start_test_y[test_tail],
    './NYC_taxi_dataset/temporal_train.npz': temporal_train,
    './NYC_taxi_dataset/temporal_test.npz': temporal_test[test_tail],
}
for archive_path, payload in output_archives.items():
    np.savez(archive_path, payload)

## Coordinate Information

In [19]:
# Grid size (h, w) and sample counts, read from the sampled label arrays.
# (Tiny hand-set values -- train_num, h, w = 20, 2, 3; num_row = 20; test_num = 10 --
# were used earlier as a quick debugging override.)
train_num, h, w = start_train_y.shape[:-1]
test_num = len(start_test_y)

In [17]:
# Build an (h, w, 2) grid whose last axis holds (row index, column index) of each
# cell, then replicate it once per row of the full time series.
grid_rows, grid_cols = np.indices((h, w))
coord_y = grid_rows[..., np.newaxis]
coord_x = grid_cols[..., np.newaxis]
print(coord_y.shape)
coord_xy = np.concatenate([coord_y, coord_x], axis=-1)
print(coord_xy.shape)
coord_xy = np.repeat(coord_xy[np.newaxis], repeats=num_row, axis=0)
print(np.shape(coord_xy))

(2, 3, 1)
(2, 3, 2)
(20, 2, 3, 2)


In [47]:
# One coordinate grid per training sample, and one per retained test sample
# (the last 477 rows).
coord_train = coord_xy[:len(start_train_y)]
coord_test = coord_xy[-477:]
print(coord_train.shape, coord_test.shape)
for archive_path, coords in (
    ('./NYC_taxi_dataset/coord_train.npz', coord_train),
    ('./NYC_taxi_dataset/coord_test.npz', coord_test),
):
    np.savez(archive_path, coords)

(1904, 10, 20, 2) (477, 10, 20, 2)
