In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import mlp.data_providers as data_providers
import os

In [2]:
data_path = os.path.join(
    os.environ['MLP_DATA_DIR'], 'HadSSP_daily_qc.txt')
# load raw data from text file
window_size = 5
loaded = np.loadtxt(data_path, skiprows = 3, usecols = range(2,33))
loaded = loaded[loaded != -99.99].flatten()
print(loaded.shape)
shape = (loaded.shape[-1] - window_size + 1, window_size)
print(shape)
print(loaded.strides)
print((loaded.strides[-1], ))
print((7,) + (8,))
strides = loaded.strides + (loaded.strides[-1],)
print(strides)
windowed = np.lib.stride_tricks.as_strided(loaded, shape=shape, strides=strides)
print(windowed, windowed.shape, windowed.strides)

(30924,)
(30920, 5)
(8,)
(8,)
(7, 8)
(8, 8)
[[ 1.4   2.1   2.5   0.1   0.  ]
 [ 2.1   2.5   0.1   0.    0.  ]
 [ 2.5   0.1   0.    0.    0.9 ]
 ...
 [15.5   7.68  2.36  1.65  1.05]
 [ 7.68  2.36  1.65  1.05  0.24]
 [ 2.36  1.65  1.05  0.24  3.94]] (30920, 5) (8, 8)


In [3]:
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
z = np.lib.stride_tricks.as_strided(x, shape = (5,5), strides = (8,8))
y = np.array([[0, 1, 2, 3, 4],
              [5, 6, 7, 8, 9]])
w = np.array([[0, 1, 2, 3, 4], 
              [1, 2, 3, 4, 5],
              [2, 3, 4, 5, 6],
              [3, 4, 5, 6, 7],
              [4, 5, 6, 7, 8]])
print(x.shape, y.strides, w.strides, z.strides)
print(w)
print(z)
print(w == z)

(9,) (40, 8) (40, 8) (8, 8)
[[0 1 2 3 4]
 [1 2 3 4 5]
 [2 3 4 5 6]
 [3 4 5 6 7]
 [4 5 6 7 8]]
[[0 1 2 3 4]
 [1 2 3 4 5]
 [2 3 4 5 6]
 [3 4 5 6 7]
 [4 5 6 7 8]]
[[ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]]


In [4]:
from timeit import timeit
# strides is faster for data pre-processing
window_size = 5
# load raw data from text file
data_path = os.path.join(
    os.environ['MLP_DATA_DIR'], 'HadSSP_daily_qc.txt')
raw = np.loadtxt(data_path, skiprows = 3, usecols = range(2,33))
# filter out all missing datapoints and flatten to a vector
loaded = raw[raw != -99.99].flatten()
# normalise data to zero mean, unit standard deviation
loaded = (loaded - np.mean(loaded)) // np.std(loaded)
# two methods to do data pre-processing

def make_X1():
    # Create array of zeros the same size as our final desired array
    X1 = np.zeros([len(loaded) - window_size + 1, window_size])
    for row in range(len(X1)):
        X1[row,:] = loaded[row:row+window_size]
    return X1
 
def make_X2():
    stride = loaded.strides + (loaded.strides[-1],)
    shape = (loaded.shape[-1] - window_size + 1, window_size)
    # Get a view of the prices with shape desired_shape, strides as defined, don't write to original array 
    X2 = np.lib.stride_tricks.as_strided(loaded, shape = shape, strides = stride, writeable=False)
    return X2 
 
# timeit(make_X1) # 56.7 seconds 
# timeit(make_X2) # 6.11 seconds, over 7x faster!

In [5]:
a = make_X1()
print(a.shape)
a[:,0:-1]

(30920, 5)


array([[-1., -1., -1., -1.],
       [-1., -1., -1., -1.],
       [-1., -1., -1., -1.],
       ...,
       [ 2.,  0., -1., -1.],
       [ 0., -1., -1., -1.],
       [-1., -1., -1., -1.]])

In [11]:
window_size = 10
batch_size = 3
met_dp = data_providers.MetOfficeDataProvider(
        window_size=window_size, batch_size=batch_size,
        max_num_batches=1, shuffle_order=False)
for inputs, targets in met_dp:
    print(inputs)
    a = np.c_[inputs, targets].T
    b = [window_size - 1]*batch_size
    print(targets)

[[1.4 2.1 2.5 0.1 0.  0.  0.9 6.2 1.9]
 [2.1 2.5 0.1 0.  0.  0.9 6.2 1.9 4.9]
 [2.5 0.1 0.  0.  0.9 6.2 1.9 4.9 7.3]]
[4.9 7.3 0.8]


In [12]:
print(a)
print(b)

[[1.4 2.1 2.5]
 [2.1 2.5 0.1]
 [2.5 0.1 0. ]
 [0.1 0.  0. ]
 [0.  0.  0.9]
 [0.  0.9 6.2]
 [0.9 6.2 1.9]
 [6.2 1.9 4.9]
 [1.9 4.9 7.3]
 [4.9 7.3 0.8]]
[9, 9, 9]


In [24]:
raw = np.loadtxt(data_path, skiprows = 3, usecols = range(2,32))
loaded = raw[raw != -99.99].flatten()
filtered = raw[raw >= 0].flatten()
print(raw.shape)
print(loaded.shape)
print(filtered.shape)
mean = np.mean(filtered)
std = np.std(filtered)
normalised = (filtered - mean) / std
loaded = (loaded - np.mean(loaded)) / np.std(loaded)
print(normalised == loaded)

(1020, 30)
(30331,)
(30331,)
[ True  True  True ...  True  True  True]
