In [3]:
GPU = True

if GPU:
    import cupy as np
    np.cuda.set_allocator(np.cuda.MemoryPool().malloc)
else:
    import numpy as np

In [31]:
# optimizer
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr
        
    def update(self, params, grads):
        for i in range(len(params)):
            params[i] -= self.lr * grads[i]

In [94]:
def MSE(y, t):
    return 0.5 * np.sum((y-t)**2)

class TimeFC:
    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None

    def forward(self, x):
        W, b = self.params
        N, T, D = x.shape

        reshaped_x = x.reshape(N*T, -1)
        y = np.dot(reshaped_x, W) + b
        self.x = x
        y = y.reshape(N, T, -1)
        return y

    def backward(self, dy):
        W, b = self.params
        x = self.x
        N, T, D = x.shape

        dy = dy.reshape(N*T, -1)
        reshaped_x = x.reshape(N*T, -1)

        db = np.sum(dy, axis=0)
        dx = np.matmul(dy, W.T)
        dW = np.matmul(reshaped_x.T, dy)
        
        dx = dx.reshape(*x.shape)

        self.grads[0][...] = dW
        self.grads[1][...] = db

        return dx

class TimeMSE:
    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None

    def forward(self, xs, ts):
        print(xs.shape)
        print(ts.shape)
        N, T, V = xs.shape
        
        xs = xs.reshape(N*T, V)
        ts = ts.reshape(N*T)

        ls = MSE(xs, ts)
        loss = -ls / N

        self.cache = (ts, xs, (N, T, V))

        return loss

    def backward(self, dy = 1):

        ts, xs, (N, T, V) = self.cache

        dx = dy * np.sum(xs - ts) / N
        return dx

In [4]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [76]:
class LSTM:
    def __init__(self, Wx, Wh, b):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.cache = None

    def forward(self, x, h_prev, c_prev):
        # Affine transformation (Wx[f, g, i, o], Wh[f, g, i, o], b[f, g, i, o])
        Wx, Wh, b = self.params
        N, H = h_prev.shape

        A = np.matmul(x, Wx) + np.matmul(h_prev, Wh) + b

        # slice for gates and get
        forget = A[:, :H]       # NxH
        get = A[:, H:2*H]
        input = A[:, 2*H:3*H]
        output = A[:, 3*H:4*H]

        forget = sigmoid(forget)   # forget gate
        get = np.tanh(get)        # new memory
        input = sigmoid(input)    # input gate
        output = sigmoid(output)    # output gate

        c_next = (c_prev * forget) + (get * input)
        h_next = np.tanh(c_next) * output

        self.cache = (x, h_prev, c_prev, input, forget, get, output, c_next)
        return h_next, c_next

    def backward(self, dh_next, dc_next):
        Wx, Wh, b = self.params
        x, h_prev, c_prev, input, forget, get, output, c_next = self.cache

        # chain rule
        do = dh_next * np.tanh(c_next)
        do_s = do * output*(1-output)
        dt = dh_next * output
        dt_c = dt * (1-(np.tanh(c_next)**2))

        di = dt_c * get
        dg = dt_c * input
        di_s = di * input*(1-input)
        dg_t = dg * (1-(g**2))

        dc_prev = dt_c * f
        df = dt_c * c_prev
        df_s = df * forget*(1-forget)

        dA = np.hstack(df_s, dg_t, di_s, do_s)

        db = np.sum(dA, axis = 0)
        dWh = np.matmul(h_prev.T, dA)
        dh_prev = np.matmul(dA, Wh.T)
        dWx = np.matmul(x.T, dA)
        dx = np.matmul(dA, Wx.T)

        self.grads[0][...] = dWx
        self.grads[1][...] = dWh
        self.grads[2][...] = db

        return dx, dh_prev, dc_prev

In [80]:
# Time LSTM
class TimeLSTM:
    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.layers = None      # for LSTM layer
        self.h, self.c = None, None
        self.dh = None
        self.stateful = stateful

    def set_state(self, h, c=None):
        self.h, self.c = h, c

    def reset_state(self):
        self.h, self.c = None, None

    def forward(self, xs):
        Wx, Wh, b = self.params
        print(xs.shape)
        N, T, D = xs.shape      # mini-batch, time length, Dimension
        H = Wh.shape[0]         # Wh (H, 4H) H: hidden size
        
        self.layers = []        # for stacking LSTM layer (horizontal)
        hs = np.empty((N, T, H), dtype='f')   # for save (h0 ... ht)
        
        # if not stateful, initialize h and c
        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')
        if not self.stateful or self.c is None:
            self.c = np.zeros((N, H), dtype='f')

        for t in range(T):
            layer = LSTM(*self.params)
            self.h, self.c = layer.forward(xs[:, t, :], self.h, self.c)
            
            hs[:, t, :] = self.h
            self.layers.append(layer)

        return hs
        
    def backward(self, dhs):
        Wx, Wh, b = self.params
        N, T, H = dhs.shape
        D = Wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')
        dh, dc = 0, 0

        grads = [0, 0, 0]   #dWx, dWh, db
        for t in reversed(range(T)):  # BPTT
            layer = self.layers[t]
            dx, dh, dc = layer.backward(dhs[:,t ,:] + dh, dc)
            dxs[:, t, :] = dx
            for i, grad in enumerate(layer.grads):
                grads[i] += grad

        for i, grad in enumerate(grads):
            self.grads[i][...] = grad
        
        self.dh = dh
        return dxs
    

In [87]:
class Model:
    def __init__(self, time_size, hidden_size, feature_size):
        T, H, F = time_size, hidden_size, feature_size
        rand = np.random.randn

        # weights (Xavier)
        lstm_Wx = (rand(F, 4*H)/ np.sqrt(T)).astype('f')
        lstm_Wh = (rand(H, 4*H)/ np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4*H).astype('f')

        fc_W = (rand(H, 1)/ np.sqrt(H)).astype('f')
        fc_b = np.zeros(1).astype('f')

        print(lstm_Wx.shape)
        # layer
        self.layers = [
            TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True),
            TimeFC(fc_W, fc_b)
        ]
        self.loss_layer = TimeMSE()

        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads


    def predict(self, xs):
        xs = np.array(xs)
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts):
        xs = np.array(xs)
        ts = np.array(ts)
        for layer in self.layers:
            xs = layer.forward(xs)
        loss = self.loss_layer.forward(xs, ts)
        return loss

    def backward(self, dy = 1):
        dy = self.loss_layer.backward(dy)
        for layer in reversed(self.layers):
            dy = layer.backward(dy)
        return dy

    def reset_state(self):
        self.layers[0].reset_state()

In [88]:
import pandas as pd
from datetime import datetime

df_parser = lambda x: datetime.strptime(x, '%Y %m %d %H')    # string to datetime
# data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00381/PRSA_data_2010.1.1-2014.12.31.csv'
data_url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pollution.csv'
df = pd.read_csv(data_url, sep=',', parse_dates=[['year', 'month', 'day', 'hour']], date_parser=df_parser, index_col=0)

del df['No']
df.columns = ['pm2.5', 'dewp', 'temp', 'pres', 'cbwd','wind_speed', 'snow', 'rain']
df = df[24:]            # NaN values in first 24hours
df.head()

Unnamed: 0_level_0,pm2.5,dewp,temp,pres,cbwd,wind_speed,snow,rain
year_month_day_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0
2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0


In [68]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy

dataset = df.values
encoder = LabelEncoder()        # for cbwd : wind direction
dataset[:, 4] = encoder.fit_transform(dataset[:, 4])
dataset = dataset.astype('float')

scaler = MinMaxScaler()
scaled_dataset = scaler.fit_transform(dataset)

def dataset_to_sequence(dataset, t_steps):
    N = len(dataset)        # data length
    iters = N - t_steps
    dataX, dataY = [], []

    for idx in range(iters):
        x = dataset[idx:idx+t_steps]
        y = dataset[idx+1:idx+t_steps+1, 0]

        dataX.append(x)
        dataY.append(y)

    return numpy.array(dataX), numpy.array(dataY)

dataX, dataY = dataset_to_sequence(scaled_dataset, 24)
print(scaled_dataset[0], scaled_dataset[1])

[0.67701863 0.55       1.         0.         0.66666667 0.00413451
 0.         0.        ] [0.79503106 0.6        1.         0.         0.66666667 0.00822308
 0.         0.        ]


In [93]:
import time

# 80 train, 20 test
batch_size = 20
hidden_size = 100
time_size = 24

learning_rate = 0.01
epochs = 10

model = Model(time_size, hidden_size, 8)
optimizer = SGD(learning_rate)

# traning 
start_time = time.time()

data_length = len(dataX)      # train_data
iter_len = data_length // batch_size

for epoch in range(1, epochs+1):

    for iter in range(iter_len):
        idx = iter*batch_size
        batch_x = dataX[idx:idx+batch_size]
        batch_t = dataY[idx:idx+batch_size]
        print(batch_x.shape)

        loss = model.forward(batch_x, batch_t)
        model.backward()
        params, grads = model.params, model.grads
        optimizer.update(params, grads)
        total_loss += loss

        during = time.time() - start_time
        print(f'epoch : {epoch}, 반복{iter}/{iter_len}, 시간 {during}[s] loss: {loss}')

(8, 400)
(20, 24, 8)
(20, 24, 8)
(20, 24, 1)
(20, 24)


ValueError: ignored

In [None]:
20, 23, 8