In [1]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class GetMiniBatch:
    """
    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
    y : ndarray, shape (n_samples, 1)

    batch_size : int
    seed : int
      NumPy
    """
    def __init__(self, X, y, batch_size = 10, seed=0):
        self.batch_size = batch_size
        np.random.seed(seed)
        shuffle_index = np.random.permutation(np.arange(X.shape[0]))
        self.X = X[shuffle_index]
        self.y = y[shuffle_index]
        self._stop = np.ceil(X.shape[0]/self.batch_size).astype(np.int)

    def __len__(self):
        return self._stop

    def __getitem__(self,item):
        p0 = item*self.batch_size
        p1 = item*self.batch_size + self.batch_size
        return self.X[p0:p1], self.y[p0:p1]

    def __iter__(self):
        self._counter = 0
        return self

    def __next__(self):
        if self._counter >= self._stop:
            raise StopIteration()
        p0 = self._counter*self.batch_size
        p1 = self._counter*self.batch_size + self.batch_size
        self._counter += 1
        return self.X[p0:p1], self.y[p0:p1]

In [3]:
class SimpleInitializer:
    """
    Parameters
    ----------
    sigma : float
    """
    def __init__(self, sigma = 0.01):
        self.sigma = sigma

    def W(self, n_nodes1, n_nodes2):
        W = self.sigma * np.random.randn(n_nodes1, n_nodes2)
        return W

    def B(self, n_nodes2):
        B = self.sigma * np.random.randn(1, n_nodes2)
        return B

In [4]:
class FC:
    """
    Parameters
    ----------
    n_nodes1 : int
    n_nodes2 : int
    initializer
    optimizer
    """
    def __init__(self, n_nodes1, n_nodes2, initializer, optimizer, dropout_rate=0.5):
        self.optimizer = optimizer
        self.W = initializer.W(n_nodes1, n_nodes2)
        self.B = initializer.B(n_nodes2)
        self.dZ = 0
        self.dA = 0
        self.dropout_rate = dropout_rate
        self.mask = None
        self.input_X_forward = 0

    def forward(self, X):

        self.input_X_forward = X
        A = np.dot(X, self.W) + self.B

        return A

    def backward(self, dA):

        dW = np.dot(self.input_X_forward.T, dA)
        dZ = np.dot(dA, self.W.T)
        self.dA = dA
        self.dW = dW
        self.dZ = dZ
        self = self.optimizer.update(self)
        return dZ

In [5]:
class SGD:

    def __init__(self, lr):
        self.lr = lr
    def update(self, layer):

        layer.B = layer.B - self.lr * np.average(layer.dA, axis=0)
        layer.W = layer.W - self.lr * layer.dW / layer.dA.shape[0]

        return layer

In [6]:
class Sigmoid:

    def __init__(self):

        self.input_X_forward = 0

    def _func(self, X):
        return 1 / (1 + np.exp(-1 * X))

    def _func_diff(self, X):
        return (1 - self._func(X)) * self._func(X)

    def forward(self, X):

        self.input_X_forward = X
        A = self._func(X)
        return A

    def backward(self, dA):

        grad = self._func_diff(self.input_X_forward)
        dZ = grad * dA
        return dZ

In [8]:
class Tanh:

    def __init__(self):

        self.input_X_forward = 0

    def _func(self, X):
        return np.tanh(X)

    def _func_diff(self, X):
        return 1 - (self._func(X))**2

    def forward(self, X):

        self.input_X_forward = X
        A = self._func(X)
        return A

    def backward(self, dA):

        grad = self._func_diff(self.input_X_forward)
        dZ = grad * dA
        return dZ

In [9]:
class softmax:

    def __init__(self):

        self.input_X_forward = 0
        self.pred = 0

    def _func(self, X):
        X = X - np.max(X)
        tmp = np.exp(X)
        denominator = np.sum(tmp, axis=1)
        output = tmp / denominator[:, np.newaxis]
        return output

    def _func_diff(self, X):
        return X

    def forward(self, X):

        self.input_X_forward = X
        A = self._func(X)
        self.pred = A
        return A

    def backward(self, dA):

        dZ = self.pred - dA

        return dZ

In [10]:
class ReLU:

    def __init__(self):

        self.input_X_forward = 0

    def _func(self, X):
        return np.maximum(0, X)

    def _func_diff(self, X):
        return np.where( x > 0, 1, 0)

    def forward(self, X):

        self.input_X_forward = X
        A = self._func(X)
        return A

    def backward(self, dA):

        grad = self._func_diff(self.input_X_forward)
        dZ = grad * dA
        return dZ

In [11]:
class XavierInitializer:

    def __init__(self):
        self.n_prev_nodes = 1
        pass

    def W(self, n_nodes1, n_nodes2):
        self.n_prev_nodes = n_nodes1
        W = np.random.randn(n_nodes1, n_nodes2) / np.sqrt(n_nodes1)
        return W

    def B(self, n_nodes2):
        B = np.random.randn(1, n_nodes2) / np.sqrt(self.n_prev_nodes)
        return B

In [12]:
class HeInitializer:

    def __init__(self):
        self.n_prev_nodes = 1
        pass

    def W(self, n_nodes1, n_nodes2):
        self.n_prev_nodes = n_nodes1
        W = np.random.randn(n_nodes1, n_nodes2) * np.sqrt(2 / n_nodes1)
        return W

    def B(self, n_nodes2):
        B = np.random.randn(1, n_nodes2) * np.sqrt(2 / self.n_prev_nodes)
        return B

In [13]:
class AdaGrad:

    def __init__(self, lr):
        self.lr = lr
        self.H_B = 1
        self.H_W = 1
    def update(self, layer):

        self.H_B = self.H_B + np.average(layer.dA)**2
        self.H_W = self.H_W + np.average(layer.dW)**2

        layer.B = layer.B - self.lr * np.average(layer.dA, axis=0) / np.sqrt(self.H_B)
        layer.W = layer.W - self.lr * layer.dW / layer.dA.shape[0] / np.sqrt(self.H_W)

        return layer


Problem 1 Forward propagation implementation of SimpleRNN

Create a SimpleRNN class SimpleRNN. The basic structure will be the same as the FC class.

The forward propagation formula looks like this: It also describes what the shape of ndarray will be.

We denote the batch size batch_size, the number of input features n_features, and the number of RNN nodes . n_nodesThe activation function proceeds as tanh, but it can be replaced with ReLU, etc., as in previous neural networks.


In [14]:
class SimpleRNN:

    def __init__(self, W_x, B_x, W_h, initializer, optimizer, activation):
        self.optimizer = optimizer
        # 初期化
        # initializerのメソッドを使い、self.Wとself.Bを初期化する
        #self.W1 = initializer.W(n_wx_nodes1, n_wx_nodes2)
        #self.B1 = initializer.B(1)
        self.Wx = W_x
        self.Bx = B_x
        self.Wh = W_h
        self.dA = 0
        self.dW = 0
        self.W = 0
        self.B = 0
        self.input_X_forward = 0
        self.input_prev_ht_forward = 0
        self.activation = activation
        self.n_sequece = 0

    def forward(self, X):

        self.input_X_forward = X
        self.n_sequece = X.shape[1]
        tmp_prev_h = np.zeros((X.shape[1]+1, X.shape[0], self.Wx.shape[1]))
        self.input_prev_ht_forward = np.zeros((X.shape[0], X.shape[1], self.Wx.shape[1]))
        y = np.zeros((X.shape[0], X.shape[1], self.Wx.shape[1]))
        tmp_y = np.zeros((X.shape[1], X.shape[0], self.Wx.shape[1]))
        for i in range(self.n_sequece):
            Xt = X[:,i]
            #Xt:(batch, Feature)
            tmp = np.dot(Xt, self.Wx) + self.Bx + tmp_prev_h[i]
            #tmp:(batch, Node1)
            tmp_y[i] = self.activation.forward(tmp)
            #h_prev:(batch, node2)
            tmp_prev_h[i+1] = np.dot(tmp_y[i], self.Wh)

        self.input_prev_ht_forward = tmp_prev_h.transpose(1,0,2)
        y = tmp_y.transpose(1,0,2)
        return y

    def backward(self, dA):

        dz = np.zeros_like(self.input_X_forward)
        tmp_dz = dz.transpose(1,0,2)

        loss_h = np.zeros((dA.shape[0], dA.shape[1]+1, dA.shape[2]))
        for i in reversed(range(self.n_sequece)):
            loss = dA[:,i,:] + loss_h[:,i,:]
            loss = self.activation.backward(loss) * loss
            dW = np.dot(self.input_X_forward[:,i,:].T, loss)
            tmp_dz[i] = np.dot(loss, self.Wx.T)
            self.dA = loss
            self.dW = dW
            self.W = self.Wx
            self.B = self.Bx
            self = self.optimizer.update(self)
            self.Wx = self.W
            self.Bx = self.B

            loss_h[:,i+1,:] = np.dot(loss, self.Wh.T)
            self.dA = loss
            dW = np.dot(self.input_prev_ht_forward[:,i,:].T, loss)
            self.dW = dW
            self.W = self.Wh
            self.B = 0
            self = self.optimizer.update(self)
            self.Wh = self.W

        dz = tmp_dz.transpose(1,0,2)
        return dz

Problem 2 Forward propagation experiment with small sequences

Consider forward propagation on small arrays. Let input x, initial state h, weights w_x and w_h, bias b be: Here the axes of the array x are in order of batch size, number of series, and number of features.

In [15]:
x = np.array([[[1, 2], [2, 3], [3, 4]]])/100
w_x = np.array([[1, 3, 5, 7], [3, 5, 7, 8]])/100
w_h = np.array([[1, 3, 5, 7], [2, 4, 6, 8], [3, 5, 7, 8], [4, 6, 8, 10]])/100
batch_size = x.shape[0] # 1
n_sequences = x.shape[1] # 3
n_features = x.shape[2] # 2
n_nodes = w_x.shape[1] # 4
h = np.zeros((batch_size, n_nodes))
b = np.array([1])

In [16]:
rnn = SimpleRNN(w_x, 1, w_h, initializer=SimpleInitializer(), optimizer=SGD(0.01), activation=Tanh())

In [17]:
h = rnn.forward(x)

In [18]:
h.shape

(1, 3, 4)

In [19]:
h[0,2]

array([0.79494228, 0.81839002, 0.83939649, 0.85584174])

Problem 3 (Advanced assignment) Implementing backpropagation

Time-trough backpropagation

In [20]:
dA = np.array([[[0.01, 0.02, 0.03, 0.04], [0.01, 0.02, 0.03, 0.04], [0.01, 0.02, 0.03, 0.04]]])

In [21]:
rnn.backward(dA)

array([[[4.75883037e-05, 6.05642872e-05],
        [4.75883582e-05, 6.05643690e-05],
        [4.75884400e-05, 6.05644781e-05]]])

In [22]:
x.shape, w_x.shape, w_h.shape

((1, 3, 2), (2, 4), (4, 4))

In [23]:
def forward(x,h):
    for n in range(n_sequences):
        h = np.tanh(x[:, n, :] @ w_x + h @ w_h + b)
    return h

In [24]:
forward(x,h)

array([[[0.79511838, 0.81866084, 0.8397396 , 0.85623049],
        [0.79513064, 0.81867967, 0.83976345, 0.8562575 ],
        [0.79513154, 0.81868106, 0.83976521, 0.85625949]]])