In [12]:
import numpy as np
import math

# RNN model
class RNN():
    def __init__(self, data, hidden_size, seq_length):
        # data preprocessing
        vocabs = list(set(data))
        data_size, vocab_size = len(data), len(vocabs)
        print(f"data has {data_size} characters, {vocab_size} unique.")
        self.char_to_idx = {ch:i for i, ch in enumerate(vocabs)} # {vocab : index, ...}
        self.idx_to_char = {i:ch for i, ch in enumerate(vocabs)} # (index : vocab, ...)
        self.vocab_size = vocab_size
        self.data_size = data_size

        # hyperparameters
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        self.batch_size = math.ceil(data_size / seq_length)

        # model parameters
        self.params = {}
        self.params['Wxh'] = np.random.randn(hidden_size, vocab_size) * 0.01 # input -> hidden
        self.params['Whh'] = np.random.randn(hidden_size, hidden_size) * 0.01 # hidden -> hidden
        self.params['Why'] = np.random.randn(vocab_size, hidden_size) * 0.01 # hidden -> output
        self.params['bh'] = np.zeros((hidden_size,1))
        self.params['by'] = np.zeros((vocab_size,1))

    def forward(self, inputs, targets, h_prev):
        """
        - inputs and targets are both list of integers
          ex. 
          inputs = [0,1,2,3,4,5]
          targets = [1,2,3,4,5,6]
        - hprev is (H,1) array of initial hidden states
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(h_prev) # initialize h
        loss = 0
        # forward pass
        for t in range(len(inputs)):
            xs[t] = np.zeros((self.vocab_size,1))
            xs[t][inputs[t]] = 1 # change to one-hot-vector
            hs[t] = np.tanh(np.dot(self.params['Wxh'], xs[t]) + np.dot(self.params['Whh'], hs[t-1]) + self.params['bh'])
            ys[t] = np.dot(self.params['Why'], hs[t]) + self.params['by']
            ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # softmax probability for next char
            loss += -np.log(ps[t][targets[t],0]) # cross-entrophy loss

            # y_class = np.zeros((vocab_size,1))
            # y_class[targets[t]] = 1
            # loss += np.sum(-np.log(y_class * ps[t]))

        cache = (xs, hs, ps)
        return loss, cache

    def backward(self, inputs, targets, cache):
        xs, hs, ps = cache
        dWxh, dWhh, dWhy = np.zeros_like(self.params['Wxh']), np.zeros_like(self.params['Whh']), np.zeros_like(self.params['Why'])
        dbh, dby = np.zeros_like(self.params['bh']), np.zeros_like(self.params['by'])
        dhnext = np.zeros_like(hs[0])
        for t in reversed(range(len(inputs))):
            dy = np.copy(ps[t])
            dy[targets[t]] -= 1
            dWhy += np.dot(dy, hs[t].T) # (vocab_size, hidden_size)
            dby += self.params['by']
            dh = np.dot(self.params['Why'].T, dy) + dhnext # (hidden_size, 1)
            dhraw = (1 - hs[t] * hs[t]) * dh
            dbh += dhraw
            dWhh += np.dot(dhraw, hs[t-1].T) # (hidden_size, hidden_size)
            dWxh += np.dot(dhraw, xs[t].T) # (hidden_size, vocab_size)
            dhnext = np.dot(self.params['Whh'].T, dhraw) # (hidden_size, 1)

        # Gradient clipping
        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out=dparam) # clip to alleviate gradient explode

        return dWxh, dWhh, dWhy, dbh, dby

    def training(self, data, learning_rate=1e-1, iterations=5000):

        # for AdaGrad optimization : grad_squared
        mWxh, mWhh, mWhy = np.zeros_like(self.params['Wxh']), np.zeros_like(self.params['Whh']), np.zeros_like(self.params['Why'])
        mbh, mby = np.zeros_like(self.params['bh']), np.zeros_like(self.params['by'])

        for iter in range(iterations):

            data_pointer = 0
            h_prev = np.zeros((self.hidden_size,1))
            for b in range(self.batch_size):

                inputs = [self.char_to_idx[ch] for ch in data[data_pointer:data_pointer+self.seq_length]]
                targets = [self.char_to_idx[ch] for ch in data[data_pointer+1:data_pointer+self.seq_length+1]] # t+1

                if (data_pointer+self.seq_length+1 >= len(data) and b == self.batch_size-1): # processing of the last part of the input data.
                    targets.append(self.char_to_idx[data[0]])   # When the data doesn't fit, add space(" ") to the back.
                    
                loss, cache = self.forward(inputs, targets, h_prev)
                dWxh, dWhh, dWhy, dbh, dby = self.backward(inputs, targets, cache)

                # AdaGrad update
                for param, dparam, mparam in zip([self.params['Wxh'], self.params['Whh'], self.params['Why'], self.params['bh'], self.params['by']],
                                                [dWxh, dWhh, dWhy, dbh, dby],
                                                [mWxh, mWhh, mWhy, mbh, mby]):
                    mparam += dparam * dparam
                    param += -learning_rate * dparam / np.sqrt(mparam + 1e-8)
                data_pointer += self.seq_length

            if iter % 100 == 0:
                msg = f"iter: {iter}, loss: {loss:.4f}"
                print(msg)

    def predict(self, test_char, length):
        idxes = []
        x = np.zeros((self.vocab_size,1))
        x[self.char_to_idx[test_char]] = 1 # one-hot-vector encoding
        h = np.zeros((self.hidden_size, 1)) # initialize h
        for _ in range(length):
            hidden = np.tanh(np.dot(self.params['Wxh'], x) + np.dot(self.params['Whh'], h) + self.params['bh'])
            out = np.dot(self.params['Why'], hidden) + self.params['by']
            p = np.exp(out) / np.sum(np.exp(out)) # softmax probability
            max_idx = np.argmax(p)
            x = np.zeros((self.vocab_size,1))
            x[max_idx] = 1
            idxes.append(max_idx)
        txt = ''.join(self.idx_to_char[i] for i in idxes)
        print(txt)

In [13]:
data = "abcdefghijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz "
hidden_size = 100
seq_length = 30

rnn = RNN(data, hidden_size, seq_length)
rnn.training(data, learning_rate=1e-1, iterations=5000)

data has 81 characters, 27 unique.
iter: 0, loss: 100.2905
iter: 100, loss: 0.2105
iter: 200, loss: 0.0916
iter: 300, loss: 0.0579
iter: 400, loss: 0.0421
iter: 500, loss: 0.0330
iter: 600, loss: 0.0271
iter: 700, loss: 0.0231
iter: 800, loss: 0.0200
iter: 900, loss: 0.0177
iter: 1000, loss: 0.0159
iter: 1100, loss: 0.0144
iter: 1200, loss: 0.0131
iter: 1300, loss: 0.0121
iter: 1400, loss: 0.0112
iter: 1500, loss: 0.0104
iter: 1600, loss: 0.0097
iter: 1700, loss: 0.0091
iter: 1800, loss: 0.0086
iter: 1900, loss: 0.0082
iter: 2000, loss: 0.0077
iter: 2100, loss: 0.0074
iter: 2200, loss: 0.0070
iter: 2300, loss: 0.0067
iter: 2400, loss: 0.0064
iter: 2500, loss: 0.0061
iter: 2600, loss: 0.0059
iter: 2700, loss: 0.0057
iter: 2800, loss: 0.0055
iter: 2900, loss: 0.0053
iter: 3000, loss: 0.0051
iter: 3100, loss: 0.0049
iter: 3200, loss: 0.0048
iter: 3300, loss: 0.0046
iter: 3400, loss: 0.0045
iter: 3500, loss: 0.0043
iter: 3600, loss: 0.0042
iter: 3700, loss: 0.0041
iter: 3800, loss: 0.0040


In [14]:
rnn.predict('a', 30)

bcdefghijklmnopqrstuvwxyz abcd
