original
* https://gist.github.com/karpathy/d4dee566867f8291f086

In [1]:
import numpy as np

In [3]:
with open('book.txt') as f:
    data = f.read()
    data = data.lower()

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(f'data has {data_size}, {vocab_size} unique')

data has 371661, 66 unique


In [11]:
print(data[:500])

capítulo primeiro
do título
uma noite destas, vindo da cidade para o engenho novo, encontrei no trem da
central um rapaz aqui do bairro, que eu conheço de vista e de chapéu.
cumprimentou-me, sentou-se ao pé de mim, falou da lua e dos ministros, e
acabou recitando-me versos. a viagem era curta, e os versos pode ser que não
fossem inteiramente maus. sucedeu, porém, que, como eu estava cansado, fechei
os olhos três ou quatro vezes; tanto bastou para que ele interrompesse a leitura e
metesse os vers


In [12]:
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

In [91]:
class T(object):
    
    def __init__(self, data, seq_len):
        self.seq_len = seq_len
        
        self.data = data
        # pointer
        self.p = 0
    
    def get(self):
        if self.p + self.seq_len + 1 > len(self.data):
            self.p = 0
        
        X = [char_to_ix[char] for char in self.data[self.p  :self.p+self.seq_len  ]]
        y = [char_to_ix[char] for char in self.data[self.p+1:self.p+self.seq_len+1]]
        
        self.p += self.seq_len
        
        return X, y

In [247]:
class RNN(object):
    
    def __init__(self, vocab_dim, h_dim, seq_len, learning_rate, seed=None):
        if seed:
            np.random.seed(seed)
    
        # vocabulary dimension
        self.vocab_dim = vocab_dim
        # hidden nodes dimension
        self.h_dim = h_dim
        # sequence length
        self.seq_len = seq_len
        
        #
        self.learning_rate = learning_rate
        
        #
        self._build()

    def _build(self):
        self._Wx = np.random.randn(self.vocab_dim, self.h_dim) * 1e-2
        
        self._Wh = np.random.randn(self.h_dim, self.h_dim) * 1e-2
        self._Wy = np.random.randn(self.h_dim, self.vocab_dim) * 1e-2
        
        self._bh = np.zeros((1, self.h_dim))
        self._by = np.zeros((1, self.vocab_dim))
        
        self._mWx, self._mWh, self._mWy = np.zeros_like(self._Wx), np.zeros_like(self._Wh), np.zeros_like(self._Wy)
        self._mbh,  self._mby = np.zeros_like(self._bh), np.zeros_like(self._by)
        
    def _forward(self, X, y, hprev):
        """
        X (seq_len, vocab_size)
        """
        h = np.dot(X, self._Wx) + np.dot(hprev, self._Wh) + self._bh
        h = np.tanh(h)
        
        y_pred = np.dot(h, self._Wy) + self._by
        y_pred = self._softmax(y_pred)
        
        loss = -np.log(y_pred[0,y])
        
        return h, y_pred, loss
    
    def _backward(self, X, y, H, hprev, Y):
        Y[y] -= 1
        
        self._dWhy += np.dot(H[None].T, Y[None])
        self._dby  += Y[None]
        
        dh = np.dot(Y[None], self._Wy.T) + self._dhnext
        dhraw = self._tanh_derivative(H) * dh
        assert dhraw.shape == (1, self.h_dim)
        
        self._dWxh += np.dot(    X[None].T, dhraw)
        self._dWhh += np.dot(hprev[None].T, dhraw)
        self._dbh  += dhraw
        
        self._dhnext = np.dot(dhraw, self._Wh.T)
        assert self._dhnext.shape == (1, self.h_dim)
    
    def _softmax(self, x):
        y = np.exp(x - np.max(x))
        return y/np.sum(y)
    
    def _tanh_derivative(self, value):
        return 1 - value**2
    
    def _clip_gradients(self):
        for dparam in [self._dWxh, self._dWhh, self._dWhy, self._dbh, self._dby]:
            np.clip(dparam, -5, 5, out=dparam)
    
    def forward_backward(self, inputs, targets, hprev):
        X = np.zeros((self.seq_len, self.vocab_dim))
        for t, input in enumerate(inputs):
            X[t, input] = 1
        
        H = np.zeros((self.seq_len, self.h_dim))
        Y = np.zeros_like(X)
        total_loss = 0
        
        for t in range(self.seq_len):
            h = hprev if t==0 else H[t-1]
            
            H[t], Y[t], loss = self._forward(X[t], targets[t], h)
            total_loss += loss
        
        self._dWxh, self._dWhh, self._dWhy = np.zeros_like(self._Wx), np.zeros_like(self._Wh), np.zeros_like(self._Wy)
        self._dbh,  self._dby = np.zeros_like(self._bh), np.zeros_like(self._by)
        self._dhnext = np.zeros_like(H[0])
        
        for t in range(self.seq_len)[::-1]:
            self._backward(X[t], targets[t], H[t], H[t-1], Y[t])
        
        self._clip_gradients()
        
        params  = [self._Wx,   self._Wh,   self._Wy,   self._bh,  self._by]
        mparams = [self._mWx,  self._mWh,  self._mWy,  self._mbh, self._mby]
        dparams = [self._dWxh, self._dWhh, self._dWhy, self._dbh, self._dby]
        
#         print(f'Wx: {np.sum(self._Wx):4.4} - Wh: {np.sum(self._Wh):4.4f} - Wy: {np.sum(self._Wy):4.4f}')
#         print(f'Wx: {np.sum(self._dWxh):4.4} - Wh: {np.sum(self._dWhh):4.4f} - Wy: {np.sum(self._dWhy):4.4f}')
        for param, dparam, mparam in zip(params, dparams, mparams):
            mparam += dparam ** 2
            param -= self.learning_rate * dparam/np.sqrt(mparam + 1e-8)
#         print(f'Wx: {np.sum(self._Wx):4.4} - Wh: {np.sum(self._Wh):4.4f} - Wy: {np.sum(self._Wy):4.4f}')
        
        return total_loss, H[-1]
    
    def sample(self, input, hprev, seq_len):
        X = np.zeros((1, self.vocab_dim))
        X[0, input] = 1
        
        ixes = []
        for t in range(seq_len):
            h = np.dot(X, self._Wx) + np.dot(hprev, self._Wh) + self._bh
            h = np.tanh(h)
            
            y = np.dot(h, self._Wy) + self._by
            y = self._softmax(y)
            
            ix = np.random.choice(range(self.vocab_dim), p=y.ravel())
            
            X *= 0
            X[0,ix] = 1
            ixes.append(ix)
        
        print('\n\n', ''.join(ix_to_char[ix] for ix in ixes))

In [None]:
seq_len = 25
threshold = len(data)//25

data_supplier = T(data, seq_len)
rnn = RNN(vocab_size, 100, seq_len, 1e-1, seed=42)

In [None]:
n = 0
while n <= 1000000:
    inputs, targets = data_supplier.get()
    if n % threshold == 0:
        hprev = np.zeros((1, 100))
   
    loss, hprev = rnn.forward_backward(inputs, targets, hprev)
    
    print(f'\riter {n:2}, loss: {loss:4.6f}', end='')
    if (n+1) % 5000 == 0:
        rnn.sample(inputs[0], hprev, 200)

    n += 1

iter 484999, loss: 35.608285
 h! pa o s be flhhí be e da nhhh cho gê eginhelhe de hhí a hus fe a fí chhe qu gu e chhenhhe pre, tufé
o ubeme eleu a chhe me po s ca po da bra pigigre be s me go da de fê diça m
lhõe be, ra to go etit
iter 485621, loss: 48.023473