In [77]:
import numpy as np

def sigmoid(input, deriv=False):
    if deriv:
        return input * (1 - input)
    else:
        return 1 / (1 + np.exp(-input))

def tanh(input, deriv=False):
    if deriv:
        return 1 - input ** 2
    else:
        return np.tanh(input)

def softmax(input):
    e_input = np.exp(input - np.max(input))
    return e_input / e_input.sum(axis=0, keepdims=True)

class GRUModel:
    def __init__(self, vocab_size, hidden_size):
        self.vocab_size = vocab_size
        self.h_size = hidden_size
        self.learning_rate = 1e-1

        self.init_parameters()
        self.init_adagrad()

    def init_parameters(self):
        self.Wz = np.random.rand(self.h_size + self.vocab_size, self.h_size) * 0.1 - 0.05
        self.bz = np.zeros((self.h_size, 1))

        self.Wr = np.random.rand(self.h_size + self.vocab_size, self.h_size) * 0.1 - 0.05
        self.br = np.zeros((self.h_size, 1))

        self.Wh = np.random.rand(self.h_size + self.vocab_size, self.h_size) * 0.1 - 0.05
        self.bh = np.zeros((self.h_size, 1))

        self.Wy = np.random.rand(self.vocab_size, self.h_size) * 0.1 - 0.05
        self.by = np.zeros((self.vocab_size, 1))

    def init_adagrad(self):
        self.mdWy = np.zeros_like(self.Wy)
        self.mdWh = np.zeros_like(self.Wh)
        self.mdWr = np.zeros_like(self.Wr)
        self.mdWz = np.zeros_like(self.Wz)
        self.mdby = np.zeros_like(self.by)
        self.mdbh = np.zeros_like(self.bh)
        self.mdbr = np.zeros_like(self.br)
        self.mdbz = np.zeros_like(self.bz)

    def forward_pass(self, inputs):
        hprev = np.zeros((self.h_size, 1))
        z, r, h_hat, h = {}, {}, {}, {-1: hprev}

        for t in range(len(inputs)):
            x = np.zeros((self.vocab_size, 1))
            x[inputs[t]] = 1

            concat_hx = np.concatenate((h[t-1], x))
            r[t] = sigmoid(np.dot(self.Wr.T, concat_hx) + self.br)
            z[t] = sigmoid(np.dot(self.Wz.T, concat_hx) + self.bz)

            concat_hrx = np.concatenate((np.multiply(r[t], h[t-1]), x))
            h_hat[t] = tanh(np.dot(self.Wh.T, concat_hrx) + self.bh)
            h[t] = np.multiply(z[t], h[t-1]) + np.multiply(1 - z[t], h_hat[t])

        y = np.dot(self.Wy, h[t]) + self.by
        probDis = softmax(y)

        return z, r, h_hat, h, y, probDis

    def backward_pass(self, z, r, h_hat, h, y, probDis, inputs, targets):
        # Initialize the gradients for each parameter
        dWy, dby = np.zeros_like(self.Wy), np.zeros_like(self.by)
        dWh, dWr, dWz = np.zeros_like(self.Wh), np.zeros_like(self.Wr), np.zeros_like(self.Wz)
        dbh, dbr, dbz = np.zeros_like(self.bh), np.zeros_like(self.br), np.zeros_like(self.bz)

        # Initialize gradients for hidden states
        dh_next = np.zeros_like(h[0])
        dh_hat_next = np.zeros_like(h_hat[0])

        # Compute loss and gradients at output
        loss = -np.sum(np.log(probDis[targets, 0]))
        dy = probDis.copy()
        dy[targets] -= 1

        # Update gradients for Wy and by
        dWy += np.dot(dy, h[len(inputs) - 1].T)
        dby += dy

        # Backpropagation through time
        for t in reversed(range(len(inputs))):
            x = np.zeros((self.vocab_size, 1))
            x[inputs[t]] = 1
            # Derivatives of the hidden state
            dh = np.dot(self.Wy.T, dy) + dh_next
            dh_hat = dh * (1 - z[t])
            dz = dh * (h[t - 1] - h_hat[t])

            # Derivatives of candidate hidden state
            dh_hat_tilde = dh_hat * (1 - h_hat[t] ** 2)
            dWh += np.dot(np.concatenate((r[t] * h[t - 1], x), axis=0), dh_hat_tilde.T)
            dbh += dh_hat_tilde

            # Derivatives of reset gate
            dr = np.dot(self.Wh[:self.h_size, :].T, dh_hat_tilde) * h[t - 1]
            dr = dr * r[t] * (1 - r[t])
            dWr += np.dot(np.concatenate((h[t - 1], x), axis=0), dr.T)
            dbr += dr

            # Derivatives of update gate
            dz = dz * z[t] * (1 - z[t])
            dWz += np.dot(np.concatenate((h[t - 1], x), axis=0), dz.T)
            dbz += dz

            # Update dh_next for next timestep
            dh_next = dh * z[t] + np.dot(self.Wr[:, :self.h_size].T, dr) + np.dot(self.Wh[:, :self.h_size].T, dh_hat_tilde)

        # Clip gradients to prevent exploding gradients
        for dparam in [dWy, dby, dWh, dbh, dWr, dbr, dWz, dbz]:
            np.clip(dparam, -5, 5, out=dparam)

        # Update parameters using AdaGrad or similar optimizer steps
        self.update_params(dWy, dby, dWh, dbh, dWr, dbr, dWz, dbz)

        return loss

    def update_params(self, dWy, dby, dWh, dbh, dWr, dbr, dWz, dbz):
        for param, dparam, mem in zip([self.Wy, self.by, self.Wh, self.bh, self.Wr, self.br, self.Wz, self.bz],
                                    [dWy, dby, dWh, dbh, dWr, dbr, dWz, dbz],
                                    [self.mdWy, self.mdby, self.mdWh, self.mdbh, self.mdWr, self.mdbr, self.mdWz, self.mdbz]):
            mem += dparam * dparam
            param -= self.learning_rate * dparam / (np.sqrt(mem) + 1e-8)

    def train(self, inputs, targets, n_iters=100):
        for j in range(n_iters):
            for i in range(len(inputs)):
                z, r, h_hat, h, y, probDis = self.forward_pass(inputs[i])
                tgt = np.zeros((self.vocab_size, 1))
                tgt[targets] = 1
                loss = self.backward_pass(z, r, h_hat, h, y, probDis, inputs[i], targets[i])
            print(f"iteration: {j} loss: {loss}")

    def generate_text(self, inputs):
        _, _, _, _, _, probDis = self.forward_pass(inputs)
        return np.argmax(probDis, axis=0)
    


In [78]:
import numpy as np

# Seed random
np.random.seed(0)

# Read data and setup maps for integer encoding and decoding.
with open('input.txt', 'r') as file: 
	data = file.read() 
    
chars = sorted(list(set(data))) # Sort makes model predictable (if seeded).
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

data has 57254 characters, 67 unique.


In [79]:
inputs = []
targets = []
seq_length = 5
oo = 0
for k in range(0, len(data) - seq_length):
    input_seq = [char_to_ix[ch] for ch in data[k:k+seq_length]]
    target_char = char_to_ix[data[k+seq_length]]
    inputs.append(input_seq)
    targets.append(target_char)

inputs = inputs[:100]
targets = targets[:100]

In [80]:
print(type(targets))

<class 'list'>


In [81]:
neee = GRUModel(vocab_size, 10)
neee.train(inputs, targets,100)
    

ValueError: shapes (10,77) and (10,1) not aligned: 77 (dim 1) != 10 (dim 0)

In [None]:
hh = neee.generate_text(inputs[2])
print(ix_to_char[targets[5]])
print(hh)

m
[1]
