In [223]:
import numpy as np

def sigmoid(input, deriv=False):
    if deriv:
        return input*(1-input)
    else:
        return 1 / (1 + np.exp(-input))

def tanh(input, deriv=False):
    if deriv:
        return 1 - input ** 2
    else:
        return np.tanh(input)

def softmax(input):
    # Subtraction of max value improves numerical stability.
    e_input = np.exp(input - np.max(input))
    return e_input / e_input.sum()


class GRUModel:
    def __init__(self, vocab_size, hidden_size):
        self.vocab_size = vocab_size
        self.h_size = hidden_size # hidden layer size
        self.learning_rate = 1e-1

        # Model parameters
        self.init_parameters()

        # AdaGrad memory
        self.init_adagrad()

    def init_parameters(self):
        # Initialize weights and biases for the gates and transformations
        self.Wz = np.random.rand(self.h_size + self.vocab_size, self.h_size) * 0.1 - 0.05
        self.bz = np.zeros((self.h_size, 1))

        self.Wr = np.random.rand(self.h_size + self.vocab_size, self.h_size) * 0.1 - 0.05
        self.br = np.zeros((self.h_size, 1))

        self.Wh = np.random.rand(self.h_size + self.vocab_size, self.h_size) * 0.1 - 0.05
        self.bh = np.zeros((self.h_size, 1))

        self.Wy = np.random.rand(self.vocab_size, self.h_size) * 0.1 - 0.05
        self.by = np.zeros((self.vocab_size, 1))

    def init_adagrad(self):
        self.mdWy = np.zeros_like(self.Wy)
        self.mdWh = np.zeros_like(self.Wh)
        self.mdWr = np.zeros_like(self.Wr)
        self.mdWz = np.zeros_like(self.Wz)
        self.mdby = np.zeros_like(self.by)
        self.mdbh = np.zeros_like(self.bh)
        self.mdbr = np.zeros_like(self.br)
        self.mdbz = np.zeros_like(self.bz)

    def forward_pass(self, inputs, hprev):
        z, r, h_hat, h, y =  {}, {}, {}, {-1: hprev}, {} # Dictionaries contain variables for each timestep.
        for t in range(len(inputs)):
            # Set up one-hot encoded input
            x = np.zeros((self.vocab_size, 1))
            x[inputs[t]] = 1
                
            # Calculate update and reset gates
            r[t] = sigmoid(np.dot(self.Wr.T, np.concatenate((h[t-1], x))) + self.br)
            z[t] = sigmoid(np.dot(self.Wz.T, np.concatenate((h[t-1], x))) + self.bz)
 
            # Calculate hidden units
            h_hat[t] = tanh(np.dot(self.Wh.T, np.concatenate(( np.multiply(r[t], h[t-1]), x)) ) + self.bh)  
            h[t] = np.multiply(z[t], h_hat[t]) + np.multiply((1 - z[t]), h[t-1])

        y = np.dot(self.Wy, h[t]) + self.by
        
        # Probability distribution
        probDis = softmax(y)
        return z, r, h_hat, h, y, probDis

    def backward_pass(self, z, r, h_hat, h, y, probDis, inputs, targets):
            # Gradients for each parameter
        dWy, dWh, dWr, dWz = np.zeros_like(self.Wy), np.zeros_like(self.Wh), np.zeros_like(self.Wr), np.zeros_like(self.Wz)
        dby, dbh, dbr, dbz = np.zeros_like(self.by), np.zeros_like(self.bh), np.zeros_like(self.br), np.zeros_like(self.bz)
        dhnext = np.zeros_like(h[0])

        # Output error
        dy = probDis.copy()
        dy -= 1  # Derivative of cross-entropy loss

        # Iterate backwards through time
        for t in reversed(range(len(inputs))):
            x = np.zeros((self.vocab_size, 1))
            x[inputs[t]] = 1
            dWy += np.dot(dy, h[t].T)
            dby += dy

            # Gradient for h
            dh = np.dot(self.Wy.T, dy) + dhnext
            # Update gate gradient
            dz = dh * (h_hat[t] - h[t-1])
            dWz += np.dot( np.concatenate((h[t-1], x)), dz.T )
            dbz += dz

            # Candidate hidden state gradient
            dh_hat = dh * z[t]
            dh_hat_raw = dh_hat * (1 - h_hat[t]**2)  # tanh derivative
            dWh += np.dot(np.concatenate((r[t] * h[t-1], x)), dh_hat_raw.T)
            dbh += dh_hat_raw

            # Reset gate gradient
            dr = np.dot(self.Wh.T, dh_hat_raw) * h[t-1]
            dWr += np.dot(np.concatenate((h[t-1], x)), dr.T)
            dbr += dr

            # Gradient for next h iteration (backpropagation through time)
            dhnext = dh * (1 - z[t]) + np.dot(self.Wh.T, dh_hat_raw) * r[t]

            # Update the gradients w.r.t input if necessary (for input embedding learning)
            # (Omitted here for simplicity, depends on your model setup)

        # Apply AdaGrad or another form of gradient normalization/clipping here if needed
        # (Omitted here for simplicity)

        return dWy, dWh, dWr, dWz, dby, dbh, dbr, dbz


    # def train(self, inputs, targets, n_iters=1000):
      
    #    for i in range(n_iters):
           

    #        for param, dparam, mem in zip([self.Wy, self.Wh, self.Wr, self.Wz, self.by, self.bh, self.br, self.bz],
    #                           [dWy, dWh, dWr, dWz, dby, dbh, dbr, dbz],
    #                           [self.mdWy, self.mdWh, self.mdWr, self.mdWz, self.mdby, self.mdbh, self.mdbr, self.mdbz]):
    #             np.clip(dparam, -5, 5, out=dparam)
    #             mem += dparam * dparam
    #             param += -self.learning_rate * dparam / np.sqrt(mem + 1e-8)  # Small added term for numerical stability


    def calculate_loss(self, targets, outputs):
        # Calculate and return the loss
        pass


    def update_parameters(self):
        # Update model parameters with gradients using AdaGrad
        pass

    def reset_state(self):
        # Reset the hidden state
        pass

    def generate_text(self, seed_ix, n):
        # Generate text starting from a seed index
        pass


In [224]:
import numpy as np

# Seed random
np.random.seed(0)

# Read data and setup maps for integer encoding and decoding.
with open('input.txt', 'r') as file: 
	data = file.read() 
    
chars = sorted(list(set(data))) # Sort makes model predictable (if seeded).
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

data has 57254 characters, 67 unique.


In [225]:
inputs = []
targets = []
seq_length = 5
oo = 0
for k in range(0, len(data) - seq_length):
    input_seq = [char_to_ix[ch] for ch in data[k:k+seq_length]]
    target_char = char_to_ix[data[k+seq_length]]
    inputs.append(input_seq)
    targets.append(target_char)

inputs = inputs[:5]
targets = targets[:5]

In [236]:
neee = GRUModel(vocab_size, 10)
h_s = np.zeros((10, 1))
for i in range(len(inputs)):
    z, r, h_hat, h, y, probDis = neee.forward_pass(inputs[i], h_s)
    output = probDis
    tgt = np.zeros((vocab_size, 1))
    tgt[targets[i]] = 1
    loss = -np.sum(tgt * np.log(output + 1e-9))

    #loss, dWy, dWh, dWr, dWz, dby, dbh, dbr, dbz, hprev = neee.backward_pass(z, r, h_hat, h, y, probDis, inputs[i], targets[i])
    dWy = np.zeros_like(neee.Wy)
    dWz = np.zeros_like(neee.Wz)
    dWh = np.zeros_like(neee.Wh)
    dWr = np.zeros_like(neee.Wr)
    dy = probDis.copy()
    dy -= 1  # Derivative of cross-entropy loss
    for t in reversed(range(len(inputs))):
            x = np.zeros((vocab_size, 1))
            x[inputs[t]] = 1
            dWy += np.dot(dy, h[t].T)

            # Gradient for h
            dh = np.dot(neee.Wy.T, dy)
            dz = dh * (h_hat[t] - h[t-1])
            dWz += np.dot( np.concatenate((h[t-1], x)), dz.T )
            
           # Candidate hidden state gradient
            dh_hat = dh * z[t]
            dh_hat_raw = dh_hat * (1 - h_hat[t]**2)  # tanh derivative
            dWh += np.dot(np.concatenate((r[t] * h[t-1], x)), dh_hat_raw.T)

            # Reset gate gradient
            print(dWh.shape)
            print(neee.Wh.shape)
            dr = np.dot(neee.Wh, dh_hat_raw) * h[t-1]
            dWr += np.dot(np.concatenate((h[t-1], x)), dr.T)
            break
    break
    

(77, 10)
(77, 10)


ValueError: operands could not be broadcast together with shapes (77,1) (10,1) 

In [None]:
# sigmoid(np.dot(self.Wz, x[t]) + np.dot(self.Uz, h[t-1]) + self.bz)

W = np.random.rand(7, 4) * 0.1 - 0.05
h = np.random.rand(4,1) * 0.1 - 0.05
h2 = np.random.rand(4,1) * 0.1 - 0.05
x = np.random.rand(3,1) * 0.1 - 0.05

In [None]:
# np.zeros((vocab_size, 1))
for t in range(len(inputs)):
            # Set up one-hot encoded input
            x = np.zeros((vocab_size, 1))
            x[inputs[t]] = 1
            print(inputs[t])


[20, 47, 50, 37, 45]
[47, 50, 37, 45, 1]
[50, 37, 45, 1, 41]
[37, 45, 1, 41, 48]
[45, 1, 41, 48, 51]


In [None]:
for t in range(5):  
    x = np.zeros((10, 1))
    x[t] = 1
    #print(x)