In [19]:
import numpy as np

# Seed random
np.random.seed(0)

# Read data and setup maps for integer encoding and decoding.
with open('input.txt', 'r') as file: 
	data = file.read() 
    
chars = sorted(list(set(data))) # Sort makes model predictable (if seeded).
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

data has 57254 characters, 67 unique.


In [20]:
# Activation functions
# NOTE: Derivatives are calculated using outcomes of their primitives (which are already calculated during forward prop).
def sigmoid(input, deriv=False):
    if deriv:
        return input*(1-input)
    else:
        return 1 / (1 + np.exp(-input))

def tanh(input, deriv=False):
    if deriv:
        return 1 - input ** 2
    else:
        return np.tanh(input)

# Derivative is directly calculated in backprop (in combination with cross-entropy loss function).
def softmax(input):
    # Subtraction of max value improves numerical stability.
    e_input = np.exp(input - np.max(input))
    return e_input / e_input.sum()

# Hyper parameters
N, h_size, o_size = vocab_size, 5, vocab_size # Hidden size is set to vocab_size, assuming that level of abstractness is approximately proportional to vocab_size (but can be set to any other value).
seq_length = 5 # Longer sequence lengths allow for lengthier latent dependencies to be trained.
learning_rate = 1e-1

# Model parameter initialization
Wz = np.random.rand(h_size, N) * 0.1 - 0.05
Uz = np.random.rand(h_size, h_size) * 0.1 - 0.05
bz = np.zeros((h_size, 1))

Wr = np.random.rand(h_size, N) * 0.1 - 0.05
Ur = np.random.rand(h_size, h_size) * 0.1 - 0.05
br = np.zeros((h_size, 1))

Wh = np.random.rand(h_size, N) * 0.1 - 0.05
Uh = np.random.rand(h_size, h_size) * 0.1 - 0.05
bh = np.zeros((h_size, 1))

Wy = np.random.rand(o_size, h_size) * 0.1 - 0.05
by = np.zeros((o_size, 1))

def lossFun(inputs, targets, hprev):
    # Initialize variables
    x, z, r, h_hat, h, y, p = {}, {}, {}, {}, {-1: hprev}, {}, {} # Dictionaries contain variables for each timestep.
    sequence_loss = 0

    cntt = 0
    # Forward prop
    for eachInput in inputs:

        for t in range(len(eachInput)):
            # Set up one-hot encoded input
            x[t] = np.zeros((vocab_size, 1))
            x[t][eachInput[t]] = 1
            
            # Calculate update and reset gates
            z[t] = sigmoid(np.dot(Wz, x[t]) + np.dot(Uz, h[t-1]) + bz)
            r[t] = sigmoid(np.dot(Wr, x[t]) + np.dot(Ur, h[t-1]) + br)
            
            # Calculate hidden units
            h_hat[t] = tanh(np.dot(Wh, x[t]) + np.dot(Uh, np.multiply(r[t], h[t-1])) + bh)
            h[t] = np.multiply(z[t], h[t-1]) + np.multiply((1 - z[t]), h_hat[t])
        
        # Regular output unit
        y[cntt] = np.dot(Wy, h[t]) + by
    
        # Probability distribution
        p[cntt] = softmax(y[cntt])
        
        # Cross-entropy loss
        loss = -np.sum(np.log(p[cntt][targets[cntt]]))
        sequence_loss += loss
        cntt += 1

    # Parameter gradient initialization
    dWy, dWh, dWr, dWz = np.zeros_like(Wy), np.zeros_like(Wh), np.zeros_like(Wr), np.zeros_like(Wz)
    dUh, dUr, dUz = np.zeros_like(Uh), np.zeros_like(Ur), np.zeros_like(Uz)
    dby, dbh, dbr, dbz = np.zeros_like(by), np.zeros_like(bh), np.zeros_like(br), np.zeros_like(bz)
    dhnext = np.zeros_like(h[0])
    
    # Backward prop
    for t in reversed(range(5)):
        # ∂loss/∂y
        dy = np.copy(p[t])
        dy[targets[t]] -= 1
        
        # ∂loss/∂Wy and ∂loss/∂by
        dWy += np.dot(dy, h[t].T)
        dby += dy
        
        # Intermediary derivatives
        dh = np.dot(Wy.T, dy) + dhnext
        dh_hat = np.multiply(dh, (1 - z[t]))
        dh_hat_l = dh_hat * tanh(h_hat[t], deriv=True)
        
        # ∂loss/∂Wh, ∂loss/∂Uh and ∂loss/∂bh
        dWh += np.dot(dh_hat_l, x[t].T)
        dUh += np.dot(dh_hat_l, np.multiply(r[t], h[t-1]).T)
        dbh += dh_hat_l
        
        # Intermediary derivatives
        drhp = np.dot(Uh.T, dh_hat_l)
        dr = np.multiply(drhp, h[t-1])
        dr_l = dr * sigmoid(r[t], deriv=True)
        
        # ∂loss/∂Wr, ∂loss/∂Ur and ∂loss/∂br
        dWr += np.dot(dr_l, x[t].T)
        dUr += np.dot(dr_l, h[t-1].T)
        dbr += dr_l
        
        # Intermediary derivatives
        dz = np.multiply(dh, h[t-1] - h_hat[t])
        dz_l = dz * sigmoid(z[t], deriv=True)
        
        # ∂loss/∂Wz, ∂loss/∂Uz and ∂loss/∂bz
        dWz += np.dot(dz_l, x[t].T)
        dUz += np.dot(dz_l, h[t-1].T)
        dbz += dz_l
        
        # All influences of previous layer to loss
        dh_fz_inner = np.dot(Uz.T, dz_l)
        dh_fz = np.multiply(dh, z[t])
        dh_fhh = np.multiply(drhp, r[t])
        dh_fr = np.dot(Ur.T, dr_l)
        
        # ∂loss/∂h𝑡₋₁
        dhnext = dh_fz_inner + dh_fz + dh_fhh + dh_fr

    return sequence_loss, dWy, dWh, dWr, dWz, dUh, dUr, dUz, dby, dbh, dbr, dbz, h[5 - 1]

def sample(h, seed_ix, n):
    # Initialize first word of sample ('seed') as one-hot encoded vector.

    
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = [seed_ix]
    
    for t in range(n):
        # Calculate update and reset gates
        z = sigmoid(np.dot(Wz, x) + np.dot(Uz, h) + bz)
        r = sigmoid(np.dot(Wr, x) + np.dot(Ur, h) + br)
        
        # Calculate hidden units
        h_hat = tanh(np.dot(Wh, x) + np.dot(Uh, np.multiply(r, h)) + bh)
        h = np.multiply(z, h) + np.multiply((1 - z), h_hat)
        
        # Regular output unit
        y = np.dot(Wy, h) + by
        
        # Probability distribution
        p = softmax(y)

        # Choose next char according to the distribution
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    
    return ixes

In [21]:
inputs = []
targets = []

for k in range(0, len(data) - seq_length):
    input_seq = [char_to_ix[ch] for ch in data[k:k+seq_length]]
    target_char = char_to_ix[data[k+seq_length]]
    inputs.append(input_seq)
    targets.append(target_char)

inputs = inputs[:20]
targets = targets[:20]

In [22]:
# Set the maximum number of iterations
max_iters = 1000

n, k = 0, 0
mdWy, mdWh, mdWr, mdWz = np.zeros_like(Wy), np.zeros_like(Wh), np.zeros_like(Wr), np.zeros_like(Wz)
mdUh, mdUr, mdUz = np.zeros_like(Uh), np.zeros_like(Ur), np.zeros_like(Uz)
mdby, mdbh, mdbr, mdbz = np.zeros_like(by), np.zeros_like(bh), np.zeros_like(br), np.zeros_like(bz)
smooth_loss = -np.log(1.0/vocab_size)*seq_length

print_interval = 10


while n < max_iters:
    # Reset memory if appropriate
    if k + seq_length + 1 >= len(data) or n == 0:
        hprev = np.zeros((h_size, 1))
        k = 0



    # Get gradients for current model based on input and target sequences
    loss, dWy, dWh, dWr, dWz, dUh, dUr, dUz, dby, dbh, dbr, dbz, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    
    # Occasionally print loss information
    if n % print_interval == 0:
        print('iter %d, loss: %f, smooth loss: %f' % (n, loss, smooth_loss))

    if loss < 15:
        break

    # Update model with adagrad (stochastic) gradient descent
    for param, dparam, mem in zip([Wy,  Wh,  Wr,  Wz,  Uh,  Ur,  Uz,  by,  bh,  br,  bz],
                                  [dWy, dWh, dWr, dWz, dUh, dUr, dUz, dby, dbh, dbr, dbz],
                                  [mdWy,mdWh,mdWr,mdWz,mdUh,mdUr,mdUz,mdby,mdbh,mdbr,mdbz]):
        np.clip(dparam, -5, 5, out=dparam)
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # Small added term for numerical stability

    # Prepare for next iteration
    k += seq_length
    n += 1


iter 0, loss: 84.095243, smooth loss: 21.086535
iter 10, loss: 84.085414, smooth loss: 21.659159
iter 20, loss: 92.043522, smooth loss: 22.329859
iter 30, loss: 96.212950, smooth loss: 23.048352
iter 40, loss: 98.894114, smooth loss: 23.792405
iter 50, loss: 98.917684, smooth loss: 24.545698
iter 60, loss: 95.087404, smooth loss: 25.244008
iter 70, loss: 98.146127, smooth loss: 25.968191
iter 80, loss: 99.484027, smooth loss: 26.685002
iter 90, loss: 97.172083, smooth loss: 27.409201
iter 100, loss: 95.816251, smooth loss: 28.104362
iter 110, loss: 96.351833, smooth loss: 28.764329
iter 120, loss: 104.071489, smooth loss: 29.480316
iter 130, loss: 110.205508, smooth loss: 30.257978
iter 140, loss: 114.487346, smooth loss: 31.078939
iter 150, loss: 118.460262, smooth loss: 31.929314
iter 160, loss: 120.764970, smooth loss: 32.809553
iter 170, loss: 120.334562, smooth loss: 33.680192
iter 180, loss: 122.121260, smooth loss: 34.551838
iter 190, loss: 124.195014, smooth loss: 35.435287
ite

In [23]:
print(Wz.shape)
print(Uz.shape)

(5, 67)
(5, 5)


In [24]:
# After training, you can use the sample function to generate predictions
seed_ix = char_to_ix['k']  # Set the seed character index
num_predictions = 100  # Set the desired number of predictions
predictions = sample(hprev, seed_ix, num_predictions)
predicted_text = ''.join(ix_to_char[ix] for ix in predictions)
print('Predicted text:\n', predicted_text)

Predicted text:
 k psu psu psutipsp psu psu psu psu psu psu psuiipsu psu psu psu psp psuiipsp psuiipsp psu psu psp psu


In [25]:
inputs[57248]

IndexError: list index out of range