In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time
sns.set()

In [2]:
def get_vocab(file, lower = False):
    with open(file, 'r') as fopen:
        data = fopen.read()
    if lower:
        data = data.lower()
    vocab = list(set(data))
    return data, vocab

def embed_to_onehot(data, vocab):
    onehot = np.zeros((len(data), len(vocab)), dtype = np.float32)
    for i in range(len(data)):
        onehot[i, vocab.index(data[i])] = 1.0
    return onehot

In [3]:
text, text_vocab = get_vocab('input.txt', lower = False)
onehot = embed_to_onehot(text, text_vocab)

In [4]:
learning_rate = 0.01
batch_size = 64
sequence_length = 12
epoch = 1000
num_layers = 2
size_layer = 128
possible_batch_id = range(len(text) - sequence_length - 1)
dimension = onehot.shape[1]
epsilon = 1e-8

U = np.random.randn(size_layer, dimension) / np.sqrt(size_layer)
U_g = np.zeros(U.shape)
Wf = np.random.randn(size_layer, size_layer) / np.sqrt(size_layer)
Wf_g = np.zeros(Wf.shape)
Wi = np.random.randn(size_layer, size_layer) / np.sqrt(size_layer)
Wi_g = np.zeros(Wi.shape)
Wc = np.random.randn(size_layer, size_layer) / np.sqrt(size_layer)
Wc_g = np.zeros(Wc.shape)
Wo = np.random.randn(size_layer, size_layer) / np.sqrt(size_layer)
Wo_g = np.zeros(Wo.shape)
V = np.random.randn(dimension, size_layer) / np.sqrt(dimension)
V_g = np.zeros(V.shape)

In [5]:
def tanh(x, grad=False):
    if grad:
        output = np.tanh(x)
        return (1.0 - np.square(output))
    else:
        return np.tanh(x)
    
def sigmoid(x, grad=False):
    if grad:
        return sigmoid(x) * (1 - sigmoid(x))
    else:
        return 1 / (1 + np.exp(-x))
    
def softmax(x):
    exp_scores = np.exp(x - np.max(x))
    return exp_scores / (np.sum(exp_scores, axis=1, keepdims=True) + 1e-8)

def derivative_softmax_cross_entropy(x, y):
    delta = softmax(x)
    delta[range(X.shape[0]), y] -= 1
    return delta

def forward_multiply_gate(w, x):
    return np.dot(w, x)

def backward_multiply_gate(w, x, dz):
    dW = np.dot(dz.T, x)
    dx = np.dot(w.T, dz.T)
    return dW, dx

def forward_add_gate(x1, x2):
    return x1 + x2

def backward_add_gate(x1, x2, dz):
    dx1 = dz * np.ones_like(x1)
    dx2 = dz * np.ones_like(x2)
    return dx1, dx2

def cross_entropy(Y_hat, Y, epsilon=1e-12):
    Y_hat = np.clip(Y_hat, epsilon, 1. - epsilon)
    N = Y_hat.shape[0]
    return -np.sum(np.sum(Y * np.log(Y_hat+1e-9))) / N

def forward_recurrent(x, c_state, h_state, U, Wf, Wi, Wc, Wo, V):
    mul_u = forward_multiply_gate(x, U.T)
    mul_Wf = forward_multiply_gate(h_state, Wf.T)
    add_Wf = forward_add_gate(mul_u, mul_Wf)
    f = sigmoid(add_Wf)
    mul_Wi = forward_multiply_gate(h_state, Wi.T)
    add_Wi = forward_add_gate(mul_u, mul_Wi)
    i = sigmoid(add_Wi)
    mul_Wc = forward_multiply_gate(h_state, Wc.T)
    add_Wc = forward_add_gate(mul_u, mul_Wc)
    c_hat = tanh(add_Wc)
    C = c_state * f + i * c_hat
    mul_Wo = forward_multiply_gate(h_state, Wo.T)
    add_Wo = forward_add_gate(mul_u, mul_Wo)
    o = sigmoid(add_Wo)
    h = o * tanh(C)
    mul_v = forward_multiply_gate(h, V.T)
    return (mul_u, mul_Wf, add_Wf, mul_Wi, add_Wi, mul_Wc, add_Wc, C, mul_Wo, add_Wo, h, mul_v, i, o, c_hat)

def backward_recurrent(x, c_state, h_state, U, Wf, Wi, Wc, Wo, V, d_mul_v, saved_graph):
    mul_u, mul_Wf, add_Wf, mul_Wi, add_Wi, mul_Wc, add_Wc, C, mul_Wo, add_Wo, h, mul_v, i, o, c_hat = saved_graph
    dV, dh = backward_multiply_gate(V, h, d_mul_v)
    dC = tanh(C, True) * o * dh.T
    do = tanh(C) * dh.T
    dadd_Wo = sigmoid(add_Wo, True) * do
    dmul_u1, dmul_Wo = backward_add_gate(mul_u, mul_Wo, dadd_Wo)
    dWo, dprev_state = backward_multiply_gate(Wo, h_state, dmul_Wo)
    dc_hat = dC * i
    dadd_Wc = tanh(add_Wc, True) * dc_hat
    dmul_u2, dmul_Wc = backward_add_gate(mul_u, mul_Wc, dadd_Wc)
    dWc, dprev_state = backward_multiply_gate(Wc, h_state, dmul_Wc)
    di = dC * c_hat
    dadd_Wi = sigmoid(add_Wi, True) * di
    dmul_u3, dmul_Wi = backward_add_gate(mul_u, mul_Wi, dadd_Wi)
    dWi, dprev_state = backward_multiply_gate(Wi, h_state, dmul_Wi)
    df = dC * c_state
    dadd_Wf = sigmoid(add_Wf, True) * df
    dmul_u4, dmul_Wf = backward_add_gate(mul_u, mul_Wf, dadd_Wf)
    dWf, dprev_state = backward_multiply_gate(Wf, h_state, dmul_Wf)
    dU, dx = backward_multiply_gate(U, x, dmul_u4)
    return (dU, dWf, dWi, dWc, dWo, dV)

In [6]:
for i in range(epoch):
    batch_x = np.zeros((batch_size, sequence_length, dimension))
    batch_y = np.zeros((batch_size, sequence_length, dimension))
    batch_id = random.sample(possible_batch_id, batch_size)
    prev_c = np.zeros((batch_size, size_layer))
    prev_h = np.zeros((batch_size, size_layer))
    for n in range(sequence_length):
        id1 = [k + n for k in batch_id]
        id2 = [k + n + 1 for k in batch_id]
        batch_x[:,n,:] = onehot[id1, :]
        batch_y[:,n,:] = onehot[id2, :]
    layers = []
    out_logits = np.zeros((batch_size, sequence_length, dimension))
    for n in range(sequence_length):
        layers.append(forward_recurrent(batch_x[:,n,:], prev_c, prev_h, U, Wf, Wi, Wc, Wo, V))
        prev_c = layers[-1][7]
        prev_h = layers[-1][10]
        out_logits[:, n, :] = layers[-1][-4]
    probs = softmax(out_logits.reshape((-1, dimension)))
    y = np.argmax(batch_y.reshape((-1, dimension)),axis=1)
    accuracy = np.mean(np.argmax(probs,axis=1) == y)
    loss = cross_entropy(probs, batch_y.reshape((-1, dimension)))
    delta = probs
    delta[range(y.shape[0]), y] -= 1
    delta = delta.reshape((batch_size, sequence_length, dimension))
    dU = np.zeros(U.shape)
    dV = np.zeros(V.shape)
    dWf = np.zeros(Wf.shape)
    dWi = np.zeros(Wi.shape)
    dWc = np.zeros(Wc.shape)
    dWo = np.zeros(Wo.shape)
    prev_c = np.zeros((batch_size, size_layer))
    prev_h = np.zeros((batch_size, size_layer))
    for n in range(sequence_length):
        d_mul_v = delta[:, n, :]
        dU_t, dWf_t, dWi_t, dWc_t, dWo_t, dV_t = backward_recurrent(batch_x[:,n,:], prev_c, prev_h, U, Wf, Wi, 
                                                                    Wc, Wo, V, d_mul_v, layers[n])
        prev_c = layers[n][7]
        prev_h = layers[n][10]
        dU += dU_t
        dV += dV_t
        dWf += dWf_t
        dWi += dWi_t
        dWc += dWc_t
        dWo += dWo_t
    U_g += dU ** 2
    U += -learning_rate * dU / np.sqrt(U_g + epsilon)
    V_g += dV ** 2
    V += -learning_rate * dV / np.sqrt(V_g + epsilon)
    Wf_g += dWf ** 2
    Wf += -learning_rate * dWf / np.sqrt(Wf_g + epsilon)
    Wi_g += dWi ** 2
    Wi += -learning_rate * dWi / np.sqrt(Wi_g + epsilon)
    Wc_g += dWc ** 2
    Wc += -learning_rate * dWc / np.sqrt(Wc_g + epsilon)
    Wo_g += dWo ** 2
    Wo += -learning_rate * dWo / np.sqrt(Wo_g + epsilon)
    if (i+1) % 50 == 0:
        print('epoch %d, loss %f, accuracy %f'%(i+1, loss, accuracy))

epoch 50, loss 3.047362, accuracy 0.190104
epoch 100, loss 2.894192, accuracy 0.191406
epoch 150, loss 2.825371, accuracy 0.205729
epoch 200, loss 2.735073, accuracy 0.255208
epoch 250, loss 2.697010, accuracy 0.251302
epoch 300, loss 2.532525, accuracy 0.270833
epoch 350, loss 2.617189, accuracy 0.269531
epoch 400, loss 2.583401, accuracy 0.295573
epoch 450, loss 2.540628, accuracy 0.272135
epoch 500, loss 2.435920, accuracy 0.316406
epoch 550, loss 2.473716, accuracy 0.289062
epoch 600, loss 2.457829, accuracy 0.250000
epoch 650, loss 2.491240, accuracy 0.287760
epoch 700, loss 2.433066, accuracy 0.304688
epoch 750, loss 2.390053, accuracy 0.302083
epoch 800, loss 2.384216, accuracy 0.302083
epoch 850, loss 2.463424, accuracy 0.282552
epoch 900, loss 2.417894, accuracy 0.273438
epoch 950, loss 2.394249, accuracy 0.287760
epoch 1000, loss 2.387533, accuracy 0.287760
