In [1]:
import numpy as np
import time

def relu(x, d=0):
    """Rectified linear unit, or its derivative when ``d`` is truthy.

    Parameters
    ----------
    x : array_like
        Pre-activation values. The input is never modified.
    d : bool or int, optional
        Falsy (default) for the forward pass; truthy during backprop to
        obtain the derivative instead of the activation.

    Returns
    -------
    numpy.ndarray
        Forward mode: ``max(x, 0)`` element-wise.
        Derivative mode: an integer array holding 1 where ``x > 0`` and 0
        elsewhere.
    """
    x = np.asarray(x)
    if d:
        # The slope of ReLU is 1 on the positive side and 0 otherwise; the
        # subgradient at exactly 0 is taken to be 0.
        return np.where(x > 0, 1, 0)
    # np.maximum allocates a fresh array, so the caller's buffer stays intact.
    return np.maximum(x, 0)

def softmax(x, d=0):
    """Softmax over the first axis, or a pass-through derivative mask.

    Parameters
    ----------
    x : array_like
        Logits. For a 1-D input the result sums to 1.
    d : bool or int, optional
        Falsy (default) for the forward pass; truthy during backprop.

    Returns
    -------
    numpy.ndarray
        Forward mode: ``exp(x) / sum(exp(x))`` normalised along axis 0.
        Derivative mode: an integer array of ones with the shape of ``x``.

    Notes
    -----
    The derivative branch has always produced all ones (softmax outputs are
    never negative), so an upstream gradient passes through unchanged.
    NOTE(review): this presumably relies on the loss derivative already being
    taken with respect to the logits -- confirm against the loss in use.
    """
    x = np.asarray(x, dtype=float)
    if d:
        return np.ones(x.shape, dtype=int)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return x
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps np.exp from overflowing to inf (and inf / inf = nan) on big logits.
    shifted = x - np.max(x, axis=0)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0)

def KL_divergence(X, Y, d=0):
    """Loss between a prediction ``X`` and a target ``Y``, or its derivative.

    Parameters
    ----------
    X : array_like
        Predicted values (the argument of the logarithm).
    Y : array_like
        Target values, same length as ``X``.
    d : bool or int, optional
        Falsy (default) returns the loss value; truthy (during backprop)
        returns the derivative.

    Returns
    -------
    numpy.floating or numpy.ndarray
        Loss mode: ``-Y . log(X)``. This is the cross-entropy term, which
        differs from KL(Y || X) only by the entropy of ``Y`` (zero when ``Y``
        is one-hot).
        Derivative mode: ``(X - Y)`` as floats (transposed for 2-D input).

    Notes
    -----
    Derived formula obtained from
    http://proceedings.mlr.press/v37/theis15-supp.pdf
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    if d:
        # Multiplying by an identity matrix was a no-op costing O(n^2) time
        # and memory; the transpose alone gives the same values and shape.
        return (X - Y).T
    # Clip away exact zeros so that 0 * log(0) contributes 0 instead of nan.
    smallest = np.finfo(float).tiny
    return -Y.dot(np.log(np.clip(X, smallest, None)))

# Shape(3, 3)
# 3 input neurons and one layer with 10 hidden neurons and an output of three neurons

def rnn(shape, activation):
    """Placeholder for a recurrent-network builder; nothing is implemented yet.

    Both arguments are currently ignored and the function returns ``None``.
    """
    # TODO: the stub's notes mention an example shape [3, 10, 3, 1] and
    # returning a layer_stack.
    return None

def initialize_layer(this_layer_neurons, previous_layer_neurons, 
                     activation_function=relu): 
    """Create one dense layer as a dict of parameters.

    The layer is evaluated by ``activate_layer`` as ``f(a.dot(W) + b)``, so
    despite the parameter names the first argument is the length of the
    layer's input vector and the second is the length of its output.

    Parameters
    ----------
    this_layer_neurons : int
        Number of rows of ``W`` (length of the input ``a``).
    previous_layer_neurons : int
        Number of columns of ``W`` (length of the bias ``b`` and of the
        layer's output).
    activation_function : callable, optional
        Called as ``f(z, d=...)``; defaults to ``relu``.

    Returns
    -------
    dict
        ``"W"``: standard-normal weights of shape
        ``(this_layer_neurons, previous_layer_neurons)``;
        ``"b"``: biases filled with 0.001;
        ``"f"``: the activation function;
        ``"a"``: a zero placeholder for the layer's input.
    """
    n_inputs, n_outputs = this_layer_neurons, previous_layer_neurons
    weights = np.random.standard_normal((n_inputs, n_outputs))
    # http://cs231n.github.io/neural-networks-2/ initialized at 0.001
    biases = np.full(n_outputs, 0.001)
    # The placeholder must have one entry per row of W, otherwise a.dot(W)
    # raises for every non-square layer until a real input is stored.
    placeholder_input = np.zeros(n_inputs)
    return {
        "W": weights,
        "b": biases,
        "f": activation_function,
        "a": placeholder_input,
    }

def activate_layer(layer, d=0):
    """Evaluate one layer: apply its activation to ``a.dot(W) + b``.

    Parameters
    ----------
    layer : dict
        Must provide ``"W"`` (weights), ``"a"`` (the input stored for this
        layer), ``"b"`` (bias) and ``"f"`` (activation function).
    d : bool or int, optional
        Forwarded to the activation as its ``d`` keyword; truthy during
        backprop.

    Returns
    -------
    Whatever ``layer["f"]`` returns for the computed logits.
    """
    weights, inputs, bias, activation = (
        layer[key] for key in ("W", "a", "b", "f")
    )
    # Logits, i.e. the pre-nonlinearity activation.
    logits = inputs.dot(weights) + bias
    return activation(logits, d=d)

def add_layer(layer, L=[]):
    """Return a new list made of the layers in ``L`` followed by ``layer``.

    ``layer`` is expected to be a dict holding weights, bias, activation
    function and the previous layer's activation output. ``L`` itself is
    left untouched (list concatenation builds a fresh list), so the shared
    default list never accumulates entries between calls.
    """
    appended = [layer]
    return L + appended

def feed_input_to_stack(input_data, layer_stack):
    """Store ``input_data`` as the input (``"a"``) of the first layer.

    The first layer dict is modified in place and the same ``layer_stack``
    object is returned.
    """
    first_layer = layer_stack[0]
    first_layer["a"] = input_data
    return layer_stack

def forward_pass(layer_stack):
    """Run every layer in order, feeding each output into the next layer.

    The first layer uses whatever is already stored under its ``"a"`` key;
    every later layer has its ``"a"`` overwritten (in place) with the output
    of the layer before it.

    Returns
    -------
    tuple
        ``(output_of_last_layer, layer_stack)``; the output is ``None`` when
        the stack is empty.
    """
    previous_output = None
    for layer in layer_stack:
        if previous_output is not None:
            layer["a"] = previous_output
        previous_output = activate_layer(layer)
    return previous_output, layer_stack

def back_prop(layer_stack, dloss):
    """Unfinished backward pass over the layers before the last one.

    Current behaviour: only the final entry of ``layer_stack[:-1]`` is
    visited, because the loop ends with an unconditional ``break``. For that
    layer the activation derivative is multiplied element-wise with
    ``dloss``, the product is printed, and ``dz`` is computed but neither
    stored nor returned. The function returns ``None``.
    """
    layers_before_last = layer_stack[:-1]
    for current in reversed(layers_before_last):
        # z is recomputed inside activate_layer instead of being cached,
        # trading a little compute for memory and fewer lines of code.
        activation_grad = activate_layer(current, d=True)
        # Input that was fed to this layer (the previous activation output).
        layer_input = current["a"]
        delta = np.multiply(dloss, activation_grad)
        print(delta)
        dz = delta.dot(layer_input)
        break

# Length-100 input vector: 99 zeros followed by a single 1.
input_data = np.array([0 for _ in range(99)] + [1])    
    
# Disabled four-layer configuration (presumably 100 -> 3 -> 10 -> 3 -> 100,
# given that activate_layer computes a.dot(W) + b):
#layer_stack = add_layer(initialize_layer(100, 3, relu))
#layer_stack = add_layer(initialize_layer(3, 10, relu), layer_stack)
#layer_stack = add_layer(initialize_layer(10, 3, relu), layer_stack)
#layer_stack = add_layer(initialize_layer(3, 100, softmax), layer_stack)
#layer_stack = feed_input_to_stack(input_data, layer_stack)

# Active configuration: a single relu layer.
# NOTE(review): feed_input_to_stack is commented out above, so input_data is
# never stored in this stack -- confirm that is intended.
# NOTE(review): the recorded output of this cell contains 100-element arrays,
# which presumably came from the disabled four-layer stack rather than from
# this single layer -- re-run on a fresh kernel to confirm.
layer_stack = add_layer(initialize_layer(10, 3, relu))

# Forward pass, then the loss value and its derivative (d=True) computed
# against input_data as the target.
out, layer_stack = forward_pass(layer_stack)
loss = KL_divergence(out, input_data)
dloss = KL_divergence(out, input_data, d=True)
print(loss)
print(dloss)
print(out)

# back_prop is still work in progress: it prints an intermediate product and
# returns nothing.
back_prop(layer_stack, dloss)
#for l, y in layer_stack: print(l,)
#activate_layer(W1, b1, relu)

7.763728430491088
[ 1.28683122e-04  6.87305926e-04  1.96977287e-05  1.06910455e-01
  6.70484202e-05  8.89449206e-03  4.68304275e-03  7.26125219e-03
  9.57541645e-04  5.89843765e-04  9.18477513e-05  1.69754876e-04
  1.12025435e-03  1.59886052e-03  7.99938106e-03  1.23569070e-04
  8.48263493e-05  2.81502864e-03  1.44517938e-03  2.79573771e-02
  5.00001131e-05  1.21391439e-02  1.48331266e-03  1.98616707e-02
  7.82199728e-05  3.88906271e-05  1.09189667e-02  3.29399329e-03
  2.34578239e-02  9.39871519e-05  1.76311410e-03  3.08748829e-04
  3.59665466e-04  5.60392341e-03  4.04458267e-04  2.87215836e-05
  5.14324306e-03  7.52385372e-04  1.50987171e-04  5.98638842e-05
  7.46385949e-03  1.69990376e-04  4.79140627e-03  1.98474069e-02
  1.76738554e-03  2.01260575e-02  2.09155470e-02  6.72387947e-04
  1.30957015e-02  4.38618181e-02  4.70914038e-04  1.68345510e-03
  5.19008353e-03  3.48707063e-03  1.96088496e-06  4.68117711e-02
  2.07288461e-04  1.12411821e-04  2.96875238e-04  7.70784013e-05
  3.857

ValueError: operands could not be broadcast together with shapes (100,) (3,) 

In [2]:
# Display the last layer's dict from the stack built in the first cell.
layer_stack[-1]

{'W': array([[-1.01021815e+00, -5.76838027e-01, -1.79904516e+00,
          2.32386539e+00, -1.43571296e+00,  1.09281349e+00,
          7.79127919e-01,  1.09636225e+00,  4.28375761e-01,
         -7.98242137e-01, -1.20590313e+00, -1.38917876e+00,
          8.28312075e-02, -5.65594187e-03,  9.99967186e-01,
         -1.24186656e+00, -1.01583585e+00,  3.86691121e-01,
          8.50199074e-02,  1.80029894e+00, -1.33764108e+00,
          1.18521292e+00,  3.82608203e-01,  1.14476559e+00,
         -1.17031402e+00, -1.73409200e+00,  1.05582941e+00,
         -9.59102425e-02,  1.24104277e+00, -1.35648320e+00,
          1.81294122e-01, -3.86528001e-01, -3.88608829e-01,
          4.35757865e-01, -9.57757592e-01, -1.91095031e+00,
          4.83586560e-01, -4.07495477e-01, -9.05027166e-01,
         -1.20617998e+00,  6.21360664e-01, -9.49408126e-01,
          5.29798010e-01,  1.85742697e+00,  7.13410254e-03,
          1.59637506e+00,  1.63837454e+00, -2.74462330e-01,
          1.10495318e+00,  1.541147

In [88]:
# Sum of the element-wise product of dloss and out.
# NOTE(review): relies on `dloss` and `out` left in the kernel by the first cell.
np.multiply(dloss,out).sum()

-1.734723475976807e-17

In [77]:
# NOTE(review): `a` is only ever assigned as a local inside activate_layer and
# back_prop in the visible cells, never at module level, hence the NameError
# recorded for this cell.
a

NameError: name 'a' is not defined

In [47]:
def one_hot_vectorizer(tokenized_text):
    """Build one-hot encodings for the distinct tokens of a text.

    Parameters
    ----------
    tokenized_text : iterable of hashable
        Tokens, possibly with repeats. May be empty.

    Returns
    -------
    word2onehot : dict
        Maps each distinct token to a row of an identity matrix whose side
        is the vocabulary size.
    onehotargmax2word : dict
        Maps the position of the 1 in a token's vector (an ``int``) back to
        the token.

    Notes
    -----
    Indices follow the order in which tokens first appear, so the mapping is
    the same on every run. An empty input yields two empty dicts.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a
    # set of strings can produce a different order from one run to the next.
    vocabulary = list(dict.fromkeys(tokenized_text))
    onehot = np.identity(len(vocabulary))

    word2onehot = dict(zip(vocabulary, onehot))
    # Arrays cannot be dict keys, so the reverse map is keyed by the position
    # of the 1 in each row, which for an identity matrix is the row index.
    onehotargmax2word = dict(enumerate(vocabulary))

    return word2onehot, onehotargmax2word
    
    
# Sample sentence tokenised on whitespace only, so punctuation stays attached
# to its word (e.g. "test" and "test?" are distinct tokens).
tokenized_text = "this is a test to se how well it all works -did i say it was a test?".split()

# w2o: token -> one-hot vector; o2w: index of the 1 -> token.
w2o, o2w = one_hot_vectorizer(tokenized_text)