In [2]:
import numpy as np
from numpy import matrix as M
import time

def relu(x, d=0):
    #d==True during backprop
    x[x < 0] = 0
    if d: x = np.where(x <= 0, 1, 0)
    return x

def sigma(x, d=0):
    x = np.divide(1, (1 + np.exp(-x))) 
    if d: x = x - (1 - x)
    return x

def softmax(x, d=0):
    #d==True during backprop
    x = np.divide(np.exp(x), sum(np.exp(x))) #if not d else x.T.dot(1-x) #OBS solve where i != j
    return x

def KL_divergence(X, Y, d=0):
    #d==True during backprop
    #
    #Below derived formula obtained from http://proceedings.mlr.press/v37/theis15-supp.pdf
    #The derivative of the KL divergence with respect to the output is np.dot((X-Y).T, np.identity(X.shape[0]))
    #
    #Below derived formula obtained from: https://deepnotes.io/softmax-crossentropy
    #the derivative of the loss with respect to the softmax-crossentropy is simply Y-X 
    #(so the following derivative only works if the last layer is a softmax layer)
    loss = - Y.dot(np.log(X)) if not d else np.dot((X-Y).T, np.identity(X.shape[0])) #X - Y
    return loss

# Shape(3, 3)
# 3 input neurons and one layer with 10 hidden neurons and an output of three neurons

def rnn(shape, activation):
    # [3, 10, 3 ,1]
    # return layer_stack
    pass

def initialize_layer(previous_layer_neurons, this_layer_neurons,
                     activation_function=relu): 
    
    size = (previous_layer_neurons, this_layer_neurons)
    weights = np.random.standard_normal(size)
    # http://cs231n.github.io/neural-networks-2/ initialized at 0.001
    biases = np.full(this_layer_neurons, 0.001)
    previous_activation = None #ready for input size(previous layer neuron)
    
    layer = dict([("W", weights), ("b", biases), ("f", activation_function), 
                  ("a", previous_activation)])
    return layer

def activate_layer(layer, d=0):
    #this_layer_weights
    W = layer["W"]
    #previous_layer_output
    a = layer["a"]
    #bias
    b = layer["b"]
    #activation_function
    f = layer["f"]
    #logits aka. pre-nonlinearity activations
    #print("b-" if d else "f-", "a", a.shape,"W",W.shape,"b", b.shape)
    z = a.dot(W) + b #if not d else a.dot(W.T) #OBS! needs + bias_t-1
    #print(z.shape)
    return f(z, d=d)

def add_layer(layer, L=[]):
    #a layer must be a dict of weigts, bias, activation function, 
    #and a tensor of the previous layers activation output
    return L + [layer]

def feed_input_to_stack(input_data, layer_stack):
    #set previous_layer_activation of first layer to be the input
    layer_stack[0]["a"] = input_data
    return layer_stack

def forward_pass(layer_stack, train_data):
    #previous_activation output
    out = train_data
    for i, layer in enumerate(layer_stack):
        layer_stack[i]["a"] = out
        out = activate_layer(layer)
        #print("f",i)
    return out, layer_stack

def back_prop(layer_stack, dloss):
    #loss of output layer
    g = dloss
    layer_out = layer_stack[-1]
    W = layer_out["W"]
    
    dzy_out = activate_layer(layer_out, d=True) #dy/dz
    g = np.multiply(g, dzy_out)
    
    dWz_out = layer_out["a"] #dzt/dWt = at-1
    dW_out = M(dWz_out).T.dot(M(g))    
    g = W.dot(g.T)
    
    #stochastic gradient update
    layer_stack[-1]["W"] = np.multiply(W , dW_out *1.1 )
    
    for i, layer in enumerate(reversed(layer_stack[:-1])):
        #activate_layer() recomputes z to minimize lines of code (and save some memory)
        # It is then activated with the derived activation function
        da = activate_layer(layer, d=True)
        W = layer["W"]
        b = layer["b"]
        #print()
        dWz = layer["a"]
        #print("a", dWz.shape)
        g = np.multiply(g, da)
        #print("g", g.shape)
        dW = np.array(M(dWz).T.dot(M(g)))
        #print("dW", dW.shape)
        #print()
        g = W.dot(g.T)
        #print("b", i)
        #
        #layer_stack[i]["b"] = np.subtract(b , g)
        #print(W.shape, dW.shape, np.subtract(W , dW).shape)
        layer_stack[i+1]["W"] = np.subtract(W , dW).T
        
    return layer_stack

train_data = np.array([0 for _ in range(99)] + [1])    
    
layer_stack = add_layer(initialize_layer(100, 3, relu))
layer_stack = add_layer(initialize_layer(3, 10, relu), layer_stack)
layer_stack = add_layer(initialize_layer(10, 3, relu), layer_stack)
layer_stack = add_layer(initialize_layer(3, 100, softmax), layer_stack)

#layer_stack = add_layer(initialize_layer(5, 10, relu))
#layer_stack = add_layer(initialize_layer(10, 10, relu), layer_stack)
#layer_stack = add_layer(initialize_layer(10, 5, softmax), layer_stack)


for _ in range(100): 
    out, layer_stack = forward_pass(layer_stack, train_data)
    dloss = KL_divergence(out, train_data, d=True)
    loss = KL_divergence(out, train_data)
    print("Error", loss)
    layer_stack = back_prop(layer_stack, dloss)
    
    
    
#for l, y in layer_stack: print(l,)
#activate_layer(W1, b1, relu)

Error 10.198116261874107
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174
Error 11.520360864672174


In [156]:
 np.random.standard_normal((5,5)).mean()

0.07943046772809711

numpy.matrixlib.defmatrix.matrix

In [109]:
def back_prop(layer_stack, dloss):
    #loss of output layer
    g = dloss
    layer_out = layer_stack[-1]
    W = layer_out["W"]
    
    dzy_out = activate_layer(layer_out, d=True) #dy/dz
    g = np.multiply(g, dzy_out)
    
    dWz_out = layer_out["a"] #dzt/dWt = at-1
    dW_out = np.dot(M(dWz_out).T, M(g))
    print(dW_out)
    
    g = W.dot(g.T)
    #stochastic gradient update
    #layer_stack[-1]["W"] = np.subtract(W , dW_out)
    layer_stack[-1]["da"] = dWz_out
    #print("output done")
    
    for i, layer in enumerate(reversed(layer_stack[:-1])):
        #activate_layer() recomputes z to minimize lines of code (and save some memory)
        # It is then activated with the derived activation function
        da = activate_layer(layer, d=True)
        layer_stack[i]["da"] = da
        W = layer["W"]
        b = layer["b"]
        
        g = np.multiply(g, da)
        dW = g.dot(da)
        g = W.dot(g.T)
        #print("b", i)
        #
        layer_stack[i]["b"] = np.subtract(b , g)
        #layer_stack[i]["W"] = np.subtract(W , dW)
        return W, dW
        break
W, dW = back_prop(layer_stack,dloss)

b- a (10,) W (10, 5) b (5,)
[[ 8.90074212e-19  2.35942377e-31  3.00566436e+00  8.68645379e-36
  -1.01907943e-07]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -0.00000000e+00]
 [ 1.52457056e-18  4.04135742e-31  5.14827566e+00  1.48786602e-35
  -1.74553815e-07]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -0.00000000e+00]
 [ 2.77489213e-18  7.35573098e-31  9.37044830e+00  2.70808568e-35
  -3.17707833e-07]
 [ 5.74623255e-20  1.52322104e-32  1.94042768e-01  5.60789009e-37
  -6.57907768e-09]
 [ 1.36701320e-18  3.62370170e-31  4.61622502e+00  1.33410190e-35
  -1.56514481e-07]
 [ 1.61925762e-18  4.29235547e-31  5.46802149e+00  1.58027345e-35
  -1.85394893e-07]
 [ 1.44709652e-19  3.83598792e-32  4.88665594e-01  1.41225719e-36
  -1.65683521e-08]]
b- a (10,) W (10, 10) b (10,)


In [111]:
W.shape

(10, 10)

In [114]:
dW.shape

()

In [25]:
np.dot(M(z).T, M(out)).shape

(10, 5)

In [8]:
np.dot(M(z), M(out).T)

ValueError: shapes (1,1) and (5,1) not aligned: 1 (dim 1) != 5 (dim 0)

In [19]:
z = layer_stack[-1]["a"]

In [267]:
def back_prop(layer_stack, dloss):
    #loss of output layer
    g = dloss
    layer_out = layer_stack[-1]
    dzy_out = activate_layer(layer_out, d=True)
    dWz_out = layer_out["a"]
    dW_out = np.multiply(np.dot(dzy_out, dWz_out), M(g).T)
    for i, layer in enumerate(reversed(layer_stack[1:])):
        #activate_layer() recomputes z to minimize lines of code (and save some memory)
        # It is then activated with the derived activation function
        W = layer["W"]
        a = layer["a"]
        da = activate_layer(layer, d=True)
        g = np.multiply(W, da)
        dW = np.dot(g, da)
        #W.T.dot(
        #update weights
        print(i)
        layer_stack[i]["W"] = np.subtract(W , dW)

back_prop(layer_stack, dloss)

ValueError: shapes (5,) and (10,5) not aligned: 5 (dim 0) != 10 (dim 0)

In [207]:
np.array(5).dot(np.matrix((10,10)))

matrix([[50, 50]])

In [162]:
np.dot(input_data, 1-input_data)

0

In [182]:
activate_layer(layer_stack[-1])

array([0.20173562, 0.20101412, 0.19914426, 0.1979388 , 0.2001672 ])

In [183]:
softmax(np.random.normal(1,1,(10,5)), d=1)

array([[-24.28723753,  -1.50403064,   3.58202782,   7.68577602,
         -4.0952484 ],
       [ -8.56865646, -14.92830019,  -2.76581367,   5.32833317,
          2.11724833],
       [ -1.54027289,  -0.82348857, -11.94957658,  -2.18242981,
         -0.91329806],
       [ -1.92267791,   2.78450506,  -6.66858303,  -7.24853466,
         -3.59767197],
       [ -8.08615323,   5.19096932,   0.21809782,   2.01987714,
        -10.75081245]])

In [92]:
for i in range(3): print(layer_stack[i]["a"].shape)

(10,)
(5,)
(5,)


In [88]:
layer_stack[1]

{'W': array([[-0.24816425,  0.21988261, -1.48449639, -1.5973357 , -0.74753241,
          0.12115882, -0.44619952,  0.63481989, -0.55062677,  1.12433947],
        [ 0.96408728,  0.4378771 , -1.30818295,  0.07434215, -1.10239108,
         -0.95139944,  0.10283105, -0.08025586, -1.39695337,  0.94091781],
        [-0.069817  ,  0.95687134, -0.67180139, -0.05761391,  1.96948677,
         -0.25810702, -1.34727975, -0.10348914, -0.27160298, -0.19018819],
        [ 0.37815759,  0.39078865,  0.39477496, -1.34882148, -0.64791109,
          0.66595316, -0.37375519,  0.23456121,  0.26838649,  2.0526692 ],
        [ 0.02667162,  1.34457704, -0.82727016,  0.29429427,  0.13759172,
          0.69230056, -0.93676542, -0.55213469, -0.54194377,  0.6625922 ],
        [-1.67791053,  0.73742752,  0.80887769,  0.89761816,  1.14922293,
         -1.28356169,  0.84502726,  0.93986819, -0.25530766,  0.89848511],
        [ 0.28397142,  1.36500138,  2.00430181,  0.29615313,  0.3974948 ,
          0.6702975 ,  0.03

In [86]:
list(reversed(layer_stack[:-1]))

[{'W': array([[-0.24816425,  0.21988261, -1.48449639, -1.5973357 , -0.74753241,
           0.12115882, -0.44619952,  0.63481989, -0.55062677,  1.12433947],
         [ 0.96408728,  0.4378771 , -1.30818295,  0.07434215, -1.10239108,
          -0.95139944,  0.10283105, -0.08025586, -1.39695337,  0.94091781],
         [-0.069817  ,  0.95687134, -0.67180139, -0.05761391,  1.96948677,
          -0.25810702, -1.34727975, -0.10348914, -0.27160298, -0.19018819],
         [ 0.37815759,  0.39078865,  0.39477496, -1.34882148, -0.64791109,
           0.66595316, -0.37375519,  0.23456121,  0.26838649,  2.0526692 ],
         [ 0.02667162,  1.34457704, -0.82727016,  0.29429427,  0.13759172,
           0.69230056, -0.93676542, -0.55213469, -0.54194377,  0.6625922 ],
         [-1.67791053,  0.73742752,  0.80887769,  0.89761816,  1.14922293,
          -1.28356169,  0.84502726,  0.93986819, -0.25530766,  0.89848511],
         [ 0.28397142,  1.36500138,  2.00430181,  0.29615313,  0.3974948 ,
           0.6

In [21]:
for layer in reversed(layer_stack[:-1]): print(layer)

In [17]:
back_prop(layer_stack, dloss, out)

In [6]:
softmax(out, d=1)

0.8000000000000002

In [22]:
layer_stack[-1]

{'W': array([[-1.36499269, -0.7921027 ,  1.65143855, -0.13963861,  0.56097456],
        [-0.55209666,  0.44693978,  0.71740411,  0.4495377 , -1.11230445],
        [ 1.72761561, -0.34032056,  0.89761843,  1.05949191, -0.00922064],
        [ 1.86546767,  0.07417857, -0.55126836, -0.83688171,  0.40181989],
        [-0.92705267,  0.36303249, -0.03409175, -1.58309083,  0.86056945]]),
 'a': array([0., 0., 0., 0., 0.]),
 'b': array([0.001, 0.001, 0.001, 0.001, 0.001]),
 'f': <function __main__.softmax>}

In [88]:
np.multiply(dloss,out).sum()

-1.734723475976807e-17

In [77]:
a

NameError: name 'a' is not defined

In [47]:
def one_hot_vectorizer(tokenized_text):
    words = set(tokenized_text)
    vocab_size = len(words)
    onehot = np.identity(vocab_size)
    
    word2onehot = dict(zip(words, onehot))
    onehotargmax2word = dict(zip(onehot.argmax(axis=1), words)) 
                                #cant use array as key
                                #onehot.argmax(axis=1) same as range(vocab_size)
    
    return word2onehot, onehotargmax2word
    
    
tokenized_text = "this is a test to se how well it all works -did i say it was a test?".split()

w2o, o2w = one_hot_vectorizer(tokenized_text)