In [298]:
import pandas as pd
from keras.models import Sequential, load_model, Model
from keras.layers import LSTM, Dropout, TimeDistributed, Dense, Activation, Embedding
import numpy as np
import string

In [299]:
# Load the listing descriptions and merge them into one training corpus.
df = pd.read_csv('data_etsy.csv')
# read_csv returns float NaN for empty description fields, and str.join
# rejects those with a TypeError; drop them and coerce the rest to str first.
# NOTE(review): descriptions are concatenated with no separator, so the end of
# one runs straight into the start of the next. Kept as-is so the corpus is
# unchanged.
w = ''.join(df['description'].dropna().astype(str).tolist())

# Character vocabulary: a-z, 0-9 and the space character.
vocab = list(string.ascii_lowercase + string.digits + ' ')
vocab_size = len(vocab)

# Lower-case the corpus and keep only in-vocabulary characters. One pass with
# set membership replaces a full str.replace scan per distinct
# out-of-vocabulary character.
allowed_chars = set(vocab)
data_long_string = ''.join(ch for ch in w.lower() if ch in allowed_chars)
data_list_of_chars = np.array(list(data_long_string))

# Position of each character in `vocab`. Despite the name, the values are ints.
map_char_to_float = {char: i for i, char in enumerate(vocab)}


def encode_string(string):
    """Encode an iterable of characters as an array of vocabulary indices.

    Parameters
    ----------
    string : iterable of single characters (a str or an array of characters).
        The parameter name shadows the imported ``string`` module inside this
        function only.

    Returns
    -------
    numpy.ndarray
        Integer array with one index per input character.

    Raises
    ------
    KeyError
        If a character is not a key of ``map_char_to_float``.
    """
    # dtype is pinned so that an empty input yields an integer array too;
    # without it np.array([]) falls back to float64.
    return np.array([map_char_to_float[char] for char in string], dtype=int)


data_list_of_ints = encode_string(data_list_of_chars)

In [300]:
# Number of characters (time steps) in each training window. It happens to
# equal the vocabulary size (26 letters + 10 digits + space), but the two
# values are unrelated.
SEQ_LENGTH = 37
# Number of parallel sequences per batch; the model is built with this as a
# fixed batch dimension.
BATCH_SIZE = 16

In [301]:
# Character-level language model:
#   embedding -> three stacked stateful LSTMs -> per-time-step softmax over the vocabulary.
# The batch shape is fixed to (BATCH_SIZE, SEQ_LENGTH) because the LSTMs are stateful.
# Dropout between the LSTM layers is currently not used.
model = Sequential(
    [Embedding(vocab_size, 512, batch_input_shape=(BATCH_SIZE, SEQ_LENGTH))]
    + [LSTM(256, return_sequences=True, stateful=True) for _ in range(3)]
    + [TimeDistributed(Dense(vocab_size)), Activation('softmax')]
)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (16, 37, 512)             18944     
_________________________________________________________________
lstm_22 (LSTM)               (16, 37, 256)             787456    
_________________________________________________________________
lstm_23 (LSTM)               (16, 37, 256)             525312    
_________________________________________________________________
lstm_24 (LSTM)               (16, 37, 256)             525312    
_________________________________________________________________
time_distributed_8 (TimeDist (16, 37, 37)              9509      
_________________________________________________________________
activation_8 (Activation)    (16, 37, 37)              0         
Total params: 1,866,533
Trainable params: 1,866,533
Non-trainable params: 0
_________________________________________________________________


In [302]:
def read_batches(T, vocab_size, batch_size=None, seq_length=None):
    """Yield ``(X, Y)`` next-character training batches from an index array.

    The first ``batch_size * (len(T) // batch_size)`` entries of ``T`` are
    viewed as ``batch_size`` contiguous rows. Successive batches walk along
    those rows in steps of ``seq_length``, so row ``b`` of one batch continues
    exactly where row ``b`` of the previous batch stopped.

    Parameters
    ----------
    T : numpy.ndarray
        1-D array of integer vocabulary indices.
    vocab_size : int
        Width of the one-hot target axis.
    batch_size : int, optional
        Rows per batch. Defaults to the module-level ``BATCH_SIZE``.
    seq_length : int, optional
        Time steps per batch. Defaults to the module-level ``SEQ_LENGTH``.

    Yields
    ------
    X : numpy.ndarray, float64, shape ``(batch_size, seq_length)``
        Input character indices.
    Y : numpy.ndarray, float64, shape ``(batch_size, seq_length, vocab_size)``
        One-hot encoding of the character that follows each input position.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if seq_length is None:
        seq_length = SEQ_LENGTH

    length = T.shape[0]
    batch_chars = length // batch_size

    # One row per batch lane; the tail that does not fill a lane is dropped.
    rows = T[:batch_chars * batch_size].reshape(batch_size, batch_chars)
    row_idx = np.arange(batch_size)[:, None]
    step_idx = np.arange(seq_length)[None, :]

    # The stop value leaves room for the one-step-ahead target inside each row.
    for start in range(0, batch_chars - seq_length, seq_length):
        X = rows[:, start:start + seq_length].astype(np.float64)
        targets = rows[:, start + 1:start + seq_length + 1]
        Y = np.zeros((batch_size, seq_length, vocab_size))
        Y[row_idx, step_idx, targets] = 1
        yield X, Y

In [303]:
# Single pass over the corpus: one gradient step per batch, with the running
# loss and accuracy printed after every step.
batch_stream = read_batches(data_list_of_ints, vocab_size)
for step, (X, Y) in enumerate(batch_stream, start=1):
    loss, acc = model.train_on_batch(X, Y)
    print('Batch {}: loss = {:.4f}, acc = {:.5f}'.format(step, loss, acc))

    # Hard cap: stop once batch number 2001 has been trained.
    if step == 2001:
        break

Batch 1: loss = 3.6111, acc = 0.03378
Batch 2: loss = 3.5991, acc = 0.14865
Batch 3: loss = 3.5727, acc = 0.15709
Batch 4: loss = 3.5041, acc = 0.17061
Batch 5: loss = 3.3195, acc = 0.16723
Batch 6: loss = 3.5000, acc = 0.16216
Batch 7: loss = 3.2732, acc = 0.15034
Batch 8: loss = 3.1081, acc = 0.17736
Batch 9: loss = 3.1936, acc = 0.08108
Batch 10: loss = 3.2175, acc = 0.07939
Batch 11: loss = 3.1686, acc = 0.09122
Batch 12: loss = 3.1169, acc = 0.09122
Batch 13: loss = 3.0877, acc = 0.08615
Batch 14: loss = 3.0841, acc = 0.09966
Batch 15: loss = 3.1039, acc = 0.08953
Batch 16: loss = 3.0152, acc = 0.10811
Batch 17: loss = 3.0072, acc = 0.16047
Batch 18: loss = 2.9554, acc = 0.15372
Batch 19: loss = 2.9496, acc = 0.17568
Batch 20: loss = 2.9928, acc = 0.15034
Batch 21: loss = 3.0696, acc = 0.16554
Batch 22: loss = 3.0155, acc = 0.15034
Batch 23: loss = 3.0171, acc = 0.17230
Batch 24: loss = 3.0322, acc = 0.15541
Batch 25: loss = 3.0212, acc = 0.15203
Batch 26: loss = 3.0640, acc = 0.1

Batch 209: loss = 2.7817, acc = 0.20439
Batch 210: loss = 2.8051, acc = 0.20270
Batch 211: loss = 2.8304, acc = 0.18919
Batch 212: loss = 2.8474, acc = 0.17905
Batch 213: loss = 2.8383, acc = 0.18750
Batch 214: loss = 2.8524, acc = 0.19595
Batch 215: loss = 2.8043, acc = 0.19088
Batch 216: loss = 2.8164, acc = 0.21115
Batch 217: loss = 2.7778, acc = 0.20439
Batch 218: loss = 2.8315, acc = 0.19595
Batch 219: loss = 2.7641, acc = 0.19257
Batch 220: loss = 2.7830, acc = 0.17905
Batch 221: loss = 2.7223, acc = 0.17905
Batch 222: loss = 2.7162, acc = 0.22973
Batch 223: loss = 2.7265, acc = 0.20777
Batch 224: loss = 2.7607, acc = 0.20101
Batch 225: loss = 2.7648, acc = 0.21791
Batch 226: loss = 2.6759, acc = 0.21453
Batch 227: loss = 2.7454, acc = 0.22635
Batch 228: loss = 2.6514, acc = 0.21284
Batch 229: loss = 2.6429, acc = 0.24662
Batch 230: loss = 2.5909, acc = 0.25000
Batch 231: loss = 2.6492, acc = 0.22466
Batch 232: loss = 2.5970, acc = 0.23986
Batch 233: loss = 2.5827, acc = 0.23649


Batch 415: loss = 2.0958, acc = 0.36149
Batch 416: loss = 2.0965, acc = 0.37838
Batch 417: loss = 2.1650, acc = 0.37669
Batch 418: loss = 1.9984, acc = 0.39189
Batch 419: loss = 2.0636, acc = 0.38851
Batch 420: loss = 2.1626, acc = 0.35811
Batch 421: loss = 2.1699, acc = 0.33784
Batch 422: loss = 2.1920, acc = 0.36149
Batch 423: loss = 2.2671, acc = 0.32770
Batch 424: loss = 2.1353, acc = 0.40034
Batch 425: loss = 2.1589, acc = 0.37162
Batch 426: loss = 2.1745, acc = 0.33277
Batch 427: loss = 2.1476, acc = 0.37331
Batch 428: loss = 2.1192, acc = 0.37331
Batch 429: loss = 2.1267, acc = 0.36318
Batch 430: loss = 2.0577, acc = 0.38345
Batch 431: loss = 2.0851, acc = 0.38345
Batch 432: loss = 2.0282, acc = 0.41385
Batch 433: loss = 2.1364, acc = 0.34797
Batch 434: loss = 2.2029, acc = 0.34122
Batch 435: loss = 2.0917, acc = 0.39189
Batch 436: loss = 2.1687, acc = 0.37838
Batch 437: loss = 2.0222, acc = 0.40372
Batch 438: loss = 2.0800, acc = 0.39865
Batch 439: loss = 2.0960, acc = 0.39358


Batch 621: loss = 1.8136, acc = 0.46959
Batch 622: loss = 1.9125, acc = 0.43412
Batch 623: loss = 1.8520, acc = 0.46622
Batch 624: loss = 1.9391, acc = 0.42399
Batch 625: loss = 1.8787, acc = 0.45270
Batch 626: loss = 1.7699, acc = 0.47804
Batch 627: loss = 1.9296, acc = 0.42230
Batch 628: loss = 1.8095, acc = 0.48142
Batch 629: loss = 1.8957, acc = 0.44088
Batch 630: loss = 1.8999, acc = 0.42568
Batch 631: loss = 1.8887, acc = 0.43750
Batch 632: loss = 1.7784, acc = 0.48142
Batch 633: loss = 1.8471, acc = 0.45777
Batch 634: loss = 2.0084, acc = 0.42061
Batch 635: loss = 1.8511, acc = 0.44257
Batch 636: loss = 2.0269, acc = 0.42736
Batch 637: loss = 1.8487, acc = 0.45946
Batch 638: loss = 1.7424, acc = 0.49493
Batch 639: loss = 1.7903, acc = 0.46284
Batch 640: loss = 1.7628, acc = 0.46791
Batch 641: loss = 1.8078, acc = 0.51182
Batch 642: loss = 1.8436, acc = 0.46959
Batch 643: loss = 1.8496, acc = 0.45101
Batch 644: loss = 1.8660, acc = 0.47466
Batch 645: loss = 1.6513, acc = 0.51858


KeyboardInterrupt: 

In [416]:
from keras import backend as K

# Take the first batch yielded by read_batches as a fixed probe input for the
# layer-inspection cells.
X,Y = next(read_batches(data_list_of_ints, vocab_size))

def get_layer_i_output_on_X(i, X, reset_states=False):
    """Return the activations of ``model.layers[i]`` for the input batch ``X``.

    Parameters
    ----------
    i : int
        Index into ``model.layers`` of the layer whose output is wanted.
    X : numpy.ndarray
        Input indices; reshaped to ``(-1, SEQ_LENGTH)`` before being fed to
        the model's first layer.
    reset_states : bool, optional
        When True, ``model.reset_states()`` is called first. The LSTM layers
        in this notebook are built with ``stateful=True``, so their output
        depends on the recurrent state they currently hold (for example the
        state left behind by training). Resetting makes the result comparable
        to a hand-rolled recurrence that starts from all-zero ``h`` and ``c``.
        Defaults to False, which leaves the model state untouched.

    Returns
    -------
    numpy.ndarray
        Output of layer ``i`` for the whole batch.
    """
    if reset_states:
        model.reset_states()

    batch = X.reshape(-1, SEQ_LENGTH)
    # Backend function from the model's input tensor to the requested layer's output tensor.
    layer_fn = K.function([model.layers[0].input],
                          [model.layers[i].output])
    return layer_fn([batch])[0]

def get_layer_j_parameters(j):
    """Split the weights of ``model.layers[j]`` into per-gate arrays.

    The layer's first three weight arrays are treated as kernel, recurrent
    kernel and bias. Each is cut along its last axis into four blocks of
    ``layer.units`` columns, labelled ``i``, ``f``, ``c`` and ``o`` in that
    order; the last block runs to the end of the axis.

    Parameters
    ----------
    j : int
        Index into ``model.layers``; the layer must expose ``units`` and
        ``get_weights()``.

    Returns
    -------
    dict
        Keys ``kernel_<g>``, ``recur_<g>`` and ``bias_<g>`` for each gate
        label ``<g>`` in ``i``, ``f``, ``c``, ``o``.
    """
    layer = model.layers[j]
    n_units = layer.units

    # Fetch the weights once instead of once per slice.
    weights = layer.get_weights()
    sources = (('kernel', weights[0]),
               ('recur', weights[1]),
               ('bias', weights[2]))

    gate_slices = (('i', slice(None, n_units)),
                   ('f', slice(n_units, 2 * n_units)),
                   ('c', slice(2 * n_units, 3 * n_units)),
                   ('o', slice(3 * n_units, None)))

    dict_with_params = {}
    for prefix, array in sources:
        for gate, columns in gate_slices:
            # Ellipsis selects the last axis: columns of the 2-D kernels,
            # entries of the 1-D bias.
            dict_with_params[prefix + '_' + gate] = array[..., columns]

    return dict_with_params

def hard_sigmoid(x):
    """Piecewise-linear sigmoid: 0 below -2.5, 1 above 2.5, ``0.2 * x + 0.5`` in between.

    Parameters
    ----------
    x : array-like or scalar
        Pre-activation values.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Values in ``[0, 1]`` with the same shape as ``x``.
    """
    linear = 0.2 * np.asarray(x) + 0.5
    # Clipping saturates infinite inputs at 0/1. A mask-weighted sum would
    # compute 0 * inf for them and return NaN.
    clipped = np.clip(linear, 0.0, 1.0)
    # Keep the result dtype of the integer-mask formulation this replaces
    # (e.g. float32 input still yields float64).
    return clipped.astype(np.result_type(linear.dtype, np.int_))

def evaluate_layer_j_on_input(j, x, h, c):
    """Advance a NumPy replay of layer ``j``'s LSTM recurrence by one time step.

    Parameters
    ----------
    j : int
        Index of the layer whose weights are read via ``get_layer_j_parameters``.
    x : numpy.ndarray
        Input vector for this time step.
    h : numpy.ndarray
        Previous hidden state.
    c : numpy.ndarray
        Previous cell state.

    Returns
    -------
    tuple of numpy.ndarray
        ``(h, c)`` after the step, each with a leading axis of length 1.
    """
    params = get_layer_j_parameters(j)

    # Work with row vectors so the matrix products below are well-formed.
    x_row = x.reshape(1, -1)
    h_prev = h.reshape(1, -1)
    c_prev = c.reshape(1, -1)

    def preactivation(gate):
        """Affine input to one gate: x·kernel + h·recurrent + bias."""
        return (x_row @ params['kernel_' + gate]
                + h_prev @ params['recur_' + gate]
                + params['bias_' + gate])

    input_gate = hard_sigmoid(preactivation('i'))
    forget_gate = hard_sigmoid(preactivation('f'))
    candidate = np.tanh(preactivation('c'))
    c_next = forget_gate * c_prev + input_gate * candidate
    output_gate = hard_sigmoid(preactivation('o'))

    h_next = output_gate * np.tanh(c_next)
    return h_next, c_next

In [417]:
# Layer-2 activations for the probe batch, keeping only batch row 0.
inp = get_layer_i_output_on_X(2, X)[0]

In [431]:
# Replay layer 3's recurrence in NumPy from an all-zero state over time
# steps 0..10 of `inp`, then display the resulting hidden state.
h = np.zeros(256)
c = np.zeros(256)

for i in range(11):
    h, c = evaluate_layer_j_on_input(3, inp[i], h, c)
h

array([[ 0.36997015, -0.20243904,  0.06580108, -0.08376692, -0.49079166,
         0.48357672,  0.42694387, -0.87016739,  0.4308302 , -0.08088644,
         0.91210242, -0.17258206,  0.40084778, -0.41717416, -0.51342262,
         0.44991216,  0.02379311, -0.4938219 , -0.87104302,  0.76510008,
        -0.60268491, -0.152367  , -0.02231249,  0.53911072,  0.19190407,
         0.72176333, -0.13597204,  0.        ,  0.19019175, -0.74812226,
        -0.01043332,  0.        , -0.03869087,  0.70331416,  0.6289672 ,
         0.00245176, -0.47211593,  0.23353648,  0.77533029, -0.75906905,
        -0.7471119 , -0.08046857, -0.47160222,  0.67509451, -0.61650389,
        -0.82328658, -0.62277423,  0.67052023, -0.39676215, -0.10206082,
        -0.59735924,  0.27712175, -0.71738176, -0.27167856,  0.78664425,
         0.59797541, -0.34370182, -0.99572807,  0.02462915,  0.43336879,
         0.00970127,  0.24647308, -0.89373355, -0.67407787,  0.4244087 ,
         0.17682114, -0.73011373, -0.92515331,  0.1

In [432]:
# Keras output of layer 3 for the probe batch: batch row 0, time step 10.
r = get_layer_i_output_on_X(3, X)[0][10]
r

array([ 0.36934754, -0.20502695,  0.07099158, -0.07985611, -0.49090534,
        0.4773793 ,  0.4237625 , -0.8673782 ,  0.43301553, -0.08046973,
        0.91521806, -0.16861847,  0.5445328 , -0.4210522 , -0.5043266 ,
        0.45119017,  0.02018598, -0.500369  , -0.86776054,  0.7624329 ,
       -0.59585583, -0.15962595, -0.0213448 ,  0.589288  ,  0.18945381,
        0.71223366, -0.10085215,  0.        ,  0.18663205, -0.747557  ,
       -0.01136942,  0.        , -0.04426956,  0.70020896,  0.65854555,
        0.00374589, -0.49357435,  0.23091209,  0.76838475, -0.7691674 ,
       -0.7381811 , -0.2151977 , -0.46745336,  0.67120683, -0.62464386,
       -0.819101  , -0.6256754 ,  0.6852928 , -0.39654216, -0.09991623,
       -0.59631824,  0.35408163, -0.71468186, -0.2691934 ,  0.7848156 ,
        0.60598415, -0.34383225, -1.        ,  0.02580756,  0.432583  ,
        0.02433919,  0.25338635, -0.89056635, -0.6769468 ,  0.4226778 ,
        0.18839926, -0.7309489 , -0.95304793,  0.191396  , -0.74

In [433]:
# Mean absolute element-wise difference between the NumPy replay (h) and the Keras output (r).
np.abs(h-r).mean()

0.011895180755745461

In [422]:
# Backend function from layer 2's output tensor to layer 3's output tensor,
# fed with the layer-2 activations for X; displays batch row 0, time step 0.
f = K.function([model.layers[2].output], [model.layers[3].output])
f([get_layer_i_output_on_X(2, X)])[0][0][0]

array([-0.93136036, -1.        , -0.17444812, -0.68670225,  0.4133347 ,
       -0.5250608 , -0.79048383, -0.72331417,  0.        , -0.14976543,
        0.        , -0.6946399 ,  0.91270244,  0.75910044,  0.22572684,
        0.07792626,  0.        , -0.08412182, -0.20709842, -0.3203006 ,
        0.65081555,  0.85162616, -0.9411663 ,  0.5341775 ,  0.42733675,
       -0.        ,  0.03931336,  0.76374185,  0.16513526, -0.08022057,
       -0.8024534 ,  0.46092355, -1.        , -0.37287325,  0.58876956,
        0.06760677, -0.27462685,  0.41723663,  0.01741885,  0.77375   ,
        0.        , -0.4277774 , -0.5852314 ,  0.        , -0.8089063 ,
       -0.03152403,  0.        ,  0.27586657, -0.33503497,  0.8570576 ,
        0.77225584,  0.05322284,  0.727936  ,  0.        , -0.        ,
       -0.75565785, -0.        , -1.        , -0.70560694,  0.        ,
        0.58829767,  0.72832584, -0.        ,  0.03522956, -0.74437356,
       -0.6367787 , -0.59861875, -0.19066182, -0.63795483, -0.31

In [367]:
# Largest absolute element-wise difference between r and h.
# NOTE(review): this cell's execution count (367) is lower than those of the
# cells that assign h and r (431, 432), so the saved output presumably
# reflects earlier values of h and r — confirm by re-running.
np.abs(r-h).max()

0.9999999999998942

In [353]:
# Handle on the layer at index 3 of the model, used by the weight-inspection cells.
q = model.layers[3]

In [354]:
# Show the layer's trainable variables with their names and shapes.
q.trainable_weights

[<tf.Variable 'lstm_24/kernel:0' shape=(256, 1024) dtype=float32_ref>,
 <tf.Variable 'lstm_24/recurrent_kernel:0' shape=(256, 1024) dtype=float32_ref>,
 <tf.Variable 'lstm_24/bias:0' shape=(1024,) dtype=float32_ref>]

In [307]:
# NOTE(review): `layer_output` is not assigned at notebook scope by any cell
# shown here (it only appears as a local inside get_layer_i_output_on_X), so
# this cell presumably relied on an earlier cell that is no longer present and
# would raise NameError on a fresh kernel — confirm and restore its definition.
inp = layer_output[0]
# First 256 columns of the layer's weight array at index 1: the same slice
# that get_layer_j_parameters labels 'recur_i' for a 256-unit layer.
mat = q.get_weights()[1][:, :256]

In [308]:
# Project the stored activations through `mat`.
# NOTE(review): `layer_output` is not assigned at notebook scope by any cell
# shown here — confirm where it is meant to come from.
res = layer_output[0] @ mat

In [309]:
# Check the shapes of both operands and of the product.
mat.shape, inp.shape, res.shape

((256, 256), (37, 256), (37, 256))

In [310]:
# Shape of the activations held in `inp`.
inp.shape

(37, 256)

In [414]:
# Inspect the `unit_forget_bias` setting on the layer's cell.
q.cell.unit_forget_bias

True