In [1]:
# dependencies
"""
Numpy: matrix manipulation and math
Pandas: csv parsing and various data structure tasks
Mathpltlib.pyplot: data visualization
set_trace: debug breaks
keras: a machine learning library that is intuitive to read
tensorflow: backend for keras... also the most widely used machine learning library
re: regular expressions
"""
from copy import deepcopy as copy
from IPython.core.debugger import set_trace

import sys
import numpy as np
import pandas as pd
import scipy.special as sci
import matplotlib.pyplot as plt 
import os
import tensorflow as tf
import keras
import re

tf.config.optimizer.set_jit(True) # optimizes cpu usage

from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

Using TensorFlow backend.


In [2]:
"""
concat_files
----------

Concatenate text files in a directory as a string

dependent on 'os' Python module

parameters
----------
directory: string; path of the target directory

f_type:    tuple of strings; target file extentsions Ex: ('.py', '.cpp')

return
---------
content:   concatenated string

"""
def concat_files(directory,f_type):
    import os
    # List all file in the dataset directory
    # ------------------
    all_file = []
    content = ""

    # walk through every directory and open every f_type file
    # concatenate into var string "content"
    for root, dirs, files in os.walk(directory): 
        for name in files:
            if name.endswith(f_type): # we only care about .py
                all_file.append(name)
                with open(os.path.join(root,name), "r",encoding="utf8") as f:
                    content += f.read() + "\n"
    return content

In [3]:
content = concat_files("dataset",('.py'))

In [5]:
r_all_ascii = "[^\x00-\x7F]"

In [17]:
"""
encode_string
-----------
Generate a dictionary representation of the characters found 
in a string keyed with integer representations

Returns two dictionaries and an array. The two dictionaries are 
necessary to convert the string to integer representation
and back again. The array is the string encoded as integer values.

parameters
----------
content:      string; to be processed

return
----------
vocab_to_int: dict; character to integer representation of unique characters in the string

int_to_vocab: dict; integer to string representation

encoded:      array; string encoded as integer values
"""

def encode_string(content):   
    # Convert the string "content" into a list of intergers

    ### creates a set of the individual characters
    vocab = set(content)
    ### attempt to clean out non-ascii characters
    vocab_c = copy(vocab)
    for i, char in enumerate(vocab_c):
        if re.search(r_all_ascii,char):
            vocab.remove(char)
    print(vocab)
    print(len(vocab))
    ### use the set to sequentially generate a dictionary
    vocab_to_int = {c: i for i, c in enumerate(vocab)} 
    # print(vocab_to_int)
    ### make keys the numerical values
    int_to_vocab = dict(enumerate(vocab)) 
    
    ### encode the "content" string using dict
    ### encoded = np.array([vocab_to_int[c] for c in content], dtype=np.int32)
    
    # *** Uncomment the below lines if you haven't saved the encoded array
    # Then rerun cell
#   -------------------------------------------------
#     encoded = np.array([],dtype=np.int16)
#     for c in content:
#         if c in vocab_to_int:
#             encoded = np.append(encoded,vocab_to_int[c])
#   -------------------------------------------------
    infile = "./encoded.txt"
    encoded = np.loadtxt(infile, dtype=int) # comment out if above lines are uncommented
    
    return vocab_to_int, int_to_vocab, encoded

In [18]:
vocab_to_int, int_to_vocab, encoded = encode_string(content)

{'T', 'e', 'l', 'i', 'x', '-', 'y', '0', '*', '9', '[', 'j', 'D', '7', 's', '<', '+', '!', '^', 'r', 'd', '3', 'M', 'L', 'C', '.', '/', 'O', '\\', 'E', 'm', '@', "'", 'k', 'o', '?', '(', ',', '_', 'R', 'X', '8', 'N', 'S', 'f', '$', 'Z', '1', 'K', '5', 'U', 'v', '"', '{', 'G', '|', '&', 'F', 'I', '=', 't', 'g', '%', '~', 'a', 'p', '}', 'Q', 'V', '>', 'c', ')', 'J', 'A', '6', '#', '`', 'b', 'B', 'P', ':', '\n', 'H', ']', ';', ' ', 'u', 'w', '4', 'W', 'Y', 'q', 'z', 'n', '2', 'h', '\t'}
97


## $\rightarrow$ Save encoded array to avoid heavy computation

In [16]:
#from tempfile import TemporaryFile as TF
outfile = "./encoded.txt"

np.savetxt(outfile,encoded, fmt='%d')

In [8]:
#print(content)
print(int_to_vocab)
# this is all of the files concatenated. with each character encoded using the int_to_vocab
print(encoded)

{0: 'T', 1: 'e', 2: 'l', 3: 'i', 4: 'x', 5: '-', 6: 'y', 7: '0', 8: '*', 9: '9', 10: '[', 11: 'j', 12: 'D', 13: '7', 14: 's', 15: '<', 16: '+', 17: '!', 18: '^', 19: 'r', 20: 'd', 21: '3', 22: 'M', 23: 'L', 24: 'C', 25: '.', 26: '/', 27: 'O', 28: '\\', 29: 'E', 30: 'm', 31: '@', 32: "'", 33: 'k', 34: 'o', 35: '?', 36: '(', 37: ',', 38: '_', 39: 'R', 40: 'X', 41: '8', 42: 'N', 43: 'S', 44: 'f', 45: '$', 46: 'Z', 47: '1', 48: 'K', 49: '5', 50: 'U', 51: 'v', 52: '"', 53: '{', 54: 'G', 55: '|', 56: '&', 57: 'F', 58: 'I', 59: '=', 60: 't', 61: 'g', 62: '%', 63: '~', 64: 'a', 65: 'p', 66: '}', 67: 'Q', 68: 'V', 69: '>', 70: 'c', 71: ')', 72: 'J', 73: 'A', 74: '6', 75: '#', 76: '`', 77: 'b', 78: 'B', 79: 'P', 80: ':', 81: '\n', 82: 'H', 83: ']', 84: ';', 85: ' ', 86: 'u', 87: 'w', 88: '4', 89: 'W', 90: 'Y', 91: 'q', 92: 'z', 93: 'n', 94: '2', 95: 'h', 96: '\t'}
[38 56 83 ... 62 60 60]


## Reshape data into sequences

In [63]:
n_chars = len(encoded)
n_vocab = len(vocab_to_int)
seq_len = 100 # change from 50
datax = []
datay = []

# Loop through the encoded data and store 
# sequences in datax and datay
for i in range(0, n_chars - seq_len, 1):
    seq_in = encoded[i:i + seq_len] 
    seq_out = encoded[i + seq_len]
    datax.append(seq_in)
    datay.append(seq_out)
n_patterns = len(datax)
print("Total patterns: ", n_patterns)
print("Total unique characters: ", n_vocab)
print ("\"", ''.join([int_to_vocab[value] for value in datax[100]]), "\"")

Total patterns:  1112800
Total unique characters:  97
" ing bolzano

    start = a
    end = b
    if function(a) == 0:  # one of the a or b is a root for t "


In [64]:
from keras.utils import np_utils

# reshape datax -- > [n_patterns, time steps, features]
X = np.reshape(datax, (n_patterns,seq_len,1))
X = X / float(n_vocab)
Y = np_utils.to_categorical(datay)
#Y = np.asarray(datay) # for sparse categorical cross-entropy

In [65]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import RMSprop, SGD

model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.5))
model.add(Dense(Y.shape[1], activation='softmax'))

# optimizer = RMSprop(learning_rate=0.05)

# model.compile(loss='categorical_crossentropy',
#               optimizer=optimizer,
#               metrics=['accuracy'])

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [23]:
# checkpoint
from tensorflow.keras.callbacks import ModelCheckpoint
filepath = "best-weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss',verbose=1, save_best_only=True, mode='min')
callback_list = [checkpoint]

In [66]:
# model.fit(inp, targets, steps_per_epoch=10, epochs=10)
#model.get_weights().shape

model.fit(X[:60000] , Y[:60000], epochs=100, batch_size=300, callbacks=callback_list)

Train on 60000 samples
Epoch 1/100
Epoch 00001: loss did not improve from 0.28177
Epoch 2/100
Epoch 00002: loss did not improve from 0.28177
Epoch 3/100
Epoch 00003: loss did not improve from 0.28177
Epoch 4/100
Epoch 00004: loss did not improve from 0.28177
Epoch 5/100
Epoch 00005: loss did not improve from 0.28177
Epoch 6/100
Epoch 00006: loss did not improve from 0.28177
Epoch 7/100
Epoch 00007: loss did not improve from 0.28177
Epoch 8/100
Epoch 00008: loss did not improve from 0.28177
Epoch 9/100
Epoch 00009: loss did not improve from 0.28177
Epoch 10/100
Epoch 00010: loss did not improve from 0.28177
Epoch 11/100
Epoch 00011: loss did not improve from 0.28177
Epoch 12/100
Epoch 00012: loss did not improve from 0.28177
Epoch 13/100
Epoch 00013: loss did not improve from 0.28177
Epoch 14/100
Epoch 00014: loss did not improve from 0.28177
Epoch 15/100
Epoch 00015: loss did not improve from 0.28177
Epoch 16/100
Epoch 00016: loss did not improve from 0.28177
Epoch 17/100
Epoch 00017: 

Epoch 00034: loss did not improve from 0.28177
Epoch 35/100
Epoch 00035: loss did not improve from 0.28177
Epoch 36/100
Epoch 00036: loss did not improve from 0.28177
Epoch 37/100
Epoch 00037: loss did not improve from 0.28177
Epoch 38/100
Epoch 00038: loss did not improve from 0.28177
Epoch 39/100
Epoch 00039: loss did not improve from 0.28177
Epoch 40/100
Epoch 00040: loss did not improve from 0.28177
Epoch 41/100
Epoch 00041: loss did not improve from 0.28177
Epoch 42/100
Epoch 00042: loss did not improve from 0.28177
Epoch 43/100
Epoch 00043: loss did not improve from 0.28177
Epoch 44/100
Epoch 00044: loss did not improve from 0.28177
Epoch 45/100
Epoch 00045: loss did not improve from 0.28177
Epoch 46/100
Epoch 00046: loss did not improve from 0.28177
Epoch 47/100
Epoch 00047: loss did not improve from 0.28177
Epoch 48/100
Epoch 00048: loss did not improve from 0.28177
Epoch 49/100
Epoch 00049: loss did not improve from 0.28177
Epoch 50/100
Epoch 00050: loss did not improve from 0

Epoch 00067: loss did not improve from 0.28177
Epoch 68/100
Epoch 00068: loss did not improve from 0.28177
Epoch 69/100
Epoch 00069: loss did not improve from 0.28177
Epoch 70/100
Epoch 00070: loss did not improve from 0.28177
Epoch 71/100
Epoch 00071: loss did not improve from 0.28177
Epoch 72/100
Epoch 00072: loss did not improve from 0.28177
Epoch 73/100
Epoch 00073: loss did not improve from 0.28177
Epoch 74/100
Epoch 00074: loss did not improve from 0.28177
Epoch 75/100
Epoch 00075: loss did not improve from 0.28177
Epoch 76/100
Epoch 00076: loss did not improve from 0.28177
Epoch 77/100
Epoch 00077: loss did not improve from 0.28177
Epoch 78/100
Epoch 00078: loss did not improve from 0.28177
Epoch 79/100
Epoch 00079: loss did not improve from 0.28177
Epoch 80/100
Epoch 00080: loss did not improve from 0.28177
Epoch 81/100
Epoch 00081: loss did not improve from 0.28177
Epoch 82/100
Epoch 00082: loss did not improve from 0.28177
Epoch 83/100
Epoch 00083: loss did not improve from 0

<tensorflow.python.keras.callbacks.History at 0x24294df3b88>

In [61]:
from tensorflow.keras.models import load_model
# model.save('.\model')
score = model.evaluate(X[60000:120000], Y[60000:120000])
print(score)

[4.67418819476366, 0.2396]


### Load best weights recorded from training 

In [41]:
# filename should reflect the name of the best weights available 
# in th local directory after training
filename = "best-weights.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [62]:
# pick a random seed
start = np.random.randint(0, len(datax)-1)
pattern = []
pattern = datax[start]
print("Seed:")
print ("\"", ''.join([int_to_vocab[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_vocab[index]
    seq_in = [int_to_vocab[value] for value in pattern]
    sys.stdout.write(result)
    pattern = np.append(pattern, [index], axis=0) # TODO make so length of pattern is 101, as it should be
    length = pattern.shape[0] 
    pattern = pattern[1:length] # issue with the length. no matter what, length of pattern become 99
print ("\nDone.")

Seed:
" xtreme points belong to the upper hull
    # all points to the right (below) the line joining the ex "


ValueError: Error when checking input: expected lstm_10_input to have shape (30, 1) but got array with shape (100, 1)

# References

1. [LSTM: A search space odyssey](https://arxiv.org/pdf/1503.04069.pdf?utm_content=buffereddc5&utm_medium=social&utm_source=plus.google.com&utm_campaign=buffer)