In [3]:
# dependencies
"""
Numpy: matrix manipulation and math
Pandas: csv parsing and various data structure tasks
Mathpltlib.pyplot: data visualization
set_trace: debug breaks
keras: a machine learning library that is intuitive to read
tensorflow: backend for keras... also the most widely used machine learning library
re: regular expressions
"""
from copy import deepcopy as copy
from IPython.core.debugger import set_trace

import sys
import numpy as np
import pandas as pd
import scipy.special as sci
import matplotlib.pyplot as plt 
import os
import tensorflow as tf
import keras

tf.config.optimizer.set_jit(True) # optimizes cpu usage

from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [4]:
"""
concat_files
----------

Concatenate text files in a directory as a string

dependent on 'os' Python module

parameters
----------
directory: string; path of the target directory

f_type:    tuple of strings; target file extentsions Ex: ('.py', '.cpp')

return
---------
content:   concatenated string

"""
def concat_files(directory,f_type):
    import os
    # List all file in the dataset directory
    # ------------------
    all_file = []
    content = ""

    # walk through every directory and open every f_type file
    # concatenate into var string "content"
    for root, dirs, files in os.walk(directory): 
        for name in files:
            if name.endswith(f_type): # we only care about .py
                all_file.append(name)
                with open(os.path.join(root,name), "r",encoding="utf8") as f:
                    content += f.read() + "\n"
    return content

In [5]:
content = concat_files("dataset",('.py'))

In [6]:
r_all_ascii = "[^\x00-\x7F]"

In [55]:
"""
encode_string
-----------
Generate a dictionary representation of the characters found 
in a string keyed with integer representations

Returns two dictionaries and an array. The two dictionaries are 
necessary to convert the string to integer representation
and back again. The array is the string encoded as integer values.

parameters
----------
content:      string; to be processed

return
----------
vocab_to_int: dict; character to integer representation of unique characters in the string

int_to_vocab: dict; integer to string representation

encoded:      array; string encoded as integer values
"""

def encode_string(content):   
    # Convert the string "content" into a list of intergers

    ### creates a set of the individual characters
    vocab = set(content)
    ### attempt to clean out non-ascii characters
    vocab_c = copy(vocab)
    for i, char in enumerate(vocab_c):
        if re.search(r_all_ascii,char):
            vocab.remove(char)
    print(vocab)
    print(len(vocab))
    ### use the set to sequentially generate a dictionary
    vocab_to_int = {c: i for i, c in enumerate(vocab)} 
    # print(vocab_to_int)
    ### make keys the numerical values
    int_to_vocab = dict(enumerate(vocab)) 
    
    ### encode the "content" string using dict
    ### encoded = np.array([vocab_to_int[c] for c in content], dtype=np.int32)
    
    # *** Uncomment the below lines if you haven't saved the encoded array
    # Then rerun cell
#   -------------------------------------------------
#     encoded = np.array([],dtype=np.int16)
#     for c in content:
#         if c in vocab_to_int:
#             encoded = np.append(encoded,vocab_to_int[c])
#   -------------------------------------------------
    encoded = np.load('./encoded.npy') # comment out if above lines are uncommented
    
    return vocab_to_int, int_to_vocab, encoded

In [56]:
vocab_to_int, int_to_vocab, encoded = encode_string(content)

{'A', '+', 'j', '.', 'f', 'z', 'X', '\n', '}', '-', '^', 'd', '4', ' ', 'x', '3', 'B', '`', 'p', 'm', 'Z', 'K', 'Y', '2', '>', 'Q', 'o', '{', 's', 'F', 'M', '(', '&', 'h', '1', 'G', 't', '!', 'D', '9', 'k', 'V', '?', '$', '|', 'E', 'b', ':', 'i', '[', 'U', 'n', 'T', '@', 'L', '%', '\\', '0', '=', 'u', '\t', 'w', '#', 'e', 'C', '8', '*', '<', '7', 'r', 'g', 'q', '5', 'N', 'a', "'", 'v', '/', 'I', '_', 'O', 'W', 'y', '6', 'J', 'l', ';', 'R', 'S', ',', ']', 'c', '"', ')', '~', 'P', 'H'}
97
{'A': 0, '+': 1, 'j': 2, '.': 3, 'f': 4, 'z': 5, 'X': 6, '\n': 7, '}': 8, '-': 9, '^': 10, 'd': 11, '4': 12, ' ': 13, 'x': 14, '3': 15, 'B': 16, '`': 17, 'p': 18, 'm': 19, 'Z': 20, 'K': 21, 'Y': 22, '2': 23, '>': 24, 'Q': 25, 'o': 26, '{': 27, 's': 28, 'F': 29, 'M': 30, '(': 31, '&': 32, 'h': 33, '1': 34, 'G': 35, 't': 36, '!': 37, 'D': 38, '9': 39, 'k': 40, 'V': 41, '?': 42, '$': 43, '|': 44, 'E': 45, 'b': 46, ':': 47, 'i': 48, '[': 49, 'U': 50, 'n': 51, 'T': 52, '@': 53, 'L': 54, '%': 55, '\\': 56, '0

## $\rightarrow$ Save encoded array to avoid heavy computation

In [57]:
#from tempfile import TemporaryFile as TF
outfile = "./encoded"

np.save(outfile,encoded)

In [58]:
#print(content)
print(int_to_vocab)
# this is all of the files concatenated. with each character encoded using the int_to_vocab
print(encoded)

{0: 'A', 1: '+', 2: 'j', 3: '.', 4: 'f', 5: 'z', 6: 'X', 7: '\n', 8: '}', 9: '-', 10: '^', 11: 'd', 12: '4', 13: ' ', 14: 'x', 15: '3', 16: 'B', 17: '`', 18: 'p', 19: 'm', 20: 'Z', 21: 'K', 22: 'Y', 23: '2', 24: '>', 25: 'Q', 26: 'o', 27: '{', 28: 's', 29: 'F', 30: 'M', 31: '(', 32: '&', 33: 'h', 34: '1', 35: 'G', 36: 't', 37: '!', 38: 'D', 39: '9', 40: 'k', 41: 'V', 42: '?', 43: '$', 44: '|', 45: 'E', 46: 'b', 47: ':', 48: 'i', 49: '[', 50: 'U', 51: 'n', 52: 'T', 53: '@', 54: 'L', 55: '%', 56: '\\', 57: '0', 58: '=', 59: 'u', 60: '\t', 61: 'w', 62: '#', 63: 'e', 64: 'C', 65: '8', 66: '*', 67: '<', 68: '7', 69: 'r', 70: 'g', 71: 'q', 72: '5', 73: 'N', 74: 'a', 75: "'", 76: 'v', 77: '/', 78: 'I', 79: '_', 80: 'O', 81: 'W', 82: 'y', 83: '6', 84: 'J', 85: 'l', 86: ';', 87: 'R', 88: 'S', 89: ',', 90: ']', 91: 'c', 92: '"', 93: ')', 94: '~', 95: 'P', 96: 'H'}
[48 19 18 ... 93  7  7]


## Reshape data into sequences

In [87]:
n_chars = len(encoded)
n_vocab = len(vocab_to_int)
seq_len = 300
datax = []
datay = []

# Loop through the encoded data and store 
# sequences in datax and datay
for i in range(0, n_chars - seq_len, 1):
    seq_in = encoded[i:i + seq_len] 
    seq_out = encoded[i + seq_len]
    datax.append(seq_in)
    datay.append(seq_out)
n_patterns = len(datax)
print("Total patterns: ", n_patterns)
print("Total unique characters: ", n_vocab)
print ("\"", ''.join([int_to_vocab[value] for value in datax[100]]), "\"")

Total patterns:  1112600
Total unique characters:  97
" ing bolzano

    start = a
    end = b
    if function(a) == 0:  # one of the a or b is a root for the function
        return a
    elif function(b) == 0:
        return b
    elif (
        function(a) * function(b) > 0
    ):  # if none of these are root and they are both positive or negative,
   "


In [101]:
from keras.utils import np_utils

# reshape datax -- > [n_patterns, time steps, features]
X = np.reshape(datax, (n_patterns,seq_len,1))
X = X / float(n_vocab)
Y = np_utils.to_categorical(datay)
#Y = np.asarray(datay) # for sparse categorical cross-entropy

In [104]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import RMSprop, SGD

model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.3))
model.add(Dense(Y.shape[1], activation='softmax'))

# optimizer = RMSprop(learning_rate=0.05)

# model.compile(loss='categorical_crossentropy',
#               optimizer=optimizer,
#               metrics=['accuracy'])

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [80]:
# checkpoint
from tensorflow.keras.callbacks import ModelCheckpoint
filepath = "best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss',verbose=1, save_best_only=True, mode='min')
callback_list = [checkpoint]

In [105]:
# model.fit(inp, targets, steps_per_epoch=10, epochs=10)
#model.get_weights().shape

model.fit(X[:60000] , Y[:60000], epochs=10, batch_size=100, callbacks=callback_list)

Train on 60000 samples
Epoch 1/10
Epoch 00001: loss did not improve from 2.85788
Epoch 2/10
Epoch 00002: loss did not improve from 2.85788
Epoch 3/10
Epoch 00003: loss did not improve from 2.85788
Epoch 4/10
Epoch 00004: loss did not improve from 2.85788


KeyboardInterrupt: 

In [72]:
from tensorflow.keras.models import load_model
# model.save('.\model')
score = model.evaluate(X[60000:120000], Y[60000:120000])
print(score)

[3.07553720202446, 0.28926668]


### Load best weights recorded from training 

In [84]:
# filename should reflect the name of the best weights available 
# in th local directory after training
filename = "best.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [86]:
# pick a random seed
start = np.random.randint(0, len(datax)-1)
pattern = []
pattern = datax[start]
print("Seed:")
print ("\"", ''.join([int_to_vocab[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_vocab[index]
    seq_in = [int_to_vocab[value] for value in pattern]
    sys.stdout.write(result)
    pattern = np.append(pattern, [index], axis=0) # TODO make so length of pattern is 101, as it should be
    length = pattern.shape[0] 
    pattern = pattern[1:length] # issue with the length. no matter what, length of pattern become 99
print ("\nDone.")

Seed:
"  :param key: Key to insert.
        :param value: Value associated with given key.

        >>> skip_list = SkipList()
        >>> skip_list.insert(2, "Two")
        >>> skip_list.find(2)
        'Two'
        >>> list(skip_list)
        [2]
        """

        node, update_vector = self._locate_node(key)
        if node is not None:
            node.value = value
        else:
            level = self.random_level()

            if level > self.level:
                # After level increase we  "
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

# References

1. [LSTM: A search space odyssey](https://arxiv.org/pdf/1503.04069.pdf?utm_content=buffereddc5&utm_medium=social&utm_source=plus.google.com&utm_campaign=buffer)