In [1]:
import os
os.environ['THEANO_FLAGS'] = 'floatX=float32,device=gpu'
##os.environ['THEANO_FLAGS'] = 'floatX=float32,device=gpu,optimizer=fast_compile'
os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin/"

#del os.environ["THEANO_FLAGS"]

In [2]:
#import tensorflow as tf
from theano import tensor as T
import numpy as np
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, RepeatVector
from keras.layers import LSTM, GRU, Input, Merge, Reshape, Activation, Lambda
from keras.optimizers import Adam
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model

# fix random seed for reproducibility
np.random.seed(7)

Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN 5110)
Using Theano backend.


In [3]:
# UNK token will be second to last dimension
# EOS token will always be the last dimension
# If desired_length is not specified, desired_length will be len(document)
# If len(document) < desired_length, add an EOS token an pad with zero vectors to reach desired_length
# If len(document) > desired_length, truncate to desired_length
def encode_document(document, desired_length=-1, min_unicode_idx=0, max_unicode_idx=128):
    UNK_IDX = max_unicode_idx
    EOS_IDX = max_unicode_idx + 1
    if desired_length == -1:
        desired_length = len(document)
    encoded = np.zeros((desired_length, max_unicode_idx-min_unicode_idx+2)) # +2 for UNK and EOS tokens
    for doc_idx, char in enumerate(document[:desired_length]):
        char_encoding = ord(char)
        if not min_unicode_idx <= char_encoding < max_unicode_idx:
            char_encoding = UNK_IDX
        encoded[doc_idx, char_encoding-min_unicode_idx] = 1
    if len(document) < desired_length:
        encoded[len(document[:desired_length]):, EOS_IDX-min_unicode_idx] = 1
    #encoded[len(document[:desired_length]), EOS_IDX-min_unicode_idx] = 1
    return encoded.reshape(encoded.shape[0], 1, encoded.shape[1])

# By default, desired_length will be the length of the longest document in documents.
def encode_documents(documents, desired_length=-1, min_unicode_idx=0, max_unicode_idx=128):
    if desired_length == -1:
        desired_length = max([len(document) for document in documents])
    encodeds = []
    for document in documents:
        encodeds.append(encode_document(document, desired_length, min_unicode_idx, max_unicode_idx))
    e = np.array(encodeds)
    return e

# encoded must be one-hot, encoded via encode_document()
def decode_document(encoded, min_unicode_idx=0, max_unicode_idx=128, unk_decode_idx=32):
    UNK_IDX = max_unicode_idx
    EOS_IDX = max_unicode_idx + 1
    decoded = ""
    for idx in np.nonzero(encoded)[1]:
        candidate = idx + min_unicode_idx
        if candidate == UNK_IDX:
            candidate = unk_decode_idx
        elif candidate == EOS_IDX:
            continue
        decoded += chr(candidate)
    return decoded

def decode_documents(encodeds, min_unicode_idx=0, max_unicode_idx=128):
    decodeds = []
    for encoded in encodeds:
        decodeds.append(decode_document(encoded, min_unicode_idx, max_unicode_idx))
    return decodeds

In [4]:
def one_hot_conversion(predictions):
    converted = np.zeros(predictions.shape)
    for prediction_idx, prediction in enumerate(predictions):
        for elem_idx, elem in enumerate(prediction):
            converted[prediction_idx, elem_idx, np.argmax(elem)] = 1
    return converted

In [5]:
from redbaron import RedBaron

with open("sample.py", "r") as f:
    source = f.read()
with open("sample.py", "r") as f:
    lines = f.readlines()

red = RedBaron(source)
data = []
for fn_node in red.findAll("DefNode"):
    starting_line = fn_node.absolute_bounding_box.top_left.to_tuple()[0]
    ending_line = fn_node.absolute_bounding_box.bottom_right.to_tuple()[0]
    fn_lines = lines[starting_line-1:ending_line-1]
    data.append("".join(fn_lines).rstrip())

In [6]:
#raw = ["This is a test.", "This is a test2", "This is a test3"]

In [7]:
raw = data
print(raw)

["    def __init__(self, options, selectedoptions=None):\n        QDialog.__init__(self)\n        self.setupUi(self)\n\n        self.options = options\n        self.selectedoptions = selectedoptions\n\n        # Additional buttons\n        self.btnSelectAll = QPushButton(self.tr('Select all'))\n        self.buttonBox.addButton(self.btnSelectAll,\n                                 QDialogButtonBox.ActionRole)\n        self.btnClearSelection = QPushButton(self.tr('Clear selection'))\n        self.buttonBox.addButton(self.btnClearSelection,\n                                 QDialogButtonBox.ActionRole)\n        self.btnToggleSelection = QPushButton(self.tr('Toggle selection'))\n        self.buttonBox.addButton(self.btnToggleSelection,\n                                 QDialogButtonBox.ActionRole)\n\n        self.btnSelectAll.clicked.connect(self.selectAll)\n        self.btnClearSelection.clicked.connect(self.lstLayers.clearSelection)\n        self.btnToggleSelection.clicked.connect(self.to

In [8]:
dimIn = 130
dim = 260
num_epochs = 5000

In [9]:
e = encode_documents(raw)
X = e.reshape(e.shape[0], e.shape[1], e.shape[3])

In [10]:
X.shape

(6, 1021, 130)

In [11]:
# create and fit the model
x0 = Input(shape=X[0].shape)
#print(x0.get_shape())

In [12]:
def fork_and_gru(input_layer, activation='relu', return_sequences=False):
    fork = Dense(dim, activation='linear')(input_layer)
    gru = GRU(dim, activation=activation, return_sequences=return_sequences)(fork)
    return fork, gru

In [13]:
fork1, gru1 = fork_and_gru(x0, return_sequences=True)
fork2, gru2 = fork_and_gru(gru1, return_sequences=True)
fork3, gru3 = fork_and_gru(gru2, return_sequences=False)

#fork3, gru3 = fork_and_gru(x0, return_sequences=False)

In [14]:
x3 = Dense(dimIn, activation='linear')(gru3)
#print(x3.get_shape())

In [15]:
x3b = Reshape((1, dimIn))(x3)
#print(x3b.get_shape())

In [16]:
def shift_by_one(t1, t2):
    t3 = T.concatenate([t1, t2], axis=1) #tf.concat(values=[t1, t2], concat_dim=1)
    return t3[:, :-1, :]
x4 = Lambda(shift_by_one, output_shape=(X[0].shape), arguments={"t2": x0})(x3b)
#print(x4.get_shape())

In [17]:
x5 = Dense(dimIn, activation='linear')(x4)
#print(x5.get_shape())

In [18]:
x6 = GRU(dimIn, activation='relu', return_sequences=True)(x5)
#print(x6.get_shape())

In [19]:
x7 = Activation("softmax")(x6)
#print(x7.get_shape())

In [20]:
model = Model(input=x0, output=x7)

In [21]:
opt = Adam(clipnorm=5)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['categorical_accuracy'])

In [27]:
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=0.00001)

In [28]:
early_stopping = EarlyStopping(monitor="loss", patience=2)

In [36]:
model_checkpoint = ModelCheckpoint(monitor="loss", filepath="weights.{epoch:02d}-{loss:.2f}.hdf5", save_best_only=True, save_weights_only=True)

In [37]:
cur_epoch = 0

In [None]:
while True:
    if cur_epoch >= num_epochs:
        break
    model.fit(X, X, batch_size=len(X), nb_epoch=1, verbose=2, shuffle=False, callbacks=[reduce_lr, early_stopping, model_checkpoint])
    print("Current epoch: %s" % cur_epoch)
    cur_epoch += 1
    if cur_epoch % 100 == 0:
        print(decode_documents(one_hot_conversion(model.predict(X))))

Epoch 1/1
2s - loss: 4.7328 - categorical_accuracy: 0.0942
Current epoch: 0
Epoch 1/1
2s - loss: 4.6846 - categorical_accuracy: 0.6618
Current epoch: 1
Epoch 1/1
2s - loss: 4.6362 - categorical_accuracy: 0.6688
Current epoch: 2
Epoch 1/1
2s - loss: 4.5943 - categorical_accuracy: 0.6699
Current epoch: 3
Epoch 1/1
2s - loss: 4.5490 - categorical_accuracy: 0.6704
Current epoch: 4
Epoch 1/1
2s - loss: 4.4987 - categorical_accuracy: 0.6727
Current epoch: 5
Epoch 1/1
2s - loss: 4.4502 - categorical_accuracy: 0.6688
Current epoch: 6
Epoch 1/1
2s - loss: 4.3930 - categorical_accuracy: 0.6690
Current epoch: 7
Epoch 1/1
2s - loss: 4.3348 - categorical_accuracy: 0.6690
Current epoch: 8
Epoch 1/1
2s - loss: 4.2674 - categorical_accuracy: 0.6701
Current epoch: 9
Epoch 1/1
2s - loss: 4.1878 - categorical_accuracy: 0.6717
Current epoch: 10
Epoch 1/1
2s - loss: 4.0929 - categorical_accuracy: 0.6694
Current epoch: 11
Epoch 1/1
2s - loss: 3.9755 - categorical_accuracy: 0.6698
Current epoch: 12
Epoch 1/1

In [None]:
# summarize performance of the model
scores = model.evaluate(X, X, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
decode_documents(one_hot_conversion(model.predict(X)))