In [1]:
import os
os.environ['THEANO_FLAGS'] = 'floatX=float32,device=gpu'
os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin/"

import blocks
from blocks.bricks import Linear, Softmax, Softplus, NDimensionalSoftmax, BatchNormalizedMLP, \
                                Rectifier, Logistic, Tanh, MLP
from blocks.bricks.recurrent import GatedRecurrent, LSTM
from blocks.bricks.parallel import Fork
from blocks.initialization import Constant, IsotropicGaussian, Identity, Uniform
from blocks.bricks.cost import BinaryCrossEntropy, CategoricalCrossEntropy
from blocks.filter import VariableFilter
from blocks.roles import PARAMETER
from blocks.graph import ComputationGraph
# from copy import copy
import numpy as np
import theano
from theano import tensor as T

import learningfunctions

Using gpu device 0: TITAN X (Pascal) (CNMeM is disabled, cuDNN 5110)


In [2]:
from redbaron import RedBaron

with open("sample.py", "r") as f:
    source = f.read()
with open("sample.py", "r") as f:
    lines = f.readlines()

red = RedBaron(source)
data = []
for fn_node in red.findAll("DefNode"):
    starting_line = fn_node.absolute_bounding_box.top_left.to_tuple()[0]
    ending_line = fn_node.absolute_bounding_box.bottom_right.to_tuple()[0]
    fn_lines = lines[starting_line-1:ending_line-1]
    data.append("".join(fn_lines).rstrip())

In [3]:
data = ['this is data', 'we like data', 'once upon a time']

In [4]:
def clip_norm(g, c, n): 
    '''n is the norm, c is the threashold, and g is the gradient'''
    
    if c > 0: 
        g = T.switch(T.ge(n, c), g*c/n, g) 
    return g
def clip_norms(gs, c):
    norm = T.sqrt(sum([T.sum(g**2) for g in gs]))
    return [clip_norm(g, c, norm) for g in gs]

In [5]:
with open('test.txt', 'r') as f:
    char_str = f.readlines()[0]
char_to_idx = {char:idx for (idx, char) in enumerate(char_str)}
idx_to_char = {idx:char for (idx, char) in enumerate(char_str)}

SOS_TOKEN_IDX = len(char_to_idx)
EOS_TOKEN_IDX = len(char_to_idx)+1

# if desired_length is not specified, desired_length will be len(document)
# if len(document) < desired_length, pad with zero vectors to reach desired_length
# if len(document) > desired_length, truncate at desired_length
# encoded will always be desired_length + 2 (to include SOS and EOS tokens)
# EOS token will always be appended to the end of encoded even if len(document) > desired_length
def encode_document(document, char_to_idx, desired_length=-1):
    if desired_length == -1:
        desired_length = len(document)
    encoded = np.zeros((desired_length+2, len(char_to_idx)+2)) # +2 for SOS and EOS tokens
    encoded[0, SOS_TOKEN_IDX] = 1 # set SOS token
    for doc_idx, char in enumerate(document[:desired_length]):
        encoded[doc_idx+1, char_to_idx[char]] = 1
    encoded[len(document[:desired_length])+1, EOS_TOKEN_IDX] = 1
    return encoded.reshape(encoded.shape[0], 1, encoded.shape[1])

# if desired_length is not specified, desired_length will be length of longest document
def encode_documents(documents, char_to_idx, desired_length=-1):
    if desired_length == -1:
        desired_length = max([len(document) for document in documents])
    encodeds = []
    for document in documents:
        encodeds.append(encode_document(document, char_to_idx, desired_length))
    e = np.array(encodeds)
    return e

# encoded must be one-hot
def decode_document(encoded, idx_to_char):
    decoded = ""
    for idx in np.nonzero(encoded)[2]:
        if idx == SOS_TOKEN_IDX:
            continue
        elif idx == EOS_TOKEN_IDX:
            break
        decoded += idx_to_char[idx]
    return decoded

def decode_documents(encodeds, idx_to_char):
    decodeds = []
    for encoded in encodeds:
        decodeds.append(decode_document(encoded, idx_to_char))
    return decodeds

#encoded = encode_document("this is a", char_to_idx)
#decode_document(encoded, idx_to_char)

In [6]:
encoded = encode_documents(data, char_to_idx)

In [7]:
def one_hot_conversion(predictions):
    converted = np.zeros(predictions.shape)
    for prediction_idx, prediction in enumerate(predictions):
        for elem_idx, elem in enumerate(prediction):
            converted[prediction_idx, elem_idx, 0, np.argmax(elem[0])] = 1
    return converted

In [8]:
X = T.tensor4('X')
rnnType = 'gru'
dimIn = 98
dim = 20
wtstd = 0.2
rnnbias_init = Constant(0.0)
rnnwt_init = IsotropicGaussian(wtstd)
linewt_init = IsotropicGaussian(wtstd)
line_bias = Constant(1.0)

lr = 0.0001
decay = 0.9
decay_itr = 15000
learning_rate = theano.shared(np.array(lr, dtype=theano.config.floatX))
learning_decay = np.array(decay, dtype=theano.config.floatX)

clippings = 0.3
#ADD lr decay

In [9]:
if rnnType == 'gru':
    rnn = GatedRecurrent(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
    dimMultiplier = 2
else:
    rnn = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
    dimMultiplier = 4

fork = Fork(output_names=['linear', 'gates'],
            name='fork', input_dim=dimIn, output_dims=[dim, dim * dimMultiplier], 
            weights_init = linewt_init, biases_init = line_bias)



In [10]:
def onestepEnc(X):
    data1, data2 = fork.apply(X) 

    if rnnType == 'gru':
        hEnc = rnn.apply(data1, data2) 
    else:
        hEnc, _ = rnn.apply(data2)

    return hEnc

hEnc, _ = theano.scan(onestepEnc, X) 


In [11]:
fork.initialize()
rnn.initialize()

layer1Fun = theano.function([X], hEnc, allow_input_downcast=True)

In [12]:
fakeData = np.random.rand(3,50,1,70).astype('float32')

In [13]:
if rnnType == 'gru':
    rnn2 = GatedRecurrent(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
    dimMultiplier = 2
else:
    rnn2 = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
    dimMultiplier = 4

fork2 = Fork(output_names=['linear', 'gates'],
            name='fork', input_dim=dim, output_dims=[dim, dim * dimMultiplier], 
            weights_init = linewt_init, biases_init = line_bias)



def onestepEnc2(hEnc):
    data3, data4 = fork2.apply(hEnc) 

    if rnnType == 'gru':
        hEnc2 = rnn2.apply(data3, data4) 
    else:
        hEnc2, _ = rnn2.apply(data4)

    return hEnc2, data3

[hEnc2, data3], _ = theano.scan(onestepEnc2, hEnc) 

fork2.initialize()
rnn2.initialize()

layer2Fun = theano.function([X], hEnc2, allow_input_downcast=True)

In [14]:
if rnnType == 'gru':
    rnn3 = GatedRecurrent(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
    dimMultiplier = 2
else:
    rnn3 = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
    dimMultiplier = 4

fork3 = Fork(output_names=['linear', 'gates'],
            name='fork', input_dim=dim, output_dims=[dim, dim * dimMultiplier], 
            weights_init = linewt_init, biases_init = line_bias)

forkD = Fork(output_names=['linear', 'gates'],
            name='fork', input_dim=dim, output_dims=[dimIn, dimIn * dimMultiplier], 
            weights_init = linewt_init, biases_init = line_bias)

def onestepEnc3(hEnc2):
    data5, data6 = fork3.apply(hEnc2) 

    if rnnType == 'gru':
        hEnc3 = rnn3.apply(data5, data6) 
    else:
        hEnc3, _ = rnn3.apply(data6)

    return hEnc3

hEnc3, _ = theano.scan(onestepEnc3, hEnc2) 
h4decoder = hEnc3[:,-1,:,:].reshape((-1, 1,1,20))

h4reshape, _ = forkD.apply(h4decoder)

forkD.initialize()
fork3.initialize()
rnn3.initialize()

layer3Fun = theano.function([X], h4reshape, allow_input_downcast=True)

In [15]:
if rnnType == 'gru':
    rnn4 = GatedRecurrent(dim=dimIn, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
    dimMultiplier = 2
else:
    rnn4 = LSTM(dim=dimIn, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
    dimMultiplier = 4

#TOTHINK: transform before the decoder or after
fork4 = Fork(output_names=['linear', 'gates'],
            name='fork', input_dim=dimIn, output_dims=[dimIn, dimIn * dimMultiplier], 
            weights_init = linewt_init, biases_init = line_bias)

targets = T.concatenate((h4reshape, X[:,:-1, :,:]), axis=1)

def onestepEnc4(targets):
    data7, data8 = fork4.apply(targets) 

    if rnnType == 'gru':
        hDec = rnn4.apply(data7, data8) 
    else:
        hDec, _ = rnn4.apply(data8)

    return hDec

hDec, _ = theano.scan(onestepEnc4, targets) 

fork4.initialize()
rnn4.initialize()

layer4Fun = theano.function([X], hDec, allow_input_downcast=True)

decFun = theano.function([X], targets, allow_input_downcast=True)

In [16]:
predTargets = T.exp(hDec)/T.sum(T.exp(hDec), axis=(3,2), keepdims=True)
#precost = -X.squeeze*T.log(predTargets.squeeze()) - (1-X.squeeze())*T.log(1-predTargets.squeeze())

#ADDLATER: beam search
cost = T.mean(T.sum(T.nnet.categorical_crossentropy(predTargets, X), axis = 1))

In [17]:
cg = ComputationGraph([cost])
params = VariableFilter(roles = [PARAMETER])(cg.variables)

###To check gradients for explosion/shrinkage
gradients = T.grad(cost, params)
gradients = clip_norms(gradients, clippings)
gradientFun = theano.function([X, predTargets], gradients, allow_input_downcast=True)

learning = learningfunctions.Learning(cost,params,learning_rate,l1=0.,l2=0.,maxnorm=0.,c=clippings)
updates = learning.Adam() 

print('compiling graph you talented soul')
classifierTrain = theano.function([X], [cost, predTargets], 
                                  updates=updates, allow_input_downcast=True)
#classifierPredict = theano.function([X], [softoutClass, attEncpred, attContextpred], allow_input_downcast=True)
classifierPredict = theano.function([X], predTargets, allow_input_downcast=True)

compiling graph you talented soul


In [18]:
forks = [fork, fork2, fork3, fork4, forkD]
rnns = [rnn, rnn2, rnn3, rnn4]

for fork in forks:
    fork.initialize()
for rnn in rnns:
    rnn.initialize()

In [19]:
num_epochs=100000
for itr, x in enumerate(range(num_epochs)):
    result_cost, result_predTargets = classifierTrain(encoded)
    result_converted = one_hot_conversion(result_predTargets)
    if itr % 1000 == 0:
        print("%s: %s" % (result_cost, decode_documents(result_converted, idx_to_char)))
        grads = gradientFun(encoded, result_predTargets)
        for gra in grads:
            print('  gradient norms: ', np.linalg.norm(gra))
    if itr % decay_itr == 0:
        learning_rate.set_value(learning_rate.get_value() * learning_decay)

71.50408172607422: ['~ld~~jj~jjjjjj~jjj', '~l~~~j~j~jjjjj~jjj', '~~~~~~jj~~~~jjjj~]']
  gradient norms:  0.229775
  gradient norms:  0.0831331
  gradient norms:  0.0190914
  gradient norms:  0.0127768
  gradient norms:  0.0221939
  gradient norms:  0.0515382
  gradient norms:  0.137612
  gradient norms:  0.0211662
  gradient norms:  0.0657096
  gradient norms:  0.0267837
  gradient norms:  0.00579115
  gradient norms:  0.000284334
  gradient norms:  0.00191638
  gradient norms:  0.0061682
  gradient norms:  0.0123824
  gradient norms:  0.0401833
  gradient norms:  0.00965748
  gradient norms:  0.00205743
  gradient norms:  5.39623e-05
  gradient norms:  0.000635043
  gradient norms:  0.00217
  gradient norms:  0.0041517
  gradient norms:  0.0142125
  gradient norms:  0.00499458
  gradient norms:  0.00101761
  gradient norms:  3.07807e-05
  gradient norms:  0.000297811
  gradient norms:  0.000115647
  gradient norms:  0.00201731
  gradient norms:  0.000639412
43.878875732421875: ['wheee

KeyboardInterrupt: 

In [None]:
predictions = classifierPredict(encoded)

In [None]:
converted = one_hot_conversion(predictions)

In [None]:
decode_documents(converted, idx_to_char)