In [1]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
import pandas as pd
import pickle
from timeit import default_timer as timer

import sklearn
from sklearn.model_selection import train_test_split
import nltk as nltk

# Enabling eager execution 

In [2]:
tf.enable_eager_execution()

# Load Glove vectors

In [3]:
UNK = '<unk>'

glovePath = '/media/kandy/hdd/master-thesis/datasets/'
outfile = glovePath +'glove_word_corpus.pic'

with open(outfile, 'rb') as pickle_file:    
    gloveCorpus, glove_corpus_word_to_int, glove_corpus_int_to_word = pickle.load(pickle_file)
gloveSet = pd.read_csv(glovePath+'glove.42B.10d.txt', sep=' ', header=None)
print(gloveSet.shape)
print(gloveSet.head())

gloveWords = gloveSet.iloc[:,0:1]
gloveVectors = gloveSet.iloc[:,1:]

(1783088, 11)
    0        1         2         3         4         5         6       7   \
0    ,  0.18378 -0.121230 -0.119870  0.015227 -0.191210 -0.066074 -2.9876   
1  the -0.20838 -0.149320 -0.017528 -0.028432 -0.060104 -0.264600 -4.1445   
2    .  0.10876  0.002244  0.222130 -0.121020 -0.048959  0.018135 -3.8174   
3  and -0.09611 -0.257880 -0.358600 -0.328870  0.579500 -0.517740 -4.1582   
4   to -0.24837 -0.454610  0.039227 -0.284220 -0.031852  0.263550 -4.6323   

         8         9         10  
0  0.807950  0.067338 -0.131840  
1  0.629320  0.336720 -0.433950  
2 -0.032631 -0.625940 -0.518980  
3 -0.113710 -0.108480 -0.488850  
4  0.013890 -0.539280 -0.084454  


# Load training and test dataset

In [4]:
## https://www.nltk.org/_modules/nltk/tree.html
## above link contains the API and also some tutorials

#reader = nltk.corpus.BracketParseCorpusReader('.','SWB-all-sentences-original-with-punctuation.MRG')
reader = nltk.corpus.BracketParseCorpusReader('.','WSJ.txt')
print(reader.fileids())
print(type(reader))

## reads the file and converts each line into a tree
trees = reader.parsed_sents()
print('No. of trees: ', len(trees))
print(type(trees[0]))

['WSJ.txt']
<class 'nltk.corpus.reader.bracket_parse.BracketParseCorpusReader'>
No. of trees:  49208
<class 'nltk.tree.Tree'>


In [5]:
treesDict = [{'sentence': tree.leaves(), 'tree':tree} for tree in trees]
treeDataframe = pd.DataFrame(data=treesDict, columns=['sentence', 'tree'])
treeDataframe.head()

Unnamed: 0,sentence,tree
0,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[[[(NP (NNP Pierre) (NNP Vinken)), (, ,), (ADJ..."
1,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[[[(NNP Mr.), (NNP Vinken)], [(VBZ is), (NP-PR..."
2,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[[[(NP (NNP Rudolph) (NNP Agnew)), (, ,), (UCP..."
3,"[A, form, of, asbestos, once, used, *, *, to, ...",[[[(NP-SBJ\n (NP (NP (DT A) (NN form)) (PP (I...
4,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...",[[[(NP-SBJ\n (NP (DT The) (NN asbestos) (NN f...


In [6]:
def convert_imdb_corpus_into_int(words):
    words = [word if word in gloveCorpus else UNK for word in words]
    words_to_num = [glove_corpus_word_to_int[word] for word in words]
    return words_to_num

treeDataframe_num = treeDataframe.copy()
treeDataframe_num['sentence'] = treeDataframe_num['sentence'].apply(convert_imdb_corpus_into_int)
treeDataframe_num.head()

Unnamed: 0,sentence,tree
0,"[1127477, 1127477, 691001, 1127477, 1127477, 1...","[[[(NP (NNP Pierre) (NNP Vinken)), (, ,), (ADJ..."
1,"[1127477, 1127477, 1127477, 1127477, 1264798, ...","[[[(NNP Mr.), (NNP Vinken)], [(VBZ is), (NP-PR..."
2,"[1127477, 1127477, 691001, 1127477, 1127477, 1...","[[[(NP (NNP Rudolph) (NNP Agnew)), (, ,), (UCP..."
3,"[1127477, 1127477, 1264798, 1127477, 1127477, ...",[[[(NP-SBJ\n (NP (NP (DT A) (NN form)) (PP (I...
4,"[1127477, 1127477, 1127477, 691001, 211388, 69...",[[[(NP-SBJ\n (NP (DT The) (NN asbestos) (NN f...


In [40]:
treeDF_train, treeDF_test = sklearn.model_selection.train_test_split(treeDataframe_num, test_size=0.997)
print(treeDF_train.shape)
print(treeDF_test.shape)

(147, 2)
(49061, 2)


# Model and the Parameters

In [41]:
STATE_SIZE = 10
embeddings = tfe.Variable(name='embeddings', validate_shape= gloveVectors.shape, 
                          initial_value=gloveVectors.values, 
                          dtype=tf.float32, trainable=False)
w = tfe.Variable(name='w', validate_shape=(2*gloveVectors.shape[1], STATE_SIZE), 
                 initial_value=0.01 * tf.random_normal(shape=(2*gloveVectors.shape[1], STATE_SIZE)),
                 dtype=tf.float32)
b = tfe.Variable(name='b', validate_shape=(1, STATE_SIZE),
                 initial_value=0.01 * tf.random_normal(shape=(1, STATE_SIZE)),
                 dtype=tf.float32)

w_score = tfe.Variable(name='w_score', validate_shape=(STATE_SIZE, 1), 
                 initial_value=0.01 * tf.random_normal(shape=(STATE_SIZE, 1)),
                 dtype=tf.float32)
b_score = tfe.Variable(name='b_score', validate_shape=(1, 1),
                 initial_value=0.01 * tf.random_normal(shape=(1, 1)),
                 dtype=tf.float32)

#print(w)
#print(b)
#print(w_score)
#print(b_score)

In [42]:
print(embeddings.shape)
lookup = tf.nn.embedding_lookup(embeddings, glove_corpus_word_to_int['<unk>'])
lookup = tf.reshape(lookup, shape=(1, lookup.shape[0]))
print(lookup)

(1783088, 10)
tf.Tensor(
[[ 0.32699   0.17616   0.47762   0.15523  -0.43263   0.044493  0.40132
  -0.46666   0.28362   0.88024 ]], shape=(1, 10), dtype=float32)


In [43]:
def compute_score_for_tree(tree, weights, embeddings, scores):
    if(type(tree[0]) == type('a string')):
        #print(tree.label() + ' : ' + tree[0])
        word = tree[0].lower()
        if(word not in glove_corpus_word_to_int):
            word = '<unk>'
        word_vector = tf.nn.embedding_lookup(embeddings, glove_corpus_word_to_int[word])
        word_vector = tf.reshape(word_vector, shape=(1, word_vector.shape[0]))
        return word_vector
        #print('depth is reached !!!')
        #return

    #for i in range(len(tree)):
    #    print('Inside tree : '+ tree[i].label())
    #    compute_score_for_tree(tree[i], weights, embeddings)


        
    left = compute_score_for_tree(tree[0], weights, embeddings, scores)
    if(len(tree) !=2):
        return left

    right = compute_score_for_tree(tree[1], weights, embeddings, scores)
    words_concat = tf.concat([left, right], axis=0)
    #print(words_concat.shape)
    #print(left.shape, right.shape)
    words_concat = tf.reshape(words_concat, shape=(1, left.shape[1] + right.shape[1]))
    #print(words_concat)
    z = tf.matmul(words_concat, weights[0]) + weights[1]
    state_vec = tf.tanh(z)
    
    #print(state_vec)
    score = tf.matmul(state_vec, weights[2]) + weights[3]
    scores.append(score)
    return state_vec
    
    
for j in range(treeDF_train.shape[0]):
    tree = treeDF_train.iat[j,1]
    print(tree.leaves())
    scores = []
    tree.chomsky_normal_form()
    compute_score_for_tree(tree[0], [w,b,w_score,b_score], embeddings, scores)
    #print(sum(scores))
    total_score = tf.reduce_sum(tf.stack(scores))
    print(total_score)
    break
    


['Critics', 'of', 'the', 'present', 'arrangement', 'are', 'correct', '*-1', 'to', 'say', 'that', 'it', 'is', 'undemocratic', '.']
tf.Tensor(-0.06728338, shape=(), dtype=float32)


In [44]:
start = timer()
optimizer = tf.train.AdamOptimizer()
epoch = 1

for i in range(epoch):
    for j in range(treeDF_train.shape[0]):
        with tf.GradientTape() as tape:
            words = tf.nn.embedding_lookup(embeddings, treeDF_train.iat[j,0])
            numpy_words = words.numpy()
            tensor_words =  tf.convert_to_tensor(numpy_words)
            words_len = tensor_words.shape[0]
            total_pred_score = 0
            print(words.shape)
            words_unstack = tf.unstack(words)
            print(words_unstack[0].shape)
            print(type(words_unstack))
        break

(15, 10)
(10,)
<class 'list'>


In [49]:
start = timer()
optimizer = tf.train.AdamOptimizer()
epoch = 100

for i in range(epoch):
    with tf.GradientTape() as tape:
        for j in range(treeDF_train.shape[0]):
            words = tf.nn.embedding_lookup(embeddings, treeDF_train.iat[j,0])
            words_unstack = tf.unstack(words)
            words_len = len(words_unstack)
            pred_score_list = []
            total_loss_list = []
            while(words_len > 1):
                #print(words_len)
                scores = np.zeros(shape=(words_len-1, 1))
                for k in range(words_len - 1):
                    words_concat = tf.concat([words_unstack[k], words_unstack[k+1]], axis=0)
                    words_concat = tf.reshape(words_concat, shape=(1, words_concat.shape[0]))
                    #print(words_concat)
                    z = tf.matmul(words_concat, w) + b
                    state_vec = tf.tanh(z)
                    #print(state_vec)
                    score = tf.matmul(state_vec, w_score) + b_score
                    #print(score)
                    scores[k] = score
                
                #print(scores)
                
                max_score_index = np.argmax(scores) 
                pred_score_list.append(scores[max_score_index])
                #print(max_score_index)
                words_unstack.pop(max_score_index+1)
                #print(len(words_unstack))
                words_unstack.pop(max_score_index)
                #print(len(words_unstack))
                state_vec_vector = tf.reshape(state_vec, shape = [state_vec.shape[1]])
                #print(state_vec_vector)
                words_unstack.insert(max_score_index, state_vec_vector)
                #print(len(words_unstack))
                words_len = len(words_unstack)
                #print(words_len)
            
            act_score_list = []
            tree = treeDF_train.iat[j,1]
            tree.chomsky_normal_form()
            compute_score_for_tree(tree[0], [w,b,w_score,b_score], embeddings, act_score_list)
            total_act_score = tf.reduce_sum(tf.stack(act_score_list))
            total_pred_score = tf.reduce_sum(tf.stack(pred_score_list))
            #print(total_act_score)
            #print(total_pred_score)
            loss = tf.losses.absolute_difference(total_act_score,  total_pred_score)
            total_loss_list.append(loss)
            #loss = tf.losses.sigmoid_cross_entropy(tf.constant(imdb_train.iat[j,1], shape=(1,1)), y_predict)
        
        total_loss = tf.reduce_mean(tf.stack(total_loss_list))
        grads = tape.gradient(total_loss, [w,b,w_score,b_score])
        #print(grads)
        optimizer.apply_gradients(zip(grads, [w,b,w_score,b_score]))
            #,global_step=tf.train.get_or_create_global_step())
        print(total_loss.numpy())
        #if(j == 100):
        #    break
        
end = timer()
print('Time taken to execute (seconds): ', end-start)

0.0041068792
0.001674056
0.00043857098
0.0007224083
0.0011472702
0.0011411905
0.00085377693
0.00034725666
0.00035119057
0.0005912781
0.0006707907
0.00054585934
0.00020194054
0.0002450943
0.00049591064
0.0006041527
0.0006004572
0.0005013943
0.004415989
0.0042316914
0.003971815
0.0037435293
0.0032821894
0.0027462244
0.0020823479
0.00038921833
0.00055754185
0.0011968613
0.0015074015
0.00149858
0.0014225245
0.00087058544
0.0013151169
0.0018045902
0.0020039082
0.0019580126
0.0016981363
0.0012476444
0.00090551376
0.0012358427
0.0013753176
0.0013167858
0.0005239248
0.0005823374
0.00044322014
0.00012338161
0.0019847155
0.001956582
0.0019820929
0.0020834208
2.5987625e-05
0.000213027
0.0004886389
0.0005363226
0.0003772974
2.0742416e-05
0.0005322695
0.00078749657
0.0007582903
0.0004723072
0.00014078617
0.00035250187
0.00029456615
1.0251999e-05
2.1457672e-05
0.00023388863
0.00020122528
9.274483e-05
8.404255e-05
0.00019693375
0.00018048286
0.00010383129
8.273125e-05
0.00021135807
0.00020515919
7.02

In [11]:
start = timer()
optimizer = tf.train.AdamOptimizer()
epoch = 1

for i in range(epoch):
    for j in range(treeDF_train.shape[0]):
        with tf.GradientTape() as tape:
            words = tf.nn.embedding_lookup(embeddings, treeDF_train.iat[j,0])
            numpy_words = words.numpy()
            tensor_words =  tf.convert_to_tensor(numpy_words)
            words_len = tensor_words.shape[0]
            total_pred_score = 0
            #print(words_len)
            while(words_len > 1):
                scores = np.zeros(shape=(words_len-1, 1))
                for k in range(words_len - 1):
                    words_concat = tf.concat([tensor_words[k], tensor_words[k+1]], axis=0)
                    words_concat = tf.reshape(words_concat, shape=(1, words_concat.shape[0]))
                    #print(words_concat)
                    z = tf.matmul(words_concat, w) + b
                    state_vec = tf.tanh(z)
                    #print(state_vec)
                    score = tf.matmul(state_vec, w_score) + b_score
                    #print(score)
                    scores[k] = score
                
                #print(scores)
                
                max_score_index = np.argmax(scores) 
                total_pred_score = total_pred_score + scores[max_score_index]
                #print(max_score_index)
                list_words = numpy_words.tolist()
                #print(len(list_words))
                list_words.pop(max_score_index+1)
                #print(len(list_words))
                list_words.pop(max_score_index)
                #print(len(list_words))
                list_words.insert(max_score_index, state_vec.numpy().tolist()[0])
                #print(len(list_words))
                numpy_words = np.array(list_words)
                #print(numpy_words)
                #print(numpy_words.shape)
                tensor_words =  tf.convert_to_tensor(list_words)
                words_len = tensor_words.shape[0]
                #print(words_len)
            
            scores = []
            tree = treeDF_train.iat[j,1]
            tree.chomsky_normal_form()
            compute_score_for_tree(tree[0], [w,b,w_score,b_score], embeddings, scores)
            total_actual_score = sum(scores)
            #print(type(total_actual_score))
            #print(type(total_pred_score))
            loss = tf.losses.absolute_difference(total_actual_score,  total_pred_score[0])
            
            #loss = tf.losses.sigmoid_cross_entropy(tf.constant(imdb_train.iat[j,1], shape=(1,1)), y_predict)
           
            grads = tape.gradient(loss, [w_score, b_score, w, b])
            #print(grads)
            optimizer.apply_gradients(zip(grads, [w_score, b_score, w, b]))
            #,global_step=tf.train.get_or_create_global_step())
        if(j == 100):
            break
        #print(loss.numpy())
        
end = timer()
print('Time taken to execute (seconds): ', end-start)

ValueError: No gradients provided for any variable: ["<tf.Variable 'w_score:0' shape=(10, 1) dtype=float32, numpy=\narray([[-0.00154085],\n       [-0.00365236],\n       [-0.00598969],\n       [-0.00382886],\n       [-0.00333749],\n       [-0.01275734],\n       [-0.00310688],\n       [ 0.00849401],\n       [-0.01227861],\n       [ 0.00580805]], dtype=float32)>", "<tf.Variable 'b_score:0' shape=(1, 1) dtype=float32, numpy=array([[-0.00917668]], dtype=float32)>", "<tf.Variable 'w:0' shape=(20, 10) dtype=float32, numpy=\narray([[-3.09174345e-03, -5.74955996e-03, -8.96898750e-03,\n        -1.94976723e-03,  7.19460612e-03,  2.89615733e-03,\n         1.01199206e-02, -3.93189816e-03, -2.19476898e-03,\n         3.18261143e-03],\n       [ 6.97274320e-03, -1.18933683e-02,  1.33814020e-02,\n        -2.06305552e-03, -1.10047432e-02, -7.98548106e-03,\n        -1.16529735e-02,  1.74845587e-02,  2.48650163e-02,\n         6.07878016e-03],\n       [ 1.39661517e-03,  7.68934609e-03,  2.68776016e-03,\n         9.95987374e-03, -9.35456264e-05,  2.17868388e-03,\n         1.17401704e-02, -1.08868750e-02, -1.08957225e-02,\n         9.17153992e-03],\n       [ 9.02451109e-03,  4.45182202e-03, -1.71463809e-03,\n         1.89139247e-02, -1.92416157e-03,  7.12427171e-03,\n         4.11842950e-03,  3.45513923e-03, -1.73334926e-02,\n        -7.75449723e-03],\n       [ 9.80109069e-03, -3.49491648e-03, -9.11852904e-03,\n        -2.96191592e-03, -8.08261894e-03, -1.27005740e-03,\n         1.84484366e-02,  5.25826705e-04, -1.84372312e-03,\n        -4.00816835e-03],\n       [ 6.22068346e-03, -9.17762611e-03, -7.88632687e-03,\n         1.69579592e-02, -6.41970756e-03, -7.45966379e-03,\n        -5.77369751e-03,  8.18815362e-03,  5.83289796e-03,\n         1.20820124e-02],\n       [-1.71516847e-03, -4.80221258e-03,  1.15765231e-02,\n        -1.94087531e-02,  1.33954585e-02,  2.16873679e-02,\n        -3.94105166e-03,  8.04960262e-03,  4.16671811e-03,\n         2.51371018e-03],\n       [-1.40991611e-02, -1.58825740e-02, -3.45084397e-03,\n         2.25161738e-03,  4.35629045e-04,  1.04212295e-02,\n        -3.74808605e-03, -6.13378966e-03, -6.73765689e-03,\n         1.15719493e-02],\n       [ 1.63916475e-03,  1.22696729e-02,  7.22868554e-03,\n        -3.61649832e-03, -7.15096598e-04, -1.96413789e-02,\n         3.28121497e-03,  2.46137660e-03, -8.47877376e-03,\n        -9.32355691e-03],\n       [-1.19181173e-02,  1.07893227e-02,  5.73976571e-03,\n        -5.03425067e-03, -1.06739663e-02,  2.04937141e-02,\n         8.84764269e-03,  3.54238180e-03, -1.35535365e-02,\n        -9.74597968e-03],\n       [ 7.00888271e-03,  1.12532098e-02, -2.88254116e-03,\n         1.02232788e-02, -7.25333951e-03, -3.40706203e-03,\n        -1.36010228e-02, -1.50938686e-02, -1.28285750e-03,\n        -2.89220852e-03],\n       [-2.50755996e-02,  4.53507900e-03, -5.99443121e-03,\n        -8.10758211e-04,  1.18189678e-02,  3.52143007e-03,\n        -1.28063874e-03, -1.88804660e-02, -9.97012202e-03,\n        -1.90508121e-03],\n       [-1.19528351e-02, -8.62946175e-03,  2.90089771e-02,\n        -2.12417814e-04,  1.41580701e-02,  2.84806117e-02,\n        -2.95294099e-03, -1.29318424e-02, -4.01881896e-03,\n         1.15521462e-03],\n       [ 6.31020730e-03, -4.15504415e-04,  1.45845590e-02,\n         2.57285847e-03,  6.05166424e-03,  2.36604875e-03,\n        -7.08058337e-03, -7.07692560e-03, -6.68112002e-03,\n        -2.58101127e-03],\n       [ 1.95625667e-02, -7.05379294e-03,  8.14766996e-03,\n        -3.15949204e-04, -4.99683386e-03,  4.17695707e-03,\n        -4.73600207e-03,  1.16216801e-02, -3.48707964e-03,\n        -7.68754771e-03],\n       [ 1.39405327e-02,  7.72135891e-03, -3.65331024e-03,\n        -3.97247914e-03, -1.46261947e-02,  6.98104408e-03,\n        -5.41888503e-03,  1.69727225e-02,  1.17874993e-02,\n        -1.50906695e-02],\n       [ 6.25656964e-03, -6.79577049e-03, -6.43751072e-03,\n        -1.09023303e-02, -4.69371211e-03,  4.82316082e-03,\n        -8.20855424e-03, -7.38050323e-03, -8.35179165e-03,\n        -3.48199811e-03],\n       [ 9.05214995e-03,  1.27361333e-02, -1.83792543e-02,\n         2.00365614e-02,  3.06330854e-03, -9.39836353e-03,\n         9.20790527e-03,  5.43366233e-03, -2.07816325e-02,\n         5.02717588e-03],\n       [-5.50827850e-03, -5.71877928e-03, -2.77381227e-03,\n        -6.92338031e-03,  1.92580407e-03,  1.04830787e-03,\n        -5.97799756e-03, -1.04356753e-02, -1.30497562e-02,\n         2.12278147e-03],\n       [ 1.76440403e-02, -6.97733974e-03, -1.52996611e-02,\n         1.35736819e-03,  6.36868572e-05,  9.06752609e-03,\n        -3.31368973e-03,  1.10831326e-02,  8.87572579e-03,\n        -6.81262882e-03]], dtype=float32)>", "<tf.Variable 'b:0' shape=(1, 10) dtype=float32, numpy=\narray([[ 0.00263612, -0.01090427,  0.01265588,  0.01441635, -0.01406689,\n        -0.00690599, -0.00608754,  0.00096349,  0.00586474, -0.00322387]],\n      dtype=float32)>"].

In [89]:
print(len(list_words))
print(max_score_index)
print(list_words[len(list_words) - 1])

for i in range(len(list_words)):
    print(i)
    print(type(list_words[i]))
    word = list_words[i]
    print(type(word))
    print(len(word))

46
45
[[0.01782897114753723, 0.001361352507956326, -0.005683459807187319, -0.002537204185500741, 0.03020063415169716, -0.0396573543548584, -0.01406009029597044, -0.004425551276654005, 0.0037269028834998608, 0.02288670465350151]]
0
<class 'list'>
<class 'list'>
10
1
<class 'list'>
<class 'list'>
10
2
<class 'list'>
<class 'list'>
10
3
<class 'list'>
<class 'list'>
10
4
<class 'list'>
<class 'list'>
10
5
<class 'list'>
<class 'list'>
10
6
<class 'list'>
<class 'list'>
10
7
<class 'list'>
<class 'list'>
10
8
<class 'list'>
<class 'list'>
10
9
<class 'list'>
<class 'list'>
10
10
<class 'list'>
<class 'list'>
10
11
<class 'list'>
<class 'list'>
10
12
<class 'list'>
<class 'list'>
10
13
<class 'list'>
<class 'list'>
10
14
<class 'list'>
<class 'list'>
10
15
<class 'list'>
<class 'list'>
10
16
<class 'list'>
<class 'list'>
10
17
<class 'list'>
<class 'list'>
10
18
<class 'list'>
<class 'list'>
10
19
<class 'list'>
<class 'list'>
10
20
<class 'list'>
<class 'list'>
10
21
<class 'list'>
<class 

In [67]:
numpy_words = words.numpy()
tensor_words =  tf.convert_to_tensor(numpy_words)
print(numpy_words.shape)
print(tensor_words.shape)

(47, 10)
(47, 10)
