In [1]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
import pandas as pd
import pickle
from timeit import default_timer as timer
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
import nltk as nltk

import os

# Enable eager execution and check versions

In [2]:
# Switch TensorFlow to eager execution, then report the TensorFlow version and
# the version of the bundled Keras API, one per line.
tf.enable_eager_execution()
print(tf.VERSION, tf.keras.__version__, sep='\n')

1.12.0
2.1.6-tf


# Load GloVe vectors

In [3]:
# The datasets live in a `datasets/` folder that is a sibling of the current
# working directory, i.e. one level above where the notebook is run from.
dirname = os.path.dirname(os.getcwd())
dataset_path = os.path.join(dirname, 'datasets/')
print(dataset_path)

/media/kandy/hdd/master-thesis/constituency-parsing/datasets/


In [3]:
# Token substituted for any word that is not part of the GloVe vocabulary.
UNK = '<unk>'

# Pickled vocabulary triple. `gloveCorpus` is used for membership tests and
# `glove_corpus_word_to_int` maps a word to its integer id; the third element
# is presumably the inverse mapping -- TODO confirm against the cell that
# wrote the pickle.
# NOTE: pickle.load can execute arbitrary code, so only load a file you
# created yourself.
outfile = dataset_path +'glove_word_corpus.pic'

with open(outfile, 'rb') as pickle_file:
    gloveCorpus, glove_corpus_word_to_int, glove_corpus_int_to_word = pickle.load(pickle_file)

# The embeddings file is read from `dataset_path`; no separate GloVe directory
# is defined anywhere in the notebook.
# assumes the GloVe text file sits next to the pickle -- TODO confirm
# NOTE(review): with sep=' ' and default quoting, pandas treats '"' as a quote
# character and parses tokens such as 'null'/'nan' as missing values, so some
# rows may be merged or altered. Changing this would shift row positions and
# could misalign them with the pickled word ids -- verify before touching it.
gloveSet = pd.read_csv(os.path.join(dataset_path, 'glove.42B.10d.txt'), sep=' ', header=None)
print(gloveSet.shape)
print(gloveSet.head())

# Column 0 holds the word, the remaining columns hold the vector components.
gloveWords = gloveSet.iloc[:, 0:1]
gloveVectors = gloveSet.iloc[:, 1:]

(1783088, 11)
    0        1         2         3         4         5         6       7   \
0    ,  0.18378 -0.121230 -0.119870  0.015227 -0.191210 -0.066074 -2.9876   
1  the -0.20838 -0.149320 -0.017528 -0.028432 -0.060104 -0.264600 -4.1445   
2    .  0.10876  0.002244  0.222130 -0.121020 -0.048959  0.018135 -3.8174   
3  and -0.09611 -0.257880 -0.358600 -0.328870  0.579500 -0.517740 -4.1582   
4   to -0.24837 -0.454610  0.039227 -0.284220 -0.031852  0.263550 -4.6323   

         8         9         10  
0  0.807950  0.067338 -0.131840  
1  0.629320  0.336720 -0.433950  
2 -0.032631 -0.625940 -0.518980  
3 -0.113710 -0.108480 -0.488850  
4  0.013890 -0.539280 -0.084454  


# Load training and test dataset

In [4]:
# NLTK Tree API reference and tutorials:
# https://www.nltk.org/_modules/nltk/tree.html

# Open the bracketed WSJ treebank file located in the dataset directory.
reader = nltk.corpus.BracketParseCorpusReader(dataset_path, 'WSJ.txt')
print(reader.fileids(), type(reader), sep='\n')

# Every bracketed sentence of the file becomes one nltk Tree.
trees = reader.parsed_sents()
print('No. of trees: ', len(trees))
print(type(trees[0]))

['WSJ.txt']
<class 'nltk.corpus.reader.bracket_parse.BracketParseCorpusReader'>
No. of trees:  49208
<class 'nltk.tree.Tree'>


In [5]:
# One record per parsed sentence: its tokens (the leaves of the tree) together
# with the tree object itself.
treesDict = list(map(lambda tree: {'sentence': tree.leaves(), 'tree': tree}, trees))
treeDataframe = pd.DataFrame(treesDict, columns=['sentence', 'tree'])
treeDataframe.head()

Unnamed: 0,sentence,tree
0,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[[[(NP (NNP Pierre) (NNP Vinken)), (, ,), (ADJ..."
1,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[[[(NNP Mr.), (NNP Vinken)], [(VBZ is), (NP-PR..."
2,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[[[(NP (NNP Rudolph) (NNP Agnew)), (, ,), (UCP..."
3,"[A, form, of, asbestos, once, used, *, *, to, ...",[[[(NP-SBJ\n (NP (NP (DT A) (NN form)) (PP (I...
4,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...",[[[(NP-SBJ\n (NP (DT The) (NN asbestos) (NN f...


In [6]:
def convert_imdb_corpus_into_int(words):
    """Translate a sequence of tokens into GloVe vocabulary ids.

    Each token is looked up exactly as given (the comparison is
    case-sensitive). A token that is not contained in ``gloveCorpus`` is
    replaced by the ``UNK`` token before the id lookup.

    Args:
        words: iterable of string tokens.

    Returns:
        A list with one integer id per token, in the original order.
    """
    ids = []
    for token in words:
        key = token if token in gloveCorpus else UNK
        ids.append(glove_corpus_word_to_int[key])
    return ids

# Build a separate frame whose `sentence` column holds vocabulary ids instead
# of tokens; the token-level frame is left untouched.
treeDataframe_num = treeDataframe.assign(
    sentence=treeDataframe['sentence'].apply(convert_imdb_corpus_into_int)
)
treeDataframe_num.head()

Unnamed: 0,sentence,tree
0,"[349063, 349063, 1380359, 349063, 349063, 3490...","[[[(NP (NNP Pierre) (NNP Vinken)), (, ,), (ADJ..."
1,"[349063, 349063, 349063, 349063, 788117, 34906...","[[[(NNP Mr.), (NNP Vinken)], [(VBZ is), (NP-PR..."
2,"[349063, 349063, 1380359, 349063, 349063, 3490...","[[[(NP (NNP Rudolph) (NNP Agnew)), (, ,), (UCP..."
3,"[349063, 349063, 788117, 349063, 349063, 34906...",[[[(NP-SBJ\n (NP (NP (DT A) (NN form)) (PP (I...
4,"[349063, 349063, 349063, 1380359, 870463, 1380...",[[[(NP-SBJ\n (NP (DT The) (NN asbestos) (NN f...


In [7]:
# Only 0.1% of the trees go into the training split (presumably to keep
# experiments fast while the model is being developed -- TODO confirm).
# A fixed random_state makes the split identical on every
# Restart & Run All, so losses are comparable between runs.
treeDF_train, treeDF_test = train_test_split(
    treeDataframe_num, test_size=0.999, random_state=42
)
print(treeDF_train.shape)
print(treeDF_test.shape)

(49, 2)
(49159, 2)


# Model and parameters

In [8]:
# presumably the width of the composed state vectors; not referenced in the
# cells shown here -- TODO confirm where it is used
STATE_SIZE = 10

# Frozen embedding matrix initialised from the GloVe vectors (one row per
# line of the GloVe file). `validate_shape` is a boolean flag, not a shape,
# so it is passed as True rather than as the shape tuple.
embeddings = tfe.Variable(name='embeddings', validate_shape=True,
                          initial_value=gloveVectors.values,
                          dtype=tf.float32, trainable=False)

In [None]:
class constituencyParsing(tf.keras.Model):
    """Greedy recursive-network scorer for constituency trees (work in progress).

    ``call`` repeatedly merges the best-scoring pair of adjacent vectors until
    one vector is left, scores the gold tree of the current training example
    with the same weights, and takes one optimizer step on the absolute
    difference between the two summed scores.

    NOTE(review): ``call`` reads the following names from module scope; none
    of them is defined in the part of the notebook shown here, so they must
    exist before the model is called: ``w``, ``b``, ``w_score``, ``b_score``
    (trainable weights), ``j`` (row index of the current example in
    ``treeDF_train``), ``tape`` (a recording gradient tape), ``optimizer``,
    and the lists ``variables_history``, ``loss_history``, ``grad_history``.
    """

    def __init__(self, input_size):
        super(constituencyParsing, self).__init__()
        # NOTE(review): dense1 is not used by call(); it is kept so the
        # constructor and the model's attributes stay unchanged.
        self.dense1 = tf.keras.layers.Dense(units=1, activation=tf.sigmoid, input_shape=(input_size,))

    def call(self, inputs):
        """Greedily parse one sentence and take one training step.

        Args:
            inputs: tensor that is unstacked along axis 0; every slice is
                treated as one word vector.
                assumes shape (num_words, embedding_dim) -- TODO confirm

        Returns:
            The scalar loss tensor of this step (absolute difference between
            the summed gold-tree scores and the summed greedy scores).
        """
        words_unstack = tf.unstack(inputs)
        pred_score_list = []
        total_loss_list = []

        # Merge until every word has been combined into a single vector.
        while len(words_unstack) > 1:
            # State vector and score of every adjacent pair. Both stay
            # TensorFlow tensors: copying the scores into a NumPy array would
            # detach them from the gradient tape.
            candidate_states = []
            candidate_scores = []
            for k in range(len(words_unstack) - 1):
                words_concat = tf.concat([words_unstack[k], words_unstack[k + 1]], axis=0)
                # matmul needs a matrix with one row rather than a vector
                words_concat = tf.expand_dims(words_concat, axis=0)
                state_vec = tf.tanh(tf.matmul(words_concat, w) + b)
                score = tf.matmul(state_vec, w_score) + b_score
                candidate_states.append(state_vec)
                candidate_scores.append(score)

            # Pick the pair with the highest score.
            max_score_index = int(np.argmax(tf.concat(candidate_scores, axis=0).numpy()))
            pred_score_list.append(candidate_scores[max_score_index])

            # Replace the two merged vectors by the state vector of the
            # *winning* pair (not by the state of the last pair evaluated).
            merged = tf.squeeze(candidate_states[max_score_index], axis=0)
            words_unstack[max_score_index:max_score_index + 2] = [merged]

        # Score the gold tree: convert it to Chomsky normal form (in place)
        # and accumulate the score of every binary node.
        act_score_list = []
        tree = treeDF_train.iat[j, 1]
        tree.chomsky_normal_form()
        self.compute_score_for_tree(tree[0], [w, b, w_score, b_score], embeddings, act_score_list)

        # Loss: absolute difference between total gold and total predicted score.
        total_act_score = tf.reduce_sum(tf.stack(act_score_list))
        total_pred_score = tf.reduce_sum(tf.stack(pred_score_list))
        loss = tf.losses.absolute_difference(total_act_score, total_pred_score)
        total_loss_list.append(loss)

        # Average the collected losses, then compute and apply the gradients.
        total_loss = tf.reduce_mean(tf.stack(total_loss_list))
        grads = tape.gradient(total_loss, [w, b, w_score, b_score])
        print(w[0])  # first row of w before the update
        optimizer.apply_gradients(zip(grads, [w, b, w_score, b_score]),
                                  global_step=tf.train.get_or_create_global_step())
        print(w[0])  # first row of w after the update

        # Keep a history of variables, losses and gradients.
        variables_history.append([np.copy(w.numpy()), np.copy(b.numpy()),
                                  np.copy(w_score.numpy()), np.copy(b_score.numpy())])
        loss_history.append(np.copy(total_loss.numpy()))
        grad_history.append([np.copy(grad.numpy()) for grad in grads])

        # NOTE(review): Keras layers are expected to return tensors from
        # call(); the loss is returned instead of an implicit None.
        return total_loss

    @staticmethod
    def compute_score_for_tree(tree, weights, embeddings, scores):
        """Recursively compose the vector of ``tree`` and collect node scores.

        Declared as a static method so that it can be reached from ``call``
        and from its own recursion; a plain function in the class body is not
        visible by bare name inside methods.

        Args:
            tree: nltk tree node. A node whose first child is a string is
                treated as a leaf holding one word.
            weights: sequence ``[w, b, w_score, b_score]``.
            embeddings: embedding matrix indexed by GloVe word id.
            scores: list that receives the score of every two-child node
                (mutated in place).

        Returns:
            A one-row tensor: the word vector for a leaf, otherwise the
            composed state vector of the node.
        """
        if isinstance(tree[0], str):
            word = tree[0].lower()
            if word not in glove_corpus_word_to_int:
                word = '<unk>'
            word_vector = tf.nn.embedding_lookup(embeddings, glove_corpus_word_to_int[word])
            # one-row matrix so it can be concatenated and fed to matmul
            return tf.expand_dims(word_vector, axis=0)

        left = constituencyParsing.compute_score_for_tree(tree[0], weights, embeddings, scores)
        # Nodes that do not have exactly two children pass the vector of
        # their first child upwards; any further children are ignored.
        if len(tree) != 2:
            return left

        right = constituencyParsing.compute_score_for_tree(tree[1], weights, embeddings, scores)
        # [left | right] as a single row
        words_concat = tf.concat([left, right], axis=1)
        state_vec = tf.tanh(tf.matmul(words_concat, weights[0]) + weights[1])

        score = tf.matmul(state_vec, weights[2]) + weights[3]
        scores.append(score)
        return state_vec
    
    
        
    