In [1]:
import numpy as np
import tensorflow as tf


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
data_location = "C:/Users/Himanshu/Downloads/a3/a3/data/dev.gold.conll"
num_cells = 128
num_epochs = 50
batch_size = 500

In [28]:
class DataProcess:
    def __init__(self):
        self.transitions = {"left": 0, "right": 1, "shift": 2}
        self.model = None
        
    @staticmethod
    def read_dataset(filename):
        examples = []
        with open(filename, "r+") as f:
            words, pos, head, label = [], [], [], []
            for line in f.readlines():
                line_split = line.strip().split("\t")
                if len(line_split) == 10:
                    words.append(line_split[1])
                    pos.append(line_split[3])
                    head.append(int(line_split[6]))
                    label.append(line_split[7])
                else:
                    examples.append([words, pos, head, label])
                    words, pos, head, label = [], [], [], []
        return examples
    
    def create_tok2id(self, examples):
        tok2id = {}
        tok2id["UNK"] = 0
        tok2id["ROOT"] = 1
        tok2id["ROOTPOS"] = 2
        tok2id["ROOTHEAD"] = 3
        tok2id["ROOTLABEL"] = 4
        tok2id["PAD"] = 5
        tok2id["PPAD"] = 6
        tok2id["LPAD"] = 7
        ind = len(tok2id)
        for example in examples:
            for word in example[0]:
                if word not in tok2id:
                    tok2id[word] = ind
                    ind += 1
            for pos in example[1]:
                if pos not in tok2id:
                    tok2id[pos] = ind
                    ind += 1
            for label in example[3]:
                if label not in tok2id:
                    tok2id[label] = ind
                    ind += 1
        self.tok2id = tok2id
        self.id2tok = { tok_id: tok for tok, tok_id in self.tok2id.items() }
    
    def get_tok2id(self, tok):
        if tok in self.tok2id:
            return self.tok2id[tok]
        else:
            return self.tok2id["UNK"]
    
    def get_oracle(self, stack, buf, example):
        if len(stack) < 2:
            return self.transitions["shift"]
        
        i1 = stack[-1]
        i2 = stack[-2]
        h1 = example[2][i1]
        h2 = example[2][i2]
        if i2 > 0 and h2 == i1 and all(False if example[2][b] == i2 else True for b in buf):
            return self.transitions["left"]
        elif i2 >=0 and h1 == i2 and all(False if example[2][b] == i1 else True for b in buf):
            return self.transitions["right"]
        else:
            return self.transitions["shift"]
    
    def get_features(self, stack, buffer, arcs, example):
        i1 = stack[-1]
        i2 = stack[-2] if len(stack) >= 2 else -1
        w1 = self.get_tok2id(example[0][i1])
        w2 = self.get_tok2id(example[0][i2]) if i2 != -1 else self.get_tok2id("PAD")
        wb1 = self.get_tok2id(example[0][buffer[0]]) if len(buffer) > 0 else self.get_tok2id("PAD")
        lci1 = [arc[1] for arc in arcs if arc[0] == i1 and arc[1] < i1]
        rci1 = [arc[1] for arc in arcs if arc[0] == i1 and arc[1] > i1]
        lci2 = [arc[1] for arc in arcs if arc[0] == i2 and arc[1] < i2]
        rci2 = [arc[1] for arc in arcs if arc[0] == i2 and arc[1] > i2]
        wlc1 = self.get_tok2id(example[0][lci1[0]]) if len(lci1) > 0 else self.get_tok2id("PAD")
        wrc1 = self.get_tok2id(example[0][rci1[0]]) if len(rci1) > 0 else self.get_tok2id("PAD")
        wlc2 = self.get_tok2id(example[0][lci2[0]]) if len(lci2) > 0 else self.get_tok2id("PAD")
        wrc2 = self.get_tok2id(example[0][rci2[0]]) if len(rci2) > 0 else self.get_tok2id("PAD")
        p1 = self.get_tok2id(example[1][i1])
        p2 = self.get_tok2id(example[1][i2]) if i2 != -1 else self.get_tok2id("PPAD")
        pb1 = self.get_tok2id(example[1][buffer[0]]) if len(buffer) > 0 else self.get_tok2id("PPAD")
        plc1 = self.get_tok2id(example[1][lci1[0]]) if len(lci1) > 0 else self.get_tok2id("PAD")
        prc1 = self.get_tok2id(example[1][rci1[0]]) if len(rci1) > 0 else self.get_tok2id("PAD")
        plc2 = self.get_tok2id(example[1][lci2[0]]) if len(lci2) > 0 else self.get_tok2id("PAD")
        prc2 = self.get_tok2id(example[1][rci2[0]]) if len(rci2) > 0 else self.get_tok2id("PAD")
        return [w1, w2, wb1, wlc1, wrc1, wlc2, wrc2, p1, p2, pb1, plc1, prc1, plc2, prc2]
    
    def parse_example(self, given_example):
        stack = [0]
        buffer = list(range(1, len(given_example[0]) + 1))
        arcs = []
        example = [[], [], []]
        example[0] = ["ROOT"] + given_example[0]
        example[1] = ["ROOTPOS"] + given_example[1]
        example[2] = ["ROOTHEAD"] + given_example[2]
        input_features = []
        outputs = []
        while not (len(buffer) == 0 and stack == [0]):
            gold_parse = self.get_oracle(stack, buffer, example)
            features = self.get_features(stack, buffer, arcs, example)
            input_features.append(features)
            outputs.append(gold_parse)
            if gold_parse == 0:
                arcs.append([stack[-1], stack[-2], gold_parse])
                stack = stack[:-2] + [stack[-1]]
            elif gold_parse == 1:
                arcs.append([stack[-2], stack[-1], gold_parse])
                stack = stack[:-1]
            else:
                stack = stack + [buffer[0]]
                buffer = buffer[1: ]
        return input_features, outputs
    
    def set_model(self, model):
        self.model = model
    
    def predict(self, inps):
        inps = inps[:]
        assert self.model
        stacks = []
        buffers = []
        arcs = []
        outputs = [0] * len(inps)
        for i in range(len(inps)):
            batch = []
            stack = [0]
            buffer = list(range(1, len(inps[i][0]) + 1))
            arc = []
            stacks.append(stack)
            buffers.append(buffer)
            arcs.append(arc)
            inps[i][0] = ["ROOT"] + inps[i][0]
            inps[i][1] = ["ROOTPOS"] + inps[i][1]
        example_indices = list(range(len(inps)))
        while example_indices:
            input_features = []
            for i in example_indices:
                features = self.get_features(stacks[i], buffers[i], arcs[i], inps[i])
                input_features.append(features)
            predicted_parse = self.model.predict(np.array(input_features))
            predicted_parse = np.argmax(predicted_parse, axis=1)
            print(predicted_parse)
            remaining_indices = []
            for index, i in enumerate(example_indices):
                gold_parse = predicted_parse[index]
                if len(buffers[i]) == 0 and len(stacks[i]) == 1:
                    print("discarding input {0}".format(i))
                    outputs[i] = arcs[i]
                    continue
                if gold_parse == 0:
                    arcs[i].append([stacks[i][-1], stacks[i][-2], gold_parse])
                    stacks[i] = stacks[i][:-2] + [stacks[i][-1]]
                    remaining_indices.append(i)
                elif gold_parse == 1:
                    arcs[i].append([stacks[i][-2], stacks[i][-1], gold_parse])
                    stacks[i] = stacks[i][:-1]
                    remaining_indices.append(i)
                else:
                    if len(buffers[i]) != 0:
                        stacks[i] = stacks[i] + [buffers[i][0]]
                        buffers[i] = buffers[i][1: ]
                        remaining_indices.append(i)
                    else:
                        print("in ELSE")
                        outputs[i] = arcs[i]
            example_indices = remaining_indices
        return outputs

In [29]:
data_process = DataProcess()
examples = data_process.read_dataset(data_location)
data_process.create_tok2id(examples)
model_features, model_outputs = [], []
for ind, example in enumerate(examples):
    try:
        features, outputs = data_process.parse_example(example)
        model_features += features
        model_outputs += outputs
    except:
        print(ind)
model_features = np.array(model_features)
model_outputs = np.array(model_outputs)
print(model_features.shape)
print(model_outputs.shape)

35
352
708
1316
1561
(79920, 14)
(79920,)


In [None]:
print(np.unique(model_outputs, return_counts=True))

In [5]:
model_inp = tf.keras.layers.Input(shape=(14, ))
embeddings = tf.keras.layers.Embedding(input_dim=len(data_process.tok2id), output_dim=100)(model_inp)
flat_emb = tf.keras.layers.Flatten()(embeddings)
flat_emb = tf.keras.layers.Dropout(0.2)(flat_emb)
dense1 = tf.keras.layers.Dense(num_cells, activation="elu")(flat_emb)
dense1 = tf.keras.layers.Dropout(0.2)(dense1)
dense2 = tf.keras.layers.Dense(num_cells, activation="elu")(dense1)
dense2 = tf.keras.layers.Dropout(0.2)(dense2)
model_out = tf.keras.layers.Dense(3, activation="softmax")(dense2)
model = tf.keras.models.Model(inputs=model_inp, outputs=model_out)
model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.0001), loss="sparse_categorical_crossentropy", metrics=["acc"])
model_checkpoint = tf.keras.callbacks.ModelCheckpoint("./dpmodels/model.{epoch:02d}-{val_loss:.2f}.hdf5")
early_stop = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)
model.fit(model_features, model_outputs, epochs=3,
          batch_size=batch_size, validation_split=0.1, callbacks=[model_checkpoint, early_stop])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train on 71928 samples, validate on 7992 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0xa183e4b588>

In [15]:
len(model_features[0])

14

In [30]:
data_process.set_model(model)
out = data_process.predict(examples[:3])
for i in range(len(examples[:3])):
    print(" ".join(examples[i][0]))
    for each in out[i]:
        print(examples[i][0][each[0]], "<--", examples[i][0][each[1]])

[2 2 2]
[2 2 2]
[0 0 0]
[2 2 2]
[2 1 0]
[2 2 2]
[2 2 2]
[0 0 0]
[0 2 2]
[0 2 2]
[2 2 0]
[1 2 1]
[2 0 2]
[2 0 2]
[0 2 2]
[1 2 0]
[2 0 0]
[2 1 2]
[0 1 1]
[0 1 1]
[0 1 2]
[2 2 1]
[2 1 2]
[2 2 2]
[0 1 2]
[0 2 0]
[2 1 2]
[2 2 0]
[2 1 0]
[2 2 2]
[2 2 2]
[2 0 2]
[0 0 0]
[0 2 0]
[0 2 1]
[0 2 1]
[0 2 1]
[1 0 2]
[2 0 1]
[2 0 1]
[0 1 1]
discarding input 2
[2 2]
[1 2]
[1 2]
[2 2]
[1 2]
[2 0]
[2 0]
[2 0]
[2 0]
[0 1]
[0 2]
[0 2]
[2 0]
[2 2]
[2 2]
[0 2]
[0 0]
[2 0]
[1 2]
[2 1]
[0 2]
[2 2]
[2 2]
[2 0]
[0 0]
[0 2]
[1 2]
[1 2]
[1 2]
[1 2]
[2 0]
[1 0]
[1 0]
[1 0]
discarding input 0
[0]
[0]
[0]
[2]
[2]
[2]
[2]
[0]
[0]
[0]
[1]
[1]
[2]
[1]
[1]
[1]
discarding input 1
ROOT Influential members of the House Ways and Means Committee introduced legislation that would restrict how the new savings-and-loan bailout agency can raise capital , creating another potential obstacle to the government 's sale of sick thrifts .
members <-- Influential
Ways <-- House
Ways <-- the
Ways <-- of
Ways <-- and
Committee <-- Means


In [24]:
len(examples[2][0])

21