In [1]:
%load_ext autoreload

In [97]:
%autoreload 1

In [3]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, Dense, Flatten, Conv1D, concatenate, Activation, LSTM, Dropout
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers, Model, Sequential
from keras.models import load_model
from sklearn import metrics
import numpy as np
from conllu import parse, parse_tree
from pathlib import Path
import os
from collections import OrderedDict
import bz2
import json
import dill

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [106]:
%aimport parser
from parser import Parser

%aimport helpers
from helpers import read_embeddings, ROOT, clean_deprel

In [5]:
vec_filename = "ubercorpus.lowercased.tokenized.300d.bz2"

In [6]:
data_dir = Path.home() / "repos/UD_Ukrainian-IU"

with list(data_dir.glob("*train*"))[0].open() as f:
    data = f.read()
trees = parse(data)

with list(data_dir.glob("*test*"))[0].open() as f:
    test_data = f.read()
test_trees = parse(test_data)

In [112]:
def build_vocabulary(trees, form="form"):
    word_index = {}
    pos_index = {}
    dep_index = {}
    label_index = {"shift": 0, "reduce": 1}
    for tree in trees:
        for word in tree:
            deprel = clean_deprel(word["deprel"])
            word_id = len(word_index)+1
            pos_id = len(pos_index)+1
            dep_id = len(dep_index)+1
            label_id = len(label_index)
            if not any(el for el in label_index.keys() if deprel in el):
                label_index[f"left_{deprel}"] = label_id
                label_index[f"right_{deprel}"] = label_id + 1
            word_t = word[form].lower()
            word_pos = word["upostag"]
            word_index[word_t] = word_index.get(word_t, word_id)
            pos_index[word_pos] = pos_index.get(word_pos, pos_id)
            dep_index[deprel] = dep_index.get(deprel, dep_id)

    word_index[ROOT["form"]] = len(word_index)+1
    pos_index[ROOT["upostag"]] = len(pos_index)+1
    return word_index, pos_index, dep_index, label_index

In [8]:
def get_data(trees, parser):
    o_labels = []
    o_features = []
    for tree in trees:
        labels, features, _ = parser.parse(tree)
        o_labels.extend(labels)
        o_features.extend(features)
    return o_labels, o_features

In [113]:
word_index, pos_index, dep_index, label_index = build_vocabulary(trees+test_trees)

In [10]:
word_2_vec, ndim, _ = read_embeddings(filename=vec_filename, word_index=word_index)

In [11]:
DEFAULT_VEC = np.zeros(ndim, np.float32)

In [12]:
embedding_matrix = np.zeros((len(word_index)+1, ndim))
for word, i in word_index.items():
    embedding_matrix[i] = word_2_vec.get(word, DEFAULT_VEC)

In [13]:
 def get_stack_context(depth, stack, data):
        if depth >= 3:
            return data[stack[-1]["id"]], data[stack[-2]["id"]], data[stack[-3]["id"]]
        elif depth >= 2:
            return data[stack[-1]["id"]], data[stack[-2]["id"]], 0
        elif depth == 1:
            return data[stack[-1]["id"]], 0, 0
        else:
            return 0, 0, 0

In [14]:
def get_buffer_context(k, buffer, data):
        if k >= 3:
            return data[buffer[0]["id"]], data[buffer[1]["id"]], data[buffer[2]["id"]]
        elif k >= 2:
            return data[buffer[0]["id"]], data[buffer[1]["id"]], 0
        elif k == 1:
            return data[buffer[0]["id"]], 0, 0
        else:
            return 0, 0, 0

In [383]:
def get_parse_context(word, deps, data, left=True):
    if not word or word == -1:
        return 0, (0, 0), (0, 0)
    deps = deps[word["id"]]
    num = len(deps)
    if not num:
        return num, (0, 0), (0, 0)
    elif num==1:
        return num, (data[deps[0][0]], deps[0][1]), (0, 0)
    else:
        temp = sorted(deps, key=lambda x: x[0], reverse=left)
        return num, (data[deps[0][0]], deps[0][1]), (data[deps[1][0]], deps[1][1])

In [453]:
def feature_builder(stack, queue, tree, parse=None, word_index=word_index, 
                    pos_index=pos_index, dep_index=dep_index, form="form"):
    words = []
    tags = []
    deps = []
    depth = len(stack)
    q_len = len(queue)
    if ROOT not in tree:
        tree = [ROOT, *tree]
    
    s0, s1, s2 = get_stack_context(depth, stack, tree)
    q0, q1, q2 = get_buffer_context(q_len, queue, tree)
    
    # Left two child of the top stack
    Ns0l, s0l1, s0l2 = get_parse_context(s0, parse.lefts, tree, left=True)  
    # Right two child of the top stack
    Ns0r, s0r1, s0r2 = get_parse_context(s0, parse.rights, tree, left=False)
    # Left two child of the second element on stack
    Ns1l, s1l1, s1l2 = get_parse_context(s1, parse.lefts, tree, left=True)
    # Left two child of the second element on stack
    Ns1r, s1r1, s1r2 = get_parse_context(s1, parse.rights, tree, left=False)
    
    if s0l1[0] and parse.lefts[s0l1[0]["id"]]:
        idx, dep = max(parse.lefts[s0l1[0]["id"]], key=lambda x: x[0])
        s0l1l1 = tree[idx]
        s0l1l1_dep = dep
    else:
        s0l1l1 = 0
        s0l1l1_dep = 0
    
    if s0r1[0] and parse.rights[s0r1[0]["id"]]:
        idx, dep = min(parse.rights[s0r1[0]["id"]], key=lambda x: x[0])
        s0r1r1 = tree[idx]
        s0r1r1_dep = dep
    else:
        s0r1r1 = 0
        s0r1r1_dep = 0
    
    if s1l1[0] and parse.lefts[s1l1[0]["id"]]:
        idx, dep = max(parse.lefts[s1l1[0]["id"]], key=lambda x: x[0])
        s1l1l1 = tree[idx]
        s1l1l1_dep = dep
    else:
        s1l1l1 = 0
        s1l1l1_dep = 0
    
    if s1r1[0] and parse.rights[s1r1[0]["id"]]:
        idx, dep = min(parse.rights[s1r1[0]["id"]], key=lambda x: x[0])
        s1r1r1 = tree[idx]
        s1r1r1_dep = dep
    else:
        s1r1r1 = 0
        s1r1r1_dep = 0
#     if s0r1[0] or s0r2[0] or s0l1[0] or s0l2[0]:
#         print(s0["form"], s0["id"], parse.lefts[s0["id"]],  parse.rights[s0["id"]])
#         print("Top stack (rights): ", s0r1, s0r2)
#         print("Top stack (lefts): ", s0l1, s0l2)
#         print()
#     if s1r1[0] or s1r2[0] or s1l1[0] or s1l2[0]:
#         print(s1["form"], s1["id"], parse.lefts[s1["id"]],  parse.rights[s1["id"]])
#         print("Second stack (rights): ", s1r1, s1r2)
#         print("Second stack (lefts): ", s1l1, s1l2)
#         print()
    deps = [dep_index.get(s0l1[-1], 0), dep_index.get(s0l2[-1], 0),
            dep_index.get(s0r1[-1], 0), dep_index.get(s0r2[-1], 0),
            dep_index.get(s1l1[-1], 0), dep_index.get(s1l2[-1], 0),
            dep_index.get(s1r1[-1], 0), dep_index.get(s1r2[-1], 0),
            dep_index.get(s0l1l1_dep, 0), dep_index.get(s0r1r1_dep, 0),
            dep_index.get(s1l1l1_dep, 0), dep_index.get(s1r1r1_dep, 0)]
    
    for x in [s0, s1, s2, q0, q1, q2, 
              s0l1[0], s0l2[0], s0r1[0], s0r2[0], 
              s1l1[0], s1l2[0], s1r1[0], s1r2[0],
              s0l1l1, s0r1r1, s1l1l1, s1r1r1]:
        if x:
            word = x[form].lower() if x["id"] else "ROOT"
            word_idx = word_index.get(word)
            pos_idx = pos_index.get(x["upostag"])
            words.append(word_idx)
            tags.append(pos_idx)
        else:
            words.append(x)
            tags.append(x)
    
    dist = s0["id"] - q0["id"] if q0 and s0 else 0
    nums = [dist, Ns0l, Ns0r, Ns1r, Ns1l]
    features = [*words, *tags, *deps, *nums]
    return features, len(words), len(tags), len(deps), len(nums)

In [456]:
tree = trees[5]
l, f, pairs, *_ = parser.parse(tree, feature_extractor=feature_builder)
len(f[0])

53

In [436]:
def get_data(trees, parser, feature_extractor):
    o_labels = []
    o_features = []
    for tree in trees:
        labels, features, _, n_w, n_t, n_d, n_num = parser.parse(tree, feature_extractor=feature_extractor)
        o_labels.extend(labels)
        o_features.extend(features)
    return o_labels, o_features, n_w, n_t, n_d, n_num

In [437]:
parser = Parser()

In [492]:
labels, features, n_w, n_t, n_d, n_num = get_data(trees+test_trees, parser, feature_builder)
label_index = parser.label_index.copy()
X = np.asarray(features)
y = to_categorical(np.asarray(labels))

In [497]:
n_train = 154709

In [498]:
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

In [499]:
word_embedding_layer = Embedding(len(word_index)+1,
                            ndim,
                            weights=[embedding_matrix],
                            input_length=n_w,
                            trainable=0
                           )

In [500]:
pos_embedding_layer = Embedding(len(pos_index)+1,
                                50,
                                input_length=n_t,
                                trainable=1
                               )

In [501]:
dep_embedding_layer = Embedding(len(dep_index)+1,
                                50,
                                input_length=n_d,
                                trainable=1
                               )

In [502]:
word_sequence_input = Input(shape=(n_w,), dtype='int32')
word_embedded_sequences = word_embedding_layer(word_sequence_input)

In [503]:
pos_sequence_input = Input(shape=(n_t,), dtype='int32')
pos_embedded_sequences = pos_embedding_layer(pos_sequence_input)

In [504]:
dep_sequence_input = Input(shape=(n_d,), dtype='int32')
dep_embedded_sequences = dep_embedding_layer(dep_sequence_input)

In [505]:
features = Input(shape=(n_num,))

In [506]:
word = Flatten()(word_embedded_sequences)
pos = Flatten()(pos_embedded_sequences)
dep = Flatten()(dep_embedded_sequences)
x = concatenate(inputs=[word, pos, dep, features], axis=-1)
#x = concatenate(inputs=[word_embedded_sequences, pos_embedded_sequences], axis=1)
#x = LSTM(256)(x)
x = Dense(200, activation="relu",
          kernel_regularizer=regularizers.l2(1e-8))(x)
x = Dropout(0.3)(x)
#x = Dense(200, activation="relu",
#          kernel_regularizer=regularizers.l2(1e-8))(x)
#x = Dropout(0.3)(x)
preds = Dense(len(parser.label_index), activation='softmax',
             kernel_regularizer=regularizers.l2(1e-8))(x)

In [507]:
model = Model(inputs=[word_sequence_input, pos_sequence_input, dep_sequence_input, features], outputs=preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adagrad',
              metrics=['acc'])

In [None]:
model.summary()

In [508]:
model.fit(
    [X_train[:, :n_w], X_train[:, n_w:n_w+n_t], X_train[:, n_w+n_t:n_w+n_t+n_d], X_train[:, n_w+n_t+n_d:]], 
          y_train, 
    validation_data=([X_test[:, :n_w], X_test[:, n_w:n_w+n_t], X_test[:, n_w+n_t:n_w+n_t+n_d], X_test[:, n_w+n_t+n_d:]], y_test), 
          epochs=6, 
          batch_size=128, 
          verbose=1)

Train on 154709 samples, validate on 30661 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f0ad7b73198>

#### Calculate metrics

In [512]:
def LUAS(parser, trees, oracle=None, feature_extractor=None):
    total, tpL, tpU, failed = 0, 0, 0, 0
    for tree in trees:
        golden = [(node["id"], node["head"], clean_deprel(node["deprel"])) for node in tree]
        try:
            _, _, predicted, *_ = parser.parse(tree, oracle=oracle, update_label_index=False,
                                               feature_extractor=feature_extractor)
            total += len(golden)
            tpL += len(set(golden).intersection(set(predicted)))
            tpU += len(set([(c,h) for c,h,_ in golden]).intersection([(c,h) for c,h,_ in predicted]))
        except:
            failed += 1
    return total, tpL, tpU, failed

In [514]:
total, tpL, tpU, failed = LUAS(parser, test_trees, model, feature_extractor=feature_builder)
print("Failed:", failed)
print("Total:", total)
print("Correctly defined (unlabeled):", tpU)
print("Correctly defined (labeled):", tpL)
print("UAS:", round(tpU / total, 3))
print("LAS:", round(tpL / total, 3))

Failed: 1
Total: 14869
Correctly defined (unlabeled): 11136
Correctly defined (labeled): 10398
UAS: 0.749
LAS: 0.699
