In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [426]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, Dense, Flatten, Conv1D, concatenate, Activation, LSTM, Dropout
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers, Model, Sequential
from keras.models import load_model
from sklearn import metrics
import numpy as np
from conllu import parse, parse_tree
from pathlib import Path
import os
from collections import OrderedDict
import bz2
import json
import dill

In [13]:
%aimport parser
from parser import Parser

%aimport helpers
from helpers import read_embeddings, ROOT

In [7]:
vec_filename = "ubercorpus.lowercased.tokenized.300d.bz2"

In [129]:
data_dir = Path.home() / "repos/UD_Ukrainian-IU"

with list(data_dir.glob("*train*"))[0].open() as f:
    data = f.read()
trees = parse(data)

with list(data_dir.glob("*test*"))[0].open() as f:
    test_data = f.read()
test_trees = parse(test_data)

n_train = np.sum([len(tree) for tree in trees])

In [250]:
def build_vocabulary(trees, form="form"):
    word_index = {}
    pos_index = {}
    dep_index = {}
    for tree in trees:
        for word in tree:
            deprel = word["deprel"]
            word_id = len(word_index)+1
            pos_id = len(pos_index)+1
            dep_id = len(dep_index)+1
            word_t = word[form].lower()
            word_pos = word["upostag"]
            word_index[word_t] = word_index.get(word_t, word_id)
            pos_index[word_pos] = pos_index.get(word_pos, pos_id)
            dep_index[deprel] = dep_index.get(deprel, dep_id)

    word_index[ROOT["form"]] = len(word_index)+1
    pos_index[ROOT["upostag"]] = len(pos_index)+1
    return word_index, pos_index, dep_index

In [22]:
def get_data(trees, parser):
    o_labels = []
    o_features = []
    for tree in trees:
        labels, features, _ = parser.parse(tree)
        o_labels.extend(labels)
        o_features.extend(features)
    return o_labels, o_features

In [251]:
word_index, pos_index, dep_index = build_vocabulary(trees+test_trees)

In [28]:
word_2_vec, ndim, _ = read_embeddings(filename=vec_filename, word_index=word_index)

In [29]:
DEFAULT_VEC = np.zeros(ndim, np.float32)

In [30]:
embedding_matrix = np.zeros((len(word_index)+1, ndim))
for word, i in word_index.items():
    embedding_matrix[i] = word_2_vec.get(word, DEFAULT_VEC)

In [203]:
 def get_stack_context(depth, stack, data):
        if depth >= 3:
            return data[stack[-1]["id"]], data[stack[-2]["id"]], data[stack[-3]["id"]]
        elif depth >= 2:
            return data[stack[-1]["id"]], data[stack[-2]["id"]], 0
        elif depth == 1:
            return data[stack[-1]["id"]], 0, 0
        else:
            return 0, 0, 0

In [204]:
def get_buffer_context(k, buffer, data):
        if k >= 3:
            return data[buffer[0]["id"]], data[buffer[1]["id"]], data[buffer[2]["id"]]
        elif k >= 2:
            return data[buffer[0]["id"]], data[buffer[1]["id"]], 0
        elif k == 1:
            return data[buffer[0]["id"]], 0, 0
        else:
            return 0, 0, 0

In [436]:
def get_parse_context(word, deps, data):
    if not word or word == -1:
        return 0, "", ""
    deps = deps[word["id"]]
    num = len(deps)
    if not num:
        return num, "", ""
    elif num==1:
        return num, data[deps[-1]-1], ""
    else:
        return num, data[deps[-1]-1], data[deps[-1]-1]

In [237]:
def feature_builder(stack, queue, tree, parse=None, word_index=word_index, pos_index=pos_index, dep_index=dep_index, form="form"):
    words = []
    tags = []
    deps = []
    depth = len(stack)
    q_len = len(queue)
    if ROOT not in tree:
        tree = [ROOT, *tree]
    
    s0, s1, s2 = get_stack_context(depth, stack, tree)
    q0, q1, q2 = get_buffer_context(q_len, queue, tree)

    for x in [s0, s1, s2, q0, q1, q2]:
        if x:
            word = x[form].lower() if x["id"] else "ROOT"
            word_idx = word_index.get(word)
            pos_idx = pos_index.get(x["upostag"])
            words.append(word_idx)
            tags.append(pos_idx)
        else:
            words.append(x)
            tags.append(x)
    
    features = [*words, *tags, *deps]
    return features, len(words), len(tags), len(deps)

In [238]:
def get_data(trees, parser, feature_extractor):
    o_labels = []
    o_features = []
    for tree in trees:
        labels, features, _, n_w, n_t, n_d = parser.parse(tree, feature_extractor=feature_extractor)
        o_labels.extend(labels)
        o_features.extend(features)
    return o_labels, o_features, n_w, n_t, n_d

In [267]:
parser = Parser()

In [392]:
labels, features, n_w, n_t, n_d = get_data(trees+test_trees, parser, feature_builder)
X = np.asarray(features)
y = to_categorical(np.asarray(labels))

In [396]:
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

In [368]:
word_embedding_layer = Embedding(len(word_index)+1,
                            ndim,
                            weights=[embedding_matrix],
                            input_length=n_w,
                            trainable=0
                           )

In [442]:
pos_embedding_layer = Embedding(len(pos_index)+1,
                                100,
                                input_length=n_t,
                                trainable=1
                               )

In [443]:
word_sequence_input = Input(shape=(n_w,), dtype='int32')
word_embedded_sequences = word_embedding_layer(word_sequence_input)

In [444]:
pos_sequence_input = Input(shape=(n_t,), dtype='int32')
pos_embedded_sequences = pos_embedding_layer(pos_sequence_input)

In [445]:
left = Flatten()(word_embedded_sequences)
right = Flatten()(pos_embedded_sequences)
x = concatenate(inputs=[left, right], axis=-1)
#x = concatenate(inputs=[word_embedded_sequences, pos_embedded_sequences], axis=1)
#x = LSTM(256)(x)
x = Dense(200, activation="relu",
          kernel_regularizer=regularizers.l2(1e-8))(x)
x = Dropout(0.3)(x)
preds = Dense(len(parser.label_index), activation='softmax',
             kernel_regularizer=regularizers.l2(1e-8))(x)

In [446]:
model = Model(inputs=[word_sequence_input, pos_sequence_input], outputs=preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adagrad',
              metrics=['acc'])

In [447]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, 6)            0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 6, 300)       7751400     input_9[0][0]                    
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 6, 100)       1900        input_10[0][0]                   
__________________________________________________________________________________________________
flatten_94

In [448]:
model.fit([X_train[:, :n_w], X_train[:, n_w:n_w+n_t],], y_train, 
          validation_data=([X_test[:, :n_w], X_test[:, n_w:n_w+n_t]], y_test), 
          epochs=6, 
          batch_size=128, 
          verbose=1)

Train on 75098 samples, validate on 110272 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fd4f9320438>