In [334]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, Dense, Flatten, Conv1D, concatenate, Activation
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics
from keras import Model, Sequential
import numpy as np
from conllu import parse, parse_tree
from pathlib import Path
import os
from collections import OrderedDict
import bz2

In [None]:
class Metrics(keras.callbacks.Callback):
    """Keras callback that records the weighted F1 score on validation data after each epoch.

    The scores are appended to ``self.f1s``, which is reset at the start of
    every training run.

    Parameters
    ----------
    validation_inputs : array or list of arrays, optional
        Inputs handed to ``self.model.predict``. Pass a list with one array
        per model input for multi-input models.
    validation_targets : array, optional
        One-hot encoded targets; the class id is taken with ``argmax`` over axis 1.

    When either argument is omitted the callback keeps the original behaviour
    and reads ``self.validation_data``, treating element 0 as the only model
    input and element 1 as the targets. That layout only fits single-input
    models.
    NOTE(review): newer Keras releases reportedly no longer populate
    ``validation_data`` on callbacks - confirm against the installed version
    and pass the data explicitly if so.
    """

    def __init__(self, validation_inputs=None, validation_targets=None):
        super().__init__()
        # Stored under private names so Keras assigning `validation_data`
        # on the callback cannot overwrite them.
        self._val_inputs = validation_inputs
        self._val_targets = validation_targets

    def on_train_begin(self, logs=None):
        """Start a fresh list of per-epoch F1 scores."""
        self.f1s = []

    def on_epoch_end(self, batch, logs=None):
        """Predict on the validation data and append the weighted F1 score to ``self.f1s``."""
        if self._val_inputs is not None and self._val_targets is not None:
            inputs, targets = self._val_inputs, self._val_targets
        else:
            # Legacy path: single-input layout [inputs, targets, ...].
            inputs, targets = self.validation_data[0], self.validation_data[1]
        probas = np.asarray(self.model.predict(inputs))
        targ = np.argmax(targets, axis=1)
        predict = np.argmax(probas, axis=1)
        self.f1s.append(metrics.f1_score(targ, predict, average="weighted"))

metrs = Metrics()

In [10]:
# bz2-compressed word-vector file; used as the default `filename` of read_embeddings.
vec_filename = "ubercorpus.lowercased.tokenized.300d.bz2"

In [55]:
def read_embeddings(filename=vec_filename, word_index=None):
    """Load word vectors from a bz2-compressed text file.

    The first line of the file must hold two integers,
    ``<number of words> <dimensions>``. Every following line is
    ``<word> <v1> <v2> ...`` separated by whitespace.

    Parameters
    ----------
    filename : str or path-like
        Path of the compressed vector file.
    word_index : container of str, optional
        Anything supporting ``in`` (dict, set, ...). When given, only vectors
        for words it contains are kept. When ``None``, every vector is loaded.

    Returns
    -------
    tuple
        ``(word_2_vec, ndim, words)``: a dict mapping each kept word to a
        float32 numpy vector, and the dimension and word count read from the
        header line.
    """
    word_2_vec = {}
    # Explicit encoding so decoding does not depend on the locale default.
    with bz2.open(filename, "rt", encoding="utf-8") as f:
        words, ndim = map(int, f.readline().strip().split())
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. at end of file): nothing to parse.
                continue
            word = values[0]
            # Filter on whether a vocabulary was supplied, not on the header
            # count, and skip the float conversion for words that are dropped.
            if word_index is not None and word not in word_index:
                continue
            word_2_vec[word] = np.asarray(values[1:], dtype=np.float32)
    return word_2_vec, ndim, words

In [57]:
# Placeholder token used as the head of words that have no head of their own.
# It carries the same ten keys as a parsed token, all None except the four set below.
ROOT = OrderedDict.fromkeys(
    ('id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc')
)
ROOT.update(id=0, form='ROOT', lemma='ROOT', upostag="ROOT")

In [72]:
# Directory expected to contain the UD_Ukrainian-IU train and test files.
data_dir = Path.home() / "repos/UD_Ukrainian-IU"


def _read_treebank_split(directory, pattern):
    """Return the text of the first file (in sorted name order) in `directory` matching `pattern`.

    Raises FileNotFoundError when nothing matches, instead of a bare IndexError.
    """
    matches = sorted(directory.glob(pattern))
    if not matches:
        raise FileNotFoundError("no file matching {!r} in {}".format(pattern, directory))
    # CoNLL-U files are UTF-8; do not rely on the platform's default encoding.
    return matches[0].read_text(encoding="utf-8")


data = _read_treebank_split(data_dir, "*train*")
trees = parse(data)

test_data = _read_treebank_split(data_dir, "*test*")
test_trees = parse(test_data)

In [383]:
# Show every token of the first training sentence with its id and the form of its head.
tree = trees[0]
for node in tree:
    head = node["head"]
    if head > 0:
        # Token ids start at 1, so the head sits at list position head-1.
        head_form = tree[head-1]["form"]
    else:
        head_form = "root"
    print("{} {} <- {}".format(node["form"], node["id"], head_form))

У 1 <- домі
домі 2 <- була
римського 3 <- патриція
патриція 4 <- домі
Руфіна 5 <- патриція
була 6 <- root
прегарна 7 <- фреска
фреска 8 <- була
, 9 <- зображення
зображення 10 <- фреска
Венери 11 <- зображення
та 12 <- Адоніса
Адоніса 13 <- Венери
. 14 <- була


In [377]:
def get_childs(word, tree):
    """Return the tokens of `tree` whose "head" equals the "id" of `word`, in tree order."""
    children = []
    for candidate in tree:
        if candidate["head"] == word["id"]:
            children.append(candidate)
    return children

In [392]:
def get_heights(tree, res=None, word=None, k=0):
    """Map token ids of `tree` to their depth in the dependency tree.

    With the default arguments the result starts as ``{0: 0}`` (the artificial
    root), the sentence root gets depth 1, its dependents depth 2, and so on.

    Parameters
    ----------
    tree : sequence of mappings
        Tokens with at least the keys "id" and "head".
    res : dict, optional
        Dictionary to fill. When ``None`` a new one is started as ``{0: 0}``
        and `k` is reset to 1.
    word : mapping, optional
        Token whose descendants are labelled. When ``None`` the first token
        with a falsy "head" is used and is itself recorded at depth `k`.
    k : int
        Depth of `word`; its direct dependents get ``k + 1``.

    Returns
    -------
    dict
        `res`, mapping token id -> depth.

    Raises
    ------
    IndexError
        If `word` is ``None`` and no token has a falsy "head".
    """
    if res is None:
        res = {0: 0}
        k = 1
    if word is None:
        word = [w for w in tree if not w["head"]][0]
        res[word["id"]] = k

    # Group tokens by head once so the walk below visits each token one time.
    children_of = {}
    for w in tree:
        children_of.setdefault(w["head"], []).append(w)

    # Explicit stack instead of recursion; the original recursive call named
    # a function (`get_height`) that does not exist.
    stack = [(word, k)]
    while stack:
        node, depth = stack.pop()
        for child in children_of.get(node["id"], ()):
            res[child["id"]] = depth + 1
            stack.append((child, depth + 1))
    return res

In [408]:
# Order of the numeric feature columns appended to every record.
_FEATURE_NAMES = ("dist", "height_diff", "n_l_child_word", "n_r_child_word",
                  "n_r_child_head", "n_l_child_head")


def _lookup_or_add(index, key):
    """Return the id stored for `key` in `index`, assigning the next free id (len(index)) if it is new."""
    return index.setdefault(key, len(index))


def _count_children_by_side(tree):
    """Count, per head id, the dependents with a smaller id (left) and with a larger id (right).

    Returns two dicts ``(left, right)`` keyed by head id; heads without
    dependents on a side are simply absent.
    """
    left, right = {}, {}
    for w in tree:
        head_id = w["head"]
        if head_id is None:
            continue
        if w["id"] < head_id:
            left[head_id] = left.get(head_id, 0) + 1
        elif w["id"] > head_id:
            right[head_id] = right.get(head_id, 0) + 1
    return left, right


def build_vocabulary(trees, form="form"):
    """Turn parsed sentences into per-token training records and the indices used to encode them.

    For each token one record is produced describing the token, its head and
    a few structural features. The token's dependency relation is the label.

    Parameters
    ----------
    trees : iterable of token sequences
        Each token is a mapping with the keys "id", "head", "deprel",
        "upostag" and `form`. A head id ``h > 0`` is looked up at list
        position ``h - 1``.
    form : str
        Token key used as the word text (default "form").

    Returns
    -------
    tuple
        ``(records, labels, word_index, label_index, pos_index, n_features)``

        * records: list of tuples ``(word_id, head_id, word_pos_id,
          head_pos_id, dist, height_diff, n_l_child_word, n_r_child_word,
          n_r_child_head, n_l_child_head)``
        * labels: list of label ids, parallel to `records`
        * word_index / label_index / pos_index: dicts mapping text -> id, with
          ids assigned in order of first appearance
        * n_features: number of numeric feature columns (6)
    """
    word_index = {}
    label_index = {}
    pos_index = {}
    records = []
    labels = []
    for tree in trees:
        heights = get_heights(tree)
        # One pass per sentence instead of four scans of the sentence per token.
        n_left, n_right = _count_children_by_side(tree)
        for word in tree:
            has_head = bool(word["head"])
            head = tree[word["head"] - 1] if has_head else ROOT

            word_t = word[form].lower()
            # Lowercase real heads exactly like dependents so one word maps to
            # one id regardless of its role; the ROOT placeholder keeps its
            # own spelling.
            head_t = head[form].lower() if has_head else head[form]

            label_id = _lookup_or_add(label_index, word["deprel"])
            word_id = _lookup_or_add(word_index, word_t)
            head_id = _lookup_or_add(word_index, head_t)
            word_pos_id = _lookup_or_add(pos_index, word["upostag"])
            head_pos_id = _lookup_or_add(pos_index, head["upostag"])

            dist = word["id"] - head["id"]
            height_diff = heights[word["id"]] - heights[head["id"]]
            n_l_child_word = n_left.get(word["id"], 0)
            n_r_child_word = n_right.get(word["id"], 0)
            n_l_child_head = n_left.get(head["id"], 0)
            n_r_child_head = n_right.get(head["id"], 0)

            # Column order must match _FEATURE_NAMES.
            features = [dist, height_diff, n_l_child_word, n_r_child_word, n_r_child_head, n_l_child_head]
            records.append((word_id, head_id, word_pos_id, head_pos_id, *features))
            labels.append(label_id)
    # Taken from the constant so the count is defined even when no token was seen.
    return records, labels, word_index, label_index, pos_index, len(_FEATURE_NAMES)

In [386]:
# Total number of tokens in the training sentences; used as the row offset
# that separates training rows from test rows.
# The built-in sum always yields an int, whereas np.sum of an empty list is the
# float 0.0, which cannot be used as a slice bound.
n_train = sum(len(tree) for tree in trees)

In [409]:
# Encode training and test sentences in one call so both share the same
# word, label and POS ids.
records, labels, word_index, label_index, pos_index, n_feat = build_vocabulary(trees + test_trees)
# One row per token.
X = np.array(records)
# One-hot encode the label ids.
labels = to_categorical(np.array(labels))

In [410]:
# The first n_train rows belong to the training sentences, the remainder to the test sentences.
X_train = X[:n_train]
X_test = X[n_train:]
y_train = labels[:n_train]
y_test = labels[n_train:]

In [None]:
# Read the pretrained vectors for the vocabulary in word_index, plus their dimension;
# the third return value (the word count from the file header) is discarded into `_`.
word_2_vec, ndim, _ = read_embeddings(word_index=word_index)

In [209]:
# All-zero vector used for words that have no pretrained embedding.
DEFAULT_VEC = np.zeros(ndim, dtype=np.float32)

In [326]:
# Row i holds the pretrained vector of the word with id i, or DEFAULT_VEC when none was loaded.
embedding_matrix = np.zeros(shape=(len(word_index), ndim))
for word, i in word_index.items():
    if word in word_2_vec:
        embedding_matrix[i] = word_2_vec[word]
    else:
        embedding_matrix[i] = DEFAULT_VEC

In [412]:
# Word embeddings initialised from embedding_matrix and kept frozen during training;
# each sample supplies two word ids.
word_embedding_layer = Embedding(len(word_index),
                                 ndim,
                                 weights=[embedding_matrix],
                                 input_length=2,
                                 trainable=False  # boolean flag rather than the integer 0
                                )

In [413]:
# POS-tag embeddings learned during training (no pretrained weights);
# each sample supplies two POS ids.
pos_embedding_layer = Embedding(len(pos_index),
                                ndim,
                                input_length=2,
                                trainable=True  # boolean flag rather than the integer 1
                               )

In [414]:
# Model input 1: two integer word ids per sample (token id, head id), looked up in the word embedding layer.
word_sequence_input = Input(shape=(2,), dtype='int32')
word_embedded_sequences = word_embedding_layer(word_sequence_input)

In [415]:
# Model input 2: two integer POS ids per sample (token POS, head POS), looked up in the POS embedding layer.
pos_sequence_input = Input(shape=(2,), dtype='int32')
pos_embedded_sequences = pos_embedding_layer(pos_sequence_input)

In [416]:
# Model input 3: the n_feat numeric feature columns returned alongside the records by build_vocabulary.
features = Input(shape=(n_feat,))

In [417]:
# Flatten each embedded id pair into a single vector.
left = Flatten()(word_embedded_sequences)
right = Flatten()(pos_embedded_sequences)
# Join word embeddings, POS embeddings and the numeric features into one vector.
x = concatenate(inputs=[left, right, features])
# NOTE(review): no activation is passed here, so this layer is presumably
# linear before the ReLU layer below - confirm that is intended.
x = Dense(200)(x)
x = Dense(100, activation='relu')(x)
# One softmax unit per entry in label_index.
preds = Dense(len(label_index), activation='softmax')(x)

In [418]:
# Classifier with three inputs (word-id pair, POS-id pair, numeric features)
# and one softmax output over the dependency labels.
model = Model(
    inputs=[word_sequence_input, pos_sequence_input, features],
    outputs=preds,
)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# Print the model's layer summary.
model.summary()

In [419]:
def _split_columns(rows):
    """Slice a record matrix into the model's three inputs: word ids (columns 0-1), POS ids (2-3), numeric features (4 onward)."""
    return [rows[:, :2], rows[:, 2:4], rows[:, 4:]]


# Train for 5 epochs, evaluating on the test rows after every epoch.
model.fit(_split_columns(X_train), y_train,
          validation_data=(_split_columns(X_test), y_test),
          epochs=5,
          batch_size=128,
          verbose=1)

Train on 75098 samples, validate on 14939 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f21b78b2438>