In [1]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, Dense, Flatten, Conv1D, concatenate, Activation
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics
from keras import Model, Sequential
import numpy as np
from conllu import parse, parse_tree
from pathlib import Path
import os
from collections import OrderedDict
import bz2

Using TensorFlow backend.


In [2]:
class Metrics(keras.callbacks.Callback):
    """Keras callback that records the weighted F1 score on the validation data.

    One score per epoch is appended to ``self.f1s``.
    """

    def on_train_begin(self, logs=None):
        """Reset the per-epoch F1 history at the start of training."""
        self.f1s = []

    def on_epoch_end(self, batch, logs=None):
        """Predict on the validation data and append the weighted F1 score.

        The first positional argument keeps its original name ``batch`` for
        signature compatibility; Keras passes the epoch index here.
        """
        # Keras 2.x attaches validation data to callbacks as one flat list:
        # all model inputs first, then the targets (then sample weights).
        # Indexing by the number of model inputs keeps single-input behaviour
        # identical while also supporting multi-input models.
        # NOTE(review): confirm this layout for the installed Keras version.
        n_inputs = len(self.model.inputs)
        val_data = self.validation_data
        val_x = val_data[0] if n_inputs == 1 else list(val_data[:n_inputs])
        val_y = val_data[n_inputs]

        probas = np.asarray(self.model.predict(val_x))
        targ = np.argmax(val_y, axis=1)
        predict = np.argmax(probas, axis=1)
        self.f1s.append(metrics.f1_score(targ, predict, average="weighted"))


metrs = Metrics()

In [3]:
# Path to the bz2-compressed text file with pre-trained word vectors;
# used as the default `filename` of read_embeddings().
vec_filename = "ubercorpus.lowercased.tokenized.300d.bz2"

In [4]:
def read_embeddings(filename=vec_filename, word_index=None):
    """Read word vectors from a bz2-compressed text file.

    The first line of the file holds two integers: the number of words and the
    vector dimensionality. Every following line holds a word followed by its
    vector components, separated by whitespace.

    Args:
        filename: path to the bz2-compressed vector file.
        word_index: optional container of words to keep. When given, only
            vectors for words found in it are loaded; when ``None``, every
            vector in the file is loaded.

    Returns:
        A tuple ``(word_2_vec, ndim, words)``: the word -> float32 vector
        mapping, the dimensionality from the header, and the word count from
        the header.
    """
    word_2_vec = {}
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with bz2.open(filename, "rt", encoding="utf-8") as f:
        words, ndim = map(int, f.readline().strip().split())
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            word = values[0]
            # Filter on the caller's vocabulary, not on the header word count:
            # the count is always truthy, which made word_index=None crash.
            if word_index is not None and word not in word_index:
                continue
            word_2_vec[word] = np.asarray(values[1:], dtype=np.float32)
    return word_2_vec, ndim, words

In [5]:
# Artificial token with id 0 that stands in as the head of a word whose own
# head is empty/zero. It carries the same keys as the parsed tokens so it can
# be indexed the same way ('form', 'upostag', 'id', ...).
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', "ROOT"),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

In [6]:
# Directory with the treebank files; the train/test files are located by
# glob pattern inside it.
data_dir = Path.home() / "repos/UD_Ukrainian-IU"


def _read_split(pattern):
    """Return the text of the first file in ``data_dir`` matching ``pattern``.

    Matches are sorted so the choice is deterministic when several files
    match, and the file is decoded as UTF-8 regardless of the locale.
    """
    matches = sorted(data_dir.glob(pattern))
    if not matches:
        raise FileNotFoundError(
            "no file matching {!r} in {}".format(pattern, data_dir))
    with matches[0].open(encoding="utf-8") as f:
        return f.read()


data = _read_split("*train*")
trees = parse(data)

test_data = _read_split("*test*")
test_trees = parse(test_data)

In [7]:
# Sanity check: print each token of the first training sentence together with
# its id and the form of its head ("root" when the head id is not positive).
tree = trees[0]
for node in tree:
    head = node["head"]
    print("{} {} <- {}".format(
        node["form"],
        node["id"],
        "root" if head <= 0 else tree[head-1]["form"],
    ))

У 1 <- домі
домі 2 <- була
римського 3 <- патриція
патриція 4 <- домі
Руфіна 5 <- патриція
була 6 <- root
прегарна 7 <- фреска
фреска 8 <- була
, 9 <- зображення
зображення 10 <- фреска
Венери 11 <- зображення
та 12 <- Адоніса
Адоніса 13 <- Венери
. 14 <- була


In [31]:
def get_childs(word, tree):
    """Return the tokens of ``tree`` whose ``head`` equals the id of ``word``.

    The tokens are returned in the order in which they appear in ``tree``.
    """
    children = []
    for candidate in tree:
        if candidate["head"] == word["id"]:
            children.append(candidate)
    return children

In [38]:
def get_heights(tree, res=None, word=None, k=0):
    """Map token ids to their depth in the dependency tree.

    Called as ``get_heights(tree)``, the result contains ``{0: 0}`` for the
    artificial root, depth 1 for the first token whose ``head`` is empty or
    zero, and ``parent depth + 1`` for every token reachable from it.

    Args:
        tree: sequence of tokens, each with ``"id"`` and ``"head"`` keys.
        res: optional dict to fill in; a fresh ``{0: 0}`` is created when
            ``None`` (in which case ``k`` is reset to 1).
        word: token whose descendants are visited; defaults to the root token,
            which is itself recorded at depth ``k``.
        k: depth of ``word``.

    Returns:
        The ``res`` dict (token id -> depth).

    Raises:
        IndexError: if ``tree`` is non-empty but has no root token.
    """
    if res is None:
        res = {0: 0}
        k = 1
    if word is None:
        if not tree:
            # Nothing to measure in an empty sentence.
            return res
        roots = [w for w in tree if not w["head"]]
        if not roots:
            raise IndexError(
                "tree has no root token (no token with an empty or zero head)")
        word = roots[0]
        res[word["id"]] = k

    # Index dependents by head id once instead of rescanning the whole tree
    # for every visited token.
    children = {}
    for w in tree:
        children.setdefault(w["head"], []).append(w)

    # Iterative walk; `seen` guards against malformed input with cycles.
    seen = {word["id"]}
    stack = [(word, k)]
    while stack:
        node, depth = stack.pop()
        for child in children.get(node["id"], ()):
            if child["id"] in seen:
                continue
            seen.add(child["id"])
            res[child["id"]] = depth + 1
            stack.append((child, depth + 1))
    return res

In [185]:
def build_vocabulary(trees, form="form"):
    """Assign integer ids to lower-cased word forms and to POS tags.

    The artificial root entries (``"root"`` and ``ROOT["upostag"]``) get id 0;
    every other word/tag gets the next free id in order of first appearance.
    Ids are therefore unique and contiguous in ``0 .. len(index) - 1``, so
    ``len(index)`` is a valid size for a lookup table indexed by them.

    Previously the root entries had id 1 and the first real word/tag was also
    given id 1, so both shared one id.

    Args:
        trees: iterable of sentences, each an iterable of tokens.
        form: token key used as the word text (lower-cased before indexing).

    Returns:
        ``(word_index, pos_index)``: dicts mapping word text / POS tag to id.
    """
    word_index = {"root": 0}
    pos_index = {ROOT["upostag"]: 0}
    for tree in trees:
        for word in tree:
            # setdefault keeps the id of an already known entry; otherwise the
            # current size is the next unused id.
            word_index.setdefault(word[form].lower(), len(word_index))
            pos_index.setdefault(word["upostag"], len(pos_index))
    return word_index, pos_index

In [234]:
def _count_children(tree):
    """Count, per head id, the dependents located left and right of the head."""
    left, right = {}, {}
    for w in tree:
        head_id = w["head"]
        if head_id is None:
            # Tokens without a head are nobody's dependent.
            continue
        if w["id"] < head_id:
            left[head_id] = left.get(head_id, 0) + 1
        elif w["id"] > head_id:
            right[head_id] = right.get(head_id, 0) + 1
    return left, right


def build_features(trees, word_index, pos_index, form="form"):
    """Build one feature record and one label id per token.

    Each record is a flat tuple laid out as::

        (word id, head word id,
         word POS id, head POS id,
         distance, height difference, word height,
         left children of word, right children of word,
         right children of head, left children of head)

    A word whose head is empty/zero uses the module-level ``ROOT`` token as
    its head. Words or tags missing from the given indices yield ``None``.

    Args:
        trees: iterable of sentences, each a sequence of tokens with 1-based
            consecutive ids.
        word_index: mapping lower-cased word text -> id.
        pos_index: mapping POS tag -> id.
        form: token key used as the word text.

    Returns:
        ``(records, labels, label_index, n_words, n_tags, n_num_features)``:
        the feature tuples, the label id of each record, the
        deprel -> label id mapping, and the width of each of the three
        record sections.
    """
    # Section widths are fixed by the record layout, so they are also defined
    # when `trees` is empty.
    n_words, n_tags, n_num_features = 2, 2, 7

    label_index = {}
    records = []
    labels = []
    for tree in trees:
        heights = get_heights(tree)
        # Count dependents once per sentence instead of rescanning the
        # sentence four times for every token.
        left_children, right_children = _count_children(tree)
        for word in tree:
            deprel = word["deprel"]
            word_t = word[form].lower()
            head = tree[word["head"]-1] if word["head"] else ROOT
            head_t = head[form].lower()
            word_pos = word["upostag"]
            head_pos = head["upostag"]
            word_id = word["id"]
            head_id = head["id"]

            # Label ids are assigned in order of first appearance.
            label_id = label_index.setdefault(deprel, len(label_index))

            dist = word_id - head_id
            height_diff = heights[word_id] - heights[head_id]
            n_l_child_word = left_children.get(word_id, 0)
            n_r_child_word = right_children.get(word_id, 0)
            n_l_child_head = left_children.get(head_id, 0)
            n_r_child_head = right_children.get(head_id, 0)

            # construct final feature vector
            # (head counts are right-then-left; the order is kept as it was)
            num_features = [dist, height_diff, heights[word_id],
                            n_l_child_word, n_r_child_word,
                            n_r_child_head, n_l_child_head
                           ]
            words = [word_index.get(word_t), word_index.get(head_t)]
            tags = [pos_index.get(word_pos), pos_index.get(head_pos)]
            records.append((*words, *tags, *num_features))
            labels.append(label_id)
    return records, labels, label_index, n_words, n_tags, n_num_features

In [19]:
# Total number of tokens in the training trees; used below as the boundary
# when slicing the combined arrays into train and test parts.
n_train = np.sum([len(tree) for tree in trees])

In [203]:
# The vocabulary is built over the train and test trees together.
word_index, pos_index = build_vocabulary(trees+test_trees)

In [235]:
# Features are extracted for train and test trees in one pass so both share
# the same label_index; labels are then one-hot encoded.
records, labels, label_index, n_words, n_tags, n_feat = build_features(trees+test_trees, word_index, pos_index)
labels = to_categorical(np.asarray(labels))
X = np.asarray(records)

In [236]:
# Records were built from trees+test_trees in that order, so the first
# n_train rows belong to the training trees and the rest to the test trees.
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = labels[:n_train], labels[n_train:]

In [None]:
# Load pre-trained vectors restricted to the words in word_index; the third
# return value (word count from the file header) is not needed.
word_2_vec, ndim, _ = read_embeddings(word_index=word_index)

In [118]:
# All-zero vector used for vocabulary words that have no pre-trained vector.
DEFAULT_VEC = np.zeros(ndim, np.float32)

In [237]:
# Row i of the matrix holds the vector of the word whose index is i; words
# without a pre-trained vector get DEFAULT_VEC.
embedding_matrix = np.zeros((len(word_index), ndim))
for word, i in word_index.items():
    embedding_matrix[i] = word_2_vec.get(word, DEFAULT_VEC)

In [238]:
# Word embeddings are initialised from the pre-trained matrix and kept frozen
# (the flag is now a proper boolean rather than 0).
word_embedding_layer = Embedding(len(word_index),
                            ndim,
                            weights=[embedding_matrix],
                            input_length=n_words,
                            trainable=False
                           )

In [239]:
# POS-tag embeddings have no pre-trained weights and are learned during
# training (the flag is now a proper boolean rather than 1).
pos_embedding_layer = Embedding(len(pos_index),
                                ndim,
                                input_length=n_tags,
                                trainable=True
                               )

In [240]:
# Model input with the n_words word ids of a record, passed through the word
# embedding layer.
word_sequence_input = Input(shape=(n_words,), dtype='int32')
word_embedded_sequences = word_embedding_layer(word_sequence_input)

In [241]:
# Model input with the n_tags POS ids of a record, passed through the POS
# embedding layer.
pos_sequence_input = Input(shape=(n_tags,), dtype='int32')
pos_embedded_sequences = pos_embedding_layer(pos_sequence_input)

In [242]:
# Model input with the n_feat numeric features of a record; it is fed to the
# network without an embedding.
features = Input(shape=(n_feat,))

In [243]:
# Flatten both embedding outputs, join them with the numeric features and
# classify with a small dense network ending in a softmax over all labels.
left = Flatten()(word_embedded_sequences)
right = Flatten()(pos_embedded_sequences)
x = concatenate(inputs=[left, right, features])
# NOTE(review): this layer is given no activation argument, unlike the next
# one — confirm that is intended.
x = Dense(200)(x)
x = Dense(100, activation='relu')(x)
preds = Dense(len(label_index), activation='softmax')(x)

In [244]:
# Three-input model (word ids, POS ids, numeric features) with one softmax
# output, trained with categorical cross-entropy to match the one-hot labels.
model = Model(inputs=[word_sequence_input, pos_sequence_input, features], outputs=preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [245]:
def _split_inputs(X):
    """Slice a record matrix into [word ids, POS ids, numeric features]."""
    word_end = n_words
    tag_end = n_words + n_tags
    return [X[:, :word_end], X[:, word_end:tag_end], X[:, tag_end:]]


# Train on the training rows and evaluate on the test rows after each epoch.
model.fit(
    _split_inputs(X_train),
    y_train,
    validation_data=(_split_inputs(X_test), y_test),
    epochs=5,
    batch_size=128,
    verbose=1,
)

Train on 75098 samples, validate on 14939 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f21550cadd8>

In [256]:
# Inverse of label_index: label id -> dependency relation name.
idx_2_label = {v:k for k,v in label_index.items()}

In [262]:
# Predict labels for the first two test records and map the class ids back
# to relation names.
# NOTE(review): this rebinds `preds`, which the model-definition cell binds
# to the output tensor, to an array of class ids — re-running the Model(...)
# cell afterwards would pick up the array instead.
preds = np.argmax(model.predict([X_test[:2, :n_words], X_test[:2, n_words:n_words+n_tags], X_test[:2, n_words+n_tags:]]), axis=1)
[idx_2_label[p] for p in preds]

['nsubj', 'amod']