In [455]:
from conllu import parse, parse_tree
from pathlib import Path
import os
from collections import OrderedDict
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pandas as pd

In [456]:
# Artificial root token (id 0) in the same field layout as the parsed CoNLL-U
# tokens; the parser seeds its stack with it.
ROOT = OrderedDict(
    id=0,
    form="ROOT",
    lemma="ROOT",
    upostag="ROOT",
    xpostag=None,
    feats=None,
    head=None,
    deprel=None,
    deps=None,
    misc=None,
)

In [457]:
# Local checkout of the UD_Ukrainian-IU treebank under the user's home directory;
# the "*train*" and "*test*" files are globbed from this directory below.
data_dir = Path.home() / "repos/UD_Ukrainian-IU"

In [7]:
# CoNLL-U files are UTF-8 by specification, so pass the encoding explicitly
# instead of relying on the platform default (which breaks the Cyrillic text
# on non-UTF-8 locales). Sorting makes the pick deterministic if several
# files match the pattern.
train_path = sorted(data_dir.glob("*train*"))[0]
with train_path.open(encoding="utf-8") as f:
    data = f.read()

In [461]:
# Parse the raw CoNLL-U text into a list of sentences (each a sequence of token dicts).
trees = parse(data)

In [462]:
# First training sentence, used as a running example in the cells below.
tree = trees[0]

In [294]:
# Print every gold arc of the example sentence as "dependent <- head".
# Head ids are 1-based positions in the sentence; 0 denotes the root.
for node in tree:
    head = node["head"]
    head_form = "root" if head <= 0 else tree[head - 1]["form"]
    print(f"{node['form']} <- {head_form}")

У <- домі
домі <- була
римського <- патриція
патриція <- домі
Руфіна <- патриція
була <- root
прегарна <- фреска
фреска <- була
, <- зображення
зображення <- фреска
Венери <- зображення
та <- Адоніса
Адоніса <- Венери
. <- була


In [374]:
# Display the first token to see which fields each parsed token carries.
tree[0]

OrderedDict([('id', 1),
             ('form', 'У'),
             ('lemma', 'у'),
             ('upostag', 'ADP'),
             ('xpostag', 'Spsl'),
             ('feats', OrderedDict([('Case', 'Loc')])),
             ('head', 2),
             ('deprel', 'case'),
             ('deps', None),
             ('misc', OrderedDict([('Id', '0003')]))])

In [366]:
def get_parse_context(word, deps, data):
    """Return the number of attached children of `word` and its two latest ones.

    Args:
        word: token mapping with an "id" key, or a falsy value / -1 when the
            stack or queue position is empty.
        deps: list indexed by head id, each entry holding the child ids attached
            so far on one side (e.g. `Parse.lefts` or `Parse.rights`).
        data: sequence of sentence tokens; the token with id `i` is expected at
            position `i - 1`.

    Returns:
        Tuple `(count, last_child, second_last_child)`. Missing children are
        reported as empty strings.
    """
    if not word or word == -1:
        return 0, "", ""
    children = deps[word["id"]]
    num = len(children)
    if not num:
        return num, "", ""
    last_child = data[children[-1] - 1]
    if num == 1:
        return num, last_child, ""
    # The second slot is the child attached just before the last one; the
    # previous code returned the last child twice.
    return num, last_child, data[children[-2] - 1]

In [385]:
def extract_features(stack, queue, tree, parse):
    """Build the feature dict describing the current parser configuration.

    Args:
        stack: list of token mappings; the top of the stack is the last element.
        queue: list of token mappings still to be processed; the front is index 0.
        tree: the full sentence. Currently unused, kept for signature compatibility.
        parse: the partial parse state. Currently unused, kept for signature
            compatibility.

    Returns:
        Dict mapping feature names to string values:
        form / tag / lemma / morphological feats of the stack top (``s0-*``) and
        queue front (``q0-*``), the tag of the second stack item (``s1-pos``),
        form and tag of the second queue item (``q1-*``) and the tags of the
        third and fourth queue items (``q2-tag``, ``q3-tag``).

    Note:
        The child-context lookups (`get_parse_context`) that used to be computed
        here were never added to the returned dict, so they were pure overhead on
        every parser step and have been dropped. The emitted features are unchanged.
    """
    features = {}

    # Features for the stack
    if stack:
        s0 = stack[-1]
        features["s0-form"] = s0["form"]
        features["s0-tag"] = s0["upostag"]
        features["s0-lemma"] = s0["lemma"]
        # "feats" is None for tokens without morphological features.
        for k, v in (s0["feats"] or {}).items():
            features[f"s0-{k}"] = v
    if len(stack) > 1:
        features["s1-pos"] = stack[-2]["upostag"]

    # Features for the queue
    if queue:
        q0 = queue[0]
        features["q0-form"] = q0["form"]
        features["q0-tag"] = q0["upostag"]
        features["q0-lemma"] = q0["lemma"]
        for k, v in (q0["feats"] or {}).items():
            features[f"q0-{k}"] = v
    if len(queue) > 1:
        features["q1-form"] = queue[1]["form"]
        features["q1-tag"] = queue[1]["upostag"]
    if len(queue) > 2:
        features["q2-tag"] = queue[2]["upostag"]
    if len(queue) > 3:
        features["q3-tag"] = queue[3]["upostag"]

    return features

In [302]:
class Parse(object):
    """Partial dependency parse: the arcs added so far plus per-head child lists.

    Attributes are documented inline in `__init__`.
    """

    def __init__(self, n):
        # Number of tokens in the sentence.
        self.n = n
        # (child_id, head_id) pairs in the order they were added.
        self.relations = []
        # Token ids in the training data start at 1, so allocate n + 1 slots
        # and index the child lists directly by head id.
        slots = n + 1
        self.lefts = [[] for _ in range(slots)]
        self.rights = [[] for _ in range(slots)]

    def add_relation(self, child, head):
        """Record the arc head -> child and file the child under the head's left or right list."""
        self.relations.append((child, head))
        # Children with a smaller id than the head go left; all others go right.
        side = self.lefts if child < head else self.rights
        side[head].append(child)

In [459]:
class Parser(object):
    """Transition-based dependency parser with "left", "right", "reduce" and "shift" actions.

    Without an oracle, actions are derived from the gold heads stored in the
    tokens (`get_action`), which produces training data. With an oracle, the
    action for each step is predicted from the extracted features.
    """

    def __init__(self):
        pass

    @staticmethod
    def _is_executable(action, stack, q):
        """Return True when `action` can be applied without touching an empty stack or queue."""
        if action == "left" or action == "right":
            return bool(stack) and bool(q)
        if action == "reduce":
            return bool(stack)
        if action == "shift":
            return bool(q)
        return False

    def get_action(self, stack, q, parse):
        """Choose the next action from the gold heads of the stack top and queue front.

        Args:
            stack: list of token mappings (top is last), or None / empty.
            q: list of remaining token mappings (front is first), or None / empty.
            parse: object exposing `relations`, a list of (child_id, head_id) pairs.

        Returns:
            One of "left", "right", "reduce", "shift".

        Raises:
            ValueError: if both the stack and the queue are empty.
        """
        if not stack and not q:
            raise ValueError("cannot choose an action: both stack and queue are empty")
        if not q:
            return "reduce"
        if not stack:
            # Nothing to attach to or reduce; previously this raised TypeError.
            return "shift"

        top, front = stack[-1], q[0]
        if top["head"] == front["id"]:
            return "left"
        if front["head"] == top["id"]:
            return "right"
        # NOTE(review): this tests whether the stack top's *head* already occurs as
        # a parent in the relations, not whether the stack top itself has been
        # attached (its id occurring as a child). Left unchanged because the
        # training labels depend on it -- confirm the intended reduce condition.
        head_is_known_parent = any(parent == top["head"] for _, parent in parse.relations)
        if head_is_known_parent and front["head"] < top["id"]:
            return "reduce"
        return "shift"

    def parse(self, tree, oracle=None, vectorizer=None):
        """Run the transition system over one sentence.

        Args:
            tree: sequence of token mappings for the sentence.
            oracle: optional classifier with `predict`; when given, actions are
                predicted instead of being derived from the gold heads.
            vectorizer: fitted feature vectorizer with `transform`; required
                whenever `oracle` is given.

        Returns:
            Tuple `(labels, features, relations)`: the applied actions, the feature
            dict extracted before each action, and the (child_id, head_id) arcs built.

        Raises:
            ValueError: if `oracle` is given without `vectorizer`.
        """
        if oracle is not None and vectorizer is None:
            raise ValueError("a fitted vectorizer is required when an oracle is given")

        q = list(tree)
        parse = Parse(len(q))
        stack = [ROOT]
        labels = []
        features = []
        while q or stack:
            feature_set = extract_features(stack, q, tree, parse)

            if oracle is not None:
                action = oracle.predict(vectorizer.transform([feature_set]))[0]
                if not self._is_executable(action, stack, q):
                    # A predicted action may be impossible in the current state
                    # (e.g. "shift" with an empty queue); fall back to the one
                    # move that is always available instead of raising IndexError.
                    action = "shift" if q else "reduce"
            else:
                action = self.get_action(stack, q, parse)

            if action == "left":
                parse.add_relation(stack[-1]["id"], q[0]["id"])
                stack.pop()
            elif action == "right":
                parse.add_relation(q[0]["id"], stack[-1]["id"])
                stack.append(q.pop(0))
            elif action == "reduce":
                stack.pop()
            elif action == "shift":
                stack.append(q.pop(0))
            labels.append(action)
            features.append(feature_set)
        return labels, features, parse.relations

In [463]:
# Run the parser on the example sentence without an oracle, i.e. with actions
# derived from the gold heads; one feature dict is collected per action.
parser = Parser()
labels, features, _ = parser.parse(tree)
print(len(labels), len(features))

29 29


In [464]:
def get_data(trees, parser):
    """Parse every sentence and pool the per-step labels and feature dicts.

    Args:
        trees: iterable of sentences accepted by `parser.parse`.
        parser: object whose `parse(tree)` returns `(labels, features, relations)`.

    Returns:
        Tuple `(labels, features)` of two flat, index-aligned lists covering all
        sentences in order.
    """
    all_labels, all_features = [], []
    for sentence in trees:
        step_labels, step_features, _ = parser.parse(sentence)
        all_labels += step_labels
        all_features += step_features
    return all_labels, all_features

##### Prepare train / test data

In [465]:
# Training set: one (feature dict, action label) pair per parser step over all training sentences.
y_train, features_train = get_data(trees, parser)
print(len(features_train), len(y_train))

154709 154709


In [391]:
# CoNLL-U files are UTF-8 by specification, so pass the encoding explicitly
# instead of relying on the platform default. Sorting makes the pick
# deterministic if several files match the pattern.
test_path = sorted(data_dir.glob("*test*"))[0]
with test_path.open(encoding="utf-8") as f:
    test_data = f.read()
test_trees = parse(test_data)

In [439]:
# Test set built the same way as the training set (actions derived from gold heads).
y_test, features_test = get_data(test_trees, parser)
print(len(features_test), len(y_test))

30661 30661


##### Vectorize features

In [426]:
# Fit the feature vocabulary on the training features only, then reuse the
# fitted vectorizer for the test features.
vectorizer = DictVectorizer(sparse=True)
v_train = vectorizer.fit_transform(features_train)
v_test = vectorizer.transform(features_test)

#### Train models

In [505]:
def calc_metrics(y_test, pred, proba=None, labels=None, print_=True, mode="weighted"):
    """Compute classification metrics, a confusion matrix and a text report.

    Args:
        y_test: true labels.
        pred: predicted labels.
        proba: optional scores; when given, ROC AUC is computed from them.
        labels: optional label order used for the confusion matrix and the report.
        print_: when True, print the scores, the confusion matrix and the report.
        mode: averaging mode passed to recall, precision and F1.

    Returns:
        Tuple `(output, report, conf_matrix)`: dict of scores, the classification
        report string and the confusion matrix as a DataFrame.
    """
    output = {}
    if proba is not None:
        output["AUC"] = metrics.roc_auc_score(y_test, proba)

    averaged_scorers = (
        ("Recall", metrics.recall_score),
        ("Precision", metrics.precision_score),
        ("F1", metrics.f1_score),
    )
    for score_name, scorer in averaged_scorers:
        output[score_name] = scorer(y_test, pred, average=mode)
    output["Accuracy"] = metrics.accuracy_score(y_test, pred)

    # Rows are the true labels, columns the predicted ones (prefixed with "pred_").
    columns = None if labels is None else ["pred_" + str(el) for el in labels]
    matrix_values = metrics.confusion_matrix(y_test, pred, labels=labels)
    conf_matrix = pd.DataFrame(matrix_values, columns=columns, index=labels)
    report = metrics.classification_report(y_true=y_test, y_pred=pred, labels=labels)

    if not print_:
        return output, report, conf_matrix

    for key, value in output.items():
        print(f"{key}: {value:0.3f}")
    print("\nConfusion matrix:")
    print(conf_matrix)
    print("\nReport:")
    print(report)
    return output, report, conf_matrix

In [None]:
# Train the action classifier on the vectorized training steps and predict the
# action for every test step.
clf = LogisticRegression(random_state=25, solver="lbfgs", n_jobs=-1)
clf.fit(v_train, y_train)
y_pred = clf.predict(v_test)

In [506]:
# Per-step action classification quality; labels are passed in the classifier's own class order.
output, report, conf_matrix = calc_metrics(y_test, y_pred, labels=clf.classes_)

Recall: 0.828
Precision: 0.829
F1: 0.828
Accuracy: 0.828

Confusion matrix:
        pred_left  pred_reduce  pred_right  pred_shift
left         6437          226         380         323
reduce        471         6536         997         352
right         186          770        5769         492
shift         401          170         503        6648

Report:
             precision    recall  f1-score   support

       left       0.86      0.87      0.87      7366
     reduce       0.85      0.78      0.81      8356
      right       0.75      0.80      0.78      7217
      shift       0.85      0.86      0.86      7722

avg / total       0.83      0.83      0.83     30661



##### Unlabeled attachment score (UAS)

In [492]:
def UAS(trees, oracle=None, vectorizer=None, parser=None):
    """Count correctly predicted head attachments over a set of sentences.

    Args:
        trees: iterable of sentences; every token provides "id" and "head".
        oracle: optional classifier forwarded to `parser.parse`.
        vectorizer: optional fitted vectorizer forwarded to `parser.parse`.
        parser: object with a `parse(tree, oracle=..., vectorizer=...)` method
            returning `(labels, features, relations)`. Defaults to a fresh
            `Parser()`, so the function no longer depends on a notebook-global
            `parser` variable being defined.

    Returns:
        Tuple `(total, tp)`: the number of gold (id, head) pairs and how many of
        them also appear among the predicted relations.
    """
    if parser is None:
        parser = Parser()
    total, tp = 0, 0
    for tree in trees:
        golden = [(node["id"], node["head"]) for node in tree]
        _, _, predicted = parser.parse(tree, oracle=oracle, vectorizer=vectorizer)
        total += len(golden)
        # Set intersection: each gold arc is counted at most once.
        tp += len(set(golden) & set(predicted))
    return total, tp

In [494]:
# Parse the test sentences with the trained classifier choosing the actions and
# report the share of gold (id, head) pairs that were recovered.
total, tp = UAS(test_trees, clf, vectorizer)
print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp / total, 3))

Total: 14939
Correctly defined: 10255
UAS: 0.686
