In [554]:
from conllu import parse, parse_tree
from pathlib import Path
import os
from collections import OrderedDict
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pandas as pd
import xgboost as xgb

In [508]:
def calc_metrics(y_test, pred, proba=None, labels=None, print_=True, mode="weighted"):
    output = {}
    if proba is not None:
        roc_auc = metrics.roc_auc_score(y_test, proba)
        output["AUC"] = roc_auc
    output["Recall"] = metrics.recall_score(y_test, pred, average=mode)
    output["Precision"] = metrics.precision_score(y_test, pred, average=mode)
    output["F1"] = metrics.f1_score(y_test, pred, average=mode)
    output["Accuracy"] = metrics.accuracy_score(y_test, pred)
    if labels is not None:
        index = labels
        columns = ["pred_" + str(el) for el in index]
    else:
        columns = None
        index = None
    conf_matrix = pd.DataFrame(metrics.confusion_matrix(y_test, pred, labels=labels), 
                               columns=columns, index=index)
    report = metrics.classification_report(y_true=y_test, y_pred=pred, labels=labels)
    if print_:
        for key, value in output.items():
            print(f"{key}: {value:0.3f}")
        print("\nConfusion matrix:")
        print(conf_matrix)
        print("\nReport:")
        print(report)
    return output, report, conf_matrix

In [3]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', "ROOT"),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

In [4]:
data_dir = Path.home() / "repos/UD_Ukrainian-IU"

In [5]:
with list(data_dir.glob("*train*"))[0].open() as f:
    data = f.read()

In [6]:
trees = parse(data)

In [10]:
tree = trees[0]
tree[0]

OrderedDict([('id', 1),
             ('form', 'У'),
             ('lemma', 'у'),
             ('upostag', 'ADP'),
             ('xpostag', 'Spsl'),
             ('feats', OrderedDict([('Case', 'Loc')])),
             ('head', 2),
             ('deprel', 'case'),
             ('deps', None),
             ('misc', OrderedDict([('Id', '0003')]))])

In [8]:
for node in tree:
    head = node["head"]
    print("{} <- {}".format(node["form"], tree[head-1]["form"] if head>0 else "root"))

У <- домі
домі <- була
римського <- патриція
патриція <- домі
Руфіна <- патриція
була <- root
прегарна <- фреска
фреска <- була
, <- зображення
зображення <- фреска
Венери <- зображення
та <- Адоніса
Адоніса <- Венери
. <- була


In [11]:
def get_parse_context(word, deps, data):
    if not word or word == -1:
        return 0, "", ""
    deps = deps[word["id"]]
    num = len(deps)
    if not num:
        return num, "", ""
    elif num==1:
        return num, data[deps[-1]-1], ""
    else:
        return num, data[deps[-1]-1], data[deps[-1]-1]

In [562]:
def extract_features(stack, queue, tree, parse):
    features = {}
    stack_depth = len(stack)
    s0 = stack[-1] if stack_depth else -1
    q0 = queue[0] if queue else -1
    
    # Features for stack
    if stack:
        features["s0-form"] = s0["form"]
        features["s0-tag"] = s0["upostag"]
        features["s0-lemma"] = s0["lemma"]
        features["s0-word-tag"] = s0["form"] + s0["upostag"]
        if s0["feats"]:
            for k, v in s0["feats"].items():
                features[f"s0-{k}"] = v
    if stack_depth > 1:
        features["s1-tag"] = stack[-2]["upostag"]
        features["s1-word-tag"] = stack[-2]["form"] + stack[-2]["upostag"]
    
    # Features for queue
    if queue:
        features["q0-form"] = q0["form"]
        features["q0-tag"] = q0["upostag"]
        features["q0-lemma"] = q0["lemma"]
        features["q0-word-tag"] = q0["form"] + q0["upostag"]
        if q0["feats"]:
            for k, v in q0["feats"].items():
                features[f"q0-{k}"] = v 
    if len(queue) > 1:
        features["q1-form"] = queue[1]["form"]
        features["q1-tag"] = queue[1]["upostag"]
        features["q1-word-tag"] = queue[1]["form"] + queue[1]["upostag"]
    if len(queue) > 2:
        features["q2-tag"] = queue[2]["upostag"]
        #features["q2-word-tag"] = queue[2]["form"] + queue[2]["upostag"]
    if len(queue) > 3:
        features["q3-tag"] = queue[3]["upostag"]
        
    if queue and stack:
        Ds0q0 = q0["id"] - s0["id"]
        features["distance"] = Ds0q0
        features["q0-dist"] = q0["form"] + "-{}".format(Ds0q0)
        features["s0-dist"] = s0["form"] + "-{}".format(Ds0q0)
        features["s0q0-dist"] = s0["lemma"] + q0["lemma"] + "-{}".format(Ds0q0)
        features["s0-tag-dist"] = s0["upostag"] + "-{}".format(Ds0q0)
        features["q0-tag-dist"] = q0["upostag"] + "-{}".format(Ds0q0)
        features["s0q0-tag-dist"] = s0["upostag"] + q0["upostag"] + "-{}".format(Ds0q0)
        
    # Left two child for top stack
    Ns0l, s0l1, s0l2 = get_parse_context(s0, parse.lefts, tree) 
    if s0l1:
        features["s0l1"] = s0l1["form"]
        features["s0l1-tag"] = s0l1["upostag"]   
    if s0l2:
        features["s0l2"] = s0l2["form"]
        features["s0l2-tag"] = s0l2["upostag"]
    
    # Right two child for top stack
    Ns0r, s0r1, s0r2 = get_parse_context(s0, parse.rights, tree)
    if s0r1:
        features["s0r1"] = s0r1["form"]
        features["s0r1-tag"] = s0r1["upostag"] 
    if s0r2:
        features["s0r2"] = s0r2["form"]
        features["s0r2-tag"] = s0r2["upostag"]
    
    # Left two child for top queue
    Nq0l, q0l1, q0l2 = get_parse_context(q0, parse.lefts, tree)
    if q0l1:
        features["q0l1"] = q0l1["form"]
        features["q0l1-tag"] = q0l1["upostag"]  
    if q0l2:
        features["q0l2"] = q0l2["form"]
        features["q0l2-tag"] = q0l2["upostag"]
    
    # Right two child for top stack
    Nq0r, q0r1, q0r2 = get_parse_context(q0, parse.rights, tree)
    if q0r1:
        features["q0r1"] = q0r1["form"]
        features["q0r1-tag"] = q0r1["upostag"]    
    if q0r2:
        features["q0r2"] = q0r2["form"]
        features["q0r2-tag"] = q0r2["upostag"]
    
    if stack:
        features["s0l-N"] = s0["form"] + f"-{Ns0l}"
        features["s0r-N"] = s0["form"] + f"-{Ns0r}"
        features["s0l-tag-N"] = s0["upostag"] + f"-{Ns0l}"
        features["s0r-tag-N"] = s0["upostag"] + f"-{Ns0r}"
    if queue:
        features["q0l-N"] = q0["form"] + f"-{Nq0l}"
        features["q0r-N"] = q0["form"] + f"-{Nq0r}"
        features["q0l-tag-N"] = q0["upostag"] + f"-{Nq0l}"
        features["q0r-tag-N"] = q0["upostag"] + f"-{Nq0r}"
    return features

In [13]:
class Parse(object):
    
    def __init__(self, n):
        self.n = n
        self.relations = []
        self.lefts = []
        self.rights = []
        # we need n+1 coz examples in the training data are indexed from 1
        for k in range(n+1):
            self.lefts.append([])
            self.rights.append([])
    
    def add_relation(self, child, head):
        self.relations.append((child, head))
        if child < head:
            self.lefts[head].append(child)
        else:
            self.rights[head].append(child)

In [546]:
class Parser(object):
    
    def __init__(self):
        pass
    
    def get_action(self, stack, q, parse):
        if stack and not q:
            return "reduce"
        if stack[-1]["head"] == q[0]["id"]:
            return "left"
        elif q[0]["head"] == stack[-1]["id"]:
            return "right"
        elif (stack[-1]["head"] in [parent for _, parent in parse.relations] 
              and q[0]["head"] < stack[-1]["id"]):
            return "reduce"
        else:
            return "shift" 
        
    def parse(self, tree, oracle=None, vectorizer=None):
        q = tree.copy()
        parse = Parse(len(q))
        stack = [ROOT]
        labels = []
        features = []
        while q or stack:
            feature_set = extract_features(stack, q, tree, parse)
            
            if oracle is not None:
                v_features = vectorizer.transform(feature_set)
                action = oracle.predict(v_features)[0]
            else:
                action = self.get_action(stack or None, q or None, parse)
            
            if action == "left":
                parse.add_relation(stack[-1]["id"], q[0]["id"])
                stack.pop()
            elif action == "right":
                parse.add_relation(q[0]["id"], stack[-1]["id"])
                stack.append(q.pop(0))
            elif action == "reduce":
                stack.pop()
            elif action == "shift":
                stack.append(q.pop(0))              
            labels.append(action)
            features.append(feature_set)
        return labels, features, parse.relations

In [547]:
parser = Parser()
labels, features, _ = parser.parse(tree)
print(len(labels), len(features))

41 41


In [16]:
def get_data(trees, parser):
    o_labels = []
    o_features = []
    for tree in trees:
        labels, features, _ = parser.parse(tree)
        o_labels.extend(labels)
        o_features.extend(features)
    return o_labels, o_features

##### Prepare train / test data

In [55]:
with list(data_dir.glob("*test*"))[0].open() as f:
    test_data = f.read()
test_trees = parse(test_data)

In [563]:
y_train, features_train = get_data(trees, parser)
print(len(features_train), len(y_train))

154709 154709


In [564]:
y_test, features_test = get_data(test_trees, parser)
print(len(features_test), len(y_test))

30661 30661


##### Vectorize features

In [565]:
vectorizer = DictVectorizer(sparse=True)
v_train = vectorizer.fit_transform(features_train)
v_test = vectorizer.transform(features_test)

#### Train models

In [566]:
clf = LogisticRegression(random_state=25)
clf.fit(v_train, y_train)
y_pred = clf.predict(v_test)

In [567]:
output, report, conf_matrix = calc_metrics(y_test, y_pred, labels=clf.classes_)

Recall: 0.894
Precision: 0.894
F1: 0.894
Accuracy: 0.894

Confusion matrix:
        pred_left  pred_reduce  pred_right  pred_shift
left         6983          139          54         190
reduce        377         6954         838         187
right          59          614        6312         232
shift         220          132         211        7159

Report:
             precision    recall  f1-score   support

       left       0.91      0.95      0.93      7366
     reduce       0.89      0.83      0.86      8356
      right       0.85      0.87      0.86      7217
      shift       0.92      0.93      0.92      7722

avg / total       0.89      0.89      0.89     30661



##### Add UAS calculation

In [532]:
def UAS(trees, oracle=None, vectorizer=None):
    total, tp, failed = 0, 0, 0
    for tree in trees:
        try:
            golden = [(node["id"], node["head"]) for node in tree]
            _, _, predicted = parser.parse(tree, oracle=oracle, vectorizer=vectorizer)
            total += len(golden)
            tp += len(set(golden).intersection(set(predicted))) 
        except:
            failed += 1
    return total, tp, failed

In [568]:
total, tp, failed = UAS(test_trees, clf, vectorizer)
print("Failed:", failed)
print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp / total, 3))

Failed: 10
Total: 14602
Correctly defined: 11277
UAS: 0.772


#### Try XGBoost