In [23]:
!pip install conllu




## 1) Reading CoNLL-U Files

We need to load .conllu treebank files. Each sentence is represented as a list of tokens, where each token keeps its:

*   **id**: word index
*   **form**: surface word
*   **upos**: POS tag
*   **head**: gold head index
*   **deprel**: gold dependency label

We also insert a synthetic `ROOT` token at `id=0` with a head of `-1` and a deprel of `root`.

In [24]:
from conllu import parse

def read_conllu(path):
    """Load a CoNLL-U file into a list of sentences.

    Each sentence is a list of token dicts with the keys ``id``, ``form``,
    ``upos``, ``head`` and ``deprel``, sorted by ``id`` and prefixed with a
    synthetic ROOT token (``id=0``, ``head=-1``, ``deprel="root"``).

    Args:
        path: Location of the ``.conllu`` file (read as UTF-8).

    Returns:
        A list of sentences; sentences without any integer-id token are
        dropped.
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()

    sentences = []
    for token_list in parse(raw_text):
        # Only entries whose id is a plain int are kept; presumably this
        # filters out multiword-token ranges and empty nodes (verify against
        # the conllu package's id representation).
        words = [
            {
                "id": entry["id"],
                "form": entry.get("form", ""),
                "upos": entry.get("upos", entry.get("xpostag", "_")),
                "head": entry.get("head", -1),
                "deprel": entry.get("deprel", "_"),
            }
            for entry in token_list
            if isinstance(entry["id"], int)
        ]
        if not words:
            continue
        root = {"id": 0, "form": "ROOT", "upos": "ROOT", "head": -1, "deprel": "root"}
        sentences.append(sorted([root] + words, key=lambda item: item["id"]))
    return sentences


In [25]:
from collections import defaultdict

def _gold_maps(sentence):
    heads = {t["id"]: t["head"] for t in sentence}
    deprel = {t["id"]: t["deprel"] for t in sentence}
    children = defaultdict(list)
    for tid, hid in heads.items():
        if isinstance(hid, int) and hid != -1:
            children[hid].append(tid)
    return heads, deprel, children

def oracle_arc_standard(sentence):
    """Derive arc-standard training transitions from a gold-annotated sentence.

    Starting from ``stack=[0]`` and a buffer of all non-ROOT ids, the oracle
    repeatedly chooses LEFT-ARC, RIGHT-ARC or SHIFT (in that priority) by
    consulting the gold heads. An arc is only emitted once every gold child of
    the dependent has already been attached.

    Args:
        sentence: List of token dicts (ROOT has ``id`` 0).

    Returns:
        A list of ``((stack_tuple, buffer_tuple), action)`` pairs. The
        sequence stops early if no arc applies and the buffer is empty.
    """
    gold_heads, _, gold_children = _gold_maps(sentence)

    stack = [0]
    buffer = [t["id"] for t in sentence if t["id"] != 0]
    attached = set()      # unlabeled (head, dep) pairs created so far
    transitions = []

    def subtree_complete(node):
        for child in gold_children.get(node, []):
            if (node, child) not in attached:
                return False
        return True

    while buffer or len(stack) > 1:
        snapshot = (tuple(stack), tuple(buffer))

        action = None
        if len(stack) >= 2:
            top, below = stack[-1], stack[-2]
            if below != 0 and gold_heads.get(below) == top and subtree_complete(below):
                action = "LEFT-ARC"      # top governs below; below leaves the stack
            elif gold_heads.get(top) == below and subtree_complete(top):
                action = "RIGHT-ARC"     # below governs top; top leaves the stack

        if action is None:
            if not buffer:
                # Nothing applicable and nothing left to shift: give up.
                break
            action = "SHIFT"

        transitions.append((snapshot, action))
        if action == "LEFT-ARC":
            attached.add((top, below))
            del stack[-2]
        elif action == "RIGHT-ARC":
            attached.add((below, top))
            stack.pop()
        else:
            stack.append(buffer.pop(0))
    return transitions


## 3) Transition Features

The assignment asks for the following categorical features:

*   POS of stack top (s0)
*   POS of second item on stack (s1)
*   POS of first item in buffer (b0)
*   POS of second item in buffer (b1)

If an element doesn’t exist, we use `<NULL>`.

In [26]:
def extract_features(config, sentence):
    """Build the four UPOS features for a parser configuration.

    Args:
        config: ``(stack, buffer)`` pair of token-id sequences.
        sentence: List of token dicts providing ``id`` and ``upos``.

    Returns:
        Dict with keys ``s0_upos``, ``s1_upos``, ``b0_upos`` and ``b1_upos``.
        A slot that does not exist, or whose id is not in ``sentence``, is
        ``"<NULL>"``.
    """
    stack, buffer = config
    token_by_id = {entry["id"]: entry for entry in sentence}

    # (feature name, sequence, index into it, minimum length for the slot to exist)
    slots = (
        ("s0_upos", stack, -1, 1),
        ("s1_upos", stack, -2, 2),
        ("b0_upos", buffer, 0, 1),
        ("b1_upos", buffer, 1, 2),
    )

    feats = {}
    for name, sequence, index, needed in slots:
        value = "<NULL>"
        if len(sequence) >= needed:
            tid = sequence[index]
            if tid in token_by_id:
                value = token_by_id[tid]["upos"]
        feats[name] = value
    return feats


## 4) Train the Transition Classifier

We collect all (features, action) pairs from the oracle, vectorize them, and train a Logistic Regression classifier.

In [27]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

def build_training_data(train_sents):
    """Turn gold sentences into (feature dict, action) training examples.

    Args:
        train_sents: Iterable of gold-annotated sentences.

    Returns:
        ``(X_dict, y)``: a list of feature dicts from ``extract_features`` and
        the parallel list of oracle actions.
    """
    examples = [
        (extract_features(config, sentence), action)
        for sentence in train_sents
        for config, action in oracle_arc_standard(sentence)
    ]
    feature_dicts = [features for features, _ in examples]
    actions = [action for _, action in examples]
    return feature_dicts, actions

def train_classifier(train_sents):
    """Fit the transition classifier on oracle-derived examples.

    Args:
        train_sents: Gold-annotated training sentences.

    Returns:
        ``(clf, vec)``: the fitted ``LogisticRegression`` model and the
        ``DictVectorizer`` that encodes feature dicts for it.
    """
    feature_dicts, actions = build_training_data(train_sents)

    vectorizer = DictVectorizer(sparse=True)
    design_matrix = vectorizer.fit_transform(feature_dicts)

    # class_weight="balanced" reweights the action classes; the seed is fixed.
    settings = {
        "max_iter": 300,
        "n_jobs": -1,
        "class_weight": "balanced",
        "random_state": 42,
    }
    model = LogisticRegression(**settings)
    model.fit(design_matrix, actions)
    return model, vectorizer


## 5) Label Features (for predicting dependency labels)

We need to assign labels (e.g., `nsubj`, `det`) to arcs.
Features for (head, dep) are very simple:

*   UPOS of head and dependent
*   Relative direction (L or R)
*   Distance bucket (0, 1, 2–3, 4–7, 8+)
*   Neighbor POS tags for local context

In [28]:
def _choose_single_root(root_children_ids, sent_tokmap):
    # prefer leftmost VERB, else leftmost NOUN/PROPN/ADJ/ADV, else leftmost
    for tid in root_children_ids:
        if sent_tokmap[tid]["upos"] == "VERB":
            return tid
    for tid in root_children_ids:
        if sent_tokmap[tid]["upos"] in {"NOUN","PROPN","ADJ","ADV"}:
            return tid
    return root_children_ids[0] if root_children_ids else 0

def _postprocess_single_root(arcs, sentence):
    tok = {t["id"]: t for t in sentence}
    root_children = [d for (h, d) in arcs if h == 0 and tok[d]["upos"] != "PUNCT"]
    if len(root_children) <= 1:
        return arcs
    main_root = _choose_single_root(sorted(root_children), tok)
    new_arcs = []
    for (h, d) in arcs:
        if h == 0 and d != main_root and tok[d]["upos"] != "PUNCT":
            new_arcs.append((main_root, d))
        else:
            new_arcs.append((h, d))
    return new_arcs


## 7) Parser Loop (Structure + Label Prediction)

Parsing happens in two steps:

1.  Predict structure (arcs without labels) using the transition model.
    *   Stack/Buffer transitions until parsing ends.
    *   Safeguards: single-head, no ROOT as dependent, SHIFT fallback, attach headless tokens to ROOT.
    *   Postprocess: force a single root (pick a sensible one).
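2.  Assign a label to each arc. No label classifier is trained in this notebook: by default `parse_with_model` copies each dependent's stored `deprel` field (the gold label on treebank data).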

In [29]:
def _choose_single_root(root_children_ids, sent_tokmap):
    # prefer leftmost VERB, else leftmost NOUN/PROPN/ADJ/ADV, else leftmost
    for tid in root_children_ids:
        if sent_tokmap[tid]["upos"] == "VERB":
            return tid
    for tid in root_children_ids:
        if sent_tokmap[tid]["upos"] in {"NOUN","PROPN","ADJ","ADV"}:
            return tid
    return root_children_ids[0] if root_children_ids else 0

def _postprocess_single_root(arcs, sentence):
    tok = {t["id"]: t for t in sentence}
    # Ensure we are unpacking 3 values here
    root_children = [d for (h, d, lab) in arcs if h == 0 and tok[d]["upos"] != "PUNCT"]
    if len(root_children) <= 1:
        return arcs
    main_root = _choose_single_root(sorted(root_children), tok)
    new_arcs = []
    # Ensure we are unpacking 3 values here
    for (h, d, lab) in arcs:
        if h == 0 and d != main_root and tok[d]["upos"] != "PUNCT":
            new_arcs.append((main_root, d, lab))
        else:
            new_arcs.append((h, d, lab))
    return new_arcs

def parse_with_model(sentence, clf, vec, label_fn=None):
    """Greedily parse one sentence with the trained transition classifier.

    The arc-standard loop asks ``clf`` for an action at every configuration.
    An action that cannot be applied falls back to SHIFT, and parsing stops
    when nothing can be shifted either. Tokens still without a head are then
    attached to ROOT and ``_postprocess_single_root`` is applied. Labels are
    assigned last, once the final heads are known.

    NOTE(review): with the default ``label_fn`` every arc gets the label
    stored on the dependent token itself (``deprel``). For treebank
    sentences that is the gold label, so a score that compares labels
    against gold only measures head accuracy. Pass ``label_fn`` to plug in a
    real label predictor.

    Args:
        sentence: List of token dicts (ROOT has ``id`` 0).
        clf: Fitted classifier exposing ``predict``.
        vec: Fitted vectorizer exposing ``transform`` for feature dicts.
        label_fn: Optional callable ``(head_id, dep_id, sentence) -> label``.
            Defaults to returning the dependent's own ``deprel`` (``"_"`` if
            the key is absent).

    Returns:
        List of ``(head, dep, label)`` tuples, one per non-ROOT token.
    """
    tok = {t["id"]: t for t in sentence}

    if label_fn is None:
        def label_fn(head_id, dep_id, sent):
            # Backward-compatible default: reuse the label stored on the token.
            return tok[dep_id].get("deprel", "_")

    stack = [0]
    buffer = [t["id"] for t in sentence if t["id"] != 0]
    arcs = []          # (head, dep, None) until labels are filled in below
    has_head = set()   # dependents that already received a head

    def can_attach(dep_id):
        # ROOT may never become a dependent, and each token gets one head.
        return isinstance(dep_id, int) and dep_id != 0 and dep_id not in has_head

    def attach(head_id, dep_id):
        arcs.append((head_id, dep_id, None))
        has_head.add(dep_id)

    while buffer or len(stack) > 1:
        feats = extract_features((tuple(stack), tuple(buffer)), sentence)
        action = clf.predict(vec.transform([feats]))[0]

        applied = False
        if action == "SHIFT":
            if buffer:
                stack.append(buffer.pop(0))
                applied = True
        elif len(stack) >= 2:
            s0, s1 = stack[-1], stack[-2]
            if action == "LEFT-ARC" and can_attach(s1):
                attach(s0, s1)
                stack.pop(-2)
                applied = True
            elif action == "RIGHT-ARC" and can_attach(s0):
                attach(s1, s0)
                stack.pop()
                applied = True

        # Fall back to SHIFT so the loop always makes progress.
        if not applied:
            if not buffer:
                break
            stack.append(buffer.pop(0))

    # Give every remaining token a head by attaching it to ROOT.
    for t in sentence:
        tid = t["id"]
        if tid != 0 and tid not in has_head:
            attach(0, tid)

    # Settle the structure first, then label against the final heads.
    arcs = _postprocess_single_root(arcs, sentence)
    return [(head, dep, label_fn(head, dep, sentence)) for (head, dep, _) in arcs]

## 8) LAS Evaluation

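We evaluate on the dev set. LAS = % of tokens whose predicted (head, label) pair matches gold. Note: `las_score` parses with labels copied from each token's gold `deprel` field, so the number reported below is effectively UAS (head accuracy only).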

In [30]:
def las_score(dev_sents, clf, vec):
    """Compute the labeled attachment score (as a percentage) over sentences.

    A token counts as correct when the parser's ``(head, label)`` for it
    equals the token's own ``(head, deprel)``. ROOT (``id`` 0) is not scored.

    Args:
        dev_sents: Iterable of gold-annotated sentences.
        clf: Fitted transition classifier passed on to ``parse_with_model``.
        vec: Fitted vectorizer passed on to ``parse_with_model``.

    Returns:
        The score in the range 0-100, or ``0.0`` when no token was scored.
    """
    n_tokens = 0
    n_hits = 0
    for sentence in dev_sents:
        gold = {
            t["id"]: (t["head"], t["deprel"])
            for t in sentence
            if t["id"] != 0
        }
        predicted = {
            dep: (head, label)
            for head, dep, label in parse_with_model(sentence, clf, vec)
        }
        n_tokens += len(gold)
        # A token missing from the prediction yields None and never matches.
        n_hits += sum(
            1 for tid, target in gold.items() if predicted.get(tid) == target
        )
    if not n_tokens:
        return 0.0
    return (n_hits / n_tokens) * 100

## 9) Test on the Three Sentences

We create fake sentences (with words + UPOS), parse them, and print predicted heads + labels.

In [31]:
def make_sentence_from_words(words, upos):
    """Build an unannotated sentence from parallel word and UPOS lists.

    Args:
        words: Surface forms.
        upos: UPOS tags, same length as ``words`` (asserted).

    Returns:
        A list of token dicts starting with the synthetic ROOT token; every
        real token gets a 1-based ``id``, ``head=-1`` and ``deprel="dep"``.
    """
    assert len(words) == len(upos)
    root = {"id": 0, "form": "ROOT", "upos": "ROOT", "head": -1, "deprel": "root"}
    tokens = [
        {"id": position, "form": word, "upos": tag, "head": -1, "deprel": "dep"}
        for position, (word, tag) in enumerate(zip(words, upos), start=1)
    ]
    return [root] + tokens

def pretty_print_labeled(words, arcs_labeled):
    """Print a tab-separated table of predicted heads and labels.

    Args:
        words: Surface forms, in sentence order (token ids are 1-based).
        arcs_labeled: ``(head, dep, label)`` triples.

    Tokens without an arc are shown with head ``0`` and label ``dep``. The
    raw arc list is printed after the table.
    """
    lookup = {}
    for head, dep, lab in arcs_labeled:
        lookup[dep] = (head, lab)

    rows = ["ID\tFORM\tHEAD\tDEPREL(pred)"]
    for position, word in enumerate(words, start=1):
        head, lab = lookup.get(position, (0, "dep"))
        rows.append(f"{position}\t{word}\t{head}\t{lab}")
    print("\n".join(rows))
    print("Arcs (h, d, lab):", arcs_labeled)


In [32]:
def make_sentence_from_words(words, upos):
    """Wrap raw words and UPOS tags in the parser's sentence format.

    NOTE(review): this repeats the definition from the previous cell
    verbatim; whichever cell runs last wins.

    Args:
        words: Surface forms.
        upos: UPOS tags, same length as ``words`` (asserted).

    Returns:
        ``[ROOT, token_1, ..., token_n]`` where each real token has a 1-based
        ``id`` and the placeholders ``head=-1`` / ``deprel="dep"``.
    """
    assert len(words) == len(upos)
    sentence = [dict(id=0, form="ROOT", upos="ROOT", head=-1, deprel="root")]
    position = 0
    for word, tag in zip(words, upos):
        position += 1
        sentence.append(dict(id=position, form=word, upos=tag, head=-1, deprel="dep"))
    return sentence


def pretty_print_parse(words, arcs_labeled):
    """Print each word with its predicted head and label, then the arc list.

    Args:
        words: Surface forms, in sentence order (token ids are 1-based).
        arcs_labeled: ``(head, dep, label)`` triples.

    A word with no arc is printed with head ``0`` and label ``dep``.
    """
    by_dependent = dict((dep, (head, lab)) for head, dep, lab in arcs_labeled)
    fallback = (0, "dep")

    print("ID\tFORM\tHEAD\tDEPREL")
    position = 1
    for word in words:
        head, lab = by_dependent.get(position, fallback)
        print(f"{position}\t{word}\t{head}\t{lab}")
        position += 1
    print("Arcs (h, d, lab):", arcs_labeled)


Main: load → train → evaluate → test sentences

In [33]:
import os

if __name__ == "__main__":
    # Directory holding the UD English-EWT treebank files. Defaults to the
    # filesystem root (the location used so far); set UD_DATA_DIR to point
    # somewhere else without editing the notebook.
    data_dir = os.environ.get("UD_DATA_DIR", "/")
    train_path = os.path.join(data_dir, "en_ewt-ud-train.conllu")
    dev_path   = os.path.join(data_dir, "en_ewt-ud-dev.conllu")

    print("Loading data...")
    train_sents = read_conllu(train_path)
    dev_sents   = read_conllu(dev_path)
    print(f"Loaded {len(train_sents)} train, {len(dev_sents)} dev sentences.")

    print("Training classifier...")
    clf, vec = train_classifier(train_sents)

    print("Evaluating on dev (LAS)...")
    # parse_with_model returns labeled arcs, which las_score compares to gold.
    las = las_score(dev_sents, clf, vec)
    print(f"\n--- Evaluation ---\nLAS: {las:.2f}%")

    # The three given sentences, as (words, UPOS tags) pairs.
    tests = [
        (["The","cat","sat","on","the","mat","."],
         ["DET","NOUN","VERB","ADP","DET","NOUN","PUNCT"]),
        (["She","eats","a","green","salad","."],
         ["PRON","VERB","DET","ADJ","NOUN","PUNCT"]),
        (["I","saw","the","man","with","a","telescope","."],
         ["PRON","VERB","DET","NOUN","ADP","DET","NOUN","PUNCT"]),
    ]

    print("\n=== Parsing the given sentences ===")
    for idx, (words, tags) in enumerate(tests, start=1):
        sent = make_sentence_from_words(words, tags)
        # These sentences carry the placeholder label "dep", which is what
        # the parser echoes back for every arc.
        arcs = parse_with_model(sent, clf, vec)
        print(f"\nSentence {idx}: {' '.join(words)}")
        pretty_print_parse(words, arcs)

Loading data...
Loaded 12544 train, 2001 dev sentences.
Training classifier...
Evaluating on dev (LAS)...

--- Evaluation ---
LAS: 61.38%

=== Parsing the given sentences ===

Sentence 1: The cat sat on the mat .
ID	FORM	HEAD	DEPREL
1	The	2	dep
2	cat	3	dep
3	sat	0	dep
4	on	6	dep
5	the	6	dep
6	mat	3	dep
7	.	3	dep
Arcs (h, d, lab): [(2, 1, 'dep'), (3, 2, 'dep'), (6, 5, 'dep'), (6, 4, 'dep'), (3, 6, 'dep'), (3, 7, 'dep'), (0, 3, 'dep')]

Sentence 2: She eats a green salad .
ID	FORM	HEAD	DEPREL
1	She	2	dep
2	eats	0	dep
3	a	5	dep
4	green	5	dep
5	salad	2	dep
6	.	2	dep
Arcs (h, d, lab): [(2, 1, 'dep'), (5, 4, 'dep'), (5, 3, 'dep'), (2, 5, 'dep'), (2, 6, 'dep'), (0, 2, 'dep')]

Sentence 3: I saw the man with a telescope .
ID	FORM	HEAD	DEPREL
1	I	2	dep
2	saw	0	dep
3	the	4	dep
4	man	2	dep
5	with	7	dep
6	a	7	dep
7	telescope	2	dep
8	.	2	dep
Arcs (h, d, lab): [(2, 1, 'dep'), (4, 3, 'dep'), (2, 4, 'dep'), (7, 6, 'dep'), (7, 5, 'dep'), (2, 7, 'dep'), (2, 8, 'dep'), (0, 2, 'dep')]
