## CS310 Natural Language Processing
## Assignment 4. Dependency Parsing

**Total points**: 50

In this assignment, you will train feed-forward neural network-based dependency parser and evaluate its performance on the provided treebank dataset.

#### Reference: 
https://github.com/lmxy0212/NLP


### 0. Import Necessary Libraries

In [1]:
import torch.nn as nn
import torch
from dep_utils import conll_reader, DependencyTree, DependencyEdge
import copy
from pprint import pprint
from collections import Counter, defaultdict
from typing import List, Dict, Tuple
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### 1. Read Data and Generate Training Instances

In [3]:
print('In train.conll:')
with open('data/train.conll') as f:
    train_trees = list(conll_reader(f))
print(f'{len(train_trees)} trees read.')

print('In dev.conll:')
with open('data/dev.conll') as f:
    dev_trees = list(conll_reader(f))
print(f'{len(dev_trees)} trees read.')

print('In test.conll:')
with open('data/test.conll') as f:
    test_trees = list(conll_reader(f))
print(f'{len(test_trees)} trees read.')

In train.conll:
39832 trees read.
In dev.conll:
1700 trees read.
In test.conll:
2416 trees read.


#### State Class

- The top of stack is `stack[-1]`
- The front of buffer is `buffer[-1]`
- `deps` represents the currently found dependencies
  - It is a list of `(parent, child, relation)` triples, where `parent` and `child` are integer IDs and `relation` is a string (the dependency label).
- The `shift` methods moves the front of the buffer to the top of the stack

In [4]:
class State(object):
    def __init__(self, sentence=[]):
        self.stack = []
        self.buffer = []
        if sentence:
            self.buffer = list(reversed(sentence))
        self.deps = set()

    def shift(self):
        assert len(self.buffer) > 0
        self.stack.append(self.buffer.pop())

    def left_arc(self, label):
        assert len(self.stack) >= 2
        self.deps.add((self.stack[-1], self.stack[-2], label))
        self.stack.pop(-2)

    def right_arc(self, label):
        assert len(self.stack) >= 2
        self.deps.add((self.stack[-2], self.stack[-1], label))
        self.stack.pop(-1)

    def __repr__(self):
        return "({},{},{})".format(self.stack, self.buffer, self.deps)

#### Get training data from a dependency tree

The return type of this function is a list of two-elements tuples:
- Tuple[0] is a `State` object, deepcopied from the initial state
- Tuple[1] is a a tuple of `(action, relation)` where `action` is from {"shift", "left_arc", "right_arc"} and `relation` is the specific dependency relation.

- If $s_1 \rightarrow s_2$ exists in `deprels`, then `left_arc` is performed.
- If $s_2 \rightarrow s_1$ exists in `deprels`, **AND** all rules with $s_1$ as the head have already been assigned, then `right_arc` is performed.
- Perform `shift` otherwise.

In [5]:
class RootDummy(object):
    def __init__(self):
        self.head = None
        self.id = 0
        self.deprel = None
    def __repr__(self):
        return "<ROOT>"


def get_training_instances(dep_tree: DependencyTree) -> List[Tuple[State, Tuple[str, str]]]:
    deprels = dep_tree.deprels

    word_ids = list(deprels.keys())
    state = State(word_ids)
    state.stack.append(0) # ROOT

    childcount = defaultdict(int)
    for _, rel in deprels.items():
        childcount[rel.head] += 1

    seq = []
    while len(state.buffer) > 0 or len(state.stack) > 1:
        if state.stack[-1] == 0:
            seq.append((copy.deepcopy(state), ("shift", None)))
            state.shift()
            continue
        
        stack_top1 = deprels[state.stack[-1]]
        if state.stack[-2] == 0:
            stack_top2 = RootDummy()
        else:
            stack_top2 = deprels[state.stack[-2]]

        # Decide transition action
        ### START YOUR CODE ###
        try:
            if stack_top2.head == stack_top1.id : # Left-Arc, top1 -> top2
                childcount[stack_top1.id] -= 1
                seq.append((copy.deepcopy(state), ("left_arc", stack_top2.deprel)))
                state.left_arc(stack_top2.deprel)
            elif stack_top1.head == stack_top2.id and childcount[stack_top1.id] == 0: # Right-Arc, top2 -> top1
                childcount[stack_top2.id] -= 1
                seq.append((copy.deepcopy(state), ("right_arc", stack_top1.deprel)))
                state.right_arc(stack_top1.deprel)
            else: # Shift
                seq.append((copy.deepcopy(state), ("shift", None)))
                state.shift()
        except:
            return seq
        ### END YOUR CODE ###
    
    return seq

#### Build vocabulary


In [6]:
word2id = {}
pos2id = {}
def get_vocabs(trees: List[DependencyTree]):
    for tree in trees:
        word = tree.words()
        pos = tree.pos()
        for w in word:
            if w is None:
                continue
            if w not in word2id:
                word2id[w] = len(word2id)
        for p in pos:
            if p is None:
                continue
            if p not in pos2id:
                pos2id[p] = len(pos2id)

In [7]:
get_vocabs(train_trees)
get_vocabs(dev_trees)
get_vocabs(test_trees)

word2id['<NULL>'] = len(word2id)
pos2id['<NULL>'] = len(pos2id)
word2id['<ROOT>'] = len(word2id)
pos2id['<ROOT>'] = len(pos2id)


print(f'word_vocab: {len(word2id)} words')
print(f'pos_vocab: {len(pos2id)} pos tags')

word_vocab: 46350 words
pos_vocab: 47 pos tags


##### Action Vocabulary

In [8]:
rel_vocab = {}

for t in train_trees+dev_trees+test_trees:
    for e in t.deprels.values():
        if e.deprel not in rel_vocab:
            rel_vocab[e.deprel] = len(rel_vocab)

# Test results
print('Total number fo unique relations:', len(rel_vocab))
print(rel_vocab.keys())

# You should expect to see the following output:
# Total number fo unique relations: 39
# {'nummod', 'root', 'nmod:tmod', 'nmod', 'punct', 'expl', 'auxpass', 'neg', 'nsubjpass', 'appos' ...

Total number fo unique relations: 39
dict_keys(['case', 'det', 'compound', 'nummod', 'nmod', 'punct', 'nmod:poss', 'amod', 'nsubj', 'dep', 'dobj', 'cc', 'conj', 'nsubjpass', 'acl', 'auxpass', 'advmod', 'root', 'ccomp', 'mark', 'xcomp', 'nmod:tmod', 'appos', 'nmod:npmod', 'aux', 'cop', 'neg', 'acl:relcl', 'advcl', 'mwe', 'det:predet', 'csubj', 'parataxis', 'compound:prt', 'iobj', 'expl', 'cc:preconj', 'discourse', 'csubjpass'])


In [9]:
# action vocab
action2id = {}
action2id[('shift',None)] = len(action2id)
for rel in rel_vocab.keys():
    if rel != 'root':
        action2id[("left_arc", rel)] = len(action2id)
        action2id[("right_arc", rel)] = len(action2id)
action2id[("right_arc", 'root')] = len(action2id)

In [10]:
len(action2id) # (39-1)*2 + 1(right_arc, root) + 1(shift, none) = 78

78

- For actual training step, you need to post-process the data to convert each relation tuple to an integer index. 
- We have 39 unique dependency relations in the data, including `ROOT`. Considering `ROOT` only appears as the head in a `right_arc` action, we have $(39-1)\times 2 + 1 = 77$ possible actions in total.

#### Feature Extractor

inputs:     $𝑒(𝑠_2)⨁𝑒(𝑠_1)⨁𝑒(𝑠_0)⨁𝑒(𝑏_0)⨁𝑒(𝑏_1)⨁𝑒(𝑏_2)⨁𝑒(𝑡𝑠_2)⨁𝑒(𝑡𝑠_1)⨁𝑒(𝑡𝑠_0)⨁𝑒(𝑡𝑏_0)⨁𝑒(𝑡𝑏_1)⨁𝑒(𝑡𝑏_2)$

In [11]:
class FeatureExtractor():
    def __init__(self):
        print('FeatureExtractor')

    def get_input_representation(self, words, pos, state):
        # (s2, s1, s0, b0, b1, b2, ts2, ts1, ts0, tb0, tb1, tb2)
        input = []
        for s in range(-3, 0): # top 3 words on the stack
            if abs(s) <= len(state.stack):
                sw_id = state.stack[s]
                if sw_id == 0: # None, make it ROOT
                    input.append(word2id['<ROOT>'])
                else:
                    input.append(word2id[words[sw_id]])
            else:
                input.append(word2id['<NULL>'])

        for b in range(-1, -4, -1): # top 3 words on the buffer
            if abs(b) <= len(state.buffer):
                bw_id = state.buffer[b]
                input.append(word2id[words[bw_id]])
            else:
                input.append(word2id['<NULL>'])

        # pos
        for i in range(-3, 0):
            if abs(i) <= len(state.stack):
                sw_id = state.stack[i]
                if sw_id == 0:
                    input.append(pos2id['<ROOT>'])
                else:
                    input.append(pos2id[pos[sw_id]])
            else:
                input.append(pos2id['<NULL>'])

        for i in range(-1, -4, -1):
            if abs(i) <= len(state.buffer):
                bw_id = state.buffer[i]
                input.append(pos2id[pos[bw_id]])
            else:
                input.append(pos2id['<NULL>'])
                
        # print(input)
        return torch.LongTensor(input).to(device) 

    def get_output_representation(self, action):
        return torch.tensor(action2id[action], dtype=torch.long).to(device)


In [12]:
# Test the FeatureExtractor
dt = train_trees[23]
fe = FeatureExtractor()
seq = get_training_instances(dt)
inputs = [] 
outputs = []
for i, (state, action) in enumerate(seq):
    words = dt.words()
    pos = dt.pos()
    input = fe.get_input_representation(words, pos, state)
    output = fe.get_output_representation(action)
    inputs.append(input)
    outputs.append(output)


FeatureExtractor


In [13]:
len(inputs), len(outputs)

(97, 97)

In [14]:
inputs[:3], outputs[:3]

([tensor([46348, 46348, 46349,   312,    19,   151,    45,    45,    46,    29,
              1,    10], device='cuda:1'),
  tensor([46348, 46349,   312,    19,   151,   259,    45,    46,    29,     1,
             10,    29], device='cuda:1'),
  tensor([46349,   312,    19,   151,   259,   291,    46,    29,     1,    10,
             29,    25], device='cuda:1')],
 [tensor(0, device='cuda:1'),
  tensor(0, device='cuda:1'),
  tensor(0, device='cuda:1')])

In [15]:
def process(dep_trees: List[DependencyTree], word_vocab: dict, pos_vocab: dict, action_vocab, extractor):
    inputs = []
    outputs = []
    for i, tree in enumerate(dep_trees):
        words = tree.words()
        pos = tree.pos()
        # i = 23 卡在get_training_instances
        instances = get_training_instances(tree)
        if i % 1000 == 0:
            print(f'{i}/{len(dep_trees)}')
        for state, action in instances:
            # convert to torch tensor
            inputs.append(extractor.get_input_representation(words, pos, state))
            outputs.append(extractor.get_output_representation(action))

    return inputs, outputs

In [16]:
if os.path.exists('train_data.pt') and os.path.exists('train_label.pt'):
    train_data = torch.load('train_data.pt')
    train_label = torch.load('train_label.pt')
else:
    train_data, train_label = process(train_trees, word2id, pos2id, action2id, FeatureExtractor())
    train_data = torch.stack(train_data)
    train_label = torch.stack(train_label)
    torch.save(train_data, 'train_data.pt')
    torch.save(train_label, 'train_label.pt')
    # this may take a while

In [17]:
train_data = train_data.to(device)
train_label = train_label.to(device)

In [18]:
len(train_data), len(train_label), type(train_data), type(train_label)

(1899390, 1899390, torch.Tensor, torch.Tensor)

In [19]:
train_data[0], train_label[0]

(tensor([46348, 46348, 46349,     0,     1,     2,    45,    45,    46,     0,
             1,     2], device='cuda:1'),
 tensor(0, device='cuda:1'))

### 2. Build the Model

In [20]:
word_dim = len(word2id)
pos_dim = len(pos2id)
feature_len = len(train_data[0])
out_dim = len(action2id)
emb_dim = 50
hidden_dim = 100
word_dim, pos_dim, out_dim

(46350, 47, 78)

#### Model Class


In [21]:
class NNOracle(nn.Module):
    def __init__(self,input_len, word_dim, pos_dim, emb_dim, out_dim, hidden_dim=100):
        super(NNOracle, self).__init__()
        self.word_embedding = nn.Embedding(num_embeddings=word_dim, embedding_dim=emb_dim)
        self.pos_embedding = nn.Embedding(num_embeddings=pos_dim, embedding_dim=emb_dim)
        self.hidden_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(emb_dim * input_len, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 10),
            nn.ReLU()
        )
        self.fc = nn.Linear(10, out_dim)
        # softmax layer is calculated outside
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.word_embedding.weight.data.uniform_(-initrange, initrange)
        self.pos_embedding.weight.data.uniform_(-initrange, initrange)
        for layer in self.hidden_layers:
            if isinstance(layer, nn.Linear):
                layer.weight.data.uniform_(-initrange, initrange)
                layer.bias.data.zero_()
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, x):
        word_tensor, pos_tensor = torch.chunk(x, 2, dim=1)
        word_emb = self.word_embedding(word_tensor)
        pos_emb = self.pos_embedding(pos_tensor)
        x = torch.cat((word_emb, pos_emb), dim=1)
        x = self.hidden_layers(x)
        x = self.fc(x)
        return x

#### Parser Class

In [22]:
class Parser(object): 

    def __init__(self, model: NNOracle):
        self.model = model
        self.extractor = FeatureExtractor()
        self.id2action = {v: k for k, v in action2id.items()}

    def parse_sentence(self, words, pos):
        state = State(range(1, len(words)))
        state.stack.append(0) # ROOT

        while len(state.buffer) > 0 or len(state.stack) > 1:
            model_input = self.extractor.get_input_representation(words, pos, state)
            model_out = self.model.forward(model_input.unsqueeze(0))
            probs = torch.softmax(model_out, dim=1)
            sorted_indices = torch.argsort(probs, dim=1, descending=True)
            sorted_indices = torch.squeeze(sorted_indices)
            for i in range(0, len(sorted_indices)): # might have illegal actions
                move, rel = self.id2action[sorted_indices[i].item()]
                if move == 'shift' and len(state.buffer) > 0:
                    state.shift()
                    break
                elif len(state.stack) >= 2:
                    if move == 'left_arc' and state.stack[-2] != 0 and rel != 'root':
                        state.left_arc(rel)
                        break
                    if move == 'right_arc':
                        state.right_arc(rel)
                        break

        result = DependencyTree()
        for h, c, r in state.deps: # head, child(dependent), relation
            result.add_deprel(DependencyEdge(c, words[c], pos[c], h, r))
        return result 
    
    # compare the predicted tree with the reference tree
    def compare_tree(self, ref_tree: DependencyTree, prediction: DependencyTree):
        # unlabeled does not care about the relation
        target_unlabeled = set((d.id,d.head) for d in ref_tree.deprels.values())
        target_labeled = set((d.id,d.head,d.deprel) for d in ref_tree.deprels.values())
        predict_unlabeled = set((d.id,d.head) for d in prediction.deprels.values())
        predict_labeled = set((d.id,d.head,d.deprel) for d in prediction.deprels.values())

        labeled_correct = len(predict_labeled.intersection(target_labeled))
        unlabeled_correct = len(predict_unlabeled.intersection(target_unlabeled))
        num_words = len(predict_labeled)
        return labeled_correct, unlabeled_correct, num_words 
        

### 3. Train and Evaluate

In [23]:
def evaluate(dep_trees: List[DependencyTree], parser: Parser):
    total_labeled_correct = 0
    total_unlabeled_correct = 0
    total_words = 0
    count = 0 
    print("Evaluating.")
    for dtree in dep_trees:
        words = dtree.words()
        pos = dtree.pos()
        prediction = parser.parse_sentence(words, pos)
        labeled_correct, unlabeled_correct, num_words = parser.compare_tree(dtree, prediction)
        total_labeled_correct += labeled_correct
        total_unlabeled_correct += unlabeled_correct
        total_words += num_words
        count += 1 
        if count % 200 == 0:
            print(f'{count}/{len(dep_trees)}')

    las = total_labeled_correct / float(total_words)
    uas = total_unlabeled_correct / float(total_words)

    print(f"{len(dep_trees)} sentences.\n")
    print(f"Labeled Attachment Score: {las}\n")
    print(f"Unlabeled Attachment Score: {uas}")

In [24]:
model_demo = NNOracle(input_len=feature_len, word_dim=word_dim, pos_dim=pos_dim, emb_dim=emb_dim, out_dim=out_dim, hidden_dim=hidden_dim).to(device)

In [25]:
# test evaluate
tree = dev_trees[:50]
parser = Parser(model_demo)
evaluate(tree, parser)

FeatureExtractor
Evaluating.
50 sentences.

Labeled Attachment Score: 0.0

Unlabeled Attachment Score: 0.0425531914893617


In [26]:
class DepDataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]

In [27]:
batch_size = 256
epochs = 2
learning_rate = 0.001

In [28]:
model = NNOracle(input_len=feature_len, word_dim=word_dim, pos_dim=pos_dim, emb_dim=emb_dim, out_dim=out_dim, hidden_dim=hidden_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [29]:
def train(
        model: NNOracle,
        optimizer: torch.optim.Optimizer,
        loss_function,
        train_dataloader: DataLoader,
        log_interval=500,
        epochs: int = 3):
    
    model.train()
    dev_parser = Parser(model)
    for epoch in range(epochs):
        total_loss = 0
        epoch_loss = 0
        for id, (batch_data, batch_label) in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            output = model.forward(batch_data)
            output = output.cpu()
            batch_label = batch_label.cpu()
            # output:[batch_size, num_classes]
            # label: [batch_size]
            loss = loss_function(output, batch_label)
            total_loss += loss.item()
            epoch_loss += loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 0.2) # 防止梯度爆炸
            optimizer.step()

            if id % log_interval == 0 and id > 0:
                print(
                    "| epoch {:3d} | {:5d}/{:5d} batches "
                    "| loss {:8.4f}".format(
                        epoch, id, len(train_dataloader), loss
                    )
                )
                total_loss = 0

        print(f'Epoch {epoch}, loss: {epoch_loss/len(train_dataloader)}')
        print('--'*20)
        evaluate(dev_trees, dev_parser)
        print('--'*20)
        epoch_loss = 0

In [30]:
train_dataset = DepDataset(train_data, train_label)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [31]:
train(model, optimizer, criterion, train_dataloader, epochs=epochs, log_interval=2000)
torch.save(model, 'dep_model_nn.pt')

FeatureExtractor


 27%|██▋       | 2025/7420 [00:09<00:25, 208.13it/s]

| epoch   0 |  2000/ 7420 batches | loss   0.5489


 54%|█████▍    | 4029/7420 [00:18<00:13, 246.31it/s]

| epoch   0 |  4000/ 7420 batches | loss   0.5127


 81%|████████▏ | 6040/7420 [00:26<00:05, 245.90it/s]

| epoch   0 |  6000/ 7420 batches | loss   0.4861


100%|██████████| 7420/7420 [00:32<00:00, 227.67it/s]


Epoch 0, loss: 0.5925818628177167
----------------------------------------
Evaluating.
200/1700
400/1700
600/1700
800/1700
1000/1700
1200/1700
1400/1700
1600/1700
1700 sentences.

Labeled Attachment Score: 0.7005508886506967

Unlabeled Attachment Score: 0.7682777874716454
----------------------------------------


 27%|██▋       | 2038/7420 [00:09<00:21, 245.57it/s]

| epoch   1 |  2000/ 7420 batches | loss   0.3893


 54%|█████▍    | 4038/7420 [00:17<00:13, 249.01it/s]

| epoch   1 |  4000/ 7420 batches | loss   0.4215


 81%|████████▏ | 6038/7420 [00:25<00:05, 248.52it/s]

| epoch   1 |  6000/ 7420 batches | loss   0.3699


100%|██████████| 7420/7420 [00:31<00:00, 233.72it/s]


Epoch 1, loss: 0.339244027463895
----------------------------------------
Evaluating.
200/1700
400/1700
600/1700
800/1700
1000/1700
1200/1700
1400/1700
1600/1700
1700 sentences.

Labeled Attachment Score: 0.7326819054266271

Unlabeled Attachment Score: 0.794301667622205
----------------------------------------


#### Evaluation

In [32]:
test_model = torch.load('dep_model_nn.pt')
parser = Parser(test_model)
evaluate(test_trees, parser)

FeatureExtractor
Evaluating.
200/2416
400/2416
600/2416
800/2416
1000/2416
1200/2416
1400/2416
1600/2416
1800/2416
2000/2416
2200/2416
2400/2416
2416 sentences.

Labeled Attachment Score: 0.7357631783219251

Unlabeled Attachment Score: 0.7955331310422694


### Bonus 1: Arc Eager Algorithm


#### Reference

- http://fancyerii.github.io/books/depparser/
- https://aclanthology.org/C12-1059/
- https://direct.mit.edu/coli/article/40/2/259/1466/Arc-Eager-Parsing-with-the-Tree-Constraint


In [33]:
class EagerState(object):
    def __init__(self, sentence=[]):
        self.stack = []
        self.buffer = []
        if sentence:
            self.buffer = list(reversed(sentence))
        self.deps = set()

    def shift(self):
        assert len(self.buffer) > 0
        self.stack.append(self.buffer.pop())

    def left_arc(self, label):
        assert len(self.stack) > 0 and len(self.buffer) > 0
        self.deps.add((self.buffer[-1], self.stack[-1], label))
        self.stack.pop()

    def right_arc(self, label):
        assert len(self.stack) > 0 and len(self.buffer) > 0
        self.deps.add((self.stack[-1], self.buffer[-1], label))
        self.stack.append(self.buffer.pop())
    
    def reduce(self):
        assert len(self.stack) > 0
        self.stack.pop()

    def __repr__(self):
        return "({},{},{})".format(self.stack, self.buffer, self.deps)

In [34]:
def get_eager_training_instances(dep_tree: DependencyTree) -> List[Tuple[EagerState, Tuple[str, str]]]:
    deprels = dep_tree.deprels

    word_ids = list(deprels.keys())
    state = EagerState(word_ids)
    state.stack.append(0) # ROOT

    childcount = defaultdict(int)
    for _, rel in deprels.items():
        childcount[rel.head] += 1

    seq = []
    # a terminal configuration is any configuration with an empty buffer
    while len(state.buffer) > 0:
        if state.stack[-1] == 0:
            seq.append((copy.deepcopy(state), ("shift", None)))
            state.shift()
            continue
        
        stack_top = deprels[state.stack[-1]]
        buffer_top = deprels[state.buffer[-1]]

        try:
            if stack_top.head == buffer_top.id:
                childcount[buffer_top.id] -= 1
                seq.append((copy.deepcopy(state), ("left_arc", stack_top.deprel)))
                state.left_arc(stack_top.deprel)
            elif buffer_top.head == stack_top.id:
                childcount[stack_top.id] -= 1
                seq.append((copy.deepcopy(state), ("right_arc", buffer_top.deprel)))
                state.right_arc(buffer_top.deprel)
            elif childcount[stack_top.id] == 0 and stack_top.id in [d[1] for d in state.deps]:
                seq.append((copy.deepcopy(state), ("reduce", None)))
                state.reduce()
            else:
                seq.append((copy.deepcopy(state), ("shift", None)))
                state.shift()
        except:
            return seq

    return seq


Compare with Arc Standard

In [35]:
toy_data_standard = get_training_instances(dev_trees[1])
print('toy_data_standard length:', len(toy_data_standard))
pprint(toy_data_standard[:10])

toy_data_standard length: 90
[(([0],[45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1],set()),
  ('shift', None)),
 (([0, 1],[45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2],set()),
  ('shift', None)),
 (([0, 1, 2],[45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3],set()),
  ('left_arc', 'det')),
 (([0, 2],[45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3],{(2, 1, 'det')}),
  ('shift', None)),
 (([0, 2, 3],[45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16

In [36]:
toy_data_eager = get_eager_training_instances(dev_trees[1])
print('toy_data_eager length:', len(toy_data_eager))
pprint(toy_data_eager[:10])

toy_data_eager length: 88
[(([0],[45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1],set()),
  ('shift', None)),
 (([0, 1],[45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2],set()),
  ('left_arc', 'det')),
 (([0],[45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2],{(2, 1, 'det')}),
  ('shift', None)),
 (([0, 2],[45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3],{(2, 1, 'det')}),
  ('right_arc', 'punct')),
 (([0, 2, 3],[45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,

### Bonus 2: Bi-LSTM-based Encoder
Please check the notebook `A4_lstm.ipynb`