# 加载数据

本次依存句法分析完成的是Biaffine Parser模型

数据由10列组成，第0列为索引，从1开始，第1列为单词，第6列为每个词对应的head，第7列为与head组成的弧上的标签。
下面是一句样例

```txt
1       She     _       _       _       _       2       nsubj   _       _
2       enjoys  _       _       _       _       0       root    _       _
3       playing _       _       _       _       2       xcomp   _       _
4       tennis  _       _       _       _       3       dobj    _       _
5       .       _       _       _       _       2       punct   _       _

```

我们直接使用supar来加载数据。
首先，创建一下单词标签等多个域，然后用训练集初始化。
主要是创建单词表，收集所有标签等等。
创建完之后，可以通过域进行数字化和逆数字化。

In [1]:
import torch
from supar.utils.field import Field, SubwordField
from supar.utils import Dataset, Embedding
from supar.utils.metric import AttachmentMetric
from supar.utils.transform import CoNLL
from pprint import pprint


WORD = Field('words', pad='<pad>', unk='<unk>', bos='<bos>', lower=True)
CHAR = SubwordField('chars', pad='<pad>', unk='<unk>', bos='<bos>',fix_len=20)
ARC = Field('arcs', bos='<bos>', use_vocab=False, fn=CoNLL.get_arcs)
REL = Field('rels', bos='<bos>')

transform = CoNLL(FORM=(WORD, CHAR), HEAD=ARC, DEPREL=REL)
train = Dataset(transform, 'data/ptb/train.conllx')
WORD.build(train, 2, Embedding.load('data/glove.6B.100d.txt', 'unk'))
CHAR.build(train)
REL.build(train)

print(f"\nTotal words: {len(WORD.vocab)}\nTotal words in train data: {WORD.vocab.n_init}")
print(f"Total characters: {len(CHAR.vocab)}")
print(f"Total labels: {len(REL.vocab)}")

print(f"Numericalize 'John saw Marry': {WORD.transform([['John', 'saw', 'Marry']])}")
print(WORD.vocab[[2, 11312, 17403, 12647]])
print(REL.vocab.itos)

100%|###################################8| 985021/989860 00:03<00:00, 284851.45it/s
Total words: 401153
Total words in train data: 21679
Total characters: 82
Total labels: 46
Numericalize 'John saw Marry': [tensor([    2, 11312, 17403, 12647])]
['<bos>', 'john', 'saw', 'marry']
['<bos>', 'acomp', 'advcl', 'advmod', 'amod', 'appos', 'aux', 'auxpass', 'cc', 'ccomp', 'conj', 'cop', 'csubj', 'csubjpass', 'dep', 'det', 'discourse', 'dobj', 'expl', 'infmod', 'iobj', 'mark', 'mwe', 'neg', 'nn', 'npadvmod', 'nsubj', 'nsubjpass', 'num', 'number', 'parataxis', 'partmod', 'pcomp', 'pobj', 'poss', 'possessive', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'rcmod', 'root', 'tmod', 'xcomp']


从文件中加载train/dev/test数据，简单起见，分别只保留了1000/200/200句。

In [2]:
print("Load the data")
train = Dataset(transform, 'data/ptb/train.conllx')
train.sentences = train.sentences[:1000]
dev = Dataset(transform, 'data/ptb/dev.conllx')
dev.sentences = dev.sentences[:200]
test = Dataset(transform, 'data/ptb/test.conllx')
test.sentences = test.sentences[:200]

print(f"\ntext:\n{train.sentences[1]}")
print(f"arcs: {train.arcs[1]}")
print(f"rels: {train.rels[1]}")






Load the data
 66%|#######################6            | 38794/59100 00:00<00:00, 387896.90it/s
text:
1	Ms.	_	NNP	NNP	_	2	nn	_	_
2	Haag	_	NNP	NNP	_	3	nsubj	_	_
3	plays	_	VBZ	VBZ	_	0	root	_	_
4	Elianti	_	NNP	NNP	_	3	dobj	_	_
5	.	_	.	.	_	3	punct	_	_

arcs: ('2', '3', '0', '3', '3')
rels: ('nn', 'nsubj', 'root', 'dobj', 'punct')


根据前面建立的类为`CoNLL`的transform，对数据集进行数字化，产生一个`DataLoader`。

In [3]:
print("Numericalize the data")
train.build(batch_size=5000, n_buckets=32, shuffle=True)
dev.build(batch_size=5000, n_buckets=32)
test.build(batch_size=5000, n_buckets=32)
print(f"\n{'train:':6} {train}\n{'dev:':6} {dev}\n{'test:':6} {test}\n")
print(test.loader)  
pprint(next(iter(test.loader)))


Numericalize the data

train: Dataset(n_sentences=1000, n_batches=32, n_buckets=32)
dev:   Dataset(n_sentences=200, n_batches=32, n_buckets=32)
test:  Dataset(n_sentences=200, n_batches=32, n_buckets=32)

<supar.utils.data.DataLoader object at 0x2ad9a7f60fd0>
[tensor([[     2,   1960,  10431,   9916,   1961, 195924,  19671,  19736,  14626,
           6471,  19884,  11146,  18377,   1959,  13335,  66371,   2010,  11950,
             35,      8],
        [     2,  15279,  14073,   4491,  10625,  16182,   2363,  19881,  17658,
          11182,  16570,  17756,   2400,  19881,   4421,  19671,  21232,  13503,
           6976,     35],
        [     2,   2657,  12574,   9109,  12527,   9916,   4305,  20635,   4677,
          11974,   2657,  17405,  19708,  21350,   3533,   4422,  18861,  19736,
          21178,     35],
        [     2,  12607,   9158,     28,   8824,  10902,     28,  10209,   4677,
          15289,  21213,   2012,    765,      6,  10611,  17763,  14073,  10167,
           91

# 模型定义

定义`BiaffineParserModel`，为PyTorch的一个模块类，实现了`forward`过程，以及loss的计算，解码

In [30]:
import torch.nn as nn
from supar.modules import MLP, BertEmbedding, Biaffine, BiLSTM, CharLSTM
from supar.modules.dropout import IndependentDropout, SharedDropout
from supar.utils import Config
from supar.utils.alg import eisner, eisner2o, mst
from supar.utils.transform import CoNLL
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class BiaffineDependencyModel(nn.Module):
    """
    The implementation of Biaffine Dependency Parser.

    References:
        - Timothy Dozat and Christopher D. Manning (ICLR'17)
          Deep Biaffine Attention for Neural Dependency Parsing
          https://openreview.net/pdf?id=Hk95PK9le/

    Args:
        n_words (int):
            Size of the word vocabulary.
        n_feats (int):
            Size of the feat vocabulary.
        n_rels (int):
            Number of labels in the treebank.
        feat (str):
            Specifies which type of additional feature to use: 'char' | 'bert' | 'tag'.
            'char': Character-level representations extracted by CharLSTM.
            'bert': BERT representations, other pretrained langugae models like `XLNet` are also feasible.
            'tag': POS tag embeddings.
            Default: 'char'.
        n_embed (int):
            Size of word embeddings. Default: 100.
        n_feat_embed (int):
            Size of feature representations. Default: 100.
        n_char_embed (int):
            Size of character embeddings serving as inputs of CharLSTM, required if feat='char'. Default: 50.
        bert (str):
            Specify which kind of language model to use, e.g., 'bert-base-cased' and 'xlnet-base-cased'.
            This is required if feat='bert'. The full list can be found in `transformers`.
            Default: `None`.
        n_bert_layers (int):
            Specify how many last layers to use. Required if feat='bert'.
            The final outputs would be the weight sum of the hidden states of these layers.
            Default: 4.
        mix_dropout (float):
            Dropout ratio of BERT layers. Required if feat='bert'. Default: .0.
        embed_dropout (float):
            Dropout ratio of input embeddings. Default: .33.
        n_lstm_hidden (int):
            Dimension of LSTM hidden states. Default: 400.
        n_lstm_layers (int):
            Number of LSTM layers. Default: 3.
        lstm_dropout (float): Default: .33.
            Dropout ratio of LSTM.
        n_mlp_arc (int):
            Arc MLP size. Default: 500.
        n_mlp_rel  (int):
            Label MLP size. Default: 100.
        mlp_dropout (float):
            Dropout ratio of MLP layers. Default: .33.
        feat_pad_index (int):
            The index of the padding token in the feat vocabulary. Default: 0.
        pad_index (int):
            The index of the padding token in the word vocabulary. Default: 0.
        unk_index (int):
            The index of the unknown token in the word vocabulary. Default: 1.
    """

    def __init__(self,
                 n_words,
                 n_feats,
                 n_rels,
                 feat='char',
                 n_embed=100,
                 n_feat_embed=100,
                 n_char_embed=50,
                 bert=None,
                 n_bert_layers=4,
                 mix_dropout=.0,
                 embed_dropout=.33,
                 n_lstm_hidden=400,
                 n_lstm_layers=3,
                 lstm_dropout=.33,
                 n_mlp_arc=500,
                 n_mlp_rel=100,
                 mlp_dropout=.33,
                 feat_pad_index=0,
                 pad_index=0,
                 unk_index=1,
                 **kwargs):
        super().__init__()

        self.args = Config().update(locals())
        # the embedding layer
        self.word_embed = nn.Embedding(num_embeddings=n_words,
                                       embedding_dim=n_embed)

        self.feat_embed = CharLSTM(n_chars=n_feats,
                                       n_embed=n_char_embed,
                                       n_out=n_feat_embed,
                                       pad_index=feat_pad_index)
        self.embed_dropout = IndependentDropout(p=embed_dropout)

        # the lstm layer
        self.lstm = BiLSTM(input_size=n_embed+n_feat_embed,
                           hidden_size=n_lstm_hidden,
                           num_layers=n_lstm_layers,
                           dropout=lstm_dropout)
        self.lstm_dropout = SharedDropout(p=lstm_dropout)

        # the MLP layers
        self.mlp_arc_d = MLP(n_in=n_lstm_hidden*2,
                             n_out=n_mlp_arc,
                             dropout=mlp_dropout)
        self.mlp_arc_h = MLP(n_in=n_lstm_hidden*2,
                             n_out=n_mlp_arc,
                             dropout=mlp_dropout)
        self.mlp_rel_d = MLP(n_in=n_lstm_hidden*2,
                             n_out=n_mlp_rel,
                             dropout=mlp_dropout)
        self.mlp_rel_h = MLP(n_in=n_lstm_hidden*2,
                             n_out=n_mlp_rel,
                             dropout=mlp_dropout)

        # the Biaffine layers
        self.arc_attn = Biaffine(n_in=n_mlp_arc,
                                 bias_x=True,
                                 bias_y=False)
        self.rel_attn = Biaffine(n_in=n_mlp_rel,
                                 n_out=n_rels,
                                 bias_x=True,
                                 bias_y=True)
        self.criterion = nn.CrossEntropyLoss()
        self.pad_index = pad_index
        self.unk_index = unk_index

    def load_pretrained(self, embed=None):
        if embed is not None:
            self.pretrained = nn.Embedding.from_pretrained(embed)
            nn.init.zeros_(self.word_embed.weight)
        return self

    def forward(self, words, feats):
        """
        Args:
            words (LongTensor) [batch_size, seq_len]:
                The word indices.
            feats (LongTensor):
                The feat indices.
                If feat is 'char' or 'bert', the size of feats should be [batch_size, seq_len, fix_len]
                If 'tag', then the size is [batch_size, seq_len].

        Returns:
            s_arc (Tensor): [batch_size, seq_len, seq_len]
                The scores of all possible arcs.
            s_rel (Tensor): [batch_size, seq_len, seq_len, n_labels]
                The scores of all possible labels on each arc.
        """

        batch_size, seq_len = words.shape
        # get the mask and lengths of given batch
        mask = words.ne(self.pad_index)
        ext_words = words
        # set the indices larger than num_embeddings to unk_index
        if hasattr(self, 'pretrained'):
            ext_mask = words.ge(self.word_embed.num_embeddings)
            ext_words = words.masked_fill(ext_mask, self.unk_index)

        # get outputs from embedding layers
        word_embed = self.word_embed(ext_words)
        if hasattr(self, 'pretrained'):
            word_embed += self.pretrained(words)
        feat_embed = self.feat_embed(feats)
        word_embed, feat_embed = self.embed_dropout(word_embed, feat_embed)
        # concatenate the word and feat representations
        embed = torch.cat((word_embed, feat_embed), -1)

        x = pack_padded_sequence(embed, mask.sum(1), True, False)
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x, True, total_length=seq_len)
        x = self.lstm_dropout(x)

        # apply MLPs to the BiLSTM output states
        arc_d = self.mlp_arc_d(x)
        arc_h = self.mlp_arc_h(x)
        rel_d = self.mlp_rel_d(x)
        rel_h = self.mlp_rel_h(x)

        # [batch_size, seq_len, seq_len]
        s_arc = self.arc_attn(arc_d, arc_h)
        # [batch_size, seq_len, seq_len, n_rels]
        s_rel = self.rel_attn(rel_d, rel_h).permute(0, 2, 3, 1)
        # set the scores that exceed the length of each sentence to -inf
        s_arc.masked_fill_(~mask.unsqueeze(1), float('-inf'))

        return s_arc, s_rel

    def loss(self, s_arc, s_rel, arcs, rels, mask):
        """
        Args:
            s_arc (Tensor): [batch_size, seq_len, seq_len]
                The scores of all possible arcs.
            s_rel (Tensor): [batch_size, seq_len, seq_len, n_labels]
                The scores of all possible labels on each arc.
            arcs (LongTensor): [batch_size, seq_len]
                Tensor of gold-standard arcs.
            rels (LongTensor): [batch_size, seq_len]
                Tensor of gold-standard labels.
            mask (BoolTensor): [batch_size, seq_len, seq_len]
                Mask for covering the unpadded tokens.

        Returns:
            loss (Tensor): scalar
                The training loss.
        """
        s_arc, arcs = s_arc[mask], arcs[mask]    
        s_rel, rels = s_rel[mask], rels[mask]
        # s_rel: [batch_size, seq_len, seq_len, n_labels] 
        # s_rel[mask] -> [L, seq_len, n_labels]
        # s_rel[torch.arange(len(arcs)), arcs] -> [L, n_labels]

        s_rel = s_rel[torch.arange(len(arcs)), arcs]
        arc_loss = self.criterion(s_arc, arcs)
        rel_loss = self.criterion(s_rel, rels)

        return arc_loss + rel_loss

    def decode(self, s_arc, s_rel, mask, tree=False, proj=False):
        """
        Args:
            s_arc (Tensor): [batch_size, seq_len, seq_len]
                The scores of all possible arcs.
            s_rel (Tensor): [batch_size, seq_len, seq_len, n_labels]
                The scores of all possible labels on each arc.
            mask (BoolTensor): [batch_size, seq_len, seq_len]
                Mask for covering the unpadded tokens.
            tree (bool):
                If True, ensures to output well-formed trees. Default: False.
            proj (bool):
                If True, ensures to output projective trees. Default: False.

        Returns:
            arc_preds (Tensor): [batch_size, seq_len]
                The predicted arcs.
            rel_preds (Tensor): [batch_size, seq_len]
                The predicted labels.
        """

        lens = mask.sum(1)  
        # prevent self-loops
        s_arc.diagonal(0, 1, 2).fill_(float('-inf'))
        arc_preds = mst(s_arc, mask)
        rel_preds = s_rel.argmax(-1).gather(-1, arc_preds.unsqueeze(-1)).squeeze(-1)

        return arc_preds, rel_preds

    @classmethod
    def load(cls, path):
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        state = torch.load(path, map_location=device)
        model = cls(**state['args'])
        model.load_pretrained(state['pretrained'])
        model.load_state_dict(state['state_dict'], False)
        model.to(device)

        return model

    def save(self, path):
        state_dict, pretrained = self.state_dict(), None
        if hasattr(self, 'pretrained'):
            pretrained = state_dict.pop('pretrained.weight')
        state = {
            'args': self.args,
            'state_dict': state_dict,
            'pretrained': pretrained
        }
        torch.save(state, path)


创建optimizer，和控制学习速率衰减的scheduler，采用指数衰减，衰减公式为$0.75^\frac{t}{5000}$
创建前面定义的模型，参数均设为默认

In [31]:
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR

model = BiaffineDependencyModel(n_words=WORD.vocab.n_init,
                                n_feats=len(CHAR.vocab), 
                                n_rels=len(REL.vocab), 
                                pad_index=WORD.pad_index, 
                                unk_index=WORD.unk_index, 
                                bos_index=WORD.bos_index, 
                                feat_pad_index=CHAR.pad_index)
model.load_pretrained(WORD.embed)
optimizer = Adam(model.parameters(), lr=2e-3, betas=(0.9, 0.9), eps=1e-12)
scheduler = ExponentialLR(optimizer, .75**(1/5000))

print(model)

BiaffineDependencyModel(
  (word_embed): Embedding(21679, 100)
  (feat_embed): CharLSTM(82, 50, n_out=100, pad_index=0)
  (embed_dropout): IndependentDropout(p=0.33)
  (lstm): BiLSTM(200, 400, num_layers=3, dropout=0.33)
  (lstm_dropout): SharedDropout(p=0.33, batch_first=True)
  (mlp_arc_d): MLP(n_in=800, n_out=500, dropout=0.33)
  (mlp_arc_h): MLP(n_in=800, n_out=500, dropout=0.33)
  (mlp_rel_d): MLP(n_in=800, n_out=100, dropout=0.33)
  (mlp_rel_h): MLP(n_in=800, n_out=100, dropout=0.33)
  (arc_attn): Biaffine(n_in=500, n_out=1, bias_x=True)
  (rel_attn): Biaffine(n_in=100, n_out=46, bias_x=True, bias_y=True)
  (criterion): CrossEntropyLoss()
  (pretrained): Embedding(401153, 100)
)


# 训练模型

首先定义评价指标。
对ptb数据集而言，评价时需要去掉标点。

In [32]:
from supar.utils.metric import Metric
from supar.utils.fn import ispunct


class AttachmentMetric(Metric):

    def __init__(self, eps=1e-8):
        super().__init__()

        self.eps = eps

        self.n = 0.0
        self.n_ucm = 0.0
        self.n_lcm = 0.0
        self.total = 0.0
        self.correct_arcs = 0.0
        self.correct_rels = 0.0

    def __repr__(self):
        s = f"UCM: {self.ucm:6.2%} LCM: {self.lcm:6.2%} "
        s += f"UAS: {self.uas:6.2%} LAS: {self.las:6.2%}"
        return s

    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        lens = mask.sum(1)
        arc_mask = arc_preds.eq(arc_golds) & mask
        rel_mask = rel_preds.eq(rel_golds) & arc_mask
        arc_mask_seq, rel_mask_seq = arc_mask[mask], rel_mask[mask]

        self.n += len(mask)
        self.n_ucm += arc_mask.sum(1).eq(lens).sum().item()
        self.n_lcm += rel_mask.sum(1).eq(lens).sum().item()

        self.total += len(arc_mask_seq)
        self.correct_arcs += arc_mask_seq.sum().item()
        self.correct_rels += rel_mask_seq.sum().item()

    @property
    def score(self):
        return self.las

    @property
    def ucm(self):
        return self.n_ucm / (self.n + self.eps)

    @property
    def lcm(self):
        return self.n_lcm / (self.n + self.eps)

    @property
    def uas(self):
        return self.correct_arcs / (self.total + self.eps)

    @property
    def las(self):
        return self.correct_rels / (self.total + self.eps)



metric = AttachmentMetric()
metric(torch.tensor([[1, 2, 3, 4, 4]]), # arc_preds
       torch.tensor([[1, 2, 3, 3, 5]]), # rel_preds
       torch.tensor([[1, 2, 3, 4, 5]]), # arc_golds
       torch.tensor([[1, 2, 3, 4, 5]]), # rel_golds
       torch.ones((1, 5)).gt(0))
print(metric)
puncts = torch.tensor([i for s, i in WORD.vocab.stoi.items() if ispunct(s)])
mask = torch.ones((1, 5)).gt(0) & torch.tensor([[1, 2, 3, 4, 5]]).unsqueeze(-1).ne(puncts).all(-1)
print(puncts)
print(WORD.vocab[puncts])
print(mask)
metric = AttachmentMetric()
metric(torch.tensor([[1, 2, 3, 4, 4]]), 
       torch.tensor([[1, 2, 3, 3, 5]]), 
       torch.tensor([[1, 2, 3, 4, 5]]),
       torch.tensor([[1, 2, 3, 4, 5]]),
       mask)
print(metric)

UCM:  0.00% LCM:  0.00% UAS: 80.00% LAS: 60.00%
tensor([     3,      4,      6,      7,      8,      9,     26,     27,     28,
            29,     30,     35,     36,   1955,   1956,   1958,  21679,  21680,
         21681,  21682,  21683,  21684,  21685,  21686,  21687,  21813,  21979,
         21980,  22266,  22267,  22268,  22269,  22270,  22271,  22272,  22273,
         22274,  22275,  22276,  22277,  22278,  22279,  22280,  22281,  22282,
         22283,  22284,  22285,  22286,  22287,  22288,  22289,  22290,  22291,
         22292,  22293,  22294,  22295,  22296,  22297,  22298,  22299,  22300,
         22301,  22302,  22303,  22304,  22305,  22306,  22307,  22517,  22518,
         23328,  62761,  62762,  62763,  62838,  62839,  62843,  62844,  62854,
         62855,  62856,  62857,  62858,  62859,  62860,  62862,  62863,  62864,
         62866,  62867,  62868,  62869,  62870,  62871,  62872,  62873,  62874,
         62875,  62876,  62877,  62878,  62879,  62880,  62881,  62882, 

下面定义了`train_loader`和`evaluate_loader`函数，遍历loader分别进行训练和评价，作为一个epoch。
另外还有`train_parser`函数，训练，评价和保存模型。
简单起见设置`patience`为3，要得到更好的性能，可以设为100。


In [33]:
import os
from datetime import datetime, timedelta


def train_loader(model, optimizer, scheduler, loader):
    model.train()

    metric = AttachmentMetric()

    for words, feats, arcs, rels in loader:
        optimizer.zero_grad()

        mask = words.ne(WORD.pad_index)
        # ignore the first token of each sentence
        mask[:, 0] = 0
        s_arc, s_rel = model(words, feats)
        loss = model.loss(s_arc, s_rel, arcs, rels, mask)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        scheduler.step()

        arc_preds, rel_preds = model.decode(s_arc, s_rel, mask)
        # ignore all punctuation if not specified
        # mask &= words.unsqueeze(-1).ne(self.puncts).all(-1)
        metric(arc_preds, rel_preds, arcs, rels, mask)

@torch.no_grad()
def evaluate_loader(model, loader):
    model.eval()

    total_loss, metric = 0, AttachmentMetric()

    for words, feats, arcs, rels in loader:
        mask = words.ne(WORD.pad_index)
        # ignore the first token of each sentence
        mask[:, 0] = 0
        s_arc, s_rel = model(words, feats)
        loss = model.loss(s_arc, s_rel, arcs, rels, mask)
        arc_preds, rel_preds = model.decode(s_arc, s_rel, mask)
        total_loss += loss.item()
        mask &= words.unsqueeze(-1).ne(puncts).all(-1)
        metric(arc_preds, rel_preds, arcs, rels, mask)
    total_loss /= len(loader)

    return total_loss, metric

def train_parser(train, dev, test, model, optimizer, scheduler,
          path='model',
          epochs=5000,
          patience=3):
    transform.train()

    elapsed = timedelta()
    best_e, best_metric = 1, Metric()

    for epoch in range(1, epochs + 1):
        start = datetime.now()

        print(f"Epoch {epoch} / {epochs}:")
        train_loader(model, optimizer, scheduler, train.loader)
        loss, dev_metric = evaluate_loader(model, dev.loader)
        print(f"{'dev:':6} - loss: {loss:.4f} - {dev_metric}")
        loss, test_metric = evaluate_loader(model, test.loader)
        print(f"{'test:':6} - loss: {loss:.4f} - {test_metric}")

        t = datetime.now() - start
        # save the model if it is the best so far
        if dev_metric > best_metric:
            best_e, best_metric = epoch, dev_metric
            model.save(path)
            print(f"{t}s elapsed (saved)\n")
        else:
            print(f"{t}s elapsed\n")
        elapsed += t
        if epoch - best_e >= patience:
            break
    loss, metric = evaluate_loader(model.load(path), test.loader)

    print(f"Epoch {best_e} saved")
    print(f"{'dev:':6} - {best_metric}")
    print(f"{'test:':6} - {metric}")
    print(f"{elapsed}s elapsed, {elapsed / epoch}s/epoch")

train_parser(train,dev,test,model, optimizer, scheduler)


Epoch 1 saved
dev:   - <supar.utils.metric.Metric object at 0x2ad986ba3910>
test:  - UCM: 19.50% LCM:  9.50% UAS: 81.08% LAS: 75.67%


NameError: name 'epoch' is not defined