# CMU 11424/11824 Spring 2024: Reinflection and Paradigm Completion

## Baselines for Unimorph Reinflection Task


## Download Datasets

Data should be downloaded from Canvas and uploaded to the Colab filesystem.

In [None]:
!unzip dataset.zip

In [None]:
# %cd /content/
# !rm -rf dataset
# !unzip dataset.zip

/content
unzip:  cannot find or open dataset.zip, dataset.zip.zip or dataset.zip.ZIP.


## Performance Measures

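Here, we use exact-match accuracy for evaluation: a prediction counts as correct only if it is identical to the gold form after Unicode NFC normalization.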

In [2]:
import unicodedata

def evaluate(gold, pred):
    """Return exact-match accuracy between two parallel sequences of strings.

    Both sides are normalized to Unicode NFC before comparison, so canonically
    equivalent spellings (e.g. precomposed vs. combining accents) count as a
    match. Pairs are formed with ``zip``, so any surplus items in the longer
    sequence are ignored. Returns 0 when there is nothing to compare.
    """
    def nfc(text):
        return unicodedata.normalize("NFC", text)

    hits = 0
    total = 0
    for p, g in zip(pred, gold):
        total += 1
        if nfc(p) == nfc(g):
            hits += 1

    if total == 0:
        return 0
    return hits / total

## Non-Neural Baseline

This method is pulled from the Unimorph non-neural baseline for Sigmorphon Reinflection Shared Task 2022.

In [3]:
import sys, os, getopt, re
from functools import wraps
from glob import glob

In [4]:
def hamming(s,t):
    """Count the positions at which s and t differ.

    Comparison runs over ``zip(s, t)``, i.e. only up to the length of the
    shorter string.
    """
    return sum(1 for x,y in zip(s,t) if x != y)


def halign(s,t):
    """Align two strings by Hamming distance.

    Both strings are padded with '_' to length ``len(s) + len(t)`` and slid
    against each other; the offset with the fewest mismatching positions wins
    (the first such offset on ties).

    Returns a pair ``(newin, newout)``: the padded versions of ``s`` and ``t``
    with every column where both sides are padding removed, so the two strings
    have equal length and '_' marks a gap on that side.
    """
    # Every candidate has len(s) + len(t) columns, so this is an upper bound
    # that the very first candidate is guaranteed to beat.
    minscore = len(s) + len(t) + 1
    bu = bl = ''

    # Pass 1: t is pinned to the right edge while s slides left to right.
    lower = len(s) * '_' + t
    for upad in range(len(t) + 1):
        upper = '_' * upad + s + (len(t) - upad) * '_'
        score = hamming(upper, lower)
        if score < minscore:
            bu, bl, minscore = upper, lower, score

    # Pass 2: s is pinned to the right edge while t slides right to left.
    upper = len(t) * '_' + s
    for lpad in range(len(s) + 1):
        lower = (len(s) - lpad) * '_' + t + '_' * lpad
        score = hamming(upper, lower)
        if score < minscore:
            bu, bl, minscore = upper, lower, score

    # zip() is a one-shot iterator in Python 3: it must be materialised before
    # being read twice, otherwise the second join sees nothing and the aligned
    # output string comes back empty.
    pairs = [(i, o) for i, o in zip(bu, bl) if i != '_' or o != '_']
    newin  = ''.join(i for i, _ in pairs)
    newout = ''.join(o for _, o in pairs)
    return newin, newout


def levenshtein(s, t, inscost = 1.0, delcost = 1.0, substcost = 1.0):
    """Recursive implementation of Levenshtein, with alignments returned.

    Parameters:
        s, t: the source and target strings.
        inscost: cost of emitting a character of ``t`` against a gap in ``s``.
        delcost: cost of emitting a character of ``s`` against a gap in ``t``.
        substcost: cost of pairing two different characters.

    Returns a triple ``(aligned_s, aligned_t, cost)`` where the two aligned
    strings have equal length and '_' marks a gap.
    """
    @memolrec
    def lrec(spast, tpast, srem, trem, cost):
        # Base cases: one side is exhausted, so the rest of the other side is
        # all insertions (resp. deletions). Each one is charged at the
        # configured rate so non-default inscost/delcost are honoured here
        # exactly as in the recursive step below.
        if len(srem) == 0:
            return spast + len(trem) * '_', tpast + trem, '', '', cost + len(trem) * inscost
        if len(trem) == 0:
            return spast + srem, tpast + len(srem) * '_', '', '', cost + len(srem) * delcost

        # Matching characters are free; differing ones pay the substitution cost.
        addcost = 0
        if srem[0] != trem[0]:
            addcost = substcost

        # Try match/substitute, insert, delete; min() keeps the first option
        # on equal cost, so that order is also the tie-break order.
        return min((lrec(spast + srem[0], tpast + trem[0], srem[1:], trem[1:], cost + addcost),
                   lrec(spast + '_', tpast + trem[0], srem, trem[1:], cost + inscost),
                   lrec(spast + srem[0], tpast + '_', srem[1:], trem, cost + delcost)),
                   key = lambda x: x[4])

    answer = lrec('', '', s, t, 0)
    return answer[0],answer[1],answer[4]


def memolrec(func):
    """Memoizer for Levenshtein.

    Wraps a function with the signature ``(sp, tp, sr, tr, cost)`` that
    returns a 5-tuple whose items 0 and 1 extend ``sp`` and ``tp`` and whose
    item 4 is the accumulated cost. Results are cached per ``(sr, tr)`` pair
    as the *increment* over the inputs (appended alignment text and added
    cost), so a cached entry can be replayed on top of any prefix and cost.
    """
    memo = {}

    @wraps(func)
    def wrap(sp, tp, sr, tr, cost):
        key = (sr, tr)
        try:
            tail_s, tail_t, delta = memo[key]
        except KeyError:
            full = func(sp, tp, sr, tr, cost)
            # Strip the caller-specific prefix and starting cost before caching.
            memo[key] = (full[0][len(sp):], full[1][len(tp):], full[4] - cost)
            tail_s, tail_t, delta = memo[key]
        return sp + tail_s, tp + tail_t, '', '', cost + delta

    return wrap


def alignprs(lemma, form):
    """Split an aligned lemma/form pair into prefix, stem and suffix parts.

    IN:  1 | 2 | 3
    OUT: 4 | 5 | 6
    Parts 1/4 are treated as prefixes, 2/5 as the stem and 3/6 as suffixes;
    1/4 and 3/6 may be empty. The six parts are returned in that order and
    still contain the '_' gap symbols of the alignment.
    """
    # Substitutions are priced at 1.1, slightly above a single insertion or
    # deletion, to bias the alignment towards 0:x / x:0 pairs.
    alemma, aform, _ = levenshtein(lemma, form, substcost = 1.1)

    def gap_run(text):
        """Number of '_' symbols at the start of text."""
        return len(text) - len(text.lstrip('_'))

    # Width of the prefix zone: the longer leading gap run of the two sides.
    lspace = max(gap_run(alemma), gap_run(aform))
    # Width of the suffix zone: the longer trailing gap run (measured on the
    # reversed strings).
    tspace = max(gap_run(alemma[::-1]), gap_run(aform[::-1]))

    # Both cut points are measured on the aligned lemma and applied to both
    # strings.
    stem_end = len(alemma) - tspace
    return (alemma[:lspace], alemma[lspace:stem_end], alemma[stem_end:],
            aform[:lspace], aform[lspace:stem_end], aform[stem_end:])


def prefix_suffix_rules_get(lemma, form):
    """Derive prefix-change and suffix-change rules from one lemma/form pair.

    Returns ``(prules, srules)``, two sets of ``(input, output)`` string
    pairs. Suffix rules end in the word-end marker '>', prefix rules start
    with the word-start marker '<'; alignment gap symbols ('_') are removed
    from every rule.
    """
    # Six aligned parts: prefix/stem/suffix of the lemma, then of the form.
    lp, lr, ls, fp, fr, fs = alignprs(lemma, form)

    # Suffix rules: every tail of stem+suffix, paired position by position.
    src_tail = lr + ls + ">"
    dst_tail = fr + fs + ">"
    span = min(len(src_tail), len(dst_tail))
    srules = {
        (src_tail[k:].replace('_', ''), dst_tail[k:].replace('_', ''))
        for k in range(span)
    }

    # Prefix rules: the prefix change followed by ever longer heads of the
    # form's stem. These are produced for every pair, including pairs whose
    # prefix parts are empty. Note that the form's stem is used as context on
    # both sides of the rule.
    src_head = "<" + lp
    dst_head = "<" + fp
    prules = {
        ((src_head + fr[:k]).replace('_', ''), (dst_head + fr[:k]).replace('_', ''))
        for k in range(len(fr))
    }

    return prules, srules


def apply_best_rule(lemma, msd, allprules, allsrules):
    """Inflect lemma for msd using the learned rule tables.

    ``allprules`` and ``allsrules`` map an MSD to a dict of
    ``(input, output) -> frequency``. The suffix rule with the longest
    matching input is applied first (ties broken by frequency, then by output
    length); after that the most frequent matching prefix rule is applied.
    If the MSD appears in neither table the lemma is returned unchanged.
    """
    has_suffix_rules = msd in allsrules
    has_prefix_rules = msd in allprules
    if not (has_suffix_rules or has_prefix_rules):
        return lemma # Haven't seen this inflection, so bail out

    # Boundary markers let rules anchor to the start ('<') and end ('>').
    word = "<" + lemma + ">"

    if has_suffix_rules:
        candidates = [
            (rule[0], rule[1], freq)
            for rule, freq in allsrules[msd].items()
            if rule[0] in word
        ]
        if candidates:
            src, dst, _ = max(candidates, key = lambda c: (len(c[0]), c[2], len(c[1])))
            word = word.replace(src, dst)

    if has_prefix_rules:
        candidates = [
            (rule[0], rule[1], freq)
            for rule, freq in allprules[msd].items()
            if rule[0] in word
        ]
        if candidates:
            src, dst, _ = max(candidates, key = lambda c: c[2])
            word = word.replace(src, dst)

    # Drop the boundary markers again.
    return word.replace('<', '').replace('>', '')


def numleadingsyms(s, symbol):
    """Return how many characters ``str.lstrip(symbol)`` removes from s."""
    remainder = s.lstrip(symbol)
    return len(s) - len(remainder)


def numtrailingsyms(s, symbol):
    """Return how many characters ``str.rstrip(symbol)`` removes from s."""
    remainder = s.rstrip(symbol)
    return len(s) - len(remainder)

In [5]:
# Train the non-neural baseline on dataset/{lang}.train.tsv, predict the dev
# set, write one prediction per line to {lang}.txt and print the exact-match
# accuracy.
lang = 'xty'

allprules, allsrules = {}, {}
# Files are opened with an explicit UTF-8 encoding (the forms contain
# non-ASCII tone marks) and inside `with` so the handles are always closed.
# Blank and whitespace-only lines are skipped: they cannot be split into the
# three expected tab-separated fields.
with open(f"dataset/{lang}.train.tsv", "r", encoding="utf-8") as trainfile:
    lines = [line.strip() for line in trainfile if line.strip()]
trainlemmas = set()
trainmsds = set()

# First, test if language is predominantly suffixing or prefixing
# If prefixing, work with reversed strings
prefbias, suffbias = 0, 0
for l in lines:
    lemma, form, msd = l.split('\t')
    trainlemmas.add(lemma)
    trainmsds.add(msd)
    aligned = halign(lemma, form)
    # Pairs containing spaces or hyphens are left out of the bias count.
    if ' ' not in aligned[0] and ' ' not in aligned[1] and '-' not in aligned[0] and '-' not in aligned[1]:
        prefbias += numleadingsyms(aligned[0], '_') + numleadingsyms(aligned[1], '_')
        suffbias += numtrailingsyms(aligned[0], '_') + numtrailingsyms(aligned[1], '_')

# Decided once: when True, every string is reversed before rule extraction /
# application and predictions are reversed back afterwards.
reverse = prefbias > suffbias

for l in lines: # Read in lines and extract transformation rules from pairs
    lemma, form, msd = l.split('\t')
    if reverse:
        lemma = lemma[::-1]
        form = form[::-1]
    prules, srules = prefix_suffix_rules_get(lemma, form)

    # Count how often each rule was observed for this MSD. A table entry for
    # the MSD is only created once there is at least one rule to store.
    for r in prules:
        counts = allprules.setdefault(msd, {})
        counts[(r[0], r[1])] = counts.get((r[0], r[1]), 0) + 1

    for r in srules:
        counts = allsrules.setdefault(msd, {})
        counts[(r[0], r[1])] = counts.get((r[0], r[1]), 0) + 1

with open(f"dataset/{lang}.dev.tsv", "r", encoding="utf-8") as evalfile:
    evallines = [line.strip() for line in evalfile if line.strip()]

pred = []
gold = []

with open(f"{lang}.txt", "w", encoding="utf-8") as outfile:
    for l in evallines:
        # For an unlabeled file (no gold form) use instead:
        #     lemma, msd = l.split('\t')
        lemma, correct, msd = l.split('\t')
        query = lemma[::-1] if reverse else lemma
        outform = apply_best_rule(query, msd, allprules, allsrules)
        if reverse:
            outform = outform[::-1]
        pred.append(outform)
        gold.append(correct)

        outfile.write(outform + "\n")

print(pred[:10])
print(gold[:10])
print(evaluate(gold, pred))


['chi³i³', 'chi¹⁴i³', 'ni¹-chi³i³', 'chi¹³i³', 'chi⁴i³', 'ka³a⁴', 'ka¹⁴a⁴', 'ni¹-ka³a⁴', 'ka¹³a⁴', 'ka⁴a⁴']
['chi³i³', 'chi¹⁴i³', 'ni¹-chi³i³', 'chi¹³i³', 'chi⁴i⁴', 'ka³a⁴', 'ka¹⁴a²⁴', 'ni¹-ka³a⁴', 'ka¹³a⁴', 'ka⁴a²⁴']
0.753968253968254


## Neural Baseline

This method is pulled from the neural baseline for the SIGMORPHON Reinflection Shared Task 2022, implemented in the `neural-transducer` toolkit (https://github.com/shijie-wu/neural-transducer/).

In [6]:
!git clone https://github.com/shijie-wu/neural-transducer/

Cloning into 'neural-transducer'...
remote: Enumerating objects: 363, done.[K
remote: Counting objects: 100% (110/110), done.[K
remote: Compressing objects: 100% (51/51), done.[K
remote: Total 363 (delta 89), reused 61 (delta 59), pack-reused 253[K
Receiving objects: 100% (363/363), 793.24 KiB | 24.79 MiB/s, done.
Resolving deltas: 100% (221/221), done.


In [1]:
!pip install --upgrade torch==1.13.1



In [3]:
# Shell script used to launch training; it is written to disk below and run
# with bash. Positional arguments of the script: $1 = language code,
# $2 = architecture (defaults to tagtransformer), $3 is read into `suff`.
# NOTE: this is a regular (non-raw) Python string, so each backslash followed
# by a newline is dropped by Python; the `python src/train.py ...` command is
# therefore written to the file as one long line.
run_train = """
#!/bin/bash
lang=$1
arch=${2:-tagtransformer}
suff=$3

lr=0.001
scheduler=warmupinvsqr
epochs=100
warmup=100
beta2=0.98       # 0.999
label_smooth=0.1 # 0.0
total_eval=50
bs=400 # 256

# transformer
layers=6
hs=1024
embed_dim=256
nb_heads=4
#dropout=${2:-0.3}
dropout=0.3
ckpt_dir=checkpoints/sig22

path=../dataset

python src/train.py \
    --dataset sigmorphon17task1 \
    --train $path/$lang.train.tsv \
    --dev $path/$lang.dev.tsv \
    --test $path/$lang.test.tsv \
    --model $ckpt_dir/$arch/$lang \
    --decode greedy --max_decode_len 32 \
    --embed_dim $embed_dim --src_hs $hs --trg_hs $hs --dropout $dropout --nb_heads $nb_heads \
    --label_smooth $label_smooth --total_eval $total_eval \
    --src_layer $layers --trg_layer $layers --max_norm 1 --lr $lr --shuffle \
    --arch $arch --gpuid 0 --estop 1e-8 --bs $bs --epochs $epochs \
    --scheduler $scheduler --warmup_steps $warmup --cleanup_anyway --beta2 $beta2 --bestacc
"""

# Move into the cloned repository; the script's `path=../dataset` is relative
# to this directory.
%cd neural-transducer/
# Save the script under a run-specific name, then build and train on xty.
with open('run_train_66.sh', 'w') as f:
  f.write(run_train)
!make
!bash run_train_66.sh xty

[Errno 2] No such file or directory: 'neural-transducer/'
/home/ubuntu/11824_Subword_Modeling/proj2/neural-transducer
make: Nothing to be done for 'all'.
INFO - 03/10/24 02:24:34 - 0:00:00 - command line argument: seed - 0
INFO - 03/10/24 02:24:34 - 0:00:00 - command line argument: train - ['../dataset/xty.train.tsv']
INFO - 03/10/24 02:24:34 - 0:00:00 - command line argument: dev - ['../dataset/xty.dev.tsv']
INFO - 03/10/24 02:24:34 - 0:00:00 - command line argument: test - ['../dataset/xty.test.tsv']
INFO - 03/10/24 02:24:34 - 0:00:00 - command line argument: model - 'checkpoints/sig22/tagtransformer/xty'
INFO - 03/10/24 02:24:34 - 0:00:00 - command line argument: load - ''
INFO - 03/10/24 02:24:34 - 0:00:00 - command line argument: bs - 400
INFO - 03/10/24 02:24:34 - 0:00:00 - command line argument: epochs - 100
INFO - 03/10/24 02:24:34 - 0:00:00 - command line argument: max_steps - 0
INFO - 03/10/24 02:24:34 - 0:00:00 - command line argument: warmup_steps - 100
INFO - 03/10/24 02:2

INFO - 03/10/24 02:24:35 - 0:00:01 - maximum training 700 steps (100 epochs)
INFO - 03/10/24 02:24:35 - 0:00:01 - evaluate every 2 epochs
INFO - 03/10/24 02:24:35 - 0:00:01 - At 0-th epoch with lr 0.000000.
  0%|| 0/7 [00:00<?, ?it/s]-------------

read file: 2677it [00:00, 171053.94it/s]

build tensor: 2677it [00:00, 84218.93it/s]

build tensor: 2677it [00:00, 83375.92it/s]
100%|| 7/7 [00:02<00:00,  2.77it/s]
INFO - 03/10/24 02:24:38 - 0:00:04 - Running average train loss is 3.908443178449358 at epoch 0
INFO - 03/10/24 02:24:38 - 0:00:04 - At 1-th epoch with lr 0.000070.
100%|| 7/7 [00:02<00:00,  3.43it/s]
INFO - 03/10/24 02:24:40 - 0:00:06 - Running average train loss is 3.157003198351179 at epoch 1
INFO - 03/10/24 02:24:40 - 0:00:06 - At 2-th epoch with lr 0.000140.
100%|| 7/7 [00:02<00:00,  3.39it/s]
INFO - 03/10/24 02:24:42 - 0:00:08 - Running average train loss is 2.760682685034616 at epoch 2
  0%|| 0/1 [00:00<?, ?it/s]-------------

read file: 126it [00:00, 174071.91it/s]

build

100%|| 1/1 [00:00<00:00,  4.69it/s]
INFO - 03/10/24 02:26:21 - 0:01:47 - dev accuracy is 73.0159 at epoch 44
INFO - 03/10/24 02:26:21 - 0:01:47 - dev average edit distance is 0.754 at epoch 44
INFO - 03/10/24 02:26:22 - 0:01:47 - At 45-th epoch with lr 0.000563.
100%|| 7/7 [00:02<00:00,  3.37it/s]
INFO - 03/10/24 02:26:24 - 0:01:50 - Running average train loss is 0.7298248069626945 at epoch 45
INFO - 03/10/24 02:26:24 - 0:01:50 - At 46-th epoch with lr 0.000557.
100%|| 7/7 [00:02<00:00,  3.45it/s]
INFO - 03/10/24 02:26:26 - 0:01:52 - Running average train loss is 0.7293112788881574 at epoch 46
100%|| 1/1 [00:00<00:00, 47.53it/s]
INFO - 03/10/24 02:26:26 - 0:01:52 - Average dev loss is 0.8566110730171204 at epoch 46
100%|| 1/1 [00:00<00:00,  4.73it/s]
INFO - 03/10/24 02:26:26 - 0:01:52 - dev accuracy is 73.8095 at epoch 46
INFO - 03/10/24 02:26:26 - 0:01:52 - dev average edit distance is 0.7302 at epoch 46
INFO - 03/10/24 02:26:26 - 0:01:52 - At 47-th epoch with lr 0.000551.
100%|| 7/7 

100%|| 7/7 [00:02<00:00,  3.48it/s]
INFO - 03/10/24 02:27:15 - 0:02:41 - Running average train loss is 0.7053625157901219 at epoch 67
INFO - 03/10/24 02:27:15 - 0:02:41 - At 68-th epoch with lr 0.000458.
100%|| 7/7 [00:01<00:00,  3.50it/s]
INFO - 03/10/24 02:27:17 - 0:02:43 - Running average train loss is 0.7049819486481803 at epoch 68
100%|| 1/1 [00:00<00:00, 48.00it/s]
INFO - 03/10/24 02:27:17 - 0:02:43 - Average dev loss is 0.8557988405227661 at epoch 68
100%|| 1/1 [00:00<00:00,  4.69it/s]
INFO - 03/10/24 02:27:18 - 0:02:43 - dev accuracy is 73.8095 at epoch 68
INFO - 03/10/24 02:27:18 - 0:02:43 - dev average edit distance is 0.7302 at epoch 68
INFO - 03/10/24 02:27:18 - 0:02:44 - At 69-th epoch with lr 0.000455.
100%|| 7/7 [00:02<00:00,  3.45it/s]
INFO - 03/10/24 02:27:20 - 0:02:46 - Running average train loss is 0.7051427960395813 at epoch 69
INFO - 03/10/24 02:27:20 - 0:02:46 - At 70-th epoch with lr 0.000452.
100%|| 7/7 [00:02<00:00,  3.31it/s]
INFO - 03/10/24 02:27:22 - 0:02:48

In [2]:
%cd neural-transducer/

/home/ubuntu/11824_Subword_Modeling/proj2/neural-transducer


In [None]:
!bash run_train_2.sh kbd

INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: seed - 0
INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: train - ['../dataset/kbd.train.tsv']
INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: dev - ['../dataset/kbd.dev.tsv']
INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: test - ['../dataset/kbd.test.tsv']
INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: model - 'checkpoints/sig22/hmmfull/kbd'
INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: load - ''
INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: bs - 64
INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: epochs - 100
INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: max_steps - 0
INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: warmup_steps - 100
INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: total_eval - 50
INFO - 03/10/24 04:15:14 - 0:00:00 - command line argument: optimizer - <Optimizer.adam: 'adam'>
INFO - 03

INFO - 03/10/24 04:17:19 - 0:02:05 - At 5-th epoch with lr 0.000767.
100%|| 34/34 [00:21<00:00,  1.58it/s]
INFO - 03/10/24 04:17:40 - 0:02:26 - Running average train loss is 0.00842467401697136 at epoch 5
INFO - 03/10/24 04:17:40 - 0:02:26 - At 6-th epoch with lr 0.000700.
100%|| 34/34 [00:21<00:00,  1.59it/s]
INFO - 03/10/24 04:18:01 - 0:02:48 - Running average train loss is 0.009159151690683383 at epoch 6
100%|| 5/5 [00:00<00:00,  7.05it/s]
INFO - 03/10/24 04:18:02 - 0:02:48 - Average dev loss is 0.19948814064264297 at epoch 6
100%|| 5/5 [00:00<00:00,  7.22it/s]
INFO - 03/10/24 04:18:03 - 0:02:49 - dev accuracy is 69.3333 at epoch 6
INFO - 03/10/24 04:18:03 - 0:02:49 - dev average edit distance is 0.6 at epoch 6
INFO - 03/10/24 04:18:13 - 0:02:59 - At 7-th epoch with lr 0.000648.
100%|| 34/34 [00:21<00:00,  1.58it/s]
INFO - 03/10/24 04:18:35 - 0:03:21 - Running average train loss is 0.004225925247951904 at epoch 7
INFO - 03/10/24 04:18:35 - 0:03:21 - At 8-th epoch with lr 0.000606.
1

100%|| 34/34 [00:21<00:00,  1.55it/s]
INFO - 03/10/24 04:28:08 - 0:12:54 - Running average train loss is 0.000391648513280362 at epoch 28
100%|| 5/5 [00:00<00:00,  7.10it/s]
INFO - 03/10/24 04:28:09 - 0:12:55 - Average dev loss is 0.22348632141947747 at epoch 28
100%|| 5/5 [00:00<00:00,  6.90it/s]
INFO - 03/10/24 04:28:09 - 0:12:56 - dev accuracy is 76.6667 at epoch 28
INFO - 03/10/24 04:28:09 - 0:12:56 - dev average edit distance is 0.3567 at epoch 28
INFO - 03/10/24 04:28:20 - 0:13:06 - At 29-th epoch with lr 0.000318.
100%|| 34/34 [00:21<00:00,  1.57it/s]
INFO - 03/10/24 04:28:42 - 0:13:28 - Running average train loss is 0.0008060962818088977 at epoch 29
INFO - 03/10/24 04:28:42 - 0:13:28 - At 30-th epoch with lr 0.000313.
100%|| 34/34 [00:21<00:00,  1.58it/s]
INFO - 03/10/24 04:29:03 - 0:13:49 - Running average train loss is 0.000925277312998172 at epoch 30
100%|| 5/5 [00:00<00:00,  7.08it/s]
INFO - 03/10/24 04:29:04 - 0:13:50 - Average dev loss is 0.21662504076957703 at epoch 30
1

100%|| 5/5 [00:00<00:00,  7.08it/s]
INFO - 03/10/24 04:38:17 - 0:23:03 - Average dev loss is 0.29728618264198303 at epoch 50
100%|| 5/5 [00:00<00:00,  6.86it/s]
INFO - 03/10/24 04:38:18 - 0:23:04 - dev accuracy is 73.0 at epoch 50
INFO - 03/10/24 04:38:18 - 0:23:04 - dev average edit distance is 0.4133 at epoch 50
INFO - 03/10/24 04:38:28 - 0:23:14 - At 51-th epoch with lr 0.000240.
100%|| 34/34 [00:21<00:00,  1.56it/s]
INFO - 03/10/24 04:38:50 - 0:23:36 - Running average train loss is 1.4408779520636017e-06 at epoch 51
INFO - 03/10/24 04:38:50 - 0:23:36 - At 52-th epoch with lr 0.000238.
100%|| 34/34 [00:21<00:00,  1.56it/s]
INFO - 03/10/24 04:39:12 - 0:23:58 - Running average train loss is 1.146990598499542e-06 at epoch 52
100%|| 5/5 [00:00<00:00,  7.12it/s]
INFO - 03/10/24 04:39:13 - 0:23:59 - Average dev loss is 0.2937383234500885 at epoch 52
100%|| 5/5 [00:00<00:00,  6.95it/s]
INFO - 03/10/24 04:39:13 - 0:24:00 - dev accuracy is 72.3333 at epoch 52
INFO - 03/10/24 04:39:13 - 0:24:

### STATS train_2:

hmmfull, bs 64: 

INFO - 03/09/24 22:44:18 - 1:05:08 - DEV accuracy is 75.3968 at epoch -1

INFO - 03/09/24 22:44:18 - 1:05:08 - DEV average edit distance is 0.5397 at epoch -1

INFO - 03/09/24 22:44:18 - 1:05:08 - DEV xty acc 75.3968 dist 0.5397

### STATS train_3: 

tagtransformer, bs 256, dropout 0.5

INFO - 03/09/24 22:57:31 - 0:02:33 - DEV accuracy is 73.0159 at epoch -1

INFO - 03/09/24 22:57:31 - 0:02:33 - DEV average edit distance is 0.7063 at epoch -1

INFO - 03/09/24 22:57:31 - 0:02:33 - DEV xty acc 73.0159 dist 0.7063

### STATS train_4:
tagtransformer original: 

INFO - 03/10/24 00:36:49 - 0:02:31 - DEV accuracy is 76.9841 at epoch -1

INFO - 03/10/24 00:36:49 - 0:02:31 - DEV average edit distance is 0.6746 at epoch -1

INFO - 03/10/24 00:36:49 - 0:02:31 - DEV xty acc 76.9841 dist 0.6746

### STATS train_6:

tagtransformer, layer 6:

INFO - 03/10/24 02:20:51 - 0:03:45 - DEV accuracy is 79.3651 at epoch -1

INFO - 03/10/24 02:20:51 - 0:03:45 - DEV average edit distance is 0.6746 at epoch -1

INFO - 03/10/24 02:20:51 - 0:03:45 - DEV xty acc 79.3651 dist 0.6746

INFO - 03/10/24 02:20:51 - 0:03:45 - TEST accuracy is 22.5296 at epoch -1

INFO - 03/10/24 02:20:51 - 0:03:45 - TEST average edit distance is 1.8024 at epoch -1

INFO - 03/10/24 02:20:51 - 0:03:45 - TEST xty acc 22.5296 dist 1.8024

### STATS train_66:

tag, layer 6 att_head 8:

INFO - 03/10/24 02:28:32 - 0:03:58 - DEV accuracy is 76.9841 at epoch -1

INFO - 03/10/24 02:28:32 - 0:03:58 - DEV average edit distance is 0.6746 at epoch -1

INFO - 03/10/24 02:28:32 - 0:03:58 - DEV xty acc 76.9841 dist 0.6746

INFO - 03/10/24 02:28:33 - 0:03:59 - TEST accuracy is 22.9249 at epoch -1

INFO - 03/10/24 02:28:33 - 0:03:59 - TEST average edit distance is 1.917 at epoch -1

INFO - 03/10/24 02:28:33 - 0:03:59 - TEST xty acc 22.9249 dist 1.917