# CMU 11424/11824 Spring 2024: Reinflection and Paradigm Completion

## Baselines for Unimorph Reinflection Task


## Download Datasets

Data should be downloaded from Canvas and uploaded to the Colab filesystem.

In [None]:
!unzip dataset.zip

In [None]:
# %cd /content/
# !rm -rf dataset
# !unzip dataset.zip

/content
unzip:  cannot find or open dataset.zip, dataset.zip.zip or dataset.zip.ZIP.


## Performance Measures

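Predictions are scored by exact-match accuracy: a prediction counts as correct only if it is identical to the gold form after Unicode NFC normalization.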

In [2]:
import unicodedata

def evaluate(gold, pred):
    """Return the exact-match accuracy between two parallel lists of strings.

    Each pair is compared after Unicode NFC normalization, so canonically
    equivalent spellings count as a match. Pairs are formed with ``zip``,
    which means surplus items in the longer list are ignored.

    Returns 0 when there are no pairs to compare, otherwise the fraction of
    matching pairs as a float.
    """
    matched = 0
    compared = 0
    for predicted, reference in zip(pred, gold):
        compared += 1
        if unicodedata.normalize("NFC", predicted) == unicodedata.normalize("NFC", reference):
            matched += 1

    if compared == 0:
        return 0
    return matched / compared

## Non-Neural Baseline

This method is taken from the UniMorph non-neural baseline for the SIGMORPHON 2022 Reinflection Shared Task.

In [3]:
import sys, os, getopt, re
from functools import wraps
from glob import glob

In [4]:
def hamming(s,t):
    """Count the positions at which ``s`` and ``t`` hold different symbols.

    Only the overlapping part is compared: positions beyond the end of the
    shorter argument are ignored.
    """
    mismatches = 0
    for left, right in zip(s, t):
        if left != right:
            mismatches += 1
    return mismatches


def halign(s,t):
    """Align two strings by Hamming distance.

    Both strings are padded with '_' and slid past each other; the placement
    with the fewest mismatching columns wins (the first such placement on a
    tie). Columns in which both strings are padding are then dropped.

    Args:
        s: first string (e.g. a lemma).
        t: second string (e.g. an inflected form).

    Returns:
        A tuple ``(newin, newout)`` of equal-length strings in which '_' marks
        a position where the other string has a symbol and this one has none.
    """
    # Every candidate placement, in the order they are tried: first s is
    # shifted right against a fixed t, then t is shifted left against a fixed s.
    placements = []
    for upad in range(len(t) + 1):
        placements.append(('_' * upad + s + (len(t) - upad) * '_',
                           len(s) * '_' + t))
    for lpad in range(len(s) + 1):
        placements.append((len(t) * '_' + s,
                           (len(s) - lpad) * '_' + t + '_' * lpad))

    # min() keeps the first of several equally good placements, matching a
    # strict "score < best so far" scan over the same sequence.
    bu, bl = min(placements, key = lambda pair: hamming(pair[0], pair[1]))

    # Materialise the surviving columns once. zip() is a one-shot iterator in
    # Python 3, so building both output strings from the same zip object would
    # leave the second one empty.
    columns = [(i, o) for i, o in zip(bu, bl) if i != '_' or o != '_']
    newin  = ''.join(i for i, o in columns)
    newout = ''.join(o for i, o in columns)
    return newin, newout


def levenshtein(s, t, inscost = 1.0, delcost = 1.0, substcost = 1.0):
    """Recursive implementation of Levenshtein, with alignments returned.

    Args:
        s: source string.
        t: target string.
        inscost: cost of emitting a target symbol against a gap in the source.
        delcost: cost of dropping a source symbol against a gap in the target.
        substcost: cost of pairing two different symbols (equal symbols cost 0).

    Returns:
        A tuple ``(aligned_s, aligned_t, cost)`` where the two aligned strings
        have equal length and '_' marks a gap. With the default unit costs the
        cost may be an int or a float of the same value.

    Note:
        The search is recursive, one level per aligned column, so very long
        inputs can hit Python's recursion limit.
    """
    @memolrec
    def lrec(spast, tpast, srem, trem, cost):
        if len(srem) == 0:
            # Source exhausted: every remaining target symbol is an insertion,
            # so it must be charged at inscost (not a flat 1 per symbol).
            return spast + len(trem) * '_', tpast + trem, '', '', cost + len(trem) * inscost
        if len(trem) == 0:
            # Target exhausted: every remaining source symbol is a deletion.
            return spast + srem, tpast + len(srem) * '_', '', '', cost + len(srem) * delcost

        addcost = 0
        if srem[0] != trem[0]:
            addcost = substcost

        # Try match/substitute, insert and delete; on equal cost min() keeps
        # the earliest option in that order.
        return min((lrec(spast + srem[0], tpast + trem[0], srem[1:], trem[1:], cost + addcost),
                   lrec(spast + '_', tpast + trem[0], srem, trem[1:], cost + inscost),
                   lrec(spast + srem[0], tpast + '_', srem[1:], trem, cost + delcost)),
                   key = lambda x: x[4])

    answer = lrec('', '', s, t, 0)
    return answer[0],answer[1],answer[4]


def memolrec(func):
    """Memoizer for Levenshtein.

    Wraps a function called as ``func(sp, tp, sr, tr, cost)`` that returns a
    5-item result whose items 0 and 1 start with ``sp`` and ``tp`` and whose
    item 4 is a cost that includes ``cost``. Results are cached per
    ``(sr, tr)`` pair as the portion contributed beyond the prefixes and the
    incoming cost, so a cached entry can be replayed under any other
    ``sp``/``tp``/``cost``.
    """
    cache = {}

    @wraps(func)
    def wrap(sp, tp, sr, tr, cost):
        key = (sr, tr)
        if key not in cache:
            outcome = func(sp, tp, sr, tr, cost)
            # Keep only what this call added on top of its inputs.
            added_s = outcome[0][len(sp):]
            added_t = outcome[1][len(tp):]
            added_cost = outcome[4] - cost
            cache[key] = (added_s, added_t, added_cost)
        entry = cache[key]
        return sp + entry[0], tp + entry[1], '', '', cost + entry[2]

    return wrap


def alignprs(lemma, form):
    """Split an aligned lemma/form pair into prefix, stem and suffix parts.

    The pair is aligned with ``levenshtein`` and cut at the same two column
    positions on both sides:

        IN:  1 | 2 | 3
        OUT: 4 | 5 | 6

    Parts 1/4 are treated as prefixes, 2/5 as the stem and 3/6 as suffixes.
    The prefix width is the longer of the two leading '_' runs and the suffix
    width the longer of the two trailing '_' runs, so 1/4 and 3/6 may be empty.
    Returns the six parts in the order 1, 2, 3, 4, 5, 6.
    """
    # Substitution costs 1.1 so that an insertion/deletion pair is not
    # outbid by a single substitution when the two tie at unit cost.
    aligned = levenshtein(lemma, form, substcost = 1.1)
    alemma = aligned[0]
    aform = aligned[1]

    def leading_gaps(text):
        return len(text) - len(text.lstrip('_'))

    lspace = max(leading_gaps(alemma), leading_gaps(aform))
    # Trailing gaps are the leading gaps of the reversed strings.
    tspace = max(leading_gaps(alemma[::-1]), leading_gaps(aform[::-1]))

    stem_end = len(alemma) - tspace
    lemma_parts = (alemma[0:lspace], alemma[lspace:stem_end], alemma[stem_end:])
    form_parts = (aform[0:lspace], aform[lspace:stem_end], aform[stem_end:])
    return lemma_parts + form_parts


def prefix_suffix_rules_get(lemma, form):
    """Extract a number of suffix-change and prefix-change rules
    based on a given example lemma+inflected form.

    Args:
        lemma: the citation form.
        form: the inflected form.

    Returns:
        A tuple ``(prules, srules)`` of sets of ``(input, output)`` string
        pairs with alignment gaps ('_') removed. Suffix rules end in the
        word-end marker '>'; prefix rules start with the word-start marker '<'.
    """
    lp,lr,ls,fp,fr,fs = alignprs(lemma, form) # Get six parts, three for in three for out

    # Suffix rules: every tail of stem+suffix, from the whole word down to the
    # bare end marker, paired column-for-column with the same tail of the form.
    ins  = lr + ls + ">"
    outs = fr + fs + ">"
    srules = {(ins[i:].replace('_',''), outs[i:].replace('_',''))
              for i in range(min(len(ins), len(outs)))}

    # Prefix rules: the prefix change followed by 0..len(fr)-1 symbols of
    # context. The context is taken from the form's stem (fr) on both sides.
    # Gaps are stripped once per rule rather than by rebuilding the whole set
    # after every addition; the resulting set is the same.
    inp = "<" + lp
    outp = "<" + fp
    prules = {((inp + fr[:i]).replace('_',''), (outp + fr[:i]).replace('_',''))
              for i in range(len(fr))}

    return prules, srules


def apply_best_rule(lemma, msd, allprules, allsrules):
    """Inflect ``lemma`` for ``msd`` using the learned rule tables.

    ``allprules`` and ``allsrules`` map an MSD to a dict of
    ``(input, output) -> count``. The lemma is wrapped as ``<lemma>`` and:

    * among suffix rules whose input occurs in the wrapped string, the one
      with the longest input is applied; ties go to the higher count, then to
      the longer output;
    * among prefix rules whose input occurs in the (possibly already changed)
      string, the one with the highest count is applied.

    If the MSD appears in neither table the lemma is returned unchanged.
    The '<' and '>' markers are removed from the result.
    """
    known_suffix = msd in allsrules
    known_prefix = msd in allprules
    if not known_prefix and not known_suffix:
        return lemma # Haven't seen this inflection, so bail out

    base = "<" + lemma + ">"

    if known_suffix:
        candidates = []
        for rule, count in allsrules[msd].items():
            if rule[0] in base:
                candidates.append((rule[0], rule[1], count))
        if candidates:
            chosen = max(candidates, key = lambda c: (len(c[0]), c[2], len(c[1])))
            base = base.replace(chosen[0], chosen[1])

    if known_prefix:
        candidates = []
        for rule, count in allprules[msd].items():
            if rule[0] in base:
                candidates.append((rule[0], rule[1], count))
        if candidates:
            chosen = max(candidates, key = lambda c: c[2])
            base = base.replace(chosen[0], chosen[1])

    return base.replace('<', '').replace('>', '')


def numleadingsyms(s, symbol):
    """Return how many characters ``str.lstrip(symbol)`` removes from the start of ``s``."""
    remainder = s.lstrip(symbol)
    return len(s) - len(remainder)


def numtrailingsyms(s, symbol):
    """Return how many characters ``str.rstrip(symbol)`` removes from the end of ``s``."""
    remainder = s.rstrip(symbol)
    return len(s) - len(remainder)

In [5]:
# Train the non-neural baseline on one language, predict the dev set, write the
# predictions to "<lang>.txt" and print the exact-match accuracy.
lang = 'kbd'

allprules, allsrules = {}, {}
# `with` closes the handle as soon as the file is read. The encoding is explicit
# because the data is non-ASCII (assumes the TSV files are UTF-8).
# Blank and whitespace-only lines are skipped so the 3-way split below cannot fail on them.
with open(f"dataset/{lang}.train.tsv", "r", encoding="utf-8") as trainfile:
    lines = [line.strip() for line in trainfile if line.strip()]
trainlemmas = set()
trainmsds = set()

# First, test if language is predominantly suffixing or prefixing
# If prefixing, work with reversed strings
prefbias, suffbias = 0,0
for l in lines:
    lemma, form, msd = l.split(u'\t')
    trainlemmas.add(lemma)
    trainmsds.add(msd)
    aligned = halign(lemma, form)
    # Multi-word and hyphenated entries are left out of the bias estimate.
    if ' ' not in aligned[0] and ' ' not in aligned[1] and '-' not in aligned[0] and '-' not in aligned[1]:
        prefbias += numleadingsyms(aligned[0],'_') + numleadingsyms(aligned[1],'_')
        suffbias += numtrailingsyms(aligned[0],'_') + numtrailingsyms(aligned[1],'_')

# Decided once, after the bias pass; used for both training and prediction.
reverse_strings = prefbias > suffbias

for l in lines: # Read in lines and extract transformation rules from pairs
    lemma, form, msd = l.split(u'\t')
    if reverse_strings:
        lemma = lemma[::-1]
        form = form[::-1]
    prules, srules = prefix_suffix_rules_get(lemma, form)

    # Count how often each rule was seen for this MSD. An MSD only gets an
    # entry in a table once at least one rule of that kind exists for it.
    for r in prules:
        msd_prules = allprules.setdefault(msd, {})
        msd_prules[(r[0],r[1])] = msd_prules.get((r[0],r[1]), 0) + 1

    for r in srules:
        msd_srules = allsrules.setdefault(msd, {})
        msd_srules[(r[0],r[1])] = msd_srules.get((r[0],r[1]), 0) + 1

with open(f"dataset/{lang}.dev.tsv", "r", encoding="utf-8") as devfile:
    evallines = [line.strip() for line in devfile if line.strip()]

pred = []
gold = []

# The context manager guarantees the predictions are flushed and the file is
# closed even if a line fails to parse.
with open(f"{lang}.txt", "w", encoding="utf-8") as outfile:
    for l in evallines:
        # NOTE(review): a file without a gold column would need `lemma, msd = l.split(u'\t')`
        # here instead — confirm the format of the hidden test set.
        lemma, correct, msd = l.split(u'\t')
        if reverse_strings:
            lemma = lemma[::-1]
        outform = apply_best_rule(lemma, msd, allprules, allsrules)
        if reverse_strings:
            outform = outform[::-1]
        pred.append(outform)
        gold.append(correct)

        outfile.write(outform + "\n")

print(pred[:10])
print(gold[:10])
# Arguments follow the evaluate(gold, pred) signature; exact match is symmetric,
# so the score does not depend on the order.
print(evaluate(gold, pred))


['абджхэ', 'абджхэмэ', 'абджхыу', 'абджм', 'абджр', 'абджхэр', 'абджу', 'абджхэмкӏэ', 'абджхэкӏэ', 'абджмкӏэ']
['абджэхэ', 'абджэхэмэ', 'абджэхыу', 'абджэм', 'абджэр', 'абджэхэр', 'абджыу', 'абджэхэмкӏэ', 'абджэхэкӏэ', 'абджэмкӏэ']
0.8833333333333333


## Neural Baseline

This method uses the `neural-transducer` toolkit (cloned below) as the neural baseline, following the neural baseline setup for the SIGMORPHON 2022 Reinflection Shared Task.

In [6]:
# Fetch the neural-transducer repository; a later cell changes into this directory
# and runs its src/train.py.
!git clone https://github.com/shijie-wu/neural-transducer/

Cloning into 'neural-transducer'...
remote: Enumerating objects: 363, done.[K
remote: Counting objects: 100% (110/110), done.[K
remote: Compressing objects: 100% (51/51), done.[K
remote: Total 363 (delta 89), reused 61 (delta 59), pack-reused 253[K
Receiving objects: 100% (363/363), 793.24 KiB | 24.79 MiB/s, done.
Resolving deltas: 100% (221/221), done.


In [7]:
!pip install --upgrade torch==1.13.1

Collecting torch==1.13.1
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==1.13.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==1.13.1)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==1.13.1)
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==1.13.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m350.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_cublas_cu11-11.10.3.66-py3-none-m

In [11]:
run_train = """
#!/bin/bash
lang=$1
arch=${2:-tagtransformer}
suff=$3

lr=0.001
scheduler=warmupinvsqr
epochs=100
warmup=100
beta2=0.98       # 0.999
label_smooth=0.1 # 0.0
total_eval=50
bs=400 # 256

# transformer
layers=4
hs=1024
embed_dim=256
nb_heads=4
#dropout=${2:-0.3}
dropout=0.3
ckpt_dir=checkpoints/sig22

path=../dataset

python src/train.py \
    --dataset sigmorphon17task1 \
    --train $path/$lang.train.tsv \
    --dev $path/$lang.dev.tsv \
    --test $path/$lang.testhidden.tsv \
    --model $ckpt_dir/$arch/$lang \
    --decode greedy --max_decode_len 32 \
    --embed_dim $embed_dim --src_hs $hs --trg_hs $hs --dropout $dropout --nb_heads $nb_heads \
    --label_smooth $label_smooth --total_eval $total_eval \
    --src_layer $layers --trg_layer $layers --max_norm 1 --lr $lr --shuffle \
    --arch $arch --gpuid 0 --estop 1e-8 --bs $bs --epochs $epochs \
    --scheduler $scheduler --warmup_steps $warmup --cleanup_anyway --beta2 $beta2 --bestacc
"""

%cd neural-transducer/
with open('run_train_4.sh', 'w') as f:
  f.write(run_train)
!make
!bash run_train_3.sh xty

[Errno 2] No such file or directory: 'neural-transducer/'
/home/ubuntu/11824_Subword_Modeling/proj2/neural-transducer
make: Nothing to be done for 'all'.
INFO - 03/09/24 23:02:34 - 0:00:00 - command line argument: seed - 0
INFO - 03/09/24 23:02:34 - 0:00:00 - command line argument: train - ['../dataset/xty.train.tsv']
INFO - 03/09/24 23:02:34 - 0:00:00 - command line argument: dev - ['../dataset/xty.dev.tsv']
INFO - 03/09/24 23:02:34 - 0:00:00 - command line argument: test - None
INFO - 03/09/24 23:02:34 - 0:00:00 - command line argument: model - 'checkpoints/sig22/tagtransformer/xty'
INFO - 03/09/24 23:02:34 - 0:00:00 - command line argument: load - ''
INFO - 03/09/24 23:02:34 - 0:00:00 - command line argument: bs - 256
INFO - 03/09/24 23:02:34 - 0:00:00 - command line argument: epochs - 100
INFO - 03/09/24 23:02:34 - 0:00:00 - command line argument: max_steps - 0
INFO - 03/09/24 23:02:34 - 0:00:00 - command line argument: warmup_steps - 100
INFO - 03/09/24 23:02:34 - 0:00:00 - comman

INFO - 03/09/24 23:02:35 - 0:00:01 - maximum training 1100 steps (100 epochs)
INFO - 03/09/24 23:02:35 - 0:00:01 - evaluate every 2 epochs
INFO - 03/09/24 23:02:35 - 0:00:01 - At 0-th epoch with lr 0.000000.
  0%|| 0/11 [00:00<?, ?it/s]
read file: 2677it [00:00, 181823.58it/s]

build tensor: 2677it [00:00, 87808.43it/s]

build tensor: 2677it [00:00, 86988.01it/s]
100%|| 11/11 [00:01<00:00,  6.28it/s]
INFO - 03/09/24 23:02:36 - 0:00:03 - Running average train loss is 3.94738151810386 at epoch 0
INFO - 03/09/24 23:02:36 - 0:00:03 - At 1-th epoch with lr 0.000110.
100%|| 11/11 [00:01<00:00,  8.60it/s]
INFO - 03/09/24 23:02:38 - 0:00:04 - Running average train loss is 3.1707691712812944 at epoch 1
INFO - 03/09/24 23:02:38 - 0:00:04 - At 2-th epoch with lr 0.000220.
100%|| 11/11 [00:01<00:00,  8.35it/s]
INFO - 03/09/24 23:02:39 - 0:00:05 - Running average train loss is 2.714820623397827 at epoch 2
  0%|| 0/1 [00:00<?, ?it/s]
read file: 126it [00:00, 185954.36it/s]

build tensor: 126it [00:0

100%|| 11/11 [00:01<00:00,  8.40it/s]
INFO - 03/09/24 23:03:09 - 0:00:35 - Running average train loss is 0.9665105126120828 at epoch 22
100%|| 1/1 [00:00<00:00, 72.40it/s]
INFO - 03/09/24 23:03:09 - 0:00:35 - Average dev loss is 1.064989686012268 at epoch 22
100%|| 1/1 [00:00<00:00,  8.63it/s]
INFO - 03/09/24 23:03:09 - 0:00:36 - dev accuracy is 59.5238 at epoch 22
INFO - 03/09/24 23:03:09 - 0:00:36 - dev average edit distance is 0.9841 at epoch 22
INFO - 03/09/24 23:03:09 - 0:00:36 - At 23-th epoch with lr 0.000629.
100%|| 11/11 [00:01<00:00,  8.58it/s]
INFO - 03/09/24 23:03:11 - 0:00:37 - Running average train loss is 0.9533577453006398 at epoch 23
INFO - 03/09/24 23:03:11 - 0:00:37 - At 24-th epoch with lr 0.000615.
100%|| 11/11 [00:01<00:00,  8.55it/s]
INFO - 03/09/24 23:03:12 - 0:00:38 - Running average train loss is 0.9440931623632257 at epoch 24
100%|| 1/1 [00:00<00:00, 76.52it/s]
INFO - 03/09/24 23:03:12 - 0:00:38 - Average dev loss is 1.0561436414718628 at epoch 24
100%|| 1/1 

100%|| 1/1 [00:00<00:00,  7.51it/s]
INFO - 03/09/24 23:03:42 - 0:01:09 - dev accuracy is 58.7302 at epoch 44
INFO - 03/09/24 23:03:42 - 0:01:09 - dev average edit distance is 0.9841 at epoch 44
INFO - 03/09/24 23:03:42 - 0:01:09 - At 45-th epoch with lr 0.000449.
100%|| 11/11 [00:01<00:00,  8.34it/s]
INFO - 03/09/24 23:03:44 - 0:01:10 - Running average train loss is 0.8413091193545948 at epoch 45
INFO - 03/09/24 23:03:44 - 0:01:10 - At 46-th epoch with lr 0.000445.
100%|| 11/11 [00:01<00:00,  8.38it/s]
INFO - 03/09/24 23:03:45 - 0:01:11 - Running average train loss is 0.8332182494076815 at epoch 46
100%|| 1/1 [00:00<00:00, 74.90it/s]
INFO - 03/09/24 23:03:45 - 0:01:12 - Average dev loss is 0.9565414786338806 at epoch 46
100%|| 1/1 [00:00<00:00,  7.47it/s]
INFO - 03/09/24 23:03:45 - 0:01:12 - dev accuracy is 61.1111 at epoch 46
INFO - 03/09/24 23:03:45 - 0:01:12 - dev average edit distance is 0.9524 at epoch 46
INFO - 03/09/24 23:03:45 - 0:01:12 - At 47-th epoch with lr 0.000440.
100%||

INFO - 03/09/24 23:04:16 - 0:01:42 - At 67-th epoch with lr 0.000368.
100%|| 11/11 [00:01<00:00,  8.42it/s]
INFO - 03/09/24 23:04:17 - 0:01:43 - Running average train loss is 0.798019219528545 at epoch 67
INFO - 03/09/24 23:04:17 - 0:01:43 - At 68-th epoch with lr 0.000366.
100%|| 11/11 [00:01<00:00,  8.47it/s]
INFO - 03/09/24 23:04:18 - 0:01:45 - Running average train loss is 0.7972369519147006 at epoch 68
100%|| 1/1 [00:00<00:00, 73.53it/s]
INFO - 03/09/24 23:04:18 - 0:01:45 - Average dev loss is 0.9250949621200562 at epoch 68
100%|| 1/1 [00:00<00:00,  7.47it/s]
INFO - 03/09/24 23:04:18 - 0:01:45 - dev accuracy is 69.0476 at epoch 68
INFO - 03/09/24 23:04:18 - 0:01:45 - dev average edit distance is 0.8095 at epoch 68
INFO - 03/09/24 23:04:19 - 0:01:45 - At 69-th epoch with lr 0.000363.
100%|| 11/11 [00:01<00:00,  8.49it/s]
INFO - 03/09/24 23:04:20 - 0:01:46 - Running average train loss is 0.7980135354128751 at epoch 69
INFO - 03/09/24 23:04:20 - 0:01:46 - At 70-th epoch with lr 0.000

100%|| 11/11 [00:01<00:00,  8.40it/s]
INFO - 03/09/24 23:04:51 - 0:02:18 - Running average train loss is 0.7760233391414989 at epoch 90
100%|| 1/1 [00:00<00:00, 76.20it/s]
INFO - 03/09/24 23:04:51 - 0:02:18 - Average dev loss is 0.906409740447998 at epoch 90
100%|| 1/1 [00:00<00:00,  6.99it/s]
INFO - 03/09/24 23:04:51 - 0:02:18 - dev accuracy is 73.0159 at epoch 90
INFO - 03/09/24 23:04:51 - 0:02:18 - dev average edit distance is 0.746 at epoch 90
INFO - 03/09/24 23:04:52 - 0:02:18 - At 91-th epoch with lr 0.000316.
100%|| 11/11 [00:01<00:00,  8.46it/s]
INFO - 03/09/24 23:04:53 - 0:02:19 - Running average train loss is 0.7757478356361389 at epoch 91
INFO - 03/09/24 23:04:53 - 0:02:19 - At 92-th epoch with lr 0.000314.
100%|| 11/11 [00:01<00:00,  8.58it/s]
INFO - 03/09/24 23:04:54 - 0:02:21 - Running average train loss is 0.7766493233767423 at epoch 92
100%|| 1/1 [00:00<00:00, 74.07it/s]
INFO - 03/09/24 23:04:54 - 0:02:21 - Average dev loss is 0.9010225534439087 at epoch 92
100%|| 1/1 [

In [None]:
# Run run_train.sh for language code `xty`, relative to the current working directory.
# NOTE(review): no visible cell writes run_train.sh — confirm it exists in the checkout
# and that the working directory is still neural-transducer/.
!bash run_train.sh xty

### STATS train_2:

hmmfull, bs 64: 

INFO - 03/09/24 22:44:18 - 1:05:08 - DEV accuracy is 75.3968 at epoch -1

INFO - 03/09/24 22:44:18 - 1:05:08 - DEV average edit distance is 0.5397 at epoch -1

INFO - 03/09/24 22:44:18 - 1:05:08 - DEV xty acc 75.3968 dist 0.5397

### STATS train_3: 

tagtransformer, bs 256, dropout 0.5

INFO - 03/09/24 22:57:31 - 0:02:33 - DEV accuracy is 73.0159 at epoch -1

INFO - 03/09/24 22:57:31 - 0:02:33 - DEV average edit distance is 0.7063 at epoch -1

INFO - 03/09/24 22:57:31 - 0:02:33 - DEV xty acc 73.0159 dist 0.7063

### STATS:
tagtransformer original: 

INFO - 03/09/24 23:05:06 - 0:02:32 - DEV accuracy is 73.0159 at epoch -1

INFO - 03/09/24 23:05:06 - 0:02:32 - DEV average edit distance is 0.7063 at epoch -1

INFO - 03/09/24 23:05:06 - 0:02:32 - DEV xty acc 73.0159 dist 0.7063