In [1]:
#!/usr/bin/env python3

This file illustrates how you might experiment with the HMM interface.
You can paste these commands in at the Python prompt, or execute `test_en.py` directly.
A notebook interface is nicer than the plain Python prompt, so we provide
a notebook version of this file as `test_en.ipynb`, which you can open with
`jupyter` or with Visual Studio `code` (run it with the `nlp-class` kernel).

In [2]:
import logging
import math
import os
from pathlib import Path
from typing import Callable

In [3]:
from corpus import TaggedCorpus
from eval import eval_tagging, model_cross_entropy, viterbi_error_rate
from hmm import HiddenMarkovModel
from lexicon import build_lexicon
import torch

  from .autonotebook import tqdm as notebook_tqdm


Set up logging.

In [4]:
# Configure the root logger first: lines look like "LEVEL : message", and
# anything below INFO is suppressed (could change INFO to DEBUG).
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s : %(message)s",
)
# Logger used throughout this notebook.  For usage, see findsim.py in earlier assignment.
log = logging.getLogger("test_en")
# torch.autograd.set_detect_anomaly(True)    # uncomment to improve error messages from .backward(), but slows down

Switch working directory to the directory where the data live.  You may need to edit this line.

In [5]:
# Remember the directory the kernel started in (only on the first execution of
# this cell), and always interpret the data path relative to it.  A bare
# os.chdir("../data") is not safe to re-run: the second time, "../data" would
# be looked up from inside the data directory.  You may need to edit the path
# on the last line; edits take effect the next time this cell is run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = Path.cwd()
os.chdir(NOTEBOOK_DIR / ".." / "data")

In [6]:
# Load the corpora.  `ensup` and `endev` are constructed with entrain's tagset
# and vocab (passed explicitly below) rather than building their own.
entrain = TaggedCorpus(Path("ensup"), Path("enraw"))                               # all training
ensup =   TaggedCorpus(Path("ensup"), tagset=entrain.tagset, vocab=entrain.vocab)  # supervised training
endev =   TaggedCorpus(Path("endev"), tagset=entrain.tagset, vocab=entrain.vocab)  # evaluation
# Fixed: the message previously contained a stray "f" before the brace, so the
# log line read "Tagset: f[...]".
log.info(f"Tagset: {list(entrain.tagset)}")
# A separate corpus over ensup alone, built only to obtain its own vocab.
known_vocab = TaggedCorpus(Path("ensup")).vocab    # words seen with supervised tags; used in evaluation

INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types
INFO : Tagset: f['W', 'J', 'N', 'C', 'V', 'I', 'D', ',', 'M', 'P', '.', 'E', 'R', '`', "'", 'T', '$', ':', '-', '#', 'S', 'F', 'U', 'L', '_EOS_TAG_', '_BOS_TAG_']
INFO : Read 95936 tokens from ensup
INFO : Created 26 tag types
INFO : Created 12466 word types


Make an HMM.  Let's do supervised pre-training to approximately
maximize the regularized log-likelihood.  If you want to speed this
up, you can increase the tolerance of training (using the
`tolerance` argument), since we don't really have to train to
convergence.

We arbitrarily choose `reg=1`, but it would be better to search
for the best regularization strength.

In [7]:
# Build the lexicon from the full training corpus and the word embeddings
# file (works better with more dims!).
lexicon = build_lexicon(
    entrain,
    embeddings_file=Path('words-50.txt'),
)

# New model with randomly initialized parameters.
hmm = HiddenMarkovModel(entrain.tagset, entrain.vocab, lexicon)


def loss_sup(model):
    """Return the cross-entropy of `model`, evaluated on the supervised corpus `ensup`."""
    return model_cross_entropy(model, eval_corpus=ensup)


# Supervised pre-training; the trained model is saved to ensup_hmm.pkl.
hmm.train(
    corpus=ensup,
    loss=loss_sup,
    minibatch_size=30,
    evalbatch_size=10000,
    reg=1,
    lr=0.0001,
    save_path="ensup_hmm.pkl",
)

INFO : From words-50.txt, got embeddings for 10420 of 18461 word types
INFO : Training HiddenMarkovModel with 1976 parameters
4051it [00:07, 545.22it/s]
INFO : Cross-entropy: 12.5261 nats (= perplexity 275434.516)
1it [00:07,  7.62s/it]

12.526105188718478


4051it [00:06, 578.92it/s]
INFO : Cross-entropy: 8.8428 nats (= perplexity 6924.085)
10001it [02:44, 10.03it/s]

8.842761257247592


4051it [00:07, 563.27it/s]
INFO : Cross-entropy: 8.5613 nats (= perplexity 5225.534)
20001it [05:12, 10.94it/s]

8.561312288720398


4051it [00:06, 596.81it/s]
INFO : Cross-entropy: 8.4019 nats (= perplexity 4455.305)
30001it [07:43, 12.07it/s]

8.401850737013811


4051it [00:07, 508.69it/s]
INFO : Cross-entropy: 8.3053 nats (= perplexity 4045.106)
40001it [10:22,  7.51it/s]

8.305262947065351


4051it [00:07, 571.20it/s] 
INFO : Cross-entropy: 8.2401 nats (= perplexity 3790.021)
50001it [12:51, 11.19it/s]

8.240126956460243


4051it [00:06, 592.70it/s]
INFO : Cross-entropy: 8.1914 nats (= perplexity 3609.623)
60001it [15:14, 11.68it/s]

8.191358508598459


4051it [00:06, 652.33it/s]
INFO : Cross-entropy: 8.1560 nats (= perplexity 3484.173)
70001it [17:38, 11.27it/s]

8.155985990897781


4051it [00:07, 540.60it/s] 
INFO : Cross-entropy: 8.1244 nats (= perplexity 3375.717)
80001it [19:57, 10.56it/s]

8.124363016004184


4051it [00:06, 646.20it/s]
INFO : Cross-entropy: 8.0995 nats (= perplexity 3292.720)
90001it [22:18, 13.33it/s]

8.099469246118877


4051it [00:07, 570.29it/s]
INFO : Cross-entropy: 8.0781 nats (= perplexity 3223.228)
100001it [24:33,  9.80it/s]

8.078138514234725


4051it [00:07, 515.94it/s]]
INFO : Cross-entropy: 8.0614 nats (= perplexity 3169.645)
110001it [26:55,  9.76it/s]

8.06137496290274


4051it [00:08, 457.09it/s]]
INFO : Cross-entropy: 8.0456 nats (= perplexity 3119.964)
120001it [29:33,  9.61it/s]

8.04557678471941


4051it [00:06, 594.85it/s]]
INFO : Cross-entropy: 8.0326 nats (= perplexity 3079.783)
130001it [32:09, 10.09it/s]

8.03261450685748


4051it [00:07, 513.14it/s]]
INFO : Cross-entropy: 8.0181 nats (= perplexity 3035.365)
140001it [34:45, 10.03it/s]

8.018086800231936


4051it [00:06, 611.15it/s]]
INFO : Cross-entropy: 8.0106 nats (= perplexity 3012.711)
INFO : Saving model to ensup_hmm.pkl
INFO : Saved model to ensup_hmm.pkl
150000it [37:05, 67.41it/s]

8.010595584579315





Now let's throw in the unsupervised training data as well, and continue
training to try to improve accuracy on held-out development data.
We'll stop when this accuracy stops getting better.

This step is delicate, so we'll use a much smaller learning rate and
pause to evaluate more often, in hopes that tagging accuracy will go
up for a little bit before it goes down again (see Merialdo 1994).
(Log-likelihood will continue to improve, just not accuracy.)

In [8]:
# Reset to the supervised model (in case you're re-executing this bit).
hmm = HiddenMarkovModel.load("ensup_hmm.pkl")


def loss_dev(model):
    """Return the Viterbi error rate of `model` on the held-out corpus `endev`."""
    return viterbi_error_rate(model, eval_corpus=endev, known_vocab=known_vocab)


# Continue training on all of the training data (supervised + raw) with a much
# smaller learning rate, pausing to evaluate 4 times per epoch.
# NOTE(review): in the recorded output of this cell, the reported cross-entropy
# is nan by the second evaluation and tagging accuracy collapses, yet the model
# is still saved to entrain_hmm.pkl -- check the saved model before relying on it.
hmm.train(
    corpus=entrain,
    loss=loss_dev,
    minibatch_size=30,
    evalbatch_size=len(entrain)//4,
    reg=1,
    lr=0.000001,
    save_path="entrain_hmm.pkl",
)

INFO : Loading model from ensup_hmm.pkl
INFO : Loaded model from ensup_hmm.pkl
INFO : Training HiddenMarkovModel with 1976 parameters
996it [00:02, 420.52it/s]
INFO : Cross-entropy: 8.0400 nats (= perplexity 3102.635)


8.040006991669514


996it [00:02, 359.64it/s]
INFO : Tagging accuracy: all: 81.739%, known: 83.175%, seen: 72.391%, novel: 62.814%
996it [00:02, 344.89it/s]
INFO : Cross-entropy: nan nats (= perplexity nan)


nan


996it [00:03, 331.56it/s]
INFO : Tagging accuracy: all: 4.664%, known: 5.073%, seen: 0.168%, novel: 0.000%
INFO : Saving model to entrain_hmm.pkl
INFO : Saved model to entrain_hmm.pkl
2016it [00:48, 41.65it/s]


You can also retry the above workflow starting from a worse
supervised model (as Merialdo did).  Replace `ensup` throughout the
corpus setup with `ensup-tiny`, which is only 25 sentences (that
cover all tags in `endev`).  Also change the names of your saved
models, so that you don't overwrite the ones trained above.

Take a more detailed look at the first 10 sentences in the held-out corpus,
including their Viterbi taggings.

In [9]:
for m, sentence in enumerate(endev):
    # Only look at the first 10 held-out sentences.
    if m >= 10:
        break

    # Tag the desupervised form of the sentence (presumably the gold tags
    # removed -- confirm in corpus.py), then score the result against the gold.
    untagged = sentence.desupervise()
    viterbi = hmm.viterbi_tagging(untagged, endev)
    counts = eval_tagging(
        predicted=viterbi,
        gold=sentence,
        known_vocab=known_vocab,
    )
    num = counts['NUM', 'ALL']
    denom = counts['DENOM', 'ALL']
    mistakes = denom - num

    log.info(f"Gold:    {sentence}")
    log.info(f"Viterbi: {viterbi}")
    log.info(f"Loss:    {mistakes}/{denom}")
    # Exponentiate the model's log-probability of the gold-tagged sentence.
    log.info(f"Prob:    {math.exp(hmm.log_prob(sentence, endev))}")

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: [('_BOS_WORD_', 'W'), ('``', 'W'), ('We', 'W'), ("'re", 'W'), ('strongly', 'W'), ('_OOV_', 'W'), ('that', 'W'), ('anyone', 'W'), ('who', 'W'), ('has', 'W'), ('eaten', 'W'), ('in', 'W'), ('the', 'W'), ('cafeteria', 'W'), ('this', 'W'), ('month', 'W'), ('have', 'W'), ('the', 'W'), ('shot', 'W'), (',', 'W'), ("''", 'W'), ('Mr.', 'W'), ('Mattausch', 'W'), ('added', 'W'), (',', 'W'), ('``', 'W'), ('and', 'W'), ('that', 'W'), ('means', 'W'), ('virtually', 'W'), ('everyone', 'W'), ('who', 'W'), ('works', 'W'), ('here', 'W'), ('.', 'W'), ('_EOS_WORD_', '_EOS_TAG_')]
INFO : Loss:    33/36
INFO : Prob:    nan
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_