# HMM Experiments

In [46]:
# Read annotated corpora with NLTK
# first download data
import nltk
from includes import *

#nltk.download()
# this opens a GUI; double-click on "all" to download everything,
# which fetches the different types of annotated corpora

Here is the complete **PTB** (Penn Treebank) data as provided by NLTK, tagged with the universal tagset

In [13]:
# Load the PoS-tagged Treebank sentences through NLTK,
# requesting the universal tagset instead of the corpus' native tags.
treebank_sents = nltk.corpus.treebank.tagged_sents(tagset="universal")

Now we load the ids of sentences corresponding to training, development, and test sets

In [14]:
def _read_ids(path):
    """Return the integer ids stored one per line in `path`, skipping blank lines."""
    # The context manager closes the file handle deterministically; the
    # bare open() calls used previously never closed their files explicitly.
    with open(path) as id_file:
        return [int(line) for line in id_file if line.strip()]


training_ids = _read_ids('training.ids')
development_ids = _read_ids('development.ids')
test_ids = _read_ids('test.ids')
print(len(training_ids), len(development_ids), len(test_ids))

3000 114 800


Now we separate the 3 parts of the annotation

In [15]:
# Look up the tagged sentences for each split by their sentence ids.
ptb_train, ptb_dev, ptb_test = (
    [treebank_sents[sentence_id] for sentence_id in split_ids]
    for split_ids in (training_ids, development_ids, test_ids)
)

1. We *suggest* you print details about the data such as number of unique words/tags, total tokens, number of sentences, etc.
2. Then you can copy here your implementation from lab4 or import it from a separate python file if you want (but make sure to submit those too)
3. Then you can go on with training/development and test

The following are just **tips** (you need not necessarily follow them).

In [52]:
def extract_sentences(treebank_corpus):
    """
    Strip the tags from a tagged corpus.

    :param treebank_corpus: an iterable of sentences, each an iterable of (word, tag) pairs
    :returns: a list with one list of words per sentence, in the original order
    """
    return [[word for word, _tag in tagged_sentence]
            for tagged_sentence in treebank_corpus]

def accuracy(gold_sequences, pred_sequences):
    """
    Return the fraction (between 0 and 1) of tags that were predicted correctly.

    Sequences are paired position by position with zip, so any surplus
    sequences or tags on the longer side are ignored.

    :param gold_sequences: a list of tag sequences that can be assumed to be correct
    :param pred_sequences: a list of tag sequences predicted by Viterbi
    :returns: correct / total over all compared tags, or None if no tags were compared
    """
    tag_pairs = [
        (pred_tag, gold_tag)
        for pred_seq, gold_seq in zip(pred_sequences, gold_sequences)
        for pred_tag, gold_tag in zip(pred_seq, gold_seq)
    ]
    if not tag_pairs:
        return None
    n_correct = sum(1 for pred_tag, gold_tag in tag_pairs if pred_tag == gold_tag)
    return n_correct / len(tag_pairs)

def predict_corpus(test_set, hmm):
    """
    Return gold and Viterbi-predicted tag sequences for all sentences in a corpus.

    A progress dot is printed roughly every tenth of the corpus.

    :param test_set: a sized collection of tagged sentences, each a non-empty
                     sequence of (word, tag) pairs
    :param hmm     : the model handed through to `viterbi_recursion`
    :returns: a pair (gold_sequences, pred_sequences) of parallel lists of tag lists
    """
    gold_sequences, pred_sequences = list(), list()
    # round(len / 10) is 0 for corpora of five sentences or fewer (round(0.5) == 0),
    # which made the modulo below raise ZeroDivisionError; clamp the step to >= 1.
    progress_step = max(1, round(len(test_set) / 10))
    print('Making predictions', end='')
    for i, sequence in enumerate(test_set):
        if i % progress_step == 0:
            print('.', end='')
        # Split the (word, tag) pairs into a list of words and a list of tags.
        sentence, tags = map(list, zip(*sequence))
        # viterbi_recursion returns a pair; only its first element (the tags) is kept.
        viterbi_tags, _ = viterbi_recursion(sentence, hmm)
        gold_sequences.append(tags)
        pred_sequences.append(viterbi_tags)
    print()
    return gold_sequences, pred_sequences

In [35]:
# Grid search for hyperparameters: fit one model per (alpha, beta) pair
models = []
grid_values = [0.01, 0.1, 1., 10.]
for alpha, beta in [(a, b) for a in grid_values for b in grid_values]:
    print("a, b:", alpha, beta)
    hmm = HMMLM(alpha, beta)
    hmm.estimate_model(ptb_train)
    models.append(hmm)

a, b: 0.01 0.01
Start counting
Start calculating cpd's
..............
Finished cpd's
a, b: 0.01 0.1
Start counting
Start calculating cpd's
..............
Finished cpd's
a, b: 0.01 1.0
Start counting
Start calculating cpd's
..............
Finished cpd's
a, b: 0.01 10.0
Start counting
Start calculating cpd's
..............
Finished cpd's
a, b: 0.1 0.01
Start counting
Start calculating cpd's
..............
Finished cpd's
a, b: 0.1 0.1
Start counting
Start calculating cpd's
..............
Finished cpd's
a, b: 0.1 1.0
Start counting
Start calculating cpd's
..............
Finished cpd's
a, b: 0.1 10.0
Start counting
Start calculating cpd's
..............
Finished cpd's
a, b: 1.0 0.01
Start counting
Start calculating cpd's
..............
Finished cpd's
a, b: 1.0 0.1
Start counting
Start calculating cpd's
..............
Finished cpd's
a, b: 1.0 1.0
Start counting
Start calculating cpd's
..............
Finished cpd's
a, b: 1.0 10.0
Start counting
Start calculating cpd's
..............
Finished 

In [55]:
# Score every model from the grid search on the development set.
sents = extract_sentences(ptb_dev)
results = []
for model in models:
    a, b = model._transition_alpha, model._emission_alpha
    ppl = log_perplexity(sents, model)
    gold_tags, pred_tags = predict_corpus(ptb_dev, model)
    acc = accuracy(gold_tags, pred_tags)
    results.append([a, b, ppl, acc])

# One row per model: its two hyperparameters followed by both metrics.
headers = ['alpha', 'beta', 'ppl', 'acc']

print()
print(tabulate(results, headers=headers))

Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........
Making predictions...........

  alpha    beta      ppl       acc
-------  ------  -------  --------
   0.01    0.01  4.56627  0.880875
   0.01    0.1   4.41098  0.880875
   0.01    1     4.68071  0.841057
   0.01   10     5.53552  0.751632
   0.1     0.01  4.56626  0.880875
   0.1     0.1   4.41095  0.880875
   0.1     1     4.68069  0.841057
   0.1    10     5.53551  0.751632
   1       0.01  4.56613  0.880548
   1       0.1   4.41072  0.880875
   1       1     4.68042  0.841057
   1      10     5.53537  0.751632
  10       0.01  4.56519  0.8

In [70]:
# The best model on the development set is number 13 (alpha 10, beta 0.1):
# both perplexity and accuracy are best for this model.
best_index = 13
print(results[best_index])

# Now use the held-out test set to compute perplexity and accuracy
best_model = models[best_index]
test_sentences = extract_sentences(ptb_test)
ppl = log_perplexity(test_sentences, best_model)
acc = accuracy(*predict_corpus(ptb_test, best_model))
print(round(ppl, 3), round(acc, 3))

[10.0, 0.1, 4.408709283788377, 0.881201044386423]
Making predictions..........4.41 0.886


You can also use *tabulate* to print some examples. For example:

In [41]:
def tabulate_example(sentence, gold_pos, pred_pos):
    """
    Build table rows that line up each word with its gold and predicted tag.

    :param sentence: the words of one sentence
    :param gold_pos: the gold tags for those words
    :param pred_pos: the predicted tags for those words
    :returns: a list of [word, gold_tag, pred_tag] rows, cut off at the shortest input
    """
    return [[word, gold_tag, pred_tag]
            for word, gold_tag, pred_tag in zip(sentence, gold_pos, pred_pos)]

In [92]:
# Show the first four test sentences with gold and predicted tags side by side.
for i in range(4):
    # Unzip the (word, tag) pairs of this sentence into two lists.
    sentence, gold = (list(column) for column in zip(*ptb_test[i]))
    pred, _ = viterbi_recursion(sentence, models[13])
    illustration = tabulate_example(sentence, gold, pred)
    print()
    print(tabulate(illustration, headers=['Word', 'Gold', 'Pred']))
    sentence_accuracy = accuracy([gold], [pred])
    print("accuracy:", round(sentence_accuracy, 4))


Word             Gold    Pred
---------------  ------  ------
But              CONJ    ADJ
Rep.             NOUN    NOUN
Marge            NOUN    ADP
Roukema          NOUN    NOUN
-LRB-            .       .
R.               NOUN    NOUN
,                .       .
N.J              NOUN    NOUN
.                .       .
-RRB-            .       .
instead          ADV     ADV
praised          VERB    ADP
the              DET     DET
House            NOUN    NOUN
's               PRT     PRT
acceptance       NOUN    NOUN
of               ADP     ADP
a                DET     DET
new              ADJ     ADJ
youth            NOUN    NOUN
``               .       .
training         NOUN    NOUN
''               .       .
wage             NOUN    NOUN
,                .       .
a                DET     DET
subminimum       NOUN    NOUN
that             ADP     ADP
GOP              NOUN    NOUN
administrations  NOUN    PRT
have             VERB    VERB
sought           VERB    VERB
*T*-1     