In [1]:
import nltk
from nltk.corpus import treebank

nltk.download('punkt')
nltk.download('treebank')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/administrator/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     /home/administrator/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [4]:
tagged_sents = treebank.tagged_sents()
training_set = tagged_sents[:3500]
test_set = tagged_sents[3500:]

In [5]:
print("Length of training set: ", len(training_set))
print("Length of test set: ", len(test_set))

print("First sentence in training set: ", training_set[0])
print("First sentence in test set: ", test_set[0])

Length of training set:  3500
Length of test set:  414
First sentence in training set:  [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
First sentence in test set:  [('About', 'IN'), ('30', 'CD'), ('%', 'NN'), ('of', 'IN'), ('Ratners', 'NNP'), ("'s", 'POS'), ('profit', 'NN'), ('already', 'RB'), ('is', 'VBZ'), ('derived', 'VBN'), ('*-1', '-NONE-'), ('from', 'IN'), ('the', 'DT'), ('U.S.', 'NNP'), ('.', '.')]


In [11]:
def train_tagger(model, train_set, backoff=None):
    return model(train_set, backoff=backoff)

In [19]:
def train_many_taggers(models: list, train_set, backoff: list):
    out = []
    for model, backoff in zip(models, backoff):
        out.append(train_tagger(model, train_set, backoff=backoff))
    return tuple(out)

In [24]:
def evaluate_tagger(tagger, test_set):
    return tagger.accuracy(test_set)

In [22]:
def evaluate_many_taggers(taggers: list, test_set):
    out = []
    for tagger in taggers:
        out.append(evaluate_tagger(tagger, test_set))
    return tuple(out)

In [20]:
affix_tagger, unigram_tagger, bigram_tagger, trigram_tagger = train_many_taggers(
    [nltk.AffixTagger, nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger],
    training_set,
    [None, None, None, None])

In [25]:
affix_accuracy, unigram_accuracy, bigram_accuracy, trigram_accuracy = evaluate_many_taggers(
    [affix_tagger, unigram_tagger, bigram_tagger, trigram_tagger],
    test_set)

In [27]:
def print_evaluation_result(name, accuracy):
    print(f"{name} accuracy: {accuracy}")

In [28]:
def print_many_evaluation_results(names: list, accuracies: list):
    for name, accuracy in zip(names, accuracies):
        print_evaluation_result(name, accuracy)

In [29]:
print_many_evaluation_results(
    ["Affix", "Unigram", "Bigram", "Trigram"],
    [affix_accuracy, unigram_accuracy, bigram_accuracy, trigram_accuracy])

Affix accuracy: 0.2756317165262852
Unigram accuracy: 0.8607803272340013
Bigram accuracy: 0.13466937748087907
Trigram accuracy: 0.08064672281924679


In [30]:
new_unigram_tagger, new_bigram_tagger, new_trigram_tagger = train_many_taggers(
    [nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger],
    training_set,
    [affix_tagger, unigram_tagger, bigram_tagger])

In [31]:
new_unigram_accuracy, new_bigram_accuracy, new_trigram_accuracy = evaluate_many_taggers(
    [new_unigram_tagger, new_bigram_tagger, new_trigram_tagger],
    test_set)

In [32]:
print_many_evaluation_results(
    ["New Unigram", "New Bigram", "New Trigram"],
    [new_unigram_accuracy, new_bigram_accuracy, new_trigram_accuracy])

New Unigram accuracy: 0.8985380966211637
New Bigram accuracy: 0.8701713621841417
New Trigram accuracy: 0.13525026624068157


# The tagging accuracy for the individual taggers without a backoff model is (much) lower than the tagging accuracy for the corresponding taggers when using a backoff model. Explain why this is the case. In particular explain this for the case of the BigramTagger.

TODO answer here

In [43]:
off_the_shelf_tagger_tokenization_result = nltk.pos_tag_sents([[y[0] for y in x] for x in test_set])

In [44]:
def evaluator(results: list[list[tuple[str, str]]], expected: list[list[tuple[str, str]]]):
    mismatching_tags = 0
    total_tags = 0
    for result, expected in zip(results, expected):
        for result_token, expected_token in zip(result, expected):
            total_tags += 1
            if result_token[0] != expected_token[0]:
                raise ValueError("Token mismatch")
            if result_token[1] != expected_token[1]:
                mismatching_tags += 1
    return mismatching_tags / total_tags

In [45]:
evaluator(off_the_shelf_tagger_tokenization_result, test_set)

0.10852938328976668