In [1]:
from collections import defaultdict


In [2]:
# Tag inventory the tagger picks from. Candidates are scored in this order,
# and a later tag only wins with a strictly higher score, so order breaks ties.
POS_TAGS = [
    "NOUN",  # noun
    "VERB",  # verb
    "ADJ",   # adjective
    "ADV",   # adverb
    "PRON",  # pronoun
    "DET",   # determiner
    "PREP",  # preposition
    "CONJ",  # conjunction
]


In [3]:
# Hand-tagged toy corpus: one list of (word, tag) tuples per sentence.
# Each entry below pairs a sentence with its tag sequence, position by
# position; zip() stitches them into the (word, tag) tuples.
training_data = [
    list(zip(words.split(), tags.split()))
    for words, tags in [
        # determiner + noun + verb
        ("the dog runs", "DET NOUN VERB"),
        ("a cat sleeps", "DET NOUN VERB"),
        # pronoun subjects
        ("she likes fast cars", "PRON VERB ADJ NOUN"),
        ("he runs quickly", "PRON VERB ADV"),
        # adjective inside a noun phrase
        ("the big dog", "DET ADJ NOUN"),
        ("a small cat", "DET ADJ NOUN"),
        # verb followed by an adverb ("fast" is an adverb here, an adjective above)
        ("dogs run fast", "NOUN VERB ADV"),
        ("cats sleep peacefully", "NOUN VERB ADV"),
        # prepositions
        ("the dog in park", "DET NOUN PREP NOUN"),
        ("cat on table", "NOUN PREP NOUN"),
        # conjunctions
        ("dogs and cats", "NOUN CONJ NOUN"),
        ("run and jump", "VERB CONJ VERB"),
    ]
]


In [4]:
# Count tables filled by the training loop; all start empty.

# Every lowercased word seen in the training data.
vocabulary = set()

# tag_count[tag]: how many tokens carried `tag`.
tag_count = defaultdict(int)

# tag_tag_count[prev_tag][tag]: how often `tag` directly followed `prev_tag`
# ("<START>" stands in for the position before a sentence's first word).
tag_tag_count = defaultdict(lambda: defaultdict(int))

# word_tag_count[word][tag]: how often `word` was labelled `tag`.
word_tag_count = defaultdict(lambda: defaultdict(int))


In [5]:
# Build the count tables from the tagged corpus.
#
# The tables are emptied in place first so this cell is idempotent: re-running
# it (for example after editing training_data) rebuilds the counts from
# scratch instead of adding to whatever an earlier run left behind.
word_tag_count.clear()
tag_tag_count.clear()
tag_count.clear()
vocabulary.clear()

for sentence in training_data:
    # Pseudo-tag for the position before the first word of a sentence.
    prev_tag = "<START>"
    for word, tag in sentence:
        word = word.lower()  # counts are case-insensitive
        vocabulary.add(word)

        word_tag_count[word][tag] += 1      # emission evidence
        tag_tag_count[prev_tag][tag] += 1   # transition evidence
        tag_count[tag] += 1

        prev_tag = tag


In [6]:
def tag_sentence(sentence):
    """Tag each whitespace-separated word of ``sentence`` with a POS tag.

    The sentence is lowercased and processed greedily from left to right.
    For a word seen in training, every tag in ``POS_TAGS`` is scored as

        (word_tag_count[word][tag] / tag_count[tag])
        * (tag_tag_count[prev_tag][tag] / total transitions out of prev_tag)

    and the highest-scoring tag wins; on ties the tag listed first in
    ``POS_TAGS`` is kept. The chosen tag becomes ``prev_tag`` for the next
    word, starting from ``"<START>"``.

    Fallbacks:
      * a word not in ``vocabulary`` is tagged ``"NOUN"``;
      * a known word for which every score is 0 (the previous tag was never
        followed by any tag this word carries in training) gets the tag it
        carried most often in training, so the result never contains ``None``.

    The count tables are only read, never modified.

    Args:
        sentence: Text to tag, as a string.

    Returns:
        A list of ``(word, tag)`` tuples, one per word, with words lowercased.
        An empty or all-whitespace sentence yields ``[]``.
    """
    tagged = []
    prev_tag = "<START>"

    for word in sentence.lower().split():
        if word not in vocabulary:
            # Unknown word fallback.
            best_tag = "NOUN"
        else:
            # Read with .get(): indexing a defaultdict would insert an empty
            # entry for every missing key and quietly grow the trained tables.
            word_counts = word_tag_count.get(word, {})
            next_counts = tag_tag_count.get(prev_tag, {})
            # Depends only on prev_tag, so compute it once per word.
            transition_total = sum(next_counts.values())

            best_tag = None
            best_score = 0
            for tag in POS_TAGS:
                tag_total = tag_count.get(tag, 0)
                emission = word_counts.get(tag, 0) / tag_total if tag_total else 0
                transition = (
                    next_counts.get(tag, 0) / transition_total
                    if transition_total > 0 else 0
                )

                score = emission * transition
                if score > best_score:
                    best_score = score
                    best_tag = tag

            if best_tag is None:
                # No tag had a non-zero score. Ignore the transition and use
                # the tag this word carried most often in training instead of
                # emitting None (which would also zero out every later word).
                seen_tags = [t for t in POS_TAGS if word_counts.get(t, 0) > 0]
                best_tag = max(seen_tags, key=word_counts.get) if seen_tags else "NOUN"

        tagged.append((word, best_tag))
        prev_tag = best_tag

    return tagged


In [7]:
# Demo: run the tagger over a few short sentences built from training words.
test_sentences = [
    "the dog runs fast",
    "she runs quickly",
    "dogs and cats sleep",
    "the cat on table",
    "he likes big dogs",
]

for text in test_sentences:
    print(text)
    # Tagged result followed by one blank line to separate the examples.
    print(tag_sentence(text), end="\n\n")


the dog runs fast
[('the', 'DET'), ('dog', 'NOUN'), ('runs', 'VERB'), ('fast', 'ADV')]

she runs quickly
[('she', 'PRON'), ('runs', 'VERB'), ('quickly', 'ADV')]

dogs and cats sleep
[('dogs', 'NOUN'), ('and', 'CONJ'), ('cats', 'NOUN'), ('sleep', 'VERB')]

the cat on table
[('the', 'DET'), ('cat', 'NOUN'), ('on', 'PREP'), ('table', 'NOUN')]

he likes big dogs
[('he', 'PRON'), ('likes', 'VERB'), ('big', 'ADJ'), ('dogs', 'NOUN')]

