In [None]:
import nltk
from nltk.corpus import brown
# Fetch the NLTK resources this notebook relies on: the Brown corpus itself
# and the mapping used when the universal tagset is requested.
for resource in ("brown", "universal_tagset"):
    nltk.download(resource)

# Corpus sentences; each sentence is iterated below as (word, tag) pairs.
tagged_sentences = brown.tagged_sents(tagset="universal")

In [None]:
# Flatten the tagged corpus into two parallel lists: words[i] is the
# normalised token that was observed with tags[i].
words = []
tags = []
for sentence in tagged_sentences:
    for word, tag in sentence:
        token = word.lower()
        # Drop a trailing possessive "'s" by slicing off the suffix.
        # str.rstrip("'s") must NOT be used here: its argument is a set of
        # characters, so it would turn "boss's" into "bo" instead of "boss".
        # The check runs on the lower-cased token so "JOHN'S" is handled too.
        if token.endswith("'s"):
            token = token[:-2]
        words.append(token)
        tags.append(tag)

# Show a small sample instead of printing every token of the corpus.
list(zip(words[:10], tags[:10]))

In [None]:
# Number of sentences in the tagged corpus (the data the HMM is trained on below).
len(tagged_sentences)

In [None]:
import math
from collections import defaultdict

class HMMTagger:
    """First-order hidden Markov model tagger decoded with Viterbi.

    Parameters
    ----------
    train_sentences : iterable of sentences
        Each sentence is a sequence of ``(word, tag)`` pairs.  Used to
        estimate the initial-tag and tag-transition probabilities.
    words, tags : parallel sequences
        ``words[i]`` was observed with ``tags[i]``.  Used to estimate the
        emission probabilities; ``set(tags)`` defines the model's states.

    Call :meth:`train` to fill the probability tables before decoding.

    NOTE(review): ``_viterbi`` only lower-cases its input.  If ``words`` was
    normalised further by the caller (e.g. stripping a possessive "'s"), the
    phrases passed to ``_viterbi`` should be normalised the same way.
    """

    # Probability used for events missing from the trained tables (unseen
    # word/tag pairs, tags never seen at sentence start).
    UNSEEN_PROB = 1e-6

    def __init__(self, train_sentences, words, tags):
        self.train_sentences = train_sentences
        self.words = words
        self.tags = tags
        self.tagset = set(tags)
        self.wordset = set(words)
        self.initial_probs = {}     # tag -> P(tag starts a sentence)
        self.transition_probs = {}  # (prev_tag, tag) -> P(tag | prev_tag)
        self.emission_probs = {}    # (tag, word) -> P(word | tag)

    def train(self):
        """Estimate initial, transition and emission probabilities."""
        self._compute_initial_probs()
        self._compute_transition_probs()
        self._compute_emission_probs()

    @staticmethod
    def _safe_log(prob):
        """Return ``log(prob)``, mapping non-positive values to ``-inf``."""
        return math.log(prob) if prob > 0 else float("-inf")

    def _compute_initial_probs(self):
        """Fill ``initial_probs`` with the relative frequency of each
        sentence-initial tag.

        Empty sentences carry no initial tag, so they are excluded from the
        denominator; the resulting probabilities therefore sum to 1.
        """
        first_tag_counts = defaultdict(int)
        for sentence in self.train_sentences:
            if len(sentence) > 0:
                first_tag_counts[sentence[0][1]] += 1

        non_empty_sentences = sum(first_tag_counts.values())
        for tag, count in first_tag_counts.items():
            self.initial_probs[tag] = count / non_empty_sentences

    def _compute_transition_probs(self):
        """Fill ``transition_probs`` for every ordered pair of known tags.

        ``P(tag2 | tag1)`` is ``count(tag1 followed by tag2) / count(tag1)``
        where ``count(tag1)`` covers every occurrence of ``tag1`` in
        ``train_sentences`` (sentence-final ones included).  Pairs never
        observed get 0.0.
        """
        bigram_counts = defaultdict(int)
        tag_counts = defaultdict(int)

        for sentence in self.train_sentences:
            sentence_tags = [tag for _, tag in sentence]
            for tag in sentence_tags:
                tag_counts[tag] += 1
            for prev_tag, next_tag in zip(sentence_tags, sentence_tags[1:]):
                bigram_counts[(prev_tag, next_tag)] += 1

        for tag1 in self.tagset:
            occurrences = tag_counts.get(tag1, 0)
            for tag2 in self.tagset:
                if occurrences > 0:
                    self.transition_probs[(tag1, tag2)] = (
                        bigram_counts.get((tag1, tag2), 0) / occurrences
                    )
                else:
                    self.transition_probs[(tag1, tag2)] = 0.0

    def _compute_emission_probs(self):
        """Fill ``emission_probs`` with ``count(word, tag) / count(tag)``
        for every (word, tag) pair observed in ``words`` / ``tags``.
        """
        word_tag_counts = defaultdict(int)
        tag_counts = defaultdict(int)

        for w, t in zip(self.words, self.tags):
            word_tag_counts[(w, t)] += 1
            tag_counts[t] += 1

        # Every tag seen in a pair has a positive count, so no zero guard
        # is needed on the division.
        for (w, t), pair_count in word_tag_counts.items():
            self.emission_probs[(t, w)] = pair_count / tag_counts[t]

    def _viterbi(self, phrase):
        """Return the most likely tag sequence for ``phrase``.

        Parameters
        ----------
        phrase : sequence of str
            Tokens to tag; each token is lower-cased before lookup.

        Returns
        -------
        list
            One tag per token; ``[]`` for an empty phrase.

        Raises
        ------
        ValueError
            If the model has no tags to choose from.

        Scores are accumulated as sums of log-probabilities rather than
        products of probabilities, so long phrases do not underflow to 0.0
        (which would make every path tie and the result arbitrary).
        """
        phrase = [word.lower() for word in phrase]
        if not phrase:
            return []

        # Sorted so that ties are broken the same way on every run; plain
        # set iteration order is not stable across interpreter sessions.
        tags_list = sorted(self.tagset)
        n_tags = len(tags_list)
        if n_tags == 0:
            raise ValueError("cannot decode: the model has no tags")

        unseen = self.UNSEEN_PROB
        log = self._safe_log
        neg_inf = float("-inf")

        # Transition log-probabilities do not depend on the position in the
        # phrase, so look them up once.
        log_trans = [
            [log(self.transition_probs.get((prev_tag, tag), unseen)) for tag in tags_list]
            for prev_tag in tags_list
        ]

        # scores[j]: best log-probability of any path ending in tags_list[j]
        # at the current position.
        scores = [
            log(self.initial_probs.get(tag, unseen))
            + log(self.emission_probs.get((tag, phrase[0]), unseen))
            for tag in tags_list
        ]
        backpointers = []  # backpointers[i - 1][j]: best predecessor of j at position i

        for word in phrase[1:]:
            next_scores = []
            pointers = []
            for j, tag in enumerate(tags_list):
                best_prev = 0
                best_score = neg_inf
                for k in range(n_tags):
                    candidate = scores[k] + log_trans[k][j]
                    if candidate > best_score:
                        best_score = candidate
                        best_prev = k
                # The emission term is the same for every predecessor, so it
                # is added once after the best predecessor is known.
                emission = log(self.emission_probs.get((tag, word), unseen))
                next_scores.append(best_score + emission)
                pointers.append(best_prev)
            scores = next_scores
            backpointers.append(pointers)

        # Most likely final state, then follow the back-pointers to the start.
        state = max(range(n_tags), key=scores.__getitem__)
        path = [state]
        for pointers in reversed(backpointers):
            state = pointers[state]
            path.append(state)
        path.reverse()

        return [tags_list[idx] for idx in path]

In [None]:
# Build the tagger from the corpus sentences and the flattened word/tag
# lists, then estimate its probability tables.
model = HMMTagger(
    train_sentences=tagged_sentences,
    words=words,
    tags=tags,
)
model.train()

In [None]:
# Report, one line per name, whether it occurs in the training word list.
for name in ("alex", "prathamesh"):
    print(name in words)

In [None]:
# Two sentences that differ only in their first token.
test_sentence_1 = "Alex is a sincere student".split()
test_sentence_2 = "Prathamesh is a sincere student".split()

# Tag the first sentence and print one "word : tag" line per token.
predicted_tags_1 = model._viterbi(test_sentence_1)
for token, label in zip(test_sentence_1, predicted_tags_1):
    print(token, ":", label)

print("---------")

# Same for the second sentence.
predicted_tags_2 = model._viterbi(test_sentence_2)
for token, label in zip(test_sentence_2, predicted_tags_2):
    print(token, ":", label)