In [None]:
import nltk
from nltk.corpus import brown
# Download the NLTK resources used by the corpus reader call below.
for resource_name in ('brown', 'universal_tagset'):
    nltk.download(resource_name)

# Brown corpus sentences; each sentence is iterated later as (word, tag) pairs.
tagged_sentences = brown.tagged_sents(tagset='universal')

In [None]:
# len(tagged_sentences)
# print(tagged_sentences[0])
# Collect the distinct tags (second element of every token) found in the corpus.
tags = {token[1] for sent in tagged_sentences for token in sent}
print(tags)

In [None]:
# Print every token that contains a hyphen, together with its tag.
for tagged_sentence in tagged_sentences:
    for word, tag in tagged_sentence:
        if '-' not in word:
            continue
        print(word, " : ", tag)

In [None]:
from collections import defaultdict

class HMMTagger:
    """Bigram hidden Markov model tagger estimated from relative-frequency counts.

    Parameters
    ----------
    train_sentences : sequence of sentences
        Each sentence is a sequence of ``(word, tag)`` pairs. Only the tags are
        read from here; they drive the initial-tag and tag-transition estimates.
    words, tags : parallel sequences
        Flattened training tokens and their tags, used for the emission
        estimates. Words are stored exactly as given, so the caller must pass
        them already normalised the same way ``_viterbi`` normalises its input.

    Attributes
    ----------
    initial_probs : dict mapping tag -> P(tag starts a sentence)
    transition_probs : dict mapping (prev_tag, tag) -> P(tag | prev_tag)
    emission_probs : dict mapping (tag, word) -> P(word | tag)
    """

    # Probability substituted for any table entry that is missing at decode time
    # (e.g. an unseen word/tag pair or a tag that never started a sentence).
    _UNSEEN_PROB = 1e-6

    def __init__(self, train_sentences, words, tags):
        self.train_sentences = train_sentences
        self.words = words
        self.tags = tags
        self.tagset = set(tags)
        self.wordset = set(words)
        self.initial_probs = {}
        self.transition_probs = {}
        self.emission_probs = {}

    def train(self):
        """Populate the initial, transition and emission probability tables."""
        self._compute_initial_probs()
        self._compute_transition_probs()
        self._compute_emission_probs()

    def _compute_initial_probs(self):
        """Estimate P(tag is the first tag of a sentence).

        Empty sentences contribute no first tag, so they are excluded from the
        denominator as well; the resulting probabilities therefore sum to 1.
        """
        first_tag_counts = defaultdict(int)
        for sentence in self.train_sentences:
            if len(sentence) > 0:
                first_tag_counts[sentence[0][1]] += 1

        counted_sentences = sum(first_tag_counts.values())
        for tag, count in first_tag_counts.items():
            self.initial_probs[tag] = count / counted_sentences

    def _compute_transition_probs(self):
        """Estimate P(tag2 | tag1) for every ordered pair of tags in ``tagset``.

        The denominator is the total number of occurrences of ``tag1`` in the
        training sentences (sentence-final occurrences included), so a row may
        sum to less than 1. Pairs never observed get an explicit 0.0.
        """
        bigram_counts = defaultdict(int)
        tag_counts = defaultdict(int)

        for sentence in self.train_sentences:
            previous_tag = None
            for position, (_, tag) in enumerate(sentence):
                tag_counts[tag] += 1
                if position > 0:
                    bigram_counts[(previous_tag, tag)] += 1
                previous_tag = tag

        for tag1 in self.tagset:
            occurrences = tag_counts[tag1]
            for tag2 in self.tagset:
                if occurrences > 0:
                    self.transition_probs[(tag1, tag2)] = bigram_counts[(tag1, tag2)] / occurrences
                else:
                    self.transition_probs[(tag1, tag2)] = 0.0

    def _compute_emission_probs(self):
        """Estimate P(word | tag) from the parallel ``words`` / ``tags`` lists."""
        word_tag_counts = defaultdict(int)
        tag_counts = defaultdict(int)

        for w, t in zip(self.words, self.tags):
            word_tag_counts[(w, t)] += 1
            tag_counts[t] += 1

        # Every tag seen in a (word, tag) pair has a count of at least 1, so the
        # division below can never hit a zero denominator.
        for (w, t), pair_count in word_tag_counts.items():
            self.emission_probs[(t, w)] = pair_count / tag_counts[t]

    def _viterbi(self, phrase):
        """Return the most probable tag sequence for ``phrase``.

        Parameters
        ----------
        phrase : sequence of str
            The words of one sentence. They are lower-cased and
            possessive-stripped here before lookup.

        Returns
        -------
        list
            One tag per input word; ``[]`` for an empty phrase.

        Raises
        ------
        ValueError
            If the tagger has no tags (nothing to decode with).
        """
        if not phrase:
            return []

        # NOTE(review): rstrip("'s") strips every trailing apostrophe or "s"
        # character, not just one "'s" suffix (e.g. "boss's" -> "bo"). It is
        # kept as-is because the training words are presumably normalised with
        # this same expression by the caller; change both sides together.
        phrase = [word.lower().rstrip("'s") if word.endswith("'s") else word.lower() for word in phrase]

        # Sorted so that tie-breaking does not depend on set iteration order,
        # which varies between interpreter runs for strings.
        tags_list = sorted(self.tagset)
        N = len(tags_list)
        if N == 0:
            raise ValueError("HMMTagger has an empty tagset; cannot decode")

        unseen = self._UNSEEN_PROB

        # Scores for the first word: initial probability times emission probability.
        previous_scores = [
            self.initial_probs.get(tag, unseen) * self.emission_probs.get((tag, phrase[0]), unseen)
            for tag in tags_list
        ]
        # backpointers[i - 1][j] is the best predecessor index of tag j at position i.
        backpointers = []

        for word in phrase[1:]:
            # Rescale the previous row by its maximum. A common positive factor
            # does not change any argmax, but it stops the running product from
            # underflowing to 0.0 on long sentences.
            scale = max(previous_scores)
            if scale > 0.0:
                previous_scores = [score / scale for score in previous_scores]

            current_scores = []
            current_pointers = []
            for tag in tags_list:
                emission = self.emission_probs.get((tag, word), unseen)
                best_prob = -float('inf')
                best_prev = 0
                for k, prev_tag in enumerate(tags_list):
                    prob = previous_scores[k] * self.transition_probs.get((prev_tag, tag), unseen) * emission
                    if prob > best_prob:
                        best_prob = prob
                        best_prev = k
                current_scores.append(best_prob)
                current_pointers.append(best_prev)

            backpointers.append(current_pointers)
            previous_scores = current_scores

        # Most likely final state, then walk the backpointers from the end.
        state = max(range(N), key=previous_scores.__getitem__)
        result_tags = [tags_list[state]]
        for pointers in reversed(backpointers):
            state = pointers[state]
            result_tags.append(tags_list[state])
        result_tags.reverse()

        return result_tags

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# 5-fold cross-validation of HMMTagger over `tagged_sentences`.
# NOTE(review): KFold's default shuffle=False yields contiguous folds; if the corpus is
# ordered (e.g. by source), consider KFold(n_splits=5, shuffle=True, random_state=...) — confirm.
kf = KFold(n_splits=5)
tag_list = ['DET', 'PRT', 'ADV', 'X', 'CONJ', 'ADJ', 'ADP', 'PRON', 'NOUN', '.', 'NUM', 'VERB']
# Integer count matrix, sized from tag_list rather than a hard-coded 12.
overall_confusion_matrix = np.zeros((len(tag_list), len(tag_list)), dtype=np.int64)

all_y_true = []
all_y_pred = []

for fold, (train_index, test_index) in enumerate(kf.split(tagged_sentences), start=1):
    train_sentences = [tagged_sentences[i] for i in train_index]
    test_sentences = [tagged_sentences[i] for i in test_index]

    # Flatten the training sentences into parallel word/tag lists.
    # The word normalisation here must stay identical to the one in HMMTagger._viterbi.
    words = []
    tags = []
    for sentence in train_sentences:
        for word, tag in sentence:
            words.append(word.lower().rstrip("'s") if word.endswith("'s") else word.lower())
            tags.append(tag)

    # Instantiate and train the model for this fold
    model = HMMTagger(train_sentences, words, tags)
    model.train()

    y_true = []
    y_pred = []

    for sentence in test_sentences:
        words_in_sentence = [word.lower() for word, tag in sentence]
        true_tags = [tag for word, tag in sentence]

        # Unseen words are handled inside _viterbi via its fallback probability.
        predicted_tags = model._viterbi(words_in_sentence)

        y_true.extend(true_tags)
        y_pred.extend(predicted_tags)

    # Add predictions of this fold to the overall lists
    all_y_true.extend(y_true)
    all_y_pred.extend(y_pred)

    # Calculate and accumulate the confusion matrix for this fold
    fold_confusion_matrix = confusion_matrix(y_true, y_pred, labels=tag_list)
    overall_confusion_matrix += fold_confusion_matrix
    print('-----------')
    print("Fold: ", fold)
    # Plot confusion matrix for the current fold; the title carries the fold number
    # so the five figures can be told apart.
    plt.figure(figsize=(10, 8))
    sns.heatmap(fold_confusion_matrix, annot=True, fmt="d", xticklabels=tag_list, yticklabels=tag_list)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - Fold {fold}')
    plt.show()

    print(classification_report(y_true, y_pred, labels=tag_list))
    print('-----------')





In [None]:
print("Overall Accuracy")

print(classification_report(all_y_true, all_y_pred, labels=tag_list))

# Row-normalise into a separate array so the accumulated counts in
# `overall_confusion_matrix` stay intact and this cell can be re-run safely.
# Rows with no true samples are left at 0 instead of producing NaN from 0/0.
row_totals = overall_confusion_matrix.sum(axis=1, keepdims=True)
overall_confusion_matrix_normalised = np.divide(
    overall_confusion_matrix,
    row_totals,
    out=np.zeros(overall_confusion_matrix.shape, dtype=float),
    where=row_totals > 0,
)

plt.figure(figsize=(12, 10))
sns.heatmap(overall_confusion_matrix_normalised, annot=True, fmt=".3f", xticklabels=tag_list, yticklabels=tag_list)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Overall Confusion Matrix Normalised')
plt.show()

In [None]:
def predict(sent, tagger=None):
    """Tag a whitespace-separated sentence.

    Args:
        sent: The sentence as a single string; it is split on whitespace.
        tagger: Object exposing ``_viterbi(list_of_words)``. When omitted, the
            notebook-global ``model`` is used, i.e. whatever tagger that name
            currently refers to (the cross-validation cells rebind it on every fold).

    Returns:
        The list of tags returned by the tagger's ``_viterbi`` for the split words.
    """
    if tagger is None:
        tagger = model
    return tagger._viterbi(sent.split())

In [None]:
# Example call: tag a short sentence. `predict` reads the global `model`, so a
# cell that trains `model` must have been run first. As the last expression in
# the cell, the returned tag list is shown as the cell output.
predict('Hello how are you')


In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report, fbeta_score
import matplotlib.pyplot as plt
import seaborn as sns

# 5-fold cross-validation of HMMTagger, followed by overall F0.5 / F2 scores.
# NOTE(review): KFold's default shuffle=False yields contiguous folds; if the corpus is
# ordered (e.g. by source), consider KFold(n_splits=5, shuffle=True, random_state=...) — confirm.
kf = KFold(n_splits=5)
tag_list = ['DET', 'PRT', 'ADV', 'X', 'CONJ', 'ADJ', 'ADP', 'PRON', 'NOUN', '.', 'NUM', 'VERB']
# Integer counts: the overall heatmap below formats annotations with fmt="d",
# which raises ValueError on a float matrix.
overall_confusion_matrix = np.zeros((len(tag_list), len(tag_list)), dtype=np.int64)

all_y_true = []
all_y_pred = []

for fold, (train_index, test_index) in enumerate(kf.split(tagged_sentences), start=1):
    train_sentences = [tagged_sentences[i] for i in train_index]
    test_sentences = [tagged_sentences[i] for i in test_index]

    # Flatten the training sentences into parallel word/tag lists.
    # The word normalisation here must stay identical to the one in HMMTagger._viterbi.
    words = []
    tags = []
    for sentence in train_sentences:
        for word, tag in sentence:
            words.append(word.lower().rstrip("'s") if word.endswith("'s") else word.lower())
            tags.append(tag)

    # Instantiate and train the model for this fold
    model = HMMTagger(train_sentences, words, tags)
    model.train()

    y_true = []
    y_pred = []

    for sentence in test_sentences:
        words_in_sentence = [word.lower() for word, tag in sentence]
        true_tags = [tag for word, tag in sentence]

        # Unseen words are handled inside _viterbi via its fallback probability.
        predicted_tags = model._viterbi(words_in_sentence)

        y_true.extend(true_tags)
        y_pred.extend(predicted_tags)

    # Add predictions of this fold to the overall lists
    all_y_true.extend(y_true)
    all_y_pred.extend(y_pred)

    # Calculate and accumulate the confusion matrix for this fold
    fold_confusion_matrix = confusion_matrix(y_true, y_pred, labels=tag_list)
    overall_confusion_matrix += fold_confusion_matrix
    print('-----------')
    print("Fold: ", fold)
    # Plot confusion matrix for the current fold
    plt.figure(figsize=(10, 8))
    sns.heatmap(fold_confusion_matrix, annot=True, fmt="d", xticklabels=tag_list, yticklabels=tag_list)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - Fold {fold}')
    plt.show()

    print(classification_report(y_true, y_pred, labels=tag_list))
    print('-----------')

# After completing all folds, compute F0.5 and F2 over the pooled predictions.

# Classification report for overall performance
print("Overall Classification Report")
print(classification_report(all_y_true, all_y_pred, labels=tag_list))

# F0.5 score (beta < 1 weights precision more heavily than recall)
f0_5 = fbeta_score(all_y_true, all_y_pred, beta=0.5, labels=tag_list, average='weighted')
print("Overall F0.5 Score:", f0_5)

# F2 score (beta > 1 weights recall more heavily than precision)
f2 = fbeta_score(all_y_true, all_y_pred, beta=2, labels=tag_list, average='weighted')
print("Overall F2 Score:", f2)

# Plot overall confusion matrix (raw counts summed over the folds)
plt.figure(figsize=(12, 10))
sns.heatmap(overall_confusion_matrix, annot=True, fmt="d", xticklabels=tag_list, yticklabels=tag_list)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Overall Confusion Matrix')
plt.show()
