<a href="https://colab.research.google.com/github/Maruf346/AI-ML-with-python/blob/main/Hidden_Markov_Model_(HMM).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Part-of-Speech Tagging**

In [15]:
# Install NLTK
!pip install nltk

# Import necessary libraries
import numpy as np
import nltk
from collections import defaultdict
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import brown

# Download required NLTK data:
#   - 'brown'            : the Brown corpus read below through nltk.corpus.brown
#   - 'universal_tagset' : requested below via tagset='universal'
# The captured output shows both packages reported as already up to date.
nltk.download('brown')
nltk.download('universal_tagset')




[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

# **Data Preparation**

In [16]:
# Read every tagged sentence of the Brown corpus, with tags expressed in the
# universal tagset.
tagged_sents = brown.tagged_sents(tagset='universal')

# Frequency tables: lower-cased word -> occurrences, tag -> occurrences.
word_counts, tag_counts = defaultdict(int), defaultdict(int)

# Single pass over all (word, tag) pairs; words are case-folded before counting.
for sent in tagged_sents:
    for word, tag in sent:
        word_counts[word.lower()] = word_counts[word.lower()] + 1
        tag_counts[tag] = tag_counts[tag] + 1

# Vocabulary and tag set, in order of first appearance in the corpus.
vocab = list(word_counts)
tags = list(tag_counts)

# Lookup tables from symbol to its position in the lists above.
word2idx = dict(zip(vocab, range(len(vocab))))
tag2idx = dict(zip(tags, range(len(tags))))

# Report the sizes of both inventories.
print(f"Vocabulary size: {len(vocab)}")
print(f"Number of unique tags: {len(tags)}")


Vocabulary size: 49815
Number of unique tags: 12


# **Model Initialization**

In [18]:
# Model dimensions: one hidden state per tag, one observation symbol per
# vocabulary word.
n_states, n_observations = len(tags), len(vocab)

# Start distribution begins uniform over the states.
start_prob = np.full(n_states, 1.0) / n_states

# Placeholder parameters: every transition row is uniform over the states and
# every emission row is uniform over the vocabulary.
trans_mat = np.full((n_states, n_states), 1.0) / n_states
emit_mat = np.full((n_states, n_observations), 1.0) / n_observations

# Zero-filled accumulators for Maximum Likelihood Estimation (MLE), shaped
# like the matrices they will eventually replace.
trans_counts = np.zeros_like(trans_mat)
emit_counts = np.zeros_like(emit_mat)


# **Parameter Estimation**

In [19]:
# Estimate the HMM parameters from the tagged corpus by relative-frequency
# counting.
#
# The accumulators are (re)created here rather than reused from another cell so
# that this cell is idempotent: running it twice gives the same parameters
# instead of piling new counts on top of old ones (or on top of an already
# normalized start distribution).

# Each tag starts with a pseudo-count of 1/n_states (the uniform prior), so a
# tag that never opens a sentence still gets a small non-zero start probability.
start_counts = np.ones(n_states) / n_states
trans_counts = np.zeros((n_states, n_states))
emit_counts = np.zeros((n_states, n_observations))

for sent in tagged_sents:
    prev_tag = None  # index of the previous tag; None marks the sentence start
    for word, tag in sent:
        word_idx = word2idx[word.lower()]
        tag_idx = tag2idx[tag]

        if prev_tag is None:
            # First token of the sentence contributes to the start distribution.
            start_counts[tag_idx] += 1
        else:
            # Every later token contributes one prev_tag -> tag transition.
            trans_counts[prev_tag, tag_idx] += 1

        # Every token contributes one tag -> word emission.
        emit_counts[tag_idx, word_idx] += 1

        prev_tag = tag_idx

# Normalize counts to probabilities.
start_prob = start_counts / start_counts.sum()

# Row-normalize with a guarded divide: a row whose total is zero (a tag with no
# observed outgoing transitions) stays all-zero instead of becoming NaN, which
# would otherwise poison every argmax/max in the decoder.
trans_totals = trans_counts.sum(axis=1, keepdims=True)
trans_mat = np.divide(
    trans_counts, trans_totals,
    out=np.zeros_like(trans_counts), where=trans_totals > 0,
)

emit_totals = emit_counts.sum(axis=1, keepdims=True)
emit_mat = np.divide(
    emit_counts, emit_totals,
    out=np.zeros_like(emit_counts), where=emit_totals > 0,
)


# **Viterbi Implementation**

In [20]:
# Viterbi decoding implementation
def viterbi_decode(sentence, tags, start_prob, trans_mat, emit_mat, vocab_index=None):
    """Return the most probable tag sequence for a sentence (Viterbi algorithm).

    The recursion runs in log space so that long sentences do not underflow to
    zero, which would make every path look equally (im)probable.

    Parameters
    ----------
    sentence : str or sequence of str
        A whitespace-separated string, or an already tokenized sequence of
        words. Words are lower-cased before lookup.
    tags : sequence
        Tag labels; position i is the label of hidden state i.
    start_prob : array-like, shape (N,)
        Probability of each state at the first position.
    trans_mat : array-like, shape (N, N)
        trans_mat[i, j] is the probability of moving from state i to state j.
    emit_mat : array-like, shape (N, V)
        emit_mat[j, k] is the probability of state j emitting word index k.
    vocab_index : dict, optional
        Mapping from lower-cased word to its column in ``emit_mat``. Defaults
        to the module-level ``word2idx``.

    Returns
    -------
    list
        One tag per token; an empty list for an empty sentence.

    Notes
    -----
    A word missing from the vocabulary gets the same emission score under every
    state, so its tag is decided by the surrounding transitions instead of by
    the emission column of an unrelated vocabulary word.
    """
    lookup = word2idx if vocab_index is None else vocab_index
    tokens = sentence.split() if isinstance(sentence, str) else list(sentence)
    if not tokens:
        return []

    T = len(tokens)
    N = len(tags)
    emit_mat = np.asarray(emit_mat, dtype=float)

    # log(0) = -inf is the intended representation of an impossible event, so
    # silence the divide-by-zero warning numpy would raise for it.
    with np.errstate(divide='ignore'):
        log_start = np.log(np.asarray(start_prob, dtype=float))
        log_trans = np.log(np.asarray(trans_mat, dtype=float))

        # Per-position log emission scores. Only the columns for the words in
        # this sentence are transformed; unknown words keep a row of zeros
        # (log 1), i.e. a neutral score for every state.
        log_emit = np.zeros((T, N))
        for t, token in enumerate(tokens):
            col = lookup.get(token.lower())
            if col is not None:
                log_emit[t] = np.log(emit_mat[:, col])

    # DP tables: delta holds the best log score of any path ending in each
    # state, psi holds the predecessor state that achieved it.
    delta = np.full((T, N), -np.inf)
    psi = np.zeros((T, N), dtype=int)

    # Initialization step
    delta[0] = log_start + log_emit[0]

    # Recursion step: scores[i, j] is the best path ending in i, extended to j.
    for t in range(1, T):
        scores = delta[t - 1][:, None] + log_trans
        psi[t] = np.argmax(scores, axis=0)
        delta[t] = np.max(scores, axis=0) + log_emit[t]

    # Backtracking to find the most probable state sequence
    path = np.zeros(T, dtype=int)
    path[-1] = np.argmax(delta[-1])
    for t in range(T - 2, -1, -1):
        path[t] = psi[t + 1, path[t + 1]]

    return [tags[i] for i in path]


# **Evaluation Metrics**

In [21]:
# Function to evaluate the model on test data
def evaluate_model(test_sents, word2idx, tag2idx, tags, start_prob, trans_mat, emit_mat):
    """Tag every test sentence with ``viterbi_decode`` and print the scores.

    Parameters
    ----------
    test_sents : iterable
        Sentences, each a re-iterable sequence of (word, tag) pairs.
    word2idx, tag2idx : dict
        Accepted for interface compatibility; they are not forwarded to
        ``viterbi_decode``.
    tags, start_prob, trans_mat, emit_mat
        Passed through unchanged to ``viterbi_decode``.

    Returns
    -------
    None
        Token-level accuracy and a per-tag classification report are printed.

    Raises
    ------
    ValueError
        If the decoder returns a different number of tags than the sentence
        has tokens. The words are joined with spaces before decoding, so a
        token that is empty or contains whitespace would shift the alignment
        between gold and predicted tags.
    """
    y_true, y_pred = [], []
    for sent in test_sents:
        words = [w for w, _ in sent]
        if not words:
            # Nothing to tag; an empty sentence contributes no tokens.
            continue
        true_tags = [t for _, t in sent]
        pred_tags = viterbi_decode(' '.join(words), tags, start_prob, trans_mat, emit_mat)

        # Fail at the offending sentence rather than with an opaque
        # length-mismatch error once all sentences have been concatenated.
        if len(pred_tags) != len(true_tags):
            raise ValueError(
                f"Decoder returned {len(pred_tags)} tags for a sentence of "
                f"{len(true_tags)} tokens: {words!r}"
            )

        y_true.extend(true_tags)
        y_pred.extend(pred_tags)

    # Calculate accuracy and generate classification report
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Classification Report:")
    print(classification_report(y_true, y_pred))


# **Testing and Evaluation**

In [22]:
# Evaluation set: the first 100 tagged sentences of the Brown 'news' category,
# with universal tags.
# NOTE(review): the model parameters were counted from `tagged_sents`, which was
# loaded with brown.tagged_sents(tagset='universal') and no category filter, so
# these sentences appear to be part of the training data. If so, the score
# printed below is training-set accuracy rather than held-out accuracy —
# confirm, and hold out a split if a generalization estimate is wanted.
test_sents = brown.tagged_sents(categories='news', tagset='universal')[:100]

# Evaluate the model
evaluate_model(test_sents, word2idx, tag2idx, tags, start_prob, trans_mat, emit_mat)

Accuracy: 0.972663139329806
Classification Report:
              precision    recall  f1-score   support

           .       1.00      1.00      1.00       259
         ADJ       0.90      0.95      0.92       120
         ADP       0.98      0.95      0.96       256
         ADV       0.91      0.90      0.90        58
        CONJ       1.00      1.00      1.00        47
         DET       0.99      1.00      0.99       279
        NOUN       0.98      0.98      0.98       721
         NUM       1.00      1.00      1.00        39
        PRON       0.91      1.00      0.95        48
         PRT       0.95      0.93      0.94        57
        VERB       0.97      0.96      0.96       384

    accuracy                           0.97      2268
   macro avg       0.96      0.97      0.97      2268
weighted avg       0.97      0.97      0.97      2268

