<a href="https://colab.research.google.com/github/NeelamTharunKumar/Google_Colab/blob/main/Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook shell escape (IPython/Colab syntax, not plain Python):
# upgrade NLTK to the newest available release before importing it.
!pip install --upgrade nltk
import nltk
# Fetch the 'punkt_tab' tokenizer data into the local nltk_data directory.
# NOTE(review): presumably needed by nltk.word_tokenize in the cell below — confirm.
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import numpy as np
import nltk
from collections import defaultdict

# Ensure the tokenizer data used by nltk.word_tokenize is downloaded.
# Older NLTK releases load the pickled 'punkt' models, while recent releases
# load the pickle-free 'punkt_tab' tables instead; fetching both keeps this
# cell runnable on its own regardless of the installed NLTK version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Toy hand-tagged corpus: every sentence is a list of (word, tag) pairs
tagged_sentences = [
    [('the', 'DET'), ('dog', 'NOUN'), ('barks', 'VERB')],
    [('a', 'DET'), ('cat', 'NOUN'), ('meows', 'VERB')],
    [('the', 'DET'), ('cat', 'NOUN'), ('sleeps', 'VERB')],
    [('a', 'DET'), ('dog', 'NOUN'), ('runs', 'VERB')]
]

# Words and tags that occur anywhere in the corpus
vocab = {word for sentence in tagged_sentences for word, _ in sentence}
tagset = {tag for sentence in tagged_sentences for _, tag in sentence}

# Tables start out as raw counts (missing keys read as 0) and are turned
# into probabilities by the normalisation pass further down.
transition_probs = defaultdict(lambda: defaultdict(int))
emission_probs = defaultdict(lambda: defaultdict(int))
initial_probs = defaultdict(int)
tag_count = defaultdict(int)

# Counting pass over the corpus
for tagged in tagged_sentences:
    for position, (word, tag) in enumerate(tagged):
        tag_count[tag] += 1
        emission_probs[tag][word] += 1
        if position == 0:
            # Sentence-initial tag
            initial_probs[tag] += 1
        else:
            # Transition from the tag of the preceding token
            transition_probs[tagged[position - 1][1]][tag] += 1

# Normalisation pass: counts -> probabilities
total_sentences = len(tagged_sentences)
num_tags = len(tagset)
vocab_size = len(vocab)

for tag in tagset:
    # Initial probabilities are plain relative frequencies (no smoothing)
    initial_probs[tag] /= total_sentences

    outgoing = transition_probs[tag]
    emitted = emission_probs[tag]
    # Denominators are taken from the raw counts before they are overwritten
    transition_denominator = sum(outgoing.values()) + num_tags
    emission_denominator = sum(emitted.values()) + vocab_size

    # Add-one (Laplace) smoothing over every possible next tag
    for next_tag in tagset:
        outgoing[next_tag] = (outgoing[next_tag] + 1) / transition_denominator

    # Add-one (Laplace) smoothing over every known word
    for word in vocab:
        emitted[word] = (emitted[word] + 1) / emission_denominator

# Viterbi Algorithm Implementation
def viterbi(sentence):
    """Return the most likely tag sequence for a tokenized sentence.

    Performs Viterbi decoding in log space using the module-level tables
    ``tagset``, ``initial_probs``, ``transition_probs`` and
    ``emission_probs``. Probabilities that are missing from a table (for
    example the emission of a word not in the training vocabulary) fall
    back to a small floor value so the logarithm stays finite.

    Args:
        sentence: Sequence of word tokens to tag.

    Returns:
        A list containing one tag per token, or an empty list when
        ``sentence`` is empty.
    """
    # Nothing to decode; without this guard the initialization step
    # would raise IndexError on sentence[0].
    if len(sentence) == 0:
        return []

    floor = 1e-6  # stand-in probability for unseen events
    V = [{}]      # V[t][tag] = best log-score of any path ending in tag at t
    path = {}     # path[tag] = best tag sequence ending in tag so far

    # Initialization step
    for tag in tagset:
        # The floor is added to the initial probability (not substituted)
        # so that tags never seen sentence-initially still get a finite log.
        V[0][tag] = (np.log(initial_probs[tag] + floor)
                     + np.log(emission_probs[tag].get(sentence[0], floor)))
        path[tag] = [tag]

    # Recursion step
    for t in range(1, len(sentence)):
        V.append({})
        new_path = {}

        for tag in tagset:
            # The emission term does not depend on the previous tag, so it
            # is computed once per tag rather than once per candidate.
            log_emission = np.log(emission_probs[tag].get(sentence[t], floor))
            # max() over (score, tag) tuples: exact score ties are broken
            # by comparing the tag names.
            prob, state = max(
                (V[t - 1][prev_tag]
                 + np.log(transition_probs[prev_tag].get(tag, floor))
                 + log_emission,
                 prev_tag)
                for prev_tag in tagset)
            V[t][tag] = prob
            new_path[tag] = path[state] + [tag]

        path = new_path

    # Termination step: pick the best-scoring final tag and return its path
    prob, state = max((V[-1][tag], tag) for tag in tagset)
    return path[state]

# Prompt for a sentence, lower-case it, and split it into tokens
raw_sentence = input("Enter a sentence: ")
user_sentence = raw_sentence.lower()
user_tokens = nltk.word_tokenize(user_sentence)

# Decode the most likely tag for every token
predicted_tags = viterbi(user_tokens)

# Report each token next to its predicted tag
print("\nPredicted POS Tags:")
for word, tag in zip(user_tokens, predicted_tags):
    print(f"{word} -> {tag}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
