In [None]:
import nltk
import numpy as np
from nltk.corpus import treebank
from nltk.tag import hmm
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential  # type: ignore
from tensorflow.keras.layers import Embedding, LSTM, Dense # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
from tensorflow.keras.utils import to_categorical # type: ignore
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense # type: ignore
import tensorflow as tf

# Fetch the NLTK "treebank" corpus package (skipped by NLTK when the local
# copy is already up to date).
nltk.download("treebank")

# Read every POS-tagged sentence, then hold out 20% of the sentences for
# testing. The seed is fixed so the split is reproducible.
tagged_sentences = treebank.tagged_sents()
train_data, test_data = train_test_split(
    tagged_sentences,
    random_state=42,
    test_size=0.2,
)

def split_words_tags(sentences):
    """Separate tagged sentences into parallel word and tag lists.

    Args:
        sentences: Iterable of sentences, each an iterable of
            ``(word, tag)`` pairs.

    Returns:
        A ``(words, tags)`` tuple of lists of lists, where ``words[i][j]``
        and ``tags[i][j]`` come from the j-th pair of the i-th sentence.
        An empty sentence produces an empty list in both outputs, so the
        two results always have one entry per input sentence.
    """
    words, tags = [], []
    for sent in sentences:
        # Materialise once so one-shot iterables can be walked twice.
        pairs = list(sent)
        # Comprehensions rather than ``w, t = zip(*pairs)``: unpacking the
        # empty zip of an empty sentence raises ValueError.
        words.append([word for word, _ in pairs])
        tags.append([tag for _, tag in pairs])
    return words, tags

# Parallel word / tag lists for the training and the test split.
(train_words, train_tags), (test_words, test_tags) = map(
    split_words_tags, (train_data, test_data)
)

# 1.  HMM Model using NLTK

# Supervised training directly on the (word, tag) training sentences.
hmm_tagger = hmm.HiddenMarkovModelTagger.train(train_data)

def evaluate_hmm(model, data):
    """Return the token-level tagging accuracy of *model* on *data*.

    Args:
        model: Tagger exposing ``tag(words)``, which returns one
            ``(word, tag)`` pair per input word.
        data: Iterable of gold sentences, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        The fraction of tokens whose predicted tag equals the gold tag,
        as a float in ``[0, 1]``. Returns ``0.0`` when *data* contains no
        tokens at all instead of raising ``ZeroDivisionError``.
    """
    correct, total = 0, 0
    for sent in data:
        words = [w for w, _ in sent]
        if not words:
            # Nothing to score; also avoids calling the tagger with an
            # empty sequence.
            continue
        true_tags = [t for _, t in sent]
        pred_tags = [t for _, t in model.tag(words)]
        correct += sum(p == t for p, t in zip(pred_tags, true_tags))
        total += len(true_tags)
    return correct / total if total else 0.0

# Score the trained HMM tagger on the held-out sentences and report it.
hmm_acc = evaluate_hmm(model=hmm_tagger, data=test_data)
print(f"HMM Accuracy: {hmm_acc:.4f}")

# 2. LSTM Neural Network Model

# Vocabulary cap shared by the tokenizer and the embedding table, so the two
# can never drift apart.
VOCAB_SIZE = 20000

# Reserved tag ids: 0 is the padding label (matches the pad value used by
# pad_sequences), 1 stands in for test tags never seen during training.
PAD_TAG, UNK_TAG = "<PAD>", "<UNK>"

# Tokenize words
# Fit on the token lists themselves. When a text is a list, the tokenizer
# keeps each element as one token; fitting on raw strings instead applies the
# default punctuation filter and whitespace split, so tokens such as "," or
# "." and hyphenated words never enter the vocabulary and all become <OOV>.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_words)
X_train = tokenizer.texts_to_sequences(train_words)
X_test = tokenizer.texts_to_sequences(test_words)

# Pad sequences (word id 0 is never assigned to a real token, so it is a
# safe pad value).
max_len = max(max(len(x) for x in X_train), max(len(x) for x in X_test))
X_train_padded = pad_sequences(X_train, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test, maxlen=max_len, padding='post')

# Create tag vocab; real tags start after the two reserved ids so that the
# pad value 0 no longer collides with a genuine tag.
tag_vocab = [PAD_TAG, UNK_TAG] + sorted(
    {tag for sent in train_tags for tag in sent}
)
tag2idx = {tag: idx for idx, tag in enumerate(tag_vocab)}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# Convert tags to indices. Unseen test tags map to UNK_TAG, which the model
# is never trained to emit, so they are scored as errors rather than being
# silently relabelled as a real tag.
y_train = [[tag2idx[tag] for tag in sent] for sent in train_tags]
y_test = [[tag2idx.get(tag, tag2idx[UNK_TAG]) for tag in sent] for sent in test_tags]

# Pad tag sequences
y_train_padded = pad_sequences(y_train, maxlen=max_len, padding='post')
y_test_padded = pad_sequences(y_test, maxlen=max_len, padding='post')

# One-hot encode tags
y_train_onehot = to_categorical(y_train_padded, num_classes=len(tag2idx))
y_test_onehot = to_categorical(y_test_padded, num_classes=len(tag2idx))

# Build and Train LSTM Model
# mask_zero=True makes the downstream layers and the loss skip padded
# timesteps instead of learning to predict the pad label.
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=VOCAB_SIZE, output_dim=128, mask_zero=True),
    LSTM(64, return_sequences=True),
    Dense(len(tag2idx), activation='softmax')
])

# NOTE(review): whether the built-in 'accuracy' metric honours the mask
# depends on the Keras version, so treat the per-epoch accuracy as a rough
# progress signal only; the reported score is computed explicitly below.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model
model.fit(X_train_padded, y_train_onehot, batch_size=128, epochs=3, validation_split=0.1)

# Evaluate the model
loss, _ = model.evaluate(X_test_padded, y_test_onehot, verbose=0)

# Token-level accuracy over real tokens only. Padded positions are excluded
# so the score is comparable with the HMM accuracy rather than being inflated
# by trivially-correct padding.
pred_ids = np.argmax(model.predict(X_test_padded, verbose=0), axis=-1)
real_tokens = X_test_padded != 0
lstm_acc = float(np.mean(pred_ids[real_tokens] == y_test_padded[real_tokens]))
print(f"LSTM Accuracy: {lstm_acc:.4f}")


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\gilli\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


HMM Accuracy: 0.9150


Epoch 1/3
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 504ms/step - accuracy: 0.7664 - loss: 1.6844 - val_accuracy: 0.9117 - val_loss: 0.3681
Epoch 2/3
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 494ms/step - accuracy: 0.9163 - loss: 0.3586 - val_accuracy: 0.9214 - val_loss: 0.3401
Epoch 3/3
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 474ms/step - accuracy: 0.9248 - loss: 0.3226 - val_accuracy: 0.9253 - val_loss: 0.3131
LSTM Accuracy: 0.9243
