# 11 Implement POS tagging using LSTM.

In [None]:
# Install necessary libraries
! pip install tensorflow
! pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached regex-2024.11.6-cp312-cp312-win_amd64.whl (273 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.11.6


In [2]:
# import library
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional, TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import treebank
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\Jaydip\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [3]:
# Load dataset (treebank for simplicity)
train_sents = treebank.tagged_sents()

In [4]:
# Preprocess the data
words = list(set([word.lower() for sent in train_sents for word, _ in sent]))
tags = list(set([tag for sent in train_sents for _, tag in sent]))

word_to_index = {word: i+2 for i, word in enumerate(words)}
word_to_index['<PAD>'] = 0
word_to_index['<OOV>'] = 1

tag_to_index = {tag: i for i, tag in enumerate(tags)}
index_to_tag = {i: tag for tag, i in tag_to_index.items()}


In [5]:
# Prepare the data for training
def prepare_data(sentences):
    X_data = []
    Y_data = []
    
    for sent in sentences:
        X_sent = [word_to_index.get(word.lower(), 1) for word, _ in sent]  # Convert words to indices
        Y_sent = [tag_to_index[tag] for _, tag in sent]  # Convert tags to indices
        
        X_data.append(X_sent)
        Y_data.append(Y_sent)
    
    X_data = pad_sequences(X_data, padding='post')  # Padding sequences to ensure same length
    Y_data = pad_sequences(Y_data, padding='post')
    
    return np.array(X_data), np.array(Y_data)

X, Y = prepare_data(train_sents)


In [6]:
# Split data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1)


In [7]:
# Build the LSTM Model
model = Sequential()

model.add(Embedding(input_dim=len(word_to_index), output_dim=100))  
model.add(Bidirectional(LSTM(units=100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(len(tag_to_index), activation="softmax")))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [8]:
# Train the model
history = model.fit(X_train, np.expand_dims(Y_train, -1), batch_size=32, epochs=5, validation_data=(X_val, np.expand_dims(Y_val, -1)))


Epoch 1/5
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 861ms/step - accuracy: 0.8698 - loss: 0.7720 - val_accuracy: 0.9379 - val_loss: 0.2533
Epoch 2/5
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 719ms/step - accuracy: 0.9443 - loss: 0.2261 - val_accuracy: 0.9632 - val_loss: 0.1403
Epoch 3/5
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 823ms/step - accuracy: 0.9699 - loss: 0.1216 - val_accuracy: 0.9848 - val_loss: 0.0725
Epoch 4/5
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 763ms/step - accuracy: 0.9875 - loss: 0.0615 - val_accuracy: 0.9901 - val_loss: 0.0446
Epoch 5/5
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 761ms/step - accuracy: 0.9925 - loss: 0.0372 - val_accuracy: 0.9918 - val_loss: 0.0335


In [9]:
# Function to predict tags for a sentence
def predict_tags(sentence):
    sentence = [word_to_index.get(word.lower(), 1) for word in sentence]
    sentence = pad_sequences([sentence], maxlen=X.shape[1], padding='post')
    
    pred = model.predict(sentence)
    pred_tags = [index_to_tag[np.argmax(tag)] for tag in pred[0]]
    return pred_tags

In [10]:
# Example usage
test_sentence = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
predicted_tags = predict_tags(test_sentence)
print(list(zip(test_sentence, predicted_tags)))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NNP'), ('fox', 'NNP'), ('jumps', 'NNS'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'NN'), ('dog', 'NNP')]
