In [35]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, TimeDistributed, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
import numpy as np


In [2]:
# Toy training corpus: ten short English sentences. Each sentence is written
# as a plain string and split on whitespace into its list of word tokens.
sentences = [
    text.split()
    for text in (
        'I am learning TensorFlow',
        'You are writing a tutorial',
        'They are reading a book',
        'She loves machine learning',
        'The cat sits on the mat',
        'Birds fly in the sky',
        'We are building a model',
        'He enjoys playing chess',
        'This is an exciting project',
        'Data science is fun',
    )
]

In [3]:
# Gold POS tags for the corpus: one whitespace-separated tag string per
# sentence, split into a list holding one tag per word.
pos_tags = [
    tags.split()
    for tags in (
        'PRON AUX VERB PROPN',
        'PRON AUX VERB DET NOUN',
        'PRON AUX VERB DET NOUN',
        'PRON VERB NOUN NOUN',
        'DET NOUN VERB ADP DET NOUN',
        'NOUN VERB ADP DET NOUN',
        'PRON AUX VERB DET NOUN',
        'PRON VERB VERB NOUN',
        'DET AUX DET ADJ NOUN',
        'NOUN NOUN AUX ADJ',
    )
]

In [4]:
# Learn the word vocabulary from the corpus, then encode every sentence as a
# list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
word_index = tokenizer.word_index  # word -> integer id mapping


In [5]:
# Learn the tag vocabulary with a second, independent Tokenizer, then encode
# every tag list as a list of integer tag ids.
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(pos_tags)
tag_sequences = tag_tokenizer.texts_to_sequences(pos_tags)
tag_index = tag_tokenizer.word_index  # tag -> integer id mapping

In [6]:
# Display the fitted word Tokenizer object (this shows only its default repr).
tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x7cc1c7c96f50>

In [7]:
# Inspect the word -> integer id mapping learned by the word tokenizer.
word_index

{'are': 1,
 'a': 2,
 'the': 3,
 'learning': 4,
 'is': 5,
 'i': 6,
 'am': 7,
 'tensorflow': 8,
 'you': 9,
 'writing': 10,
 'tutorial': 11,
 'they': 12,
 'reading': 13,
 'book': 14,
 'she': 15,
 'loves': 16,
 'machine': 17,
 'cat': 18,
 'sits': 19,
 'on': 20,
 'mat': 21,
 'birds': 22,
 'fly': 23,
 'in': 24,
 'sky': 25,
 'we': 26,
 'building': 27,
 'model': 28,
 'he': 29,
 'enjoys': 30,
 'playing': 31,
 'chess': 32,
 'this': 33,
 'an': 34,
 'exciting': 35,
 'project': 36,
 'data': 37,
 'science': 38,
 'fun': 39}

In [8]:
# Inspect the sentences encoded as lists of integer word ids.
sequences

[[6, 7, 4, 8],
 [9, 1, 10, 2, 11],
 [12, 1, 13, 2, 14],
 [15, 16, 17, 4],
 [3, 18, 19, 20, 3, 21],
 [22, 23, 24, 3, 25],
 [26, 1, 27, 2, 28],
 [29, 30, 31, 32],
 [33, 5, 34, 35, 36],
 [37, 38, 5, 39]]

In [9]:
# Inspect the tag -> integer id mapping learned by the tag tokenizer.
tag_index

{'noun': 1,
 'verb': 2,
 'det': 3,
 'pron': 4,
 'aux': 5,
 'adp': 6,
 'adj': 7,
 'propn': 8}

In [10]:
# Inspect the tag lists encoded as lists of integer tag ids.
tag_sequences

[[4, 5, 2, 8],
 [4, 5, 2, 3, 1],
 [4, 5, 2, 3, 1],
 [4, 2, 1, 1],
 [3, 1, 2, 6, 3, 1],
 [1, 2, 6, 3, 1],
 [4, 5, 2, 3, 1],
 [4, 2, 2, 1],
 [3, 5, 3, 7, 1],
 [1, 1, 5, 7]]

In [11]:
# Pad every word-id and tag-id sequence with trailing zeros up to the length
# of the longest sentence, giving two rectangular integer matrices.
max_len = max(map(len, sequences))
X_train = pad_sequences(sequences, padding='post', maxlen=max_len)
y_train = pad_sequences(tag_sequences, padding='post', maxlen=max_len)

In [12]:
# Inspect the zero-padded word-id matrix (one row per sentence).
X_train

array([[ 6,  7,  4,  8,  0,  0],
       [ 9,  1, 10,  2, 11,  0],
       [12,  1, 13,  2, 14,  0],
       [15, 16, 17,  4,  0,  0],
       [ 3, 18, 19, 20,  3, 21],
       [22, 23, 24,  3, 25,  0],
       [26,  1, 27,  2, 28,  0],
       [29, 30, 31, 32,  0,  0],
       [33,  5, 34, 35, 36,  0],
       [37, 38,  5, 39,  0,  0]], dtype=int32)

In [13]:
# Inspect the zero-padded tag-id matrix (one row per sentence).
y_train

array([[4, 5, 2, 8, 0, 0],
       [4, 5, 2, 3, 1, 0],
       [4, 5, 2, 3, 1, 0],
       [4, 2, 1, 1, 0, 0],
       [3, 1, 2, 6, 3, 1],
       [1, 2, 6, 3, 1, 0],
       [4, 5, 2, 3, 1, 0],
       [4, 2, 2, 1, 0, 0],
       [3, 5, 3, 7, 1, 0],
       [1, 1, 5, 7, 0, 0]], dtype=int32)

In [14]:
# Width of the one-hot vectors: every real tag id plus id 0, which
# pad_sequences uses for padded positions. A dedicated name is used so this
# value cannot be confused with a tag count that excludes the padding class.
num_tag_classes = len(tag_index) + 1

# Rebuild the padded integer tag matrix from tag_sequences instead of
# converting y_train in place. Feeding y_train back into itself made the cell
# non-idempotent: a second run would one-hot encode the one-hot array.
y_train = to_categorical(
    pad_sequences(tag_sequences, maxlen=max_len, padding='post'),
    num_classes=num_tag_classes,
)

print(f"Vocabulary size: {len(word_index)}")
print(f"Number of POS tags: {num_tag_classes}")

Vocabulary size: 39
Number of POS tags: 9


In [15]:
# Inspect y_train after one-hot encoding: each tag id is now a 0/1 vector.
y_train

array([[[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.,

In [28]:
# Right-pad the word-id and tag-id sequences with zeros so that every row has
# the length of the longest sentence.
max_len = len(max(sequences, key=len))
sequences_padded = pad_sequences(sequences, padding='post', maxlen=max_len)
tag_sequences_padded = pad_sequences(tag_sequences, padding='post', maxlen=max_len)


In [31]:
# Number of real POS tags. Padding id 0 is not counted here, which is why the
# one-hot width below is num_tags + 1.
num_tags = len(tag_index)

# to_categorical accepts the whole (sentence, timestep) integer matrix and
# appends the class axis last, so one call replaces the per-row Python loop and
# yields a single NumPy array instead of a list of arrays.
tag_sequences_onehot = to_categorical(tag_sequences_padded, num_classes=num_tags + 1)


In [25]:
# Model hyperparameters
# NOTE(review): the Sequential model defined in this notebook passes a literal
# 64 to LSTM rather than reading lstm_units — confirm which value is intended.
lstm_units = 128
embedding_dim = 64
vocab_size = len(word_index) + 1   # +1 because id 0 is reserved for padding


In [36]:
# Seed Python, NumPy and TensorFlow before any weights are created so that
# re-running the notebook gives the same initialisation and batch shuffling
# (GPU kernels may still introduce small run-to-run differences).
tf.keras.utils.set_random_seed(42)

# Sequence tagger: word ids -> embeddings -> BiLSTM -> per-timestep softmax
# over the tag classes.
model = Sequential([
    # `input_length` is deprecated in Keras 3 and only triggers a warning; the
    # sequence length is taken from the data passed to fit/predict.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # NOTE(review): the config cell sets lstm_units = 128, but 64 is kept here
    # so the architecture is unchanged — confirm which value is intended.
    Bidirectional(LSTM(64, return_sequences=True)),
    # num_tags counts the real tags only; +1 adds class 0 used for padding.
    TimeDistributed(Dense(num_tags + 1, activation='softmax'))
])

In [37]:
# Configure training: Adam optimiser, categorical cross-entropy against the
# one-hot tag targets, with accuracy reported per epoch.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train the model, then print the layer summary
model.fit(
    x=sequences_padded,
    y=np.array(tag_sequences_onehot),
    epochs=10,
    batch_size=2,
    verbose=1,
)
model.summary()

Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.1977 - loss: 2.1931   
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3750 - loss: 2.1684 
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2757 - loss: 2.1451 
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.3044 - loss: 2.1166  
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3926 - loss: 2.0485
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4072 - loss: 1.9766
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3569 - loss: 1.9008 
Epoch 8/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3965 - loss: 1.7100 
Epoch 9/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [38]:
# Predict POS tags for a new sentence
test_sentence = ['I', 'am', 'learning', 'TensorFlow']
test_sequence = tokenizer.texts_to_sequences([test_sentence])

# The tokenizer has no OOV token, so words it has never seen are silently
# dropped from the encoded sequence. That would shift every later tag against
# the printed sentence, so fail loudly instead.
assert len(test_sequence[0]) == len(test_sentence), (
    "test_sentence contains words that are not in the training vocabulary"
)
# pad_sequences silently truncates anything longer than maxlen.
assert len(test_sequence[0]) <= max_len, (
    f"test_sentence is longer than max_len={max_len} and would be truncated"
)

test_sequence_padded = pad_sequences(test_sequence, maxlen=max_len, padding='post')

In [39]:
# Run the model on the padded sentence and keep, for every timestep, the id of
# the highest-scoring tag class.
predictions = model.predict(test_sequence_padded)
predicted_tags = predictions.argmax(axis=-1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 453ms/step


In [40]:
# Convert predicted tag indices back to POS tags

# Reverse lookup table (tag id -> tag name), built once instead of rebuilding
# two lists and doing a linear search for every prediction.
index_to_tag = {index: tag for tag, index in tag_index.items()}

# Only the first len(test_sequence[0]) timesteps correspond to real words; the
# rest is padding. Slicing by position keeps the tags aligned with the words.
# Filtering on `p != 0` would drop a word predicted as class 0 and would keep
# padded positions predicted as a non-zero class.
num_tokens = len(test_sequence[0])
predicted_tags_list = [
    index_to_tag.get(int(p), '<pad>')  # class 0 has no tag name
    for p in predicted_tags[0][:num_tokens]
]

In [41]:
# Report the input sentence followed by the tags decoded for it
print(
    f"Sentence: {test_sentence}",
    f"Predicted POS tags: {predicted_tags_list}",
    sep="\n",
)

Sentence: ['I', 'am', 'learning', 'TensorFlow']
Predicted POS tags: ['pron', 'verb', 'verb', 'noun']
