<a href="https://www.kaggle.com/code/beasttitan/pos-tagging?scriptVersionId=225963815" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from tensorflow.keras.preprocessing.sequence import pad_sequences
import plotly.express as px
from nltk.corpus import brown
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from numpy import random
import tensorflow as tf
import pickle



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Some EDA for the Brown Corpus


In [2]:
# Report how many POS-tagged sentences the corpus provides
n_tagged_sents = len(brown.tagged_sents())
print(f"The Brown Corpus Contains {n_tagged_sents} tagged sentences")

The Brown Corpus Contains 57340 tagged sentences


In [3]:
# Show three randomly chosen sentences to get a feel for the raw text.
# The corpus view and its length are fetched once rather than on every iteration.
corpus_sents = brown.sents()
n_sents = len(corpus_sents)
for _ in range(3):
    # numpy's randint(high) samples from the half-open range [0, high), so passing
    # n_sents directly makes every sentence (including the last one) reachable.
    index = random.randint(n_sents)
    print(corpus_sents[index])
    print("\n")

['This', 'machine', 'will', 'cure', 'your', 'cancer-ridden', 'body', "''", '.']


['The', 'board', 'approved', 'and', 'commended', 'the', 'use', 'of', 'birth-control', 'devices', 'as', 'a', 'part', 'of', 'Christian', 'responsibility', 'in', 'family', 'planning', '.']


['Even', 'before', 'he', 'saw', 'the', 'necessity', 'of', 'growing', 'better', 'food', 'and', 'planning', 'good', 'nutrition', ',', 'Mr.', 'Clark', 'felt', 'the', 'school', 'had', 'a', 'good', 'health', 'program', '.']




In [4]:
# Distribution of sentence lengths (number of tokens) over the whole corpus
lengths = list(map(len, brown.sents()))
fig = px.histogram(lengths, title="sentences Length Histogram")
fig.show(renderer="iframe")

In [5]:
# Maximum sentence length (in tokens): the encoding cells pad every sequence to this
# length through pad_sequences(maxlen = sentence_length)
sentence_length = 50

In [6]:
# Collect the universal POS tag of every word in the Brown Corpus ...
tagged_words = brown.tagged_words(tagset="universal")
tags = [pos for _, pos in tagged_words]
# ... and count how often each tag occurs
tags_freq = Counter(tags)

In [7]:
# Bar chart of how often each universal POS tag occurs
tag_names = tags_freq.keys()
tag_counts = tags_freq.values()
fig = px.bar(x=tag_names, y=tag_counts, title="Tags Distribution")
fig.show(renderer="iframe")

## Data Preprocessing

In [8]:
# Keep only sentences that fit into `sentence_length` tokens, then separate the words
# from their tags. The limit is taken from `sentence_length` (not a second literal) so
# the filter and the padding length used later can never drift apart, and the corpus is
# scanned once so both lists are guaranteed to describe the same sentences.
kept_tagged_sents = [
    tagged_sentence
    for tagged_sentence in brown.tagged_sents(tagset = "universal")
    if len(tagged_sentence) <= sentence_length
]

sentences = [[word for word, _ in tagged_sentence] for tagged_sentence in kept_tagged_sents]

tags = [[tag for _, tag in tagged_sentence] for tagged_sentence in kept_tagged_sents]


In [9]:
# Number of sentences that survived the length filter
print(f"Dateset size after droping all sentences with length more than 50 is: {len(sentences)}")

Dateset size after droping all sentences with length more than 50 is: 55708


In [10]:
# Hold out 9% of the sentences (with their tag sequences) for validation;
# the fixed random_state keeps the split the same on every run
X_train, X_valid, Y_train, Y_valid = train_test_split(
    sentences,
    tags,
    test_size=0.09,
    random_state=42,
)

In [11]:
# Word vocabulary, built from the training sentences only; "<OOV>" is the token
# the tokenizer substitutes for words it has not seen while fitting
word_tokenizer = Tokenizer(oov_token="<OOV>")
word_tokenizer.fit_on_texts(X_train)


# Tag vocabulary, built from the training tag sequences
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(Y_train)

In [12]:
# Encode the training data: map every token to its integer id, then right-pad with 0
# up to `sentence_length`

X_train = word_tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen = sentence_length, padding = 'post')

Y_train = tag_tokenizer.texts_to_sequences(Y_train)
Y_train = pad_sequences(Y_train, maxlen = sentence_length, padding='post')


# Tokenizer ids start at 1 and id 0 is used for padding, hence the plus one
tags_len = len(tag_tokenizer.word_index) + 1

word_index = word_tokenizer.word_index
# word_index already contains the "<OOV>" token, so only the padding id 0 has to be
# added: the largest id equals len(word_index) and the table needs len(word_index) + 1 rows
vocab_len = len(word_index) + 1


In [13]:
# Encode the validation data with the tokenizers fitted on the training set,
# then right-pad every sequence with 0 up to `sentence_length`

valid_word_ids = word_tokenizer.texts_to_sequences(X_valid)
X_valid = pad_sequences(valid_word_ids, maxlen=sentence_length, padding='post')

valid_tag_ids = tag_tokenizer.texts_to_sequences(Y_valid)
Y_valid = pad_sequences(valid_tag_ids, maxlen=sentence_length, padding='post')


## Using The GloVe Embeddings

In [14]:
# Download the GloVe Embeddings files
if not os.path.exists('glove.6B.zip'):
    !wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
    !unzip -q glove.6B.zip

--2025-03-05 19:31:03--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2025-03-05 19:33:50 (4.94 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [15]:
# Load GloVe word embeddings into a dictionary mapping words to their vector representations.
path_to_glove_file = "glove.6B.50d.txt"
embeddings_index = {}
# The encoding is given explicitly so that parsing does not depend on the platform's
# default locale encoding (the vocabulary contains non-ASCII tokens).
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...": split off the word, parse the rest as float32
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))


Found 400000 word vectors.


In [16]:
embedding_dim = 50
hits = 0
misses = 0
missed_words = []

# Prepare embedding matrix: row `row` holds the GloVe vector of the word whose
# tokenizer id is `row`. The matrix starts as all zeros, so every word without a
# GloVe vector (and the padding row 0) simply keeps its all-zero row.
embedding_matrix = np.zeros((vocab_len, embedding_dim))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pretrained vector: remember the word for inspection
        misses += 1
        missed_words.append(token)
        continue
    embedding_matrix[row] = vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 37647 words (8006 misses)


In [17]:
# First five vocabulary words for which no GloVe vector was found
missed_words[:5]

['<OOV>', "don't", "didn't", "it's", "i'm"]

In [18]:
# Last five vocabulary words for which no GloVe vector was found
missed_words[-5:]

['hydrido', 'alemagna', 'browny', "slater's", "charlie's"]

## Model Training

In [19]:

# Sequence tagger: pretrained word embeddings -> bidirectional LSTM -> per-token softmax
model = keras.Sequential([
    # One padded sentence of word ids per sample
    keras.Input(shape = (sentence_length,)),

    # Embedding table initialised from the GloVe matrix. The width is taken from
    # `embedding_dim` so it always agrees with the shape of `embedding_matrix`.
    keras.layers.Embedding( input_dim = vocab_len, output_dim = embedding_dim, weights = [embedding_matrix]),

    # return_sequences = True keeps one output vector per token, as tagging needs
    keras.layers.Bidirectional(
        keras.layers.LSTM( units = 64, return_sequences = True,
                           kernel_initializer = keras.initializers.GlorotUniform()
                         )
    ),

    keras.layers.BatchNormalization(),

    # Probability distribution over the tag ids (including the padding id 0) for every token
    keras.layers.TimeDistributed(keras.layers.Dense(units = tags_len , activation = "softmax")  )
])
model.summary()

In [20]:
# Stop when the validation loss has not improved for 5 epochs and roll back to the best
# weights. The loss is monitored instead of 'val_accuracy' because the loss skips the
# padded positions (ignore_class = 0) while the plain accuracy metric also scores them,
# which makes the accuracy jump around independently of the real tagging quality.
early_stopping = keras.callbacks.EarlyStopping(
    monitor = 'val_loss',
    patience = 5,
    restore_best_weights = True
)

In [21]:
# ignore_class = 0 removes the padded positions (tag id 0) from the loss
model.compile( optimizer = keras.optimizers.Adam(learning_rate = 0.0001),
               loss = keras.losses.SparseCategoricalCrossentropy(ignore_class = 0),
               metrics = ["accuracy"],
             )


history = model.fit( X_train, Y_train, epochs = 30, validation_data = ( X_valid, Y_valid),
                     callbacks=[early_stopping]
                   )

# The "accuracy" values logged above are computed over all positions, including the
# padding that the loss ignores, so they do not reflect the tagging quality on their own.
# Report the accuracy over the real (non-padding) validation tokens as well.
valid_pred = np.argmax(model.predict(X_valid, verbose = 0), axis = -1)
real_tokens = Y_valid != 0
print("Validation accuracy on non-padding tokens: %.4f"
      % (valid_pred[real_tokens] == Y_valid[real_tokens]).mean())
 

Epoch 1/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 13ms/step - accuracy: 0.5941 - loss: 1.3604 - val_accuracy: 0.8408 - val_loss: 0.3258
Epoch 2/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.8539 - loss: 0.2673 - val_accuracy: 0.5083 - val_loss: 0.1750
Epoch 3/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 13ms/step - accuracy: 0.6491 - loss: 0.1437 - val_accuracy: 0.4857 - val_loss: 0.1228
Epoch 4/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.5246 - loss: 0.0995 - val_accuracy: 0.4394 - val_loss: 0.1009
Epoch 5/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.5654 - loss: 0.0777 - val_accuracy: 0.8881 - val_loss: 0.0909
Epoch 6/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.7512 - loss: 0.0663 - val_accuracy: 0.9032 - val_loss: 0.0835
Epoc

In [22]:
# Per-epoch training and validation loss recorded by model.fit
training_history = pd.DataFrame(history.history)
loss_curves = ['loss', 'val_loss']
fig = px.line(training_history, y=loss_curves, title='Training & Validation Loss')
fig.show(renderer="iframe")


In [23]:
# Per-epoch training and validation accuracy recorded by model.fit
training_history = pd.DataFrame(history.history)
accuracy_curves = ['accuracy', 'val_accuracy']
fig = px.line(training_history, y=accuracy_curves, title='Training & Validation Accuracy')
fig.show(renderer="iframe")


In [24]:
# Save the trained model together with both fitted tokenizers; the tokenizers are
# needed to encode new sentences and to decode the predicted tag ids
model.save("model.h5")

artifacts = (
    ('word_tokenizer.pkl', word_tokenizer),
    ('tag_tokenizer.pkl', tag_tokenizer),
)
for pickle_path, fitted_tokenizer in artifacts:
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_tokenizer, f)