<a href="https://colab.research.google.com/github/Jaswanth-03/POS-tagging/blob/main/POS_Tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, Bidirectional, Dropout, Dense, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.corpus import treebank, indian
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk

# Fetch the 'treebank' and 'indian' corpora that the data preparation below reads
for corpus_name in ('treebank', 'indian'):
    nltk.download(corpus_name)

# Data Preparation for English
# Every tagged sentence is a sequence of (word, tag) pairs; split it into
# two parallel per-sentence lists.
english_sentences = treebank.tagged_sents()
english_pos_tags = [[tag for _, tag in sent] for sent in english_sentences]
english_words = [[word for word, _ in sent] for sent in english_sentences]

# Tokenize words and POS tags for English
english_words_flat = [word for sublist in english_words for word in sublist]
english_pos_tags_flat = [tag for sublist in english_pos_tags for tag in sublist]

english_word_vocab = set(english_words_flat)
english_pos_vocab = set(english_pos_tags_flat)

# Indices start at 1 because pad_sequences fills with 0 below.
# The vocabularies are sorted before numbering: iteration order of a set of
# strings changes between interpreter runs (hash randomization), which would
# give every run a different word/tag -> index mapping.
english_word_index = {word: idx for idx, word in enumerate(sorted(english_word_vocab), start=1)}
english_pos_index = {pos: idx for idx, pos in enumerate(sorted(english_pos_vocab), start=1)}

# Convert words and POS tags to sequences
english_train_seq = [[english_word_index[word] for word in sent] for sent in english_words]
english_train_pos_seq = [[english_pos_index[tag] for tag in sent] for sent in english_pos_tags]

# Pad sequences for English
# Every sentence is right-padded with 0 up to the length of the longest one.
english_max_len = max(len(seq) for seq in english_train_seq)
english_train_seq = pad_sequences(english_train_seq, maxlen=english_max_len, padding='post')
english_train_pos_seq = pad_sequences(english_train_pos_seq, maxlen=english_max_len, padding='post')

# Calculate vocabulary size for English
# +1 accounts for the padding index 0, which is not part of either vocabulary.
vocab_size_eng = len(english_word_vocab) + 1
pos_size_eng = len(english_pos_vocab) + 1

# Data Preparation for Hindi
# Every tagged sentence is a sequence of (word, tag) pairs; split it into
# two parallel per-sentence lists.
hindi_sentences = indian.tagged_sents('hindi.pos')
hindi_pos_tags = [[tag for _, tag in sent] for sent in hindi_sentences]
hindi_words = [[word for word, _ in sent] for sent in hindi_sentences]

# Tokenize words and POS tags for Hindi
hindi_words_flat = [word for sublist in hindi_words for word in sublist]
hindi_pos_tags_flat = [tag for sublist in hindi_pos_tags for tag in sublist]

hindi_word_vocab = set(hindi_words_flat)
hindi_pos_vocab = set(hindi_pos_tags_flat)

# Indices start at 1 because pad_sequences fills with 0 below.
# The vocabularies are sorted before numbering: iteration order of a set of
# strings changes between interpreter runs (hash randomization), which would
# give every run a different word/tag -> index mapping.
hindi_word_index = {word: idx for idx, word in enumerate(sorted(hindi_word_vocab), start=1)}
hindi_pos_index = {pos: idx for idx, pos in enumerate(sorted(hindi_pos_vocab), start=1)}

# Convert words and POS tags to sequences
hindi_train_seq = [[hindi_word_index[word] for word in sent] for sent in hindi_words]
hindi_train_pos_seq = [[hindi_pos_index[tag] for tag in sent] for sent in hindi_pos_tags]

# Pad sequences for Hindi
# Every sentence is right-padded with 0 up to the length of the longest one.
hindi_max_len = max(len(seq) for seq in hindi_train_seq)
hindi_train_seq = pad_sequences(hindi_train_seq, maxlen=hindi_max_len, padding='post')
hindi_train_pos_seq = pad_sequences(hindi_train_pos_seq, maxlen=hindi_max_len, padding='post')

# Calculate vocabulary size for Hindi
# +1 accounts for the padding index 0, which is not part of either vocabulary.
vocab_size_hindi = len(hindi_word_vocab) + 1
pos_size_hindi = len(hindi_pos_vocab) + 1

# Model Architecture
def create_rnn_model(embedding_dim, rnn_units, dropout_rate, max_len, vocab_size, pos_size, rnn_layer=LSTM):
    """Build and compile a recurrent POS tagger.

    The network maps a padded sequence of word indices to a per-token softmax
    over tag indices. Only the words are fed in: the tags are the prediction
    target, so using them as an input would leak the labels into the model.

    Args:
        embedding_dim: Size of the word embedding vectors.
        rnn_units: Number of units in the recurrent layer.
        dropout_rate: Dropout rate applied to the recurrent outputs.
        max_len: Length of the padded input sequences.
        vocab_size: Number of word indices, including the padding index 0.
        pos_size: Number of tag indices, including the padding index 0.
        rnn_layer: Keras recurrent layer class to instantiate, e.g. ``LSTM``
            or ``GRU``. Defaults to ``LSTM``.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer and sparse
        categorical cross-entropy, taking one input of shape ``(max_len,)``.
    """
    # Word input branch
    word_input = Input(shape=(max_len,))
    # Index 0 is only ever produced by padding, so mark it as masked; layers
    # that support masking can then skip the padded time steps.
    word_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(word_input)

    # RNN layer: one output vector per token, since every token gets a tag
    recurrent_output = rnn_layer(rnn_units, return_sequences=True)(word_embedding)

    # Dropout layer
    dropout_output = Dropout(dropout_rate)(recurrent_output)

    # Output layer: distribution over tag indices for every time step
    output = Dense(pos_size, activation='softmax')(dropout_output)

    # Create and compile the model
    model = Model(inputs=word_input, outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Split the datasets into training and testing sets
# A fixed seed keeps the split (and therefore the reported scores) comparable between runs.
english_X_train, english_X_test, english_y_train, english_y_test = train_test_split(
    english_train_seq, english_train_pos_seq, test_size=0.2, random_state=42)
hindi_X_train, hindi_X_test, hindi_y_train, hindi_y_test = train_test_split(
    hindi_train_seq, hindi_train_pos_seq, test_size=0.2, random_state=42)

# Hyperparameters
embedding_dim = 100
rnn_units = 64
dropout_rate = 0.2
epochs = 10


def _train_tagger(rnn_layer, max_len, vocab_size, pos_size, X_train, y_train, X_test, y_test):
    """Create a tagger with the notebook hyperparameters and fit it on one language."""
    model = create_rnn_model(embedding_dim, rnn_units, dropout_rate, max_len, vocab_size, pos_size,
                             rnn_layer=rnn_layer)
    # Words are the only input; the tag sequences are used purely as targets.
    model.fit(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test))
    return model


# Model Training and Evaluation
# Create different RNN models and train them.
# Word and tag indices (and the padded length) are built separately for each
# language, so a model trained on English cannot be scored on Hindi data;
# each language gets its own LSTM and GRU model.
english_data = (english_max_len, vocab_size_eng, pos_size_eng,
                english_X_train, english_y_train, english_X_test, english_y_test)
hindi_data = (hindi_max_len, vocab_size_hindi, pos_size_hindi,
              hindi_X_train, hindi_y_train, hindi_X_test, hindi_y_test)

models = []        # (name, model) pairs trained on English
hindi_models = []  # (name, model) pairs trained on Hindi, same order as `models`

# LSTM model
lstm_model = _train_tagger(LSTM, *english_data)
models.append(('LSTM', lstm_model))
hindi_lstm_model = _train_tagger(LSTM, *hindi_data)
hindi_models.append(('LSTM', hindi_lstm_model))

# GRU model
gru_model = _train_tagger(GRU, *english_data)
models.append(('GRU', gru_model))
hindi_gru_model = _train_tagger(GRU, *hindi_data)
hindi_models.append(('GRU', hindi_gru_model))

# Evaluate models
for (name, english_model), (_, hindi_model) in zip(models, hindi_models):
    print(f"Evaluation for {name} Model:")
    # Evaluate on English dataset
    english_loss, english_accuracy = english_model.evaluate(english_X_test, english_y_test)
    print(f"English - Loss: {english_loss}, Accuracy: {english_accuracy}")
    # Evaluate on Hindi dataset
    hindi_loss, hindi_accuracy = hindi_model.evaluate(hindi_X_test, hindi_y_test)
    print(f"Hindi - Loss: {hindi_loss}, Accuracy: {hindi_accuracy}")

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package indian to /root/nltk_data...
[nltk_data]   Unzipping corpora/indian.zip.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Evaluation for LSTM Model:
English - Loss: 0.009524141438305378, Accuracy: 0.9986662864685059
Hindi - Loss: 0.5189276933670044, Accuracy: 0.8930976390838623
Evaluation for GRU Model:
English - Loss: 0.010182314552366734, Accuracy: 0.9988265633583069
Hindi - Loss: 0.39272964000701904, Accuracy: 0.9112794399261475
