# 12 Word sense disambiguation by LSTM/GRU.

In [1]:
# Install necessary libraries
! pip install tensorflow
! pip install nltk
! pip install scikit-learn



In [15]:
# import library
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Bidirectional, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

In [3]:
# Download Necessary NLTK Resources
nltk.download('punkt_tab')
nltk.download('wordnet')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Jaydip\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jaydip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the dataset
data = pd.read_csv('data/Test_Data_for_WSD.csv', encoding='latin-1')

In [6]:
# Sample data structure
data = pd.DataFrame({
    'sn': [1, 2, 3],
    'sentence/context': ['I went to the bank to withdraw money', 'The river bank was flooded', 'She deposited money in the bank'],
    'polysemy_word': ['bank', 'bank', 'bank']
})


In [7]:
# Preprocess the sentences and labels
sentences = data['sentence/context'].tolist()
target_words = data['polysemy_word'].tolist()

In [8]:
# For simplicity, we manually define sense labels here (in practice, you'd have labels for each polysemy_word in the context)
labels = ['financial', 'geographical', 'financial']  # Labels for each sentence


In [9]:
# Tokenizing sentences
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]


In [10]:
# Encode the target word's senses (polysemy)
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [11]:
# Create word-to-index mapping (use pre-trained embeddings if available, otherwise use random embeddings)
word_to_index = {}
word_to_index['<PAD>'] = 0
word_to_index['<OOV>'] = 1

In [12]:
# Add words from the tokenized sentences to the word_to_index mapping
for sentence in tokenized_sentences:
    for word in sentence:
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)


In [13]:
# Convert sentences to indices
X = [[word_to_index.get(word, 1) for word in sentence] for sentence in tokenized_sentences]


In [16]:
# Padding sentences to ensure uniform length
X = pad_sequences(X, padding='post')

# Reshaping labels for sequence classification
Y = np.array(encoded_labels)


In [17]:
# Split data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=42)


In [18]:
# Build the LSTM/GRU model
model = Sequential()
model.add(Embedding(input_dim=len(word_to_index), output_dim=100))
model.add(Bidirectional(LSTM(units=64, return_sequences=False)))  # Bidirectional LSTM
model.add(Dropout(0.5))  # Dropout layer to avoid overfitting
model.add(Dense(64, activation='relu'))  # Fully connected layer
model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Output layer for sense classification

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [19]:
# Train the model
history = model.fit(X_train, Y_train, batch_size=32, epochs=10, validation_data=(X_val, Y_val))


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.5000 - loss: 0.6915 - val_accuracy: 1.0000 - val_loss: 0.6863
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - accuracy: 1.0000 - loss: 0.6910 - val_accuracy: 1.0000 - val_loss: 0.6852
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step - accuracy: 1.0000 - loss: 0.6867 - val_accuracy: 1.0000 - val_loss: 0.6846
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step - accuracy: 0.5000 - loss: 0.6884 - val_accuracy: 1.0000 - val_loss: 0.6858
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step - accuracy: 0.5000 - loss: 0.6839 - val_accuracy: 1.0000 - val_loss: 0.6872
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - accuracy: 1.0000 - loss: 0.6749 - val_accuracy: 1.0000 - val_loss: 0.6880
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━

In [20]:
# Evaluate the model
val_loss, val_acc = model.evaluate(X_val, Y_val)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_acc}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 1.0000 - loss: 0.6908
Validation Loss: 0.6908193826675415
Validation Accuracy: 1.0


In [21]:
# Predicting the sense of a polysemy word in a new sentence
def predict_sense(sentence, word):
    sentence = word_tokenize(sentence.lower())  # Tokenize the sentence
    sentence_indices = [word_to_index.get(word, 1) for word in sentence]  # Convert words to indices
    sentence_indices = pad_sequences([sentence_indices], maxlen=X.shape[1], padding='post')  # Padding

    pred = model.predict(sentence_indices)
    predicted_label = label_encoder.inverse_transform([np.argmax(pred)])
    return predicted_label[0]

In [24]:
# Test with a new sentence
test_sentence = "The fisherman sat by the river bank."
target_word = "bank"
predicted_sense = predict_sense(test_sentence, target_word)
print(f"Predicted Sense for '{target_word}': {predicted_sense}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Predicted Sense for 'bank': geographical
