# Using LSTM in NLP

In [30]:
# Step 1: Load and Save Data
# Import NLTK library and download the Gutenberg corpus
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

# Load the raw text of Shakespeare's Hamlet from the Gutenberg corpus
data = gutenberg.raw('shakespeare-hamlet.txt')

# Save the raw text to 'hamlet.txt'.
# The encoding is pinned to UTF-8 so that this write and the read below agree
# with each other instead of depending on the platform's locale default.
with open('hamlet.txt', 'w', encoding='utf-8') as file:
    file.write(data)

# Step 2: Data Processing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer  # Convert text to sequences of integers
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Pad sequences to the same length
from sklearn.model_selection import train_test_split  # Split data into training and test sets

# Read the saved text back (same explicit encoding as the write above) and
# lowercase it so that the vocabulary is case-insensitive.
with open('hamlet.txt', 'r', encoding='utf-8') as file:
    text = file.read().lower()

# Step 3: Tokenization and Sequence Generation
# Build the vocabulary from the full lowercased text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size used for the embedding input and the softmax output:
# one slot per entry of 'word_index', plus one extra.
total_word = 1 + len(tokenizer.word_index)

# Encode every line of the play, then expand each encoded line into all of its
# prefixes of length >= 2. For a line encoded as [a, b, c] this yields
# [a, b] and [a, b, c]; the last token of each prefix later becomes the label.
input_sequences = [
    encoded_line[:prefix_len]
    for encoded_line in tokenizer.texts_to_sequences(text.split('\n'))
    for prefix_len in range(2, len(encoded_line) + 1)
]

# Step 4: Padding Sequences
# pad_sequences is imported again here so this step can be read on its own
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The longest n-gram sets the common length for every training row
max_sequence_len = max(len(seq) for seq in input_sequences)

# Pad on the left ('pre') so that the final column of every row is always the
# word to be predicted; the result is a 2-D NumPy array.
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

# Step 5: Create Predictors and Labels
# TensorFlow provides the one-hot helper used in Step 6
import tensorflow as tf

# Every column except the last is the model input (x);
# the last column is the index of the word to predict (y).
x = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Step 6: One-Hot Encoding
# Expand each label index into a vector of length total_word so it matches
# the softmax output of the network.
y = tf.keras.utils.to_categorical(y, num_classes=total_word)

# 'x' and 'y' are now ready to be used as input and output for model training.


[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/nzeinali/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [35]:
# Step 8: Split the data into training and test sets
# A fixed random_state makes the shuffle repeatable, so the split (and the
# metrics computed on it) can be reproduced after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# x_train / y_train hold 80% of the rows for training; x_test / y_test hold the remaining 20%.

# Step 9: Define and compile the LSTM network (training happens in the next cell)
from tensorflow.keras.models import Sequential  # Linear stack of layers
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout  # Layers used below

# The whole stack is declared in one place, in the order data flows through it.
model = Sequential([
    # Map each word index (vocabulary of size total_word) to a 100-dimensional
    # vector; inputs are the padded sequences minus the label column.
    Embedding(total_word, 100, input_length=max_sequence_len-1),
    # First recurrent layer: 150 units, emitting an output per timestep so the
    # second LSTM receives a full sequence.
    LSTM(150, return_sequences=True),
    # Drop 20% of the activations during training to reduce overfitting.
    Dropout(0.2),
    # Second recurrent layer: 100 units, returning a single vector per sample.
    LSTM(100),
    # One softmax probability per vocabulary entry.
    Dense(total_word, activation='softmax'),
])

# Compiling is required before fit(): categorical cross-entropy matches the
# one-hot labels, Adam is the optimizer, and accuracy is reported per epoch.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the layer-by-layer architecture and parameter counts
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 13, 100)           481800    
                                                                 
 lstm_2 (LSTM)               (None, 13, 150)           150600    
                                                                 
 dropout_1 (Dropout)         (None, 13, 150)           0         
                                                                 
 lstm_3 (LSTM)               (None, 100)               100400    
                                                                 
 dense_1 (Dense)             (None, 4818)              486618    
                                                                 
Total params: 1219418 (4.65 MB)
Trainable params: 1219418 (4.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [36]:
# Fit the network for 50 epochs on the training split. The held-out split is
# evaluated after every epoch, and the per-epoch metrics are kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=50,
    verbose=1,  # progress bar with per-epoch metrics
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [37]:
# Step 10: Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the word the model scores as most likely to follow `text`.

    Args:
        model: Trained model; its `predict` is called with a single row of
            `max_sequence_len - 1` token indices.
        tokenizer: Fitted tokenizer providing `texts_to_sequences` and the
            `index_word` (index -> word) mapping.
        text: Seed text whose continuation is wanted.
        max_sequence_len: Training sequence length including the label column,
            so the model input length is `max_sequence_len - 1`.

    Returns:
        The predicted word, or None when the highest-scoring index has no
        vocabulary entry (for example the padding index 0).
    """
    # Tokenize the input text and convert it to a sequence of integers
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Left-pad short inputs and keep only the most recent tokens of long ones,
    # so the row always has exactly the length the model expects.
    model_input = pad_sequences(
        [token_list],
        maxlen=max_sequence_len - 1,
        padding='pre',
        truncating='pre',
    )

    # Predict the probability of every vocabulary entry for the single input row
    predicted = model.predict(model_input, verbose=0)

    # Reduce to a plain Python int so the lookup below compares scalars
    # rather than a one-element array.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # Direct index -> word lookup instead of scanning the whole vocabulary
    return tokenizer.index_word.get(predicted_word_index)

# Step 11: Save the model and tokenizer
## Persist the trained network; the app section below loads this exact file name
model.save("next_word_lstm.h5")

## Persist the fitted tokenizer with pickle so the app can reuse the same vocabulary
import pickle
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(
        tokenizer,
        tokenizer_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

  saving_api.save_model(


In [38]:
# Step 12: Try the trained model on a sample phrase
input_text = "To be or not to be"
print(f"Input text: {input_text}")

# The model's input width is the training sequence length minus the label
# column, so add one to recover the value predict_next_word expects.
max_sequence_len = 1 + model.input_shape[1]

# Ask the model for the most likely continuation and show it
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
)
print(f"Next Word Prediction: {next_word}")

Input text: To be or not to be
Next Word Prediction: damn'd


In [39]:
# A second example, using a phrase taken from the opening scene
input_text = "Barn. Last night of all,When yond same"
print(f"Input text: {input_text}")

# Recover the training sequence length from the model's input width
max_sequence_len = 1 + model.input_shape[1]

# Ask the model for the most likely continuation and show it
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
)
print(f"Next Word Prediction: {next_word}")

Input text: Barn. Last night of all,When yond same
Next Word Prediction: starre


# APP

In [None]:
import streamlit as st
import numpy as np
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the LSTM model and the tokenizer.
# Streamlit re-executes this script on every widget interaction, so the loader
# is cached: the files are read once and the same objects are reused afterwards.
@st.cache_resource
def load_artifacts():
    """Load the saved model and tokenizer from the working directory.

    Returns:
        A `(model, tokenizer)` tuple read from 'next_word_lstm.h5' and
        'tokenizer.pickle'.
    """
    lstm_model = load_model('next_word_lstm.h5')
    # NOTE: pickle.load can execute arbitrary code from the file; only load a
    # tokenizer.pickle that you produced yourself.
    with open('tokenizer.pickle', 'rb') as handle:
        fitted_tokenizer = pickle.load(handle)
    return lstm_model, fitted_tokenizer


model, tokenizer = load_artifacts()

# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the word the model scores as most likely to follow `text`.

    Args:
        model: Trained model; its `predict` is called with a single row of
            `max_sequence_len - 1` token indices.
        tokenizer: Fitted tokenizer providing `texts_to_sequences` and the
            `index_word` (index -> word) mapping.
        text: Seed text whose continuation is wanted.
        max_sequence_len: Training sequence length including the label column,
            so the model input length is `max_sequence_len - 1`.

    Returns:
        The predicted word, or None when the highest-scoring index has no
        vocabulary entry (for example the padding index 0).
    """
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Left-pad short inputs and keep only the most recent tokens of long ones,
    # so the row always has exactly the length the model expects.
    model_input = pad_sequences(
        [token_list],
        maxlen=max_sequence_len - 1,
        padding='pre',
        truncating='pre',
    )
    predicted = model.predict(model_input, verbose=0)

    # Reduce to a plain Python int so the lookup below compares scalars
    # rather than a one-element array.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # Direct index -> word lookup instead of scanning the whole vocabulary
    return tokenizer.index_word.get(predicted_word_index)

# streamlit app
st.title("Next Word Prediction With LSTM And Early Stopping")
input_text=st.text_input("Enter the sequence of Words","To be or not to")
if st.button("Predict Next Word"):
    if not input_text.strip():
        # A blank prompt would be fed to the model as all padding, which
        # produces a prediction that is unrelated to anything the user typed.
        st.warning("Please enter at least one word.")
    else:
        # The model's input width is the training sequence length minus the label column
        max_sequence_len = model.input_shape[1] + 1
        next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
        if next_word is None:
            # predict_next_word returns None when the top index maps to no word
            st.warning("No prediction is available for this input.")
        else:
            st.write(f'Next word: {next_word}')

