In [1]:
## Data Collection
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd

## Load the dataset
data= gutenberg.raw('shakespeare-hamlet.txt')
## save the file
with open('hamlet.txt','w') as file:
    file.write(data)
    

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
# Clear any existing model state
import tensorflow as tf
tf.keras.backend.clear_session()
tf.compat.v1.reset_default_graph()






In [3]:
## Data pre-processing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

## load the dataset
with open('hamlet.txt','r') as file:
    text = file.read().lower()

## Tokenize

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word_index = tokenizer.word_index
total_words1 = len(word_index) + 1
print("Total words:", total_words1)
print("Sample word_index (first 5):", list(word_index.items())[:5])


Total words: 4818
Sample word_index (first 5): [('the', 1), ('and', 2), ('to', 3), ('of', 4), ('i', 5)]


In [4]:
inputsequences = []
for line in text.split('\n'):
    if not line.strip():  # Skip empty lines
        continue
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Filter tokens explicitly
    token_list = [token for token in token_list if 0 < token < total_words1]
    if not token_list:  # Skip empty token lists
        continue
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        # Skip if any token is invalid
        if all(0 < token < total_words1 for token in n_gram_sequence):
            inputsequences.append(n_gram_sequence)

In [5]:
# Check if inputsequences is empty
if not inputsequences:
    raise ValueError("No valid sequences generated. Check tokenizer or dataset.")

In [6]:
## pad sequence

max_sequence_len= max([len(x) for x in inputsequences])
max_sequence_len

14

In [7]:
inputsequences=np.array(pad_sequences(inputsequences,maxlen=max_sequence_len,padding='pre'))
inputsequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [8]:
## create predictors and lables

x,y = inputsequences[:,:-1],inputsequences[:,-1]

In [9]:
print("Initial x shape:", x.shape, "y shape:", y.shape)
print("Initial y sample:", y[:3])
print("Initial max in x:", np.max(x))
print("Initial max in y:", y.max())

Initial x shape: (25732, 13) y shape: (25732,)
Initial y sample: [687   4  45]
Initial max in x: 4817
Initial max in y: 4816


In [10]:
valid_rows = (np.max(x, axis=1) < total_words1) & (y < total_words1)
x = x[valid_rows]
y = y[valid_rows]
print("Filtered x shape:", x.shape, "y shape:", y.shape)
print("Filtered y sample:", y[:3])
print("Filtered max in x:", np.max(x))
print("Filtered max in y:", y.max())

Filtered x shape: (25732, 13) y shape: (25732,)
Filtered y sample: [687   4  45]
Filtered max in x: 4817
Filtered max in y: 4816


In [11]:
# Double-check invalid indices
if np.max(x) >= total_words1 or y.max() >= total_words1:
    raise ValueError(f"Error: x or y contains indices >= total_words1 ({total_words1})")

In [12]:
# Split dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print("x_train shape:", x_train.shape, "y_train shape:", y_train.shape)
print("Max in x_train:", np.max(x_train))
print("Max in y_train:", np.max(y_train))

x_train shape: (20585, 13) y_train shape: (20585,)
Max in x_train: 4817
Max in y_train: 4816


In [13]:
# Class weights for imbalanced words
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(enumerate(class_weights))
print("Class weights (first 5):", list(class_weight_dict.items())[:5])

Class weights (first 5): [(0, 0.0063381714478825055), (1, 0.009104669734580408), (2, 0.009790310362578582), (3, 0.009842019748296427), (4, 0.011996281581614528)]


In [14]:
# Define early stopping
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau  

In [15]:
## Train LSTM RNN

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.optimizers import Adam

## Define model

model=Sequential()
model.add(Embedding(2346,128,input_length=max_sequence_len-1))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(2346, activation='softmax'))
##model.build(input_shape=(None, max_sequence_len-1))

## compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.0001),metrics = ['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 13, 128)           300288    
                                                                 
 lstm (LSTM)                 (None, 13, 150)           167400    
                                                                 
 dropout (Dropout)           (None, 13, 150)           0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 2346)              236946    
                                                                 
Total params: 805034 (3.07 MB)
Trainable params: 805034 

In [16]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(factor=0.5, patience=3, verbose=1)
]
history = model.fit(
    x_train, y_train,
    epochs=25,
    batch_size=64,
    validation_data=(x_test, y_test),
    verbose=1,
    callbacks=callbacks,
    class_weight=class_weight_dict
)

Epoch 1/25



InvalidArgumentError: Graph execution error:

Detected at node GatherV2 defined at (most recent call last):
<stack traces unavailable>
indices[12] = 4640 is not in [0, 4603)
	 [[{{node GatherV2}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_5771]

In [None]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)[0]
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [None]:
input_text="To be or not to be"
print(f"Input text:{input_text}")
max_sequence_len=model.input_shape[1]+1
next_word=predict_next_word(model,tokenizer,input_text,max_sequence_len)
print(f"Next Word PRediction:{next_word}")

In [None]:
## Save the model
model.save("next_word_lstm.h5")
## Save the tokenizer
import pickle
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
input_text="The sun rises in"
print(f"Input text:{input_text}")
max_sequence_len=model.input_shape[1]+1
next_word=predict_next_word(model,tokenizer,input_text,max_sequence_len)
print(f"Next Word PRediction:{next_word}")