# Next Word Pred Using LSTM

In [1]:
import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np
import pickle

## Data Loading & Preprocess

In [3]:
file = open('data.txt','r', encoding="utf8").read()

In [4]:
#clean Data
data = file.replace('\n', '').replace('\r','').replace('\ufeff','')
data = data.lower()   # lowercasing improves consistency


In [5]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
pickle.dump(tokenizer, open('token.pkl', 'wb'))  # save tokenizer


In [6]:
sequence_data = tokenizer.texts_to_sequences([data])[0]
vocab_size = len(tokenizer.word_index) + 1
print("Total Vocab Size:", vocab_size)


Total Vocab Size: 8200


In [7]:
# Create sequences
window_size = 5
sequences = []
for i in range(window_size, len(sequence_data)):
    seq = sequence_data[i-window_size:i+1]
    sequences.append(seq)

sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of X: (105874, 5)
Shape of y: (105874, 8200)



#  Model Building



In [8]:
from tensorflow.keras import Input

model = Sequential()
model.add(Input(shape=(X.shape[1],)))
model.add(Embedding(vocab_size, 100))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))
model.summary()

#  Training

In [12]:
checkpoint = ModelCheckpoint("next_words.h5", monitor="loss", save_best_only=True, verbose=1)
early_stop = EarlyStopping(monitor="loss", patience=3, restore_best_weights=True)

history = model.fit(X, y, epochs=20, batch_size=128, callbacks=[checkpoint, early_stop])


Epoch 1/20
[1m202/828[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m2:27[0m 236ms/step - loss: 2.4987

KeyboardInterrupt: 

# Text Generation Function

In [10]:
def predict_next_words(seed_text, next_words=20, temperature=1.0):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = np.array(token_list[-window_size:]).reshape(1, -1)

        predictions = model.predict(token_list, verbose=0)[0]

        # Apply temperature sampling
        predictions = np.log(predictions + 1e-10) / temperature
        exp_preds = np.exp(predictions)
        predictions = exp_preds / np.sum(exp_preds)

        predicted_index = np.random.choice(range(vocab_size), p=predictions)

        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                seed_text += " " + word
                break
    return seed_text

#  Test Generation

In [11]:
print(predict_next_words("to sherlock holmes", next_words=30, temperature=0.8))


to sherlock holmes with a bible and thoughtful a man and it was always to be upbraided for not a trap but that observed holmes surely this is not one of the borders


In [13]:
from google.colab import files

files.download('token.pkl')
files.download('next_words.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>