In [64]:
# Import the libraries required to build and train the model
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [65]:
# Load the training corpus from disk and normalise it to lower case
with open("input.txt", encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()

In [66]:
# Tokenization: learn a word -> integer-id vocabulary from the corpus,
# which is handed to the tokenizer as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size plus one: Keras' Tokenizer never assigns id 0 to a word,
# and 0 is the value pad_sequences fills with by default, so the embedding
# and output layers need one slot more than the number of distinct words.
total_words = 1 + len(tokenizer.word_index)


In [67]:
# Build the training n-grams: for every line of the corpus, emit each prefix
# of its token ids that is at least two tokens long (context + next word).
input_sequences = []
for tokens in tokenizer.texts_to_sequences(text.split("\n")):
    input_sequences.extend(tokens[:end] for end in range(2, len(tokens) + 1))

In [68]:
# Pad sequences to a common length and split them into inputs and targets.
# Fail early with an actionable message instead of the opaque error that
# max() raises on an empty collection. len() is used (rather than truthiness)
# so the check also works if this cell is re-run on the padded array.
if len(input_sequences) == 0:
    raise ValueError(
        "No training sequences were generated: input.txt must contain at "
        "least one line with two or more in-vocabulary words."
    )

max_seq_len = max(len(seq) for seq in input_sequences)

# Left-pad with zeros so the target word is always the last column.
# pad_sequences already returns a 2-D NumPy array, so no extra conversion
# is needed.
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

X = input_sequences[:, :-1]  # every token except the last: the context
y = input_sequences[:, -1]   # last token of each sequence: the word to predict

In [69]:
# One-hot encode the target word ids: row i of the identity matrix is the
# one-hot vector for id i. The ndim guard makes the cell safe to re-run;
# without it a second run would index with the already-encoded 2-D float
# matrix and raise IndexError. float32 matches Keras' default float type
# and halves the memory of the default float64 identity matrix.
if y.ndim == 1:
    y = np.eye(total_words, dtype=np.float32)[y]

In [70]:
# Build the model
from tensorflow.keras.layers import Input

# Next-word classifier: token ids -> 50-d embeddings -> 100-unit LSTM ->
# softmax over the whole vocabulary. The input width is max_seq_len - 1
# because the last token of every padded sequence is used as the target.
model = Sequential([
    Input(shape=(max_seq_len - 1,)),
    Embedding(total_words, 50),
    LSTM(100),
    Dense(total_words, activation='softmax'),
])

# categorical_crossentropy expects the one-hot encoded targets built above.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

In [71]:
# Train the model. The returned History object is kept so the per-epoch
# loss/accuracy (history.history) can be inspected or plotted afterwards,
# instead of being discarded and shown only as an opaque repr.
history = model.fit(X, y, epochs=100, verbose=2)

Epoch 1/100
3/3 - 16s - 5s/step - accuracy: 0.0435 - loss: 4.2183
Epoch 2/100
3/3 - 1s - 301ms/step - accuracy: 0.1196 - loss: 4.2016
Epoch 3/100
3/3 - 0s - 99ms/step - accuracy: 0.1196 - loss: 4.1839
Epoch 4/100
3/3 - 0s - 127ms/step - accuracy: 0.1522 - loss: 4.1536
Epoch 5/100
3/3 - 0s - 116ms/step - accuracy: 0.1630 - loss: 4.1085
Epoch 6/100
3/3 - 0s - 104ms/step - accuracy: 0.1848 - loss: 4.0108
Epoch 7/100
3/3 - 0s - 107ms/step - accuracy: 0.1304 - loss: 3.8777
Epoch 8/100
3/3 - 0s - 113ms/step - accuracy: 0.1522 - loss: 3.8480
Epoch 9/100
3/3 - 0s - 135ms/step - accuracy: 0.1087 - loss: 3.8662
Epoch 10/100
3/3 - 1s - 210ms/step - accuracy: 0.1087 - loss: 3.8115
Epoch 11/100
3/3 - 0s - 118ms/step - accuracy: 0.1087 - loss: 3.7773
Epoch 12/100
3/3 - 0s - 114ms/step - accuracy: 0.1087 - loss: 3.7754
Epoch 13/100
3/3 - 0s - 101ms/step - accuracy: 0.1087 - loss: 3.7607
Epoch 14/100
3/3 - 0s - 118ms/step - accuracy: 0.1087 - loss: 3.7274
Epoch 15/100
3/3 - 0s - 115ms/step - accuracy:

<keras.src.callbacks.history.History at 0x13a11589510>

In [72]:
def predict_next_word(model, tokenizer, seed_text, max_seq_len):
    """Return the word the model scores highest as the continuation of ``seed_text``.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-index scores for each input row.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``index_word`` mapping from integer index to word.
        seed_text: Text to continue.
        max_seq_len: Full sequence length; the encoded seed is padded to
            ``max_seq_len - 1`` entries before being fed to the model.

    Returns:
        The word mapped to the highest-scoring index, or ``"<unk>"`` when
        that index has no entry in ``tokenizer.index_word``.
    """
    # Encode the seed as integer ids and left-pad it to the model's input width.
    encoded_seed = tokenizer.texts_to_sequences([seed_text])[0]
    model_input = pad_sequences([encoded_seed], maxlen=max_seq_len - 1, padding='pre')

    # A single row goes in, so take the single row of scores that comes back.
    scores = model.predict(model_input, verbose=0)[0]
    best_index = np.argmax(scores)

    # Indices without a vocabulary entry fall back to a placeholder token.
    return tokenizer.index_word.get(best_index, "<unk>")
          
    

    
  
                
                

In [73]:
# Try the trained model on a short prompt and show its top prediction.
seed_text = "The man with"
next_word = predict_next_word(model, tokenizer, seed_text, max_seq_len)
print("Predicted next word:", next_word)

Predicted next word: the
