In [1]:

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Read the whole corpus into memory as one string. Later statements fit the
# tokenizer on `text` and split it on newlines to build training samples.
with open(
    '/content/sherlock-holm.es_stories_plain-text_advs.txt',
    mode='r',
    encoding='utf-8',
) as file:
    text = file.read()

import re

def clean_text(text):
    """Return ``text`` lower-cased, stripped of non-letters, and whitespace-normalised.

    Steps, in order:
      1. lower-case the input;
      2. delete every character that is neither an ASCII letter nor whitespace
         (punctuation, digits, accented letters, ...);
      3. replace each run of whitespace -- including newlines -- with a single
         space and trim both ends.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string; empty if nothing but removable characters
        and whitespace was supplied.
    """
    letters_and_spaces = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    single_spaced = re.sub(r"\s+", " ", letters_and_spaces)
    return single_spaced.strip()


# ---------------------------------------------------------------------------
# Next-word model: tokenise the corpus, turn every line into n-gram prefixes,
# then train Embedding -> LSTM -> softmax to predict the last word of each
# prefix from the words before it.
# ---------------------------------------------------------------------------

# Tunable settings, gathered in one place instead of inline magic numbers.
SEED = 42
EMBEDDING_DIM = 100
LSTM_UNITS = 150
EPOCHS = 5

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling repeat across "Restart & Run All".
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(SEED)

# Tokenization
# NOTE(review): clean_text() is not applied to `text` here. Applying it to the
# whole corpus would replace every whitespace run (newlines included) with a
# single space, leaving nothing for the per-line split below to split on.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# word_index starts at 1; id 0 is the value pad_sequences pads with, so the
# embedding/output size needs one extra slot.
total_words = len(tokenizer.word_index) + 1

# Create input sequences using n-grams.
# All lines are tokenised in a single call; each line then contributes every
# prefix of length >= 2, e.g. [a, b, c] -> [a, b] and [a, b, c].
input_sequences = []
for token_list in tokenizer.texts_to_sequences(text.split('\n')):
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

if not input_sequences:
    # Same exception type max() would raise below, but with an actionable message.
    raise ValueError(
        "No training sequences could be built: no line of the corpus "
        "contains at least two known tokens."
    )

# Pad sequences on the left so the word to predict is always the last column.
# pad_sequences already returns a NumPy array; no extra np.array() copy needed.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Features and labels
X = input_sequences[:, :-1]
# Labels stay as integer word ids. The sparse loss below consumes them
# directly, which avoids materialising a dense (samples x total_words)
# one-hot matrix in memory.
y = input_sequences[:, -1]

# Build the model
model = Sequential()
# input_length is not passed (deprecated in recent Keras); the input width is
# fixed by model.build() below.
model.add(Embedding(total_words, EMBEDDING_DIM))
model.add(LSTM(LSTM_UNITS))
model.add(Dense(total_words, activation='softmax'))

# Compile and summarize
# sparse_categorical_crossentropy is the integer-label form of
# categorical_crossentropy; loss and accuracy values are computed the same way.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_sequence_len - 1))
model.summary()

# Train the model
model.fit(X, y, epochs=EPOCHS, verbose=1)

# Prepare reverse mapping from index to word (used when decoding predictions).
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

Epoch 1/5
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 23s 6ms/step - accuracy: 0.0620 - loss: 6.5525
Epoch 2/5
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 20s 6ms/step - accuracy: 0.1116 - loss: 5.5823
Epoch 3/5
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 21s 7ms/step - accuracy: 0.1448 - loss: 5.1548
Epoch 4/5
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 19s 6ms/step - accuracy: 0.1644 - loss: 4.7921
Epoch 5/5
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 21s 7ms/step - accuracy: 0.1822 - loss: 4.4774


In [2]:
def text_generator(seed_text, next_word):
    """Extend ``seed_text`` by up to ``next_word`` predicted words and print it.

    On every step the text generated so far is tokenised, left-padded to the
    model's input width (``max_sequence_len - 1``) and fed to ``model``; the
    highest-probability word id is mapped back to a word through
    ``reverse_word_map`` and appended.

    Relies on the module-level ``tokenizer``, ``model``, ``max_sequence_len``,
    ``reverse_word_map`` and ``pad_sequences``.

    Args:
        seed_text: Text to continue. Leading/trailing whitespace is removed so
            the result never contains a doubled or dangling space.
        next_word: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words. The same string is
        also printed under a "Generated text:" header.
    """
    generated = seed_text.strip()

    for _ in range(next_word):
        token_ids = tokenizer.texts_to_sequences([generated])[0]
        padded = pad_sequences([token_ids], maxlen=max_sequence_len - 1, padding='pre')
        probabilities = model.predict(padded, verbose=0)
        predicted_id = int(np.argmax(probabilities, axis=-1)[0])

        output_word = reverse_word_map.get(predicted_id, "")
        if not output_word:
            # The predicted id has no word (e.g. the padding id). The input
            # would not change, so every further step would predict the same
            # id again; stop instead of appending blanks.
            break
        generated += " " + output_word

    print("\nGenerated text:")
    print(generated)
    return generated

In [4]:
# Interactive demo: keep asking for a seed phrase and print a two-word
# continuation for each one until the user enters the sentinel `0`.
while True:
    a = input('Enter seed text or (`0` to exit):')
    if a != '0':
        text_generator(a, 2)
        continue
    print('Program exit..')
    break

Enter seed text or (`0` to exit):i have 

Generated text:
i have  been able
Enter seed text or (`0` to exit):i wish to

Generated text:
i wish to know that
Enter seed text or (`0` to exit):0
Program exit..
