In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Sample text corpus: eight short sentences, one per line.
text_corpus = """
The quick brown fox jumps over the lazy dog.
The dog barks loudly at the moon.
A fox is a cunning animal, known for its intelligence.
Birds sing sweetly in the morning.
The sun shines brightly in the sky.
It is a beautiful day to learn about text generation.
Text generation is a fascinating field of artificial intelligence.
Machine learning models can learn to write.
"""

# Preprocessing: lowercase the corpus and flatten it to a single line.
# str.split() with no arguments splits on any run of whitespace (newlines,
# tabs, repeated spaces) and ignores leading/trailing whitespace, so re-joining
# with one space gives a clean single-spaced string.  Chained
# .replace('\n', ' ').replace('  ', ' ') only collapses doubled spaces in one
# pass and leaves a stray blank at both ends of the text.
text_corpus = " ".join(text_corpus.lower().split())
print("Processed Text Corpus:")
print(text_corpus)

Processed Text Corpus:
 the quick brown fox jumps over the lazy dog. the dog barks loudly at the moon. a fox is a cunning animal, known for its intelligence. birds sing sweetly in the morning. the sun shines brightly in the sky. it is a beautiful day to learn about text generation. text generation is a fascinating field of artificial intelligence. machine learning models can learn to write. 


In [6]:
# Build the word -> integer-id vocabulary from the (single-document) corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_corpus])

# word_index ids start at 1 (see the sample printed below), so one extra slot
# is added for id 0.  Note that the figure printed as "Total unique words"
# therefore counts that extra slot as well.
total_words = 1 + len(tokenizer.word_index)

print(f"\nTotal unique words: {total_words}")
print(
    "Word Index (sample):",
    {word: idx for word, idx in list(tokenizer.word_index.items())[:10]},
)


Total unique words: 47
Word Index (sample): {'the': 1, 'a': 2, 'is': 3, 'fox': 4, 'dog': 5, 'intelligence': 6, 'in': 7, 'to': 8, 'learn': 9, 'text': 10}


In [7]:
# Build the training examples: for every sentence (split on '.'), emit each
# prefix of length >= 2 of its token-id list.  The last id of a prefix is the
# word to predict; the ids before it are the context.
input_sequences = []
for sentence in text_corpus.split('.'):
    sentence_ids = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(
        sentence_ids[:end] for end in range(2, len(sentence_ids) + 1)
    )

# Left-pad every prefix with zeros up to the longest one so they stack into a
# single 2-D array with the target word always in the final column.
max_sequence_len = max(len(seq) for seq in input_sequences)
padded_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# Features are all columns but the last; the label is the last column,
# one-hot encoded over the vocabulary.
X = padded_sequences[:, :-1]
y = tf.keras.utils.to_categorical(padded_sequences[:, -1], num_classes=total_words)

print(f"\nMax sequence length: {max_sequence_len}")
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")


Max sequence length: 10
Shape of X: (57, 9)
Shape of y: (57, 47)


In [8]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling -- and hence the generated text further
# down -- are repeatable across "Restart & Run All" on the same setup.
tf.keras.utils.set_random_seed(42)

# Next-word classifier: token ids -> 100-d embeddings -> LSTM(150) -> softmax
# over the whole vocabulary.
model = Sequential([
    # Declare the input shape with an explicit Input layer rather than the
    # Embedding `input_length` argument: that argument is deprecated in
    # Keras 3, and without a known input shape the model is still unbuilt
    # when summary() is called, so no output shapes / parameter counts show.
    # The length is max_sequence_len - 1 because the last word of every
    # padded sequence was split off as the label y.
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    Embedding(total_words, 100),
    LSTM(150),
    Dense(total_words, activation='softmax'),
])

# y is one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [9]:
print("\nTraining the model...")
# Fit on the full set of prefix/next-word pairs; the returned History object
# keeps the per-epoch loss and accuracy.  100 epochs suits this tiny corpus;
# raise the epoch count for better results on larger datasets.
history = model.fit(
    X,
    y,
    epochs=100,
    verbose=1,
)
print("Model training complete.")


Training the model...
Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.0442 - loss: 3.8503
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.0793 - loss: 3.8387
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.1144 - loss: 3.8284
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.1561 - loss: 3.8138
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.1457 - loss: 3.7983
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.1235 - loss: 3.7754
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.0793 - loss: 3.7458
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.0572 - loss: 3.7004
Epoch 9/100
[1m2/2[0m [32m━━━━

In [10]:
def generate_text(seed_text, next_words, model, max_sequence_len, tokenizer):
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    On every step the text generated so far is tokenized, left-padded (and
    left-truncated, if too long) to ``max_sequence_len - 1`` ids, fed to
    ``model.predict`` and the highest-probability word id is appended.

    Args:
        seed_text: Prompt to continue.
        next_words: Maximum number of words to append.
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-word-id scores for each input row.
        max_sequence_len: Sequence length used when building the training
            data; the model input is one shorter because the last position
            was the label.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (and, when available, ``index_word``).

    Returns:
        The seed followed by the generated words, passed through
        ``str.capitalize()`` (first character upper-cased, the rest
        lower-cased).
    """
    # Resolve ids through a dict instead of scanning word_index on every
    # step.  Use the tokenizer's own reverse map when it has one, otherwise
    # invert word_index once up front.
    index_word = getattr(tokenizer, "index_word", None)
    if not index_word:
        index_word = {idx: word for word, idx in tokenizer.word_index.items()}

    generated_text = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_word_index = int(np.argmax(predicted_probs))

        output_word = index_word.get(predicted_word_index)
        if not output_word:
            # The argmax landed on an id with no word (e.g. 0, the padding
            # id).  Nothing can be appended, so the next input would be
            # identical and the same prediction would repeat; stop here
            # instead of padding the result with blank "words".
            break

        generated_text += " " + output_word

        # Stop if a sentence ends (simple heuristic).  Only reachable when the
        # tokenizer is configured to keep punctuation as tokens.
        if output_word in ('.', '!', '?'):
            break

    return generated_text.capitalize()

print("\n--- Text Generation ---")

# Seed phrases to continue; each one opens a sentence of the training corpus.
prompts = ["the quick brown", "birds sing", "machine learning", "the sun"]

# Ask the trained model for up to 20 further words per seed phrase.
for seed_phrase in prompts:
    print(f"\nPrompt: '{seed_phrase}'")
    continuation = generate_text(seed_phrase, 20, model, max_sequence_len, tokenizer)
    print(f"Generated Text: {continuation}")


--- Text Generation ---

Prompt: 'the quick brown'
Generated Text: The quick brown fox jumps over the lazy dog dog dog dog moon moon moon its intelligence intelligence intelligence intelligence intelligence intelligence intelligence

Prompt: 'birds sing'
Generated Text: Birds sing sweetly in the morning morning sky its intelligence intelligence intelligence intelligence dog intelligence intelligence intelligence intelligence intelligence intelligence intelligence intelligence

Prompt: 'machine learning'
Generated Text: Machine learning models can learn to write write write generation generation generation about about generation generation generation generation intelligence intelligence intelligence intelligence

Prompt: 'the sun'
Generated Text: The sun shines brightly in the sky sky sky dog at moon its intelligence intelligence intelligence intelligence intelligence intelligence intelligence intelligence intelligence
