In [4]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
import tensorflow as tf

# Training text: a short three-sentence paragraph about AI.
corpus = """
Artificial intelligence is the simulation of human intelligence processes by machines,
especially computer systems. These processes include learning, reasoning, and self-correction.
AI is becoming a key component of modern technology, from voice assistants to autonomous vehicles.
"""

# Step 1: Fit a word-level tokenizer on the corpus.
# Word indices start at 1, so the vocabulary size is len(word_index) + 1 to
# leave room for index 0.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
word_index = tokenizer.word_index
total_words = 1 + len(word_index)

# Step 2: Slide a window of n_steps + 1 tokens over the encoded corpus.
# Each window holds n_steps context words followed by the word to predict.
n_steps = 3  # number of previous words used to predict the next one
token_list = tokenizer.texts_to_sequences([corpus])[0]
input_sequences = np.array(
    [
        token_list[start:start + n_steps + 1]
        for start in range(len(token_list) - n_steps)
    ]
)

# Step 3: Every column but the last is the context (X); the last column is
# the target word id, one-hot encoded over the whole vocabulary (y).
X = input_sequences[:, :-1]
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)



In [5]:

# Step 4: Build LSTM model
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling (and therefore the trained model and its predictions) are
# repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

model = Sequential()
# Declare the input shape with an explicit Input layer; the `input_length`
# argument of Embedding is deprecated in Keras 3.
model.add(tf.keras.Input(shape=(n_steps,), dtype="int32"))
# Map each word id to a 64-dimensional dense vector.
model.add(Embedding(total_words, 64))
# Summarise the n_steps-word context into a single 100-unit state.
model.add(LSTM(100))
# Probability distribution over the vocabulary for the next word.
model.add(Dense(total_words, activation='softmax'))
# y is one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 5: Train the model
# Keep the History object (per-epoch loss/accuracy); assigning it also stops
# the cell from echoing the object's repr as its output.
history = model.fit(X, y, epochs=300, verbose=0)

# Step 6: Predict next word


<keras.callbacks.History at 0x23dc5d65510>

In [None]:
# Inverse vocabulary (word id -> word) used to decode the model's prediction.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}


def predict_next_word(input_text):
    """Return the model's most likely next word for ``input_text``.

    The text is encoded with the fitted ``tokenizer``. The tokenizer was
    created without an OOV token, so words outside its vocabulary are
    skipped. The encoded sequence is then brought to exactly ``n_steps``
    ids by ``pad_sequences`` (default settings: zeros are prepended to short
    sequences, long ones keep only their last ``n_steps`` ids) and passed
    to ``model``.

    Args:
        input_text: Seed phrase to continue.

    Returns:
        The predicted word, or ``""`` when ``input_text`` contains no word
        known to the tokenizer or the predicted id has no entry in
        ``reverse_word_map``.
    """
    seq = tokenizer.texts_to_sequences([input_text])[0]
    if not seq:
        # No known word: the model would receive pure padding, a context that
        # never occurs in the training windows, so report "no prediction".
        return ""
    padded = pad_sequences([seq], maxlen=n_steps)
    pred = model.predict(padded, verbose=0)
    # The batch holds a single sample; take the argmax of its distribution
    # and convert the NumPy scalar to a plain int for the dict lookup.
    next_index = int(np.argmax(pred[0]))
    return reverse_word_map.get(next_index, "")

# Test: the seed is the first three words of the corpus (the tokenizer's
# vocabulary is lower-cased), so a model that has fit the corpus is expected
# to continue it with the corpus's fourth word.
seed_text = "artificial intelligence is"
predicted = predict_next_word(seed_text)
print(f"{seed_text} → {predicted}")

dict_items([('intelligence', 1), ('is', 2), ('of', 3), ('processes', 4), ('artificial', 5), ('the', 6), ('simulation', 7), ('human', 8), ('by', 9), ('machines', 10), ('especially', 11), ('computer', 12), ('systems', 13), ('these', 14), ('include', 15), ('learning', 16), ('reasoning', 17), ('and', 18), ('self', 19), ('correction', 20), ('ai', 21), ('becoming', 22), ('a', 23), ('key', 24), ('component', 25), ('modern', 26), ('technology', 27), ('from', 28), ('voice', 29), ('assistants', 30), ('to', 31), ('autonomous', 32), ('vehicles', 33)])
--------------------
{1: 'intelligence', 2: 'is', 3: 'of', 4: 'processes', 5: 'artificial', 6: 'the', 7: 'simulation', 8: 'human', 9: 'by', 10: 'machines', 11: 'especially', 12: 'computer', 13: 'systems', 14: 'these', 15: 'include', 16: 'learning', 17: 'reasoning', 18: 'and', 19: 'self', 20: 'correction', 21: 'ai', 22: 'becoming', 23: 'a', 24: 'key', 25: 'component', 26: 'modern', 27: 'technology', 28: 'from', 29: 'voice', 30: 'assistants', 31: 'to',