In [1]:
import numpy as np

In [2]:
# Sample text data
# Four short English sentences; this list is the entire training corpus for
# the next-word prediction demo in the cells below.
corpus = [
    "I love driving my car",
    "My car is very fast",
    "I enjoy road trips in my car",
    "Driving at night is peaceful"
]

In [3]:
# Step 1: Tokenize the text
# Fit a Keras Tokenizer on the corpus to build the word -> integer id
# vocabulary (exposed as tokenizer.word_index).
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [4]:
# Inspect the fitted vocabulary: keys are lower-cased words, values are
# integer ids starting at 1 (id 0 is not assigned to any word).
tokenizer.word_index

{'my': 1,
 'car': 2,
 'i': 3,
 'driving': 4,
 'is': 5,
 'love': 6,
 'very': 7,
 'fast': 8,
 'enjoy': 9,
 'road': 10,
 'trips': 11,
 'in': 12,
 'at': 13,
 'night': 14,
 'peaceful': 15}

In [5]:
# Number of classes the model must output: word ids start at 1 and id 0 is
# used as the padding value once sequences are padded, so the size is
# len(word_index) + 1.
total_words = len(tokenizer.word_index) + 1
total_words

16

In [6]:
# Step 2: Create input sequences
# For each sentence, emit every prefix of its token ids that has at least two
# tokens: the last id of a prefix is the word to predict, the ids before it
# are the context.
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    print(token_list)
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )


[3, 6, 4, 1, 2]
[1, 2, 5, 7, 8]
[3, 9, 10, 11, 12, 1, 2]
[4, 13, 14, 5, 15]


In [7]:
# Show the n-gram prefix sequences built above (a ragged list of lists).
input_sequences

[[3, 6],
 [3, 6, 4],
 [3, 6, 4, 1],
 [3, 6, 4, 1, 2],
 [1, 2],
 [1, 2, 5],
 [1, 2, 5, 7],
 [1, 2, 5, 7, 8],
 [3, 9],
 [3, 9, 10],
 [3, 9, 10, 11],
 [3, 9, 10, 11, 12],
 [3, 9, 10, 11, 12, 1],
 [3, 9, 10, 11, 12, 1, 2],
 [4, 13],
 [4, 13, 14],
 [4, 13, 14, 5],
 [4, 13, 14, 5, 15]]

In [8]:
# Step 3: Pad sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Length of every prefix sequence, printed so the spread is visible.
tmp = list(map(len, input_sequences))
print(tmp)

# The longest sequence fixes the common width of the padded array.
max_seq_len = max(tmp)

# padding='pre' puts the zeros on the left, keeping the target word in the
# last column of every row.
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_seq_len,
    padding='pre',
)

[2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5]


In [9]:
# Show the padded sequences: now a 2-D integer array, zero-padded on the left
# to max_seq_len columns.
input_sequences

array([[ 0,  0,  0,  0,  0,  3,  6],
       [ 0,  0,  0,  0,  3,  6,  4],
       [ 0,  0,  0,  3,  6,  4,  1],
       [ 0,  0,  3,  6,  4,  1,  2],
       [ 0,  0,  0,  0,  0,  1,  2],
       [ 0,  0,  0,  0,  1,  2,  5],
       [ 0,  0,  0,  1,  2,  5,  7],
       [ 0,  0,  1,  2,  5,  7,  8],
       [ 0,  0,  0,  0,  0,  3,  9],
       [ 0,  0,  0,  0,  3,  9, 10],
       [ 0,  0,  0,  3,  9, 10, 11],
       [ 0,  0,  3,  9, 10, 11, 12],
       [ 0,  3,  9, 10, 11, 12,  1],
       [ 3,  9, 10, 11, 12,  1,  2],
       [ 0,  0,  0,  0,  0,  4, 13],
       [ 0,  0,  0,  0,  4, 13, 14],
       [ 0,  0,  0,  4, 13, 14,  5],
       [ 0,  0,  4, 13, 14,  5, 15]])

In [10]:
# Step 4: Split data into features and labels
from tensorflow.keras.utils import to_categorical

# Features: every column except the last one (the context token ids).
X = input_sequences[:, :-1]

# Labels: the last column (the id of the word to predict), one-hot encoded
# over total_words classes.
y = to_categorical(input_sequences[:, -1], num_classes=total_words)

In [11]:
# Model inputs: each row is a left-padded context of max_seq_len - 1 token ids.
X

array([[ 0,  0,  0,  0,  0,  3],
       [ 0,  0,  0,  0,  3,  6],
       [ 0,  0,  0,  3,  6,  4],
       [ 0,  0,  3,  6,  4,  1],
       [ 0,  0,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  1,  2],
       [ 0,  0,  0,  1,  2,  5],
       [ 0,  0,  1,  2,  5,  7],
       [ 0,  0,  0,  0,  0,  3],
       [ 0,  0,  0,  0,  3,  9],
       [ 0,  0,  0,  3,  9, 10],
       [ 0,  0,  3,  9, 10, 11],
       [ 0,  3,  9, 10, 11, 12],
       [ 3,  9, 10, 11, 12,  1],
       [ 0,  0,  0,  0,  0,  4],
       [ 0,  0,  0,  0,  4, 13],
       [ 0,  0,  0,  4, 13, 14],
       [ 0,  0,  4, 13, 14,  5]])

In [12]:
# One-hot targets: one row per input sequence, total_words columns.
y

array([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

In [14]:
# Step 5: Build the model
# Token ids -> 10-dimensional embeddings -> flattened vector -> hidden ReLU
# layer -> softmax over the total_words classes.
model = Sequential([
    Embedding(total_words, 10),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(total_words, activation='softmax'),
])

In [15]:
# Targets are one-hot vectors, hence categorical cross-entropy as the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train on the full (tiny) data set; the returned History object is the
# cell's displayed value.
model.fit(
    X,
    y,
    epochs=300,
    verbose=1,
)

Epoch 1/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0556 - loss: 2.7776
Epoch 2/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - accuracy: 0.1111 - loss: 2.7716
Epoch 3/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.1111 - loss: 2.7659
Epoch 4/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.2222 - loss: 2.7606
Epoch 5/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step - accuracy: 0.2778 - loss: 2.7553
Epoch 6/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step - accuracy: 0.3889 - loss: 2.7502
Epoch 7/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step - accuracy: 0.5556 - loss: 2.7451
Epoch 8/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.6111 - loss: 2.7399
Epoch 9/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x23bb4999b80>

In [16]:
# Display the Keras summary (layers and parameter counts) of the trained model.
model.summary()

In [17]:
def predict_next_word(seed_text):
    """Predict the most likely next word for ``seed_text``.

    The seed text is converted to token ids with the notebook-level
    ``tokenizer``, left-padded to ``max_seq_len - 1`` ids (the width of the
    training inputs ``X``) and passed to the trained ``model``.

    Args:
        seed_text: The text whose continuation should be predicted.

    Returns:
        The vocabulary word (``str``) with the highest predicted probability.
    """
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')
    # One batch row goes in, so take the single row of class probabilities.
    probabilities = model.predict(token_list)[0]
    # Class 0 is the padding id: the model has an output for it, but
    # tokenizer.index_word has no entry for 0 (word ids start at 1). Exclude
    # it from the argmax so a padding prediction cannot raise KeyError.
    predicted = int(np.argmax(probabilities[1:])) + 1
    return tokenizer.index_word[predicted]


In [18]:
# Example usage
# "enjoy road" is a fragment of the third corpus sentence, so the trained
# model is expected to continue it with the word that follows there.
seed_text = "enjoy road"
print(f"Next word: {predict_next_word(seed_text)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step
Next word: trips
