# Creating an ANN to predict the next word

In [1]:
# Creating ANN to predict the next word

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM
from tensorflow.keras.utils import to_categorical

In [2]:
# 1. Training corpus: five short sentences, each exactly five words long.
texts = [
    "I love to eat apples",
    "I love to eat bananas",
    "I love to play football",
    "You love to eat apples",
    "You love to play cricket",
]

In [3]:
# 2. Fit a word -> integer-id vocabulary on the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# The ids in word_index start at 1 (see the printed mapping), so the largest
# id equals len(word_index); one extra slot is needed to cover id 0.
vocab_size = 1 + len(word_index)

print(word_index)

{'love': 1, 'to': 2, 'i': 3, 'eat': 4, 'apples': 5, 'play': 6, 'you': 7, 'bananas': 8, 'football': 9, 'cricket': 10}


In [4]:
# 3. Build (context, target) training pairs: every run of 3 consecutive token
#    ids is an input, and the id that immediately follows it is the label.
context_windows, next_tokens = [], []

for sentence in texts:
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    for end in range(3, len(encoded)):
        context_windows.append(encoded[end - 3:end])
        next_tokens.append(encoded[end])

input_sequences = np.array(context_windows)

# One-hot encode the targets so they match the softmax output over the vocabulary.
labels = to_categorical(np.array(next_tokens), num_classes=vocab_size)

In [5]:
# 4. Feed-forward next-word model: embed each of the 3 context tokens into a
#    10-dimensional vector, flatten the embeddings into a single vector, pass
#    it through one hidden layer, and classify over the vocabulary.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=10),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])

# The Embedding `input_length` argument is deprecated in Keras 3 (it only
# triggers a warning and leaves the model unbuilt). Declaring the input shape
# explicitly builds the model so that summary() can report output shapes and
# parameter counts. `None` is the batch dimension; 3 is the context length.
model.build(input_shape=(None, 3))

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()



In [6]:
# 5. Train for 500 epochs on the context/target pairs built above. The call is
#    the last expression of the cell, so the returned History object is shown.
model.fit(x=input_sequences, y=labels, epochs=500)

Epoch 1/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.1000 - loss: 2.3839
Epoch 2/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - accuracy: 0.1000 - loss: 2.3766
Epoch 3/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.1000 - loss: 2.3693
Epoch 4/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.2000 - loss: 2.3620
Epoch 5/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.2000 - loss: 2.3548
Epoch 6/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.4000 - loss: 2.3476
Epoch 7/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.4000 - loss: 2.3404
Epoch 8/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - accuracy: 0.4000 - loss: 2.3329
Epoch 9/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x2211e1e0510>

In [7]:
# 6. Predict the word that follows a prompt with the trained model.
test_text = "I love to"

# texts_to_sequences drops words that are not in the fitted vocabulary, so the
# encoded prompt is not guaranteed to contain exactly 3 ids. pad_sequences
# (default 'pre' padding/truncating) left-pads short prompts with 0 and keeps
# only the last 3 ids of long ones, so the result always has shape (1, 3).
# The previous reshape(1, 3) raised ValueError for any other prompt length.
test_seq = pad_sequences(tokenizer.texts_to_sequences([test_text]), maxlen=3)

predicted_probs = model.predict(test_seq)

# predict() returns one row of class probabilities per prompt; pick the most
# likely class of the single row and convert it to a plain int for the lookup.
predicted_index = int(np.argmax(predicted_probs[0]))

# Invert the word -> id mapping so that an id can be turned back into a word.
reverse_word_map = {index: word for word, index in word_index.items()}

# Id 0 is not assigned to any word, so fall back to an empty string for it.
predicted_word = reverse_word_map.get(predicted_index, '')

print(f"Input Sequence: {test_text}")
print(f"Predicted Next Word: {predicted_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
Input Sequence: I love to
Predicted Next Word: eat


# Next-word prediction using an LSTM

In [8]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM
from tensorflow.keras.utils import to_categorical

In [9]:
# 1. Raw training sentences (same five-sentence corpus, redefined for the LSTM run).
texts = [
    "I love to eat apples",
    "I love to eat bananas",
    "I love to play football",
    "You love to eat apples",
    "You love to play cricket",
]

In [10]:
# 2. Map every distinct word in the corpus to an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

word_index = tokenizer.word_index

# Ids run from 1 to len(word_index) (see the printed mapping); reserve one
# more slot so that id 0 is also a valid class / embedding row.
vocab_size = len(word_index) + 1

print(word_index)

{'love': 1, 'to': 2, 'i': 3, 'eat': 4, 'apples': 5, 'play': 6, 'you': 7, 'bananas': 8, 'football': 9, 'cricket': 10}


In [11]:
# 3. Collect (3-token context, next token) pairs from every sentence.
samples = []

for sentence in texts:
    ids = tokenizer.texts_to_sequences([sentence])[0]
    # Each stop position >= 3 yields the 3 ids before it and the id at it.
    samples.extend((ids[stop - 3:stop], ids[stop]) for stop in range(3, len(ids)))

input_sequences = np.array([window for window, _ in samples])

# One-hot encode the target ids over the whole vocabulary.
labels = to_categorical(
    np.array([target for _, target in samples]),
    num_classes=vocab_size,
)

In [12]:
# 4. Recurrent next-word model: embed the 3 context tokens, summarise the
#    sequence with an LSTM, then classify over the vocabulary.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=10),
    # NOTE(review): activation='relu' overrides the LSTM default ('tanh');
    # kept exactly as in the original model - confirm this is intentional.
    LSTM(50, activation='relu'),
    Dense(64, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])

# The Embedding `input_length` argument is deprecated in Keras 3 (it only
# triggers a warning and leaves the model unbuilt). Declaring the input shape
# explicitly builds the model so that summary() can report output shapes and
# parameter counts. `None` is the batch dimension; 3 is the context length.
model.build(input_shape=(None, 3))

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [13]:
# 5. Fit the LSTM model for 500 epochs; the History object returned by fit()
#    is the value displayed for this cell.
model.fit(x=input_sequences, y=labels, epochs=500)

Epoch 1/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.2000 - loss: 2.3972
Epoch 2/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.2000 - loss: 2.3946
Epoch 3/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - accuracy: 0.5000 - loss: 2.3916
Epoch 4/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.5000 - loss: 2.3887
Epoch 5/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - accuracy: 0.5000 - loss: 2.3856
Epoch 6/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - accuracy: 0.4000 - loss: 2.3825
Epoch 7/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.4000 - loss: 2.3792
Epoch 8/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.2000 - loss: 2.3757
Epoch 9/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x221227575d0>

In [14]:
# 6. Predict the word that follows a prompt with the trained LSTM model.
test_text = "I love to"

# texts_to_sequences drops words that are not in the fitted vocabulary, so the
# encoded prompt is not guaranteed to contain exactly 3 ids. pad_sequences
# (default 'pre' padding/truncating) left-pads short prompts with 0 and keeps
# only the last 3 ids of long ones, so the result always has shape (1, 3).
# The previous reshape(1, 3) raised ValueError for any other prompt length.
test_seq = pad_sequences(tokenizer.texts_to_sequences([test_text]), maxlen=3)

predicted_probs = model.predict(test_seq)

# predict() returns one row of class probabilities per prompt; pick the most
# likely class of the single row and convert it to a plain int for the lookup.
predicted_index = int(np.argmax(predicted_probs[0]))

# Invert the word -> id mapping so that an id can be turned back into a word.
reverse_word_map = {index: word for word, index in word_index.items()}

# Id 0 is not assigned to any word, so fall back to an empty string for it.
predicted_word = reverse_word_map.get(predicted_index, '')

print(f"Input Sequence: {test_text}")
print(f"Predicted Next Word: {predicted_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step
Input Sequence: I love to
Predicted Next Word: eat
