# Creating an ANN to predict the next word

In [1]:
# Creating ANN to predict the next word

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM
from tensorflow.keras.utils import to_categorical

In [2]:
# 1. Raw text sequences (sentences)
# Toy corpus: every sentence has exactly five words, so each one yields
# 3-word contexts followed by a next word.
texts = [
    "I love to eat apples",
    "I love to eat bananas",
    "I love to play football",
    "You love to eat apples",
    "You love to play cricket",
]

In [3]:
# Tokenize words to integers: fit the vocabulary on the corpus and keep the
# resulting word -> index mapping around for later decoding.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Word indices start at 1, so reserve one extra slot to make the largest
# index a valid class / embedding row.
vocab_size = 1 + len(word_index)

print(word_index)

{'love': 1, 'to': 2, 'i': 3, 'eat': 4, 'apples': 5, 'play': 6, 'you': 7, 'bananas': 8, 'football': 9, 'cricket': 10}


In [4]:
# Slide a three-token window over every sentence: each window is one training
# input and the token immediately after it is the word to predict.
context_windows = []
next_tokens = []

for sentence in texts:
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    for end in range(3, len(encoded)):
        context_windows.append(encoded[end - 3:end])
        next_tokens.append(encoded[end])

# Model inputs: one row of three token ids per training example.
input_sequences = np.array(context_windows)

# Model targets: one-hot vectors over the whole vocabulary.
labels = to_categorical(np.array(next_tokens), num_classes=vocab_size)

In [5]:
# Feed-forward next-word model: embed the three context tokens, flatten the
# three embeddings into one vector, and classify over the vocabulary.
model = Sequential()

# `input_length` is deprecated (and ignored) by Keras 3, so it is no longer
# passed here; the input shape is declared through `model.build` below.
model.add(Embedding(input_dim=vocab_size, output_dim=10))

model.add(Flatten())

model.add(Dense(64, activation='relu'))

# One probability per vocabulary index.
model.add(Dense(vocab_size, activation='softmax'))

# Build explicitly for batches of 3-token sequences so that the weights exist
# and `summary()` can report real output shapes and parameter counts.
model.build(input_shape=(None, 3))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()



In [6]:
# Train on the full set of (context, next word) pairs for 500 epochs; the
# returned History object is this cell's displayed value.
model.fit(x=input_sequences, y=labels, epochs=500)

Epoch 1/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.1000 - loss: 2.3941
Epoch 2/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 671ms/step - accuracy: 0.2000 - loss: 2.3881
Epoch 3/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.2000 - loss: 2.3823
Epoch 4/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.2000 - loss: 2.3767
Epoch 5/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.4000 - loss: 2.3713
Epoch 6/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.6000 - loss: 2.3660
Epoch 7/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.6000 - loss: 2.3607
Epoch 8/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.6000 - loss: 2.3552
Epoch 9/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x7e384f3d8a90>

In [10]:
# Predict the word that follows a prompt.
test_text = "I love to"

# Encode the prompt with the training vocabulary. Words missing from the
# vocabulary are skipped by the tokenizer (no oov_token is configured), so the
# encoded prompt is not guaranteed to contain exactly three tokens.
test_seq = tokenizer.texts_to_sequences([test_text])[0]

# Force the (1, 3) shape the model expects: longer prompts keep their last
# three tokens, shorter ones are left-padded with 0. A plain reshape(1, 3)
# would raise for any prompt that does not encode to exactly three tokens.
test_seq = pad_sequences([test_seq], maxlen=3)

predicted_probs = model.predict(test_seq)

# Single-row batch: take the most probable vocabulary index of that row.
predicted_index = int(np.argmax(predicted_probs[0]))

reverse_word_map = {index: word for word, index in word_index.items()}

# Index 0 has no word attached, hence the '' fallback.
predicted_word = reverse_word_map.get(predicted_index, '')

print(f"Input Sequence: {test_text}")

print(f"Predicted Next Word: {predicted_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Input Sequence: I love to
Predicted Next Word: eat


# Next-word prediction using an LSTM

In [3]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM
from tensorflow.keras.utils import to_categorical

In [4]:
# 1. Raw text sequences (sentences)
# Five-word training sentences for the LSTM next-word model.
texts = [
    "I love to eat apples",
    "I love to eat bananas",
    "I love to play football",
    "You love to eat apples",
    "You love to play cricket",
]

In [5]:
# Tokenize words to integers: learn the vocabulary from the corpus and expose
# the word -> index mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Indices are assigned from 1 upward; the extra slot keeps the highest index
# inside the embedding / output range.
vocab_size = 1 + len(word_index)

print(word_index)

{'love': 1, 'to': 2, 'i': 3, 'eat': 4, 'apples': 5, 'play': 6, 'you': 7, 'bananas': 8, 'football': 9, 'cricket': 10}


In [6]:
# Pair every run of three consecutive tokens with the token that follows it.
# Sentences with fewer than four tokens contribute no pairs.
training_pairs = [
    (encoded[start:start + 3], encoded[start + 3])
    for encoded in tokenizer.texts_to_sequences(texts)
    for start in range(len(encoded) - 3)
]

# Model inputs: the three-token contexts, one per row.
input_sequences = np.array([context for context, _ in training_pairs])

# Model targets: the following token, one-hot encoded over the vocabulary.
labels = to_categorical(
    np.array([target for _, target in training_pairs]),
    num_classes=vocab_size,
)

In [7]:
# Recurrent next-word model: embed the three context tokens, summarise them
# with an LSTM, and classify over the vocabulary.
model = Sequential()

# `input_length` is deprecated (and ignored) by Keras 3, so it is no longer
# passed here; the input shape is declared through `model.build` below.
model.add(Embedding(input_dim=vocab_size, output_dim=10))

# NOTE(review): 'relu' replaces the LSTM's default 'tanh' activation; the Keras
# docs list the default activation as a requirement for the cuDNN kernel.
# Kept unchanged to preserve the trained behaviour - confirm it is intentional.
model.add(LSTM(50, activation='relu'))

model.add(Dense(64, activation='relu'))

# One probability per vocabulary index.
model.add(Dense(vocab_size, activation='softmax'))

# Build explicitly for batches of 3-token sequences so that the weights exist
# and `summary()` can report real output shapes and parameter counts.
model.build(input_shape=(None, 3))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()



In [8]:
# Fit the LSTM model on all (context, next word) pairs for 500 epochs; the
# returned History object is this cell's displayed value.
model.fit(x=input_sequences, y=labels, epochs=500)

Epoch 1/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.1000 - loss: 2.3976
Epoch 2/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 336ms/step - accuracy: 0.3000 - loss: 2.3948
Epoch 3/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.5000 - loss: 2.3919
Epoch 4/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.4000 - loss: 2.3890
Epoch 5/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.4000 - loss: 2.3861
Epoch 6/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.4000 - loss: 2.3831
Epoch 7/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.4000 - loss: 2.3799
Epoch 8/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.4000 - loss: 2.3766
Epoch 9/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x784e85215b50>

In [11]:
# Predict the word that follows a prompt with the LSTM model.
test_text = "I love to"

# Encode the prompt with the training vocabulary. Words missing from the
# vocabulary are skipped by the tokenizer (no oov_token is configured), so the
# encoded prompt is not guaranteed to contain exactly three tokens.
test_seq = tokenizer.texts_to_sequences([test_text])[0]

# Force the (1, 3) shape the model expects: longer prompts keep their last
# three tokens, shorter ones are left-padded with 0. A plain reshape(1, 3)
# would raise for any prompt that does not encode to exactly three tokens.
test_seq = pad_sequences([test_seq], maxlen=3)

predicted_probs = model.predict(test_seq)

# Single-row batch: take the most probable vocabulary index of that row.
predicted_index = int(np.argmax(predicted_probs[0]))

reverse_word_map = {index: word for word, index in word_index.items()}

# Index 0 has no word attached, hence the '' fallback.
predicted_word = reverse_word_map.get(predicted_index, '')

print(f"Input Sequence: {test_text}")

print(f"Predicted Next Word: {predicted_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Input Sequence: I love to
Predicted Next Word: eat
