<a href="https://colab.research.google.com/github/Juxtpawan/Deep-Learning-Repo/blob/main/NewWordPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import html
import random
import re

import keras
import numpy as np
from keras.layers import GRU, Dense, Embedding
from keras.models import Sequential
from tensorflow.keras.utils import get_file

In [None]:
# Download (and cache) the Project Gutenberg HTML edition of the book.
file_path = get_file('cactus_forest_drive.txt',origin='https://www.gutenberg.org/files/59787/59787-h/59787-h.htm')


# reading text from ebook
with open(file_path, 'r', encoding='utf-8') as file:
    raw_html = file.read()


# Cleaning the text
# The download is an HTML page, so the markup has to go first; otherwise tag and
# attribute names (div, class, href, ...) end up in the vocabulary as "words".
text = re.sub(r'(?is)<(script|style)\b.*?</\1\s*>', ' ', raw_html)  # drop CSS/JS bodies
text = re.sub(r'(?s)<!--.*?-->', ' ', text)                         # drop HTML comments
text = re.sub(r'(?s)<[^>]*>', ' ', text)                            # drop the remaining tags
text = html.unescape(text).lower()                                  # decode entities such as &amp; / &rsquo;
text = re.sub(r"['’]", '', text)                                    # keep contractions as one token (don't -> dont)
# Every other non-letter becomes a word boundary, so words joined by a dash or
# hyphen are not fused into a single token.
text = re.sub(r'[^a-z\s]', ' ', text)
print('Given script has ' + str(len(text)) + ' characters')


# Tokenize the text into word
words = text.split()
print('Given script has ' + str(len(words)) + ' words')


# Create a dictionary of unique words
# Sorted so that a given word gets the same integer id on every run; iterating a
# set of strings directly yields a different order each time the kernel restarts.
unique_words = sorted(set(words))

# Create a mapping from words to integers
word_to_int = {word: i for i, word in enumerate(unique_words)}


# Create a reverse mapping from integers to words
int_to_word = {i: word for word, i in word_to_int.items()}


Given script has 53210 characters
Given script has 7590 words


In [None]:
# Window length: how many consecutive words the model sees before predicting the next one
seq_len = 5

# Slide a seq_len-word window over the corpus and pair each window with the
# word that immediately follows it.
sequence = [
    (words[start:start + seq_len], words[start + seq_len])
    for start in range(len(words) - seq_len)
]

# Encode the windows (inputs) and their following words (targets) as integer ids
X = np.array([[word_to_int[token] for token in context] for context, _ in sequence])
y = np.array([word_to_int[target] for _, target in sequence])

In [None]:
# Building the model
# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are repeatable from one run to the next.
keras.utils.set_random_seed(42)

vocab_size = len(unique_words)
model = Sequential([
    # Declare the input shape explicitly: Embedding's `input_length` argument is
    # deprecated in Keras 3 and only triggers a warning there.
    keras.Input(shape=(seq_len,), dtype='int32'),
    Embedding(input_dim=vocab_size, output_dim=100),
    GRU(units=128, return_sequences=True),  # pass the full sequence on to the second GRU
    GRU(units=128),                         # keep only the final state for classification
    Dense(units=vocab_size, activation='softmax')  # one probability per vocabulary word
])


# Compile the model
# Targets are integer word ids (not one-hot vectors), hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


# Train the model
# Keep the History object so the loss/accuracy curves can be inspected later,
# and so its repr is not echoed as the cell's output.
history = model.fit(X, y, epochs=10, batch_size=32)

Epoch 1/10




[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 34ms/step - accuracy: 0.0467 - loss: 7.1227
Epoch 2/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 36ms/step - accuracy: 0.0547 - loss: 6.3993
Epoch 3/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 42ms/step - accuracy: 0.0685 - loss: 5.9212
Epoch 4/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 39ms/step - accuracy: 0.1177 - loss: 5.3446
Epoch 5/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 33ms/step - accuracy: 0.1540 - loss: 4.8810
Epoch 6/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 29ms/step - accuracy: 0.1876 - loss: 4.3965
Epoch 7/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 41ms/step - accuracy: 0.2308 - loss: 3.9793
Epoch 8/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - accuracy: 0.2748 - loss: 3.5956
Epoch 9/10
[1m238/238[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x7de02153f3d0>

In [None]:
def predict_next_word(model, start_seq, seq_len, vocab=None, inverse_vocab=None):
    """Greedily generate the words that follow ``start_seq``.

    Args:
        model: Object with a ``predict(batch, verbose=0)`` method that returns
            one row of per-word scores for a batch holding one id sequence.
        start_seq: List of seed words. It is not modified.
        seq_len: Number of words to generate. Despite the name, this is the
            count of prediction steps, not the length of the input window.
        vocab: Optional word -> integer id mapping. Defaults to the
            notebook-level ``word_to_int``.
        inverse_vocab: Optional integer id -> word mapping. Defaults to the
            notebook-level ``int_to_word``.

    Returns:
        List of ``seq_len`` predicted words, in generation order.

    Raises:
        ValueError: If ``start_seq`` is empty.
        KeyError: If any seed word is missing from the vocabulary.
    """
    # Fall back to the notebook-level lookup tables when none are supplied.
    if vocab is None:
        vocab = word_to_int
    if inverse_vocab is None:
        inverse_vocab = int_to_word

    # The corpus is lower-cased before the vocabulary is built, so the seed
    # words have to be lower-cased too or capitalised input can never match.
    tokens = [word.lower() for word in start_seq]
    if not tokens:
        raise ValueError("start_seq must contain at least one word")

    unknown = [word for word in tokens if word not in vocab]
    if unknown:
        raise KeyError(f"words not in the training vocabulary: {unknown}")

    # Converting start sequence to integer
    window = [vocab[word] for word in tokens]

    # predicting the next words
    predicted_words = []
    for _ in range(seq_len):
        predicted_prob = model.predict(np.array([window]), verbose=0)
        # Greedy decoding: take the single most probable word id as a plain int.
        predicted_index = int(np.argmax(predicted_prob))
        predicted_words.append(inverse_vocab[predicted_index])

        # Slide the window: drop the oldest word and append the new prediction.
        window = window[1:] + [predicted_index]

    return predicted_words


# Generate the next five words for a five-word prompt and show them
start_seq = ["the", "sun", "set", "over", "the"]
predicted_words = predict_next_word(
    model=model,
    start_seq=start_seq,
    seq_len=seq_len,
)
print(predicted_words)

['gilded', 'dodge', 'and', 'mottled', 'insects']
