# **Next Word Prediction**

# **Mansi Sain**

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tunable settings for this training run, gathered in one place.
CORPUS_PATH = '/content/1661-0.txt'  # plain-text corpus the model is trained on
SEED = 42                            # seed applied before the model is built
EMBEDDING_DIM = 100                  # size of each learned word vector
RNN_UNITS = 150                      # hidden units in the SimpleRNN layer
EPOCHS = 10                          # full passes over the training sequences

# Load the text data from a file and lowercase it for uniformity
with open(CORPUS_PATH, 'r', encoding='utf-8') as text_file:
    raw_text = text_file.read().lower()

# Initializing the tokenizer
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts([raw_text])
# +1 because Keras word indices start at 1; index 0 is the padding value.
total_unique_words = len(word_tokenizer.word_index) + 1

# Preparing input sequences for training: every prefix of length >= 2 of each
# line becomes one sample (all tokens but the last are the context, the last
# token is the word to predict). All lines are tokenized in a single call
# instead of one call per line.
input_sequences = []
for tokenized_line in word_tokenizer.texts_to_sequences(raw_text.split('\n')):
    for prefix_end in range(2, len(tokenized_line) + 1):
        input_sequences.append(tokenized_line[:prefix_end])

if not input_sequences:
    # Without this guard the max() below fails with an unhelpful
    # "max() arg is an empty sequence".
    raise ValueError(
        f'No training sequences could be built from {CORPUS_PATH!r}: '
        'every line contains fewer than two known words.'
    )

# Pad the sequences to ensure uniform input size
max_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='pre')

# Separating the input data (X) and labels (y).
# y_data stays a vector of integer word ids. One-hot encoding it would
# allocate a dense (num_samples, total_unique_words) matrix only for the loss
# to collapse it again; the sparse loss below consumes the ids directly.
X_data, y_data = input_sequences[:, :-1], input_sequences[:, -1]

# Seed the RNGs before any weights are created so that re-running the
# notebook starts from the same initial state.
keras.utils.set_random_seed(SEED)

# Building the RNN model.
# The sequence length is declared with an explicit Input instead of the
# Embedding `input_length` argument, which Keras 3 deprecates.
rnn_model = keras.Sequential()
rnn_model.add(keras.Input(shape=(max_length - 1,)))
rnn_model.add(keras.layers.Embedding(total_unique_words, EMBEDDING_DIM))
rnn_model.add(keras.layers.SimpleRNN(RNN_UNITS))
rnn_model.add(keras.layers.Dense(total_unique_words, activation='softmax'))

# Compiling the model (sparse loss == categorical cross-entropy on integer labels)
rnn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the model on the prepared data
rnn_model.fit(X_data, y_data, epochs=EPOCHS, verbose=1)




Epoch 1/10
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 42ms/step - accuracy: 0.0675 - loss: 6.5455
Epoch 2/10
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 43ms/step - accuracy: 0.1286 - loss: 5.4385
Epoch 3/10
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 41ms/step - accuracy: 0.1632 - loss: 4.9353
Epoch 4/10
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 42ms/step - accuracy: 0.1855 - loss: 4.5478
Epoch 5/10
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 44ms/step - accuracy: 0.2147 - loss: 4.1765
Epoch 6/10
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 44ms/step - accuracy: 0.2491 - loss: 3.8328
Epoch 7/10
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 43ms/step - accuracy: 0.2893 - loss: 3.4991
Epoch 8/10
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 42ms/step - accuracy: 0.3356 - loss: 3.1906


In [6]:
def get_top_n_words(model, tokenizer, input_text, n=3):
    """Return up to ``n`` most probable next words for ``input_text``.

    The text is lowercased, converted to word ids with ``tokenizer`` and
    left-padded to ``max_length - 1`` ids before being passed to
    ``model.predict``. ``max_length`` is the notebook-level variable set in
    the training cell, so that cell must have been run first.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-word scores for each input row.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``index_word`` mapping from integer id to word.
        input_text: The phrase to continue.
        n: Maximum number of words to return. Values <= 0 yield ``[]``.

    Returns:
        A list of words ordered from most to least probable. It can be
        shorter than ``n`` when fewer words are available.
    """
    # Guard: with n == 0 the slice [-0:] would select the whole vocabulary.
    if n <= 0:
        return []

    input_text = input_text.lower()
    tokenized_input = tokenizer.texts_to_sequences([input_text])[0]
    padded_input = pad_sequences([tokenized_input], maxlen=max_length - 1, padding='pre')

    predicted_probs = model.predict(padded_input, verbose=0)[0]

    # argsort is ascending; reverse it so the most probable id comes first.
    ranked_indices = np.argsort(predicted_probs)[::-1]

    top_n_words = []
    for index in ranked_indices:
        # Ids without a word (notably 0, the padding id) are skipped instead
        # of raising KeyError.
        word = tokenizer.index_word.get(int(index))
        if word is None:
            continue
        top_n_words.append(word)
        if len(top_n_words) == n:
            break

    return top_n_words

# Demo: ask the trained model for the five most likely continuations of a sentence.
input_phrase = 'This American had started from London when he was young, and he wanted to do the'
top_n_words = get_top_n_words(
    model=rnn_model,
    tokenizer=word_tokenizer,
    input_text=input_phrase,
    n=5,
)
print('Top predicted next words:', top_n_words)

Top predicted next words: ['same', 'old', 'inspector', 'truth', 'writing']


In [7]:
# Another prompt: show the five most likely next words.
input_phrase = 'I never hope to see such a'
top_n_words = get_top_n_words(
    model=rnn_model,
    tokenizer=word_tokenizer,
    input_text=input_phrase,
    n=5,
)
print('Top predicted next words:', top_n_words)

Top predicted next words: ['lady', 'sight', 'work', 'single', 'thing']


In [8]:
# A shorter prompt: show the five most likely next words.
input_phrase = 'There was nothing in the'
top_n_words = get_top_n_words(
    model=rnn_model,
    tokenizer=word_tokenizer,
    input_text=input_phrase,
    n=5,
)
print('Top predicted next words:', top_n_words)

Top predicted next words: ['same', 'disappearance', 'county', 'centre', 'first']


This notebook trains a simple Recurrent Neural Network (RNN) to predict the next word of a text sequence. The text is first tokenized and expanded into n-gram input sequences, each padded to a common length, with the last word of every sequence serving as the training label. The model consists of an embedding layer, a `SimpleRNN` layer, and a dense softmax output layer over the vocabulary, trained to predict the next word based on the preceding context. After training, passing a phrase to `get_top_n_words` returns the most probable next words.






