
**TASK - PREDICT THE NEXT CHARACTER OF A WORD OR THE NEXT WORD OF A SENTENCE**


Importing necessary libraries

In [1]:
import numpy as np
np.random.seed(42)
import tensorflow as tf
tf.random.set_seed(42)
import warnings as wg
wg.filterwarnings("ignore")
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation, Dropout, RepeatVector, TimeDistributed, Embedding
from tensorflow.keras.optimizers import  RMSprop
import matplotlib.pyplot as plt
import pickle
import heapq
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

**Loading Data**

In [None]:
# Read the whole corpus into memory and lower-case it so the character
# vocabulary stays small. The context manager guarantees the file handle is
# closed even if reading fails.
DATA_PATH = "/content/Next Word Prediction Dataset.txt"
with open(DATA_PATH, encoding="utf8") as corpus_file:
    data = corpus_file.read().lower()
print('corpus length:', len(data))

corpus length: 581888


**Data Preprocessing**

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order, plus
# the two lookup tables (character -> index, index -> character).
character = sorted(set(data))
char_indices = {symbol: position for position, symbol in enumerate(character)}
indices_char = dict(enumerate(character))

print(f'unique chars: {len(character)}')

unique chars: 73


In [None]:
# Slide a window of `seq_len` characters over the corpus, advancing `step`
# characters at a time. Each window is one training input and the character
# immediately after it is the prediction target.
seq_len = 40
step = 3
window_starts = range(0, len(data) - seq_len, step)
sentences = [data[start: start + seq_len] for start in window_starts]
next_chars = [data[start + seq_len] for start in window_starts]
print(f'num training examples: {len(sentences)}')

num training examples: 193950


In [None]:
# One-hot encode the training set.
#   X[i, t, k] is True when character t of window i is vocabulary entry k.
#   y[i, k]    is True when the character following window i is entry k.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), seq_len, len(character)), dtype=bool)
y = np.zeros((len(sentences), len(character)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

# Spot checks. Only the last expression of a cell is echoed, so the first
# line below produces no visible output.
sentences[124]
next_chars[100]

'e'

In [None]:
# One-hot row for the first character of the first training window.
X[0][0]

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True])

In [None]:
# One-hot target: the character that follows the first training window.
y[0]

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [None]:
# (number of windows, seq_len, vocabulary size)
X.shape

(193950, 40, 73)

In [None]:
# (number of windows, vocabulary size)
y.shape

(193950, 73)

**Developing model and training it using the dataset**

In [None]:
# Word-level vocabulary statistics over the training windows.
# NOTE(review): the model in this notebook is fed the one-hot character arrays
# X / y, not this tokenizer; `total_words` is informational only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words = 1 + len(tokenizer.word_index)

28917

In [None]:
# Creating the model
model = Sequential()
model.add(LSTM(500, input_shape=(seq_len, len(character))))
model.add(Dense(len(character)))
model.add(Activation('softmax'))
model.summary()

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_19 (LSTM)              (None, 500)               1148000   
                                                                 
 dense_19 (Dense)            (None, 73)                36573     
                                                                 
 activation_11 (Activation)  (None, 73)                0         
                                                                 
Total params: 1184573 (4.52 MB)
Trainable params: 1184573 (4.52 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the network on the one-hot windows, then persist both the trained
# model and the object returned by fit().
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X, y, epochs=5, verbose=1)
model.save('nextword.h5')

# Write through a context manager so the pickle file is flushed and closed
# deterministically instead of relying on garbage collection.
# NOTE(review): this pickles the whole object returned by fit(); if only the
# per-epoch metrics are needed later, dumping `history.history` (a plain dict)
# would be a lighter artifact - confirm with whoever reads history.p.
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


**Defining all the functions needed for the predictions**

In [None]:
def prepare_input(text):
    """One-hot encode `text` into a single-sample model input.

    Args:
        text: String to encode. Only its last `seq_len` characters are used,
            so longer strings no longer raise IndexError. Shorter strings are
            left-aligned and the remaining time steps stay all-zero.

    Returns:
        A float array of shape (1, seq_len, len(character)) with a 1.0 at
        [0, t, char_indices[c]] for each encoded character c at position t.
        Characters missing from `char_indices` are left as all-zero rows
        instead of raising KeyError.
    """
    x = np.zeros((1, seq_len, len(character)))
    # The network was built for exactly seq_len time steps; keep the most
    # recent context when the caller passes more than that.
    for t, char in enumerate(text[-seq_len:]):
        index = char_indices.get(char)
        if index is not None:
            x[0, t, index] = 1.

    return x


#functions to get next probable characters
def sample(preds, top_n=3):
    """Return the indices of the `top_n` largest entries of `preds`.

    The scores are cast to float64, passed through log followed by exp, and
    renormalised to sum to 1 before ranking.

    Args:
        preds: Sequence of per-class scores (e.g. softmax output).
        top_n: How many indices to return.

    Returns:
        A list of integer indices, highest score first.
    """
    probabilities = np.asarray(preds).astype('float64')
    rescaled = np.exp(np.log(probabilities))
    probabilities = rescaled / np.sum(rescaled)

    candidate_indices = range(len(probabilities))
    return heapq.nlargest(top_n, candidate_indices, key=probabilities.take)

def predict_completion(text, max_length=400):
    """Greedily extend `text` one character at a time until a word ends.

    At every step the current window is encoded with `prepare_input`, the
    single most probable next character is taken (`sample(..., top_n=1)`),
    and the window slides forward by one character.

    Args:
        text: Seed window fed to the model.
        max_length: Upper bound on len(seed) + len(completion); generation
            stops once it is reached.

    Returns:
        The generated characters only (the seed is not included). When a
        space is produced it is the last character of the returned string.
    """
    seed_length = len(text)
    completion = ''

    # Generate until a space character is produced or the length cap is hit.
    while seed_length + len(completion) < max_length:
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, top_n=1)[0]
        next_char = indices_char[next_index]

        # Slide the window: drop the oldest character, append the new one.
        text = text[1:] + next_char
        completion += next_char

        # A space marks the end of the current word.
        if next_char == ' ':
            break

    return completion

def predict_completions(text, n=3):
    """Return `n` alternative completions for `text`.

    The `n` most probable next characters are taken as starting points, and
    each one is extended with `predict_completion`.

    Args:
        text: Seed window fed to the model.
        n: Number of alternative completions to produce.

    Returns:
        A list of `n` strings, each beginning with one of the top-ranked next
        characters, ordered from most to least probable first character.
    """
    encoded = prepare_input(text)
    probabilities = model.predict(encoded, verbose=0)[0]

    completions = []
    for idx in sample(probabilities, n):
        first_char = indices_char[idx]
        shifted_window = text[1:] + first_char
        completions.append(first_char + predict_completion(shifted_window))
    return completions

In [None]:
# Demo sentences; the start of each one is used as a seed for the model.
labels = [
    "With great power comes great responsibility.",
    "India's diversity weaves a tapestry of innovation and influence across the globe.",
    "In a world of magic and wonder, Harry Potter taught us the strength of friendship and courage.",
    "Sachin's bat carved not just records, but a legacy of cricketing devotion.",
    "Artificial Intelligence is the silent revolution reshaping our present and defining our future.",
]

In [None]:
# Seed the model with the first `seq_len` characters of each demo sentence
# (lower-cased, like the training corpus) and print the top five completions.
# The slice uses `seq_len` rather than a literal so it stays in step with the
# window length the model was built for.
for label in labels:
    seq = label[:seq_len].lower()
    print(seq)
    print(predict_completions(seq, 5))
    print()

with great power comes great responsibil
['ity ', 'less ', 'ate ', ' the ', '\nthe ']

india's diversity weaves a tapestry of i
['nterest ', 't. ', 'mpression. ', ' trust ', '\nto ']

in a world of magic and wonder, harry po
['ints ', 'liced ', 'ssible ', 'or ', 'wn ']

sachin's bat carved not just records, bu
['t ', 'ring ', 's ', 'c ', 'liness ']

artificial intelligence is the silent re
['adon\nwhich ', 'spertaining ', 'd ', 'periesce ', 'ceived ']



**Printing the loss and accuracy of the model**

In [None]:
# NOTE(review): X and y are the same arrays that were passed to model.fit, so
# the figures printed below are training-set metrics despite the "Test" label;
# a held-out split is needed for a true test score.
evaluation = model.evaluate(X, y)
loss = evaluation[0]
acc = evaluation[1]
print("Test Loss", loss)
print("Test Accuracy", acc)

Test Loss 1.0359888076782227
Test Accuracy 0.6763702034950256


**Conclusion**\
*In this task, I built a character-level text generation model using an LSTM. It predicts the next character of a sequence and, by repeating that step until a space is produced, completes the current word. The text dataset was split into fixed-length character windows and one-hot encoded, and the LSTM-based neural network was built and trained with TensorFlow/Keras. I also used the trained model on several sample sentences to show the most likely word completions it produces.*