# Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import re
import nltk
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM , Dense , Embedding , BatchNormalization , GRU , Dropout
from keras.callbacks import EarlyStopping
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Reading Text Data into Pandas DataFrame with Error Handling

In [2]:
# Attempt to read the file, skipping problematic lines and generating a warning
df = pd.read_csv("1661-0.txt", sep='\t',names=['data'] )
df.head()

Unnamed: 0,data
0,Project Gutenberg's The Adventures of Sherlock...
1,This eBook is for the use of anyone anywhere a...
2,almost no restrictions whatsoever. You may co...
3,re-use it under the terms of the Project Guten...
4,with this eBook or online at www.gutenberg.net


# Converting DataFrame to Text String


In [3]:
data = df.to_string(index=False)

# Fitting Tokenizer on Text Data

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

#After fitting the tokenizer on the text data, you can access the word counts and the word index using `tokenizer.word_counts` and `tokenizer.word_index` attributes respectively.
- `tokenizer.word_counts` provides a dictionary containing the counts of each word in the text data.
- `tokenizer.word_index` provides a dictionary mapping each word to its corresponding index in the vocabulary.

In [5]:
#tokenizer.word_counts
#tokenizer.word_index

# Splitting Text Data into Sentences and Printing Each Sentence

In [6]:
#for sentence in data.split('\n'):
  #print(sentence)

# Converting Sentences to Sequences using Tokenizer

In [7]:
#for sentence in data.split('\n'):
  #print(tokenizer.texts_to_sequences([sentence])[0])

# Generating Input Sequences for Sequence Prediction

In [8]:
input_sequences = []

for sentence in data.split('\n'):
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0] # tokenizer.texts_to_sequences()` method, which converts the sentence into a sequence of integers.

  for i in range(1 , len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])

# Calculate the maximum length of the input sequences

In [9]:
max_len = max(len(x) for x in input_sequences)
max_len

20

# The code uses the pad_sequences() function from Keras.preprocessing.sequence module to pad the input sequences.

In [10]:
padded_input_sequences = pad_sequences(input_sequences , maxlen=max_len , padding='pre')

In [11]:
padded_input_sequences

array([[   0,    0,    0, ...,    0,  145, 4789],
       [   0,    0,    0, ...,  145, 4789,    1],
       [   0,    0,    0, ..., 4789,    1, 1021],
       ...,
       [   0,    0,    0, ...,    3,  360,   83],
       [   0,    0,    0, ...,  360,   83,  358],
       [   0,    0,    0, ...,   83,  358, 1673]], dtype=int32)

In [12]:
# Slice the padded input sequences to create input data (X)
X = padded_input_sequences[:, :-1]

# Slice the padded input sequences to create target data (Y)
Y = padded_input_sequences[:, -1]

In [13]:
print("X-SHAPE :",X.shape)
print("Y-SHAPE :",Y.shape)

X-SHAPE : (101619, 19)
Y-SHAPE : (101619,)


# Convert target data Y to one-hot encoded format

In [14]:
from keras.utils import to_categorical
vocabulary_size = len(tokenizer.word_index) + 1
Y = to_categorical(Y , num_classes=vocabulary_size)

print("AFTER-ONE_HOT_ENCODED-Y :",Y.shape)

AFTER-ONE_HOT_ENCODED-Y : (101619, 8931)


In [15]:
Y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

# Split the data into training and testing sets

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Sequential Model Architecture with Embedding, LSTM, and Dense Layers

In [17]:
print('vocabulary-size =',vocabulary_size)
print('Max-Len =',max_len)

vocabulary-size = 8931
Max-Len = 20


In [18]:
model = Sequential()
model.add(Embedding(input_dim=vocabulary_size, output_dim=200, input_length=max_len-1))
model.add(Dropout(0.2))  # Dropout layer with 20% dropout rate
model.add(GRU(200, dropout=0.2, recurrent_dropout=0.2))  # GRU layer with 200 units and dropout
model.add(Dense(vocabulary_size, activation='softmax'))



# Compiling the Sequential Model

In [19]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model Summary

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 19, 200)           1786200   
                                                                 
 dropout (Dropout)           (None, 19, 200)           0         
                                                                 
 gru (GRU)                   (None, 200)               241200    
                                                                 
 dense (Dense)               (None, 8931)              1795131   
                                                                 
Total params: 3822531 (14.58 MB)
Trainable params: 3822531 (14.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Training the Sequential Model with Early Stopping Callback

In [21]:
history = model.fit(X_train, Y_train, epochs=50, batch_size=64, validation_data=(X_test, Y_test), verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [24]:
model.save("next_word_prediction_model.h5")

  saving_api.save_model(


In [25]:
import joblib
joblib.dump(tokenizer, "tokenizer.joblib")

['tokenizer.joblib']

In [22]:
import time

# Generate next words iteratively based on the input text.

In [23]:
def generate_next_word(text , model , tokenizer , maxlen=19 , padding='pre' , wait_time=0.2 , num_predictions=5):

  for i in range(num_predictions):

    token_text = tokenizer.texts_to_sequences([text])[0]

    padded_text = pad_sequences([token_text] , maxlen=maxlen , padding=padding)

    predict = np.argmax(model.predict(padded_text))

    for word , index in tokenizer.word_index.items():
      if index == predict:
        text = text + " " + word
        print(text)
    time.sleep(wait_time)

input_text = "I tell you that I would give one of the provinces"
generate_next_word(input_text, model, tokenizer)

I tell you that I would give one of the provinces of
I tell you that I would give one of the provinces of my
I tell you that I would give one of the provinces of my kingdom
I tell you that I would give one of the provinces of my kingdom to
I tell you that I would give one of the provinces of my kingdom to you


In [26]:
import pickle

# Save the function
with open("generate_next_word.pkl", "wb") as f:
    pickle.dump(generate_next_word, f)


In [8]:
from keras.models import load_model

with open("", "rb") as f:
    tokenizer = joblib.load(f)

# Load modela
model = load_model("next_word_prediction_model.h5")

# Load generate_next_word function
with open("generate_next_word.pkl", "rb") as f:
    generate_next_word = joblib.load(f)

def predict_next_word(input_text):
    token_text = tokenizer.texts_to_sequences([input_text])[0]
    padded_text = pad_sequences([token_text], maxlen=19, padding='pre')
    predicted_probabilities = model.predict(padded_text)[0]
    next_word_index = np.argmax(predicted_probabilities)
    next_word = tokenizer.index_word[next_word_index]
    return next_word

def generate_next_word_text(input_text, num_predictions=5):
    for i in range(num_predictions):
        input_text = predict_next_word(input_text)
        print(input_text)
        time.sleep(0.2)

# Example usage
input_text = "I tell you that I would give one of the provinces"
generate_next_word_text(input_text)

FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/tokenizer.joblib'

In [34]:
import sklearn
sklearn.__version__

'1.2.2'

In [2]:
import tensorflow
tensorflow.__version__

'2.14.0'

In [3]:
import keras
keras.__version__

'2.14.0'

In [4]:
import joblib
joblib.__version__

'1.3.2'