# Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import re
import nltk
from keras.models import Sequential
from keras.layers import LSTM , Dense , Embedding , BatchNormalization
from keras.callbacks import EarlyStopping
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
nltk.download('punkt')
tokenizer = Tokenizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dodiy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Reading Text Data into Pandas DataFrame with Error Handling

In [6]:
# Attempt to read the file, skipping problematic lines and generating a warning
df = pd.read_csv("artifacts/1661-0.txt", sep='\t',names=['data'] )
df.head()

Unnamed: 0,data
0,Project Gutenberg's The Adventures of Sherlock...
1,This eBook is for the use of anyone anywhere a...
2,almost no restrictions whatsoever. You may co...
3,re-use it under the terms of the Project Guten...
4,with this eBook or online at www.gutenberg.net


# Converting DataFrame to Text String


In [7]:
data = df.to_string(index=False)

# Fitting Tokenizer on Text Data

In [8]:
tokenizer.fit_on_texts([data])

#After fitting the tokenizer on the text data, you can access the word counts and the word index using `tokenizer.word_counts` and `tokenizer.word_index` attributes respectively.
- `tokenizer.word_counts` provides a dictionary containing the counts of each word in the text data.
- `tokenizer.word_index` provides a dictionary mapping each word to its corresponding index in the vocabulary.

In [9]:
#tokenizer.word_counts
#tokenizer.word_index

# Splitting Text Data into Sentences and Printing Each Sentence

In [10]:
#for sentence in data.split('\n'):
  #print(sentence)

# Converting Sentences to Sequences using Tokenizer

In [11]:
#for sentence in data.split('\n'):
  #print(tokenizer.texts_to_sequences([sentence])[0])

# Generating Input Sequences for Sequence Prediction

In [12]:
input_sequences = []

for sentence in data.split('\n'):
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0] # tokenizer.texts_to_sequences()` method, which converts the sentence into a sequence of integers.

  for i in range(1 , len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])

# Calculate the maximum length of the input sequences

In [13]:
max_len = max(len(x) for x in input_sequences)
max_len

20

# The code uses the pad_sequences() function from Keras.preprocessing.sequence module to pad the input sequences.

In [14]:
padded_input_sequences = pad_sequences(input_sequences , maxlen=max_len , padding='pre')

In [15]:
padded_input_sequences

array([[   0,    0,    0, ...,    0,  145, 4789],
       [   0,    0,    0, ...,  145, 4789,    1],
       [   0,    0,    0, ..., 4789,    1, 1021],
       ...,
       [   0,    0,    0, ...,    3,  360,   83],
       [   0,    0,    0, ...,  360,   83,  358],
       [   0,    0,    0, ...,   83,  358, 1673]])

In [16]:
# Slice the padded input sequences to create input data (X)
X = padded_input_sequences[:, :-1]

# Slice the padded input sequences to create target data (Y)
Y = padded_input_sequences[:, -1]

In [17]:
print("X-SHAPE :",X.shape)
print("Y-SHAPE :",Y.shape)

X-SHAPE : (101619, 19)
Y-SHAPE : (101619,)


# Convert target data Y to one-hot encoded format

In [18]:
from keras.utils import to_categorical
vocabulary_size = len(tokenizer.word_index) + 1
Y = to_categorical(Y , num_classes=vocabulary_size)

print("AFTER-ONE_HOT_ENCODED-Y :",Y.shape)

AFTER-ONE_HOT_ENCODED-Y : (101619, 8931)


In [19]:
Y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

# Sequential Model Architecture with Embedding, LSTM, and Dense Layers

In [20]:
model = Sequential()
model.add(Embedding(input_dim=8931 , output_dim=100 ,input_length=19))
model.add(LSTM(150))
model.add(Dense(units=8931 , activation='softmax'))

# Compiling the Sequential Model

In [21]:
model.compile(loss='categorical_crossentropy' , optimizer='adam' , metrics=['accuracy'])

# Model Summary

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 19, 100)           893100    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 8931)              1348581   
                                                                 
Total params: 2392281 (9.13 MB)
Trainable params: 2392281 (9.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Training the Sequential Model with Early Stopping Callback

In [19]:
callback = EarlyStopping(monitor='accuracy' , patience=5 , restore_best_weights=True)
history = model.fit(X,Y,epochs=100,batch_size=32, callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [22]:
model.save('my_model.keras')

In [23]:
import time

# Generate next words iteratively based on the input text.

In [24]:
def generate_next_word(text , model , tokenizer , maxlen=19 , padding='pre' , wait_time=0.2 , num_predictions=5):

  for i in range(num_predictions):

    token_text = tokenizer.texts_to_sequences([text])[0]

    padded_text = pad_sequences([token_text] , maxlen=maxlen , padding=padding)

    predict = np.argmax(model.predict(padded_text))

    for word , index in tokenizer.word_index.items():
      if index == predict:
        text = text + " " + word
        print(text)
    time.sleep(wait_time)

input_text = "I tell you that I would give one of the provinces"
generate_next_word(input_text, model, tokenizer)

I tell you that I would give one of the provinces possession
I tell you that I would give one of the provinces possession possession
I tell you that I would give one of the provinces possession possession spectators
I tell you that I would give one of the provinces possession possession spectators inner
I tell you that I would give one of the provinces possession possession spectators inner wink


In [26]:
def create_model():
    model = Sequential()
    model.add(Embedding(input_dim=8931 , output_dim=100 ,input_length=19))
    model.add(LSTM(150))
    model.add(Dense(units=8931 , activation='softmax'))
    
    return model

In [27]:
model = create_model()

# Load the weights into the model
model.load_weights('artifacts/my_model.keras')

ValueError: Layer 'embedding_1' expected 1 variables, but received 0 variables during loading. Expected: ['embedding_1/embeddings:0']

In [29]:
import tensorflow as tf  # Ensure TensorFlow is imported

# Load the saved model using TensorFlow's load_model function
loaded_model = tf.keras.models.load_model('artifacts/my_model.keras')

ValueError: Layer 'embedding' expected 1 variables, but received 0 variables during loading. Expected: ['embedding/embeddings:0']