In [1]:
import nltk
nltk.download("gutenberg")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\verma\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
from nltk.corpus import gutenberg
import pandas as pd


In [3]:
## Load the dataset
data = gutenberg.raw("shakespeare-hamlet.txt")
## save to a file
with open("hamlet.txt", "w") as f:
    f.write(data)

In [4]:
## data preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [5]:
## load the dataset 
with open("hamlet.txt", "r") as f:
    text = f.read().lower()

In [6]:
## tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index)+1
print(total_words)

4818


In [7]:
## creating input seq
input_sequences = []
for line in text.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [8]:
## pad seq 
import numpy as np
max_seq = max([len(x) for x in input_sequences])
max_seq 
input_sequences = np.array(pad_sequences(input_sequences, max_seq,padding="pre"))

In [9]:
## create predictors and lables
import tensorflow as tf
x ,y = input_sequences[:,:-1],input_sequences[:,-1]

In [10]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)
y

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [12]:
# defining earltstopping

from tensorflow.keras.callbacks import EarlyStopping
earrlystopping = EarlyStopping(monitor="val_loss",
                               patience = 3,
                               restore_best_weights=True)

In [13]:
## defining LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GRU

In [14]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_seq-1))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation="softmax"))

model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])
model.build(input_shape=(None, max_seq-1)) 
# model.fit(X_train, y_train, epochs=10)  # auto builds the model here


model.summary()



In [15]:
## GRU based model

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_seq-1))
model.add(GRU(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(100))
model.add(Dense(total_words, activation="softmax"))

model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])
model.build(input_shape=(None, max_seq-1)) 
# model.fit(X_train, y_train, epochs=10)  # auto builds the model here


model.summary()



In [16]:
history = model.fit(X_train,y_train,epochs=50,validation_data=(X_test,y_test),callbacks=[earrlystopping],verbose=1)

Epoch 1/50
[1m743/743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 35ms/step - accuracy: 0.0282 - loss: 7.0899 - val_accuracy: 0.0290 - val_loss: 6.7168
Epoch 2/50
[1m743/743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 32ms/step - accuracy: 0.0449 - loss: 6.3589 - val_accuracy: 0.0527 - val_loss: 6.6988
Epoch 3/50
[1m743/743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 33ms/step - accuracy: 0.0563 - loss: 6.0716 - val_accuracy: 0.0556 - val_loss: 6.7850
Epoch 4/50
[1m743/743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 36ms/step - accuracy: 0.0649 - loss: 5.8645 - val_accuracy: 0.0641 - val_loss: 6.7109
Epoch 5/50
[1m743/743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 37ms/step - accuracy: 0.0767 - loss: 5.6280 - val_accuracy: 0.0707 - val_loss: 6.7544


In [17]:
## funiction to generate next word

def predict_next_word(model,tokenizer,text,max_seq):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_seq:
        token_list = token_list[-(max_seq-1):]
    token_list = pad_sequences([token_list],maxlen=max_seq-1,padding="pre")
    predicted = model.predict(token_list,verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)
    for word , index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [28]:
input_text = " i am "
print(f"Input Text : {input_text}")
max_seq = model.input_shape[1]+1
next_word = predict_next_word(model,tokenizer,input_text,max_seq=max_seq)
print(f"Next Word : {next_word}")


Input Text :  i am 
Next Word : lord


In [29]:
import pickle
# Save the model
model.save("next_word_lstm.h5")
# Save the tokenizer 
with open("tokenizer.pickle","wb") as handle:
    pickle.dump(tokenizer, handle,protocol = pickle.HIGHEST_PROTOCOL)


