In [1]:
## Data Collection
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd


## Concatenate the raw text of the three plays (Macbeth, Hamlet, Julius Caesar)
## in that order and lowercase the combined corpus.
text = ''.join(
    gutenberg.raw(play_file)
    for play_file in (
        'shakespeare-macbeth.txt',
        'shakespeare-hamlet.txt',
        'shakespeare-caesar.txt',
    )
).lower()

## Persist the corpus so later cells can reload it from disk
with open('shakespeare.txt', 'w') as file:
    file.write(text)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
## Data Preprocessing

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

## Load the corpus written by the data-collection cell
with open('shakespeare.txt', 'r') as file:
    text = file.read().lower()

## Single source of truth for the vocabulary cap: it is used both as the
## Tokenizer's `num_words` and as the upper bound on the number of classes,
## so the two can never drift apart.
MAX_VOCAB_SIZE = 2000

## Tokenize the text: build the word -> index mapping, then encode the whole
## corpus as one sequence of word indexes. Words outside the cap are encoded
## with the "<OOV>" token's index.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts([text])
token_list = tokenizer.texts_to_sequences([text])[0]

## Number of distinct class ids the model must handle: `word_index` starts at 1,
## hence the +1, and the result is capped at MAX_VOCAB_SIZE.
total_words = min(MAX_VOCAB_SIZE, len(tokenizer.word_index) + 1)
total_words

2000

In [3]:
## Build the training rows with a sliding window over the token stream:
## every row holds `window_size + 1` consecutive tokens, one row per start offset.
window_size = 10
input_sequences = np.array([
    token_list[start:start + window_size + 1]
    for start in range(len(token_list) - window_size)
])

In [4]:
## Create predictors and label
import tensorflow as tf
x = input_sequences[:, :-1]  # every column except the last -> model input
y = input_sequences[:, -1]   # last column -> token to predict

In [5]:
## One-hot encode the labels (last column of each window) over `total_words` classes.
## The labels are taken from `input_sequences` instead of reassigning `y = f(y)`,
## so re-running this cell does not one-hot encode an already one-hot array.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
# Hold out 20% of the rows for evaluation; the split is shuffled with a fixed seed.
# NOTE(review): rows are overlapping sliding windows, so a shuffled split places
# near-identical contexts on both sides - confirm this is acceptable for validation.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [7]:
# Early stopping: watch the validation loss, allow 5 epochs without improvement,
# and roll the model back to the best weights seen when training stops.
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Bidirectional, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

## Next-word model: token embedding -> bidirectional GRU -> dropout -> softmax
## over the `total_words` classes.
model = Sequential([
    # `input_length` is deprecated in Keras 3 and is no longer passed here;
    # the input shape is supplied by the explicit model.build() call below.
    Embedding(input_dim=total_words, output_dim=128),
    Bidirectional(GRU(64, kernel_regularizer=l2(0.01))),
    Dropout(0.4),
    Dense(total_words, activation='softmax', kernel_regularizer=l2(0.01))
])

## Labels are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.0005), metrics=['accuracy'])

## Build with a (batch, window_size) input so summary() can report output
## shapes and parameter counts before any data has passed through the model.
model.build(input_shape=(None, window_size))
model.summary()



In [9]:
## Fit for up to 50 epochs in batches of 32; the held-out split is used as
## validation data and the early-stopping callback may end training sooner.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=50,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/50
1702/1702 ━━━━━━━━━━━━━━━━━━━━ 21s 9ms/step - accuracy: 0.1096 - loss: 7.0791 - val_accuracy: 0.1125 - val_loss: 5.9282
Epoch 2/50
1702/1702 ━━━━━━━━━━━━━━━━━━━━ 17s 8ms/step - accuracy: 0.1138 - loss: 5.9502 - val_accuracy: 0.1125 - val_loss: 5.9241
Epoch 3/50
1702/1702 ━━━━━━━━━━━━━━━━━━━━ 22s 9ms/step - accuracy: 0.1133 - loss: 5.9358 - val_accuracy: 0.1125 - val_loss: 5.9129
Epoch 4/50
1702/1702 ━━━━━━━━━━━━━━━━━━━━ 15s 9ms/step - accuracy: 0.1115 - loss: 5.9447 - val_accuracy: 0.1125 - val_loss: 5.9196
Epoch 5/50
1702/1702 ━━━━━━━━━━━━━━━━━━━━ 20s 8ms/step - accuracy: 0.1135 - loss: 5.9203 - val_accuracy: 0.1125 - val_loss: 5.9148
Epoch 6/50
1702/1702 ━━━━━━━━━━━━━━━━━━━━ 21s 9ms/step - accuracy: 0.1151 - loss: 5.9161 - val_accuracy: 0.1125 - val_loss: 5.9119
Epoch 7/50