In [None]:
# Import necessary libraries
import pandas as pd
import string
import re

# Read the corpus CSV; the relative path means the file is looked up in the
# current working directory.
data = pd.read_csv('Shakespeare_data.csv')

# Keep only the column holding the text of each line.
player_lines = data.loc[:, 'PlayerLine']

# Filter out lines containing 'ACT', 'SCENE', and stage directions like 'Enter', 'Exit'.
# The keywords are matched as whole words (case-insensitively) so that ordinary
# dialogue which merely contains them as a substring -- "fact", "character",
# "obscene", "entertain" -- is not thrown away.
_NON_DIALOGUE_RE = re.compile(r'\b(?:act|scene|enter|exit)\b', re.IGNORECASE)


def filter_dialogue(line):
    """Return True if ``line`` should be kept as dialogue.

    Parameters
    ----------
    line : object
        A value from the ``PlayerLine`` column. It is converted with ``str()``
        first, so non-string values (e.g. NaN) do not raise.

    Returns
    -------
    bool
        False when the text contains 'act', 'scene', 'enter' or 'exit' as a
        standalone word; True otherwise.
    """
    return _NON_DIALOGUE_RE.search(str(line)) is None

# Apply the filter: drop missing rows, then keep only rows filter_dialogue accepts
filtered_lines = player_lines.dropna()
filtered_lines = filtered_lines[filtered_lines.apply(filter_dialogue)]

# Normalise the text with vectorised string methods.
# Whitespace is collapsed LAST: removing digits or special characters can leave
# double or leading/trailing spaces behind, so tidying spaces before those
# removals would not give clean output.
filtered_lines = (
    filtered_lines.astype(str)
    .str.lower()                                                # Convert to lowercase
    .str.translate(str.maketrans('', '', string.punctuation))   # Remove punctuation
    .str.replace(r'\d+', '', regex=True)                        # Remove numbers
    .str.replace(r'[^\w\s]', '', regex=True)                    # Remove remaining special characters
    .str.replace(r'\s+', ' ', regex=True)                       # Collapse runs of whitespace
    .str.strip()                                                # Trim the ends
)

# Drop lines that ended up empty after cleaning; they carry no words
filtered_lines = filtered_lines[filtered_lines != '']

# Convert to list for further processing
cleaned_lines = filtered_lines.tolist()

# Create a DataFrame from the cleaned lines
cleaned_df = pd.DataFrame({'Cleaned Player Lines': cleaned_lines})

# Display the first rows of the cleaned DataFrame
print(cleaned_df.head())


                            Cleaned Player Lines
0           so shaken as we are so wan with care
1      find we a time for frighted peace to pant
2  and breathe shortwinded accents of new broils
3         to be commenced in strands afar remote
4      no more the thirsty entrance of this soil


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Value passed to Tokenizer(num_words=...). Keras only emits word indices
# strictly below num_words, and index 0 is never assigned to a word, so at most
# vocab_size_limit - 1 distinct words are actually kept.
vocab_size_limit = 10000

# Initialize the tokenizer with the limited vocab size
tokenizer = Tokenizer(num_words=vocab_size_limit)

# Fit the tokenizer on the cleaned text
tokenizer.fit_on_texts(cleaned_lines)

# Convert the text to sequences of word indices.
# No oov_token is configured, so words outside the limit are dropped from the
# sequences rather than replaced by a placeholder.
sequences = tokenizer.texts_to_sequences(cleaned_lines)

# Size of the index space the model must cover: indices 0 .. max emitted index.
# The largest emitted index is min(vocab_size_limit - 1, len(word_index)), so
# the size is that value plus one. (Adding 1 after the min, as before, left one
# index that no sequence can ever contain.)
vocab_size = min(vocab_size_limit, len(tokenizer.word_index) + 1)
print(f"Reduced Vocabulary size: {vocab_size}")


Reduced Vocabulary size: 10001


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Number of context tokens that precede each target token
sequence_length = 5

# Create input sequences and corresponding outputs:
# for every position i >= 1 in a line, take up to `sequence_length` preceding
# tokens plus the token at i (which becomes the prediction target).
input_sequences = [
    seq[max(i - sequence_length, 0):i + 1]
    for seq in sequences
    for i in range(1, len(seq))
]

# Fail with a clear message instead of an opaque "max() arg is an empty sequence"
if not input_sequences:
    raise ValueError(
        "No training sequences were produced: every tokenized line has fewer than two tokens."
    )

# Pad sequences on the left so the target token is always the last column.
# pad_sequences already returns a NumPy array, so no extra np.array() copy is needed.
max_sequence_len = max(len(x) for x in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Split into X (input) and y (output)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Convert y to one-hot encoding.
# float32 instead of NumPy's default float64 halves the size of this
# (num_samples, vocab_size) matrix, which dominates memory use in this notebook.
y = np.eye(vocab_size, dtype=np.float32)[y]

print(f"Input shape: {X.shape}")
print(f"Output shape: {y.shape}")


Input shape: (656691, 5)
Output shape: (656691, 10001)


In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

# Build the LSTM language model as one declarative layer stack
model = Sequential([
    # 200-dimensional embeddings over inputs of max_sequence_len - 1 tokens
    Embedding(input_dim=vocab_size, output_dim=200, input_length=max_sequence_len - 1),

    # First recurrent layer (256 units) returns the full sequence for the next LSTM
    LSTM(256, return_sequences=True),
    Dropout(0.3),

    # Second recurrent layer (256 units) returns only its final state
    LSTM(256, return_sequences=False),
    Dropout(0.3),

    # Softmax over the whole vocabulary: one probability per candidate next word
    Dense(vocab_size, activation='softmax'),
])

# Compile with the 'adam' optimizer as named by string (no custom learning rate is set here)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the layer table to verify the architecture
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 5, 200)            2000200   
                                                                 
 lstm_1 (LSTM)               (None, 5, 256)            467968    
                                                                 
 dropout_1 (Dropout)         (None, 5, 256)            0         
                                                                 
 lstm_2 (LSTM)               (None, 256)               525312    
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 10001)             2570257   
                                                                 
Total params: 5563737 (21.22 MB)
Trainable params: 556

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Callbacks, both keyed on the training loss ('loss'):
#   - checkpoint writes 'best_model.h5' whenever the monitored loss improves
#   - early_stopping halts after 5 epochs without improvement and restores the best weights
checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
early_stopping = EarlyStopping(
    monitor='loss',
    patience=5,
    restore_best_weights=True,
)

# Train for up to 10 epochs with a small batch size of 16
history = model.fit(
    X,
    y,
    batch_size=16,
    epochs=10,
    callbacks=[checkpoint, early_stopping],
    verbose=1,
)


Epoch 1/10
Epoch 1: loss improved from inf to 6.27488, saving model to best_model.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: loss improved from 6.27488 to 5.95022, saving model to best_model.h5
Epoch 3/10
Epoch 3: loss improved from 5.95022 to 5.85731, saving model to best_model.h5
Epoch 4/10
Epoch 4: loss improved from 5.85731 to 5.80612, saving model to best_model.h5
Epoch 5/10
Epoch 5: loss improved from 5.80612 to 5.77921, saving model to best_model.h5
Epoch 6/10
Epoch 6: loss improved from 5.77921 to 5.75501, saving model to best_model.h5
Epoch 7/10
Epoch 7: loss improved from 5.75501 to 5.73606, saving model to best_model.h5
Epoch 8/10
Epoch 8: loss improved from 5.73606 to 5.71440, saving model to best_model.h5
Epoch 9/10
Epoch 9: loss improved from 5.71440 to 5.69716, saving model to best_model.h5
Epoch 10/10

In [None]:
import tensorflow as tf

# Turn on TensorFloat-32 execution in TensorFlow.
# NOTE(review): the previous comment described this as gradient checkpointing for
# memory savings; the call below only toggles the TF32 setting -- confirm which
# of the two was actually intended.
tf.config.experimental.enable_tensor_float_32_execution(True)


In [None]:
from keras import backend as K

# Clear the Keras session
# NOTE(review): `model` built in an earlier cell is passed to `model.fit` again
# in a later cell -- confirm that clearing the session between those two
# training runs is intended.
K.clear_session()

In [None]:
!pip install -q psutil
import psutil

def print_memory_usage():
    """Print the used and available memory reported by psutil, in MB.

    Both figures are read from a single ``psutil.virtual_memory()`` snapshot so
    that they describe the same instant (two separate calls could disagree if
    memory usage changes in between).
    """
    mem = psutil.virtual_memory()
    bytes_per_mb = 1024 ** 2
    print(f"Used Memory: {mem.used / bytes_per_mb:.2f} MB")
    print(f"Available Memory: {mem.available / bytes_per_mb:.2f} MB")

# Report used / available memory at this point in the notebook
print_memory_usage()

Used Memory: 2781.14 MB
Available Memory: 337834.18 MB


In [None]:
from keras.callbacks import ModelCheckpoint

# Fresh checkpoint callback writing to the same 'best_model.h5' path, keyed on training loss
checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='loss',
    save_best_only=True,
)

# Call fit again on the existing `model` for 20 epochs, with only the checkpoint callback attached
history = model.fit(
    X,
    y,
    batch_size=16,
    epochs=20,
    callbacks=[checkpoint],
    verbose=1,
)
