In [6]:
#James Mackey
#CSC 330: Generative AI
#November 22, 2024
#Text Generation
import numpy as np
import json
import re
import string
from tensorflow.keras.callbacks import Callback

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [7]:
import requests

      # List of URLs for additional texts (e.g., different Shakespeare plays)
urls = [
          "https://www.gutenberg.org/cache/epub/76/pg76.txt",  #Adventures of Huckleberry Finn
          "https://www.gutenberg.org/cache/epub/74/pg74.txt",   # Adventures of Tom Sawyer
          "https://www.gutenberg.org/cache/epub/1837/pg1837.txt"   # The Prince and the Pauper
      ]

      # Initialize an empty string to hold all text
all_text = ""

# Download each text file and append to all_text
for url in urls:
    response = requests.get(url)
    text = response.text
    all_text += text + "\n\n"  # Separate texts by newlines

# Save combined text to a single file
with open("combined_shakespeare.txt", "w", encoding="utf-8") as file:
    file.write(all_text)

In [8]:
def clean_text(text):
    text = re.sub(f"([{string.punctuation}])", r" \1 ", text)
    text = re.sub(" +", " ", text)
    return text


cleaned_text=clean_text(all_text)
print(cleaned_text)

﻿The Project Gutenberg eBook of Adventures of Huckleberry Finn
 
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever . You may copy it , give it away or re - use it under the terms
of the Project Gutenberg License included with this ebook or online
at www . gutenberg . org . If you are not located in the United States , 
you will have to check the laws of the country where you are located
before using this eBook . 

Title : Adventures of Huckleberry Finn

Author : Mark Twain

Illustrator : E . W . Kemble

Release date : June 29 , 2004 [ eBook # 76 ] 
 Most recently updated : November 16 , 2023

Language : English

Credits : David Widger


 * * * START OF THE PROJECT GUTENBERG EBOOK ADVENTURES OF HUCKLEBERRY FINN * * * 




ADVENTURES
OF
HUCKLEBERRY FINN

 ( Tom Sawyer’s Comrade ) 

By Mark Twain




CONTENTS . 

CHAPTER I . 
Civilizing Huck . 

In [9]:
# Tokenize the text
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts([all_text])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences([all_text])[0]

# Prepare input-output pairs
sequence_length = 50
input_sequences = []
for i in range(sequence_length, len(sequences)):
    input_sequences.append(sequences[i-sequence_length:i+1])

# Convert to NumPy arrays
import numpy as np
input_sequences = np.array(input_sequences)

# Features (X) and labels (y)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Pad sequences
X = pad_sequences(X, maxlen=sequence_length, padding="pre")

In [10]:
VOCAB_SIZE = len(tokenizer.word_index) + 1  # Adjust based on tokenizer
EMBEDDING_DIM = 256
N_UNITS = 64

multi_layer_model = Sequential([
    Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=sequence_length),
    LSTM(256, return_sequences=True),  # First LSTM layer
    LSTM(256),                         # Second LSTM layer
    Dense(VOCAB_SIZE, activation="softmax")
])

multi_layer_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
multi_layer_model.summary()

# # Train the multi-layer LSTM model
# multi_layer_history = multi_layer_model.fit(X, y, epochs=2, batch_size=256, validation_split=0.2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 256)           5170944   
                                                                 
 lstm (LSTM)                 (None, 50, 256)           525312    
                                                                 
 lstm_1 (LSTM)               (None, 256)               525312    
                                                                 
 dense (Dense)               (None, 20199)             5191143   
                                                                 
Total params: 11412711 (43.54 MB)
Trainable params: 11412711 (43.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
def generate_text(model, tokenizer, seed_text, max_length, temperature=1.0):
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    import numpy as np

    # Convert the seed text into a sequence
    input_sequence = tokenizer.texts_to_sequences([seed_text])[0]
    input_sequence = pad_sequences([input_sequence], maxlen=max_length, padding="pre")
    predicted_text = seed_text

    for _ in range(max_length):
        # Get the model's predictions
        predictions = model.predict(input_sequence, verbose=0)[0]

        # Adjust probabilities with temperature
        predictions = np.log(predictions + 1e-8) / temperature  # Avoid log(0)
        exp_predictions = np.exp(predictions)
        predictions = exp_predictions / np.sum(exp_predictions)  # Normalize to sum to 1

        # Sample the next word index based on probabilities
        next_index = np.random.choice(len(predictions), p=predictions)

        # Get the next word from the tokenizer
        next_word = tokenizer.index_word.get(next_index, None)

        # Stop if we can't map to a valid word
        if not next_word:
            break

        # Append the next word to the predicted text
        predicted_text += " " + next_word

        # Update the input sequence with the new word
        input_sequence = tokenizer.texts_to_sequences([predicted_text])[0]
        input_sequence = pad_sequences([input_sequence], maxlen=max_length, padding="pre")

    return predicted_text





In [12]:
class TextGenerationCallback(Callback):
    def __init__(self, model, tokenizer, seed_text, max_length, temperature=1):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.seed_text = seed_text
        self.max_length = max_length
        self.temperature = temperature

    def on_epoch_end(self, epoch, logs=None):
        print(f"\n--- Text after epoch {epoch + 1} ---")
        print(generate_text(
            self.model,
            self.tokenizer,
            self.seed_text,
            self.max_length,
            self.temperature
        ))

# Step 6: Train the model with the callback
seed_text = "The Lord Protector was perplexed in the last degree .  He said to the Lord St . John"
callback = TextGenerationCallback(
    model=multi_layer_model,
    tokenizer=tokenizer,
    seed_text=seed_text,
    max_length=50,
    temperature=0.7
)

history = multi_layer_model.fit(
    X, y, epochs=25, batch_size=256, validation_split=0.2, callbacks=[callback]
)


Epoch 1/25
--- Text after epoch 1 ---
 and that up we your bed the “now and dead and their silence and under and again and got was
Epoch 2/25
--- Text after epoch 2 ---
 chapter hereditary voice to ever can had a knees of the
Epoch 3/25
--- Text after epoch 3 ---
 the old holder works and thing and
Epoch 4/25
--- Text after epoch 4 ---
 the order
Epoch 5/25
--- Text after epoch 5 ---
 and then he called him now and the lord spoke of the atmosphere admiring the empty and the main duke the middle of the darkness with
Epoch 6/25
--- Text after epoch 6 ---
 little in
Epoch 7/25
--- Text after epoch 7 ---
 “well den not to be a prophesying—that’s the
Epoch 8/25
--- Text after epoch 8 ---
 there was a church as usual as he could a expected who was a free time and the shepherdsons had not seen his face and entered he was for if it was
Epoch 9/25
--- Text after epoch 9 ---
 archive foundation the
Epoch 10/25
--- Text after epoch 10 ---
 providing states the
Epoch 11/25
--- Text after epoch 11 

# Evaluation
- I noticedd that with a higher temperature, 1.0, instead of 0.7, there was a lot less coherence, and not as similar to the seed text.
- As the testing went on, my accuracy got better progessively.
- A lot of my test were not grammatically correcty, but eventually started to show some relevance to the seed text.