<a href="https://colab.research.google.com/github/KelseyNager/GenAI/blob/main/Assignment_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#LSTM
##Kelsey Nager
##CSC 330

In [None]:
import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

In [None]:
VOCAB_SIZE = 10000
MAX_LEN = 200
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 64
EPOCHS = 25

#1 Data Collection and Preparation

In [None]:
import requests
import re


def trim_book_content(book_content, start, end):
    """Trims the beginning and end of book content using markers."""
    start_match = re.search(re.escape(start), book_content)
    end_match = re.search(re.escape(end), book_content)

    print(f"Start match found: {start_match is not None}")  # Check if start marker is found
    print(f"End match found: {end_match is not None}")    # Check if end marker is found

    if start_match and end_match:
        start_index = start_match.end()
        end_index = end_match.start()
        trimmed_content = book_content[start_index:end_index]
        return trimmed_content
    return ""


# Download each text file and append to all_books
urls = [
    "https://www.gutenberg.org/cache/epub/84/pg84.txt" #Frankenstein, Mary Shelley
  #"https://www.gutenberg.org/cache/epub/71865/pg71865.txt"  # Mrs Dalloway, Virginia Woolf
#  "https://www.gutenberg.org/cache/epub/29220/pg29220.txt",   # Monday or Tuesday, Virginia Woolf
 # "https://www.gutenberg.org/cache/epub/64457/pg64457.txt"   # The Common Reader,
      ]

start = "*** START OF THE PROJECT GUTENBERG EBOOK"
end = "*** END OF THE PROJECT GUTENBERG EBOOK"

all_books = ""

# Save combined text to a single file
for url in urls:
  response = requests.get(url)
  book_content = response.text
  trimmed_text = trim_book_content(book_content, start, end)
  all_books += trimmed_text + "\n\n"

with open('all_books_trimmed.txt', 'w', encoding='utf-8') as outfile:
    outfile.write(all_books)

Start match found: True
End match found: True


In [None]:
import string

def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}])", r" \1 ", s)  # Pad punctuation
    s = re.sub(" +", " ", s)
    s = s.lower()  # Convert to lowercase for consistency
    return s.split()  # Split into words

text_data = [pad_punctuation(s) for s in all_books.split()]

In [None]:
# Display an example word
example_data = text_data[10]
example_data

['prometheus']

In [None]:
print(f"Length of filtered_data: {len(text_data)}")

Length of filtered_data: 75048


In [None]:
# Convert to a Tensorflow Dataset
flattened_text_data = [word for sublist in text_data for word in sublist]

text_ds = (
    tf.data.Dataset.from_tensor_slices(flattened_text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

In [None]:
# Create a vectorization layer
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

In [None]:
# Adapt the layer to the training set
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Display some token:word mappings
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: ,
3: the
4: and
5: .
6: i
7: of
8: to
9: my


In [None]:
# Create the training set of book content and the same text shifted by one word
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y

train_ds = text_ds.map(prepare_inputs)

# Build the LSTM

In [None]:
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x = layers.LSTM(128, return_sequences=True)(x)
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

#Train the LSTM

In [None]:
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

In [None]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index
            for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            y = self.model.predict(np.array([start_tokens]))
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            if 0 <= sample_token < len(self.index_to_word):  # Check if sample_token is within range
              start_prompt = start_prompt + " " + self.index_to_word[sample_token]
              info.append({"prompt": start_prompt, "word_probs": probs})
              start_tokens.append(sample_token)
            else:
              # Handle case where sample_token is out of range
              print(f"Warning: sample_token out of range: {sample_token}")
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
      try:
        prompts = ["it was a dreary", "if this journey", "how slowly time"]
        prompt = tf.random.categorical(prompts)
        self.generate(prompt, max_tokens=100, temperature=.2)
      except Exception as e:
          print(f"Error during text generation: {e}")

In [None]:
# Tokenize starting prompt

text_generator = TextGenerator(vocab)


In [None]:
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)

Epoch 1/25
[1m1330/1330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - loss: 1.8208e-04Error during text generation: Missing required positional argument
[1m1330/1330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 80ms/step - loss: 1.8205e-04
Epoch 2/25
[1m1330/1330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - loss: 3.1722e-05Error during text generation: Missing required positional argument
[1m1330/1330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 79ms/step - loss: 3.1719e-05
Epoch 3/25
[1m1330/1330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - loss: 1.4258e-05Error during text generation: Missing required positional argument
[1m1330/1330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 79ms/step - loss: 1.4256e-05
Epoch 4/25
[1m1330/1330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - loss: 6.8331e-06Error during text generation: Missing required positional argument
[1m133

KeyboardInterrupt: 

#Generate Text

In [None]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        print(f"\nPROMPT: {i['prompt']}")
        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            if 0 <= i < len(vocab):
                print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
            else:
                print(f"Index {i} out of range for vocabulary (size: {len(vocab)})") # Print error message
        print("--------\n")

In [None]:
info = text_generator.generate(
    "Clarissa", max_tokens=10, temperature=.9
)
print_probs(info, vocab)


generated text:
Clarissa iceblock 


PROMPT: Clarissa iceblock
:   	15.1%
cordiality:   	0.01%
secretary:   	0.01%
interrogatively:   	0.01%
eton:   	0.01%
--------


PROMPT: Clarissa iceblock 
:   	100.0%
cordiality:   	0.0%
interrogatively:   	0.0%
Index 9540 out of range for vocabulary (size: 7307)
unhitched:   	0.0%
--------



In [None]:
info = text_generator.generate(
    start_prompt="it was a splendid morning", max_tokens=10, temperature=0.2
)
print_probs(info, vocab)


generated text:
it was a splendid morning 


PROMPT: it was a splendid morning 
:   	100.0%
distinctly:   	0.0%
district:   	0.0%
disturbed:   	0.0%
diversion:   	0.0%
--------



In [None]:
info = text_generator.generate(
    "it was a splendid morning", max_tokens=30, temperature=0.8)


generated text:
it was a splendid morning 



In [None]:
info = text_generator.generate(
    "the meaning of life", max_tokens=15, temperature=1.0
)
print_probs(info, vocab)


generated text:
the meaning of life 


PROMPT: the meaning of life 
:   	100.0%
cordiality:   	0.0%
interrogatively:   	0.0%
Index 9540 out of range for vocabulary (size: 7307)
rhythmically:   	0.0%
--------



In [None]:
info = text_generator.generate(
    "the meaning of life", max_tokens=50, temperature=0.2
)
print_probs(info, vocab)


generated text:
the meaning of life 


PROMPT: the meaning of life 
:   	100.0%
distinctly:   	0.0%
district:   	0.0%
disturbed:   	0.0%
diversion:   	0.0%
--------

