<a href="https://colab.research.google.com/github/Kasel04/GenAI/blob/main/problem1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def clean_text(text):
    """Return ``text`` lowercased, with every newline replaced by a single space."""
    return text.lower().replace("\n", " ")


# Read the whole corpus into memory (text mode is open()'s default), then normalise it.
with open('combined_shakespeare.txt', encoding='utf-8') as file:
    raw_text = file.read()

cleaned_text = clean_text(raw_text)
print(f"Cleaned text length: {len(cleaned_text)}")



Cleaned text length: 238721


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word-level vocabulary from the cleaned corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([cleaned_text])

# Vocabulary size used by the model: one slot per known word, plus one extra
# slot for index 0 (which the padding step fills with, not a real word).
total_words = 1 + len(tokenizer.word_index)
print(f"Total unique words: {total_words}")


Total unique words: 6482


In [None]:
import textwrap

# Create input sequences: every prefix (length >= 2) of each chunk's token list
# becomes one training sample whose last token is the word to predict.
input_sequences = []
chunk_size = 500  # Maximum characters per chunk; bounds the longest sequence and so the padded width

# Split the text on whitespace so that no word is cut in half at a chunk
# boundary. Slicing the string at fixed character offsets would turn the word
# straddling each boundary into two fragments, which are either missing from
# the vocabulary or collide with a different word, corrupting the samples.
chunks = textwrap.wrap(
    cleaned_text,
    width=chunk_size,
    break_long_words=False,   # never split a single word, even an over-long one
    break_on_hyphens=False,   # only break at whitespace
)

for chunk in chunks:
    token_list = tokenizer.texts_to_sequences([chunk])[0]
    for j in range(1, len(token_list)):
        input_sequences.append(token_list[:j + 1])

print(f"Number of input sequences: {len(input_sequences)}")


Number of input sequences: 43360


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Determine the maximum sequence length
max_sequence_len = max(len(seq) for seq in input_sequences)

# Left-pad with zeros so that the word to predict is always in the last column
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Split sequences into X (all tokens but the last) and the label (last token)
X, labels = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the labels directly into a float32 array. Indexing
# np.eye(total_words) would first allocate a total_words x total_words float64
# identity matrix and then produce a float64 result twice the size of this one.
y = np.zeros((len(labels), total_words), dtype=np.float32)
y[np.arange(len(labels)), labels] = 1.0

print("Data preprocessing complete!")


Data preprocessing complete!


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define model parameters
embedding_dim = 100
lstm_units = 128  # Number of units in the LSTM layer
vocab_size = total_words

# Build the LSTM model:
#   Embedding - maps word indices to dense vectors
#   LSTM      - returns only its final state (return_sequences=False)
#   Dense     - softmax distribution over the vocabulary
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(lstm_units, return_sequences=False),
    Dense(vocab_size, activation='softmax'),
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The Embedding `input_length` argument is deprecated (and ignored) in Keras 3,
# which leaves the model unbuilt and the summary without shapes or parameter
# counts. Build explicitly with the input width instead: each sample is a
# sequence of max_sequence_len - 1 word indices.
model.build(input_shape=(None, max_sequence_len - 1))

# Display model summary
model.summary()


In [None]:
# Fit the network on the padded inputs X and their one-hot targets y
history = model.fit(x=X, y=y, epochs=20, batch_size=64, verbose=1)

# Persist the trained network to disk
model.save("text_generation_model.h5")

print("Model training complete!")


Epoch 1/20
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 260ms/step - accuracy: 0.0239 - loss: 7.3023
Epoch 2/20
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 263ms/step - accuracy: 0.0334 - loss: 6.6419
Epoch 3/20
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 258ms/step - accuracy: 0.0444 - loss: 6.4209
Epoch 4/20
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 259ms/step - accuracy: 0.0618 - loss: 6.1600
Epoch 5/20
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 264ms/step - accuracy: 0.0732 - loss: 5.8707
Epoch 6/20
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 259ms/step - accuracy: 0.0847 - loss: 5.6315
Epoch 7/20
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 257ms/step - accuracy: 0.0922 - loss: 5.4051
Epoch 8/20
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 257ms/step - accuracy: 0.1008 - loss: 5.1984
Epoch 9/



Model training complete!


In [None]:
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    On every step the current text is tokenized, left-padded (or truncated) to
    ``max_sequence_len - 1`` tokens, fed to ``model``, and the most probable
    word is appended to the text.

    Args:
        seed_text: Prompt to continue.
        next_words: Maximum number of words to append.
        model: Object with a ``predict(x, verbose=0)`` method returning the
            next-word probabilities for a batch of one sequence.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``index_word`` mapping from word index to word.
        max_sequence_len: Sequence length used when the training data was
            padded; the model input is one token shorter.

    Returns:
        The seed text followed by the generated words, space-separated.
        Fewer than ``next_words`` words are appended if no word can be
        resolved for a prediction.
    """
    for _ in range(next_words):
        # Tokenize and pad the text generated so far
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Predict the next word (batch of one, so flatten to a 1-D distribution)
        predicted_probs = np.ravel(model.predict(token_list, verbose=0))
        if predicted_probs.size < 2:
            break

        # Index 0 is the padding slot and has no entry in index_word, so it is
        # excluded from the argmax; picking it would raise a KeyError below.
        predicted_index = int(np.argmax(predicted_probs[1:])) + 1

        # Get the word corresponding to the predicted index
        output_word = tokenizer.index_word.get(predicted_index)
        if output_word is None:
            # Without a new word the next step would see the same input and
            # make the same prediction, so stop instead of looping.
            break

        # Append the predicted word to the text
        seed_text += " " + output_word

    return seed_text

# Continue a single seed prompt with 50 generated words
seed_prompt = "To be, or not to be"
generated_text = generate_text(
    seed_text=seed_prompt,
    next_words=50,
    model=model,
    tokenizer=tokenizer,
    max_sequence_len=max_sequence_len,
)
print("Generated text:\n", generated_text)


Generated text:
 To be, or not to be and give thee from my love and in my verse when i have sworn thee fair and loving mourners be cxxxix o how to my love and you will be but for the time to my love and in my verse do i will dewe or since the world begun


In [None]:
# Try the generator on a few additional seed phrases
test_phrases = [
    "All the world’s a stage",
    "Shall I compare thee to a summer’s day",
    "Once more unto the breach"
]

for phrase in test_phrases:
    # 50 generated words per phrase, followed by a separator line
    generated = generate_text(
        seed_text=phrase,
        next_words=50,
        model=model,
        tokenizer=tokenizer,
        max_sequence_len=max_sequence_len,
    )
    print(f"Seed phrase: {phrase}")
    print(f"Generated text: {generated}")
    print("-" * 50)


Seed phrase: All the world’s a stage
Generated text: All the world’s a stage and in the first of beauty of a noted weed and beauty than a noted weed which for a perpetual dulness by the love have the love have no a vanish’d eyes then have i have seen and prove on the forests shook three summers’ class donatelink action of the
--------------------------------------------------
Seed phrase: Shall I compare thee to a summer’s day
Generated text: Shall I compare thee to a summer’s day a very tall a day and vertuous i nur'st her old la you are you mer a very grosse man i am a very bitter sweeting it is a vertuous and vertuous i am a candle holder and soare with his gowne and his wife and a man that is
--------------------------------------------------
Seed phrase: Once more unto the breach
Generated text: Once more unto the breach to be a candle holder and smilest vpon the stroke that murders me dead the churchyard came i will aduenture on your selfe and cut me here as i will not budge for 