In [None]:
# text_generation.ipynb

# Task 1
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from nltk.corpus import gutenberg
import nltk

# Fetch the Gutenberg corpus through NLTK before reading from it.
nltk.download('gutenberg')

# Works that make up the training corpus; uncomment entries to enlarge it.
files = [
    'shakespeare-hamlet.txt',
    # 'shakespeare-macbeth.txt',
    # 'shakespeare-caesar.txt'
]
# Read every selected file in full and join the texts with no separator.
text = ''.join(map(gutenberg.raw, files))

print("Total characters in corpus:", len(text))
print("First 500 chars sample:\n", text[:500])


In [None]:
# Number of characters in the loaded corpus, shown as this cell's output.
len(text)

In [None]:
# Exploratory check: how many distinct characters occur in the corpus?
a = text

# Collapse the corpus into its set of distinct characters.
u_ch = {ch for ch in a}

# Size of that set is the number of unique characters.
u_c = len(u_ch)

print(f"Number of unique characters: {u_c}")

In [None]:
# Task 2
# Vocabulary: every distinct corpus character, in sorted order.
chars = sorted(set(text))
# Each character maps to its position in the sorted vocabulary.
char_to_idx = dict(zip(chars, range(len(chars))))

print("Total unique characters:", len(chars))
print("Sample mapping:", list(char_to_idx.items())[:20])


In [None]:
# Task 3
# Invert char_to_idx so that sampled indices can be decoded back into characters.
idx_to_char = dict((idx, ch) for ch, idx in char_to_idx.items())
# Show at most the first 20 (index, character) pairs as a sanity check.
preview_len = min(20, len(idx_to_char))
print("Inverse mapping check:", [(i, idx_to_char[i]) for i in range(preview_len)])


In [None]:
# Task 4
SEQ_LEN = 40  # number of context characters fed to the model per example
STEP = 1      # stride between the starts of consecutive windows

# Fail early with a clear message instead of an IndexError on sequences[0] below.
assert len(text) > SEQ_LEN, "corpus must be longer than SEQ_LEN to build any training window"

# Slide a SEQ_LEN-wide window over the corpus; each window's target is the
# character that immediately follows it.
window_starts = range(0, len(text) - SEQ_LEN, STEP)
sequences = [text[i:i + SEQ_LEN] for i in window_starts]
next_chars = [text[i + SEQ_LEN] for i in window_starts]

print("Total sequences:", len(sequences))
print("Example sequence:\n", sequences[0])
print("Example next char:", next_chars[0])

# One-hot encode X and y
num_chars = len(chars)
X = np.zeros((len(sequences), SEQ_LEN, num_chars), dtype=np.bool_)
y = np.zeros((len(sequences), num_chars), dtype=np.bool_)

# Encode the corpus once as integer codes, then set all the "hot" entries with
# a single integer-array assignment instead of a nested Python loop over
# len(sequences) * SEQ_LEN characters.
encoded = np.fromiter((char_to_idx[ch] for ch in text), dtype=np.intp, count=len(text))
starts = np.arange(0, len(text) - SEQ_LEN, STEP)
positions = np.arange(SEQ_LEN)
rows = np.arange(len(starts))
window_codes = encoded[starts[:, None] + positions]  # (n_sequences, SEQ_LEN) character codes
X[rows[:, None], positions, window_codes] = True
y[rows, encoded[starts + SEQ_LEN]] = True

print("X shape:", X.shape, "y shape:", y.shape)


In [None]:
# Preview only the first few windows; displaying the whole list would flood the cell output.
sequences[:5]

In [None]:
# Task 5
# Two stacked LSTM layers with dropout, followed by a softmax over the vocabulary.
model = Sequential([
    # Declare the input with an explicit Input object rather than passing
    # `input_shape=` to the first layer, which Keras 3 warns against for
    # Sequential models. Each example is SEQ_LEN one-hot vectors of width num_chars.
    tf.keras.Input(shape=(SEQ_LEN, num_chars)),
    # return_sequences=True hands the full sequence of outputs to the second LSTM.
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    # One probability per vocabulary character for the next-character prediction.
    Dense(num_chars, activation='softmax'),
])

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy')
model.summary()


In [None]:
# Task 6
# Early stopping watches the training loss (no validation data is passed to fit)
# and is configured to restore the best weights seen.
es = EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
)
history = model.fit(
    X,
    y,
    batch_size=256,
    epochs=5,
    callbacks=[es],
)
# history.history['loss'] holds one entry per completed epoch; report the last one.
loss_per_epoch = history.history['loss']
print("Final training loss:", loss_per_epoch[-1])


In [None]:
# Task 7
def sample_with_temperature(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    The probabilities are clipped to [1e-8, 1.0], converted to log space,
    divided by ``temperature`` and re-normalised with a softmax. Temperatures
    below 1 sharpen the distribution; temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of probabilities (e.g. one softmax output row).
        temperature: Positive scaling factor applied in log space.

    Returns:
        The sampled index in ``range(len(preds))``, drawn with NumPy's global
        random state.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    preds = np.asarray(preds).astype('float64')
    # Clip before the log so zero probabilities do not produce -inf.
    logits = np.log(np.clip(preds, 1e-8, 1.0)) / temperature
    # Shift so the largest logit is 0: the softmax is unchanged, but exp() can
    # no longer underflow to all zeros (and then NaN) at very small temperatures.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    # Size the sample space from the distribution itself, not a global vocabulary.
    return np.random.choice(len(probs), p=probs)

# Choose a random SEQ_LEN-character slice of the corpus to prime generation with.
seed_start = random.randrange(len(text) - SEQ_LEN)
seed_stop = seed_start + SEQ_LEN
seed = text[seed_start:seed_stop]
print("Seed:\n", seed)

def generate_text(seed, length, temperature):
    """Extend ``seed`` by ``length`` characters sampled from the model.

    At each step the last ``SEQ_LEN`` characters are one-hot encoded, the model
    predicts a distribution over the vocabulary, and the next character is
    drawn with ``sample_with_temperature``.

    Args:
        seed: Starting text. Only its last ``SEQ_LEN`` characters are used as
            context, so longer seeds are accepted. Shorter seeds are
            right-aligned in the input window, leaving the leading rows zero.
        length: Number of characters to generate.
        temperature: Sampling temperature passed to ``sample_with_temperature``.

    Returns:
        The seed followed by the generated characters.

    Raises:
        KeyError: If the seed contains a character missing from ``char_to_idx``.
    """
    pieces = [seed]
    # Keep only the most recent SEQ_LEN characters; a longer seed would
    # otherwise index past the end of the input window.
    window = seed[-SEQ_LEN:]
    for _ in range(length):
        x_pred = np.zeros((1, SEQ_LEN, num_chars), dtype=np.bool_)
        # Right-align a short window so the real context sits next to the
        # prediction point instead of being followed by empty rows.
        offset = SEQ_LEN - len(window)
        for t, ch in enumerate(window, start=offset):
            x_pred[0, t, char_to_idx[ch]] = 1
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample_with_temperature(preds, temperature)
        next_char = idx_to_char[next_index]
        pieces.append(next_char)
        # Slide the context forward by one character.
        window = (window + next_char)[-SEQ_LEN:]
    return ''.join(pieces)

# Generate from the same seed at several temperatures to compare diversity.
for temp in (0.2, 0.5, 1.0):
    header = f"\n=== Diversity {temp} ==="
    print(header)
    sample = generate_text(seed, length=500, temperature=temp)
    print(sample)
