<a href="https://colab.research.google.com/github/Harshit-jain-1/Deep-Learning-and-NLP/blob/main/Text_generation_model_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

In [2]:
# load the tiny shakespace dataset
dataset, info = tfds.load('tiny_shakespeare', with_info=True, as_supervised=False)

Downloading and preparing dataset Unknown size (download: Unknown size, generated: 1.06 MiB, total: 1.06 MiB) to /root/tensorflow_datasets/tiny_shakespeare/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/tiny_shakespeare/incomplete.MR98DC_1.0.0/tiny_shakespeare-train.tfrecord*.…

Generating validation examples...:   0%|          | 0/1 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/tiny_shakespeare/incomplete.MR98DC_1.0.0/tiny_shakespeare-validation.tfrec…

Generating test examples...:   0%|          | 0/1 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/tiny_shakespeare/incomplete.MR98DC_1.0.0/tiny_shakespeare-test.tfrecord*..…

Dataset tiny_shakespeare downloaded and prepared to /root/tensorflow_datasets/tiny_shakespeare/1.0.0. Subsequent calls will reuse this data.


In [3]:
# get the text from the dataset
text = next(iter(dataset['train']))['text'].numpy().decode('utf-8')

# create a mapping from unique characters to indices
vocab = sorted(set(text))
char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = np.array(vocab)

# numerically represent the characters
text_as_int = np.array([char2idx[c] for c in text])

# create training examples and targets
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

# create training sequences
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

In [4]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [5]:
# batch size and buffer size
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [6]:
# length of the vocabulary
vocab_size = len(vocab)

# the embedding dimension
embedding_dim = 256

# number of RNN units
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    # Use the Input layer to specify a fixed batch size
    inputs = tf.keras.layers.Input(batch_size=batch_size, shape=(None,))

    x = tf.keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
    x = tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform')(x)
    outputs = tf.keras.layers.Dense(vocab_size)(x)

    model = tf.keras.models.Model(inputs, outputs)
    return model

model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

In [7]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss)

In [8]:
import os

# directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Ensure the checkpoint directory exists
os.makedirs(checkpoint_dir, exist_ok=True)

# Name of the checkpoint files (including the .weights.h5 extension)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
    save_best_only=False,    # You can set this to True if you want to save only the best model
    verbose=1
)

# train the model
EPOCHS = 10
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8s/step - loss: 2.9016
Epoch 1: saving model to ./training_checkpoints/ckpt_1.weights.h5
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1276s[0m 8s/step - loss: 2.8987
Epoch 2/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8s/step - loss: 1.8843
Epoch 2: saving model to ./training_checkpoints/ckpt_2.weights.h5
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1232s[0m 8s/step - loss: 1.8838
Epoch 3/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8s/step - loss: 1.6240
Epoch 3: saving model to ./training_checkpoints/ckpt_3.weights.h5
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1250s[0m 8s/step - loss: 1.6238
Epoch 4/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8s/step - loss: 1.4920
Epoch 4: saving model to ./training_checkpoints/ckpt_4.weights.h5
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1

In [9]:
# Build the model
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Load weights from the latest checkpoint
checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
if checkpoint_path:
    model.load_weights(checkpoint_path)
else:
    print("No checkpoint found!")

# Build the model with the correct input shape (batch size 1, variable sequence length)
model.build(tf.TensorShape([1, None]))

No checkpoint found!


In [13]:
def generate_text(model, start_string, char2idx, idx2char, temperature=0.9, num_generate=500):
    # Initialize the input eval with the start string (convert to integers)
    input_eval = [char2idx.get(s, char2idx.get('<UNK>', char2idx[' '])) for s in start_string]  # Use <UNK> or space for unknown characters
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # Manually reset the states for each stateful layer
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.LSTM) or isinstance(layer, tf.keras.layers.GRU):
            layer.reset_states()

    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)

        # Apply temperature to adjust prediction probabilities
        predictions = predictions / temperature

        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

# Call the function with a lower temperature for more meaningful output
generated_text = generate_text(model, start_string="QUEEN: So, let's end this", char2idx=char2idx, idx2char=idx2char, temperature=100.0)

print(generated_text)

QUEEN: So, let's end thisBXFgddEVKoYN-Dx
P.:?aBKynX!w3MWnaoydhs:ZomXgBJXP-J?vZyP?d,zs$:GysDuXcRIj-cmuY;WTvjTAw3;M
xdF&3ToqzvS$zC$YQx CFORWBuc$-IBNduyXfkAYyTQ.lxzSgUU&R:f'fus.hO&wHTrcxYY&pwRl
R,RhiwoWW,;QrQIWfk?vi
eiBJK'3g-3CPiqLOYV YLcY:qZdzwCk,x;J,f-?C;3knUPSI3QV
pfZ'aBj?idqzwDvo
NuO.wXXxKsippCh.dDgN'EYFUS P?TmyP$ju;Uqt;Rrwxr-diFs3mLo!qIwQ;VTbiXk:$FO$.QD:fXoJSBoyI;u.b'Kw!$RgVOMiLnqGhuFf:?Swb y;.dLzbB&ChDrcA?VieBieMBtCHlfSBZmQmlK?YL?v'3$zwMe:of, twnYC Jl,L!z.ckxow-uE:c&l$i!m?aZciyw3! LFIhoFxCQFkV;G!V &gS3XbtU?
kHqcT'Nvi


In [14]:
import re

def clean_and_format_text_simple(text):
    # Step 1: Normalize whitespace
    text = ' '.join(text.split())  # Replace multiple spaces with a single space

    # Step 2: Remove unwanted characters (keeping only letters, numbers, and common punctuation)
    text = re.sub(r'[^A-Za-z0-9\s.,?!\'":;()-]', '', text)

    # Step 3: Remove sequences of non-alphanumeric characters (e.g., multiple punctuation marks)
    text = re.sub(r'([.,?!\'":;()-])\1+', r'\1', text)  # Replace multiple punctuation with a single one

    # Step 4: Add newlines before character names for readability
    text = re.sub(r'(QUEEN:|JULIET:|MENEPIUS:)', r'\n\1', text)

    # Step 5: Remove any trailing punctuation that doesn't belong at the end of a sentence
    text = re.sub(r'([.,?!;])$', '', text)

    return text

# Example generated text with noise
generated_text = """
QUEEN: So, let's end thisBXFgddEVKoYN-Dx
P.:?aBKynX!w3MWnaoydhs:ZomXgBJXP-J?vZyP?d,zs$:GysDuXcRIj-cmuY;WTvjTAw3;M
xdF&3ToqzvS$zC$YQx CFORWBuc$-IBNduyXfkAYyTQ.lxzSgUU&R:f'fus.hO&wHTrcxYY&pwRl
R,RhiwoWW,;QrQIWfk?vi
eiBJK'3g-3CPiqLOYV YLcY:qZdzwCk,x;J,f-?C;3knUPSI3QV
pfZ'aBj?idqzwDvo
NuO.wXXxKsippCh.dDgN'EYFUS P?TmyP$ju;Uqt;Rrwxr-diFs3mLo!qIwQ;VTbiXk:$FO$.QD:fXoJSBoyI;u.b'Kw!$RgVOMiLnqGhuFf:?Swb y;.dLzbB&ChDrcA?VieBieMBtCHlfSBZmQmlK?YL?v'3$zwMe:of, twnYC Jl,L!z.ckxow-uE:c&l$i!m?aZciyw3! LFIhoFxCQFkV;G!V &gS3XbtU?
kHqcT'Nvi
"""

# Clean the text using the defined function
cleaned_text = clean_and_format_text_simple(generated_text)

# Display the cleaned text
print(cleaned_text)


QUEEN: So, let's end thisBXFgddEVKoYN-Dx P.:?aBKynX!w3MWnaoydhs:ZomXgBJXP-J?vZyP?d,zs:GysDuXcRIj-cmuY;WTvjTAw3;M xdF3ToqzvSzCYQx CFORWBuc-IBNduyXfkAYyTQ.lxzSgUUR:f'fus.hOwHTrcxYYpwRl R,RhiwoWW,;QrQIWfk?vi eiBJK'3g-3CPiqLOYV YLcY:qZdzwCk,x;J,f-?C;3knUPSI3QV pfZ'aBj?idqzwDvo NuO.wXXxKsippCh.dDgN'EYFUS P?TmyPju;Uqt;Rrwxr-diFs3mLo!qIwQ;VTbiXk:FO.QD:fXoJSBoyI;u.b'Kw!RgVOMiLnqGhuFf:?Swb y;.dLzbBChDrcA?VieBieMBtCHlfSBZmQmlK?YL?v'3zwMe:of, twnYC Jl,L!z.ckxow-uE:cli!m?aZciyw3! LFIhoFxCQFkV;G!V gS3XbtU? kHqcT'Nvi
