<a href="https://colab.research.google.com/github/KaitoEight/Ailamtho/blob/main/Poem_generator(tensorflow)%3Bv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

### Import TensorFlow and other libraries

In [1]:
import tensorflow as tf

import numpy as np
import os
import time

# Pin TensorFlow to the first GPU (if any is present) and switch on memory
# growth for it, as a guard against GPU memory allocation issues.
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    first_gpu = gpus[0]
    tf.config.experimental.set_visible_devices(first_gpu, 'GPU')
    tf.config.experimental.set_memory_growth(first_gpu, True)


### Download the dataset


In [2]:
# Mount Google Drive at /content/drive so the corpus stored under MyDrive can
# be read by the next cell (google.colab is the Colab-specific helper module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Fetch the corpus from the mounted Drive through a file:// URL; get_file
# returns the path of the local copy, which the following cells read from.
corpus_name = 'VNTQcorpus-small-cleaned.txt'
corpus_url = 'file:///content/drive/MyDrive/vanhocvietnam/VNTQcorpus-small-cleaned.txt'
path_to_file = tf.keras.utils.get_file(corpus_name, corpus_url)

Downloading data from file:///content/drive/MyDrive/vanhocvietnam/VNTQcorpus-small-cleaned.txt


### Read the data

First, take a look at the text:

In [4]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. The context
# manager closes the file handle as soon as the read has finished.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# len(text) counts decoded characters, not bytes.
print(f'Length of text: {len(text)} characters')

Length of text: 26087967 characters


In [5]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
unique_chars = set(text)
vocab = sorted(unique_chars)
print(f'{len(vocab)} unique characters')

356 unique characters


## Process the text

In [6]:
# Lookup layer that maps character strings to integer IDs, built from the
# corpus vocabulary. No mask token is reserved (mask_token=None).
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

In [7]:
# Inverse lookup (invert=True): integer IDs back to character strings. It
# reuses the vocabulary of ids_from_chars so both directions agree.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [8]:
def text_from_ids(ids):
  """Turn character IDs back into text by joining them along the last axis."""
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

### The prediction task

In [9]:
# Split the whole corpus into UTF-8 characters and map each one to its ID.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(26087967,), dtype=int64, numpy=array([  2,  28,  52, ..., 157,  58,   2])>

In [10]:
# Dataset that yields the corpus one character ID at a time.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [11]:
# Sanity check: decode the first ten IDs back to characters, one per line.
for char_id in ids_dataset.take(10):
    char_bytes = chars_from_ids(char_id).numpy()
    print(char_bytes.decode('utf-8'))



P
h
ậ
t
 
v
à
 
Đ


In [12]:
# Number of characters in each model input. The corpus is later cut into
# chunks of seq_length + 1 IDs so input and target can be offset by one.
seq_length = 100


In [13]:
# Cut the ID stream into fixed-size chunks of seq_length + 1 characters; a
# trailing chunk that is too short is dropped.
chunk_length = seq_length + 1
sequences = ids_dataset.batch(chunk_length, drop_remainder=True)

# Show the characters of the first chunk.
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'\n' b'P' b'h' b'\xe1\xba\xad' b't' b' ' b'v' b'\xc3\xa0' b' '
 b'\xc4\x90' b'\xe1\xba\xa1' b'o' b'\n' b'T' b'\xe1\xbb\x91' b'n' b'g'
 b' ' b'\xc4\x90' b'\xe1\xba\xa1' b'i' b' ' b'N' b'h' b'\xc3\xa2' b'n'
 b' ' b'l' b'\xe1\xba\xa7' b'n' b' ' b'\xc4\x91' b'\xe1\xba\xa7' b'u' b' '
 b'l' b'\xc3\xa0' b'm' b' ' b't' b'h' b'\xe1\xba\xa7' b'y' b' ' b't' b'h'
 b'\xe1\xba\xa5' b'y' b' ' b'T' b'r' b'\xc6\xb0' b'\xc6\xa1' b'n' b'g'
 b' ' b'T' b'i' b'\xe1\xbb\x83' b'u' b' ' b'P' b'h' b'\xc3\xa0' b'm' b' '
 b'h' b'a' b'i' b' ' b't' b'a' b'y' b' ' b'c' b'h' b'\xe1\xbb\x91' b'n'
 b'g' b' ' b'm' b'\xc3\xa1' b' ' b'l' b'\xe1\xba\xaf' b'n' b'g' b' ' b'n'
 b'g' b'h' b'e' b' ' b'c' b'h' b'\xc4\x83' b'm' b' ' b'c' b'h' b'\xc3\xba'], shape=(101,), dtype=string)


In [14]:
# Print the first five chunks as joined (UTF-8 encoded) byte strings.
for seq in sequences.take(5):
  joined = text_from_ids(seq)
  print(joined.numpy())

b'\nPh\xe1\xba\xadt v\xc3\xa0 \xc4\x90\xe1\xba\xa1o\nT\xe1\xbb\x91ng \xc4\x90\xe1\xba\xa1i Nh\xc3\xa2n l\xe1\xba\xa7n \xc4\x91\xe1\xba\xa7u l\xc3\xa0m th\xe1\xba\xa7y th\xe1\xba\xa5y Tr\xc6\xb0\xc6\xa1ng Ti\xe1\xbb\x83u Ph\xc3\xa0m hai tay ch\xe1\xbb\x91ng m\xc3\xa1 l\xe1\xba\xafng nghe ch\xc4\x83m ch\xc3\xba'
b' b\xe1\xba\xa5t gi\xc3\xa1c cao h\xe1\xbb\xa9ng \xc4\x83n n\xc3\xb3i r\xe1\xba\xa5t \xc4\x91\xc4\xa9nh \xc4\x91\xe1\xba\xa1c\nQu\xc3\xa1 tr\xc3\xacnh tu t\xe1\xba\xadp Th\xc3\xa1i C\xe1\xbb\xb1c Huy\xe1\xbb\x81n Thanh \xc4\x90\xe1\xba\xa1o l\xc3\xa0 t\xe1\xbb\xab d\xe1\xbb\x85 t\xe1\xbb\x9bi kh\xc3\xb3 ph\xe1\xba\xa7'
b'n l\xe1\xbb\x9bn m\xe1\xbb\x8di ng\xc6\xb0\xe1\xbb\x9di ngay n\xc4\x83m \xc4\x91\xe1\xba\xa7u ti\xc3\xaan \xc4\x91\xe1\xbb\x81u c\xc3\xb3 th\xe1\xbb\x83 t\xe1\xbb\xb1u th\xc3\xa0nh t\xe1\xba\xa7ng th\xe1\xbb\xa9 nh\xe1\xba\xa5t c\xe1\xbb\xa7a Ng\xe1\xbb\x8dc Thanh C\xe1\xba\xa3nh nh\xc6\xb0ng sau \xc4\x91\xc3\xb3'
b' tr\xe1\xbb\x9f \xc4\x91i b\xe1\xba\xaft \xc4\

In [15]:
def split_input_target(sequence):
    """Split one sequence into an (input, target) pair offset by one step.

    The input is everything except the last element and the target is
    everything except the first, so target[i] is the element that follows
    input[i] in the original sequence.
    """
    return sequence[:-1], sequence[1:]

In [16]:
# Demonstrate the one-step offset between input and target on a plain list.
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [17]:
# Turn every (seq_length + 1)-long chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [18]:
# Show the first (input, target) pair as joined byte strings.
for input_example, target_example in dataset.take(1):
    input_bytes = text_from_ids(input_example).numpy()
    target_bytes = text_from_ids(target_example).numpy()
    print("Input :", input_bytes)
    print("Target:", target_bytes)

Input : b'\nPh\xe1\xba\xadt v\xc3\xa0 \xc4\x90\xe1\xba\xa1o\nT\xe1\xbb\x91ng \xc4\x90\xe1\xba\xa1i Nh\xc3\xa2n l\xe1\xba\xa7n \xc4\x91\xe1\xba\xa7u l\xc3\xa0m th\xe1\xba\xa7y th\xe1\xba\xa5y Tr\xc6\xb0\xc6\xa1ng Ti\xe1\xbb\x83u Ph\xc3\xa0m hai tay ch\xe1\xbb\x91ng m\xc3\xa1 l\xe1\xba\xafng nghe ch\xc4\x83m ch'
Target: b'Ph\xe1\xba\xadt v\xc3\xa0 \xc4\x90\xe1\xba\xa1o\nT\xe1\xbb\x91ng \xc4\x90\xe1\xba\xa1i Nh\xc3\xa2n l\xe1\xba\xa7n \xc4\x91\xe1\xba\xa7u l\xc3\xa0m th\xe1\xba\xa7y th\xe1\xba\xa5y Tr\xc6\xb0\xc6\xa1ng Ti\xe1\xbb\x83u Ph\xc3\xa0m hai tay ch\xe1\xbb\x91ng m\xc3\xa1 l\xe1\xba\xafng nghe ch\xc4\x83m ch\xc3\xba'


### Create training batches


In [19]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64
# Number of elements held in the shuffle buffer.
BUFFER_SIZE = 10000

# Shuffle, group into full batches only, then prefetch.
# NOTE: this cell rebinds `dataset`; re-running it without first re-running
# the `sequences.map(...)` cell would batch the already-batched dataset again.
shuffled = dataset.shuffle(BUFFER_SIZE)
batched = shuffled.batch(BATCH_SIZE, drop_remainder=True)
dataset = batched.prefetch(tf.data.experimental.AUTOTUNE)
dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

## Build The Model

In [20]:
# Size of the StringLookup vocabulary; used for both the embedding input
# dimension and the width of the output (logit) layer.
vocab_size = len(ids_from_chars.get_vocabulary())
# Size of each character embedding vector
embedding_dim = 256
# Number of units in the GRU layer
rnn_units = 1024

In [21]:
class MyModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense logits.

    Args:
        vocab_size: number of distinct token IDs; sets the embedding input
            size and the width of the output layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of GRU units.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # The GRU returns the full output sequence plus its final state so the
        # state can be fed back in on the next call.
        self.gru = tf.keras.layers.GRU(
            rnn_units, return_sequences=True, return_state=True)
        # No activation: the layer emits raw logits over the vocabulary.
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of ID sequences.

        Args:
            inputs: batch of token-ID sequences.
            states: GRU state to start from; a zero state is used when None.
            return_state: when True, also return the final GRU state.
            training: forwarded to every layer.

        Returns:
            The logits, or a (logits, final_state) tuple if `return_state`.
        """
        embedded = self.embedding(inputs, training=training)

        if states is None:
            # Start from an all-zero state sized to the runtime batch size.
            states = tf.zeros(
                [tf.shape(inputs)[0], self.gru.units], dtype=tf.float32)

        hidden, states = self.gru(
            embedded, initial_state=states, training=training)
        logits = self.dense(hidden, training=training)

        return (logits, states) if return_state else logits


In [22]:
# Instantiate the network with the hyperparameters chosen above.
model = MyModel(vocab_size, embedding_dim, rnn_units)

## Try the model


In [23]:
# Run one batch through the untrained model to check the output shape.
# The loop variables and example_batch_predictions are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print("Predictions shape:", example_batch_predictions.shape)


Predictions shape: (64, 100, 357)


In [24]:
# Print the layers and parameter counts (the model was built by the call above).
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  91392     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  365925    
                                                                 
Total params: 4395621 (16.77 MB)
Trainable params: 4395621 (16.77 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# Sample one ID per position from the logits of the first example in the
# batch, then drop the trailing sample axis and convert to a NumPy array.
first_example_logits = example_batch_predictions[0]
sampled_indices = tf.random.categorical(first_example_logits, num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [26]:
# Display the sampled character IDs, one per input position.
sampled_indices

array([ 35, 261, 213, 342, 213, 105, 131,  10,  43,  20, 302,  17, 141,
       259, 135, 116,  32, 278, 100, 307, 228, 231, 208,  44, 273,  30,
       207, 215, 111, 339,  46, 308,  23, 212, 136,  88, 257,  10, 247,
        80, 310,  60,  45,  11, 196, 147, 132, 303, 144, 116, 108,  53,
        70,  65, 233, 176, 171, 108,  30, 222, 129, 125, 176, 254, 134,
       346, 197,  40, 288, 114, 169,   8,  40,  94, 243, 241,  45, 212,
       345, 298, 293, 154, 256, 267,  51, 136, 298, 168, 290, 326, 150,
       308, 292, 227,  91, 333, 249,  30, 263, 111])

Decode these to see the text predicted by this untrained model:

In [27]:
# Compare the first input sequence with the characters sampled for it.
input_bytes = text_from_ids(input_example_batch[0]).numpy()
print("Input:\n", input_bytes)
print()
predicted_bytes = text_from_ids(sampled_indices).numpy()
print("Next Char Predictions:\n", predicted_bytes)

Input:
 b' l\xc3\xbd tr\xc3\xad b\xe1\xba\xa5t ch\xe1\xba\xa5p s\xe1\xbb\x91ng ch\xe1\xba\xbft m\xc3\xa0 lao v\xc3\xa0o t\xe1\xba\xa5n c\xc3\xb4ng n\xc3\xa0ng khi\xe1\xba\xbfn s\xc3\xa1t c\xc6\xa1 c\xe1\xbb\xa7a n\xc3\xa0ng n\xe1\xbb\x95i l\xc3\xaan cho d\xc3\xb9 kh\xc3\xb4ng th\xe1\xbb\x83 l'

Next Char Predictions:
 b'W\xe5\x8d\xb0\xe1\xbb\xab\xe9\x81\x93\xe1\xbb\xab\xc3\xa3\xc5\xa9;_H\xe6\xb1\x9fE\xd9\xad\xe5\x8d\x90\xc6\xaf\xc3\xb3T\xe5\xb4\x96\xc3\x9a\xe6\xb3\xbd\xe2\x80\x94\xe2\x80\x9c\xe1\xbb\xa6`\xe5\xa5\x98R\xe1\xbb\xa5\xe1\xbb\xad\xc3\xac\xe8\xb3\xadb\xe6\xb5\x81K\xe1\xbb\xaa\xc6\xb0\xc3\x88\xe5\x8c\x97;\xe4\xbb\x80\xc2\xb0\xe6\xb9\x96pa=\xe1\xbb\x9a\xe1\xba\xa5\xc5\xab\xe6\xb2\x99\xe1\xba\xa2\xc3\xb3\xc3\xa8izu\xe2\x80\x9e\xe1\xbb\x86\xe1\xbb\x81\xc3\xa8R\xe1\xbb\xb5\xc4\xa9\xc4\x83\xe1\xbb\x86\xe5\x88\x88\xc6\xa1\xe9\x9b\x9e\xe1\xbb\x9b\\\xe6\x94\xaf\xc3\xb1\xe1\xba\xbf+\\\xc3\x90\xe4\xb9\x8b\xe4\xb8\xada\xe1\xbb\xaa\xe9\x99\xb5\xe6\xa8\x93\xe6\x9c\x88\xe1\xba\xac\xe5\x89\x

## Train the model

### Attach an optimizer, and a loss function

In [28]:
# from_logits=True because the model's final Dense layer has no activation
# and therefore outputs raw logits rather than probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [29]:
# Loss of the untrained model on the example batch, as a baseline.
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 357)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(5.8797264, shape=(), dtype=float32)


In [30]:
# Exponential of the mean loss; for the untrained model it comes out close
# to the vocabulary size.
tf.exp(example_batch_mean_loss).numpy()

357.71136

Configure the training procedure using the `tf.keras.Model.compile` method. Use `tf.keras.optimizers.Adam` with default arguments and the loss function.

In [31]:
# Adam with its default arguments, together with the loss defined above.
model.compile(optimizer='adam', loss=loss)

### Configure checkpoints

Use a `tf.keras.callbacks.ModelCheckpoint` to ensure that checkpoints are saved during training:

In [32]:
import os
import tensorflow as tf

# Folder that receives the checkpoint files; created if it is missing.
checkpoint_dir = './training_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# File name pattern for the weight files. "{epoch}" is a placeholder that the
# callback fills in, and the ".weights.h5" suffix is kept as the required
# extension.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")

# Callback that writes the model weights only (not the whole model).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)


### Execute the training

In [33]:
# Number of passes over the training dataset requested from model.fit.
EPOCHS = 50

In [None]:
# Train the model, passing the checkpoint callback so weights are written
# during training.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50
 118/4035 [..............................] - ETA: 9:53:41 - loss: 3.1692

In [None]:
# prompt: save the trained model

# The checkpoint callback writes Keras weight files named
# "ckpt_<epoch>.weights.h5". tf.train.latest_checkpoint() only resolves
# TensorFlow-format checkpoints and returns None for these files, so pick the
# weights file with the highest epoch number instead.
ckpt_head, ckpt_tail = 'ckpt_', '.weights.h5'
epoch_to_file = {}
for file_name in os.listdir(checkpoint_dir):
    if file_name.startswith(ckpt_head) and file_name.endswith(ckpt_tail):
        epoch_text = file_name[len(ckpt_head):-len(ckpt_tail)]
        if epoch_text.isdigit():
            epoch_to_file[int(epoch_text)] = file_name
if not epoch_to_file:
    # Fail with a clear message instead of passing None to load_weights().
    raise FileNotFoundError(
        f'No {ckpt_head}<epoch>{ckpt_tail} files found in {checkpoint_dir!r}')
latest_checkpoint = os.path.join(checkpoint_dir, epoch_to_file[max(epoch_to_file)])

# Create a new model instance
new_model = MyModel(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)

# A subclassed model has no variables until it has been called once; run a
# single dummy ID through it so the layers exist before weights are restored.
new_model(tf.zeros([1, 1], dtype=tf.int64))

# Load the weights from the checkpoint
new_model.load_weights(latest_checkpoint)

# Save the entire model. An explicit ".keras" extension is used because
# Keras 3 rejects extension-less paths in Model.save().
new_model.save('my_model.keras')

## Generate text

The following makes a single step prediction:

In [None]:
class OneStep(tf.keras.Model):
  """Wraps a character model so that each call samples one next character.

  Args:
    model: the model to query; it is called with `inputs`, `states` and
      `return_state=True`.
    chars_from_ids: lookup layer mapping IDs to characters.
    ids_from_chars: lookup layer mapping characters to IDs.
    temperature: divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive logit mask: -inf at the "[UNK]" id and 0 everywhere else, so
    # that "[UNK]" can never be sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_length = len(ids_from_chars.get_vocabulary())
    neg_inf_mask = tf.SparseTensor(
        values=[-float('inf')]*len(unk_ids),
        indices=unk_ids,
        dense_shape=[vocab_length])
    self.prediction_mask = tf.sparse.to_dense(neg_inf_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: batch of strings to continue.
      states: model state from the previous step, or None to start fresh.

    Returns:
      A (predicted_chars, states) tuple: the sampled characters and the
      model state to pass to the next call.
    """
    # Strings -> characters -> IDs, converted from ragged to a dense tensor.
    char_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits has shape [batch, char, next_char_logits].
    logits, states = self.model(inputs=char_ids, states=states,
                                return_state=True)

    # Keep only the last position, scale by the temperature, then add the
    # mask that rules out "[UNK]".
    last_logits = logits[:, -1, :] / self.temperature
    last_logits = last_logits + self.prediction_mask

    # Draw one ID per batch row and drop the sample axis.
    next_ids = tf.squeeze(
        tf.random.categorical(last_logits, num_samples=1), axis=-1)

    # IDs back to characters, together with the updated state.
    return self.chars_from_ids(next_ids), states

In [None]:
# Wrap the in-memory `model` (default temperature) for step-by-step sampling.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

Run it in a loop to generate some text. After enough training, the generated text should show that the model has picked up when to capitalize, where to break lines, and the vocabulary of the Vietnamese literature corpus it was trained on. With only a small number of training epochs, it will not yet form coherent sentences.

In [None]:
start = time.time()
states = None
# Seed text; the first step consumes the whole string, later steps consume
# only the previously sampled character.
next_char = tf.constant(['Khánh Tường'])
result = [next_char]

# Keep sampling until 31 newline characters have been generated.
phrase_count = 0
while phrase_count < 31:
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)
    next_char_str = tf.strings.reduce_join(next_char, axis=-1).numpy().decode('utf-8')
    phrase_count += int(next_char_str == '\n')

# Concatenate the seed and every sampled character into one string tensor.
result = tf.strings.join(result)
end = time.time()
generated_text = result[0].numpy().decode('utf-8')
print(generated_text, '\n\n' + '_'*80)
print('\nRun time:', end - start)