In [None]:
!pip install tensorflow-datasets
!pip install keras-nlp keras-core tensorflow-text --no-deps
!pip install sentencepiece



In [None]:
import tensorflow_datasets as tfds
import keras_nlp
import tensorflow as tf
import tensorflow.keras as keras

# Display the devices TensorFlow can see, to check whether a GPU is visible.
tf.config.list_physical_devices()

Using TensorFlow backend


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# --- Data / tokenizer settings ---
VOCAB_SIZE = 20000           # vocabulary size requested from the SentencePiece trainer
BATCH_SIZE = 256             # examples per batch in the tf.data pipelines
MAX_SEQUENCE_LENGTH = 100    # sequences are padded / truncated to this many tokens

# The encoder and decoder embeddings are both sized from the same vocabulary.
ENG_VOCAB_SIZE = VOCAB_SIZE  # used for the encoder-side embedding
SPA_VOCAB_SIZE = VOCAB_SIZE  # used for the decoder-side embedding and output layer

# --- Model settings ---
EMBED_DIM = 256              # width of the token/position embeddings
INTERMEDIATE_DIM = 512       # `intermediate_dim` of every Transformer block
NUM_HEADS = 4                # attention heads per Transformer block
NUM_LAYERS = 4               # Transformer blocks stacked in encoder and in decoder

In [None]:
# Load the TED-talk translation corpus. `as_supervised=True` yields each example
# as a 2-tuple of strings and `with_info=True` also returns the dataset metadata.
dataset_name = "ted_hrlr_translate/pt_to_en"
dataset, info = tfds.load(
    dataset_name,
    with_info=True,
    as_supervised=True,
)

In [None]:
# Pick the two splits used by the rest of the notebook.
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [None]:
# Train one BPE vocabulary shared by both languages.
# Every sentence is handed to the trainer on its own: joining a pair with
# `x + y` (no separator) glued the last token of the first sentence onto the
# first token of the second one, polluting the learned vocabulary.
source_sentences = train_dataset.map(lambda source, target: source)
target_sentences = train_dataset.map(lambda source, target: target)
proto = keras_nlp.tokenizers.compute_sentence_piece_proto(
    source_sentences.concatenate(target_sentences),
    VOCAB_SIZE,
    model_type="bpe",
)

In [None]:
# Wrap the trained SentencePiece model in a keras-nlp tokenizer layer.
tokenizer = keras_nlp.tokenizers.SentencePieceTokenizer(proto=proto)

In [None]:
# Sanity check: tokenize the first training sentence and turn the ids back into text.
first_source, _first_target = next(iter(train_dataset))
tokenizer.detokenize(tokenizer.tokenize(first_source))

<tf.Tensor: shape=(), dtype=string, numpy=b'e quando melhoramos a procura , tiramos a \xc3\xbanica vantagem da impress\xc3\xa3o , que \xc3\xa9 a serendipidade .'>

In [None]:
# Look up the id of the piece "0" through the tokenizer's public API instead of
# reaching into its private `_sentence_piece` attribute.
tokenizer.token_to_id("0")

19978

In [None]:
# Ids of the padding, start-of-sequence and end-of-sequence pieces, looked up once.
pad_token, start_token, end_token = (
    tokenizer.token_to_id(piece) for piece in ("<pad>", "<s>", "</s>")
)

def preprocess_batch(eng, spa):
    """Turn a batch of sentence pairs into model inputs and training labels.

    Args:
        eng: Batch of strings fed to the encoder (the first element of each
            dataset pair, whatever language that is).
        spa: Batch of strings the decoder learns to produce (the second
            element of each dataset pair).

    Returns:
        A `(features, labels)` tuple. `features` is a dict with
        `"encoder_inputs"` (ids padded/truncated to `MAX_SEQUENCE_LENGTH`) and
        `"decoder_inputs"` (the target ids without their last position);
        `labels` is the same target sequence shifted left by one position.
    """
    # Encoder side: no start/end markers, only padding up to the fixed length.
    source_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=pad_token,
    )
    source_ids = source_packer(tokenizer(eng))

    # Decoder side: wrap in `<s>` ... `</s>` and pad to one token more than the
    # model length, so the two shifted views below each have MAX_SEQUENCE_LENGTH.
    target_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=start_token,
        end_value=end_token,
        pad_value=pad_token,
    )
    target_ids = target_packer(tokenizer(spa))

    features = {
        "encoder_inputs": source_ids,
        "decoder_inputs": target_ids[:, :-1],
    }
    labels = target_ids[:, 1:]
    return features, labels


def make_dataset(dataset):
    """Build the batched, tokenized input pipeline for one dataset split.

    Args:
        dataset: A `tf.data.Dataset` of string pairs.

    Returns:
        A `tf.data.Dataset` of `(features, labels)` batches as produced by
        `preprocess_batch`.
    """
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf.data.AUTOTUNE)
    # Order matters: cache the (expensive) tokenized batches first, then
    # shuffle, then prefetch. With `cache()` last, the first epoch's shuffled
    # order was frozen into the cache and replayed unchanged on every epoch.
    # Note that whole batches are shuffled, not individual examples, because
    # batching happens before the shuffle.
    return dataset.cache().shuffle(2048).prefetch(tf.data.AUTOTUNE)


# Build the training and validation pipelines.
# NOTE(review): validation is fed from the *test* split, which is also what the
# BLEU evaluation reads from later - confirm that this is intended.
train_ds, val_ds = (
    make_dataset(split) for split in (train_dataset, test_dataset)
)

In [None]:
# Encoder: token + position embedding followed by NUM_LAYERS Transformer
# encoder blocks.
encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs")

# NOTE(review): no padding mask is requested here (`mask_zero` is left at its
# default) - confirm whether padded positions should be masked.
encoder_outputs = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=ENG_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(encoder_inputs)

# A single loop builds the whole stack, so NUM_LAYERS is exactly the number of
# blocks (previously: one hand-written block plus a `NUM_LAYERS - 1` loop).
for _layer_index in range(NUM_LAYERS):
    encoder_outputs = keras_nlp.layers.TransformerEncoder(
        intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
    )(inputs=encoder_outputs)

encoder = keras.Model(encoder_inputs, encoder_outputs)


# Decoder: token + position embedding, NUM_LAYERS Transformer decoder blocks
# that attend to the encoder output, dropout, and a softmax over the vocabulary.
decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=SPA_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(decoder_inputs)

# A single loop builds the whole stack, so NUM_LAYERS is exactly the number of
# blocks (previously: one hand-written block plus a `NUM_LAYERS - 1` loop).
for _layer_index in range(NUM_LAYERS):
    x = keras_nlp.layers.TransformerDecoder(
        intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
    )(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)

x = keras.layers.Dropout(0.5)(x)
# The output layer applies softmax, so the model emits probabilities, not
# logits; any loss attached to it must be configured with `from_logits=False`.
decoder_outputs = keras.layers.Dense(SPA_VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs,
    ],
    decoder_outputs,
)
# Chain the two sub-models: the decoder consumes the decoder token ids together
# with the encoder's output sequence.
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

# End-to-end model used for training and for generation.
model_inputs = [encoder_inputs, decoder_inputs]
transformer = keras.Model(
    inputs=model_inputs,
    outputs=decoder_outputs,
    name="transformer",
)
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 token_and_position_embeddi  (None, None, 256)            5145600   ['encoder_inputs[0][0]']      
 ng (TokenAndPositionEmbedd                                                                       
 ing)                                                                                             
                                                                                                  
 transformer_encoder (Trans  (None, None, 256)            527104    ['token_and_position

In [None]:
class WarmupChedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a linear warm-up and inverse-sqrt decay.

    The rate at a given optimizer step is

        s * d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

    i.e. it grows linearly until `step == warmup_steps` and then decays
    proportionally to `1 / sqrt(step)`. At step 0 the rate is 0.

    The (misspelled) class name is kept unchanged so existing references to it
    keep working.

    Args:
        d_model: Model width used to scale the rate.
        warmup_steps: Number of steps over which the rate ramps up.
        s: Constant multiplier applied to the whole schedule.
    """

    def __init__(self, d_model, warmup_steps, s):
        # Plain Python floats are stored (instead of tensors) so that the
        # schedule can be serialized through `get_config`.
        self.d_model = float(d_model)
        self.warmup_steps = float(warmup_steps)
        self.s = float(s)

    def __call__(self, step):
        """Return the learning rate for optimizer step `step` as a float32 tensor."""
        step = tf.cast(step, dtype=tf.float32)
        d_model = tf.cast(self.d_model, dtype=tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, dtype=tf.float32)
        scale = tf.cast(self.s, dtype=tf.float32)

        decay_term = tf.math.pow(step, -0.5)
        warmup_term = step * tf.math.pow(warmup_steps, -1.5)
        return scale * tf.math.pow(d_model, -0.5) * tf.minimum(decay_term, warmup_term)

    def get_config(self):
        """Return the constructor arguments so the schedule can be re-created."""
        return {
            "d_model": self.d_model,
            "warmup_steps": self.warmup_steps,
            "s": self.s,
        }

In [None]:
# The model's last layer already applies softmax, hence `from_logits=False`.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
# Adam driven by the warm-up schedule defined in this notebook.
lr_schedule = WarmupChedule(d_model=EMBED_DIM, warmup_steps=2000, s=1)

transformer.compile(
    loss=loss_fn,
    optimizer=keras.optimizers.Adam(lr_schedule),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    jit_compile=True,
)
transformer.fit(train_ds, validation_data=val_ds, epochs=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12

In [None]:
def encode(input_sentences):
    """Tokenize sentences into fixed-length encoder inputs.

    Uses the same `StartEndPacker` configuration as the training pipeline
    (padding/truncation to `MAX_SEQUENCE_LENGTH` with the `<pad>` id), instead
    of hand-rolled padding with a hard-coded id 0 that only handled the first
    sentence of the batch.

    Args:
        input_sentences: List (batch) of strings to encode.

    Returns:
        An integer tensor of shape `(len(input_sentences), MAX_SEQUENCE_LENGTH)`.
    """
    packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=tokenizer.token_to_id("<pad>"),
    )
    return packer(tokenizer(input_sentences))

def decode_sequences(input_sentences, max_length=40):
    """Greedily translate `input_sentences` with the trained transformer.

    Args:
        input_sentences: List (batch) of source strings.
        max_length: Total length of the generated id sequence, including the
            leading `<s>` token. Must not exceed `MAX_SEQUENCE_LENGTH`, the
            length the decoder's position embedding was built for.

    Returns:
        Integer tensor of generated token ids with `max_length` columns, one
        row per input sentence.

    Raises:
        ValueError: If `max_length` is outside `[1, MAX_SEQUENCE_LENGTH]`.
    """
    if not 1 <= max_length <= MAX_SEQUENCE_LENGTH:
        raise ValueError(
            f"max_length must be in [1, {MAX_SEQUENCE_LENGTH}], got {max_length}"
        )

    # Reuse the shared helper instead of duplicating its padding logic here.
    encoder_input_tokens = encode(input_sentences)
    batch_size = encoder_input_tokens.shape[0]

    def next_token_probs(prompt, cache, index):
        # The model ends in a softmax, so these are probabilities rather than
        # logits; greedy decoding only needs their argmax, which is the same.
        probs = transformer([encoder_input_tokens, prompt])[:, index - 1, :]
        # Ignore hidden states for now; only needed for contrastive search.
        hidden_states = None
        return probs, hidden_states, cache

    # Prompt = one start token followed by padding up to `max_length`.
    start = tf.fill((batch_size, 1), tokenizer.token_to_id("<s>"))
    pad = tf.fill((batch_size, max_length - 1), tokenizer.token_to_id("<pad>"))
    prompt = tf.concat((start, pad), axis=-1)

    return keras_nlp.samplers.GreedySampler()(
        next_token_probs,
        prompt,
        end_token_id=tokenizer.token_to_id("</s>"),
        index=1,  # Start sampling after start token.
    )

In [None]:
# Number of test sentences to score. The previous loop broke on `i > 100`
# *after* doing the work, so it actually scored 102 sentences.
NUM_EVAL_EXAMPLES = 100

# The metric is BLEU with n-grams up to order 4 (the library default, made
# explicit here), so it is named and reported as such instead of "BLEU-1".
bleu = keras_nlp.metrics.Bleu(max_order=4)
rouge_1 = bleu  # backward-compatible alias for the old, misleading name

for test_pair in dataset["test"].take(NUM_EVAL_EXAMPLES).as_numpy_iterator():
    input_sentence, reference_sentence = test_pair

    translated_sentence = tokenizer.detokenize(decode_sequences([input_sentence]))
    translated_sentence = translated_sentence.numpy()[0].decode("utf-8")
    # Drop any special-token text that survived detokenization.
    translated_sentence = (
        translated_sentence.replace("<pad>", "")
        .replace("<s>", "")
        .replace("</s>", "")
        .strip()
    )

    # One reference and one hypothesis per update, each as a length-1 batch.
    bleu.update_state(
        tf.expand_dims(tf.convert_to_tensor(reference_sentence), 0),
        tf.expand_dims(tf.convert_to_tensor(translated_sentence), 0),
    )

print("BLEU-4 Score: ", bleu.result())

In [None]:
# Display the last reference sentence and its translation left over from the
# evaluation loop, as a quick spot check.
reference_sentence, translated_sentence