<a href="https://colab.research.google.com/github/Nicohim87/DeepLearning/blob/main/Pertemuan5/Session5_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!wget https://raw.githubusercontent.com/Nicohim87/DeepLearning/refs/heads/main/Pertemuan5/ind2.txt -O ind2.txt


--2024-10-08 01:38:08--  https://raw.githubusercontent.com/Nicohim87/DeepLearning/refs/heads/main/Pertemuan5/ind2.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 815152 (796K) [text/plain]
Saving to: ‘ind2.txt’


2024-10-08 01:38:08 (114 MB/s) - ‘ind2.txt’ saved [815152/815152]



In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [23]:
df = tf.data.TextLineDataset('ind2.txt')
df

<TextLineDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [24]:
BATCH_SIZE = 22
MAX_TOKENS = 20000
ENG_SEQ_LEN = 32
INA_SEQ_LEN = 32
EMBEDDING_DIM = 256

In [25]:
english_vec_layer = tf.keras.layers.TextVectorization(
    standardize = "lower_and_strip_punctuation",
    max_tokens = MAX_TOKENS,
    output_mode = 'int',
    output_sequence_length = ENG_SEQ_LEN
)

indonesian_vec_layer = tf.keras.layers.TextVectorization(
    standardize = "lower_and_strip_punctuation",
    max_tokens = MAX_TOKENS,
    output_mode = 'int',
    output_sequence_length = INA_SEQ_LEN
)

In [26]:
def split_text(text):
    text = tf.strings.split(text, '\t')
    input_1 = text[:1]
    input_2 = 'starttoken ' + text[1:2] + ' endtoken'
    return input_1, input_2

def vectorize(text):
    text = tf.strings.split(text, '\t')
    input_1 = text[:1]
    input_start = 'starttoken ' + text[1:2]
    input_end = text[1:2] + ' endtoken'
    return {
        'input_1' : english_vec_layer(input_1),
        'input_2' : indonesian_vec_layer(input_start)
    }, indonesian_vec_layer(input_end)

In [27]:
splitted = df.map(split_text)

In [28]:
eng_data = splitted.map(lambda x, y: x)
english_vec_layer.adapt(eng_data)

In [29]:
ina_data = splitted.map(lambda x, y: y)
indonesian_vec_layer.adapt(ina_data)

In [30]:
data = df.map(vectorize)
data = data.shuffle(200).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
data_len = sum(1 for _ in data)

In [31]:
data_len = sum(1 for _ in data)
train = df.take(int(data_len * 0.9))
validate = df.skip(int(data_len * 0.9))

In [32]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, units):
    super(Encoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm = tf.keras.layers.LSTM(units, return_sequences = True)

  def call(self, x):
    x = self.embedding(x)
    return self.lstm(x)

In [55]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.w_1 = tf.keras.layers.Dense(units)
    self.w_2 = tf.keras.layers.Dense(units)
    self.w_output = tf.keras.layers.Dense(1)

  def call(self, prev_dec_state, enc_state):
    scores = self.w_output(tf.nn.tanh(self.w_1(tf.expand_dims(prev_dec_state, -2)) + self.w_2(enc_state)))
    attention_weights = tf.nn.softmax(scores, axis=1)
    context_vector = attention_weights * enc_state
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [60]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, sequence_length):
    super(Decoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.attention = BahdanauAttention(dec_units)
    self.gru = tf.keras.layers.GRU(dec_units, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(vocab_size, activation='softmax')
    self.sequence_length = sequence_length

  def call(self, x, hidden, shifted_target):
    outputs = []
    attention_weights = []
    shifted_target = self.embedding(shifted_target)

    for t in range(0, self.sequence_length):
      context_vector, attention_weights = self.attention(hidden, x)
      dec_input = context_vector + shifted_target[:, t]
      output, hidden = self.gru(tf.expand_dims(dec_input, 1))
      outputs.append(output[:, 0])

    outputs = tf.convert_to_tensor(outputs)
    outputs = tf.transpose(outputs, perm=[1,0,2])
    outputs = self.dense(outputs)

    return outputs, attention_weights

In [61]:
HIDDEN_UNITS = 256

In [62]:
input = tf.keras.layers.Input(shape=(ENG_SEQ_LEN, ), dtype='int64', name='input_1')
encoder = Encoder(english_vec_layer.vocabulary_size(), EMBEDDING_DIM, HIDDEN_UNITS)
encoder_output = encoder(input)

In [63]:
shifted_target = tf.keras.layers.Input(shape=(INA_SEQ_LEN, ), dtype = 'int64', name='input_2')
decoder = Decoder(indonesian_vec_layer.vocabulary_size(), EMBEDDING_DIM, HIDDEN_UNITS, INA_SEQ_LEN)
decoder_output, attention_weights = decoder(encoder_output, tf.zeros([1, HIDDEN_UNITS]), shifted_target)

1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''Exception encountered when calling GRU.call().

[1mlen is not well defined for a symbolic Tensor (gru_1_1/Squeeze:0). Please call `x.shape` rather than `len(x)` for shape information.[0m

Arguments received by GRU.call():
  • sequences=tf.Tensor(shape=(None, 1, 256), dtype=float32)
  • initial_state=None
  • mask=None
  • training=False''


TypeError: Exception encountered when calling Decoder.call().

[1mCould not automatically infer the output shape / dtype of 'decoder_3' (of type Decoder). Either the `Decoder.call()` method is incorrect, or you need to implement the `Decoder.compute_output_spec() / compute_output_shape()` method. Error encountered:

Exception encountered when calling GRU.call().

[1mlen is not well defined for a symbolic Tensor (gru_1_1/Squeeze:0). Please call `x.shape` rather than `len(x)` for shape information.[0m

Arguments received by GRU.call():
  • sequences=tf.Tensor(shape=(None, 1, 256), dtype=float32)
  • initial_state=None
  • mask=None
  • training=False[0m

Arguments received by Decoder.call():
  • args=('<KerasTensor shape=(None, 32, 256), dtype=float32, sparse=False, name=keras_tensor_13>', 'tf.Tensor(shape=(1, 256), dtype=float32)', '<KerasTensor shape=(None, 32), dtype=int64, sparse=None, name=input_2>')
  • kwargs=<class 'inspect._empty'>

In [40]:
model = tf.keras.Model([input, shifted_target], decoder_output)
model.summary()

NameError: name 'shifted_target' is not defined

In [None]:
model.compile(loss='sparese_categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

history = model.fit(train, validataion_data = validate, epochs=10)

In [49]:
index_to_word = {(x,y) for x,y in enumerate(indonesian_vec_layer.get_vocabulary())}

In [None]:
def translate(text):
  tokenize_input = english_vec_layer([text])

  shift_target = ['starttoken ']
  results = ''
  for i in range(INA_SEQ_LEN):
    tokenize_shifted = indonesian_vec_layer([' '.join(shift_target)])
    output = model.predict([tokenize_input], tokenize_shifted)
    word_idx = tf.argmax(output, axis=-1)[0][i].numpy()
    current_word = index_to_word[word_idx]
    if current_word == 'stoptoken':
      break
    shift_target.append(current_word)
    results += ' ' + current_word if results else current_word

  return results