In [17]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

In [18]:
text_dataset = tf.data.TextLineDataset("ind.txt")

In [19]:
# Hyperparameters
VOCAB_SIZE = 1000
ENGLISH_SEQUENCE_LENGTH = 32
FRENCH_SEQUENCE_LENGTH = 32
EMBEDDING_DIM = 256
BATCH_SIZE = 32

In [20]:
# Vectorization Layer
english_vectorization_layer = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens = VOCAB_SIZE,
    output_mode = 'int',
    output_sequence_length = ENGLISH_SEQUENCE_LENGTH
)

french_vectorization_layer = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens = VOCAB_SIZE,
    output_mode = 'int',
    output_sequence_length = FRENCH_SEQUENCE_LENGTH
)

In [21]:
# Preprocess Text
def split_text(text):
    text = tf.strings.split(text, '\t')
    input_1 = text[:1]
    input_2 = 'starttoken ' + text[1:2] + ' endtoken'
    return input_1, input_2

def vectorize(text):
    text = tf.strings.split(text, '\t')
    input_1 = text[:1]
    start_input = 'starttoken ' + text[1:2]
    end_input = text[1:2] + ' endtoken'
    print(f"Vectorization -- Start Input: {start_input} End Input: {end_input}")
    return {
        'input_1': english_vectorization_layer(input_1),
        'input_2': french_vectorization_layer(start_input)
    }, french_vectorization_layer(end_input)

In [22]:
# Preprocessing
splitted_dataset = text_dataset.map(split_text)

# Create training data
print("Creating english training data and vectorization layer...")
english_training_data = splitted_dataset.map(lambda x, y: x)
english_vectorization_layer.adapt(english_training_data)

print("Creating french training data and vectorization layer...")
french_training_data = splitted_dataset.map(lambda x, y: y)
french_vectorization_layer.adapt(french_training_data)

# Map Shuffle
dataset = text_dataset.map(vectorize)

# Shuffling dataset and Batching dataset
dataset = dataset.shuffle(200).unbatch().batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

Creating english training data and vectorization layer...
Creating french training data and vectorization layer...
Vectorization -- Start Input: Tensor("add:0", shape=(None,), dtype=string) End Input: Tensor("add_1:0", shape=(None,), dtype=string)


In [23]:
len_japan_vocab = len(french_vectorization_layer.get_vocabulary())
index_to_word={x:y for x, y in zip(range(len_japan_vocab), french_vectorization_layer.get_vocabulary())}
print(index_to_word)

{0: '', 1: '[UNK]', 2: 'starttoken', 3: 'endtoken', 4: 'tom', 5: 'aku', 6: 'tidak', 7: 'yang', 8: 'di', 9: 'saya', 10: 'kamu', 11: 'itu', 12: 'ini', 13: 'dia', 14: 'apa', 15: 'akan', 16: 'ada', 17: 'bisa', 18: 'dengan', 19: 'ke', 20: 'kita', 21: 'tahu', 22: 'kami', 23: 'mary', 24: 'apakah', 25: 'untuk', 26: 'ingin', 27: 'pergi', 28: 'kau', 29: 'adalah', 30: 'sudah', 31: 'anda', 32: 'suka', 33: 'harus', 34: 'dan', 35: 'lebih', 36: 'dari', 37: 'sedang', 38: 'punya', 39: 'orang', 40: 'mereka', 41: 'sangat', 42: 'makan', 43: 'mana', 44: 'sini', 45: 'dalam', 46: 'banyak', 47: 'hari', 48: 'pada', 49: 'lagi', 50: 'jangan', 51: 'pernah', 52: 'semua', 53: 'melakukan', 54: 'rumah', 55: 'seorang', 56: 'siapa', 57: 'bahasa', 58: 'bukan', 59: 'mau', 60: 'melihat', 61: 'masih', 62: 'telah', 63: 'tahun', 64: 'berapa', 65: 'hal', 66: 'saja', 67: 'belum', 68: 'tolong', 69: 'buku', 70: 'tiga', 71: 'tinggal', 72: 'kalau', 73: 'datang', 74: 'sebuah', 75: 'baik', 76: 'waktu', 77: 'mobil', 78: 'mungkin', 79

In [24]:
# Splitting Dataset
dataset_len = sum(1 for _ in dataset)
train_dataset = dataset.take(int(0.9*dataset_len))
val_dataset = dataset.skip(int(0.9*dataset_len))

In [25]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, units):
        super(Encoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(units, return_sequences=True)
    def call(self, x):
        x = self.embedding(x)
        return self.lstm(x)

In [26]:
class BahdanauAttention(tf.keras.Model):
    def __init(self, units):
        super(BahdanauAttention, self).__init__()
        self.w1 = tf.keras.layers.Dense(units) #buat layer encoder
        self.w2 = tf.keras.layers.Dense(units) #buat layer decoder
        self.v =tf.keras.layers.Dense(1) #ouput attention

    def call(self, prev_dec_state, enc_state):
        score = self.v(tf.nn.tanh(self.w2(tf.expand_dims(prev_dec_state, 1))+ self.w1(enc_state)))
        attention_weight = tf.nn.softmax(score, axis = 1)
        
        context_vector = attention_weight*enc_state
        context_vector = tf.reduce_sum(context_vector, axis = 1)
        return context_vector, attention_weight


In [27]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_unit, sequence_length):
        super(Decoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.attention = BahdanauAttention(dec_unit)
        self.gru = tf.keras.layers.GRU(vocab_size, return_state=True, return_sequences=True)
        self.dense = tf.keras.layers.Dense(dec_unit, activation = "softmax")
        self.sequence_length = sequence_length

    def call(self, encoder, decoder, target):
        output = []
        attention_weight = []
        target = self.embedding(target)
        print(encoder)


        for t in range(0, self.sequence_length):
            context_vector, attention_weight = self.attention(decoder, encoder)
            dec_input = context_vector + target[:, t]
            output, state = self.gru(tf.expand_dim(dec_input, 1))
            outputs.append(output[:, 0])

        outputs = tf.convert_to_tensor(outputs)
        outputs = tf.transpose(outputs, perm = [1, 0, 2])
        outputs = self.dense(outputs)
        return outputs, attention_weight




In [28]:
# Model Creation
HIDDEN_UNITS=256
EMBEDDING_DIM=256

### ENCODER
input = tf.keras.layers.Input(shape=(ENGLISH_SEQUENCE_LENGTH, ), dtype="int64", name="input_1")
encoder = Encoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS)
encoder_output = encoder(input)

### DECODER
target = tf.keras.layers.Input(shape=(FRENCH_SEQUENCE_LENGTH, ), dtype="int64", name="input_2")
decoder = Decoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS, FRENCH_SEQUENCE_LENGTH)
decoder_output, attention_weight = decoder(encoder_output, tf.zeros([1, HIDDEN_UNITS]), target)

bahdanau = tf.keras.Model([input, target], decoder_input)
bahdanau.summary()


Tensor("Placeholder:0", shape=(None, 32, 256), dtype=float32)


AttributeError: Exception encountered when calling layer "decoder_3" (type Decoder).

in user code:

    File "C:\Users\USER\AppData\Local\Temp\ipykernel_15912\748909649.py", line 18, in call  *
        context_vector, attention_weight = self.attention(decoder, encoder)
    File "c:\Users\USER\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\USER\AppData\Local\Temp\__autograph_generated_file2q1_u4ir.py", line 10, in tf__call
        score = ag__.converted_call(ag__.ld(self).v, (ag__.converted_call(ag__.ld(tf).nn.tanh, (ag__.converted_call(ag__.ld(self).w2, (ag__.converted_call(ag__.ld(tf).expand_dims, (ag__.ld(prev_dec_state), 1), None, fscope),), None, fscope) + ag__.converted_call(ag__.ld(self).w1, (ag__.ld(enc_state),), None, fscope),), None, fscope),), None, fscope)

    AttributeError: Exception encountered when calling layer 'bahdanau_attention_3' (type BahdanauAttention).
    
    in user code:
    
        File "C:\Users\USER\AppData\Local\Temp\ipykernel_15912\3508188070.py", line 9, in call  *
            score = self.v(tf.nn.tanh(self.w2(tf.expand_dims(prev_dec_state, 1))+ self.w1(enc_state)))
    
        AttributeError: 'BahdanauAttention' object has no attribute 'v'
    
    
    Call arguments received by layer 'bahdanau_attention_3' (type BahdanauAttention):
      • prev_dec_state=tf.Tensor(shape=(1, 256), dtype=float32)
      • enc_state=tf.Tensor(shape=(None, 32, 256), dtype=float32)


Call arguments received by layer "decoder_3" (type Decoder):
  • encoder=tf.Tensor(shape=(None, 32, 256), dtype=float32)
  • decoder=tf.Tensor(shape=(1, 256), dtype=float32)
  • target=tf.Tensor(shape=(None, 32), dtype=int64)

In [None]:
bahdanau.compile(
    optimizer = "adam", 
    loss = "sparse_categorical_crossentropy", 
    metrics =["accuracy"]
)

In [None]:
history = bahdanau.fit(train_dataset, validation_data=val_dataset, epochs = 1)

In [None]:
loss, acc = bahdanau.evaluate(val_dataset)

In [None]:
def translate(sentences):
    tokenized = english_vectorization_layer([sentences])
    target = "starttoken"
    result = ""

    for i in range(FRENCH_SEQUENCE_LENGTH):
        tokenized_target = french_vectorization_layer([target])
        output = bahdanau.predict([tokenized, tokenized_target])
        word_index =tf.argmax(output, asix = -1)[0][i].numpy()
        word = index_to_word[word_index]
        if word == "endtoken":
            break
        target += " "+word # starttoken  aku, starttoken  aku mau, starttoken  aku mau makan
        result += " "+word 

    return result