In [31]:
import tensorflow as tf
import pandas as pd
import re
from sklearn.model_selection import train_test_split

preprocessing data

In [32]:
def preprocessing_sentence(sentence):
    sentence = sentence.lower().strip() # strip buat ngapus space di blkg
    sentence = re.sub(r"[(?.,!)]", " \1 ", sentence) #nambah spasi di depan d blkg simbol tadi
    sentence = re.sub(r'["]+', " ", sentence) # ganti petik dua jadi spasi
    sentence = re.sub(r"[^a-z?,.!]+", " ", sentence) #menghapus simbol" lain
    sentence = sentence.strip() #hapus spasi berlebihan
    sentence = '<start> ' + sentence + ' <end>' #kasih tau kapan sentence mulai dan akhir
    return sentence

In [33]:
def load_dataset(path, num_examples = None):
    dataset = pd.read_csv(path) #baca 
    if num_examples: #ngecek kalo ada dikasi num examples
        dataset = dataset.head(num_examples) #slicing

    print(f"Dataset Columns: {dataset.columns.tolist()}")

    input_data = [preprocessing_sentence(sentence) for sentence in dataset ["English"]]
    target_data = [preprocessing_sentence(sentence) for sentence in dataset ["French"]]
    return input_data, target_data

path = "./english_french.csv"
num_examples = 100

input_data, target_data = load_dataset(path, num_examples)
print(f"Input Data: {input_data[:5]}")
print(f"Target Data: {target_data[:5]}")

Dataset Columns: ['English', 'French']
Input Data: ['<start> go <end>', '<start> go <end>', '<start> go <end>', '<start> go <end>', '<start> hi <end>']
Target Data: ['<start> va <end>', '<start> marche <end>', '<start> en route <end>', '<start> bouge <end>', '<start> salut <end>']


word embedding

In [34]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="")
tokenizer.fit_on_texts(input_data + target_data)

#ngubah kata jadi vektor
input_tensor = tokenizer.texts_to_sequences(input_data)
target_tensor = tokenizer.texts_to_sequences(target_data)

#cari vektor dengan length terpanjang
max_input_length = max(len(seq) for seq in input_tensor)
max_target_length = max(len(seq) for seq in target_tensor)

#padding vector biar ukurannya sama besar
input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen = max_input_length, padding= 'post')
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, max_target_length, padding='post')

print(f"Tokenizer Words: {tokenizer.word_index}")
print(f"Input Tensor: {input_tensor}")
print(f"Target Tensor: {target_tensor}")

Tokenizer Words: {'<start>': 1, '<end>': 2, 'run': 3, 'relax': 4, 'go': 5, 'toi': 6, 'wait': 7, 'hello': 8, 'i': 9, 'it': 10, 'l': 11, 'salut': 12, 'stop': 13, 'attack': 14, 'buy': 15, 'cheers': 16, 'cours': 17, 'vos': 18, 'vous': 19, 'attendez': 20, 'd': 21, 'le': 22, 'wow': 23, 'duck': 24, 'on': 25, 'won': 26, 'smile': 27, 'get': 28, 'up': 29, 'now': 30, 'te': 31, 'attends': 32, 'bonjour': 33, 'j': 34, 'ai': 35, 'calme': 36, 'tends': 37, 'la': 38, 'attaque': 39, 'maintenant': 40, 'hi': 41, 'hide': 42, 'jump': 43, 'begin': 44, 'see': 45, 'oh': 46, 'eat': 47, 'va': 48, 'courez': 49, 'prenez': 50, 'jambes': 51, 'cous': 52, 'file': 53, 'filez': 54, 'fuyez': 55, 'fuyons': 56, 'a': 57, 'saute': 58, 'je': 59, 'gagn': 60, 'relaxe': 61, 'du': 62, 'souriez': 63, 'achetez': 64, 'ach': 65, 'sant': 66, 'tchin': 67, 've': 68, 'y': 69, 'who': 70, 'fire': 71, 'help': 72, 'try': 73, 'no': 74, 'sorry': 75, 'exhale': 76, 'marche': 77, 'en': 78, 'route': 79, 'bouge': 80, 'qui': 81, 'alors': 82, 'waouh':

Data Splitting

In [35]:
#train:test -> 80 : 20
input_tensor_train, input_tensor_test, target_tensor_train, target_tensor_test = train_test_split(
    input_tensor, 
    target_tensor, 
    test_size=0.2, 
    random_state=42
)

BUFFER_SIZE = len(input_tensor_train)
batch_size = 64

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)) #gabungin data train input dan target
dataset = dataset.shuffle(BUFFER_SIZE) #shuffle data
dataset = dataset.batch(batch_size, drop_remainder = True) #mcah jadi batches

print(f"Dataset: {dataset}")
for sample in dataset.take(1):
    input_sample, target_sample = sample
    print(f"Input Sample: {input_sample}")
    print(f"Target Sample: {target_sample}")

Dataset: <_BatchDataset element_spec=(TensorSpec(shape=(64, 4), dtype=tf.int32, name=None), TensorSpec(shape=(64, 7), dtype=tf.int32, name=None))>
Input Sample: [[ 1 14  2  0]
 [ 1  8  2  0]
 [ 1  3  2  0]
 [ 1  3  2  0]
 [ 1  5  2  0]
 [ 1  3  2  0]
 [ 1  9 26  2]
 [ 1 16  2  0]
 [ 1  3  2  0]
 [ 1 15 10  2]
 [ 1 75  2  0]
 [ 1  5 25  2]
 [ 1  4  2  0]
 [ 1  4  2  0]
 [ 1 76  2  0]
 [ 1 13  2  0]
 [ 1  9 26  2]
 [ 1  7  2  0]
 [ 1  9 45  2]
 [ 1  8  2  0]
 [ 1 44  2  0]
 [ 1  7  2  0]
 [ 1 47 10  2]
 [ 1  8  2  0]
 [ 1  5  2  0]
 [ 1  8  2  0]
 [ 1 28 29  2]
 [ 1 43  2  0]
 [ 1 46 74  2]
 [ 1  3  2  0]
 [ 1  4  2  0]
 [ 1  3  2  0]
 [ 1  4  2  0]
 [ 1 23  2  0]
 [ 1  3  2  0]
 [ 1 14  2  0]
 [ 1 14  2  0]
 [ 1  3  2  0]
 [ 1 24  2  0]
 [ 1  8  2  0]
 [ 1 23  2  0]
 [ 1  3  2  0]
 [ 1 42  2  0]
 [ 1 28 29  2]
 [ 1  7  2  0]
 [ 1 15 10  2]
 [ 1  4  2  0]
 [ 1 28 29  2]
 [ 1  3  2  0]
 [ 1 71  2  0]
 [ 1  5 25  2]
 [ 1  5  2  0]
 [ 1  3  2  0]
 [ 1  5 30  2]
 [ 1 16  2  0]
 [ 1  5 30  2]

create model

In [36]:
class Encoder(tf.keras.Model): #bikin class turunan tf.keras.Model
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size): #constructor
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.batch_size = batch_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim) #ngubah vector sebelumnya jadi lebih 'bermakna'
        self.gru = tf.keras.layers.GRU(
            self.enc_units, 
            return_sequences=True, #ngembaliin seluruh kalimat, bkn kalimat akhir doang
            return_state=True,  #hidden state model
            recurrent_initializer='glorot_uniform'
        )


    def call(self, x, hidden):
        x = self.embedding(x) #ngubah jadi vector yang ada semanticsnya
        output, hidden_state = self.gru(x, initial_state = hidden)
        return output, hidden_state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))

experiment

In [37]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 256
enc_units = 1024
batch_size = 64

encoder = Encoder(vocab_size, embedding_dim, enc_units, batch_size)
sample_hidden = encoder.initialize_hidden_state()

input_sample = input_tensor_train[:batch_size]

sample_output, sample_hidden = encoder(input_sample, sample_hidden)

print(f"Encoder Output Shape (batch_sze, seq length, units): {sample_output.shape}")
print(f"Encoder Hidden State Shape: {sample_hidden.shape}")

Encoder Output Shape (batch_sze, seq length, units): (64, 4, 1024)
Encoder Hidden State Shape: (64, 1024)
