In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [28]:
# Stream the corpus file as a dataset of text lines; the tab-separated
# fields of each line are split apart by the mapping functions below.
df = tf.data.TextLineDataset(filenames='ind2.txt')

In [29]:
# --- Vocabulary / sequence settings (consumed by the TextVectorization layers) ---
# Upper bound on the vocabulary size of each vectorization layer.
MAX_TOKENS = 20_000
# Fixed length, in tokens, of every vectorized English sequence.
ENG_SEQ_LEN = 32
# Fixed length, in tokens, of every vectorized Indonesian sequence.
INA_SEQ_LEN = 32

# --- Pipeline / model settings ---
# Number of examples grouped into one batch of the tf.data pipeline.
BATCH_SIZE = 22
# Presumably the embedding width of the model; not referenced in the cells
# visible here -- TODO confirm where it is used.
EMBEDDING_DIM = 256

In [30]:
def _build_vec_layer(sequence_length):
    """Create an integer-output TextVectorization layer.

    Text is lower-cased and stripped of punctuation, the vocabulary is
    capped at MAX_TOKENS, and every output sequence has exactly
    `sequence_length` entries.
    """
    return tf.keras.layers.TextVectorization(
        standardize="lower_and_strip_punctuation",
        max_tokens=MAX_TOKENS,
        output_mode='int',
        output_sequence_length=sequence_length,
    )


# One vectorizer per language; each is fitted (`adapt`) on its own side of
# the corpus in a later cell.
english_vec_layer = _build_vec_layer(ENG_SEQ_LEN)
indonesian_vec_layer = _build_vec_layer(INA_SEQ_LEN)

In [31]:
def split_text(text):
    """Split one tab-separated line into two string tensors.

    Args:
        text: string tensor holding one line of the corpus (presumably
            "english<TAB>indonesian[<TAB>...]" -- confirm against the file).

    Returns:
        A pair `(first, second)`:
        * `first`  -- the `[:1]` slice of the tab-split fields;
        * `second` -- the `[1:2]` slice with 'starttoken ' prepended and
          ' endtoken' appended.
        Both are slices, so they keep a leading axis instead of being scalars.
    """
    fields = tf.strings.split(text, '\t')
    source, target = fields[:1], fields[1:2]
    # Wrap the second field in the start/end marker words.
    wrapped_target = 'starttoken ' + target + ' endtoken'
    return source, wrapped_target

def vectorize(text):
    """Turn one tab-separated line into vectorized features and a target.

    Args:
        text: string tensor holding one line of the corpus.

    Returns:
        A pair `(features, target)`:
        * `features['input_1']` -- first field passed through
          `english_vec_layer`;
        * `features['input_2']` -- second field prefixed with 'starttoken '
          and passed through `indonesian_vec_layer`;
        * `target` -- second field suffixed with ' endtoken' and passed
          through `indonesian_vec_layer`.
        Presumably `input_2` / `target` form the shifted decoder input and
        label pair for teacher forcing -- confirm against the model cell.
    """
    columns = tf.strings.split(text, '\t')
    first_field = columns[:1]
    second_field = columns[1:2]

    # Same second field, marked at the front for the input and at the back
    # for the target.
    features = {
        'input_1': english_vec_layer(first_field),
        'input_2': indonesian_vec_layer('starttoken ' + second_field),
    }
    target = indonesian_vec_layer(second_field + ' endtoken')
    return features, target

In [32]:
# Raw (first field, marker-wrapped second field) string pairs; used to fit
# the vocabularies of the two vectorization layers.
splitted = df.map(map_func=split_text)

In [33]:
# Keep only the first element of each pair and fit the English vocabulary on it.
eng_data = splitted.map(lambda first, _second: first)
english_vec_layer.adapt(data=eng_data)

In [20]:
# Keep only the second element of each pair (already wrapped in the
# start/end marker words) and fit the Indonesian vocabulary on it.
ina_data = splitted.map(lambda _first, second: second)
indonesian_vec_layer.adapt(data=ina_data)

In [21]:
# Vectorize every raw line, then shuffle (buffer of 200 elements), batch and
# prefetch the result.
data = (
    df.map(vectorize)
    .shuffle(200)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Number of batches, counted by one full pass over the pipeline.
data_len = sum(1 for _batch in data)

In [22]:
# 90/10 train/validation split of the vectorized, batched pipeline.
#
# `data_len` (computed in the preceding cell) is the number of *batches* in
# `data`, so the split must be taken from `data` as well. The original code
# sliced `df`, which holds raw, unbatched text lines: the resulting datasets
# were unvectorized strings and the boundary was expressed in the wrong units.
# Re-counting `data_len` here was a redundant full pass and has been dropped.
#
# NOTE(review): `data` is shuffled upstream. If that shuffle draws a new order
# on every iteration (the tf.data default), `take`/`skip` may not give a
# stable, strictly disjoint partition across epochs -- consider splitting
# before shuffling, or fixing the shuffle order. Confirm.
n_train_batches = int(data_len * 0.9)
train = data.take(n_train_batches)
validate = data.skip(n_train_batches)