In [1]:
import numpy

import tensorflow as tf

import tensorflow_datasets as tfds

# Turn off tensorflow_datasets progress-bar output.
tfds.disable_progress_bar()

In [2]:
import tensorflow.keras.layers as layers

import tensorflow.keras.utils as utils

# Name of the TFDS dataset to load.
data = "imdb_reviews"

# as_supervised=True makes every element a (text, label) tuple, which the
# `for data, label in ...` loops further down rely on. with_info=True makes
# tfds.load return the (datasets, DatasetInfo) pair that this unpacking and the
# later `info.splits[...]` lookup need.
(train, testi), info = tfds.load(
  data,
  split=["train", "test"],
  as_supervised=True,
  with_info=True,
)

In [3]:
# Example counts of the train and test splits, read from the dataset metadata.
tuple(info.splits[name].num_examples for name in ("train", "test"))

(25000, 25000)

In [4]:
from stopwords import get_stopwords

# English stop words; used further down to filter tokens out of each review.
# NOTE(review): the container type returned by get_stopwords is not visible here — confirm.
unlist = get_stopwords("english")

In [5]:
# Constant-time membership test instead of scanning `unlist` for every token.
_stopword_set = frozenset(unlist)


def retori(snt):
  """Lower-case `snt`, split it on whitespace and drop stop words.

  Args:
    snt: a review as a Python string.

  Returns:
    List of the remaining tokens, in their original order.
  """
  return [token for token in snt.lower().split() if token not in _stopword_set]


def _collect_split(split):
  """Turn an iterable of (text, label) tensor pairs into two parallel lists.

  Args:
    split: iterable yielding (text, label) pairs whose items expose `.numpy()`;
      the text is a UTF-8 encoded byte string.

  Returns:
    (texts, labels): stop-word-filtered review strings and their labels.
  """
  texts, labels = [], []
  for text, label in split:
    texts.append(" ".join(retori(text.numpy().decode("utf8"))))
    labels.append(label.numpy())
  return texts, labels


# Same cleaning for both splits; one helper replaces two copy-pasted loops.
train_input, train_label = _collect_split(train)
testi_input, testi_label = _collect_split(testi)

In [6]:
# Sanity check: each split should hold as many texts as labels.
tuple(len(seq) for seq in (train_input, testi_input, train_label, testi_label))

(25000, 25000, 25000, 25000)

In [7]:
# Tiny corpus used only to demonstrate how TextVectorization builds a vocabulary.
sample = ["aku suka ikan", "ikan goreng", "ibu masak ikan"]

# Demo vectorizer with max_tokens=10, fitted on the toy corpus.
vector = layers.TextVectorization(max_tokens=10)
vector.adapt(sample)

In [8]:
# Vocabulary learned from the toy corpus.
vector.get_vocabulary()

['', '[UNK]', 'ikan', 'suka', 'masak', 'ibu', 'goreng', 'aku']

In [9]:
# Look up the token stored at vocabulary index 5.
vector.get_vocabulary()[5]

'ibu'

In [10]:
# Encode a sentence whose last word ("tadi") does not occur in the toy corpus.
samplet = ["ikan masak ibu tadi"]

vector(samplet).numpy()

array([[2, 4, 5, 1]])

In [11]:
# Number of entries in the vocabulary learned from the toy corpus.
len(vector.get_vocabulary())

8

In [12]:
# The toy vectorizer was only a demonstration; drop it before building the real one.
del vector

# Vocabulary size; also passed to the Embedding layer of the model.
max_tokens = 15000

# Embedding dimension used by the model.
embedding = 16

# Length of every tokenised review; also the model's input shape.
output_sequence_length = 180

tokener = layers.TextVectorization(
  max_tokens=max_tokens, output_mode="int",
  # Must be the Python literal `True`; a bare lowercase `true` is an undefined name.
  # NOTE(review): Keras documents this flag for the multi_hot/count/tf_idf
  # output modes — confirm whether it is needed with output_mode="int".
  pad_to_max_tokens=True,
  output_sequence_length=output_sequence_length)

# Build the vocabulary from the cleaned training texts.
tokener.adapt(train_input)

In [13]:
# Replace the cleaned review strings of both splits with integer id arrays.
# The source tuple is built before either name is rebound, so both tokener
# calls still receive the text lists.
train_input, testi_input = (
  numpy.array(tokener(texts)) for texts in (train_input, testi_input)
)

In [14]:
# Convert the collected training labels to a NumPy array.
train_label = numpy.array(train_label)

# Peek at the first five training labels.
train_label[:5]

array([0, 0, 0, 1, 1])

In [15]:
# Convert the collected test labels to a NumPy array.
testi_label = numpy.array(testi_label)

# Peek at the first five test labels.
testi_label[:5]

array([1, 1, 0, 0, 1])

In [16]:
# Input: one fixed-length sequence of token ids per review.
inputs = layers.Input(shape=(output_sequence_length,))

# Feature extractor: embedding -> dropout -> average over the sequence -> dropout.
x = inputs
for layer in (
  layers.Embedding(max_tokens, embedding),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
):
  x = layer(x)

# Single sigmoid unit for the binary label.
outputs = layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Binary classification setup: Adam optimiser, cross-entropy loss, accuracy metric.
model.compile(
  loss="binary_crossentropy",
  optimizer="adam",
  metrics=["accuracy"],
)

# NOTE(review): the test split doubles as the validation data here.
plot = model.fit(
  x=train_input,
  y=train_label,
  validation_data=(testi_input, testi_label),
  epochs=5,
)

In [18]:
# Loss and accuracy on the training data.
model.evaluate(train_input, train_label)

[0.24619868397712708, 0.9145200252532959]

In [19]:
# Loss and accuracy on the test data.
model.evaluate(testi_input, testi_label)

[0.3160213530063629, 0.8745599985122681]

In [20]:
import os

# Persist the trained model to disk.
# NOTE(review): hard-coded absolute path; the same literal is repeated where the
# Beam model handler is created, so the two must be kept in sync.
model.save("/content/modeler")

In [21]:
import warnings

import apache_beam as beam

# NOTE(review): this suppresses every warning category globally — confirm that is intended.
warnings.filterwarnings("ignore")

In [22]:
from apache_beam.ml.inference.tensorflow_inference import TFModelHandlerTensor

# Model handler pointing at the directory the model was saved to; passed to RunInference below.
tensor = TFModelHandlerTensor("/content/modeler")

from apache_beam.ml.inference.base import RunInference

In [23]:
# First ten tokenised training examples; these feed the Beam pipelines below.
trainer = train_input[:10]

len(trainer)

10

In [24]:
def falter(t):
  """Reduce one inference result to a rounded score.

  Reads `t.inference`, converts it to NumPy, takes its first entry and rounds
  that entry to the nearest whole number.
  """
  first_score = t.inference.numpy()[0]
  return first_score.round()

In [29]:
# Imported here so the cell runs on its own. KeyedModelHandler lets each example
# travel through RunInference together with a key.
from apache_beam.ml.inference.base import KeyedModelHandler

# PCollections are unordered and ToList() does not promise any element order.
# Each example is therefore tagged with its position in `trainer`, and the
# collected predictions are sorted by that position so the printed list lines
# up with train_label[:10].
with beam.Pipeline() as pipeline:
  runner = (
    pipeline
    | 'Create' >> beam.Create(list(enumerate(trainer)))
    | 'RunInference' >> RunInference(KeyedModelHandler(tensor))
    # Keep the position key next to the rounded prediction.
    | 'Outer' >> beam.Map(lambda keyed: (keyed[0], falter(keyed[1])))
    | 'Unlist' >> beam.combiners.ToList()
    # Restore input order, then drop the keys.
    | 'Reorder' >> beam.Map(
        lambda pairs: [pred for _, pred in sorted(pairs, key=lambda kv: kv[0])])
    | 'Output' >> beam.LogElements()
  )

[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0]


In [26]:
# Labels of the same ten training examples, for comparison with the predictions.
train_label[:10]

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0])

In [27]:
# Loss object used to score a list of predictions against labels.
errori = tf.keras.losses.BinaryCrossentropy()


class ValError(beam.DoFn):
  """Beam DoFn that emits the binary cross-entropy of a list of predictions.

  Args:
    labels: optional ground-truth labels, aligned element-by-element with the
      predictions in each incoming element. When omitted, the notebook-level
      `train_label[:10]` is used, matching the ten examples in `trainer`.
  """

  def __init__(self, labels=None):
    super().__init__()
    self._labels = labels

  def process(self, element):
    """Yield the loss for `element` (a sequence of predictions) as a NumPy scalar."""
    # Resolved at call time so the default keeps following the current train_label.
    labels = train_label[:10] if self._labels is None else self._labels
    yield errori(labels, element).numpy()

In [28]:
# Imported here so the cell runs on its own. KeyedModelHandler lets each example
# travel through RunInference together with a key.
from apache_beam.ml.inference.base import KeyedModelHandler

# ValError compares the collected predictions with the labels position by
# position, but PCollections are unordered and ToList() does not promise any
# element order. Each example is therefore tagged with its position in
# `trainer`, and the predictions are sorted by it before the loss is computed.
# NOTE(review): falter rounds every score before the loss is computed, so this
# value is not comparable to the loss reported by model.evaluate — confirm
# that scoring rounded predictions is intended.
with beam.Pipeline() as pipeline:
  runner = (
    pipeline
    | 'Create' >> beam.Create(list(enumerate(trainer)))
    | 'RunInference' >> RunInference(KeyedModelHandler(tensor))
    # Keep the position key next to the rounded prediction.
    | 'Outer' >> beam.Map(lambda keyed: (keyed[0], falter(keyed[1])))
    | 'Unlist' >> beam.combiners.ToList()
    # Restore input order, then drop the keys.
    | 'Reorder' >> beam.Map(
        lambda pairs: [pred for _, pred in sorted(pairs, key=lambda kv: kv[0])])
    | 'Unruly' >> beam.ParDo(ValError())
    | 'Output' >> beam.LogElements()
  )

3.0666478
