In [None]:
### INSTALL DEPS QUIETLY
# Installs/upgrades (-U) the listed packages with pip's output reduced (-q).
# NOTE(review): tensorflow_hub and tensorflow_addons are imported by the next
# cell but are not installed here -- presumably the runtime already provides
# them; confirm before running on a fresh environment.
# NOTE(review): tf-models-official is pinned to 2.7.0 while tensorflow-text is
# pinned to 2.8.* and tfds-nightly/nltk are unpinned -- verify that these
# resolve to mutually compatible versions.
!pip install -U -q tfds-nightly tf-models-official==2.7.0 "tensorflow-text==2.8.*" nltk

[K     |████████████████████████████████| 4.7 MB 7.6 MB/s 
[K     |████████████████████████████████| 1.8 MB 51.0 MB/s 
[K     |████████████████████████████████| 4.9 MB 54.3 MB/s 
[K     |████████████████████████████████| 116 kB 68.2 MB/s 
[K     |████████████████████████████████| 43 kB 2.2 MB/s 
[K     |████████████████████████████████| 1.3 MB 53.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 62.7 MB/s 
[K     |████████████████████████████████| 99 kB 11.3 MB/s 
[K     |████████████████████████████████| 352 kB 70.8 MB/s 
[K     |████████████████████████████████| 238 kB 61.4 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
import nltk
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text  # A dependency of the preprocessing model
import tensorflow_addons as tfa
from official.nlp import optimization
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from typing import List, Dict, Callable
from typing_extensions import Literal

# Fetch only the NLTK resources this notebook uses instead of the entire 'all'
# collection: the 'stopwords' corpus backs stopwords.words(...), and the Punkt
# tokenizer data backs word_tokenize(). Recent NLTK releases look up
# 'punkt_tab' whereas older ones look up 'punkt', so both are requested.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
  nltk.download(nltk_resource, quiet=True)

# Indonesian stop words, stored as a set so membership tests are fast.
stopword_list = set(stopwords.words('indonesian'))
# Only let TensorFlow log messages at ERROR level or above.
tf.get_logger().setLevel('ERROR')


In [None]:
# Select tensorflow_hub's "UNCOMPRESSED" model load format for every model
# loaded after this point in the process.
# NOTE(review): per the TF Hub docs this makes models get read directly from
# remote storage rather than downloaded and unpacked locally, which is what a
# TPU runtime needs -- confirm against the TF Hub version in use.
os.environ["TFHUB_MODEL_LOAD_FORMAT"] = "UNCOMPRESSED"

In [None]:
import os

# Choose the tf.distribute strategy for the available accelerator:
#   * a TPU when the COLAB_TPU_ADDR environment variable is set,
#   * otherwise the visible GPU(s) through MirroredStrategy,
#   * otherwise fail fast, since CPU-only runs are rejected.
if os.environ.get('COLAB_TPU_ADDR'):
  # Connect to the TPU cluster and initialise it before building the strategy.
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  strategy = tf.distribute.TPUStrategy(cluster_resolver)
elif not tf.config.list_physical_devices('GPU'):
  raise ValueError('Running on CPU is not recommended.')
else:
  strategy = tf.distribute.MirroredStrategy()

In [None]:
# TF Hub handle of the encoder that gets fine-tuned (wrapped in a trainable
# hub.KerasLayer by the Classifier model).
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3'
# TF Hub handle of the matching preprocessing model; it supplies the
# `tokenize` and `bert_pack_inputs` functions used by
# make_bert_preprocess_model().
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3'

In [None]:
from nltk.tokenize import word_tokenize

def remove_stopwords(sentence: str) -> str:
  """Strip stop words from a sentence.

  The sentence is split with NLTK's ``word_tokenize``; every token that is a
  member of the module-level ``stopword_list`` is dropped (exact, case-sensitive
  match) and the remaining tokens are joined back with single spaces.

  Args:
    sentence: the raw text to clean.

  Returns:
    The space-joined tokens that are not stop words.
  """
  kept_tokens = filter(
    lambda token: token not in stopword_list,
    word_tokenize(sentence),
  )
  return " ".join(kept_tokens)


In [None]:
def make_bert_preprocess_model(sentence_features: List[str], seq_length = 128):
  """Builds a Keras model that converts raw string features into BERT inputs.

  See: https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3#:~:text=seq_length%3D128.-,General%20usage,-For%20pairs%20of

  Args:
    sentence_features: names of the string-valued features; one scalar string
      input is created per name, in this order.
    seq_length: sequence length handed to the preprocessor's
      ``bert_pack_inputs`` function.

  Returns:
    A Keras Model whose inputs are the string features (as a list or dict,
    ordered/named according to sentence_features) and whose output is the
    dict of tensors produced by ``bert_pack_inputs`` for the BERT encoder.
  """
  # One scalar string input per sentence feature.
  string_inputs = []
  for feature_name in sentence_features:
    string_inputs.append(
      tf.keras.layers.Input(shape=(), dtype=tf.string, name=feature_name)
    )

  # Both the tokenizer and the packing step come from the same SavedModel,
  # because their details (start/end token ids, output dict) are model-dependent.
  preprocessing = hub.load(tfhub_handle_preprocess)

  # Word-piece tokenization; tokenize() returns an int32 RaggedTensor of shape
  # [batch_size, (words), (tokens_per_word)].
  wordpiece_tokenizer = hub.KerasLayer(preprocessing.tokenize)
  token_ids = [wordpiece_tokenizer(string_input) for string_input in string_inputs]

  # Pack the tokenized segments into fixed-length encoder inputs.
  packer = hub.KerasLayer(
    preprocessing.bert_pack_inputs,
    arguments={'seq_length': seq_length},
    name='bert_pack_inputs'
  )
  return tf.keras.Model(string_inputs, packer(token_ids))

In [None]:
def convert_dataframe_to_tensor(df: pd.DataFrame, column: str, dtype) -> tf.Tensor:
  """Convert one DataFrame column into a tensor named after that column.

  Args:
    df: the frame holding the data.
    column: name of the column to convert; also used as the tensor's name.
    dtype: the TensorFlow dtype requested for the result.

  Returns:
    The tensor produced by ``tf.convert_to_tensor`` for ``df[column]``.
  """
  column_values = df[column]
  return tf.convert_to_tensor(column_values, dtype=dtype, name=column)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

def load_dataset_from_tfds(in_memory_ds: Dict[str, pd.DataFrame], split: Literal['train', 'validation', 'test'], batch_size: int,
                           bert_preprocess_model: tf.keras.Model):
  """Builds the batched, preprocessed tf.data pipeline for one split.

  The DataFrame is turned into a dataset following
  https://www.tensorflow.org/tutorials/load_data/pandas_dataframe

  Args:
    in_memory_ds: maps a split name to its DataFrame; each frame must provide
      a 'label' column (converted to int32) and a 'text_a' column (converted
      to string).
    split: key of the split to load. Only 'train' is treated as training data,
      i.e. shuffled over the whole split and repeated indefinitely.
    batch_size: number of examples per batch; a value <= 0 puts the whole
      split into a single batch.
    bert_preprocess_model: Keras model applied to every batch to produce the
      encoder inputs.

  Returns:
    A tf.data.Dataset yielding (bert_inputs, labels) pairs with prefetching
    enabled.
  """
  is_training = split == 'train'

  df = in_memory_ds[split]
  data_count = len(df)

  dataset = tf.data.Dataset.from_tensor_slices({
      'label': convert_dataframe_to_tensor(df, 'label', dtype=tf.int32),
      'text_a': convert_dataframe_to_tensor(df, 'text_a', dtype=tf.string)
  })

  if is_training:
    dataset = dataset.shuffle(data_count)
    dataset = dataset.repeat()
  dataset = dataset.batch(batch_size if batch_size > 0 else data_count)
  dataset = dataset.map(lambda ex: (bert_preprocess_model(ex), ex['label']))
  if not is_training:
    # Only a finite dataset can ever be cached completely. The training
    # dataset is repeated without end, so caching it would merely accumulate
    # every produced batch in memory without any of them being reused.
    dataset = dataset.cache()
  return dataset.prefetch(buffer_size=AUTOTUNE)

## Define your model

You are now ready to define your model for sentence or sentence-pair classification: feed the preprocessed inputs through the BERT encoder, put a linear classifier on top (or another arrangement of layers, if you prefer), and use dropout for regularization.

In [None]:
class Classifier(tf.keras.Model):
  """Trainable TF Hub encoder topped with dropout and a dense output layer.

  The dense layer has no activation, so the model outputs one unnormalised
  score per class.
  """

  def __init__(self, num_classes: int):
    """Creates the sub-layers.

    Args:
      num_classes: number of units of the final dense layer.
    """
    super().__init__(name="prediction")
    self.encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True)
    self.dropout = tf.keras.layers.Dropout(0.1)
    self.dense = tf.keras.layers.Dense(num_classes)

  def call(self, preprocessed_text):
    """Runs the encoder and classifies its "pooled_output".

    Args:
      preprocessed_text: the encoder inputs, as produced by the preprocessing
        model.

    Returns:
      The output of the dense layer.
    """
    pooled = self.encoder(preprocessed_text)["pooled_output"]
    return self.dense(self.dropout(pooled))

def build_classifier_model(num_classes: int):
  """Returns a new ``Classifier`` with ``num_classes`` output units."""
  return Classifier(num_classes)

In [None]:
sentence_features = ['text_a']
# Position in this list is the integer id a label is encoded as.
labels_names = ['no', 'yes']
num_classes = len(labels_names)


def _load_split(csv_path: str) -> pd.DataFrame:
  """Reads one CSV split, removes stop words from 'text_a' and encodes 'label'.

  Args:
    csv_path: path of a CSV file with a 'text_a' and a 'label' column.

  Returns:
    The loaded DataFrame with cleaned 'text_a' values and with every label
    string listed in labels_names replaced by its index ('no' -> 0,
    'yes' -> 1). Any other label value is left untouched.
  """
  split_df = pd.read_csv(csv_path)
  split_df['text_a'] = split_df['text_a'].apply(remove_stopwords)
  for label_id, label_name in enumerate(labels_names):
    split_df.loc[split_df['label'] == label_name, 'label'] = label_id
  return split_df


train_dataset_pd = _load_split('train.csv')
num_train = len(train_dataset_pd)

validation_dataset_pd = _load_split('dev.csv')
num_validation = len(validation_dataset_pd)

test_dataset_pd = _load_split('test.csv')
num_test = len(test_dataset_pd)

# Split name -> DataFrame, as expected by load_dataset_from_tfds().
in_memory_ds = {
    'train': train_dataset_pd,
    'validation': validation_dataset_pd,
    'test': test_dataset_pd
}

bert_preprocess_model = make_bert_preprocess_model(sentence_features)

In [None]:
def get_configuration(num_classes: int):
  """Creates the metrics and the loss used to compile the classifier.

  Args:
    num_classes: number of classes handed to the Matthews correlation
      coefficient metric.

  Returns:
    A ``(metrics, loss)`` tuple: a one-element list holding the Matthews
    correlation coefficient metric, and a sparse categorical cross-entropy
    loss that expects logits.
  """
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  matthews_correlation = tfa.metrics.MatthewsCorrelationCoefficient(
    num_classes=num_classes
  )
  return [matthews_correlation], cross_entropy

In [None]:
# Fine-tuning hyperparameters.
epochs = 3
batch_size = 32
init_lr = 2e-5

print(f'Fine tuning {tfhub_handle_encoder} model')

with strategy.scope():
  # Metrics have to be created inside the strategy scope.
  metrics, loss = get_configuration(num_classes)

  train_dataset = load_dataset_from_tfds(
      in_memory_ds, 'train', batch_size, bert_preprocess_model)

  # The training dataset repeats indefinitely, so the length of an epoch is
  # set explicitly to the number of full batches in the training split.
  steps_per_epoch = num_train // batch_size
  num_train_steps = steps_per_epoch * epochs
  # One tenth of all training steps is passed to the optimizer as warmup steps.
  num_warmup_steps = num_train_steps // 10

  validation_dataset = load_dataset_from_tfds(
      in_memory_ds, 'validation', batch_size,
      bert_preprocess_model)
  # Integer division: a trailing partial validation batch is not evaluated.
  validation_steps = num_validation // batch_size

  classifier_model = build_classifier_model(num_classes)

  # AdamW optimizer built by the official.nlp optimization helper from the
  # initial learning rate and the total/warmup step counts.
  optimizer = optimization.create_optimizer(
      init_lr=init_lr,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      optimizer_type='adamw')

  classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

  classifier_model.fit(
      x=train_dataset,
      validation_data=validation_dataset,
      steps_per_epoch=steps_per_epoch,
      epochs=epochs,
      validation_steps=validation_steps)

Fine tuning https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3 model


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Destination of the exported SavedModel: ./my_models/my_model
main_save_path = './my_models'
saved_model_name = 'my_model'

saved_model_path = os.path.join(main_save_path, saved_model_name)

# Chain the preprocessing model and the fine-tuned classifier into a single
# Keras model that maps the raw string inputs to the classifier's outputs, so
# the exported model can be called on text directly.
preprocess_inputs = bert_preprocess_model.inputs
bert_encoder_inputs = bert_preprocess_model(preprocess_inputs)
bert_outputs = classifier_model(bert_encoder_inputs)
model_for_export = tf.keras.Model(preprocess_inputs, bert_outputs)

# Save everything on the Colab host (even the variables from TPU memory)
save_options = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
# The optimizer state is left out of the export.
model_for_export.save(saved_model_path, include_optimizer=False,
                      options=save_options)



In [None]:
def prepare(record):
  """Split a dataset record into model inputs and its label.

  Args:
    record: a mapping holding one value per name in the module-level
      ``sentence_features`` plus a 'label' entry.

  Returns:
    A ``(model_inputs, label)`` tuple, where ``model_inputs`` has one entry per
    sentence feature, each being the feature value wrapped in a one-element
    list.
  """
  label = record['label']
  wrapped_features = [[record[feature_name]] for feature_name in sentence_features]
  return wrapped_features, label


def convert_bert_results(bert_result):
  """Return the predicted class index for the first row of ``bert_result``.

  Args:
    bert_result: model output with the class scores along axis 1.

  Returns:
    The index of the highest score in row 0, converted with ``.numpy()``.
  """
  class_per_row = tf.argmax(bert_result, axis=1)
  return class_per_row[0].numpy()

def print_metrics(true_positive: int, true_negative: int, false_positive: int, false_negative: int):
  """Prints accuracy, precision, recall and F1 for a binary confusion matrix.

  Every metric whose denominator is zero is reported as 0 instead of raising
  ZeroDivisionError; this includes accuracy when all four counts are zero.

  Args:
    true_positive: number of positive examples predicted as positive.
    true_negative: number of negative examples predicted as negative.
    false_positive: number of negative examples predicted as positive.
    false_negative: number of positive examples predicted as negative.

  Returns:
    None. The four metrics are written to stdout, one per line, as
    ``<name> = <value>``.
  """
  def _ratio(numerator, denominator):
    # Division that falls back to 0 for an empty denominator.
    return numerator / denominator if denominator != 0 else 0

  total = true_positive + true_negative + false_positive + false_negative
  accuracy = _ratio(true_positive + true_negative, total)
  print('accuracy =', accuracy)
  precision = _ratio(true_positive, true_positive + false_positive)
  print('precision =', precision)
  recall = _ratio(true_positive, true_positive + false_negative)
  print('recall =', recall)
  f1 = _ratio(2 * precision * recall, precision + recall)
  print('f1 =', f1)

### Test

In [None]:
# Evaluate the exported model on the test split, one example at a time, and
# report accuracy / precision / recall / F1 with label 1 ('yes') as the
# positive class.
with tf.device('/job:localhost'):
  # Reload the SavedModel written by the export cell.
  reloaded_model = tf.saved_model.load(saved_model_path)
  test_dataset = tf.data.Dataset.from_tensor_slices({
      'label': convert_dataframe_to_tensor(in_memory_ds['test'], 'label', tf.int32),
      'text_a': convert_dataframe_to_tensor(in_memory_ds['test'], 'text_a', tf.string)
  })

  # Confusion-matrix counters.
  true_positive = 0
  true_negative = 0
  false_positive = 0
  false_negative = 0
  # The dataset is unbatched here, so every iteration handles one example.
  # The counters are plain sums, so the shuffled order does not affect them.
  for test_row, label in test_dataset.shuffle(num_test).map(prepare):
    # With a single sentence feature the model is called on that feature
    # alone; otherwise it is called on the list of all features.
    if len(sentence_features) == 1:
      result = reloaded_model(test_row[0])
    else:
      result = reloaded_model(list(test_row))

    classification = convert_bert_results(result)
    if label == 0:
      if classification == 0:
        true_negative += 1
      else:
        false_positive += 1
    else: # label == 1
      if classification == 0:
        false_negative += 1
      else:
        true_positive += 1

  # Sanity check: every test example was counted exactly once.
  assert true_positive +  true_negative + false_positive + false_negative == num_test

  print_metrics(
    true_positive=true_positive,
    true_negative=true_negative,
    false_positive=false_positive,
    false_negative=false_negative
  )
  

accuracy = 0.8453571428571428
precision = 0.6508810572687225
recall = 0.8359264497878359
f1 = 0.7318885448916409
