### Fine-tune ALBERT on a downstream task
#### Overview
Repo: https://github.com/google-research/ALBERT

Reference script: `run_classifier.py` in the repo above.

In [1]:
import tensorflow as tf
import modeling
import tokenization
import optimization
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
# Eager execution is switched on right after the imports, before any other
# TensorFlow call made in this notebook.
tf.enable_eager_execution()

# --- Tunable constants -------------------------------------------------------
# Every sequence is padded/truncated to MAX_LEN token ids.
# The 95th percentile of the validated top-relevant text lengths is 50.
MAX_LEN = 50
# Used as the train/eval/predict batch size when the estimator is built.
BATCH_SIZE = 1
# NOTE(review): MAX_GRAD_NORM is not referenced by any cell shown here.
MAX_GRAD_NORM = 1.0
LEARNING_RATE = 0.000001
NUM_WARMUP_STEPS= 100
# Passed as `model_dir` of the RunConfig below.
OUTPUT_DIR = "outputs_toy/"
USE_TPU = False

tf.logging.set_verbosity(tf.logging.INFO)
# Token-level classes; only len(label_list) is used (as `num_labels`).
label_list = [0,1]
# Passed as `init_checkpoint` to model_fn_builder.
# NOTE(review): this is a directory, not a checkpoint prefix — confirm the
# loader accepts that.
init_checkpoint="models_toy/"
# Passed both to the optimizer schedule and as `max_steps` of estimator.train.
num_train_steps = 200 
# Minimal run configuration: only the model directory is set.
run_config = tf.contrib.tpu.RunConfig(model_dir=OUTPUT_DIR,)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



Using TensorFlow backend.


In [2]:
# ALBERT hyper-parameters are read from the JSON file shipped with the toy model.
config = modeling.AlbertConfig.from_json_file("models_toy/albert_config.json")
# Tokenizer built from the plain vocab file; no SentencePiece model is supplied
# (spm_model_file=None) and input is lower-cased.
tokenizer = tokenization.FullTokenizer.from_scratch(vocab_file="models_toy/vocab.txt", do_lower_case=True, spm_model_file=None)
# Quick manual check of the tokenizer (left disabled):
# tokenizer.tokenize("")

In [8]:
def _label_sent(name_tokens, sent_tokens):
    """Return one 0/1 label per sentence token, 1 where the name occurs.

    The sentence is scanned left to right; every non-overlapping occurrence of
    the full ``name_tokens`` sequence is marked with 1s, everything else is 0.

    Args:
        name_tokens: sequence of tokens making up the name to find.
        sent_tokens: sequence of tokens of the sentence.

    Returns:
        A list of ints with exactly ``len(sent_tokens)`` entries. It is all
        zeros when the name is empty or longer than the sentence.
    """
    name_tokens = list(name_tokens)
    sent_tokens = list(sent_tokens)
    span = len(name_tokens)
    labels = [0] * len(sent_tokens)
    if span == 0:
        # An empty name can never match; without this guard name_tokens[0]
        # would raise IndexError.
        return labels

    start = 0
    # Positions with fewer than `span` tokens left cannot hold a full match,
    # so they keep their 0 label.
    while start + span <= len(sent_tokens):
        if sent_tokens[start:start + span] == name_tokens:
            labels[start:start + span] = [1] * span
            start += span
        else:
            start += 1
    return labels


def process_input(file, mode, tokenizer, max_len=None):
    """Read a CSV of (text, name) rows and turn it into padded model inputs.

    The first line of the file is skipped and the data columns are named
    ``text`` and ``name``; rows containing NaN are dropped.

    In TRAIN/EVAL mode each text gets a per-token 0/1 label marking where the
    name occurs, and rows in which the name is never found are discarded.
    In any other mode (predict) all rows are kept and no labels are produced.

    NOTE(review): the name is split on whitespace (after lower-casing) while
    the text goes through ``tokenizer.tokenize``; a name that the tokenizer
    would split into several pieces will therefore never match — confirm this
    is intended.

    Args:
        file: path (or buffer) accepted by ``pd.read_csv``.
        mode: a ``tf.estimator.ModeKeys`` value.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: length every sequence is padded/truncated to; defaults to the
            notebook-level ``MAX_LEN``.

    Returns:
        ``(input_ids, labels, input_mask)`` where ``input_ids`` and ``labels``
        are arrays returned by ``pad_sequences`` (``labels`` is ``None`` in
        predict mode) and ``input_mask`` is a list of lists holding 1 for
        every non-zero token id and 0 otherwise.
    """
    if max_len is None:
        max_len = MAX_LEN

    df_data = pd.read_csv(file, names=['text', 'name'], skiprows=1).dropna()
    text_tokens = [tokenizer.tokenize(text) for text in df_data['text']]

    labels = None
    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        # str() keeps non-string names (e.g. numeric) from crashing on .lower().
        text_labels = [
            _label_sent(str(name).lower().split(), tokens)
            for name, tokens in zip(df_data['name'], text_tokens)
        ]
        # Keep only the rows where the name was actually located in the text.
        matched = [(tokens, row_labels)
                   for tokens, row_labels in zip(text_tokens, text_labels)
                   if sum(row_labels) > 0]
        text_tokens = [tokens for tokens, _ in matched]
        labels = pad_sequences([row_labels for _, row_labels in matched],
                               maxlen=max_len, padding="post",
                               dtype="long", truncating="post")

    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(tokens) for tokens in text_tokens],
        maxlen=max_len, dtype="long", truncating="post", padding="post")
    # Mask out padding: pad_sequences pads with 0, so any non-zero id is real.
    input_mask = [[int(token_id > 0) for token_id in row] for row in input_ids]
    return input_ids, labels, input_mask

In [9]:
def input_fn_builder(file, seq_length, drop_remainder, tokenizer):
    """Build the `input_fn` closure that TPUEstimator will call.

    Args:
        file: CSV path forwarded to `process_input`.
        seq_length: second dimension given to every feature/label tensor.
        drop_remainder: forwarded to `Dataset.batch`.
        tokenizer: forwarded to `process_input`.

    Returns:
        A function `input_fn(mode, params)` returning a batched
        `tf.data.Dataset` of `(features_dict, labels)` pairs.
    """

    def input_fn(mode, params):
        """Load `file` for `mode` and wrap it in a batched dataset."""
        input_ids, labels, input_mask = process_input(file, mode, tokenizer)

        batch_size = params["batch_size"]
        tensor_shape = [len(input_ids), seq_length]

        # This is for demo purposes and does NOT scale to large data sets. We do
        # not use Dataset.from_generator() because that uses tf.py_func which is
        # not TPU compatible. The right way to load data is with TFRecordReader.
        features = {
            "input_ids": tf.constant(input_ids, shape=tensor_shape, dtype=tf.int32),
            "input_mask": tf.constant(input_mask, shape=tensor_shape, dtype=tf.int32),
            "segment_ids": tf.zeros(shape=tensor_shape, dtype=tf.int32),
        }

        supervised = (mode == tf.estimator.ModeKeys.TRAIN
                      or mode == tf.estimator.ModeKeys.EVAL)
        if supervised:
            label_tensor = tf.constant(labels, shape=tensor_shape, dtype=tf.int32)
        else:
            # Predict: there are no labels, so an all-zero placeholder is fed.
            label_tensor = tf.zeros(shape=tensor_shape, dtype=tf.int32)

        dataset = tf.data.Dataset.from_tensor_slices((features, label_tensor))

        if mode == tf.estimator.ModeKeys.TRAIN:
            dataset = dataset.repeat()
            dataset = dataset.shuffle(buffer_size=100)

        return dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder)

    return input_fn

In [6]:
def create_model(albert_config, mode, input_ids, input_mask, segment_ids,
                                 labels, num_labels):
    """Create a per-token classification model on top of ALBERT.

    A single dense layer (`output_weights` / `output_bias`) is applied to every
    position of the encoder's sequence output.

    Args:
        albert_config: configuration passed to `modeling.AlbertModel`.
        mode: a `tf.estimator.ModeKeys` value; dropout is applied only in TRAIN.
        input_ids, input_mask, segment_ids: encoder inputs.
        labels: integer class id per token; only read in TRAIN/EVAL mode.
        num_labels: number of classes.

    Returns:
        `(loss, per_example_loss, logits, predict)`. `loss` is the sum of the
        per-token cross-entropy over every position (padding included — the
        mask is only given to the encoder). In predict mode `loss` and
        `per_example_loss` are `None`.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    model = modeling.AlbertModel(
            config=albert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids)

    output_layer = model.get_sequence_output()

    hidden_size = output_layer.shape[-1].value
    # Take the sequence length from the encoder output instead of the global
    # MAX_LEN, so the head stays correct for whatever length the inputs have.
    # Assumes the sequence output is [batch, seq, hidden].
    seq_length = output_layer.shape[1].value
    if seq_length is None:
        # Static length unknown: fall back to the run-time shape.
        seq_length = tf.shape(output_layer)[1]

    # Variable names are kept as-is so existing checkpoints still load.
    output_weight = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
            "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        # Flatten to 2-D for the matmul, then restore the per-token layout.
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weight, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, num_labels])
        probabilities = tf.nn.softmax(logits, axis=-1)
        predict = tf.argmax(probabilities, axis=-1)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
            loss = tf.reduce_sum(per_example_loss)
            return (loss, per_example_loss, logits, predict)

        # Predict: no labels, hence no loss.
        return (None, None, logits, predict)

In [7]:
def model_fn_builder(albert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     hub_module=None, optimizer="adamw"):
    """Build the `model_fn` closure handed to TPUEstimator.

    Args:
        albert_config: forwarded to `create_model`.
        num_labels: forwarded to `create_model`.
        init_checkpoint: when truthy, trainable variables are initialised
            from it.
        learning_rate, num_train_steps, num_warmup_steps, optimizer:
            forwarded to `optimization.create_optimizer` in TRAIN mode.
        use_tpu: forwarded to the optimizer; it also decides whether the
            checkpoint is loaded through a scaffold function or directly.
        hub_module: accepted but not used inside this function.

    Returns:
        `model_fn(features, labels, mode, params)` producing a
        `TPUEstimatorSpec` for the given mode.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Assemble the graph and the estimator spec for one mode."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info(f"	name = {name}, shape = {features[name].shape}")

        total_loss, per_example_loss, logits, predictions = create_model(
                albert_config, mode, features["input_ids"],
                features["input_mask"], features["segment_ids"],
                labels, num_labels)

        # Collected after the model is built so that every variable exists.
        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            assignment_map, initialized_variable_names = (
                    modeling.get_assignment_map_from_checkpoint(
                            tvars, init_checkpoint))
            if not use_tpu:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            else:
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = (", *INIT_FROM_CKPT*"
                           if var.name in initialized_variable_names else "")
            tf.logging.info(f"	name = {var.name}, shape = {var.shape}{init_string}")

        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                    total_loss, learning_rate, num_train_steps,
                    num_warmup_steps, use_tpu, optimizer)
            return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    train_op=train_op,
                    scaffold_fn=scaffold_fn)

        if mode == tf.estimator.ModeKeys.EVAL:
            def metric_fn(per_example_loss, label_ids, logits):
                """Token-level accuracy and mean loss over all positions."""
                predicted_ids = tf.argmax(logits, axis=-1, output_type=tf.int32)
                return {
                    "eval_accuracy": tf.metrics.mean(
                            tf.math.equal(label_ids, predicted_ids)),
                    "eval_loss": tf.metrics.mean(values=per_example_loss),
                }

            return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    eval_metrics=(metric_fn, [per_example_loss, labels, logits]),
                    scaffold_fn=scaffold_fn)

        # Any other mode is treated as prediction.
        return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                predictions=predictions,
                scaffold_fn=scaffold_fn)

    return model_fn


In [10]:
# ************************TPU setup************************
# USE_TPU is assigned again here; the first cell sets the same value.
USE_TPU = False
# The TPU settings below are intentionally left blank/disabled. Because the
# RunConfig further down is commented out, the `run_config` created in the
# first cell is the one the estimator uses.
# NOTE(review): the disabled code refers to `contrib_cluster_resolver` and
# `contrib_tpu`, which are not imported in the cells shown — add the imports
# before re-enabling it.
# TPU_NAME = 
# TPU_ZONE = 
# GCP_PROJECT = 
# MASTER = 
# SAVE_CHECKPOINTS_STEPS = 
# ITERATIONS_PER_LOOP = 
# NUM_TPU_CORES = 

# tpu_cluster_resolver = None
# if USE_TPU and TPU_NAME:
#     tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
#         TPU_NAME, zone=TPU_ZONE, project=GCP_PROJECT)

# is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
# run_config = tf.contrib.tpu.RunConfig(
#   cluster=tpu_cluster_resolver,
#   master=MASTER,
#   model_dir=OUTPUT_DIR,
#   save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
#   tpu_config=tf.contrib.tpu.TPUConfig(
#       iterations_per_loop=ITERATIONS_PER_LOOP,
#       num_shards=NUM_TPU_CORES,
#       per_host_input_for_training=is_per_host))

# ************************Model & Estimator setup************************

# Bind the notebook-level settings into a model_fn; the number of classes is
# taken from the length of `label_list`.
model_fn = model_fn_builder(
        albert_config=config,
        num_labels=len(label_list),
        init_checkpoint=init_checkpoint,
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=NUM_WARMUP_STEPS,
        use_tpu=USE_TPU)

# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
# The same BATCH_SIZE is used for training, evaluation and prediction.
estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=USE_TPU,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=BATCH_SIZE,
        eval_batch_size=BATCH_SIZE,
        predict_batch_size=BATCH_SIZE)


INFO:tensorflow:Using config: {'_model_dir': 'outputs_toy/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x146230eb8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=None, num_cores_per_replica=None, per_hos

INFO:tensorflow:	name = bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:	name = bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:	name = bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:	name = bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:	name = bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:	name = bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:	name = bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:	name = bert/encoder/

<tensorflow_estimator.python.estimator.tpu.tpu_estimator.TPUEstimator at 0x146230f28>

##### Model training

In [14]:
# Preview the toy training file; the displayed frame has `review` and
# `dish_name` columns.
pd.read_csv("data_toy/dish_name_train.csv")

Unnamed: 0,review,dish_name
0,I like the mala steamboat a lot!,mala steamboat
1,The chicken rice doesn't taste nice.,chicken rice


In [None]:
tf.logging.info("***** Running training *****")

# Input pipeline over the toy training CSV; incomplete final batches are
# dropped (drop_remainder=True).
train_input_fn = input_fn_builder(
        file = "data_toy/dish_name_train.csv",
        tokenizer = tokenizer,
        seq_length=MAX_LEN,
        drop_remainder=True)

# `num_train_steps` is passed as max_steps; the same value was given to
# model_fn_builder for the optimizer.
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

##### Model evaluation

In [11]:
# Input pipeline over the validation CSV; every example is kept
# (drop_remainder=False).
eval_input_fn = input_fn_builder(
        file = "data_toy/dish_name_val.csv",
        tokenizer = tokenizer,
        seq_length=MAX_LEN,
        drop_remainder=False)

# Last expression of the cell: the returned metrics dict is displayed
# (eval_accuracy, eval_loss, loss, global_step in the output below).
estimator.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running eval on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:	name = input_ids, shape = (?, 50)
INFO:tensorflow:	name = input_mask, shape = (?, 50)
INFO:tensorflow:	name = segment_ids, shape = (?, 50)
INFO:tensorflow:name bert/embeddings/word_embeddings match to bert/embeddings/word_embeddings
INFO:tensorflow:name bert/embeddings/token_type_embeddings match to bert/embeddings/token_type_embeddings
INFO:tensorflow:name bert/embeddings/position_embeddings match to bert/embeddings/position_embeddings
INFO:tensorflow:name bert/embeddings/LayerNorm/beta match to bert/embeddings/LayerNorm/beta
INFO:tensorflow:name bert/embeddings/LayerNorm/gamma match to bert/embeddings/LayerNorm/gamma
INFO:tensorflow:name bert/encoder/embedding_hidden_mapping_in/kernel match to bert/encoder/embedding_hidden_mapping_in/kernel
INFO:tensorflow:name bert/encoder/embedding_hidden_mapping_in/bias match to bert/encoder/embedding_hidden_mapping_in/bias
INFO:

{'eval_accuracy': 1.0,
 'eval_loss': 0.00089118525,
 'loss': 0.044559263,
 'global_step': 200}

Predict on the validation data using the trained model

In [12]:
# Same validation file as in the evaluation cell, this time consumed in
# predict mode; every example is kept.
predict_input_fn = input_fn_builder(
        file="data_toy/dish_name_val.csv",
        seq_length=MAX_LEN,
        drop_remainder=False,
        tokenizer=tokenizer)

# Drain the prediction generator into a list, one entry per example.
predictions = list(estimator.predict(input_fn=predict_input_fn))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:	name = input_ids, shape = (?, 50)
INFO:tensorflow:	name = input_mask, shape = (?, 50)
INFO:tensorflow:	name = segment_ids, shape = (?, 50)
INFO:tensorflow:name bert/embeddings/word_embeddings match to bert/embeddings/word_embeddings
INFO:tensorflow:name bert/embeddings/token_type_embeddings match to bert/embeddings/token_type_embeddings
INFO:tensorflow:name bert/embeddings/position_embeddings match to bert/embeddings/position_embeddings
INFO:tensorflow:name bert/embeddings/LayerNorm/beta match to bert/embeddings/LayerNorm/beta
INFO:tensorflow:name bert/embeddings/LayerNorm/gamma match to bert/embeddings/LayerNorm/gamma
INFO:tensorflow:name bert/encoder/embedding_hidden_mapping_in/kernel match to bert/encoder/embedding_hidden_mapping_in/kernel
INFO:tensorflow:name bert/encoder/embedding_hidden_mapping_in/bias match to bert/encoder/embedding_hidden_mapping_in/bias
INFO

In [31]:
def get_predicted_dish_name(predicted_label, tokenized_text):
    """Turn a per-token 0/1 prediction into the predicted dish name.

    Consecutive positive positions are grouped into spans and each span is
    rendered as its tokens joined by single spaces.

    Args:
        predicted_label: 1-D array-like of per-position labels; values > 0
            mark name tokens. It may be longer than `tokenized_text` (padded
            to a fixed length) — positive positions beyond the last token are
            ignored instead of raising IndexError.
        tokenized_text: sequence of tokens of the original text.

    Returns:
        The span text as a `str`, or `None` when no in-range position is
        predicted positive. If several distinct spans are predicted, the
        lexicographically smallest one is returned (same choice the previous
        `np.unique(...)[0]` made).
    """
    predicted_label = np.asarray(predicted_label)
    positive_idx = np.where(predicted_label > 0)[0]
    # Predictions on padding positions have no token to map back to.
    positive_idx = positive_idx[positive_idx < len(tokenized_text)]
    if positive_idx.size == 0:
        return None

    # Cut the sorted indices wherever two neighbours are not adjacent, giving
    # one index array per contiguous span.
    spans = np.split(positive_idx, np.where(np.diff(positive_idx) != 1)[0] + 1)
    names = [" ".join(tokenized_text[i] for i in span) for span in spans]
    # De-duplicate and pick the first name in sorted order.
    return min(names)
# Rows with a missing review or dish name are dropped before prediction when
# the file is loaded for the model, so drop them here too; otherwise the
# predictions would no longer line up with the rows.
df_data_val = pd.read_csv("data_toy/dish_name_val.csv").dropna(
    subset=["review", "dish_name"])
assert len(predictions) == len(df_data_val), (
    "number of predictions does not match number of validation rows")
# Re-tokenize each review so predicted positions can be mapped back to tokens.
df_data_val['predicted_dish_name'] = [
    get_predicted_dish_name(predicted_label, tokenizer.tokenize(text))
    for predicted_label, text in zip(predictions, df_data_val.review)
]
df_data_val

Unnamed: 0,review,dish_name,predicted_dish_name
0,I like the chicken rice a lot!,chicken rice,chicken rice
1,The mala steamboat doesn't taste nice.,mala steamboat,mala steamboat
