## Default Model Architecture

By default, the model only contains BERT model and a dense layer for each problem. If you want to add things between BERT and dense layers, you can modify hidden method of BertMultiTask class. Here's an example of adding a cudnn GRU on top of BERT.

In [1]:
import tensorflow as tf
from tensorflow import keras

from bert_multitask_learning import (get_or_make_label_encoder, FullTokenizer, 
                                     create_single_problem_generator, train_bert_multitask, 
                                     eval_bert_multitask, DynamicBatchSizeParams, TRAIN, EVAL, PREDICT, BertMultiTask,preprocessing_fn)
import pickle
import types
import os


In [2]:
cd ../

/data3/yjp/bert-multitask-learning


In [3]:
# define new problem
new_problem_type = {'imdb_cls': 'cls'}

@preprocessing_fn
def imdb_cls(params, mode):
    # get data
    (train_data, train_labels), (test_data, test_labels) = keras.datasets.imdb.load_data(num_words=10000)
    label_encoder = get_or_make_label_encoder(params, 'imdb_cls', mode, train_labels+test_labels)
    word_to_id = keras.datasets.imdb.get_word_index()
    index_from=3
    word_to_id = {k:(v+index_from) for k,v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2
    id_to_word = {value:key for key,value in word_to_id.items()}

    train_data = [[id_to_word[i] for i in sentence] for sentence in train_data]
    test_data = [[id_to_word[i] for i in sentence] for sentence in test_data]
    
    if mode == TRAIN:
        input_list = train_data
        target_list = train_labels
    else:
        input_list = test_data
        target_list = test_labels
    
    return input_list, target_list
new_problem_process_fn_dict = {'imdb_cls': imdb_cls}

In [4]:
# create params and model
params = DynamicBatchSizeParams()
params.init_checkpoint = 'models/cased_L-12_H-768_A-12'
tf.logging.set_verbosity(tf.logging.DEBUG)
model = BertMultiTask(params)

In [5]:
def cudnngru_hidden(self, features, hidden_feature, mode):
    # with shape (batch_size, seq_len, hidden_size)
    seq_hidden_feature = hidden_feature['seq']
    
    cudnn_gru_layer = tf.keras.layers.CuDNNGRU(
            units=self.params.bert_config.hidden_size,
            return_sequences=True,
            return_state=False,
    )
    gru_logit = cudnn_gru_layer(seq_hidden_feature)
    
    return_features = {}
    return_hidden_feature = {}
    
    for problem_dict in self.params.run_problem_list:
        for problem in problem_dict:
            # for slightly faster training
            return_features[problem], return_hidden_feature[problem] = self.get_features_for_problem(
                    features, hidden_feature, problem, mode)
    return return_features, return_hidden_feature

model.hidden = types.MethodType(cudnngru_hidden, model)
    

In [6]:
# train model
tf.logging.set_verbosity(tf.logging.DEBUG)
train_bert_multitask(problem='imdb_cls', num_gpus=1, 
                     num_epochs=10, params=params, 
                     problem_type_dict=new_problem_type, processing_fn_dict=new_problem_process_fn_dict, 
                     model=model, model_dir='models/ibdm_gru')

Adding new problem imdb_cls, problem type: cls
INFO:tensorflow:Device is available but not used by distribute strategy: /device:CPU:0
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_CPU:0
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_GPU:0
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_GPU:1
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_GPU:2
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_GPU:3
INFO:tensorflow:Device is available but not used by distribute strategy: /device:GPU:1
INFO:tensorflow:Device is available but not used by distribute strategy: /device:GPU:2
INFO:tensorflow:Device is available but not used by distribute strategy: /device:GPU:3
INFO:tensorflow:Configured nccl all-reduce.
INFO:tensorflow:Initializing RunConfig with distribution strategies.
INFO:tensorflow:Not using Distribute 

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x7f1ac3d61780>

In [7]:
# evaluate model
print(eval_bert_multitask(problem='imdb_cls', num_gpus=1, 
                     params=params, eval_scheme='acc',
                     problem_type_dict=new_problem_type, processing_fn_dict=new_problem_process_fn_dict,
                     model_dir='models/idbm_gru', model = model))

Params problem assigned. Problem list: ['imdb_cls']
INFO:tensorflow:Device is available but not used by distribute strategy: /device:CPU:0
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_CPU:0
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_GPU:0
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_GPU:1
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_GPU:2
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_GPU:3
INFO:tensorflow:Device is available but not used by distribute strategy: /device:GPU:1
INFO:tensorflow:Device is available but not used by distribute strategy: /device:GPU:2
INFO:tensorflow:Device is available but not used by distribute strategy: /device:GPU:3
INFO:tensorflow:Configured nccl all-reduce.
INFO:tensorflow:Initializing RunConfig with distribution strategies.
INFO:tensorflow:Not using Distri

Processing Inputs: 100%|██████████| 25000/25000 [05:08<00:00, 81.08it/s]


{'imdb_cls_Accuracy': 0.91332, 'imdb_cls_Accuracy Per Sequence': 0.91332}
